title = {{BERT}: Pre-training of Deep Bidirectional Transformers for Language Understanding},
author = {Devlin, Jacob and Chang, Ming-Wei and Lee, Kenton and Toutanova, Kristina},
year = 2019,
month = jun,
booktitle = {Proceedings of the 2019 Conference of the North {A}merican Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long and Short Papers)},
publisher = {Association for Computational Linguistics},
address = {Minneapolis, Minnesota},
pages = {4171--4186},
doi = {10.18653/v1/N19-1423},
url = {https://aclanthology.org/N19-1423},
abstract = {We introduce a new language representation model called BERT, which stands for Bidirectional Encoder Representations from Transformers. Unlike recent language representation models (Peters et al., 2018a; Radford et al., 2018), BERT is designed to pre-train deep bidirectional representations from unlabeled text by jointly conditioning on both left and right context in all layers. As a result, the pre-trained BERT model can be fine-tuned with just one additional output layer to create state-of-the-art models for a wide range of tasks, such as question answering and language inference, without substantial task-specific architecture modifications. BERT is conceptually simple and empirically powerful. It obtains new state-of-the-art results on eleven natural language processing tasks, including pushing the GLUE score to 80.5 (7.7 point absolute improvement), MultiNLI accuracy to 86.7{\%} (4.6{\%} absolute improvement), SQuAD v1.1 question answering Test F1 to 93.2 (1.5 point absolute improvement) and SQuAD v2.0 Test F1 to 83.1 (5.1 point absolute improvement).}
}
@misc{gpt-neox-library,
title = {{GPT-NeoX: Large Scale Autoregressive Language Modeling in PyTorch}},
author = {Andonian, Alex and Anthony, Quentin and Biderman, Stella and Black, Sid and Gali, Preetham and Gao, Leo and Hallahan, Eric and Levy-Kramer, Josh and Leahy, Connor and Nestler, Lucas and Parker, Kip and Pieler, Michael and Purohit, Shivanshu and Songz, Tri and Phil, Wang and Weinbach, Samuel},
url = {https://www.github.com/eleutherai/gpt-neox}
}
@inproceedings{gpt-neox-20b,
title = {{GPT-NeoX-20B}: An Open-Source Autoregressive Language Model},
author = {Black, Sid and Biderman, Stella and Hallahan, Eric and Anthony, Quentin and Gao, Leo and Golding, Laurence and He, Horace and Leahy, Connor and McDonell, Kyle and Phang, Jason and Pieler, Michael and Prashanth, USVSN Sai and Purohit, Shivanshu and Reynolds, Laria and Tow, Jonathan and Wang, Ben and Weinbach, Samuel},
year = 2022,
booktitle = {Proceedings of the ACL Workshop on Challenges \& Perspectives in Creating Large Language Models}
}
@inproceedings{nie-etal-2020-adversarial,
title = {Adversarial {NLI}: A New Benchmark for Natural Language Understanding},
author = {Nie, Yixin and Williams, Adina and Dinan, Emily and Bansal, Mohit and Weston, Jason and Kiela, Douwe},
year = 2020,
booktitle = {Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics},
publisher = {Association for Computational Linguistics}
}
@inproceedings{N18-1101,
title = {A Broad-Coverage Challenge Corpus for Sentence Understanding through Inference},
author = {Williams, Adina and Nangia, Nikita and Bowman, Samuel},
year = 2018,
booktitle = {Proceedings of the 2018 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long Papers)},
location = {New Orleans, Louisiana},
publisher = {Association for Computational Linguistics},
pages = {1112--1122},
url = {http://aclweb.org/anthology/N18-1101}
}
@article{47761,
title = {Natural Questions: A Benchmark for Question Answering Research},
author = {Kwiatkowski, Tom and Palomaki, Jennimaria and Redfield, Olivia and Collins, Michael and Parikh, Ankur and Alberti, Chris and Epstein, Danielle and Polosukhin, Illia and Devlin, Jacob and Lee, Kenton and Toutanova, Kristina and Jones, Llion and Kelcey, Matthew and Chang, Ming-Wei and Dai, Andrew M. and Uszkoreit, Jakob and Le, Quoc and Petrov, Slav},
year = 2019,
journal = {Transactions of the Association for Computational Linguistics},
publisher = {MIT Press},
address = {Cambridge, MA},
volume = 7,
pages = {452--466},
doi = {10.1162/tacl_a_00276},
url = {https://aclanthology.org/Q19-1026},
abstract = {We present the Natural Questions corpus, a question answering data set. Questions consist of real anonymized, aggregated queries issued to the Google search engine. An annotator is presented with a question along with a Wikipedia page from the top 5 search results, and annotates a long answer (typically a paragraph) and a short answer (one or more entities) if present on the page, or marks null if no long/short answer is present. The public release consists of 307,373 training examples with single annotations; 7,830 examples with 5-way annotations for development data; and a further 7,842 examples with 5-way annotated sequestered as test data. We present experiments validating quality of the data. We also describe analysis of 25-way annotations on 302 examples, giving insights into human variability on the annotation task. We introduce robust metrics for the purposes of evaluating question answering systems; demonstrate high human upper bounds on these metrics; and establish baseline results using competitive methods drawn from related literature.}
}
@misc{brown2020language,
title = {Language Models are Few-Shot Learners},
author = {Tom B. Brown and Benjamin Mann and Nick Ryder and Melanie Subbiah and Jared Kaplan and Prafulla Dhariwal and Arvind Neelakantan and Pranav Shyam and Girish Sastry and Amanda Askell and Sandhini Agarwal and Ariel Herbert{-}Voss and Gretchen Krueger and Tom Henighan and Rewon Child and Aditya Ramesh and Daniel M. Ziegler and Jeffrey Wu and Clemens Winter and Christopher Hesse and Mark Chen and Eric Sigler and Mateusz Litwin and Scott Gray and Benjamin Chess and Jack Clark and Christopher Berner and Sam McCandlish and Alec Radford and Ilya Sutskever and Dario Amodei},
year = 2020,
url = {https://arxiv.org/abs/2005.14165}
}
@misc{yasunaga2022linkbert,
title = {LinkBERT: Pretraining Language Models with Document Links},
author = {Yasunaga, Michihiro and Leskovec, Jure and Liang, Percy},
year = 2022,
keywords = {Computation and Language (cs.CL), Machine Learning (cs.LG), FOS: Computer and information sciences, FOS: Computer and information sciences},
abstract = {Language model (LM) pretraining captures various knowledge from text corpora, helping downstream tasks. However, existing methods such as BERT model a single document, and do not capture dependencies or knowledge that span across documents. In this work, we propose LinkBERT, an LM pretraining method that leverages links between documents, e.g., hyperlinks. Given a text corpus, we view it as a graph of documents and create LM inputs by placing linked documents in the same context. We then pretrain the LM with two joint self-supervised objectives: masked language modeling and our new proposal, document relation prediction. We show that LinkBERT outperforms BERT on various downstream tasks across two domains: the general domain (pretrained on Wikipedia with hyperlinks) and biomedical domain (pretrained on PubMed with citation links). LinkBERT is especially effective for multi-hop reasoning and few-shot QA (+5{\%} absolute improvement on HotpotQA and TriviaQA), and our biomedical LinkBERT sets new states of the art on various BioNLP tasks (+7{\%} on BioASQ and USMLE). We release our pretrained models, LinkBERT and BioLinkBERT, as well as code and data.}
}
@misc{roberts2020knowledge,
title = {How Much Knowledge Can You Pack Into the Parameters of a Language Model?},
author = {Roberts, Adam and Raffel, Colin and Shazeer, Noam},
year = 2020,
keywords = {Computation and Language (cs.CL), Machine Learning (cs.LG), Machine Learning (stat.ML), FOS: Computer and information sciences, FOS: Computer and information sciences}
}
@inproceedings{auer2007dbpedia,
author = {Auer, S\"{o}ren and Bizer, Christian and Kobilarov, Georgi and Lehmann, Jens and Cyganiak, Richard and Ives, Zachary},
title = {DBpedia: A Nucleus for a Web of Open Data},
year = 2007,
isbn = 3540762973,
publisher = {Springer-Verlag},
address = {Berlin, Heidelberg},
abstract = {DBpedia is a community effort to extract structured information from Wikipedia and to make this information available on the Web. DBpedia allows you to ask sophisticated queries against datasets derived from Wikipedia and to link other datasets on the Web to Wikipedia data. We describe the extraction of the DBpedia datasets, and how the resulting information is published on the Web for human-andmachine-consumption. We describe some emerging applications from the DBpedia community and show how website authors can facilitate DBpedia content within their sites. Finally, we present the current status of interlinking DBpedia with other open datasets on the Web and outline how DBpedia could serve as a nucleus for an emerging Web of open data.},
booktitle = {Proceedings of the 6th International The Semantic Web and 2nd Asian Conference on Asian Semantic Web Conference},
pages = {722--735},
numpages = 14,
location = {Busan, Korea},
series = {ISWC'07/ASWC'07}
}
@article{10.1145/2629489,
author = {Vrande\v{c}i\'{c}, Denny and Kr\"{o}tzsch, Markus},
title = {Wikidata: A Free Collaborative Knowledgebase},
year = 2014,
issue_date = {October 2014},
publisher = {Association for Computing Machinery},
address = {New York, NY, USA},
volume = 57,
number = 10,
issn = {0001-0782},
url = {https://doi.org/10.1145/2629489},
doi = {10.1145/2629489},
abstract = {This collaboratively edited knowledgebase provides a common source of data for Wikipedia, and everyone else.},
journal = {Commun. ACM},
month = {sep},
pages = {78--85},
numpages = 8
}
@misc{https://doi.org/10.48550/arxiv.1508.05326,
doi = {10.48550/ARXIV.1508.05326},
url = {https://arxiv.org/abs/1508.05326},
author = {Bowman, Samuel R. and Angeli, Gabor and Potts, Christopher and Manning, Christopher D.},
keywords = {Computation and Language (cs.CL), FOS: Computer and information sciences, FOS: Computer and information sciences},
title = {A large annotated corpus for learning natural language inference},
year = 2015
}
@misc{https://doi.org/10.48550/arxiv.1704.05179,
doi = {10.48550/ARXIV.1704.05179},
url = {https://arxiv.org/abs/1704.05179},
author = {Dunn, Matthew and Sagun, Levent and Higgins, Mike and Guney, V. Ugur and Cirik, Volkan and Cho, Kyunghyun},
keywords = {Computation and Language (cs.CL), FOS: Computer and information sciences, FOS: Computer and information sciences},
title = {SearchQA: A New Q \& A Dataset Augmented with Context from a Search Engine},
year = 2017
}
@misc{zouhar-artefact-retrieval,
author = {Zouhar, Vilém and Mosbach, Marius and Biswas, Debanjali and Klakow, Dietrich},
keywords = {Computation and Language (cs.CL), Information Retrieval (cs.IR), FOS: Computer and information sciences, FOS: Computer and information sciences},
title = {Artefact Retrieval: Overview of NLP Models with Knowledge Base Access}
}
@incollection{hrkut2020data,
author = {Hrkút, Patrik and Toth, Štefan and Ďuračík, Michal and Meško, Matej and Krsak, Emil and Mikušová, Miroslava},
year = 2020,
month = {03},
pages = {60--70},
title = {Data Collection for Natural Language Processing Systems},
isbn = {978-981-15-3379-2},
doi = {10.1007/978-981-15-3380-8_6}
}
@article{Chiche2022,
author = {Chiche, Alebachew and Yitagesu, Betselot},
title = {Part of speech tagging: a systematic review of deep learning and machine learning approaches},
journal = {Journal of Big Data},
year = 2022,
month = {Jan},
day = 24,
volume = 9,
number = 1,
pages = 10,
abstract = {Natural language processing (NLP) tools have sparked a great deal of interest due to rapid improvements in information and communications technologies. As a result, many different NLP tools are being produced. However, there are many challenges for developing efficient and effective NLP tools that accurately process natural languages. One such tool is part of speech (POS) tagging, which tags a particular sentence or words in a paragraph by looking at the context of the sentence/words inside the paragraph. Despite enormous efforts by researchers, POS tagging still faces challenges in improving accuracy while reducing false-positive rates and in tagging unknown words. Furthermore, the presence of ambiguity when tagging terms with different contextual meanings inside a sentence cannot be overlooked. Recently, Deep learning (DL) and Machine learning (ML)-based POS taggers are being implemented as potential solutions to efficiently identify words in a given sentence across a paragraph. This article first clarifies the concept of part of speech POS tagging. It then provides the broad categorization based on the famous ML and DL techniques employed in designing and implementing part of speech taggers. A comprehensive review of the latest POS tagging articles is provided by discussing the weakness and strengths of the proposed approaches. Then, recent trends and advancements of DL and ML-based part-of-speech-taggers are presented in terms of the proposed approaches deployed and their performance evaluation metrics. Using the limitations of the proposed approaches, we emphasized various research gaps and presented future recommendations for the research in advancing DL and ML-based POS tagging.},
}
@inproceedings{thakur-etal-2021-augmented,
title = {Augmented {SBERT}: Data Augmentation Method for Improving Bi-Encoders for Pairwise Sentence Scoring Tasks},
author = {Thakur, Nandan and Reimers, Nils and Daxenberger, Johannes and Gurevych, Iryna},
booktitle = {Proceedings of the 2021 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies},
month = 6,
year = 2021,
address = {Online},
publisher = {Association for Computational Linguistics},
url = {https://arxiv.org/abs/2010.08240},
pages = {296--310}
}
@misc{https://doi.org/10.48550/arxiv.2006.03654,
doi = {10.48550/ARXIV.2006.03654},
url = {https://arxiv.org/abs/2006.03654},
author = {He, Pengcheng and Liu, Xiaodong and Gao, Jianfeng and Chen, Weizhu},
keywords = {Computation and Language (cs.CL), Machine Learning (cs.LG), FOS: Computer and information sciences, FOS: Computer and information sciences, I.2; I.7, cs.CL, cs.GL},
title = {DeBERTa: Decoding-enhanced BERT with Disentangled Attention},
year = 2020
}
@inproceedings{pos-tagging-efficiency,
title = {Tagging Efficiency Analysis on Part of Speech Taggers},
doi = {10.1109/ICIT.2017.57}
}
@article{2019t5,
author = {Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu},
title = {Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer},