main
Thamognya Kodi 2022-12-25 11:30:03 +07:00
parent eef7dfce56
commit 0c12e6b0ff
5 changed files with 285 additions and 3 deletions

View File

@ -22,7 +22,7 @@ As a result of these models, datasets, and Internet-NLP, the accuracy and reliab
Internet-NLP and the new NLP and NLI models, which were trained on the general-purpose datasets (ALotNLI and ALotOpenBookQA). Internet-NLP, by default, utilizes a Text-Generative model, GPT-NeoX \cite{gpt-neox-library, gpt-neox-20b}, for long responses and LinkBERT \cite{yasunaga-etal-2022-linkbert} for short responses. For two choices (e.g., True and False) a Bi-Encoder NLI model is used, and for multiple choices a Cross-Encoder is used \cite{thakur-2020-AugSBERT}.
Internet-NLP, in layperson's terms, provides the context that context-needing NLP models require in order to function. Internet-NLP can be improved via fine-tuning and via training of an LSTM and a Reinforcement Learning model (which can be trained alongside the NLP model), which enables better search queries and, subsequently, better results. It obtains state-of-the-art results in QA and NLI without context.
Internet-NLP, in layperson's terms, provides the context that context-needing NLP models require in order to function. Internet-NLP can be improved via fine-tuning and via training of an LSTM and a Reinforcement Learning model (which can be trained alongside the NLP model), which enables better search queries and, subsequently, better results. It obtains state-of-the-art (SOTA) results in QA and NLI without context.
Internet-NLP is a subset of a larger package, Internet-ML, and is open-source.\footnote{Internet-NLP, a subset of Internet-ML, is public and open-source: \url{https://github.com/thamognya/internet_ml}.\label{footnote:code}}
Old versions of Internet-NLP are also publicly available.\footnote{Old versions of Internet-NLP are public: \url{https://pypi.org/project/internet-nlp/}.\label{footnote:code-old}}
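
To make the model-dispatch behaviour described above concrete, here is a minimal, illustrative Python sketch (not the actual internet_ml API): a generative model such as GPT-NeoX handles long responses, an encoder such as LinkBERT handles short extractive answers, and a cross-encoder NLI model scores choice-style tasks. The function names and Hugging Face checkpoints used here (`EleutherAI/gpt-neox-20b`, `michiyasunaga/LinkBERT-large`, `cross-encoder/nli-deberta-v3-base`) are assumptions chosen for illustration.

```python
# Illustrative sketch only -- NOT the internet_ml API.
# Checkpoint names are assumptions chosen for illustration.
from transformers import pipeline
from sentence_transformers import CrossEncoder

def answer(question: str, context: str, mode: str = "short") -> str:
    """Dispatch to a model family depending on the kind of answer wanted."""
    if mode == "long":
        # Long, free-form responses: a large text-generation model (GPT-NeoX here;
        # a smaller stand-in such as EleutherAI/gpt-neo-1.3B also works for testing).
        generator = pipeline("text-generation", model="EleutherAI/gpt-neox-20b")
        prompt = f"{context}\nQuestion: {question}\nAnswer:"
        return generator(prompt, max_new_tokens=64)[0]["generated_text"]
    if mode == "short":
        # Short, extractive answers: an encoder in the LinkBERT family
        # (assumes a checkpoint with a question-answering head is available).
        extractor = pipeline("question-answering", model="michiyasunaga/LinkBERT-large")
        return extractor(question=question, context=context)["answer"]
    raise ValueError(f"unknown mode: {mode}")

def nli_scores(premise: str, hypothesis: str):
    # Choice-style tasks (true/false, multiple choice) can be reduced to NLI:
    # a cross-encoder scores the (premise, hypothesis) pair; the highest score
    # indexes the predicted NLI label (see the model card for the label order).
    scorer = CrossEncoder("cross-encoder/nli-deberta-v3-base")
    return scorer.predict([(premise, hypothesis)])[0]
```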

View File

@ -15,11 +15,12 @@ There are currently two main solutions for utilizing NLP tasks with no context p
\caption{This is an illustration of the architecture of T5, a popular Text2Text-Generation model \cite{alammar}.}
\label{fig:CurrSolOneSecondImg}
\end{figure}
\item Pre-trained Text2Text-generation models, like T5 \cite{https://doi.org/10.48550/arxiv.1910.10683}, have open-domain question-answering closed-book (no context) language model (ODQA LM) capabilities \cite{weng2020odqa}. These closed-book ODQA LMs achieve comparatively state-of-the-art performance in many no-context NLP tasks, mainly question answering; however, Text2Text-generation models for such no-context NLP tasks are usually large, slow, and have low accuracy \cite{https://doi.org/10.48550/arxiv.2002.08910}.
\item Pre-trained Text-generation models, like GPT-NeoX, GPT-3, etc. \cite{gpt-neox-20b,gpt-neox-library,DBLP:journals/corr/abs-2005-14165}, can be trained for open-domain question-answering closed-book language model (ODQA LM) tasks \cite{weng2020odqa}. When used for ODQA tasks, they achieve SOTA results, have high accuracy, and are fast, but they are much larger in size than open-book (context-needing) language models.
\item Additionally, pre-trained Text2Text-generation models, like T5 \cite{https://doi.org/10.48550/arxiv.1910.10683}, have open-domain question-answering closed-book (no context) language model (ODQA LM) capabilities \cite{weng2020odqa}. These closed-book ODQA LMs achieve comparatively state-of-the-art performance in many no-context NLP tasks, mainly question answering; however, Text2Text-generation models for such no-context NLP tasks are usually large, slow, and have low accuracy \cite{DBLP:journals/corr/abs-2002-08910}. (A minimal closed-book QA sketch follows this excerpt.)
\item Example: T5 \cite{https://doi.org/10.48550/arxiv.1910.10683}
\item Illustration of how ODQA LMs work: \ref{fig:CurrSolOneImg}
\end{itemize}
\item Large Knowledge Base with a Context-Needing Language Model \label{CurrSolTwo}
\item Large Knowledge Database with a Context-Needing Language Model \label{CurrSolTwo}
\begin{itemize}[leftmargin=1em]
\begin{figure}
\includegraphics[width=1.0\columnwidth]{artefacts_diagram.pdf}
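
As a rough illustration of the first solution above, the sketch below queries a Text2Text model in closed-book fashion, i.e. without supplying any context, so the answer must come entirely from the model's parameters. The checkpoint name `google/t5-small-ssm-nq` (a T5 variant fine-tuned for closed-book QA in the spirit of Roberts et al.) is an assumption; any closed-book Text2Text-generation model could be substituted.

```python
# Closed-book (no-context) QA with a Text2Text-generation model.
# The checkpoint name is an assumption chosen for illustration.
from transformers import pipeline

closed_book_qa = pipeline("text2text-generation", model="google/t5-small-ssm-nq")

question = "When was Franklin D. Roosevelt born?"
# No context is passed: the model must answer from what is stored in its parameters.
result = closed_book_qa(question, max_new_tokens=16)
print(result[0]["generated_text"])
```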

Binary file not shown.

View File

@ -156,3 +156,284 @@
journal = {The Illustrated Transformer Jay Alammar Visualizing machine learning one concept at a time.},
author = {Alammar, Jay}
}
@article{DBLP:journals/corr/abs-2005-14165,
author = {Tom B. Brown and Benjamin Mann and Nick Ryder and Melanie Subbiah and Jared Kaplan and Prafulla Dhariwal and Arvind Neelakantan and Pranav Shyam and Girish Sastry and Amanda Askell and Sandhini Agarwal and Ariel Herbert{-}Voss and Gretchen Krueger and Tom Henighan and Rewon Child and Aditya Ramesh and Daniel M. Ziegler and Jeffrey Wu and Clemens Winter and Christopher Hesse and Mark Chen and Eric Sigler and Mateusz Litwin and Scott Gray and Benjamin Chess and Jack Clark and Christopher Berner and Sam McCandlish and Alec Radford and Ilya Sutskever and Dario Amodei},
title = {Language Models are Few-Shot Learners},
journal = {CoRR},
volume = {abs/2005.14165},
year = 2020,
url = {https://arxiv.org/abs/2005.14165},
eprinttype = {arXiv},
eprint = {2005.14165},
timestamp = {Wed, 03 Jun 2020 11:36:54 +0200},
biburl = {https://dblp.org/rec/journals/corr/abs-2005-14165.bib},
bibsource = {dblp computer science bibliography, https://dblp.org}
}
@article{weng2020odqa,
title = {How to Build an Open-Domain Question Answering System?},
author = {Weng, Lilian},
journal = {lilianweng.github.io},
year = 2020,
month = {Oct},
url = {https://lilianweng.github.io/posts/2020-10-29-odqa/}
}
@article{DBLP:journals/corr/abs-2002-08910,
author = {Adam Roberts and Colin Raffel and Noam Shazeer},
title = {How Much Knowledge Can You Pack Into the Parameters of a Language Model?},
journal = {CoRR},
volume = {abs/2002.08910},
year = 2020,
url = {https://arxiv.org/abs/2002.08910},
eprinttype = {arXiv},
eprint = {2002.08910},
timestamp = {Mon, 02 Mar 2020 16:46:06 +0100},
biburl = {https://dblp.org/rec/journals/corr/abs-2002-08910.bib},
bibsource = {dblp computer science bibliography, https://dblp.org}
}
@misc{https://doi.org/10.48550/arxiv.1810.04805,
doi = {10.48550/ARXIV.1810.04805},
url = {https://arxiv.org/abs/1810.04805},
author = {Devlin, Jacob and Chang, Ming-Wei and Lee, Kenton and Toutanova, Kristina},
keywords = {Computation and Language (cs.CL), FOS: Computer and information sciences, FOS: Computer and information sciences},
title = {BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding},
publisher = {arXiv},
year = 2018,
copyright = {arXiv.org perpetual, non-exclusive license}
}
@misc{https://doi.org/10.48550/arxiv.2203.15827,
doi = {10.48550/ARXIV.2203.15827},
url = {https://arxiv.org/abs/2203.15827},
author = {Yasunaga, Michihiro and Leskovec, Jure and Liang, Percy},
keywords = {Computation and Language (cs.CL), Machine Learning (cs.LG), FOS: Computer and information sciences, FOS: Computer and information sciences},
title = {LinkBERT: Pretraining Language Models with Document Links},
publisher = {arXiv},
year = 2022,
copyright = {Creative Commons Attribution 4.0 International}
}
@misc{https://doi.org/10.48550/arxiv.1908.10084,
doi = {10.48550/ARXIV.1908.10084},
url = {https://arxiv.org/abs/1908.10084},
author = {Reimers, Nils and Gurevych, Iryna},
keywords = {Computation and Language (cs.CL), FOS: Computer and information sciences, FOS: Computer and information sciences},
title = {Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks},
publisher = {arXiv},
year = 2019,
copyright = {Creative Commons Attribution Share Alike 4.0 International}
}
@misc{https://doi.org/10.48550/arxiv.1910.10683,
doi = {10.48550/ARXIV.1910.10683},
url = {https://arxiv.org/abs/1910.10683},
author = {Raffel, Colin and Shazeer, Noam and Roberts, Adam and Lee, Katherine and Narang, Sharan and Matena, Michael and Zhou, Yanqi and Li, Wei and Liu, Peter J.},
keywords = {Machine Learning (cs.LG), Computation and Language (cs.CL), Machine Learning (stat.ML), FOS: Computer and information sciences, FOS: Computer and information sciences},
title = {Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer},
publisher = {arXiv},
year = 2019,
copyright = {arXiv.org perpetual, non-exclusive license}
}
@misc{https://doi.org/10.48550/arxiv.1809.02789,
doi = {10.48550/ARXIV.1809.02789},
url = {https://arxiv.org/abs/1809.02789},
author = {Mihaylov, Todor and Clark, Peter and Khot, Tushar and Sabharwal, Ashish},
keywords = {Computation and Language (cs.CL), FOS: Computer and information sciences, FOS: Computer and information sciences},
title = {Can a Suit of Armor Conduct Electricity? A New Dataset for Open Book Question Answering},
publisher = {arXiv},
year = 2018,
copyright = {arXiv.org perpetual, non-exclusive license}
}
@article{Bartolo_2020,
doi = {10.1162/tacl_a_00338},
url = {https://doi.org/10.1162%2Ftacl_a_00338},
year = 2020,
month = {dec},
publisher = {{MIT} Press - Journals},
volume = 8,
pages = {662--678},
author = {Max Bartolo and Alastair Roberts and Johannes Welbl and Sebastian Riedel and Pontus Stenetorp},
title = {Beat the {AI}: Investigating Adversarial Human Annotation for Reading Comprehension},
journal = {Transactions of the Association for Computational Linguistics}
}
@misc{https://doi.org/10.48550/arxiv.2002.08910,
doi = {10.48550/ARXIV.2002.08910},
url = {https://arxiv.org/abs/2002.08910},
author = {Roberts, Adam and Raffel, Colin and Shazeer, Noam},
keywords = {Computation and Language (cs.CL), Machine Learning (cs.LG), Machine Learning (stat.ML), FOS: Computer and information sciences, FOS: Computer and information sciences},
title = {How Much Knowledge Can You Pack Into the Parameters of a Language Model?},
publisher = {arXiv},
year = 2020,
copyright = {arXiv.org perpetual, non-exclusive license}
}
@inproceedings{10.5555/1785162.1785216,
author = {Auer, S\"{o}ren and Bizer, Christian and Kobilarov, Georgi and Lehmann, Jens and Cyganiak, Richard and Ives, Zachary},
title = {DBpedia: A Nucleus for a Web of Open Data},
year = 2007,
isbn = 3540762973,
publisher = {Springer-Verlag},
address = {Berlin, Heidelberg},
abstract = {DBpedia is a community effort to extract structured information from Wikipedia and to make this information available on the Web. DBpedia allows you to ask sophisticated queries against datasets derived from Wikipedia and to link other datasets on the Web to Wikipedia data. We describe the extraction of the DBpedia datasets, and how the resulting information is published on the Web for human- and machine-consumption. We describe some emerging applications from the DBpedia community and show how website authors can facilitate DBpedia content within their sites. Finally, we present the current status of interlinking DBpedia with other open datasets on the Web and outline how DBpedia could serve as a nucleus for an emerging Web of open data.},
booktitle = {Proceedings of the 6th International The Semantic Web and 2nd Asian Conference on Asian Semantic Web Conference},
pages = {722--735},
numpages = 14,
location = {Busan, Korea},
series = {ISWC'07/ASWC'07}
}
@article{10.1145/2629489,
author = {Vrande\v{c}i\'{c}, Denny and Kr\"{o}tzsch, Markus},
title = {Wikidata: A Free Collaborative Knowledgebase},
year = 2014,
issue_date = {October 2014},
publisher = {Association for Computing Machinery},
address = {New York, NY, USA},
volume = 57,
number = 10,
issn = {0001-0782},
url = {https://doi.org/10.1145/2629489},
doi = {10.1145/2629489},
abstract = {This collaboratively edited knowledgebase provides a common source of data for Wikipedia, and everyone else.},
journal = {Commun. ACM},
month = {sep},
pages = {78--85},
numpages = 8
}
@misc{https://doi.org/10.48550/arxiv.1508.05326,
doi = {10.48550/ARXIV.1508.05326},
url = {https://arxiv.org/abs/1508.05326},
author = {Bowman, Samuel R. and Angeli, Gabor and Potts, Christopher and Manning, Christopher D.},
keywords = {Computation and Language (cs.CL), FOS: Computer and information sciences, FOS: Computer and information sciences},
title = {A large annotated corpus for learning natural language inference},
publisher = {arXiv},
year = 2015,
copyright = {arXiv.org perpetual, non-exclusive license}
}
@misc{https://doi.org/10.48550/arxiv.1910.14599,
doi = {10.48550/ARXIV.1910.14599},
url = {https://arxiv.org/abs/1910.14599},
author = {Nie, Yixin and Williams, Adina and Dinan, Emily and Bansal, Mohit and Weston, Jason and Kiela, Douwe},
keywords = {Computation and Language (cs.CL), Machine Learning (cs.LG), FOS: Computer and information sciences, FOS: Computer and information sciences},
title = {Adversarial NLI: A New Benchmark for Natural Language Understanding},
publisher = {arXiv},
year = 2019,
copyright = {arXiv.org perpetual, non-exclusive license}
}
@inproceedings{N18-1101,
author = {Williams, Adina and Nangia, Nikita and Bowman, Samuel},
title = {A Broad-Coverage Challenge Corpus for Sentence Understanding through Inference},
booktitle = {Proceedings of the 2018 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long Papers)},
year = 2018,
publisher = {Association for Computational Linguistics},
pages = {1112--1122},
location = {New Orleans, Louisiana},
url = {http://aclweb.org/anthology/N18-1101}
}
@article{47761,
title = {Natural Questions: a Benchmark for Question Answering Research},
author = {Tom Kwiatkowski and Jennimaria Palomaki and Olivia Redfield and Michael Collins and Ankur Parikh and Chris Alberti and Danielle Epstein and Illia Polosukhin and Matthew Kelcey and Jacob Devlin and Kenton Lee and Kristina N. Toutanova and Llion Jones and Ming-Wei Chang and Andrew Dai and Jakob Uszkoreit and Quoc Le and Slav Petrov},
year = 2019,
journal = {Transactions of the Association of Computational Linguistics}
}
@misc{https://doi.org/10.48550/arxiv.1704.05179,
doi = {10.48550/ARXIV.1704.05179},
url = {https://arxiv.org/abs/1704.05179},
author = {Dunn, Matthew and Sagun, Levent and Higgins, Mike and Guney, V. Ugur and Cirik, Volkan and Cho, Kyunghyun},
keywords = {Computation and Language (cs.CL), FOS: Computer and information sciences, FOS: Computer and information sciences},
title = {SearchQA: A New Q \& A Dataset Augmented with Context from a Search Engine},
publisher = {arXiv},
year = 2017,
copyright = {arXiv.org perpetual, non-exclusive license}
}
@misc{https://doi.org/10.48550/arxiv.1705.03551,
doi = {10.48550/ARXIV.1705.03551},
url = {https://arxiv.org/abs/1705.03551},
author = {Joshi, Mandar and Choi, Eunsol and Weld, Daniel S. and Zettlemoyer, Luke},
keywords = {Computation and Language (cs.CL), FOS: Computer and information sciences, FOS: Computer and information sciences},
title = {TriviaQA: A Large Scale Distantly Supervised Challenge Dataset for Reading Comprehension},
publisher = {arXiv},
year = 2017,
copyright = {arXiv.org perpetual, non-exclusive license}
}
@misc{https://doi.org/10.48550/arxiv.2201.09651,
doi = {10.48550/ARXIV.2201.09651},
url = {https://arxiv.org/abs/2201.09651},
author = {Zouhar, Vilém and Mosbach, Marius and Biswas, Debanjali and Klakow, Dietrich},
keywords = {Computation and Language (cs.CL), Information Retrieval (cs.IR), FOS: Computer and information sciences, FOS: Computer and information sciences},
title = {Artefact Retrieval: Overview of NLP Models with Knowledge Base Access},
publisher = {arXiv},
year = 2022,
copyright = {Creative Commons Attribution Share Alike 4.0 International}
}
@inbook{inbook,
author = {Hrkút, Patrik and Toth, Štefan and Ďuračík, Michal and Meško, Matej and Krsak, Emil and Mikušová, Miroslava},
year = 2020,
month = {03},
pages = {60--70},
title = {Data Collection for Natural Language Processing Systems},
isbn = {978-981-15-3379-2},
doi = {10.1007/978-981-15-3380-8_6}
}
@article{Chiche2022,
author = {Chiche, Alebachew and Yitagesu, Betselot},
title = {Part of speech tagging: a systematic review of deep learning and machine learning approaches},
journal = {Journal of Big Data},
year = 2022,
month = {Jan},
day = 24,
volume = 9,
number = 1,
pages = 10,
abstract = {Natural language processing (NLP) tools have sparked a great deal of interest due to rapid improvements in information and communications technologies. As a result, many different NLP tools are being produced. However, there are many challenges for developing efficient and effective NLP tools that accurately process natural languages. One such tool is part of speech (POS) tagging, which tags a particular sentence or words in a paragraph by looking at the context of the sentence/words inside the paragraph. Despite enormous efforts by researchers, POS tagging still faces challenges in improving accuracy while reducing false-positive rates and in tagging unknown words. Furthermore, the presence of ambiguity when tagging terms with different contextual meanings inside a sentence cannot be overlooked. Recently, Deep learning (DL) and Machine learning (ML)-based POS taggers are being implemented as potential solutions to efficiently identify words in a given sentence across a paragraph. This article first clarifies the concept of part of speech POS tagging. It then provides the broad categorization based on the famous ML and DL techniques employed in designing and implementing part of speech taggers. A comprehensive review of the latest POS tagging articles is provided by discussing the weakness and strengths of the proposed approaches. Then, recent trends and advancements of DL and ML-based part-of-speech-taggers are presented in terms of the proposed approaches deployed and their performance evaluation metrics. Using the limitations of the proposed approaches, we emphasized various research gaps and presented future recommendations for the research in advancing DL and ML-based POS tagging.},
issn = {2196-1115},
doi = {10.1186/s40537-022-00561-y},
url = {https://doi.org/10.1186/s40537-022-00561-y}
}
@inproceedings{thakur-2020-AugSBERT,
title = {Augmented {SBERT}: Data Augmentation Method for Improving Bi-Encoders for Pairwise Sentence Scoring Tasks},
author = {Thakur, Nandan and Reimers, Nils and Daxenberger, Johannes and Gurevych, Iryna},
booktitle = {Proceedings of the 2021 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies},
month = 6,
year = 2021,
address = {Online},
publisher = {Association for Computational Linguistics},
url = {https://arxiv.org/abs/2010.08240},
pages = {296--310}
}
@misc{https://doi.org/10.48550/arxiv.2006.03654,
doi = {10.48550/ARXIV.2006.03654},
url = {https://arxiv.org/abs/2006.03654},
author = {He, Pengcheng and Liu, Xiaodong and Gao, Jianfeng and Chen, Weizhu},
keywords = {Computation and Language (cs.CL), Machine Learning (cs.LG), FOS: Computer and information sciences, FOS: Computer and information sciences, I.2; I.7, cs.CL, cs.GL},
title = {DeBERTa: Decoding-enhanced BERT with Disentangled Attention},
publisher = {arXiv},
year = 2020,
copyright = {arXiv.org perpetual, non-exclusive license}
}
@misc{FormalInformal,
title = {Formal and Informal Style},
url = {https://www.niu.edu/writingtutorial/style/formal-and-informal-style.shtml}
}
@misc{BetterWebSearches,
title = {Refine web searches},
url = {https://support.google.com/websearch/answer/2466433}
}
@inproceedings{inproceedings,
author = {Banga, Ritu and Mehndiratta, Pulkit},
year = 2017,
month = 12,
pages = {264--267},
title = {Tagging Efficiency Analysis on Part of Speech Taggers},
doi = {10.1109/ICIT.2017.57}
}
@article{2019t5,
author = {Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu},
title = {Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer},
journal = {arXiv e-prints},
year = 2019,
archiveprefix = {arXiv},
eprint = {1910.10683}
}