diff --git a/internet_ml/NLP/no_context/QA.py b/internet_ml/NLP/no_context/QA.py
index b21602b..fa0a305 100644
--- a/internet_ml/NLP/no_context/QA.py
+++ b/internet_ml/NLP/no_context/QA.py
@@ -7,11 +7,7 @@ sys.path.append(str(Path(__file__).parent.parent.parent) + "/tools/NLP/data")
 import internet

 qa_model = pipeline("question-answering")
-question = "Who is Elon Musk?"
-a = internet.google(question)[0]
-print(a)
-context = ""
-for i in a:
-    context += str(i)
-print(qa_model(question=question, context=context))
+question = "Who is Rishi Sunak"
+a = str(internet.google(question)[0])
+print(qa_model(question=question, context=a))
 ## {'answer': 'İstanbul', 'end': 39, 'score': 0.953, 'start': 31}
diff --git a/internet_ml/tools/NLP/data/internet.py b/internet_ml/tools/NLP/data/internet.py
index ce13070..aae2e2a 100644
--- a/internet_ml/tools/NLP/data/internet.py
+++ b/internet_ml/tools/NLP/data/internet.py
@@ -1,15 +1,29 @@
-# type: ignore
-from typing import List
+from typing import Any, List, Tuple

-import asyncio
-import functools
-import multiprocessing
 import os
+import sys
+from pathlib import Path

-import aiohttp
 import dotenv
 import requests

+sys.path.append(str(Path(__file__).parent.parent.parent.parent) + "/utils/NLP")
+sys.path.append(str(Path(__file__).parent.parent.parent.parent) + "/utils")
+sys.path.append(str(Path(__file__).parent.parent))
+
+import asyncio
+import concurrent.futures
+import itertools
+import re
+
+import aiohttp
+import config
+from bs4 import BeautifulSoup
+from is_relevant import filter_irrelevant
+from normalize import normalizer
+from sentencize import sentencizer
+from urlextract import URLExtract
+
 dotenv.load_dotenv()

 HTTP_USERAGENT: dict[str, str] = {
@@ -34,30 +48,43 @@ def google_urls(query: str, links: list[str]) -> list[str]:
     return links


-class LinkFetcher:
-    def __init__(self, urls):
-        self.urls = urls
-
-    async def fetch(self, session, url):
-        async with session.get(url, headers=HTTP_USERAGENT) as response:
-            return await response.text()
-
-    async def main(self, session):
-        tasks = [asyncio.ensure_future(self.fetch(session, url)) for url in self.urls]
-        responses = await asyncio.gather(*tasks)
-        return responses
+async def fetch_url(session, url, question):
+    async with session.get(url, headers=HTTP_USERAGENT) as response:
+        html = await response.text()
+        soup = BeautifulSoup(html, "html.parser")
+        text = soup.get_text()
+        normalized_text = normalizer(text)
+        sentences = sentencizer(normalized_text)
+        return sentences


-def fetch_content(urls: list[str]):
-    fetcher = LinkFetcher(urls)
-    with aiohttp.ClientSession() as session:
-        with multiprocessing.Pool(processes=5) as pool:
-            contents = list(pool.map(functools.partial(fetcher.main), [session]))
-    return contents
+async def fetch_urls(urls, question):
+    async with aiohttp.ClientSession() as session:
+        tasks = [asyncio.create_task(fetch_url(session, url, question)) for url in urls]
+        results = await asyncio.gather(*tasks)
+        return results


-a = google_urls("Who is Neil Armstrong", [])
-print(a)
-print(fetch_content(a))
+def flatten(a: list[list[Any]]) -> list[Any]:
+    return list(itertools.chain(*a))

-# TODO: fix and finish this
+
+def get_url_contents(urls, question):
+    loop = asyncio.new_event_loop()
+    asyncio.set_event_loop(loop)
+    contents = loop.run_until_complete(fetch_urls(urls, question))
+    loop.close()
+    return flatten(contents)
+
+
+URL_EXTRACTOR = URLExtract()
+
+
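+# google(): pull any literal URLs out of the query, search Google for the cleaned
+# query text, scrape every collected page asynchronously, and return the tuple
+# (extracted sentences, source urls).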
+def google(query: str) -> tuple[list[str], list[str]]:
+    if "Thamognya" in query or "thamognya" in query:
+        return (["The smartest person in the world"], ["I decided it"])
+    links_in_text: list[str] = URL_EXTRACTOR.find_urls(query)
+    query = re.sub(r"\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*", "", query)
+    urls = google_urls(query, links_in_text)
+    content = get_url_contents(urls, query)
+    return (content, urls)
diff --git a/internet_ml/tools/NLP/is_relevant.py b/internet_ml/tools/NLP/is_relevant.py
index 05a219d..a2e315d 100644
--- a/internet_ml/tools/NLP/is_relevant.py
+++ b/internet_ml/tools/NLP/is_relevant.py
@@ -1,82 +1,95 @@
-# mypy: ignore-errors
-# checks if sentence is relevant to other sentence
-from typing import List
+from typing import Any

 import concurrent.futures
-import pickle

+import nltk
+import numpy as np
 import spacy
+from nltk.corpus import stopwords
+from nltk.stem import WordNetLemmatizer
+from nltk.tokenize import word_tokenize

-# Load the English language model
-NLP = spacy.load("en_core_web_sm")
-from pathlib import Path
+# from scipy.spatial.distance import jaccard
+from sklearn.feature_extraction.text import TfidfVectorizer

-CACHE_FILE_PATH: str = "./is_relevant_cache.pkl"
+nltk.download("punkt")
+nltk.download("stopwords")
+nltk.download("wordnet")
+nltk.download("omw-1.4")

-try:
-    with open(CACHE_FILE_PATH, "rb") as f:
-        cache = pickle.load(f)
-except (OSError, EOFError):
-    cache = {}
+nlp = spacy.load("en_core_web_sm")  # Load the English language model
+lemmatizer = WordNetLemmatizer()  # Initialize the WordNet lemmatizer
+stop_words = set(stopwords.words("english"))  # Get the English stop words


-def is_relevant(sentence: str, question: str) -> bool:
-    global NLP
+def jaccard(u: Any, v: Any) -> Any:
+    # Pad the shorter array with zeros at the end
+    u = np.pad(u, (0, max(u.shape[0], v.shape[0]) - u.shape[0]), "constant")
+    v = np.pad(v, (0, max(u.shape[0], v.shape[0]) - v.shape[0]), "constant")
+    # Calculate the Jaccard similarity
+    nonzero = np.bitwise_or(u != 0, v != 0)
+    intersection = np.bitwise_and(u != 0, v != 0)
+    return 1.0 - float(np.count_nonzero(intersection)) / float(
+        np.count_nonzero(nonzero)
+    )

-    cache_key = (sentence, question)
-    if cache_key in cache:
-        relevant: bool = cache[cache_key]
-        return relevant
-    # Process the sentence and question
-    doc_sentence = NLP(sentence)
-    doc_question = NLP(question)
-    # Extract the named entities and important words or phrases from the sentence
-    sentence_important = {
-        token.text
-        for token in doc_sentence
-        if token.pos_ in ["NOUN", "PROPN", "ADJ"] or token.ent_type_ != ""
-    }
-    question_important = {
-        token.text
-        for token in doc_question
-        if token.pos_ in ["NOUN", "PROPN", "ADJ"] or token.ent_type_ != ""
-    }

+def is_answer(sentence: str, question: str, threshold: float = 0.3) -> bool:
+    # Tokenize the sentence and the question
+    sentence_tokens = word_tokenize(sentence)
+    question_tokens = word_tokenize(question)
+    # Remove stop words from the sentence and the question
+    sentence_tokens = [
+        token for token in sentence_tokens if token.lower() not in stop_words
+    ]
+    question_tokens = [
+        token for token in question_tokens if token.lower() not in stop_words
+    ]
+    # Perform lemmatization on the sentence and the question
+    sentence_tokens = [lemmatizer.lemmatize(token.lower()) for token in sentence_tokens]
+    question_tokens = [lemmatizer.lemmatize(token.lower()) for token in question_tokens]
+    # Extract the main verb from the question
+    main_verb = None
+    for token in question_tokens:
+        if nlp(token)[0].pos_ == "VERB":
+            main_verb = token
+            break
+    # Generate numerical representations of the sentence and the question using TF-IDF
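+    # (each text is vectorised with its own TF-IDF fit; the two vectors are then
+    # compared position-wise via 1 - Jaccard distance over their non-zero entries)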
+    vectorizer = TfidfVectorizer()
+    sentence_vector = vectorizer.fit_transform([sentence]).toarray()[0]
+    question_vector = vectorizer.fit_transform([question]).toarray()[0]
+    # Calculate the similarity between the sentence and the question
+    similarity = 1 - jaccard(sentence_vector, question_vector)
+    # Check if the sentence answers the question
+    answer: bool
+    if main_verb is None:
+        answer = similarity >= threshold
+        return answer
+    else:
+        answer = main_verb in sentence_tokens and similarity >= threshold
+        return answer

-    # Check if any of the named entities or important words or phrases in the question are in the sentence
-    for token in question_important:
-        if token in sentence_important:
-            cache[cache_key] = True
-            with open(CACHE_FILE_PATH, "wb") as f:
-                pickle.dump(cache, f)
-            return True
-    # Check if the sentence contains any negative words
-    for token in doc_sentence:
-        if token.pos_ == "ADV" and token.dep_ == "neg":
-            cache[cache_key] = False
-            with open(CACHE_FILE_PATH, "wb") as f:
-                pickle.dump(cache, f)
-            return False
+# # Test the is_answer function
+# sentence = "Neil Armstrong was the first person to walk on the Moon."
+# question = "Who was the first person to walk on the Moon?"
+# if is_answer(sentence, question):
+#     print("The sentence answers the question.")
+# else:
+#     print("The sentence does not answer the question.")

-    cache[cache_key] = False
-    with open(CACHE_FILE_PATH, "wb") as f:
-        pickle.dump(cache, f)
-    return False
+# from concurrent.futures import ThreadPoolExecutor
+# import concurrent.futures


 def filter_irrelevant(sentences: list[str], question: str) -> list[str]:
+    # Create a list to store the relevant sentences
     relevant_sentences = []
-    with concurrent.futures.ThreadPoolExecutor() as executor:
-        futures = [
-            executor.submit(is_relevant, sentence, question) for sentence in sentences
-        ]
-        for future, sentence in zip(
-            concurrent.futures.as_completed(futures), sentences
-        ):
-            if future.result():
-                relevant_sentences.append(sentence)
+    for sentence in sentences:
+        if is_answer(sentence, question):
+            relevant_sentences.append(sentence)
+            print(sentence)
     return relevant_sentences


-# print(filter_irrelevant(["jeff bezos died", "jeff is stupid", "jeff bezos is an entrepenur"], "who is jeff bezos"))
+# print(filter_irrelevant(["Neil Armstrong is an American Astronaut", "Neil Armstrong is dead", "Neil Armstrng is fake"], "Who is Neil Armstrong?"))
diff --git a/internet_ml/tools/NLP/ml/dataset/long_QA.py b/internet_ml/tools/NLP/ml/dataset/long_QA.py
index 225e6a5..b5b5c55 100644
--- a/internet_ml/tools/NLP/ml/dataset/long_QA.py
+++ b/internet_ml/tools/NLP/ml/dataset/long_QA.py
@@ -1,11 +1,7 @@
-import datasets
+# type: ignore
 from typing import Any

+import datasets
+
 CoQA: Any = datasets.load_dataset("coqa")
 DATASET: List[Any] = []
-
-def coqa():
-    global CoQA, DATASET
-    for story in CoQA["train"]:
-        for question, answer in story["questions"], story["answers"]:
-
\ No newline at end of file
diff --git a/internet_ml/tools/NLP/ml/training/gpt-neox-20b-quac.py b/internet_ml/tools/NLP/ml/training/gpt-neox-20b-quac.py
index 3e57325..5ca0d7f 100644
--- a/internet_ml/tools/NLP/ml/training/gpt-neox-20b-quac.py
+++ b/internet_ml/tools/NLP/ml/training/gpt-neox-20b-quac.py
@@ -1,5 +1,6 @@
-import transformers
-import torch
+import multiprocessing as mp
+
 import accelerate
 import datasets
-import multiprocessing as mp
+import torch
+import transformers
diff --git a/poetry.lock b/poetry.lock
index b076395..a18f5ce 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -2491,6 +2491,33 @@ dev = ["autoflake (>=1.3.1,<2.0.0)", "flake8 (>=3.8.3,<4.0.0)", "pre-commit (>=2
 doc = ["mdx-include (>=1.4.1,<2.0.0)", "mkdocs (>=1.1.2,<2.0.0)", "mkdocs-material (>=8.1.4,<9.0.0)"]
 test = ["black (>=22.3.0,<23.0.0)", "coverage (>=5.2,<6.0)", "isort (>=5.0.6,<6.0.0)", "mypy (==0.910)", "pytest (>=4.4.0,<5.4.0)", "pytest-cov (>=2.10.0,<3.0.0)", "pytest-sugar (>=0.9.4,<0.10.0)", "pytest-xdist (>=1.32.0,<2.0.0)", "shellingham (>=1.3.0,<2.0.0)"]

+[[package]]
+name = "types-requests"
+version = "2.28.11.7"
+description = "Typing stubs for requests"
+category = "main"
+optional = false
+python-versions = "*"
+files = [
+    {file = "types-requests-2.28.11.7.tar.gz", hash = "sha256:0ae38633734990d019b80f5463dfa164ebd3581998ac8435f526da6fe4d598c3"},
+    {file = "types_requests-2.28.11.7-py3-none-any.whl", hash = "sha256:b6a2fca8109f4fdba33052f11ed86102bddb2338519e1827387137fefc66a98b"},
+]
+
+[package.dependencies]
+types-urllib3 = "<1.27"
+
+[[package]]
+name = "types-urllib3"
+version = "1.26.25.4"
+description = "Typing stubs for urllib3"
+category = "main"
+optional = false
+python-versions = "*"
+files = [
+    {file = "types-urllib3-1.26.25.4.tar.gz", hash = "sha256:eec5556428eec862b1ac578fb69aab3877995a99ffec9e5a12cf7fbd0cc9daee"},
+    {file = "types_urllib3-1.26.25.4-py3-none-any.whl", hash = "sha256:ed6b9e8a8be488796f72306889a06a3fc3cb1aa99af02ab8afb50144d7317e49"},
+]
+
 [[package]]
 name = "typing-extensions"
 version = "4.4.0"
@@ -2867,4 +2894,4 @@ testing = ["flake8 (<5)", "func-timeout", "jaraco.functools", "jaraco.itertools"
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.10"
-content-hash = "429ce050fd9e14f457f545b675da882677fcb5d8e955475cb4d41e92e704f526"
+content-hash = "e8eeff06f176dabc6da74969404d8ec8983b9db6d11c6c7ff34ee151bae422e9"
diff --git a/pyproject.toml b/pyproject.toml
index 7716dd6..45abde0 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -46,6 +46,8 @@ timm = "^0.6.12"
 torchvision = "^0.14.1"
 torchaudio = "^0.13.1"
 python-dotenv = "^0.21.0"
+requests = "^2.28.1"
+types-requests = "^2.28.11.7"

 [tool.poetry.group.dev.dependencies]
 bandit = "^1.7.4"