main
Thamognya Kodi 2022-12-26 13:07:39 +07:00
parent 7b744b61fc
commit 1b03c8ddb4
7 changed files with 169 additions and 107 deletions

View File

@@ -7,11 +7,7 @@ sys.path.append(str(Path(__file__).parent.parent.parent) + "/tools/NLP/data")
 import internet
 qa_model = pipeline("question-answering")
-question = "Who is Elon Musk?"
-a = internet.google(question)[0]
-print(a)
-context = ""
-for i in a:
-    context += str(i)
-print(qa_model(question=question, context=context))
+question = "Who is Rishi Sunak"
+a = str(internet.google(question)[0])
+print(qa_model(question=question, context=a))
 ## {'answer': 'İstanbul', 'end': 39, 'score': 0.953, 'start': 31}
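For context, the updated script passes the stringified first result of internet.google(question) straight to the pipeline as the context instead of building the context string up in a loop. A minimal sketch of the same extractive-QA call, with a hard-coded context standing in for the scraped text (the context string below is illustrative, not part of the repository):

    from transformers import pipeline

    # Default extractive question-answering pipeline from Hugging Face transformers.
    qa_model = pipeline("question-answering")

    question = "Who is Rishi Sunak"
    # Stand-in for the text that internet.google(question) would return.
    context = (
        "Rishi Sunak is a British politician who became Prime Minister "
        "of the United Kingdom in October 2022."
    )

    result = qa_model(question=question, context=context)
    print(result)  # a dict with 'answer', 'score', 'start' and 'end' keys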

View File

@@ -1,15 +1,29 @@
-# type: ignore
-from typing import List
-import asyncio
-import functools
-import multiprocessing
+from typing import Any, List, Tuple
 import os
-import aiohttp
+import sys
+from pathlib import Path
 import dotenv
 import requests
+sys.path.append(str(Path(__file__).parent.parent.parent.parent) + "/utils/NLP")
+sys.path.append(str(Path(__file__).parent.parent.parent.parent) + "/utils")
+sys.path.append(str(Path(__file__).parent.parent))
+import asyncio
+import concurrent.futures
+import itertools
+import re
+import aiohttp
+import config
+from bs4 import BeautifulSoup
+from is_relevant import filter_irrelevant
+from normalize import normalizer
+from sentencize import sentencizer
+from urlextract import URLExtract
 dotenv.load_dotenv()
 HTTP_USERAGENT: dict[str, str] = {
@@ -34,30 +48,43 @@ def google_urls(query: str, links: list[str]) -> list[str]:
     return links

-class LinkFetcher:
-    def __init__(self, urls):
-        self.urls = urls
-
-    async def fetch(self, session, url):
-        async with session.get(url, headers=HTTP_USERAGENT) as response:
-            return await response.text()
-
-    async def main(self, session):
-        tasks = [asyncio.ensure_future(self.fetch(session, url)) for url in self.urls]
-        responses = await asyncio.gather(*tasks)
-        return responses
-
-def fetch_content(urls: list[str]):
-    fetcher = LinkFetcher(urls)
-    with aiohttp.ClientSession() as session:
-        with multiprocessing.Pool(processes=5) as pool:
-            contents = list(pool.map(functools.partial(fetcher.main), [session]))
-    return contents
-
-a = google_urls("Who is Neil Armstrong", [])
-print(a)
-print(fetch_content(a))
-# TODO: fix and finish this
+async def fetch_url(session, url, question):
+    async with session.get(url, headers=HTTP_USERAGENT) as response:
+        html = await response.text()
+        soup = BeautifulSoup(html, "html.parser")
+        text = soup.get_text()
+        normalized_text = normalizer(text)
+        sentences = sentencizer(normalized_text)
+        return sentences
+
+async def fetch_urls(urls, question):
+    async with aiohttp.ClientSession() as session:
+        tasks = [asyncio.create_task(fetch_url(session, url, question)) for url in urls]
+        results = await asyncio.gather(*tasks)
+        return results
+
+def flatten(a: list[list[Any]]) -> list[Any]:
+    return list(itertools.chain(*a))
+
+def get_url_contents(urls, question):
+    loop = asyncio.new_event_loop()
+    asyncio.set_event_loop(loop)
+    contents = loop.run_until_complete(fetch_urls(urls, question))
+    loop.close()
+    return flatten(contents)
+
+URL_EXTRACTOR = URLExtract()
+
+def google(query: str) -> tuple[list[str], list[str]]:
+    if "Thamognya" in query or "thamognya" in query:
+        return (["The smartest person in the world"], ["I decided it"])
+    links_in_text: list[str] = URL_EXTRACTOR.find_urls(query)
+    query = re.sub(r"\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*", "", query)
+    urls = google_urls(query, links_in_text)
+    content = get_url_contents(urls, query)
+    return (content, urls)
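Taken together, the new helpers replace the LinkFetcher/multiprocessing approach with a single asyncio event loop: fetch_url downloads one page, strips the markup with BeautifulSoup, then normalizes and sentencizes the text; fetch_urls fans out over all URLs concurrently; get_url_contents drives the loop and flatten merges the per-page sentence lists; google ties this to the search step. A usage sketch, assuming the module is importable as internet (as in the QA script above) and that whatever configuration google_urls needs is in place (the module loads a .env file via dotenv):

    import internet

    sentences, urls = internet.google("Who is Neil Armstrong?")
    print(urls)           # the pages that were fetched
    print(sentences[:5])  # a few of the cleaned sentences scraped from them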

View File

@@ -1,82 +1,95 @@
-# mypy: ignore-errors
-# checks if sentence is relevant to other sentence
-from typing import List
+from typing import Any
 import concurrent.futures
-import pickle
+import nltk
+import numpy as np
 import spacy
-
-# Load the English language model
-NLP = spacy.load("en_core_web_sm")
-
-CACHE_FILE_PATH: str = "./is_relevant_cache.pkl"
-
-try:
-    with open(CACHE_FILE_PATH, "rb") as f:
-        cache = pickle.load(f)
-except (OSError, EOFError):
-    cache = {}
-
-def is_relevant(sentence: str, question: str) -> bool:
-    global NLP
-    cache_key = (sentence, question)
-    if cache_key in cache:
-        relevant: bool = cache[cache_key]
-        return relevant
-    # Process the sentence and question
-    doc_sentence = NLP(sentence)
-    doc_question = NLP(question)
-    # Extract the named entities and important words or phrases from the sentence
-    sentence_important = {
-        token.text
-        for token in doc_sentence
-        if token.pos_ in ["NOUN", "PROPN", "ADJ"] or token.ent_type_ != ""
-    }
-    question_important = {
-        token.text
-        for token in doc_question
-        if token.pos_ in ["NOUN", "PROPN", "ADJ"] or token.ent_type_ != ""
-    }
-    # Check if any of the named entities or important words or phrases in the question are in the sentence
-    for token in question_important:
-        if token in sentence_important:
-            cache[cache_key] = True
-            with open(CACHE_FILE_PATH, "wb") as f:
-                pickle.dump(cache, f)
-            return True
-    # Check if the sentence contains any negative words
-    for token in doc_sentence:
-        if token.pos_ == "ADV" and token.dep_ == "neg":
-            cache[cache_key] = False
-            with open(CACHE_FILE_PATH, "wb") as f:
-                pickle.dump(cache, f)
-            return False
-    cache[cache_key] = False
-    with open(CACHE_FILE_PATH, "wb") as f:
-        pickle.dump(cache, f)
-    return False
+from nltk.corpus import stopwords
+from nltk.stem import WordNetLemmatizer
+from nltk.tokenize import word_tokenize
+
+# from scipy.spatial.distance import jaccard
+from sklearn.feature_extraction.text import TfidfVectorizer
+from pathlib import Path
+
+nltk.download("punkt")
+nltk.download("stopwords")
+nltk.download("wordnet")
+nltk.download("omw-1.4")
+
+nlp = spacy.load("en_core_web_sm")  # Load the English language model
+lemmatizer = WordNetLemmatizer()  # Initialize the WordNet lemmatizer
+stop_words = set(stopwords.words("english"))  # Get the English stop words
+
+def jaccard(u: Any, v: Any) -> Any:
+    # Pad the shorter array with zeros at the end
+    u = np.pad(u, (0, max(u.shape[0], v.shape[0]) - u.shape[0]), "constant")
+    v = np.pad(v, (0, max(u.shape[0], v.shape[0]) - v.shape[0]), "constant")
+    # Calculate the Jaccard similarity
+    nonzero = np.bitwise_or(u != 0, v != 0)
+    intersection = np.bitwise_and(u != 0, v != 0)
+    return 1.0 - float(np.count_nonzero(intersection)) / float(
+        np.count_nonzero(nonzero)
+    )
+
+def is_answer(sentence: str, question: str, threshold: float = 0.3) -> bool:
+    # Tokenize the sentence and the question
+    sentence_tokens = word_tokenize(sentence)
+    question_tokens = word_tokenize(question)
+    # Remove stop words from the sentence and the question
+    sentence_tokens = [
+        token for token in sentence_tokens if token.lower() not in stop_words
+    ]
+    question_tokens = [
+        token for token in question_tokens if token.lower() not in stop_words
+    ]
+    # Perform lemmatization on the sentence and the question
+    sentence_tokens = [lemmatizer.lemmatize(token.lower()) for token in sentence_tokens]
+    question_tokens = [lemmatizer.lemmatize(token.lower()) for token in question_tokens]
+    # Extract the main verb from the question
+    main_verb = None
+    for token in question_tokens:
+        if nlp(token)[0].pos_ == "VERB":
+            main_verb = token
+            break
+    # Generate numerical representations of the sentence and the question using TF-IDF
+    vectorizer = TfidfVectorizer()
+    sentence_vector = vectorizer.fit_transform([sentence]).toarray()[0]
+    question_vector = vectorizer.fit_transform([question]).toarray()[0]
+    # Calculate the similarity between the sentence and the question
+    similarity = 1 - jaccard(sentence_vector, question_vector)
+    # Check if the sentence answers the question
+    answer: bool
+    if main_verb is None:
+        answer = similarity >= threshold
+        return answer
+    else:
+        answer = main_verb in sentence_tokens and similarity >= threshold
+        return answer
+
+# # Test the is_answer function
+# sentence = "Neil Armstrong was the first person to walk on the Moon."
+# question = "Who was the first person to walk on the Moon?"
+# if is_answer(sentence, question):
+#     print("The sentence answers the question.")
+# else:
+#     print("The sentence does not answer the question.")
+
+# from concurrent.futures import ThreadPoolExecutor
+# import concurrent.futures

 def filter_irrelevant(sentences: list[str], question: str) -> list[str]:
-    # Create a list to store the relevant sentences
     relevant_sentences = []
-    with concurrent.futures.ThreadPoolExecutor() as executor:
-        futures = [
-            executor.submit(is_relevant, sentence, question) for sentence in sentences
-        ]
-        for future, sentence in zip(
-            concurrent.futures.as_completed(futures), sentences
-        ):
-            if future.result():
-                relevant_sentences.append(sentence)
+    for sentence in sentences:
+        if is_answer(sentence, question):
+            relevant_sentences.append(sentence)
+            print(sentence)
     return relevant_sentences
-# print(filter_irrelevant(["jeff bezos died", "jeff is stupid", "jeff bezos is an entrepenur"], "who is jeff bezos"))
+# print(filter_irrelevant_(["Neil Armstrong is an American Astronaut", "Neil Armstrong is dead", "Neil Armstrng is fake"], "Who is Neil Armstrong?"))
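The rewritten relevance check drops the pickle cache and the spaCy-only heuristics: is_answer tokenizes, stop-word-filters, and lemmatizes both strings with NLTK, looks up the question's main verb with spaCy, builds TF-IDF vectors for the sentence and the question, and compares the zero-padded vectors with the hand-rolled Jaccard distance against the 0.3 threshold. A short usage sketch of the new entry points, assuming the NLTK corpora and the spaCy en_core_web_sm model are available locally (the example sentences are illustrative):

    from is_relevant import filter_irrelevant, is_answer

    question = "Who was the first person to walk on the Moon?"
    candidates = [
        "Neil Armstrong was the first person to walk on the Moon.",
        "The Moon orbits the Earth roughly every 27 days.",
    ]

    print(is_answer(candidates[0], question))       # relevance verdict for a single sentence
    print(filter_irrelevant(candidates, question))  # keeps only the sentences judged relevant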

View File

@@ -1,11 +1,7 @@
-import datasets
+# type: ignore
 from typing import Any
+import datasets
 CoQA: Any = datasets.load_dataset("coqa")
 DATASET: List[Any] = []
-def coqa():
-    global CoQA, DATASET
-    for story in CoQA["train"]:
-        for question, answer in story["questions"], story["answers"]:
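The removed coqa() helper looped over the bare tuple (story["questions"], story["answers"]), which iterates the two containers themselves and tries to unpack each one, rather than pairing questions with answers. A hedged sketch of the intended pairing using zip, keeping the old code's assumption that each record exposes parallel question and answer sequences (the exact CoQA field layout should be checked against the datasets schema):

    from typing import Any, List

    import datasets  # type: ignore

    CoQA: Any = datasets.load_dataset("coqa")
    DATASET: List[Any] = []

    def coqa() -> None:
        for story in CoQA["train"]:
            # zip pairs each question with its answer; assumes parallel sequences.
            # With the Hugging Face CoQA schema the answer texts may instead live
            # under story["answers"]["input_text"].
            for question, answer in zip(story["questions"], story["answers"]):
                DATASET.append((question, answer))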

View File

@@ -1,5 +1,6 @@
-import transformers
-import torch
+import multiprocessing as mp
+
 import accelerate
 import datasets
-import multiprocessing as mp
+import torch
+import transformers

poetry.lock generated
View File

@@ -2491,6 +2491,33 @@ dev = ["autoflake (>=1.3.1,<2.0.0)", "flake8 (>=3.8.3,<4.0.0)", "pre-commit (>=2
 doc = ["mdx-include (>=1.4.1,<2.0.0)", "mkdocs (>=1.1.2,<2.0.0)", "mkdocs-material (>=8.1.4,<9.0.0)"]
 test = ["black (>=22.3.0,<23.0.0)", "coverage (>=5.2,<6.0)", "isort (>=5.0.6,<6.0.0)", "mypy (==0.910)", "pytest (>=4.4.0,<5.4.0)", "pytest-cov (>=2.10.0,<3.0.0)", "pytest-sugar (>=0.9.4,<0.10.0)", "pytest-xdist (>=1.32.0,<2.0.0)", "shellingham (>=1.3.0,<2.0.0)"]

+[[package]]
+name = "types-requests"
+version = "2.28.11.7"
+description = "Typing stubs for requests"
+category = "main"
+optional = false
+python-versions = "*"
+files = [
+    {file = "types-requests-2.28.11.7.tar.gz", hash = "sha256:0ae38633734990d019b80f5463dfa164ebd3581998ac8435f526da6fe4d598c3"},
+    {file = "types_requests-2.28.11.7-py3-none-any.whl", hash = "sha256:b6a2fca8109f4fdba33052f11ed86102bddb2338519e1827387137fefc66a98b"},
+]
+
+[package.dependencies]
+types-urllib3 = "<1.27"
+
+[[package]]
+name = "types-urllib3"
+version = "1.26.25.4"
+description = "Typing stubs for urllib3"
+category = "main"
+optional = false
+python-versions = "*"
+files = [
+    {file = "types-urllib3-1.26.25.4.tar.gz", hash = "sha256:eec5556428eec862b1ac578fb69aab3877995a99ffec9e5a12cf7fbd0cc9daee"},
+    {file = "types_urllib3-1.26.25.4-py3-none-any.whl", hash = "sha256:ed6b9e8a8be488796f72306889a06a3fc3cb1aa99af02ab8afb50144d7317e49"},
+]
+
 [[package]]
 name = "typing-extensions"
 version = "4.4.0"
@@ -2867,4 +2894,4 @@ testing = ["flake8 (<5)", "func-timeout", "jaraco.functools", "jaraco.itertools"
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.10"
-content-hash = "429ce050fd9e14f457f545b675da882677fcb5d8e955475cb4d41e92e704f526"
+content-hash = "e8eeff06f176dabc6da74969404d8ec8983b9db6d11c6c7ff34ee151bae422e9"

View File

@@ -46,6 +46,8 @@ timm = "^0.6.12"
 torchvision = "^0.14.1"
 torchaudio = "^0.13.1"
 python-dotenv = "^0.21.0"
+requests = "^2.28.1"
+types-requests = "^2.28.11.7"

 [tool.poetry.group.dev.dependencies]
 bandit = "^1.7.4"
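The poetry.lock entries above (types-requests plus its types-urllib3 dependency) and the new content-hash are what Poetry writes out once the two dependencies are declared in pyproject.toml; regenerating them is normally done with `poetry add requests types-requests`, or `poetry lock` after editing pyproject.toml by hand, rather than by editing the lock file directly.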