NLP from past added. Need to change some relative imports.
parent 434fbb342f
commit 43938d8377

@@ -0,0 +1,3 @@
# Explanation

This is where the explanation of how internet-nlp works will go.
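In short, the pipeline added in this commit scrapes Google results for a query, pulls the page text, normalizes it, splits it into sentences, and keeps only the sentences relevant to the query. A minimal usage sketch (illustrative only; the hunk headers below do not show file names, so the module name `internet` is an assumption):

# Illustrative sketch, not part of the commit. Assumes the scraping module
# below is importable as `internet`.
from internet import google

sentences = google("who is lionel messi")  # scrape -> normalize -> sentencize -> filter
print(sentences[:3])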
@@ -0,0 +1,253 @@
from typing import Any

import asyncio
import logging
import re
import time
import urllib.parse

import aiohttp
from bs4 import BeautifulSoup

# Set up logging
logging.basicConfig(
    filename="internet.log",
    filemode="w",
    level=logging.INFO,
    format="%(name)s - %(levelname)s - %(message)s",
)

# import concurrent.futures

# Import the config module
import sys
from pathlib import Path

sys.path.append(str(Path(__file__).parent.parent.parent) + "/utils")
import config

sys.path.append(str(Path(__file__).parent.parent))
import pickle

from is_relevant import filter_irrelevant
from normalize import normalizer
from sentencize import sentencizer
from urlextract import URLExtract

# Define the user agent
HTTP_USERAGENT: dict[str, str] = {
    "User-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
}
# Define the Google domains to skip in search results
UNWANTED_DOMAINS = {
    "https://www.google.",
    "https://google.",
    "https://webcache.googleusercontent.",
    "http://webcache.googleusercontent.",
    "https://policies.google.",
    "https://support.google.",
    "https://maps.google.",
    "https://youtube.",
    "https://translate.google.",
}

CACHE_FILE_PATH: str = "./internet_cache.pkl"
CACHE_TIME: int = 86400  # one day

URL_EXTRACTOR = URLExtract()

# Load the cache from the file (if it exists)
try:
    with open(CACHE_FILE_PATH, "rb") as f:
        cache = pickle.load(f)
except FileNotFoundError:
    cache = {}


async def fetch_url(session: aiohttp.ClientSession, url: str) -> str:
    """Fetch a URL and return the response body as text."""
    async with session.get(url, headers=HTTP_USERAGENT) as response:
        return await response.text()


async def google_urls(query: str, links: list[str]) -> list[str]:
    """
    Asynchronously search Google for the given query and retrieve the URLs of the top results.

    Parameters:
        query (str): The query to search for.
        links (list[str]): URLs already extracted from the query, used to seed the result list.

    Returns:
        list[str]: A list of the URLs of the top search results.
    """
    # Start from any URLs the caller already extracted from the query
    urls: list[str] = list(links)

    # Determine the number of results to retrieve based on the configuration mode
    # (speed -> 5, accuracy -> 20, default -> 10)
    num_of_res: int = (
        5 if config.CONF_MODE == "speed" else (20 if config.CONF_MODE == "accuracy" else 10)
    )

    # Log the number of results wanted (if debugging is enabled)
    if config.CONF_DEBUG:
        logging.info(f"number of results wanted: {num_of_res}")

    # Construct the search URL
    search_url: str = (
        "https://www.google.com/search?q="
        + str(urllib.parse.quote_plus(query))
        + "&num="
        + str(num_of_res)
    )

    # Log the search URL (if debugging is enabled)
    if config.CONF_DEBUG:
        logging.info(f"url: {search_url}")

    # Create an aiohttp session and use it to fetch the search results
    async with aiohttp.ClientSession() as session:
        response: str = await fetch_url(session, search_url)

    # Wait 10 seconds before parsing the results (to avoid being rate-limited)
    await asyncio.sleep(10.0)

    # Parse the search results using BeautifulSoup
    soup: BeautifulSoup = BeautifulSoup(response, "html.parser")

    # Iterate over the links in the search results
    for link in soup.select("a[href]"):
        # Extract the URL from the link
        url = str(link["href"])

        # Check if the URL is valid and not a Google or YouTube link
        if ("http" in url) and (
            not any(url.startswith(s) for s in UNWANTED_DOMAINS)
        ):
            urls.append(url)
            if config.CONF_DEBUG:
                logging.info(f"added {url}")
            if len(urls) == num_of_res:
                break
    return urls


async def fetch_url_text(
    session: aiohttp.ClientSession, url: str, query: str
) -> list[str]:
    """
    Fetch a page and extract the sentences relevant to the query.

    Parameters:
        session (aiohttp.ClientSession): aiohttp session.
        url (str): The URL to get text from.
        query (str): The query used to filter out irrelevant sentences.

    Returns:
        list[str]: The relevant sentences extracted from the page.
    """
    try:
        async with session.get(url, headers=HTTP_USERAGENT) as response:
            soup: BeautifulSoup = BeautifulSoup(await response.text(), "html.parser")
            text = normalizer(soup.get_text())
            if config.CONF_DEBUG:
                logging.info(f"Text: {text}")
            sentences: list[str] = sentencizer(text)
            sentences = filter_irrelevant(sentences, query)
            return sentences
    except Exception as e:
        # Log the error and continue execution
        logging.error(f"Error occurred: {e}")
        return []


def flatten(nested: list[list[str]]) -> list[str]:
    return [item for sublist in nested for item in sublist]


async def get_text_content(urls: list[str], query: str) -> list[str]:
    # Create an aiohttp session
    async with aiohttp.ClientSession() as session:
        # Create a list of tasks to run concurrently
        tasks: list[Any] = [
            asyncio.create_task(fetch_url_text(session, url, query)) for url in urls
        ]
        # Use asyncio.gather to run the tasks concurrently
        results: list[list[str]] = await asyncio.gather(*tasks)
    sentences: list[str] = flatten(results)
    return sentences


def google(query: str) -> list[str]:
    global cache
    # Pull any URLs out of the query to use as seed links, then strip them from the query text
    links_in_text: list[str] = URL_EXTRACTOR.find_urls(query)
    query = re.sub(r"\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*", "", query)
    entry = cache.get(query)
    if entry is None:
        # No cached result exists, so add a new entry to the cache
        text = asyncio.run(
            get_text_content(asyncio.run(google_urls(query, links_in_text)), query)
        )
        cache[query] = (text, time.time() + CACHE_TIME)  # entry expires after one day
    elif entry[1] < time.time():
        # Cached result has expired, so refresh it
        text = asyncio.run(
            get_text_content(asyncio.run(google_urls(query, links_in_text)), query)
        )
        cache[query] = (text, time.time() + CACHE_TIME)  # entry expires after one day
    else:
        # Cached result is still valid, so return it
        text = entry[0]
    # Save the cache to the file
    with open(CACHE_FILE_PATH, "wb") as f:
        pickle.dump(cache, f)
    # Return the text
    return text


if __name__ == "__main__":
    print(
        google(
            "who is lionel messi https://en.wikipedia.org/wiki/Lionel_Messi https://en.wikipedia.org/wiki/Cristiano_Ronaldo https://www.instagram.com/leomessi/?hl=en"
        )
    )


"""
async + multithreading since web scraping is I/O bound
https://stackoverflow.com/questions/27435284/multiprocessing-vs-multithreading-vs-asyncio

normal
________________________________________________________
Executed in    1.67 secs      fish           external
   usr time  137.29 millis    0.11 millis  137.18 millis
   sys time   38.39 millis    1.25 millis   37.13 millis

Async
________________________________________________________
Executed in  624.82 millis    fish           external
   usr time  141.92 millis    0.11 millis  141.81 millis
   sys time   38.00 millis    1.45 millis   36.55 millis

concurrent
________________________________________________________
Executed in  629.67 millis    fish           external
   usr time  136.72 millis    0.12 millis  136.60 millis
   sys time   36.86 millis    1.32 millis   35.54 millis

multiprocessing
________________________________________________________
Executed in  754.61 millis    fish           external
   usr time  399.25 millis    0.11 millis  399.14 millis
   sys time  164.39 millis    1.49 millis  162.90 millis

OVERALL

multithreading bs4
________________________________________________________
Executed in   14.67 secs      fish           external
   usr time    1.81 secs      0.12 millis    1.81 secs
   sys time    0.14 secs      1.50 millis    0.14 secs

multiprocessing bs4
"""
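The timings quoted in the closing comment read like fish-shell `time` output; a rough in-Python equivalent for comparing runs (illustrative only, not part of the commit; assumes the module above is importable as `internet`) would be:

# Illustrative timing harness, not part of the commit.
import time
from internet import google  # assumed module name

start = time.perf_counter()
google("who is lionel messi")
print(f"Executed in {time.perf_counter() - start:.2f} secs")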
@@ -0,0 +1,82 @@
# mypy: ignore-errors
# Checks whether a sentence is relevant to a question
import concurrent.futures
import pickle

import spacy

# Load the English language model
NLP = spacy.load("en_core_web_sm")

CACHE_FILE_PATH: str = "./is_relevant_cache.pkl"

# Load the relevance cache from disk (if it exists and is readable)
try:
    with open(CACHE_FILE_PATH, "rb") as f:
        cache = pickle.load(f)
except (OSError, EOFError):
    cache = {}


def is_relevant(sentence: str, question: str) -> bool:
    cache_key = (sentence, question)
    if cache_key in cache:
        relevant: bool = cache[cache_key]
        return relevant

    # Process the sentence and question
    doc_sentence = NLP(sentence)
    doc_question = NLP(question)

    # Extract the named entities and important words or phrases from the sentence
    sentence_important = {
        token.text
        for token in doc_sentence
        if token.pos_ in ["NOUN", "PROPN", "ADJ"] or token.ent_type_ != ""
    }
    question_important = {
        token.text
        for token in doc_question
        if token.pos_ in ["NOUN", "PROPN", "ADJ"] or token.ent_type_ != ""
    }

    # Check if any of the named entities or important words or phrases in the question are in the sentence
    for token in question_important:
        if token in sentence_important:
            cache[cache_key] = True
            with open(CACHE_FILE_PATH, "wb") as f:
                pickle.dump(cache, f)
            return True

    # Check if the sentence contains any negative words
    for token in doc_sentence:
        if token.pos_ == "ADV" and token.dep_ == "neg":
            cache[cache_key] = False
            with open(CACHE_FILE_PATH, "wb") as f:
                pickle.dump(cache, f)
            return False

    cache[cache_key] = False
    with open(CACHE_FILE_PATH, "wb") as f:
        pickle.dump(cache, f)
    return False


def filter_irrelevant(sentences: list[str], question: str) -> list[str]:
    relevant_sentences = []
    with concurrent.futures.ThreadPoolExecutor() as executor:
        futures = [
            executor.submit(is_relevant, sentence, question) for sentence in sentences
        ]
        # Pair each future with its own sentence (in submission order) so the
        # relevance result is matched to the right sentence
        for future, sentence in zip(futures, sentences):
            if future.result():
                relevant_sentences.append(sentence)
    return relevant_sentences


# print(filter_irrelevant(["jeff bezos died", "jeff is stupid", "jeff bezos is an entrepenur"], "who is jeff bezos"))
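For reference, the relevance check is a bag-of-important-tokens overlap between the sentence and the question. A minimal illustration (expected values assume typical `en_core_web_sm` tagging; not part of the commit):

# Illustrative only: overlap on the proper nouns "jeff"/"bezos" should yield True,
# while a sentence sharing no nouns, proper nouns, or adjectives with the question yields False.
print(is_relevant("jeff bezos is an entrepreneur", "who is jeff bezos"))  # expected: True
print(is_relevant("the sky was clear yesterday", "who is jeff bezos"))    # expected: False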
@@ -0,0 +1,83 @@
import logging

# logging config
logging.basicConfig(
    filename="normalize.log",
    filemode="w",
    level=logging.INFO,
    format="%(name)s - %(levelname)s - %(message)s",
)

import concurrent.futures
import string
import sys
from pathlib import Path

import contractions
import tokenizers
from tokenizers.normalizers import NFKD, Strip, StripAccents

# Add utils directory to path
sys.path.append(str(Path(__file__).parent.parent) + "/utils")
import config

# Define normalization sequence
NORMALIZER_SEQ: tokenizers.normalizers.Sequence = tokenizers.normalizers.Sequence(
    [NFKD(), Strip(), StripAccents()]
)


def remove_non_ascii(text: str) -> str:
    return text.encode("ascii", errors="ignore").decode()


def normalizer(text: str) -> str:
    """Normalize input text.

    Args:
        text (str): Input text to normalize.

    Returns:
        str: Normalized text.
    """
    # Expand contractions (keep the result -- contractions.fix returns a new string)
    text = contractions.fix(text)
    # Remove punctuation
    text = text.translate(str.maketrans("", "", string.punctuation))
    # Normalize string
    text = NORMALIZER_SEQ.normalize_str(text)
    # Replace whitespace and escape artifacts with plain spaces
    text = (
        text.replace("\n", " ")
        .replace("\t", " ")
        .replace("\r", " ")
        .replace("'", " ")
        .replace("\\x", " ")
        .replace('"', " ")
        .replace("\\", " ")
        .replace("\\r", " ")
        .replace("\\f", " ")
        .replace("\\a", " ")
        .replace(r"\/a", " ")
        .replace(r"\/f", " ")
        .replace(r"\/b", " ")
        .replace("  ", " ")
    )
    text = remove_non_ascii(text)
    if config.CONF_DEBUG:
        logging.info(text)
    return text


def normalize_sentences(sentences: list[str]) -> list[str]:
    normalized_sentences = []
    with concurrent.futures.ThreadPoolExecutor() as executor:
        futures = [executor.submit(normalizer, sentence) for sentence in sentences]
        # Collect results in submission order and keep the normalized text
        for future in futures:
            normalized = future.result()
            if normalized:
                normalized_sentences.append(normalized)
    return normalized_sentences
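A quick illustration of what `normalizer` is meant to produce now that the result of `contractions.fix` is assigned back to `text` (expected output, not a captured run):

# Illustrative only.
print(normalizer("It's a test!\n"))  # expected: "It is a test"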
@@ -0,0 +1,61 @@
import logging

# logging config
logging.basicConfig(
    filename="sentencize.log",
    filemode="w",
    level=logging.INFO,
    format="%(name)s - %(levelname)s - %(message)s",
)

import sys
from pathlib import Path

# Add utils directory to path
sys.path.append(str(Path(__file__).parent.parent) + "/utils")
import concurrent.futures

import config
import nltk

# Make sure the required NLTK data is available
try:
    nltk.data.find("corpora/words")
except LookupError:
    nltk.download("words")
try:
    nltk.data.find("tokenizers/punkt")
except LookupError:
    nltk.download("punkt")

ENGLISH_WORDS = set(nltk.corpus.words.words())


def convert_to_english(text: str) -> str:
    # Keep only tokens that are English words (or non-alphabetic tokens such as numbers)
    return " ".join(
        w
        for w in nltk.wordpunct_tokenize(text)
        if w.lower() in ENGLISH_WORDS or not w.isalpha()
    )


def sentencizer(text: str) -> list[str]:
    initial_sentences: list[str] = nltk.tokenize.sent_tokenize(text)
    english_sentences: list[str] = []

    # Use concurrent.futures.ThreadPoolExecutor to process the sentences concurrently
    with concurrent.futures.ThreadPoolExecutor(max_workers=500) as executor:
        # Create a list of futures to process the sentences concurrently
        futures = [
            executor.submit(convert_to_english, sentence)
            for sentence in initial_sentences
        ]
        # Collect the results in submission order so sentence order is preserved
        for future in futures:
            english_sentences.append(future.result())

    if config.CONF_DEBUG:
        logging.info(f"sentences: {english_sentences}")
    return english_sentences


# print(sentencizer("hello gdfjsfkjd. i amf dfjdslf the greatest efe ve every"))
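For reference, `convert_to_english` keeps only tokens found in the NLTK `words` corpus plus non-alphabetic tokens, so a rough expectation (illustrative; exact output depends on the corpus) is:

# Illustrative only -- exact output depends on the NLTK "words" corpus.
print(convert_to_english("hello gdfjsfkjd world 123"))  # expected: "hello world 123"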
@@ -0,0 +1,110 @@
contractions_dict: dict[str, str] = {
    "aint": "are not",
    "s": " is",
    "arent": "are not",
    "cant": "cannot",
    "cantve": "cannot have",
    "cause": "because",
    "couldve": "could have",
    "couldnt": "could not",
    "couldntve": "could not have",
    "didnt": "did not",
    "doesnt": "does not",
    "dont": "do not",
    "hadnt": "had not",
    "hadntve": "had not have",
    "hasnt": "has not",
    "havent": "have not",
    "hed": "he would",
    "hedve": "he would have",
    "hell": "he will",
    "hellve": "he will have",
    "howd": "how did",
    "howdy": "how do you",
    "howll": "how will",
    "Id": "I would",
    "Idve": "I would have",
    "Ill": "I will",
    "Illve": "I will have",
    "Im": "I am",
    "Ive": "I have",
    "isnt": "is not",
    "itd": "it would",
    "itdve": "it would have",
    "itll": "it will",
    "itllve": "it will have",
    "lets": "let us",
    "maam": "madam",
    "maynt": "may not",
    "mightve": "might have",
    "mightnt": "might not",
    "mightntve": "might not have",
    "mustve": "must have",
    "mustnt": "must not",
    "mustntve": "must not have",
    "neednt": "need not",
    "needntve": "need not have",
    "oclock": "of the clock",
    "oughtnt": "ought not",
    "oughtntve": "ought not have",
    "shant": "shall not",
    "shantve": "shall not have",
    "shed": "she would",
    "shedve": "she would have",
    "shell": "she will",
    "shellve": "she will have",
    "shouldve": "should have",
    "shouldnt": "should not",
    "shouldntve": "should not have",
    "sove": "so have",
    "thatd": "that would",
    "thatdve": "that would have",
    "thered": "there would",
    "theredve": "there would have",
    "theyd": "they would",
    "theydve": "they would have",
    "theyll": "they will",
    "theyllve": "they will have",
    "theyre": "they are",
    "theyve": "they have",
    "tove": "to have",
    "wasnt": "was not",
    "wed": "we would",
    "wedve": "we would have",
    "well": "we will",
    "wellve": "we will have",
    "were": "we are",
    "weve": "we have",
    "werent": "were not",
    "whatll": "what will",
    "whatllve": "what will have",
    "whatre": "what are",
    "whatve": "what have",
    "whenve": "when have",
    "whered": "where did",
    "whereve": "where have",
    "wholl": "who will",
    "whollve": "who will have",
    "whove": "who have",
    "whyve": "why have",
    "willve": "will have",
    "wont": "will not",
    "wontve": "will not have",
    "wouldve": "would have",
    "wouldnt": "would not",
    "wouldntve": "would not have",
    "yall": "you all",
    "yalld": "you all would",
    "yalldve": "you all would have",
    "yallre": "you all are",
    "yallve": "you all have",
    "youd": "you would",
    "youdve": "you would have",
    "youll": "you will",
    "youllve": "you will have",
    "youre": "you are",
    "youve": "you have",
}
@@ -0,0 +1,25 @@
import logging

logging.basicConfig(
    filename="config.log",
    filemode="w",
    level=logging.INFO,
    format="%(name)s - %(levelname)s - %(message)s",
)

# Global
CONF_DEBUG: bool = True
# NLP
CONF_MODE: str = "default"


def NLP_config(mode: str = "default", debug: bool = True) -> None:
    # The global names must match the module-level constants exactly,
    # otherwise the assignments below would only create locals
    global CONF_MODE, CONF_DEBUG
    CONF_DEBUG = debug
    if mode in ("accuracy", "speed", "default"):
        CONF_MODE = mode
    else:
        if CONF_DEBUG:
            logging.warning(f"mode: {mode} does not exist")
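A minimal usage sketch for the config module (illustrative only; assumes it is imported as `config`, as the other files in this commit do):

# Illustrative only.
import config

config.NLP_config(mode="speed", debug=False)  # switch to the 5-result "speed" mode
print(config.CONF_MODE, config.CONF_DEBUG)    # expected: speed False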