NLP from past added. Need to change some relative imports.
parent 434fbb342f
commit 43938d8377

@@ -0,0 +1,3 @@
# Explanation

Here is where the explanation of how internet-nlp works will go.
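
In the meantime, a minimal usage sketch (the module and import names are assumptions; the diff below does not show file names):

```python
# Hypothetical module name for the scraper added in this commit.
import internet

# Scrapes Google (plus any URLs embedded in the query), normalizes and
# sentencizes each page, and returns only the sentences relevant to the query.
sentences = internet.google(
    "who is lionel messi https://en.wikipedia.org/wiki/Lionel_Messi"
)
print(sentences[:5])
```
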
@@ -0,0 +1,253 @@
from typing import Any

import asyncio
import logging
import re
import time
import urllib.parse

import aiohttp
from bs4 import BeautifulSoup

# Set up logging
logging.basicConfig(
    filename="internet.log",
    filemode="w",
    level=logging.INFO,
    format="%(name)s - %(levelname)s - %(message)s",
)

# import concurrent.futures

# Import the config module
import sys
from pathlib import Path

sys.path.append(str(Path(__file__).parent.parent.parent) + "/utils")
import config

sys.path.append(str(Path(__file__).parent.parent))
import pickle

from is_relevant import filter_irrelevant
from normalize import normalizer
from sentencize import sentencizer
from urlextract import URLExtract

# Define the user agent
HTTP_USERAGENT: dict[str, str] = {
    "User-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
}
# Define the Google domains to filter out of the results
UNWANTED_DOMAINS = {
    "https://www.google.",
    "https://google.",
    "https://webcache.googleusercontent.",
    "http://webcache.googleusercontent.",
    "https://policies.google.",
    "https://support.google.",
    "https://maps.google.",
    "https://youtube.",
    "https://translate.google.",
}

CACHE_FILE_PATH: str = "./internet_cache.pkl"
CACHE_TIME: int = 86400  # one day

URL_EXTRACTOR = URLExtract()

# Load the cache from the file (if it exists)
try:
    with open(CACHE_FILE_PATH, "rb") as f:
        cache = pickle.load(f)
except FileNotFoundError:
    cache = {}
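
# Illustrative note: each cache entry maps a query string to a
# (sentences, expiry_timestamp) tuple, where the expiry is
# time.time() + CACHE_TIME at write time; see google() below.
# A commented-out inspection sketch:
#
#     for cached_query, (cached_sentences, expiry) in cache.items():
#         print(cached_query, len(cached_sentences), "expires at", expiry)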

# Define the fetch_url function
async def fetch_url(session: aiohttp.ClientSession, url: str) -> str:
    global HTTP_USERAGENT
    async with session.get(url, headers=HTTP_USERAGENT) as response:
        return await response.text()


# Define the google_urls function
async def google_urls(query: str, links: list[str]) -> list[str]:
    """
    Asynchronously search Google for the given query and retrieve the URLs of the top results.

    Parameters:
        query (str): The query to search for.
        links (list[str]): URLs already extracted from the query; results are appended to this list.

    Returns:
        list[str]: A list of the URLs of the top search results.
    """
    global UNWANTED_DOMAINS
    # Start from the URLs that were passed in
    urls: list[str] = links

    # Determine the number of results to retrieve based on the configuration mode
    # (speed -> 5, accuracy -> 20, default -> 10)
    num_of_res: int = (
        5
        if config.CONF_MODE == "speed"
        else (20 if config.CONF_MODE == "accuracy" else 10)
    )

    # Log the number of results wanted (if debugging is enabled)
    if config.CONF_DEBUG:
        logging.info(f"number of results wanted: {num_of_res}")

    # Construct the search URL
    search_url: str = (
        "https://www.google.com/search?q="
        + str(urllib.parse.quote_plus(query))
        + "&num="
        + str(num_of_res)
    )

    # Log the search URL (if debugging is enabled)
    if config.CONF_DEBUG:
        logging.info(f"url: {search_url}")

    # Create an aiohttp session and use it to fetch the search results
    async with aiohttp.ClientSession() as session:
        response: str = await fetch_url(session, search_url)

    # Wait 10 seconds before parsing the results (to avoid being rate-limited)
    await asyncio.sleep(10.0)

    # Parse the search results using BeautifulSoup
    soup: BeautifulSoup = BeautifulSoup(response, "html.parser")

    # Iterate over the links in the search results
    for link in soup.select("a[href]"):
        # Extract the URL from the link
        url = str(link["href"])

        # Check if the URL is valid and not a Google or YouTube link
        if ("http" in url) and (
            not any(url.startswith(s) for s in UNWANTED_DOMAINS)
        ):
            urls.append(url)
            if config.CONF_DEBUG:
                logging.info(f"added {url}")
        if len(urls) == num_of_res:
            break
    return urls
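
# Illustrative, commented-out sketch of driving google_urls() on its own from
# synchronous code (assumes network access; the second argument pre-seeds the
# result list with URLs already pulled out of the query):
#
#     seed_links = ["https://en.wikipedia.org/wiki/Lionel_Messi"]
#     result_urls = asyncio.run(google_urls("who is lionel messi", seed_links))
#     # result_urls holds the seeded links plus filtered Google result URLs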


async def fetch_url_text(
    session: aiohttp.ClientSession, url: str, query: str
) -> list[str]:
    """
    Fetch the given URL and extract the sentences relevant to the query.

    Parameters:
        session (aiohttp.ClientSession): aiohttp session
        url (str): The URL to fetch and extract text from.
        query (str): The query used to filter out irrelevant sentences.

    Returns:
        list[str]: The extracted, filtered sentences.
    """
    global HTTP_USERAGENT
    try:
        async with session.get(url, headers=HTTP_USERAGENT) as response:
            soup: BeautifulSoup = BeautifulSoup(await response.text(), "html.parser")
            text = normalizer(soup.get_text())
            if config.CONF_DEBUG:
                logging.info(f"Text: {text}")
            sentences: list[str] = sentencizer(text)
            sentences = filter_irrelevant(sentences, query)
            return sentences
    except Exception as e:
        # Log the error and continue execution
        logging.error(f"Error occurred: {e}")
        return []


def flatten(nested: list[list[Any]]) -> list[Any]:
    # Flatten a list of lists into a single list
    return [item for sublist in nested for item in sublist]


async def get_text_content(urls: list[str], query: str) -> list[str]:
    # Create a list to store the results
    results: list[Any] = []
    # Create an aiohttp session
    async with aiohttp.ClientSession() as session:
        # Create a list of tasks to run concurrently
        tasks: list[Any] = [
            asyncio.create_task(fetch_url_text(session, url, query)) for url in urls
        ]
        # Use asyncio.gather to run the tasks concurrently
        results = await asyncio.gather(*tasks)
    sentences: list[str] = flatten(results)
    return sentences


def google(query: str) -> list[str]:
    global cache, CACHE_FILE_PATH, CACHE_TIME, URL_EXTRACTOR
    # Pull any URLs that are embedded in the query so they can be scraped directly
    links_in_text: list[str] = URL_EXTRACTOR.find_urls(query)
    # Strip those URLs from the query text itself
    query = re.sub(r"\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*", "", query)
    entry = cache.get(query)
    if entry is None:
        # No query exists, so add a new entry to the cache
        # (google_urls runs in one event loop, then get_text_content in a second one)
        text = asyncio.run(
            get_text_content(asyncio.run(google_urls(query, links_in_text)), query)
        )
        cache[query] = (text, time.time() + CACHE_TIME)  # entry expires after CACHE_TIME (one day)
    elif entry[1] < time.time():
        # The entry has expired, so refresh it
        text = asyncio.run(
            get_text_content(asyncio.run(google_urls(query, links_in_text)), query)
        )
        cache[query] = (text, time.time() + CACHE_TIME)  # entry expires after CACHE_TIME (one day)
    else:
        # A fresh entry is available, so return it
        text = entry[0]
    # Save the cache to the file
    with open(CACHE_FILE_PATH, "wb") as f:
        pickle.dump(cache, f)
    # Return the text
    return text


print(
    google(
        "who is lionel messi https://en.wikipedia.org/wiki/Lionel_Messi https://en.wikipedia.org/wiki/Cristiano_Ronaldo https://www.instagram.com/leomessi/?hl=en"
    )
)


"""
async + multithreading since web scraping is I/O bound
https://stackoverflow.com/questions/27435284/multiprocessing-vs-multithreading-vs-asyncio

normal
________________________________________________________
Executed in    1.67 secs      fish           external
   usr time  137.29 millis    0.11 millis  137.18 millis
   sys time   38.39 millis    1.25 millis   37.13 millis

async
________________________________________________________
Executed in  624.82 millis    fish           external
   usr time  141.92 millis    0.11 millis  141.81 millis
   sys time   38.00 millis    1.45 millis   36.55 millis

concurrent
________________________________________________________
Executed in  629.67 millis    fish           external
   usr time  136.72 millis    0.12 millis  136.60 millis
   sys time   36.86 millis    1.32 millis   35.54 millis

multiprocessing
________________________________________________________
Executed in  754.61 millis    fish           external
   usr time  399.25 millis    0.11 millis  399.14 millis
   sys time  164.39 millis    1.49 millis  162.90 millis

OVERALL
multithreading bs4
________________________________________________________
Executed in   14.67 secs      fish           external
   usr time    1.81 secs      0.12 millis    1.81 secs
   sys time    0.14 secs      1.50 millis    0.14 secs

multiprocessing bs4
"""
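
# Illustrative sketch of one way to reproduce such comparisons in-process
# (the figures above come from fish's `time`); wall-clock time is what matters
# here since the work is I/O bound:
#
#     start = time.perf_counter()
#     found_urls = asyncio.run(google_urls("who is lionel messi", []))
#     found_text = asyncio.run(get_text_content(found_urls, "who is lionel messi"))
#     print(f"async pipeline: {time.perf_counter() - start:.2f}s")
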
@@ -0,0 +1,82 @@
# mypy: ignore-errors
# checks if sentence is relevant to other sentence
from typing import List

import concurrent.futures
import pickle

import spacy

# Load the English language model
NLP = spacy.load("en_core_web_sm")
from pathlib import Path

CACHE_FILE_PATH: str = "./is_relevant_cache.pkl"

try:
    with open(CACHE_FILE_PATH, "rb") as f:
        cache = pickle.load(f)
except (OSError, EOFError):
    cache = {}


def is_relevant(sentence: str, question: str) -> bool:
    global NLP

    cache_key = (sentence, question)
    if cache_key in cache:
        relevant: bool = cache[cache_key]
        return relevant
    # Process the sentence and question
    doc_sentence = NLP(sentence)
    doc_question = NLP(question)

    # Extract the named entities and important words or phrases from the sentence
    sentence_important = {
        token.text
        for token in doc_sentence
        if token.pos_ in ["NOUN", "PROPN", "ADJ"] or token.ent_type_ != ""
    }
    question_important = {
        token.text
        for token in doc_question
        if token.pos_ in ["NOUN", "PROPN", "ADJ"] or token.ent_type_ != ""
    }

    # Check if any of the named entities or important words or phrases in the question are in the sentence
    for token in question_important:
        if token in sentence_important:
            cache[cache_key] = True
            with open(CACHE_FILE_PATH, "wb") as f:
                pickle.dump(cache, f)
            return True

    # Check if the sentence contains any negative words
    for token in doc_sentence:
        if token.pos_ == "ADV" and token.dep_ == "neg":
            cache[cache_key] = False
            with open(CACHE_FILE_PATH, "wb") as f:
                pickle.dump(cache, f)
            return False

    cache[cache_key] = False
    with open(CACHE_FILE_PATH, "wb") as f:
        pickle.dump(cache, f)
    return False


def filter_irrelevant(sentences: list[str], question: str) -> list[str]:
    relevant_sentences = []
    with concurrent.futures.ThreadPoolExecutor() as executor:
        futures = [
            executor.submit(is_relevant, sentence, question) for sentence in sentences
        ]
        # Pair each future with its sentence in submission order; as_completed
        # would yield futures out of order and misalign them with the sentences.
        for future, sentence in zip(futures, sentences):
            if future.result():
                relevant_sentences.append(sentence)
    return relevant_sentences


# print(filter_irrelevant(["jeff bezos died", "jeff is stupid", "jeff bezos is an entrepreneur"], "who is jeff bezos"))
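
# Illustrative, commented-out sketch of the overlap heuristic on a single pair.
# is_relevant() returns True when the question and sentence share a noun, proper
# noun, adjective, or named entity; note the comparison is on exact token text,
# so casing matters.
#
#     print(is_relevant("lionel messi is a footballer", "who is lionel messi"))
#     # should print True: "lionel" and "messi" appear in both texts
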
@@ -0,0 +1,83 @@
import logging

# logging config
logging.basicConfig(
    filename="normalize.log",
    filemode="w",
    level=logging.INFO,
    format="%(name)s - %(levelname)s - %(message)s",
)

import concurrent.futures
import string
import sys
from pathlib import Path

import contractions
import tokenizers
from tokenizers.normalizers import NFKD, Lowercase, Strip, StripAccents

# Add utils directory to path
sys.path.append(str(Path(__file__).parent.parent) + "/utils")
import config

# Define normalization sequence
NORMALIZER_SEQ: tokenizers.normalizers.Sequence = tokenizers.normalizers.Sequence(
    [NFKD(), Strip(), StripAccents()]
)


def remove_non_ascii(text: str) -> str:
    return text.encode("ascii", errors="ignore").decode()


def normalizer(text: str) -> str:
    """Normalize input text.

    Args:
        text (str): Input text to normalize.

    Returns:
        str: Normalized text.
    """
    global NORMALIZER_SEQ
    # Expand contractions
    text = contractions.fix(text)
    # Remove punctuation
    text = text.translate(str.maketrans("", "", string.punctuation))
    # Normalize string
    text = NORMALIZER_SEQ.normalize_str(text)
    # Replace whitespace and stray escape sequences with single spaces
    text = (
        text.replace("\n", " ")
        .replace("\t", " ")
        .replace("\r", " ")
        .replace("'", " ")
        .replace("\\x", " ")
        .replace('"', " ")
        .replace("\\", " ")
        .replace("\\r", " ")
        .replace("\\f", " ")
        .replace("\\a", " ")
        .replace(r"\/a", " ")
        .replace(r"\/f", " ")
        .replace(r"\/b", " ")
        .replace("  ", " ")
    )
    text = remove_non_ascii(text)
    if config.CONF_DEBUG:
        logging.info(text)
    return text


def normalize_sentences(sentences: list[str]) -> list[str]:
    normalized_sentences = []
    with concurrent.futures.ThreadPoolExecutor() as executor:
        futures = [executor.submit(normalizer, sentence) for sentence in sentences]
        # Collect results in submission order and keep the normalized text
        # (not the original sentence); skip sentences that normalize to "".
        for future in futures:
            normalized = future.result()
            if normalized:
                normalized_sentences.append(normalized)
    return normalized_sentences
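
# Illustrative, commented-out sketch of normalizer() on a messy string:
# contractions are expanded, punctuation is stripped, accents/non-ASCII
# characters are dropped, and whitespace is squashed.
#
#     print(normalizer("It's a   café, isn't it?"))
#     # roughly "It is a cafe is not it" (exact spacing may vary)
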
@@ -0,0 +1,61 @@
from typing import List

import logging

# logging config
logging.basicConfig(
    filename="sentencize.log",
    filemode="w",
    level=logging.INFO,
    format="%(name)s - %(levelname)s - %(message)s",
)

import sys
from pathlib import Path

# Add utils directory to path
sys.path.append(str(Path(__file__).parent.parent) + "/utils")
import concurrent.futures

import config
import nltk

# Make sure the required NLTK data is available
try:
    nltk.data.find("corpora/words")
except LookupError:
    nltk.download("words")
try:
    nltk.data.find("tokenizers/punkt")
except LookupError:
    nltk.download("punkt")  # sent_tokenize needs the punkt models

ENGLISH_WORDS = set(nltk.corpus.words.words())


def convert_to_english(text: str) -> str:
    global ENGLISH_WORDS
    return " ".join(
        w
        for w in nltk.wordpunct_tokenize(text)
        if w.lower() in ENGLISH_WORDS or not w.isalpha()
    )


def sentencizer(text: str) -> list[str]:
    global convert_to_english
    initial_sentences: list[str] = nltk.tokenize.sent_tokenize(text)
    english_sentences: list[str] = []

    # Use concurrent.futures.ThreadPoolExecutor to process the sentences concurrently
    with concurrent.futures.ThreadPoolExecutor(max_workers=500) as executor:
        # Create a list of futures to process the sentences concurrently
        futures = [
            executor.submit(convert_to_english, sentence)
            for sentence in initial_sentences
        ]
        # Use concurrent.futures.as_completed to retrieve the results of the futures as they complete
        for future in concurrent.futures.as_completed(futures):
            english_sentences.append(future.result())

    if config.CONF_DEBUG:
        logging.info(f"sentences: {english_sentences}")
    return english_sentences


# print(sentencizer("hello gdfjsfkjd. i amf dfjdslf the greatest efe ve every"))
@@ -0,0 +1,110 @@
import typing

contractions_dict: dict[str, str] = {
    "aint": "are not",
    "s": " is",
    "arent": "are not",
    "cant": "cannot",
    "cantve": "cannot have",
    "cause": "because",
    "couldve": "could have",
    "couldnt": "could not",
    "couldntve": "could not have",
    "didnt": "did not",
    "doesnt": "does not",
    "dont": "do not",
    "hadnt": "had not",
    "hadntve": "had not have",
    "hasnt": "has not",
    "havent": "have not",
    "hed": "he would",
    "hedve": "he would have",
    "hell": "he will",
    "hellve": "he will have",
    "howd": "how did",
    "howdy": "how do you",
    "howll": "how will",
    "Id": "I would",
    "Idve": "I would have",
    "Ill": "I will",
    "Illve": "I will have",
    "Im": "I am",
    "Ive": "I have",
    "isnt": "is not",
    "itd": "it would",
    "itdve": "it would have",
    "itll": "it will",
    "itllve": "it will have",
    "lets": "let us",
    "maam": "madam",
    "maynt": "may not",
    "mightve": "might have",
    "mightnt": "might not",
    "mightntve": "might not have",
    "mustve": "must have",
    "mustnt": "must not",
    "mustntve": "must not have",
    "neednt": "need not",
    "needntve": "need not have",
    "oclock": "of the clock",
    "oughtnt": "ought not",
    "oughtntve": "ought not have",
    "shant": "shall not",
    "shantve": "shall not have",
    "shed": "she would",
    "shedve": "she would have",
    "shell": "she will",
    "shellve": "she will have",
    "shouldve": "should have",
    "shouldnt": "should not",
    "shouldntve": "should not have",
    "sove": "so have",
    "thatd": "that would",
    "thatdve": "that would have",
    "thered": "there would",
    "theredve": "there would have",
    "theyd": "they would",
    "theydve": "they would have",
    "theyll": "they will",
    "theyllve": "they will have",
    "theyre": "they are",
    "theyve": "they have",
    "tove": "to have",
    "wasnt": "was not",
    "wed": "we would",
    "wedve": "we would have",
    "well": "we will",
    "wellve": "we will have",
    "were": "we are",
    "weve": "we have",
    "werent": "were not",
    "whatll": "what will",
    "whatllve": "what will have",
    "whatre": "what are",
    "whatve": "what have",
    "whenve": "when have",
    "whered": "where did",
    "whereve": "where have",
    "wholl": "who will",
    "whollve": "who will have",
    "whove": "who have",
    "whyve": "why have",
    "willve": "will have",
    "wont": "will not",
    "wontve": "will not have",
    "wouldve": "would have",
    "wouldnt": "would not",
    "wouldntve": "would not have",
    "yall": "you all",
    "yalld": "you all would",
    "yalldve": "you all would have",
    "yallre": "you all are",
    "yallve": "you all have",
    "youd": "you would",
    "youdve": "you would have",
    "youll": "you will",
    "youllve": "you will have",
    "youre": "you are",
    "youve": "you have",
}
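
# Illustrative, commented-out sketch of applying this mapping with a simple
# word-boundary regex (the normalize module itself expands contractions via the
# `contractions` package rather than this dict):
#
#     import re
#
#     def expand_contractions(text: str) -> str:
#         pattern = re.compile(r"\b(" + "|".join(contractions_dict) + r")\b")
#         return pattern.sub(lambda m: contractions_dict[m.group(1)], text)
#
#     print(expand_contractions("I cant believe theyre here"))
#     # -> "I cannot believe they are here"
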
@@ -0,0 +1,25 @@
import typing

import logging

logging.basicConfig(
    filename="config.log",
    filemode="w",
    level=logging.INFO,
    format="%(name)s - %(levelname)s - %(message)s",
)

# Global
CONF_DEBUG: bool = True
# NLP
CONF_MODE: str = "default"


def NLP_config(mode: str = "default", debug: bool = True) -> None:
    global CONF_MODE, CONF_DEBUG
    CONF_DEBUG = debug
    if mode in ("accuracy", "speed"):
        CONF_MODE = mode
    else:
        if CONF_DEBUG:
            logging.warning(f"mode: {mode} does not exist")
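

# Illustrative, commented-out usage sketch:
#
#     import config
#     config.NLP_config(mode="typo")                # unknown mode: warns (when debug) and leaves CONF_MODE unchanged
#     config.NLP_config(mode="speed", debug=False)  # fewer results per query, less logging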