From 43938d83776dd225fa3c43d095bcac460c01fe5b Mon Sep 17 00:00:00 2001
From: Thamognya Kodi
Date: Sat, 24 Dec 2022 08:20:25 +0700
Subject: [PATCH] NLP from past work added; some relative imports still need changing

---
 internet_ml/NLP/README.md | 3 +
 internet_ml/NLP/context/__init__.py | 0
 internet_ml/NLP/no_context/QA.py | 0
 internet_ml/NLP/no_context/__init__.py | 0
 internet_ml/tools/NLP/__init__.py | 0
 internet_ml/tools/NLP/data/__init__.py | 0
 internet_ml/tools/NLP/data/database.py | 0
 internet_ml/tools/NLP/data/internet.py | 253 ++++++++++++++++++
 internet_ml/tools/NLP/entity_recognition.py | 0
 internet_ml/tools/NLP/is_relevant.py | 82 ++++++
 internet_ml/tools/NLP/ml/__init__.py | 0
 internet_ml/tools/NLP/ml/dataset/__init__.py | 0
 internet_ml/tools/NLP/ml/dataset/long_QA.py | 0
 internet_ml/tools/NLP/ml/long_QA.py | 0
 internet_ml/tools/NLP/ml/nli.py | 0
 .../tools/NLP/ml/question_converter.py | 0
 internet_ml/tools/NLP/ml/summarize.py | 0
 internet_ml/tools/NLP/ml/training/__init__.py | 0
 internet_ml/tools/NLP/normalize.py | 83 ++++++
 internet_ml/tools/NLP/pos_tagging.py | 0
 internet_ml/tools/NLP/sentencize.py | 61 +++++
 internet_ml/utils/NLP/constants.py | 110 ++++++++
 internet_ml/utils/config.py | 25 ++
 23 files changed, 617 insertions(+)
 create mode 100644 internet_ml/NLP/README.md
 create mode 100644 internet_ml/NLP/context/__init__.py
 create mode 100644 internet_ml/NLP/no_context/QA.py
 create mode 100644 internet_ml/NLP/no_context/__init__.py
 create mode 100644 internet_ml/tools/NLP/__init__.py
 create mode 100644 internet_ml/tools/NLP/data/__init__.py
 create mode 100644 internet_ml/tools/NLP/data/database.py
 create mode 100644 internet_ml/tools/NLP/data/internet.py
 create mode 100644 internet_ml/tools/NLP/entity_recognition.py
 create mode 100644 internet_ml/tools/NLP/is_relevant.py
 create mode 100644 internet_ml/tools/NLP/ml/__init__.py
 create mode 100644 internet_ml/tools/NLP/ml/dataset/__init__.py
 create mode 100644 internet_ml/tools/NLP/ml/dataset/long_QA.py
 create mode 100644 internet_ml/tools/NLP/ml/long_QA.py
 create mode 100644 internet_ml/tools/NLP/ml/nli.py
 create mode 100644 internet_ml/tools/NLP/ml/question_converter.py
 create mode 100644 internet_ml/tools/NLP/ml/summarize.py
 create mode 100644 internet_ml/tools/NLP/ml/training/__init__.py
 create mode 100644 internet_ml/tools/NLP/normalize.py
 create mode 100644 internet_ml/tools/NLP/pos_tagging.py
 create mode 100644 internet_ml/tools/NLP/sentencize.py
 create mode 100644 internet_ml/utils/NLP/constants.py
 create mode 100644 internet_ml/utils/config.py

diff --git a/internet_ml/NLP/README.md b/internet_ml/NLP/README.md
new file mode 100644
index 0000000..ae97df7
--- /dev/null
+++ b/internet_ml/NLP/README.md
@@ -0,0 +1,3 @@
+# Explanation
+
+This is where the explanation of how internet-nlp works will go.
diff --git a/internet_ml/NLP/context/__init__.py b/internet_ml/NLP/context/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/internet_ml/NLP/no_context/QA.py b/internet_ml/NLP/no_context/QA.py
new file mode 100644
index 0000000..e69de29
diff --git a/internet_ml/NLP/no_context/__init__.py b/internet_ml/NLP/no_context/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/internet_ml/tools/NLP/__init__.py b/internet_ml/tools/NLP/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/internet_ml/tools/NLP/data/__init__.py b/internet_ml/tools/NLP/data/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/internet_ml/tools/NLP/data/database.py b/internet_ml/tools/NLP/data/database.py
new file mode 100644
index 0000000..e69de29
diff --git a/internet_ml/tools/NLP/data/internet.py b/internet_ml/tools/NLP/data/internet.py
new file mode 100644
index 0000000..4f4c321
--- /dev/null
+++ b/internet_ml/tools/NLP/data/internet.py
@@ -0,0 +1,253 @@
+from typing import Any, Dict, List
+
+import asyncio
+import logging
+import re
+import time
+import urllib.parse
+
+import aiohttp
+from bs4 import BeautifulSoup
+
+# Set up logging
+logging.basicConfig(
+    filename="internet.log",
+    filemode="w",
+    level=logging.INFO,
+    format="%(name)s - %(levelname)s - %(message)s",
+)
+
+# import concurrent.futures
+
+# Import the config module
+import sys
+from pathlib import Path
+
+sys.path.append(str(Path(__file__).parent.parent.parent) + "/utils")
+import config
+
+sys.path.append(str(Path(__file__).parent.parent))
+import pickle
+
+from is_relevant import filter_irrelevant
+from normalize import normalizer
+from sentencize import sentencizer
+from urlextract import URLExtract
+
+# Define the user agent
+HTTP_USERAGENT: dict[str, str] = {
+    "User-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
+}
+# Domains to exclude from the search results
+UNWANTED_DOMAINS = {
+    "https://www.google.",
+    "https://google.",
+    "https://webcache.googleusercontent.",
+    "http://webcache.googleusercontent.",
+    "https://policies.google.",
+    "https://support.google.",
+    "https://maps.google.",
+    "https://youtube.",
+    "https://translate.google.",
+}
+
+CACHE_FILE_PATH: str = "./internet_cache.pkl"
+CACHE_TIME: int = 86400  # one day
+
+URL_EXTRACTOR = URLExtract()
+
+# Load the cache from the file (if it exists)
+try:
+    with open(CACHE_FILE_PATH, "rb") as f:
+        cache = pickle.load(f)
+except FileNotFoundError:
+    cache = {}
+
+
+# Define the fetch_url function
+async def fetch_url(session: aiohttp.ClientSession, url: str) -> str:
+    global HTTP_USERAGENT
+    async with session.get(url, headers=HTTP_USERAGENT) as response:
+        return await response.text()
+
+
+# Define the google_urls function
+async def google_urls(query: str, links: list[str]) -> list[str]:
+    """
+    Asynchronously search Google for the given query and retrieve the URLs of the top results.
+
+    Parameters:
+        query (str): The query to search for.
+        links (list[str]): URLs already extracted from the raw query; results are appended to this list.
+
+    Returns:
+        List[str]: A list of the URLs of the top search results.
+ """ + global UNWANTED_DOMAINS + # Initialize an empty list to store the URLs + urls: list[str] = links + + # Determine the number of results to retrieve based on the configuration mode + num_of_res: int = ( + 5 if config.CONF_MODE == "speed" else (20 if config.CONF_MODE else 10) + ) + + # Log the number of results wanted (if debugging is enabled) + if config.CONF_DEBUG: + logging.info(f"number of results wanted: {num_of_res}") + + # Construct the search URL + search_url: str = ( + "https://www.google.com/search?q=" + + str(urllib.parse.quote_plus(query)) + + "&num=" + + str(num_of_res) + ) + + # Log the search URL (if debugging is enabled) + if config.CONF_DEBUG: + logging.info(f"url: {search_url}") + + # Create an aiohttp session and use it to fetch the search results + async with aiohttp.ClientSession() as session: + response: str = await fetch_url(session, search_url) + + # Wait 10 seconds before parsing the results (to avoid being rate-limited) + await asyncio.sleep(10.0) + + # Parse the search results using BeautifulSoup + soup: BeautifulSoup = BeautifulSoup(response, "html.parser") + + # Iterate over the links in the search results + for link in list(soup.select("a[href]")): + # Extract the URL from the link + url = str(link["href"]) + + # Check if the URL is valid and not a Google or YouTube link + if ("http" in url) and ( + not any(url.startswith(s) for s in UNWANTED_DOMAINS) + ): + urls.append(url) + if config.CONF_DEBUG: + logging.info(f"added {url}") + if len(urls) == num_of_res: + break + return urls + + +async def fetch_url_text( + session: aiohttp.ClientSession, url: str, query: str +) -> list[str]: + """ + Extract the text from the given HTML content. + + Parameters: + session (aiohttp.ClientSession): aiohttp session + url (str): The url content to get text from. + + Returns: + str: The extracted text. 
+ """ + global HTTP_USERAGENT + try: + async with session.get(url, headers=HTTP_USERAGENT) as response: + soup: BeautifulSoup = BeautifulSoup(await response.text(), "html.parser") + text = normalizer(soup.get_text()) + if config.CONF_DEBUG: + logging.info(f"Text: {text}") + sentences: list[str] = sentencizer(text) + sentences = filter_irrelevant(sentences, query) + return sentences + except Exception as e: + # Log the error and continue execution + logging.error(f"Error occurred: {e}") + return [] + + +def flatten(l): + return [item for sublist in l for item in sublist] + + +async def get_text_content(urls: list[str], query: str) -> list[str]: + # Create a list to store the results + results: list[str] = [] + # Create an aiohttp session + async with aiohttp.ClientSession() as session: + # Create a list of tasks to run concurrently + tasks: list[Any] = [ + asyncio.create_task(fetch_url_text(session, url, query)) for url in urls + ] + # Use asyncio.gather to run the tasks concurrently + results = await asyncio.gather(*tasks) + sentences: list[str] = flatten(results) + return sentences + + +def google(query: str) -> list[str]: + global cache, CACHE_FILE_PATH, CACHE_TIME, URL_EXTRACTOR + links_in_text: list[str] = URL_EXTRACTOR.find_urls(query) + query = re.sub(r"\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*", "", query) + entry = cache.get(query) + if entry is None: + # no query exists, so add a new entry to the cache + text = asyncio.run( + get_text_content(asyncio.run(google_urls(query, links_in_text)), query) + ) + cache[query] = (text, time.time() + CACHE_TIME) # cache expires in one hour + elif entry[1] < time.time(): + # update as it expired + text = asyncio.run( + get_text_content(asyncio.run(google_urls(query, links_in_text)), query) + ) + cache[query] = (text, time.time() + CACHE_TIME) # cache expires in one hour + else: + # available so return it + text = entry[0] + # Save the cache to the file + with open(CACHE_FILE_PATH, "wb") as f: + pickle.dump(cache, f) + # Return the text + return text + + +print( + google( + "who is lionel messi https://en.wikipedia.org/wiki/Lionel_Messi https://en.wikipedia.org/wiki/Cristiano_Ronaldo https://www.instagram.com/leomessi/?hl=en" + ) +) + + +""" +async + multithreading since web scraping is I/O bound +https://stackoverflow.com/questions/27435284/multiprocessing-vs-multithreading-vs-asyncio +normal +________________________________________________________ +Executed in 1.67 secs fish external + usr time 137.29 millis 0.11 millis 137.18 millis + sys time 38.39 millis 1.25 millis 37.13 millis +Async +________________________________________________________ +Executed in 624.82 millis fish external + usr time 141.92 millis 0.11 millis 141.81 millis + sys time 38.00 millis 1.45 millis 36.55 millis + +concurrent +________________________________________________________ +Executed in 629.67 millis fish external + usr time 136.72 millis 0.12 millis 136.60 millis + sys time 36.86 millis 1.32 millis 35.54 millis + +multiprocessing +________________________________________________________ +Executed in 754.61 millis fish external + usr time 399.25 millis 0.11 millis 399.14 millis + sys time 164.39 millis 1.49 millis 162.90 millis + +multiprocessing + +OVERALL +multithreading bs4 +________________________________________________________ +Executed in 14.67 secs fish external + usr time 1.81 secs 0.12 millis 1.81 secs + sys time 0.14 secs 1.50 millis 0.14 secs +multiprocessing bs4 +""" diff --git a/internet_ml/tools/NLP/entity_recognition.py 
diff --git a/internet_ml/tools/NLP/entity_recognition.py b/internet_ml/tools/NLP/entity_recognition.py
new file mode 100644
index 0000000..e69de29
diff --git a/internet_ml/tools/NLP/is_relevant.py b/internet_ml/tools/NLP/is_relevant.py
new file mode 100644
index 0000000..05a219d
--- /dev/null
+++ b/internet_ml/tools/NLP/is_relevant.py
@@ -0,0 +1,82 @@
+# mypy: ignore-errors
+# checks whether a sentence is relevant to another sentence (the question)
+from typing import List
+
+import concurrent.futures
+import pickle
+
+import spacy
+
+# Load the English language model
+NLP = spacy.load("en_core_web_sm")
+from pathlib import Path
+
+CACHE_FILE_PATH: str = "./is_relevant_cache.pkl"
+
+try:
+    with open(CACHE_FILE_PATH, "rb") as f:
+        cache = pickle.load(f)
+except (OSError, EOFError):
+    cache = {}
+
+
+def is_relevant(sentence: str, question: str) -> bool:
+    global NLP
+
+    cache_key = (sentence, question)
+    if cache_key in cache:
+        relevant: bool = cache[cache_key]
+        return relevant
+    # Process the sentence and question
+    doc_sentence = NLP(sentence)
+    doc_question = NLP(question)
+
+    # Extract the named entities and important words or phrases from the sentence
+    sentence_important = {
+        token.text
+        for token in doc_sentence
+        if token.pos_ in ["NOUN", "PROPN", "ADJ"] or token.ent_type_ != ""
+    }
+    question_important = {
+        token.text
+        for token in doc_question
+        if token.pos_ in ["NOUN", "PROPN", "ADJ"] or token.ent_type_ != ""
+    }
+
+    # Check if any of the named entities or important words or phrases in the question are in the sentence
+    for token in question_important:
+        if token in sentence_important:
+            cache[cache_key] = True
+            with open(CACHE_FILE_PATH, "wb") as f:
+                pickle.dump(cache, f)
+            return True
+
+    # Check if the sentence contains any negative words
+    for token in doc_sentence:
+        if token.pos_ == "ADV" and token.dep_ == "neg":
+            cache[cache_key] = False
+            with open(CACHE_FILE_PATH, "wb") as f:
+                pickle.dump(cache, f)
+            return False
+
+    cache[cache_key] = False
+    with open(CACHE_FILE_PATH, "wb") as f:
+        pickle.dump(cache, f)
+    return False
+
+
+def filter_irrelevant(sentences: list[str], question: str) -> list[str]:
+    # executor.map preserves the input order; zipping the original sentences
+    # against as_completed() would pair each sentence with the wrong future
+    with concurrent.futures.ThreadPoolExecutor() as executor:
+        keep_flags = list(
+            executor.map(is_relevant, sentences, [question] * len(sentences))
+        )
+    relevant_sentences = [
+        sentence for sentence, keep in zip(sentences, keep_flags) if keep
+    ]
+    return relevant_sentences
+
+
+# print(filter_irrelevant(["jeff bezos died", "jeff is stupid", "jeff bezos is an entrepenur"], "who is jeff bezos"))
diff --git a/internet_ml/tools/NLP/ml/__init__.py b/internet_ml/tools/NLP/ml/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/internet_ml/tools/NLP/ml/dataset/__init__.py b/internet_ml/tools/NLP/ml/dataset/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/internet_ml/tools/NLP/ml/dataset/long_QA.py b/internet_ml/tools/NLP/ml/dataset/long_QA.py
new file mode 100644
index 0000000..e69de29
diff --git a/internet_ml/tools/NLP/ml/long_QA.py b/internet_ml/tools/NLP/ml/long_QA.py
new file mode 100644
index 0000000..e69de29
diff --git a/internet_ml/tools/NLP/ml/nli.py b/internet_ml/tools/NLP/ml/nli.py
new file mode 100644
index 0000000..e69de29
diff --git a/internet_ml/tools/NLP/ml/question_converter.py b/internet_ml/tools/NLP/ml/question_converter.py
new file mode 100644
index 0000000..e69de29
diff --git a/internet_ml/tools/NLP/ml/summarize.py b/internet_ml/tools/NLP/ml/summarize.py
new file mode 100644
index 0000000..e69de29
diff --git a/internet_ml/tools/NLP/ml/training/__init__.py b/internet_ml/tools/NLP/ml/training/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/internet_ml/tools/NLP/normalize.py b/internet_ml/tools/NLP/normalize.py
new file mode 100644
index 0000000..cb65ddd
--- /dev/null
+++ b/internet_ml/tools/NLP/normalize.py
@@ -0,0 +1,83 @@
+import logging
+
+# logging config
+logging.basicConfig(
+    filename="normalize.log",
+    filemode="w",
+    level=logging.INFO,
+    format="%(name)s - %(levelname)s - %(message)s",
+)
+
+import concurrent.futures
+import string
+import sys
+from pathlib import Path
+
+import contractions
+import tokenizers
+from tokenizers.normalizers import NFKD, Lowercase, Strip, StripAccents
+
+# Add utils directory to path
+sys.path.append(str(Path(__file__).parent.parent) + "/utils")
+import config
+
+# Define normalization sequence
+NORMALIZER_SEQ: tokenizers.normalizers.Sequence = tokenizers.normalizers.Sequence(
+    [NFKD(), Strip(), StripAccents()]
+)
+
+
+def remove_non_ascii(text: str) -> str:
+    return text.encode("ascii", errors="ignore").decode()
+
+
+def normalizer(text: str) -> str:
+    """Normalize input text.
+
+    Args:
+        text (str): Input text to normalize.
+
+    Returns:
+        str: Normalized text.
+    """
+    global NORMALIZER_SEQ
+    # Expand contractions (contractions.fix returns a new string)
+    text = contractions.fix(text)
+    # Remove punctuation
+    text = text.translate(str.maketrans("", "", string.punctuation))
+    # Normalize string
+    text = NORMALIZER_SEQ.normalize_str(text)
+    text = (
+        text.replace("\n", " ")
+        .replace("\t", " ")
+        .replace("\r", " ")
+        .replace("'", " ")
+        .replace("\\x", " ")
+        .replace('"', " ")
+        .replace("\\", " ")
+        .replace("\\r", " ")
+        .replace("\\f", " ")
+        .replace("\\a", " ")
+        .replace(r"\/a", " ")
+        .replace(r"\/f", " ")
+        .replace(r"\/b", " ")
+        .replace("  ", " ")
+    )
+    text = remove_non_ascii(text)
+    if config.CONF_DEBUG:
+        logging.info(text)
+    return text
+
+
+def normalize_sentences(sentences: list[str]) -> list[str]:
+    # Normalize concurrently; executor.map returns the normalized strings in input order
+    with concurrent.futures.ThreadPoolExecutor() as executor:
+        normalized = list(executor.map(normalizer, sentences))
+    # Drop sentences that normalized to an empty string
+    return [sentence for sentence in normalized if sentence]
diff --git a/internet_ml/tools/NLP/pos_tagging.py b/internet_ml/tools/NLP/pos_tagging.py
new file mode 100644
index 0000000..e69de29
diff --git a/internet_ml/tools/NLP/sentencize.py b/internet_ml/tools/NLP/sentencize.py
new file mode 100644
index 0000000..6c132aa
--- /dev/null
+++ b/internet_ml/tools/NLP/sentencize.py
@@ -0,0 +1,61 @@
+from typing import List
+
+import logging
+
+# logging config
+logging.basicConfig(
+    filename="sentencize.log",
+    filemode="w",
+    level=logging.INFO,
+    format="%(name)s - %(levelname)s - %(message)s",
+)
+
+import sys
+from pathlib import Path
+
+# Add utils directory to path
+sys.path.append(str(Path(__file__).parent.parent) + "/utils")
+import concurrent.futures
+
+import config
+import nltk
+
+# Make sure the required NLTK data is available
+try:
+    nltk.data.find("corpora/words")
+except LookupError:
+    nltk.download("words")
+try:
+    nltk.data.find("tokenizers/punkt")
+except LookupError:
+    nltk.download("punkt")
+
+ENGLISH_WORDS = set(nltk.corpus.words.words())
+
+
+def convert_to_english(text: str) -> str:
+    global ENGLISH_WORDS
+    return " ".join(
+        w
+        for w in nltk.wordpunct_tokenize(text)
+        if w.lower() in ENGLISH_WORDS or not w.isalpha()
+    )
+
+
+def sentencizer(text: str) -> list[str]:
+    global convert_to_english
+    initial_sentences: list[str] = nltk.tokenize.sent_tokenize(text)
+    english_sentences: list[str] = []
+
+    # Process the sentences concurrently; executor.map keeps the original sentence order
+    with concurrent.futures.ThreadPoolExecutor() as executor:
+        english_sentences = list(executor.map(convert_to_english, initial_sentences))
+
+    if config.CONF_DEBUG:
+        logging.info(f"sentences: {english_sentences}")
+    return english_sentences
+
+
+# print(sentencizer("hello gdfjsfkjd. i amf dfjdslf the greatest efe ve every"))
diff --git a/internet_ml/utils/NLP/constants.py b/internet_ml/utils/NLP/constants.py
new file mode 100644
index 0000000..2be2ebf
--- /dev/null
+++ b/internet_ml/utils/NLP/constants.py
@@ -0,0 +1,110 @@
+import typing
+
+contractions_dict: dict[str, str] = {
+    "aint": "are not",
+    "s": " is",
+    "arent": "are not",
+    "cant": "cannot",
+    "cantve": "cannot have",
+    "cause": "because",
+    "couldve": "could have",
+    "couldnt": "could not",
+    "couldntve": "could not have",
+    "didnt": "did not",
+    "doesnt": "does not",
+    "dont": "do not",
+    "hadnt": "had not",
+    "hadntve": "had not have",
+    "hasnt": "has not",
+    "havent": "have not",
+    "hed": "he would",
+    "hedve": "he would have",
+    "hell": "he will",
+    "hellve": "he will have",
+    "howd": "how did",
+    "howdy": "how do you",
+    "howll": "how will",
+    "Id": "I would",
+    "Idve": "I would have",
+    "Ill": "I will",
+    "Illve": "I will have",
+    "Im": "I am",
+    "Ive": "I have",
+    "isnt": "is not",
+    "itd": "it would",
+    "itdve": "it would have",
+    "itll": "it will",
+    "itllve": "it will have",
+    "lets": "let us",
+    "maam": "madam",
+    "maynt": "may not",
+    "mightve": "might have",
+    "mightnt": "might not",
+    "mightntve": "might not have",
+    "mustve": "must have",
+    "mustnt": "must not",
+    "mustntve": "must not have",
+    "neednt": "need not",
+    "needntve": "need not have",
+    "oclock": "of the clock",
+    "oughtnt": "ought not",
+    "oughtntve": "ought not have",
+    "shant": "shall not",
+    "shantve": "shall not have",
+    "shed": "she would",
+    "shedve": "she would have",
+    "shell": "she will",
+    "shellve": "she will have",
+    "shouldve": "should have",
+    "shouldnt": "should not",
+    "shouldntve": "should not have",
+    "sove": "so have",
+    "thatd": "that would",
+    "thatdve": "that would have",
+    "thered": "there would",
+    "theredve": "there would have",
+    "theyd": "they would",
+    "theydve": "they would have",
+    "theyll": "they will",
+    "theyllve": "they will have",
+    "theyre": "they are",
+    "theyve": "they have",
+    "tove": "to have",
+    "wasnt": "was not",
+    "wed": "we would",
+    "wedve": "we would have",
+    "well": "we will",
+    "wellve": "we will have",
+    "were": "we are",
+    "weve": "we have",
+    "werent": "were not",
+    "whatll": "what will",
+    "whatllve": "what will have",
+    "whatre": "what are",
+    "whatve": "what have",
+    "whenve": "when have",
+    "whered": "where did",
+    "whereve": "where have",
+    "wholl": "who will",
+    "whollve": "who will have",
+    "whove": "who have",
+    "whyve": "why have",
+    "willve": "will have",
+    "wont": "will not",
+    "wontve": "will not have",
+    "wouldve": "would have",
+    "wouldnt": "would not",
+    "wouldntve": "would not have",
+    "yall": "you all",
+    "yalld": "you all would",
+    "yalldve": "you all would have",
+    "yallre": "you all are",
+    "yallve": "you all have",
all have", + "youd": "you would", + "youdve": "you would have", + "youll": "you will", + "youllve": "you will have", + "youre": "you are", + "youve": "you have", +} diff --git a/internet_ml/utils/config.py b/internet_ml/utils/config.py new file mode 100644 index 0000000..21b9372 --- /dev/null +++ b/internet_ml/utils/config.py @@ -0,0 +1,25 @@ +import typing + +import logging + +logging.basicConfig( + filename="config.log", + filemode="w", + level=logging.INFO, + format="%(name)s - %(levelname)s - %(message)s", +) + +# Global +CONF_DEBUG: bool = True +# NLP +CONF_MODE: str = "default" + + +def NLP_config(mode: str = "default", debug: bool = True) -> None: + global conf_MODE, conf_DEBUG + CONF_DEBUG = debug + if mode == "accuracy" or mode == "speed": + CONF_MODE = mode + else: + if CONF_DEBUG: + logging.warn(f"mode: {mode} does not exist")