NLP from past project added. Need to change some relative imports

main
Thamognya Kodi 2022-12-24 08:20:25 +07:00
parent 434fbb342f
commit 43938d8377
23 changed files with 617 additions and 0 deletions

View File

@@ -0,0 +1,3 @@
# Explanation
This document explains how internet-nlp works.
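
At a high level, internet-nlp takes a natural-language query, googles it, scrapes the result pages asynchronously, normalizes and sentencizes the scraped text, and keeps only the sentences relevant to the query, caching the result for a day. A minimal usage sketch (the module and import paths are assumptions based on this commit's layout and may change once the relative imports are fixed):

```python
import config                # utils/config.py in this commit
from internet import google  # the scraping + filtering pipeline

config.NLP_config(mode="speed")            # "speed" caps the search at 5 result URLs
sentences = google("who is lionel messi")  # list of relevant, normalized sentences
print(sentences[:3])
```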

View File

@@ -0,0 +1,253 @@
from typing import Any
import asyncio
import logging
import re
import time
import urllib
import aiohttp
from bs4 import BeautifulSoup
# Set up logging
logging.basicConfig(
filename="internet.log",
filemode="w",
level=logging.INFO,
format="%(name)s - %(levelname)s - %(message)s",
)
# import concurrent.futures
# Import the config module
import sys
from pathlib import Path
sys.path.append(str(Path(__file__).parent.parent.parent) + "/utils")
import config
sys.path.append(str(Path(__file__).parent.parent))
import pickle
from is_relevant import filter_irrelevant
from normalize import normalizer
from sentencize import sentencizer
from urlextract import URLExtract
# Define the user agent
HTTP_USERAGENT: dict[str, str] = {
"User-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
}
# Define the domains to skip in the search results (Google service pages, caches, YouTube, etc.)
UNWANTED_DOMAINS = {
"https://www.google.",
"https://google.",
"https://webcache.googleusercontent.",
"http://webcache.googleusercontent.",
"https://policies.google.",
"https://support.google.",
"https://maps.google.",
"https://youtube.",
"https://translate.google.",
}
CACHE_FILE_PATH: str = "./internet_cache.pkl"
CACHE_TIME: int = 86400 # one day
URL_EXTRACTOR = URLExtract()
# Load the cache from the file (if it exists)
try:
with open(CACHE_FILE_PATH, "rb") as f:
cache = pickle.load(f)
except FileNotFoundError:
cache = {}
# Define the fetch_url function
async def fetch_url(session: aiohttp.ClientSession, url: str) -> str:
global HTTP_USERAGENT
async with session.get(url, headers=HTTP_USERAGENT) as response:
return await response.text()
# Define the google_urls function
async def google_urls(query: str, links: list[str]) -> list[str]:
"""
Asynchronously search Google for the given query and retrieve the URLs of the top results.
Parameters:
query (str): The query to search for.
Returns:
List[str]: A list of the URLs of the top search results.
"""
global UNWANTED_DOMAINS
# Initialize an empty list to store the URLs
urls: list[str] = links
# Determine the number of results to retrieve based on the configuration mode
    num_of_res: int = (
        5 if config.CONF_MODE == "speed" else (20 if config.CONF_MODE == "accuracy" else 10)
    )
# Log the number of results wanted (if debugging is enabled)
if config.CONF_DEBUG:
logging.info(f"number of results wanted: {num_of_res}")
# Construct the search URL
search_url: str = (
"https://www.google.com/search?q="
+ str(urllib.parse.quote_plus(query))
+ "&num="
+ str(num_of_res)
)
# Log the search URL (if debugging is enabled)
if config.CONF_DEBUG:
logging.info(f"url: {search_url}")
# Create an aiohttp session and use it to fetch the search results
async with aiohttp.ClientSession() as session:
response: str = await fetch_url(session, search_url)
# Wait 10 seconds before parsing the results (to avoid being rate-limited)
await asyncio.sleep(10.0)
# Parse the search results using BeautifulSoup
soup: BeautifulSoup = BeautifulSoup(response, "html.parser")
# Iterate over the links in the search results
for link in list(soup.select("a[href]")):
# Extract the URL from the link
url = str(link["href"])
# Check if the URL is valid and not a Google or YouTube link
if ("http" in url) and (
not any(url.startswith(s) for s in UNWANTED_DOMAINS)
):
urls.append(url)
if config.CONF_DEBUG:
logging.info(f"added {url}")
if len(urls) == num_of_res:
break
return urls
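# Illustrative example (not from the original code): in the default mode the query
# "who is lionel messi" produces the search URL
#   https://www.google.com/search?q=who+is+lionel+messi&num=10
# and google_urls returns up to 10 result URLs, skipping anything in UNWANTED_DOMAINS.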
async def fetch_url_text(
    session: aiohttp.ClientSession, url: str, query: str
) -> list[str]:
    """
    Fetch the given URL and extract the sentences relevant to the query.
    Parameters:
    session (aiohttp.ClientSession): The aiohttp session to use for the request.
    url (str): The URL to fetch and extract text from.
    query (str): The query used to filter out irrelevant sentences.
    Returns:
    list[str]: The relevant, normalized sentences extracted from the page.
    """
global HTTP_USERAGENT
try:
async with session.get(url, headers=HTTP_USERAGENT) as response:
soup: BeautifulSoup = BeautifulSoup(await response.text(), "html.parser")
text = normalizer(soup.get_text())
if config.CONF_DEBUG:
logging.info(f"Text: {text}")
sentences: list[str] = sentencizer(text)
sentences = filter_irrelevant(sentences, query)
return sentences
except Exception as e:
# Log the error and continue execution
logging.error(f"Error occurred: {e}")
return []
def flatten(nested: list[list[str]]) -> list[str]:
    # Flatten the per-URL sentence lists into one flat list of sentences.
    return [item for sublist in nested for item in sublist]
async def get_text_content(urls: list[str], query: str) -> list[str]:
    # Create a list to store the per-URL sentence lists
    results: list[list[str]] = []
# Create an aiohttp session
async with aiohttp.ClientSession() as session:
# Create a list of tasks to run concurrently
tasks: list[Any] = [
asyncio.create_task(fetch_url_text(session, url, query)) for url in urls
]
# Use asyncio.gather to run the tasks concurrently
results = await asyncio.gather(*tasks)
sentences: list[str] = flatten(results)
return sentences
def google(query: str) -> list[str]:
global cache, CACHE_FILE_PATH, CACHE_TIME, URL_EXTRACTOR
links_in_text: list[str] = URL_EXTRACTOR.find_urls(query)
query = re.sub(r"\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*", "", query)
entry = cache.get(query)
if entry is None:
# no query exists, so add a new entry to the cache
text = asyncio.run(
get_text_content(asyncio.run(google_urls(query, links_in_text)), query)
)
        cache[query] = (text, time.time() + CACHE_TIME)  # cache entry expires after CACHE_TIME (one day)
elif entry[1] < time.time():
# update as it expired
text = asyncio.run(
get_text_content(asyncio.run(google_urls(query, links_in_text)), query)
)
        cache[query] = (text, time.time() + CACHE_TIME)  # cache entry expires after CACHE_TIME (one day)
else:
# available so return it
text = entry[0]
# Save the cache to the file
with open(CACHE_FILE_PATH, "wb") as f:
pickle.dump(cache, f)
# Return the text
return text
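# Cache layout (illustrative values): each query maps to (sentences, expiry_timestamp), e.g.
#   cache["who is lionel messi"] = (["Lionel Messi is an Argentine footballer ...", ...], 1671850825.0)
# and an entry is re-fetched once time.time() passes the stored expiry.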
print(
google(
"who is lionel messi https://en.wikipedia.org/wiki/Lionel_Messi https://en.wikipedia.org/wiki/Cristiano_Ronaldo https://www.instagram.com/leomessi/?hl=en"
)
)
"""
async + multithreading was chosen since web scraping is I/O-bound
https://stackoverflow.com/questions/27435284/multiprocessing-vs-multithreading-vs-asyncio
normal
________________________________________________________
Executed in 1.67 secs fish external
usr time 137.29 millis 0.11 millis 137.18 millis
sys time 38.39 millis 1.25 millis 37.13 millis
Async
________________________________________________________
Executed in 624.82 millis fish external
usr time 141.92 millis 0.11 millis 141.81 millis
sys time 38.00 millis 1.45 millis 36.55 millis
concurrent
________________________________________________________
Executed in 629.67 millis fish external
usr time 136.72 millis 0.12 millis 136.60 millis
sys time 36.86 millis 1.32 millis 35.54 millis
multiprocessing
________________________________________________________
Executed in 754.61 millis fish external
usr time 399.25 millis 0.11 millis 399.14 millis
sys time 164.39 millis 1.49 millis 162.90 millis
OVERALL
multithreading bs4
________________________________________________________
Executed in 14.67 secs fish external
usr time 1.81 secs 0.12 millis 1.81 secs
sys time 0.14 secs 1.50 millis 0.14 secs
multiprocessing bs4
"""

View File

@@ -0,0 +1,82 @@
# mypy: ignore-errors
# checks if sentence is relevant to other sentence
import concurrent.futures
import pickle
import spacy
# Load the English language model
NLP = spacy.load("en_core_web_sm")
CACHE_FILE_PATH: str = "./is_relevant_cache.pkl"
try:
with open(CACHE_FILE_PATH, "rb") as f:
cache = pickle.load(f)
except (OSError, EOFError):
cache = {}
def is_relevant(sentence: str, question: str) -> bool:
global NLP
cache_key = (sentence, question)
if cache_key in cache:
relevant: bool = cache[cache_key]
return relevant
# Process the sentence and question
doc_sentence = NLP(sentence)
doc_question = NLP(question)
# Extract the named entities and important words or phrases from the sentence
sentence_important = {
token.text
for token in doc_sentence
if token.pos_ in ["NOUN", "PROPN", "ADJ"] or token.ent_type_ != ""
}
question_important = {
token.text
for token in doc_question
if token.pos_ in ["NOUN", "PROPN", "ADJ"] or token.ent_type_ != ""
}
# Check if any of the named entities or important words or phrases in the question are in the sentence
for token in question_important:
if token in sentence_important:
cache[cache_key] = True
with open(CACHE_FILE_PATH, "wb") as f:
pickle.dump(cache, f)
return True
# Check if the sentence contains any negative words
for token in doc_sentence:
if token.pos_ == "ADV" and token.dep_ == "neg":
cache[cache_key] = False
with open(CACHE_FILE_PATH, "wb") as f:
pickle.dump(cache, f)
return False
cache[cache_key] = False
with open(CACHE_FILE_PATH, "wb") as f:
pickle.dump(cache, f)
return False
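# Worked example (matching the commented call at the bottom of this file): for the
# question "who is jeff bezos", the important tokens typically include "jeff" and
# "bezos", so "jeff bezos died" overlaps and is_relevant returns True, while a sentence
# sharing no nouns, proper nouns, adjectives, or entities returns False. Note that the
# overlap check compares token.text verbatim, so matching is case-sensitive.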
def filter_irrelevant(sentences: list[str], question: str) -> list[str]:
    relevant_sentences = []
    with concurrent.futures.ThreadPoolExecutor() as executor:
        futures = [
            executor.submit(is_relevant, sentence, question) for sentence in sentences
        ]
        # Pair each future with the sentence it was submitted for; iterating
        # as_completed() here would pair results with the wrong sentences.
        for future, sentence in zip(futures, sentences):
            if future.result():
                relevant_sentences.append(sentence)
    return relevant_sentences
# print(filter_irrelevant(["jeff bezos died", "jeff is stupid", "jeff bezos is an entrepreneur"], "who is jeff bezos"))

View File

@@ -0,0 +1,83 @@
import logging
# logging config
logging.basicConfig(
filename="normalize.log",
filemode="w",
level=logging.INFO,
format="%(name)s - %(levelname)s - %(message)s",
)
import concurrent.futures
import string
import sys
from pathlib import Path
import contractions
import tokenizers
from tokenizers.normalizers import NFKD, Strip, StripAccents
# Add utils directory to path
sys.path.append(str(Path(__file__).parent.parent) + "/utils")
import config
# Define normalization sequence
NORMALIZER_SEQ: tokenizers.normalizers.Sequence = tokenizers.normalizers.Sequence(
[NFKD(), Strip(), StripAccents()]
)
def remove_non_ascii(text: str) -> str:
    # Drop any characters that cannot be encoded as ASCII.
    return text.encode("ascii", errors="ignore").decode()
def normalizer(text: str) -> str:
    """Normalize input text.
    Args:
    text (str): Input text to normalize.
    Returns:
    str: Normalized text.
    """
    # Expand contractions (fix() returns a new string, so the result must be reassigned)
    text = contractions.fix(text)
    # Remove punctuation
    text = text.translate(str.maketrans("", "", string.punctuation))
    # Normalize string
    text = NORMALIZER_SEQ.normalize_str(text)
    # Replace control characters and stray escape sequences with spaces,
    # then collapse the double spaces left behind.
    text = (
        text.replace("\n", " ")
        .replace("\t", " ")
        .replace("\r", " ")
        .replace("'", " ")
        .replace("\\x", " ")
        .replace('"', " ")
        .replace("\\r", " ")
        .replace("\\f", " ")
        .replace("\\a", " ")
        .replace(r"\/a", " ")
        .replace(r"\/f", " ")
        .replace(r"\/b", " ")
        # strip any remaining lone backslashes last, so the specific
        # escape sequences above still get a chance to match
        .replace("\\", " ")
        .replace("  ", " ")
    )
text = remove_non_ascii(text)
if config.CONF_DEBUG:
logging.info(text)
return text
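# Illustrative example: normalizer("It's\ta test!\n") should come out roughly as
# "It is a test" (contraction expanded, punctuation stripped, whitespace collapsed).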
def normalize_sentences(sentences: list[str]) -> list[str]:
    normalized_sentences = []
    with concurrent.futures.ThreadPoolExecutor() as executor:
        futures = [executor.submit(normalizer, sentence) for sentence in sentences]
        # Collect the normalized text (not the original sentence), keeping submission
        # order so results stay aligned with the inputs.
        for future in futures:
            normalized = future.result()
            if normalized:
                normalized_sentences.append(normalized)
    return normalized_sentences

View File

@@ -0,0 +1,61 @@
import logging
# logging config
logging.basicConfig(
filename="sentencize.log",
filemode="w",
level=logging.INFO,
format="%(name)s - %(levelname)s - %(message)s",
)
import sys
from pathlib import Path
# Add utils directory to path
sys.path.append(str(Path(__file__).parent.parent) + "/utils")
import concurrent.futures
import config
import nltk
# Make sure the NLTK resources used below are available (the words corpus for
# filtering and the punkt model, which sent_tokenize relies on).
try:
    nltk.data.find("corpora/words")
except LookupError:
    nltk.download("words")
try:
    nltk.data.find("tokenizers/punkt")
except LookupError:
    nltk.download("punkt")
ENGLISH_WORDS = set(nltk.corpus.words.words())
def convert_to_english(text: str) -> str:
global ENGLISH_WORDS
return " ".join(
w
for w in nltk.wordpunct_tokenize(text)
if w.lower() in ENGLISH_WORDS or not w.isalpha()
)
def sentencizer(text: str) -> list[str]:
    initial_sentences: list[str] = nltk.tokenize.sent_tokenize(text)
    english_sentences: list[str] = []
    # Use concurrent.futures.ThreadPoolExecutor to process the sentences concurrently
    with concurrent.futures.ThreadPoolExecutor(max_workers=500) as executor:
        # Create a list of futures to process the sentences concurrently
        futures = [
            executor.submit(convert_to_english, sentence)
            for sentence in initial_sentences
        ]
# Use concurrent.futures.as_completed to retrieve the results of the futures as they complete
for future in concurrent.futures.as_completed(futures):
english_sentences.append(future.result())
if config.CONF_DEBUG:
logging.info(f"sentences: {english_sentences}")
return english_sentences
# print(sentencizer("hello gdfjsfkjd. i amf dfjdslf the greatest efe ve every"))

View File

@@ -0,0 +1,110 @@
contractions_dict: dict[str, str] = {
"aint": "are not",
"s": " is",
"arent": "are not",
"cant": "cannot",
"cantve": "cannot have",
"cause": "because",
"couldve": "could have",
"couldnt": "could not",
"couldntve": "could not have",
"didnt": "did not",
"doesnt": "does not",
"dont": "do not",
"hadnt": "had not",
"hadntve": "had not have",
"hasnt": "has not",
"havent": "have not",
"hed": "he would",
"hedve": "he would have",
"hell": "he will",
"hellve": "he will have",
"howd": "how did",
"howdy": "how do you",
"howll": "how will",
"Id": "I would",
"Idve": "I would have",
"Ill": "I will",
"Illve": "I will have",
"Im": "I am",
"Ive": "I have",
"isnt": "is not",
"itd": "it would",
"itdve": "it would have",
"itll": "it will",
"itllve": "it will have",
"lets": "let us",
"maam": "madam",
"maynt": "may not",
"mightve": "might have",
"mightnt": "might not",
"mightntve": "might not have",
"mustve": "must have",
"mustnt": "must not",
"mustntve": "must not have",
"neednt": "need not",
"needntve": "need not have",
"oclock": "of the clock",
"oughtnt": "ought not",
"oughtntve": "ought not have",
"shant": "shall not",
"shant": "shall not",
"shantve": "shall not have",
"shed": "she would",
"shedve": "she would have",
"shell": "she will",
"shellve": "she will have",
"shouldve": "should have",
"shouldnt": "should not",
"shouldntve": "should not have",
"sove": "so have",
"thatd": "that would",
"thatdve": "that would have",
"thered": "there would",
"theredve": "there would have",
"theyd": "they would",
"theydve": "they would have",
"theyll": "they will",
"theyllve": "they will have",
"theyre": "they are",
"theyve": "they have",
"tove": "to have",
"wasnt": "was not",
"wed": "we would",
"wedve": "we would have",
"well": "we will",
"wellve": "we will have",
"were": "we are",
"weve": "we have",
"werent": "were not",
"whatll": "what will",
"whatllve": "what will have",
"whatre": "what are",
"whatve": "what have",
"whenve": "when have",
"whered": "where did",
"whereve": "where have",
"wholl": "who will",
"whollve": "who will have",
"whove": "who have",
"whyve": "why have",
"willve": "will have",
"wont": "will not",
"wontve": "will not have",
"wouldve": "would have",
"wouldnt": "would not",
"wouldntve": "would not have",
"yall": "you all",
"yalld": "you all would",
"yalldve": "you all would have",
"yallre": "you all are",
"yallve": "you all have",
"youd": "you would",
"youdve": "you would have",
"youll": "you will",
"youllve": "you will have",
"youre": "you are",
"youve": "you have",
}
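# Illustrative helper (an assumption, not part of the original commit): one way this
# table could be consumed is a word-by-word lookup over already de-punctuated text.
def expand_contractions(text: str) -> str:
    """Replace bare contractions ("dont", "youre", ...) using contractions_dict."""
    return " ".join(contractions_dict.get(word, word) for word in text.split())
# Example: expand_contractions("dont worry youre fine") -> "do not worry you are fine"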

View File

@@ -0,0 +1,25 @@
import logging
logging.basicConfig(
filename="config.log",
filemode="w",
level=logging.INFO,
format="%(name)s - %(levelname)s - %(message)s",
)
# Global
CONF_DEBUG: bool = True
# NLP
CONF_MODE: str = "default"
def NLP_config(mode: str = "default", debug: bool = True) -> None:
    # The names here must match the module-level constants above; otherwise the
    # assignments would only create local variables and the config would never change.
    global CONF_MODE, CONF_DEBUG
    CONF_DEBUG = debug
    if mode == "accuracy" or mode == "speed":
        CONF_MODE = mode
    else:
        if CONF_DEBUG:
            logging.warning(f"mode: {mode} does not exist")
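# Example: NLP_config(mode="speed") switches CONF_MODE to "speed" (internet.py then
# requests only 5 Google results), while an unknown mode like NLP_config(mode="fast")
# only logs a warning and leaves CONF_MODE at "default".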