NLP from past added. Need to change some relative imports.
parent 434fbb342f
commit 43938d8377

@@ -0,0 +1,3 @@
# Explanation

This is where the explanation of how internet-nlp works will go.
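In short, the pipeline added in this commit scrapes Google results for a query, pulls the page text, normalizes it, splits it into sentences, and keeps only the sentences relevant to the query. A minimal usage sketch (illustrative only; the hunk headers below do not show file names, so the module name `internet` is an assumption):

# Illustrative sketch, not part of the commit. Assumes the scraping module
# below is importable as `internet`.
from internet import google

sentences = google("who is lionel messi")  # scrape -> normalize -> sentencize -> filter
print(sentences[:3])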
@@ -0,0 +1,253 @@
from typing import Any

import asyncio
import logging
import re
import time
import urllib.parse

import aiohttp
from bs4 import BeautifulSoup

# Set up logging
logging.basicConfig(
    filename="internet.log",
    filemode="w",
    level=logging.INFO,
    format="%(name)s - %(levelname)s - %(message)s",
)

# import concurrent.futures

# Import the config module
import sys
from pathlib import Path

sys.path.append(str(Path(__file__).parent.parent.parent) + "/utils")
import config

sys.path.append(str(Path(__file__).parent.parent))
import pickle

from is_relevant import filter_irrelevant
from normalize import normalizer
from sentencize import sentencizer
from urlextract import URLExtract

# Define the user agent
HTTP_USERAGENT: dict[str, str] = {
    "User-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
}
# Define the Google domains to skip in search results
UNWANTED_DOMAINS = {
    "https://www.google.",
    "https://google.",
    "https://webcache.googleusercontent.",
    "http://webcache.googleusercontent.",
    "https://policies.google.",
    "https://support.google.",
    "https://maps.google.",
    "https://youtube.",
    "https://translate.google.",
}

CACHE_FILE_PATH: str = "./internet_cache.pkl"
CACHE_TIME: int = 86400  # one day

URL_EXTRACTOR = URLExtract()

# Load the cache from the file (if it exists)
try:
    with open(CACHE_FILE_PATH, "rb") as f:
        cache = pickle.load(f)
except FileNotFoundError:
    cache = {}


async def fetch_url(session: aiohttp.ClientSession, url: str) -> str:
    """Fetch a URL and return the response body as text."""
    async with session.get(url, headers=HTTP_USERAGENT) as response:
        return await response.text()


async def google_urls(query: str, links: list[str]) -> list[str]:
    """
    Asynchronously search Google for the given query and retrieve the URLs of the top results.

    Parameters:
        query (str): The query to search for.
        links (list[str]): URLs already extracted from the query, used to seed the result list.

    Returns:
        list[str]: A list of the URLs of the top search results.
    """
    # Start from any URLs the caller already extracted from the query
    urls: list[str] = list(links)

    # Determine the number of results to retrieve based on the configuration mode
    # (speed -> 5, accuracy -> 20, default -> 10)
    num_of_res: int = (
        5 if config.CONF_MODE == "speed" else (20 if config.CONF_MODE == "accuracy" else 10)
    )

    # Log the number of results wanted (if debugging is enabled)
    if config.CONF_DEBUG:
        logging.info(f"number of results wanted: {num_of_res}")

    # Construct the search URL
    search_url: str = (
        "https://www.google.com/search?q="
        + str(urllib.parse.quote_plus(query))
        + "&num="
        + str(num_of_res)
    )

    # Log the search URL (if debugging is enabled)
    if config.CONF_DEBUG:
        logging.info(f"url: {search_url}")

    # Create an aiohttp session and use it to fetch the search results
    async with aiohttp.ClientSession() as session:
        response: str = await fetch_url(session, search_url)

    # Wait 10 seconds before parsing the results (to avoid being rate-limited)
    await asyncio.sleep(10.0)

    # Parse the search results using BeautifulSoup
    soup: BeautifulSoup = BeautifulSoup(response, "html.parser")

    # Iterate over the links in the search results
    for link in soup.select("a[href]"):
        # Extract the URL from the link
        url = str(link["href"])

        # Check if the URL is valid and not a Google or YouTube link
        if ("http" in url) and (
            not any(url.startswith(s) for s in UNWANTED_DOMAINS)
        ):
            urls.append(url)
            if config.CONF_DEBUG:
                logging.info(f"added {url}")
            if len(urls) == num_of_res:
                break
    return urls


async def fetch_url_text(
    session: aiohttp.ClientSession, url: str, query: str
) -> list[str]:
    """
    Fetch a page and extract the sentences relevant to the query.

    Parameters:
        session (aiohttp.ClientSession): aiohttp session.
        url (str): The URL to get text from.
        query (str): The query used to filter out irrelevant sentences.

    Returns:
        list[str]: The relevant sentences extracted from the page.
    """
    try:
        async with session.get(url, headers=HTTP_USERAGENT) as response:
            soup: BeautifulSoup = BeautifulSoup(await response.text(), "html.parser")
            text = normalizer(soup.get_text())
            if config.CONF_DEBUG:
                logging.info(f"Text: {text}")
            sentences: list[str] = sentencizer(text)
            sentences = filter_irrelevant(sentences, query)
            return sentences
    except Exception as e:
        # Log the error and continue execution
        logging.error(f"Error occurred: {e}")
        return []


def flatten(nested: list[list[str]]) -> list[str]:
    return [item for sublist in nested for item in sublist]


async def get_text_content(urls: list[str], query: str) -> list[str]:
    # Create an aiohttp session
    async with aiohttp.ClientSession() as session:
        # Create a list of tasks to run concurrently
        tasks: list[Any] = [
            asyncio.create_task(fetch_url_text(session, url, query)) for url in urls
        ]
        # Use asyncio.gather to run the tasks concurrently
        results: list[list[str]] = await asyncio.gather(*tasks)
    sentences: list[str] = flatten(results)
    return sentences


def google(query: str) -> list[str]:
    global cache
    # Pull any URLs out of the query to use as seed links, then strip them from the query text
    links_in_text: list[str] = URL_EXTRACTOR.find_urls(query)
    query = re.sub(r"\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*", "", query)
    entry = cache.get(query)
    if entry is None:
        # No cached result exists, so add a new entry to the cache
        text = asyncio.run(
            get_text_content(asyncio.run(google_urls(query, links_in_text)), query)
        )
        cache[query] = (text, time.time() + CACHE_TIME)  # entry expires after one day
    elif entry[1] < time.time():
        # Cached result has expired, so refresh it
        text = asyncio.run(
            get_text_content(asyncio.run(google_urls(query, links_in_text)), query)
        )
        cache[query] = (text, time.time() + CACHE_TIME)  # entry expires after one day
    else:
        # Cached result is still valid, so return it
        text = entry[0]
    # Save the cache to the file
    with open(CACHE_FILE_PATH, "wb") as f:
        pickle.dump(cache, f)
    # Return the text
    return text


if __name__ == "__main__":
    print(
        google(
            "who is lionel messi https://en.wikipedia.org/wiki/Lionel_Messi https://en.wikipedia.org/wiki/Cristiano_Ronaldo https://www.instagram.com/leomessi/?hl=en"
        )
    )


"""
async + multithreading since web scraping is I/O bound
https://stackoverflow.com/questions/27435284/multiprocessing-vs-multithreading-vs-asyncio

normal
________________________________________________________
Executed in    1.67 secs      fish           external
   usr time  137.29 millis    0.11 millis  137.18 millis
   sys time   38.39 millis    1.25 millis   37.13 millis

Async
________________________________________________________
Executed in  624.82 millis    fish           external
   usr time  141.92 millis    0.11 millis  141.81 millis
   sys time   38.00 millis    1.45 millis   36.55 millis

concurrent
________________________________________________________
Executed in  629.67 millis    fish           external
   usr time  136.72 millis    0.12 millis  136.60 millis
   sys time   36.86 millis    1.32 millis   35.54 millis

multiprocessing
________________________________________________________
Executed in  754.61 millis    fish           external
   usr time  399.25 millis    0.11 millis  399.14 millis
   sys time  164.39 millis    1.49 millis  162.90 millis

OVERALL

multithreading bs4
________________________________________________________
Executed in   14.67 secs      fish           external
   usr time    1.81 secs      0.12 millis    1.81 secs
   sys time    0.14 secs      1.50 millis    0.14 secs

multiprocessing bs4
"""
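The timings quoted in the closing comment read like fish-shell `time` output; a rough in-Python equivalent for comparing runs (illustrative only, not part of the commit; assumes the module above is importable as `internet`) would be:

# Illustrative timing harness, not part of the commit.
import time
from internet import google  # assumed module name

start = time.perf_counter()
google("who is lionel messi")
print(f"Executed in {time.perf_counter() - start:.2f} secs")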
@@ -0,0 +1,82 @@
# mypy: ignore-errors
# Checks whether a sentence is relevant to a question
import concurrent.futures
import pickle

import spacy

# Load the English language model
NLP = spacy.load("en_core_web_sm")

CACHE_FILE_PATH: str = "./is_relevant_cache.pkl"

# Load the relevance cache from disk (if it exists and is readable)
try:
    with open(CACHE_FILE_PATH, "rb") as f:
        cache = pickle.load(f)
except (OSError, EOFError):
    cache = {}


def is_relevant(sentence: str, question: str) -> bool:
    cache_key = (sentence, question)
    if cache_key in cache:
        relevant: bool = cache[cache_key]
        return relevant

    # Process the sentence and question
    doc_sentence = NLP(sentence)
    doc_question = NLP(question)

    # Extract the named entities and important words or phrases from the sentence
    sentence_important = {
        token.text
        for token in doc_sentence
        if token.pos_ in ["NOUN", "PROPN", "ADJ"] or token.ent_type_ != ""
    }
    question_important = {
        token.text
        for token in doc_question
        if token.pos_ in ["NOUN", "PROPN", "ADJ"] or token.ent_type_ != ""
    }

    # Check if any of the named entities or important words or phrases in the question are in the sentence
    for token in question_important:
        if token in sentence_important:
            cache[cache_key] = True
            with open(CACHE_FILE_PATH, "wb") as f:
                pickle.dump(cache, f)
            return True

    # Check if the sentence contains any negative words
    for token in doc_sentence:
        if token.pos_ == "ADV" and token.dep_ == "neg":
            cache[cache_key] = False
            with open(CACHE_FILE_PATH, "wb") as f:
                pickle.dump(cache, f)
            return False

    cache[cache_key] = False
    with open(CACHE_FILE_PATH, "wb") as f:
        pickle.dump(cache, f)
    return False


def filter_irrelevant(sentences: list[str], question: str) -> list[str]:
    relevant_sentences = []
    with concurrent.futures.ThreadPoolExecutor() as executor:
        futures = [
            executor.submit(is_relevant, sentence, question) for sentence in sentences
        ]
        # Pair each future with its own sentence (in submission order) so the
        # relevance result is matched to the right sentence
        for future, sentence in zip(futures, sentences):
            if future.result():
                relevant_sentences.append(sentence)
    return relevant_sentences


# print(filter_irrelevant(["jeff bezos died", "jeff is stupid", "jeff bezos is an entrepenur"], "who is jeff bezos"))
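For reference, the relevance check is a bag-of-important-tokens overlap between the sentence and the question. A minimal illustration (expected values assume typical `en_core_web_sm` tagging; not part of the commit):

# Illustrative only: overlap on the proper nouns "jeff"/"bezos" should yield True,
# while a sentence sharing no nouns, proper nouns, or adjectives with the question yields False.
print(is_relevant("jeff bezos is an entrepreneur", "who is jeff bezos"))  # expected: True
print(is_relevant("the sky was clear yesterday", "who is jeff bezos"))    # expected: False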
@@ -0,0 +1,83 @@
import logging

# logging config
logging.basicConfig(
    filename="normalize.log",
    filemode="w",
    level=logging.INFO,
    format="%(name)s - %(levelname)s - %(message)s",
)

import concurrent.futures
import string
import sys
from pathlib import Path

import contractions
import tokenizers
from tokenizers.normalizers import NFKD, Strip, StripAccents

# Add utils directory to path
sys.path.append(str(Path(__file__).parent.parent) + "/utils")
import config

# Define normalization sequence
NORMALIZER_SEQ: tokenizers.normalizers.Sequence = tokenizers.normalizers.Sequence(
    [NFKD(), Strip(), StripAccents()]
)


def remove_non_ascii(text: str) -> str:
    return text.encode("ascii", errors="ignore").decode()


def normalizer(text: str) -> str:
    """Normalize input text.

    Args:
        text (str): Input text to normalize.

    Returns:
        str: Normalized text.
    """
    # Expand contractions (keep the result -- contractions.fix returns a new string)
    text = contractions.fix(text)
    # Remove punctuation
    text = text.translate(str.maketrans("", "", string.punctuation))
    # Normalize string
    text = NORMALIZER_SEQ.normalize_str(text)
    # Replace whitespace and escape artifacts with plain spaces
    text = (
        text.replace("\n", " ")
        .replace("\t", " ")
        .replace("\r", " ")
        .replace("'", " ")
        .replace("\\x", " ")
        .replace('"', " ")
        .replace("\\", " ")
        .replace("\\r", " ")
        .replace("\\f", " ")
        .replace("\\a", " ")
        .replace(r"\/a", " ")
        .replace(r"\/f", " ")
        .replace(r"\/b", " ")
        .replace("  ", " ")
    )
    text = remove_non_ascii(text)
    if config.CONF_DEBUG:
        logging.info(text)
    return text


def normalize_sentences(sentences: list[str]) -> list[str]:
    normalized_sentences = []
    with concurrent.futures.ThreadPoolExecutor() as executor:
        futures = [executor.submit(normalizer, sentence) for sentence in sentences]
        # Collect results in submission order and keep the normalized text
        for future in futures:
            normalized = future.result()
            if normalized:
                normalized_sentences.append(normalized)
    return normalized_sentences
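A quick illustration of what `normalizer` is meant to produce now that the result of `contractions.fix` is assigned back to `text` (expected output, not a captured run):

# Illustrative only.
print(normalizer("It's a test!\n"))  # expected: "It is a test"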
@@ -0,0 +1,61 @@
import logging

# logging config
logging.basicConfig(
    filename="sentencize.log",
    filemode="w",
    level=logging.INFO,
    format="%(name)s - %(levelname)s - %(message)s",
)

import sys
from pathlib import Path

# Add utils directory to path
sys.path.append(str(Path(__file__).parent.parent) + "/utils")
import concurrent.futures

import config
import nltk

# Make sure the required NLTK data is available
try:
    nltk.data.find("corpora/words")
except LookupError:
    nltk.download("words")
try:
    nltk.data.find("tokenizers/punkt")
except LookupError:
    nltk.download("punkt")

ENGLISH_WORDS = set(nltk.corpus.words.words())


def convert_to_english(text: str) -> str:
    # Keep only tokens that are English words (or non-alphabetic tokens such as numbers)
    return " ".join(
        w
        for w in nltk.wordpunct_tokenize(text)
        if w.lower() in ENGLISH_WORDS or not w.isalpha()
    )


def sentencizer(text: str) -> list[str]:
    initial_sentences: list[str] = nltk.tokenize.sent_tokenize(text)
    english_sentences: list[str] = []

    # Use concurrent.futures.ThreadPoolExecutor to process the sentences concurrently
    with concurrent.futures.ThreadPoolExecutor(max_workers=500) as executor:
        # Create a list of futures to process the sentences concurrently
        futures = [
            executor.submit(convert_to_english, sentence)
            for sentence in initial_sentences
        ]
        # Collect the results in submission order so sentence order is preserved
        for future in futures:
            english_sentences.append(future.result())

    if config.CONF_DEBUG:
        logging.info(f"sentences: {english_sentences}")
    return english_sentences


# print(sentencizer("hello gdfjsfkjd. i amf dfjdslf the greatest efe ve every"))
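For reference, `convert_to_english` keeps only tokens found in the NLTK `words` corpus plus non-alphabetic tokens, so a rough expectation (illustrative; exact output depends on the corpus) is:

# Illustrative only -- exact output depends on the NLTK "words" corpus.
print(convert_to_english("hello gdfjsfkjd world 123"))  # expected: "hello world 123"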
@@ -0,0 +1,110 @@
contractions_dict: dict[str, str] = {
    "aint": "are not",
    "s": " is",
    "arent": "are not",
    "cant": "cannot",
    "cantve": "cannot have",
    "cause": "because",
    "couldve": "could have",
    "couldnt": "could not",
    "couldntve": "could not have",
    "didnt": "did not",
    "doesnt": "does not",
    "dont": "do not",
    "hadnt": "had not",
    "hadntve": "had not have",
    "hasnt": "has not",
    "havent": "have not",
    "hed": "he would",
    "hedve": "he would have",
    "hell": "he will",
    "hellve": "he will have",
    "howd": "how did",
    "howdy": "how do you",
    "howll": "how will",
    "Id": "I would",
    "Idve": "I would have",
    "Ill": "I will",
    "Illve": "I will have",
    "Im": "I am",
    "Ive": "I have",
    "isnt": "is not",
    "itd": "it would",
    "itdve": "it would have",
    "itll": "it will",
    "itllve": "it will have",
    "lets": "let us",
    "maam": "madam",
    "maynt": "may not",
    "mightve": "might have",
    "mightnt": "might not",
    "mightntve": "might not have",
    "mustve": "must have",
    "mustnt": "must not",
    "mustntve": "must not have",
    "neednt": "need not",
    "needntve": "need not have",
    "oclock": "of the clock",
    "oughtnt": "ought not",
    "oughtntve": "ought not have",
    "shant": "shall not",
    "shantve": "shall not have",
    "shed": "she would",
    "shedve": "she would have",
    "shell": "she will",
    "shellve": "she will have",
    "shouldve": "should have",
    "shouldnt": "should not",
    "shouldntve": "should not have",
    "sove": "so have",
    "thatd": "that would",
    "thatdve": "that would have",
    "thered": "there would",
    "theredve": "there would have",
    "theyd": "they would",
    "theydve": "they would have",
    "theyll": "they will",
    "theyllve": "they will have",
    "theyre": "they are",
    "theyve": "they have",
    "tove": "to have",
    "wasnt": "was not",
    "wed": "we would",
    "wedve": "we would have",
    "well": "we will",
    "wellve": "we will have",
    "were": "we are",
    "weve": "we have",
    "werent": "were not",
    "whatll": "what will",
    "whatllve": "what will have",
    "whatre": "what are",
    "whatve": "what have",
    "whenve": "when have",
    "whered": "where did",
    "whereve": "where have",
    "wholl": "who will",
    "whollve": "who will have",
    "whove": "who have",
    "whyve": "why have",
    "willve": "will have",
    "wont": "will not",
    "wontve": "will not have",
    "wouldve": "would have",
    "wouldnt": "would not",
    "wouldntve": "would not have",
    "yall": "you all",
    "yalld": "you all would",
    "yalldve": "you all would have",
    "yallre": "you all are",
    "yallve": "you all have",
    "youd": "you would",
    "youdve": "you would have",
    "youll": "you will",
    "youllve": "you will have",
    "youre": "you are",
    "youve": "you have",
}
@@ -0,0 +1,25 @@
import logging

logging.basicConfig(
    filename="config.log",
    filemode="w",
    level=logging.INFO,
    format="%(name)s - %(levelname)s - %(message)s",
)

# Global
CONF_DEBUG: bool = True
# NLP
CONF_MODE: str = "default"


def NLP_config(mode: str = "default", debug: bool = True) -> None:
    # The global names must match the module-level constants exactly,
    # otherwise the assignments below would only create locals
    global CONF_MODE, CONF_DEBUG
    CONF_DEBUG = debug
    if mode in ("accuracy", "speed", "default"):
        CONF_MODE = mode
    else:
        if CONF_DEBUG:
            logging.warning(f"mode: {mode} does not exist")
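A minimal usage sketch for the config module (illustrative only; assumes it is imported as `config`, as the other files in this commit do):

# Illustrative only.
import config

config.NLP_config(mode="speed", debug=False)  # switch to the 5-result "speed" mode
print(config.CONF_MODE, config.CONF_DEBUG)    # expected: speed False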