NLP from past added. Need to change some relative imports.
parent 434fbb342f
commit 43938d8377

@@ -0,0 +1,3 @@
# Explanation

Here is where the explanation of how internet-nlp works will go.
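
In the meantime, a minimal usage sketch (the module and import names are assumptions; the diff below does not show file names):

```python
# Hypothetical module name for the scraper added in this commit.
import internet

# Scrapes Google (plus any URLs embedded in the query), normalizes and
# sentencizes each page, and returns only the sentences relevant to the query.
sentences = internet.google(
    "who is lionel messi https://en.wikipedia.org/wiki/Lionel_Messi"
)
print(sentences[:5])
```
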
@@ -0,0 +1,253 @@
from typing import Any

import asyncio
import logging
import re
import time
import urllib.parse

import aiohttp
from bs4 import BeautifulSoup

# Set up logging
logging.basicConfig(
    filename="internet.log",
    filemode="w",
    level=logging.INFO,
    format="%(name)s - %(levelname)s - %(message)s",
)

# import concurrent.futures

# Import the config module
import sys
from pathlib import Path

sys.path.append(str(Path(__file__).parent.parent.parent) + "/utils")
import config

sys.path.append(str(Path(__file__).parent.parent))
import pickle

from is_relevant import filter_irrelevant
from normalize import normalizer
from sentencize import sentencizer
from urlextract import URLExtract

# Define the user agent
HTTP_USERAGENT: dict[str, str] = {
    "User-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
}
# Define the Google domains to filter out of the results
UNWANTED_DOMAINS = {
    "https://www.google.",
    "https://google.",
    "https://webcache.googleusercontent.",
    "http://webcache.googleusercontent.",
    "https://policies.google.",
    "https://support.google.",
    "https://maps.google.",
    "https://youtube.",
    "https://translate.google.",
}

CACHE_FILE_PATH: str = "./internet_cache.pkl"
CACHE_TIME: int = 86400  # one day

URL_EXTRACTOR = URLExtract()

# Load the cache from the file (if it exists)
try:
    with open(CACHE_FILE_PATH, "rb") as f:
        cache = pickle.load(f)
except FileNotFoundError:
    cache = {}
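
# Illustrative note: each cache entry maps a query string to a
# (sentences, expiry_timestamp) tuple, where the expiry is
# time.time() + CACHE_TIME at write time; see google() below.
# A commented-out inspection sketch:
#
#     for cached_query, (cached_sentences, expiry) in cache.items():
#         print(cached_query, len(cached_sentences), "expires at", expiry)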

# Define the fetch_url function
async def fetch_url(session: aiohttp.ClientSession, url: str) -> str:
    global HTTP_USERAGENT
    async with session.get(url, headers=HTTP_USERAGENT) as response:
        return await response.text()


# Define the google_urls function
async def google_urls(query: str, links: list[str]) -> list[str]:
    """
    Asynchronously search Google for the given query and retrieve the URLs of the top results.

    Parameters:
        query (str): The query to search for.
        links (list[str]): URLs already extracted from the query; results are appended to this list.

    Returns:
        list[str]: A list of the URLs of the top search results.
    """
    global UNWANTED_DOMAINS
    # Start from the URLs that were passed in
    urls: list[str] = links

    # Determine the number of results to retrieve based on the configuration mode
    # (speed -> 5, accuracy -> 20, default -> 10)
    num_of_res: int = (
        5
        if config.CONF_MODE == "speed"
        else (20 if config.CONF_MODE == "accuracy" else 10)
    )

    # Log the number of results wanted (if debugging is enabled)
    if config.CONF_DEBUG:
        logging.info(f"number of results wanted: {num_of_res}")

    # Construct the search URL
    search_url: str = (
        "https://www.google.com/search?q="
        + str(urllib.parse.quote_plus(query))
        + "&num="
        + str(num_of_res)
    )

    # Log the search URL (if debugging is enabled)
    if config.CONF_DEBUG:
        logging.info(f"url: {search_url}")

    # Create an aiohttp session and use it to fetch the search results
    async with aiohttp.ClientSession() as session:
        response: str = await fetch_url(session, search_url)

    # Wait 10 seconds before parsing the results (to avoid being rate-limited)
    await asyncio.sleep(10.0)

    # Parse the search results using BeautifulSoup
    soup: BeautifulSoup = BeautifulSoup(response, "html.parser")

    # Iterate over the links in the search results
    for link in soup.select("a[href]"):
        # Extract the URL from the link
        url = str(link["href"])

        # Check if the URL is valid and not a Google or YouTube link
        if ("http" in url) and (
            not any(url.startswith(s) for s in UNWANTED_DOMAINS)
        ):
            urls.append(url)
            if config.CONF_DEBUG:
                logging.info(f"added {url}")
        if len(urls) == num_of_res:
            break
    return urls
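
# Illustrative, commented-out sketch of driving google_urls() on its own from
# synchronous code (assumes network access; the second argument pre-seeds the
# result list with URLs already pulled out of the query):
#
#     seed_links = ["https://en.wikipedia.org/wiki/Lionel_Messi"]
#     result_urls = asyncio.run(google_urls("who is lionel messi", seed_links))
#     # result_urls holds the seeded links plus filtered Google result URLs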


async def fetch_url_text(
    session: aiohttp.ClientSession, url: str, query: str
) -> list[str]:
    """
    Fetch the given URL and extract the sentences relevant to the query.

    Parameters:
        session (aiohttp.ClientSession): aiohttp session
        url (str): The URL to fetch and extract text from.
        query (str): The query used to filter out irrelevant sentences.

    Returns:
        list[str]: The extracted, filtered sentences.
    """
    global HTTP_USERAGENT
    try:
        async with session.get(url, headers=HTTP_USERAGENT) as response:
            soup: BeautifulSoup = BeautifulSoup(await response.text(), "html.parser")
            text = normalizer(soup.get_text())
            if config.CONF_DEBUG:
                logging.info(f"Text: {text}")
            sentences: list[str] = sentencizer(text)
            sentences = filter_irrelevant(sentences, query)
            return sentences
    except Exception as e:
        # Log the error and continue execution
        logging.error(f"Error occurred: {e}")
        return []


def flatten(nested: list[list[Any]]) -> list[Any]:
    # Flatten a list of lists into a single list
    return [item for sublist in nested for item in sublist]


async def get_text_content(urls: list[str], query: str) -> list[str]:
    # Create a list to store the results
    results: list[Any] = []
    # Create an aiohttp session
    async with aiohttp.ClientSession() as session:
        # Create a list of tasks to run concurrently
        tasks: list[Any] = [
            asyncio.create_task(fetch_url_text(session, url, query)) for url in urls
        ]
        # Use asyncio.gather to run the tasks concurrently
        results = await asyncio.gather(*tasks)
    sentences: list[str] = flatten(results)
    return sentences


def google(query: str) -> list[str]:
    global cache, CACHE_FILE_PATH, CACHE_TIME, URL_EXTRACTOR
    # Pull any URLs that are embedded in the query so they can be scraped directly
    links_in_text: list[str] = URL_EXTRACTOR.find_urls(query)
    # Strip those URLs from the query text itself
    query = re.sub(r"\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*", "", query)
    entry = cache.get(query)
    if entry is None:
        # No query exists, so add a new entry to the cache
        # (google_urls runs in one event loop, then get_text_content in a second one)
        text = asyncio.run(
            get_text_content(asyncio.run(google_urls(query, links_in_text)), query)
        )
        cache[query] = (text, time.time() + CACHE_TIME)  # entry expires after CACHE_TIME (one day)
    elif entry[1] < time.time():
        # The entry has expired, so refresh it
        text = asyncio.run(
            get_text_content(asyncio.run(google_urls(query, links_in_text)), query)
        )
        cache[query] = (text, time.time() + CACHE_TIME)  # entry expires after CACHE_TIME (one day)
    else:
        # A fresh entry is available, so return it
        text = entry[0]
    # Save the cache to the file
    with open(CACHE_FILE_PATH, "wb") as f:
        pickle.dump(cache, f)
    # Return the text
    return text


print(
    google(
        "who is lionel messi https://en.wikipedia.org/wiki/Lionel_Messi https://en.wikipedia.org/wiki/Cristiano_Ronaldo https://www.instagram.com/leomessi/?hl=en"
    )
)


"""
async + multithreading since web scraping is I/O bound
https://stackoverflow.com/questions/27435284/multiprocessing-vs-multithreading-vs-asyncio

normal
________________________________________________________
Executed in    1.67 secs      fish           external
   usr time  137.29 millis    0.11 millis  137.18 millis
   sys time   38.39 millis    1.25 millis   37.13 millis

async
________________________________________________________
Executed in  624.82 millis    fish           external
   usr time  141.92 millis    0.11 millis  141.81 millis
   sys time   38.00 millis    1.45 millis   36.55 millis

concurrent
________________________________________________________
Executed in  629.67 millis    fish           external
   usr time  136.72 millis    0.12 millis  136.60 millis
   sys time   36.86 millis    1.32 millis   35.54 millis

multiprocessing
________________________________________________________
Executed in  754.61 millis    fish           external
   usr time  399.25 millis    0.11 millis  399.14 millis
   sys time  164.39 millis    1.49 millis  162.90 millis

OVERALL
multithreading bs4
________________________________________________________
Executed in   14.67 secs      fish           external
   usr time    1.81 secs      0.12 millis    1.81 secs
   sys time    0.14 secs      1.50 millis    0.14 secs

multiprocessing bs4
"""
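
# Illustrative sketch of one way to reproduce such comparisons in-process
# (the figures above come from fish's `time`); wall-clock time is what matters
# here since the work is I/O bound:
#
#     start = time.perf_counter()
#     found_urls = asyncio.run(google_urls("who is lionel messi", []))
#     found_text = asyncio.run(get_text_content(found_urls, "who is lionel messi"))
#     print(f"async pipeline: {time.perf_counter() - start:.2f}s")
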
@@ -0,0 +1,82 @@
# mypy: ignore-errors
# checks if sentence is relevant to other sentence
from typing import List

import concurrent.futures
import pickle

import spacy

# Load the English language model
NLP = spacy.load("en_core_web_sm")
from pathlib import Path

CACHE_FILE_PATH: str = "./is_relevant_cache.pkl"

try:
    with open(CACHE_FILE_PATH, "rb") as f:
        cache = pickle.load(f)
except (OSError, EOFError):
    cache = {}


def is_relevant(sentence: str, question: str) -> bool:
    global NLP

    cache_key = (sentence, question)
    if cache_key in cache:
        relevant: bool = cache[cache_key]
        return relevant
    # Process the sentence and question
    doc_sentence = NLP(sentence)
    doc_question = NLP(question)

    # Extract the named entities and important words or phrases from the sentence
    sentence_important = {
        token.text
        for token in doc_sentence
        if token.pos_ in ["NOUN", "PROPN", "ADJ"] or token.ent_type_ != ""
    }
    question_important = {
        token.text
        for token in doc_question
        if token.pos_ in ["NOUN", "PROPN", "ADJ"] or token.ent_type_ != ""
    }

    # Check if any of the named entities or important words or phrases in the question are in the sentence
    for token in question_important:
        if token in sentence_important:
            cache[cache_key] = True
            with open(CACHE_FILE_PATH, "wb") as f:
                pickle.dump(cache, f)
            return True

    # Check if the sentence contains any negative words
    for token in doc_sentence:
        if token.pos_ == "ADV" and token.dep_ == "neg":
            cache[cache_key] = False
            with open(CACHE_FILE_PATH, "wb") as f:
                pickle.dump(cache, f)
            return False

    cache[cache_key] = False
    with open(CACHE_FILE_PATH, "wb") as f:
        pickle.dump(cache, f)
    return False


def filter_irrelevant(sentences: list[str], question: str) -> list[str]:
    relevant_sentences = []
    with concurrent.futures.ThreadPoolExecutor() as executor:
        futures = [
            executor.submit(is_relevant, sentence, question) for sentence in sentences
        ]
        # Pair each future with its sentence in submission order; as_completed
        # would yield futures out of order and misalign them with the sentences.
        for future, sentence in zip(futures, sentences):
            if future.result():
                relevant_sentences.append(sentence)
    return relevant_sentences


# print(filter_irrelevant(["jeff bezos died", "jeff is stupid", "jeff bezos is an entrepreneur"], "who is jeff bezos"))
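
# Illustrative, commented-out sketch of the overlap heuristic on a single pair.
# is_relevant() returns True when the question and sentence share a noun, proper
# noun, adjective, or named entity; note the comparison is on exact token text,
# so casing matters.
#
#     print(is_relevant("lionel messi is a footballer", "who is lionel messi"))
#     # should print True: "lionel" and "messi" appear in both texts
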
@@ -0,0 +1,83 @@
import logging

# logging config
logging.basicConfig(
    filename="normalize.log",
    filemode="w",
    level=logging.INFO,
    format="%(name)s - %(levelname)s - %(message)s",
)

import concurrent.futures
import string
import sys
from pathlib import Path

import contractions
import tokenizers
from tokenizers.normalizers import NFKD, Lowercase, Strip, StripAccents

# Add utils directory to path
sys.path.append(str(Path(__file__).parent.parent) + "/utils")
import config

# Define normalization sequence
NORMALIZER_SEQ: tokenizers.normalizers.Sequence = tokenizers.normalizers.Sequence(
    [NFKD(), Strip(), StripAccents()]
)


def remove_non_ascii(text: str) -> str:
    return text.encode("ascii", errors="ignore").decode()


def normalizer(text: str) -> str:
    """Normalize input text.

    Args:
        text (str): Input text to normalize.

    Returns:
        str: Normalized text.
    """
    global NORMALIZER_SEQ
    # Expand contractions
    text = contractions.fix(text)
    # Remove punctuation
    text = text.translate(str.maketrans("", "", string.punctuation))
    # Normalize string
    text = NORMALIZER_SEQ.normalize_str(text)
    # Replace whitespace and stray escape sequences with single spaces
    text = (
        text.replace("\n", " ")
        .replace("\t", " ")
        .replace("\r", " ")
        .replace("'", " ")
        .replace("\\x", " ")
        .replace('"', " ")
        .replace("\\", " ")
        .replace("\\r", " ")
        .replace("\\f", " ")
        .replace("\\a", " ")
        .replace(r"\/a", " ")
        .replace(r"\/f", " ")
        .replace(r"\/b", " ")
        .replace("  ", " ")
    )
    text = remove_non_ascii(text)
    if config.CONF_DEBUG:
        logging.info(text)
    return text


def normalize_sentences(sentences: list[str]) -> list[str]:
    normalized_sentences = []
    with concurrent.futures.ThreadPoolExecutor() as executor:
        futures = [executor.submit(normalizer, sentence) for sentence in sentences]
        # Collect results in submission order and keep the normalized text
        # (not the original sentence); skip sentences that normalize to "".
        for future in futures:
            normalized = future.result()
            if normalized:
                normalized_sentences.append(normalized)
    return normalized_sentences
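
# Illustrative, commented-out sketch of normalizer() on a messy string:
# contractions are expanded, punctuation is stripped, accents/non-ASCII
# characters are dropped, and whitespace is squashed.
#
#     print(normalizer("It's a   café, isn't it?"))
#     # roughly "It is a cafe is not it" (exact spacing may vary)
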
@@ -0,0 +1,61 @@
from typing import List

import logging

# logging config
logging.basicConfig(
    filename="sentencize.log",
    filemode="w",
    level=logging.INFO,
    format="%(name)s - %(levelname)s - %(message)s",
)

import sys
from pathlib import Path

# Add utils directory to path
sys.path.append(str(Path(__file__).parent.parent) + "/utils")
import concurrent.futures

import config
import nltk

# Make sure the required NLTK data is available
try:
    nltk.data.find("corpora/words")
except LookupError:
    nltk.download("words")
try:
    nltk.data.find("tokenizers/punkt")
except LookupError:
    nltk.download("punkt")  # sent_tokenize needs the punkt models

ENGLISH_WORDS = set(nltk.corpus.words.words())


def convert_to_english(text: str) -> str:
    global ENGLISH_WORDS
    return " ".join(
        w
        for w in nltk.wordpunct_tokenize(text)
        if w.lower() in ENGLISH_WORDS or not w.isalpha()
    )


def sentencizer(text: str) -> list[str]:
    global convert_to_english
    initial_sentences: list[str] = nltk.tokenize.sent_tokenize(text)
    english_sentences: list[str] = []

    # Use concurrent.futures.ThreadPoolExecutor to process the sentences concurrently
    with concurrent.futures.ThreadPoolExecutor(max_workers=500) as executor:
        # Create a list of futures to process the sentences concurrently
        futures = [
            executor.submit(convert_to_english, sentence)
            for sentence in initial_sentences
        ]
        # Use concurrent.futures.as_completed to retrieve the results of the futures as they complete
        for future in concurrent.futures.as_completed(futures):
            english_sentences.append(future.result())

    if config.CONF_DEBUG:
        logging.info(f"sentences: {english_sentences}")
    return english_sentences


# print(sentencizer("hello gdfjsfkjd. i amf dfjdslf the greatest efe ve every"))
@@ -0,0 +1,110 @@
import typing

contractions_dict: dict[str, str] = {
    "aint": "are not",
    "s": " is",
    "arent": "are not",
    "cant": "cannot",
    "cantve": "cannot have",
    "cause": "because",
    "couldve": "could have",
    "couldnt": "could not",
    "couldntve": "could not have",
    "didnt": "did not",
    "doesnt": "does not",
    "dont": "do not",
    "hadnt": "had not",
    "hadntve": "had not have",
    "hasnt": "has not",
    "havent": "have not",
    "hed": "he would",
    "hedve": "he would have",
    "hell": "he will",
    "hellve": "he will have",
    "howd": "how did",
    "howdy": "how do you",
    "howll": "how will",
    "Id": "I would",
    "Idve": "I would have",
    "Ill": "I will",
    "Illve": "I will have",
    "Im": "I am",
    "Ive": "I have",
    "isnt": "is not",
    "itd": "it would",
    "itdve": "it would have",
    "itll": "it will",
    "itllve": "it will have",
    "lets": "let us",
    "maam": "madam",
    "maynt": "may not",
    "mightve": "might have",
    "mightnt": "might not",
    "mightntve": "might not have",
    "mustve": "must have",
    "mustnt": "must not",
    "mustntve": "must not have",
    "neednt": "need not",
    "needntve": "need not have",
    "oclock": "of the clock",
    "oughtnt": "ought not",
    "oughtntve": "ought not have",
    "shant": "shall not",
    "shantve": "shall not have",
    "shed": "she would",
    "shedve": "she would have",
    "shell": "she will",
    "shellve": "she will have",
    "shouldve": "should have",
    "shouldnt": "should not",
    "shouldntve": "should not have",
    "sove": "so have",
    "thatd": "that would",
    "thatdve": "that would have",
    "thered": "there would",
    "theredve": "there would have",
    "theyd": "they would",
    "theydve": "they would have",
    "theyll": "they will",
    "theyllve": "they will have",
    "theyre": "they are",
    "theyve": "they have",
    "tove": "to have",
    "wasnt": "was not",
    "wed": "we would",
    "wedve": "we would have",
    "well": "we will",
    "wellve": "we will have",
    "were": "we are",
    "weve": "we have",
    "werent": "were not",
    "whatll": "what will",
    "whatllve": "what will have",
    "whatre": "what are",
    "whatve": "what have",
    "whenve": "when have",
    "whered": "where did",
    "whereve": "where have",
    "wholl": "who will",
    "whollve": "who will have",
    "whove": "who have",
    "whyve": "why have",
    "willve": "will have",
    "wont": "will not",
    "wontve": "will not have",
    "wouldve": "would have",
    "wouldnt": "would not",
    "wouldntve": "would not have",
    "yall": "you all",
    "yalld": "you all would",
    "yalldve": "you all would have",
    "yallre": "you all are",
    "yallve": "you all have",
    "youd": "you would",
    "youdve": "you would have",
    "youll": "you will",
    "youllve": "you will have",
    "youre": "you are",
    "youve": "you have",
}
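
# Illustrative, commented-out sketch of applying this mapping with a simple
# word-boundary regex (the normalize module itself expands contractions via the
# `contractions` package rather than this dict):
#
#     import re
#
#     def expand_contractions(text: str) -> str:
#         pattern = re.compile(r"\b(" + "|".join(contractions_dict) + r")\b")
#         return pattern.sub(lambda m: contractions_dict[m.group(1)], text)
#
#     print(expand_contractions("I cant believe theyre here"))
#     # -> "I cannot believe they are here"
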
@@ -0,0 +1,25 @@
import typing

import logging

logging.basicConfig(
    filename="config.log",
    filemode="w",
    level=logging.INFO,
    format="%(name)s - %(levelname)s - %(message)s",
)

# Global
CONF_DEBUG: bool = True
# NLP
CONF_MODE: str = "default"


def NLP_config(mode: str = "default", debug: bool = True) -> None:
    global CONF_MODE, CONF_DEBUG
    CONF_DEBUG = debug
    if mode in ("accuracy", "speed"):
        CONF_MODE = mode
    else:
        if CONF_DEBUG:
            logging.warning(f"mode: {mode} does not exist")
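

# Illustrative, commented-out usage sketch:
#
#     import config
#     config.NLP_config(mode="typo")                # unknown mode: warns (when debug) and leaves CONF_MODE unchanged
#     config.NLP_config(mode="speed", debug=False)  # fewer results per query, less logging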