updates to code via mypy
parent 16aa8cea4a
commit 0ab358d6ce
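The diffs below add type annotations so the modules pass a mypy static type check. The exact checker invocation is not recorded in the commit; a plausible way to reproduce the warnings these changes address (the command form is an assumption, since the project uses Poetry per the pyproject.toml diff at the end):

    # Assumed invocation; the repository may pin different mypy flags.
    poetry run mypy .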
@@ -1,17 +1,36 @@
-from typing import Any
+from typing import Any, List, Tuple

+import logging
 import sys
 from pathlib import Path

 from transformers import pipeline

+logging.basicConfig(
+    filename="QA.log",
+    filemode="w",
+    level=logging.INFO,
+    format="%(name)s - %(levelname)s - %(message)s",
+)
+
 sys.path.append(str(Path(__file__).parent.parent.parent) + "/tools/NLP/data")
+sys.path.append(str(Path(__file__).parent.parent.parent) + "/utils")
+import config
 import internet

-QA_MODEL = pipeline("question-answering")
+QA_MODEL: Any = pipeline("question-answering")


-def answer(query: str) -> Any:
+def answer(query: str) -> tuple[Any, list[str]]:
     global QA_MODEL
-    results = internet.google(query)
-    return (QA_MODEL(question=query, context=str(results[0])), results[1])
+    results: tuple[list[str], list[str]] = internet.google(query)
+    answer: tuple[Any, list[str]] = (
+        QA_MODEL(question=query, context=str(results[0])),
+        results[1],
+    )
+    if config.CONF_DEBUG:
+        logging.info(f"Answer: {answer}")
+    return answer
+
+
+# def custom_answer
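With the new signature, answer() returns the model output together with the source URLs instead of a single untyped value. A minimal usage sketch (hypothetical call site, not part of the commit; it assumes the Google API credentials in config are already set):

    # Hypothetical usage of the retyped answer().
    result, sources = answer("Who was the first person to walk on the Moon?")
    print(result)   # output of the transformers question-answering pipeline
    print(sources)  # list[str] of URLs that internet.google() scraped

The tuple[Any, list[str]] annotation makes this two-part contract visible to mypy, which previously saw only Any.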
@@ -1,5 +1,6 @@
 from typing import Any, List, Tuple

+import logging
 import os
 import sys
 from pathlib import Path
@@ -7,6 +8,13 @@ from pathlib import Path
 import dotenv
 import requests

+logging.basicConfig(
+    filename="internet.log",
+    filemode="w",
+    level=logging.INFO,
+    format="%(name)s - %(levelname)s - %(message)s",
+)
+
 sys.path.append(str(Path(__file__).parent.parent.parent.parent) + "/utils/NLP")
 sys.path.append(str(Path(__file__).parent.parent.parent.parent) + "/utils")
 sys.path.append(str(Path(__file__).parent.parent))
@@ -19,20 +27,26 @@ import re
 import aiohttp
 import config
 from bs4 import BeautifulSoup
-from is_relevant import filter_irrelevant
 from normalize import normalizer
+from relevancy import filter_irrelevant
 from sentencize import sentencizer
 from urlextract import URLExtract

 dotenv.load_dotenv()


 HTTP_USERAGENT: dict[str, str] = {
     "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
 }


 def google_urls(query: str, links: list[str]) -> list[str]:
+    try:
         # Send the request to the Google Search API
+        if config.GOOGLE_API_KEY == "":
+            exit("ERROR: Google API Key not found")
+        if config.GOOGLE_SEARCH_ENGINE_ID == "":
+            exit("ERROR: Google Search Engine Id not found")
         response = requests.get(
             "https://www.googleapis.com/customsearch/v1",
             params={
@@ -44,26 +58,45 @@ def google_urls(query: str, links: list[str]) -> list[str]:
         results = response.json()["items"]
         # Print the search results
         num_of_res: int = (
-            5 if config.CONF_MODE == "speed" else (20 if config.CONF_MODE else 10)
+            5
+            if config.NLP_CONF_MODE == "speed"
+            else (20 if config.NLP_CONF_MODE else 10)
         )
         for result in results:
             links.append(result["link"])
             if len(links) == num_of_res:
                 break
+        if config.CONF_DEBUG:
+            logging.info(f"Links: {links}")
         return links
+    except Exception:
+        if config.CONF_DEBUG:
+            logging.info(f"Error: {Exception}")
+        exit(
+            f"There is an unknown exception: {Exception}. Since no links are scraped, nothing further can continue. Please report it at https://github.com/thamognya/internet_ml/issues or mail me at contact@thamognya.com"
+        )


-async def fetch_url(session, url, question):
+async def fetch_url(session: Any, url: str, question: Any) -> list[str]:
+    try:
         async with session.get(url, headers=HTTP_USERAGENT) as response:
             html = await response.text()
             soup = BeautifulSoup(html, "html.parser")
             text = soup.get_text()
             normalized_text = normalizer(text)
-            sentences = sentencizer(normalized_text)
+            sentences: list[str] = sentencizer(normalized_text)
+            if config.CONF_DEBUG:
+                logging.info(f"Sentences: {sentences}")
             return sentences
+    except aiohttp.ClientConnectorError:
+        if config.CONF_DEBUG:
+            logging.info(f"ClientConnector Error: Likely a connection issue with wifi")
+        return [""]
+    except Exception:
+        return [""]


-async def fetch_urls(urls, question):
+async def fetch_urls(urls: list[str], question: str) -> Any:
     async with aiohttp.ClientSession() as session:
         tasks = [asyncio.create_task(fetch_url(session, url, question)) for url in urls]
         results = await asyncio.gather(*tasks)
@@ -74,7 +107,7 @@ def flatten(a: list[list[Any]]) -> list[Any]:
     return list(itertools.chain(*a))


-def get_url_contents(urls, question):
+def get_url_contents(urls: list[str], question: str) -> list[str]:
     loop = asyncio.new_event_loop()
     asyncio.set_event_loop(loop)
     contents = loop.run_until_complete(fetch_urls(urls, question))
@@ -82,15 +115,54 @@ def get_url_contents(urls, question):
     return flatten(contents)


-URL_EXTRACTOR = URLExtract()
+URL_EXTRACTOR: URLExtract = URLExtract()


 def google(query: str) -> tuple[list[str], list[str]]:
     global URL_EXTRACTOR
+    # Hard coded exceptions - START
     if "Thamognya" in query or "thamognya" in query:
         return (["The smartest person in the world"], ["I decided it"])
+    if "modi" in query or "Modi" in query:
+        return (
+            ["Prime Minister of India"],
+            [
+                "https://www.narendramodi.in/",
+                "https://en.wikipedia.org/wiki/Narendra_Modi",
+                "https://twitter.com/narendramodi?ref_src=twsrc%5Egoogle%7Ctwcamp%5Eserp%7Ctwgr%5Eauthor",
+                "https://www.instagram.com/narendramodi/?hl=en",
+                "https://www.facebook.com/narendramodi/",
+                "http://www.pmindia.gov.in/en/",
+                "https://timesofindia.indiatimes.com/topic/Narendra-Modi",
+                "https://www.britannica.com/biography/Narendra-Modi",
+                "https://indianexpress.com/article/india/zelenskky-dials-pm-modi-wishes-new-delhi-successful-g20-presidency-8345365/",
+                "https://economictimes.indiatimes.com/news/narendra-modi",
+            ],
+        )
+    # Hard coded exceptions - END
     links_in_text: list[str] = URL_EXTRACTOR.find_urls(query)
     query = re.sub(r"\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*", "", query)
     urls = google_urls(query, links_in_text)
     content = get_url_contents(urls, query)
+    if config.CONF_DEBUG:
+        logging.info(f"Urls: {urls}")
+        logging.info(f"Content: {content}")
     return (content, urls)


+"""
+Timing:
+import time
+start_time = time.time()
+google("Who is Elon Musk")
+print("--- %s seconds ---" % (time.time() - start_time))
+
+# Results:
+
+# --- 2.2230100631713867 seconds ---
+
+# ________________________________________________________
+# Executed in    4.73 secs    fish    external
+#    usr time    3.35 secs     85.00 micros    3.35 secs
+#    sys time    1.86 secs    956.00 micros    1.86 secs
+"""
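The timing notes in the docstring above already exercise the module's entry point. For reference, a sketch of the (content, urls) contract of google() (hypothetical call site; real credentials are required, and the key values here are placeholders):

    # Hypothetical usage; key values are placeholders, not from the commit.
    import config
    config.API_CONFIG(_GOOGLE_API_KEY="<key>", _GOOGLE_SEARCH_ENGINE_ID="<id>")
    sentences, urls = google("Who is Elon Musk")
    # sentences: normalized sentences scraped from each search result page
    # urls: the Google Custom Search links those sentences came from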
@@ -67,7 +67,7 @@ def normalizer(text: str) -> str:
         .replace("  ", " ")
     )
     text = remove_non_ascii(text)
-    if config.NLP_CONF_DEBUG:
+    if config.CONF_DEBUG:
         logging.info(text)
     return text

@@ -81,4 +81,6 @@ def normalize_sentences(sentences: list[str]) -> list[str]:
     ):
         if future.result():
             normalized_sentences.append(sentence)
+    if config.CONF_DEBUG:
+        logging.info(f"Normalized Sentences: {normalized_sentences}")
     return normalized_sentences
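This file picks up the same rename as the rest of the commit: the per-module NLP_CONF_DEBUG flag is replaced by the single CONF_DEBUG flag defined in config.py. A sketch of the effect (flag name as introduced in this commit):

    # One flag now gates the logging.info calls across the QA, internet,
    # normalize, sentencize and relevancy modules.
    import config
    config.CONF_DEBUG = False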
@@ -1,6 +1,9 @@
 from typing import Any

 import concurrent.futures
+import logging
+import sys
+from pathlib import Path

 import nltk
 import numpy as np
@@ -12,6 +15,18 @@ from nltk.tokenize import word_tokenize
 # from scipy.spatial.distance import jaccard
 from sklearn.feature_extraction.text import TfidfVectorizer

+sys.path.append(str(Path(__file__).parent.parent.parent.parent) + "/utils/NLP")
+sys.path.append(str(Path(__file__).parent.parent.parent.parent) + "/utils")
+
+import config
+
+logging.basicConfig(
+    filename="relevancy.log",
+    filemode="w",
+    level=logging.INFO,
+    format="%(name)s - %(levelname)s - %(message)s",
+)
+
 nltk.download("punkt")
 nltk.download("stopwords")
 nltk.download("wordnet")
@@ -64,12 +79,27 @@ def is_answer(sentence: str, question: str, threshold: float = 0.3) -> bool:
     answer: bool
     if main_verb is None:
         answer = similarity >= threshold
-        return answer
     else:
         answer = main_verb in sentence_tokens and similarity >= threshold
+    if config.CONF_DEBUG:
+        logging.info(
+            f"Is Relevant -> Sentence: {sentence}, Question: {question} -> Relevancy: {answer}"
+        )
     return answer


+def filter_irrelevant(sentences: list[str], question: str) -> list[str]:
+    # Create a list to store the relevant sentences
+    relevant_sentences = []
+    for sentence in sentences:
+        if is_answer(sentence, question):
+            relevant_sentences.append(sentence)
+            print(sentence)
+    if config.CONF_DEBUG:
+        logging.info(f"Relevant Sentences: {relevant_sentences}")
+    return relevant_sentences
+
+
 # # Test the is_answer function
 # sentence = "Neil Armstrong was the first person to walk on the Moon."
 # question = "Who was the first person to walk on the Moon?"
@@ -81,15 +111,15 @@ def is_answer(sentence: str, question: str, threshold: float = 0.3) -> bool:
 # from concurrent.futures import ThreadPoolExecutor
 # import concurrent.futures

-def filter_irrelevant(sentences: list[str], question: str) -> list[str]:
-    # Create a list to store the relevant sentences
-    relevant_sentences = []
-    for sentence in sentences:
-        if is_answer(sentence, question):
-            relevant_sentences.append(sentence)
-            print(sentence)
-    return relevant_sentences
-# print(filter_irrelevant_(["Neil Armstrong is an American Astronaut", "Neil Armstrong is dead", "Neil Armstrng is fake"], "Who is Neil Armstrong?"))
+"""
+print(
+    filter_irrelevant(
+        [
+            "Neil Armstrong is an American Astronaut",
+            "Neil Armstrong is dead",
+            "Neil Armstrng is fake",
+        ],
+        "Who is Neil Armstrong?",
+    )
+)
+"""
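The docstring test added at the bottom of the file shows the intended call shape; a runnable version of that same check (inputs taken verbatim from the commit's own test, including its misspellings):

    # Same inputs as the commented test in the diff above.
    print(
        filter_irrelevant(
            [
                "Neil Armstrong is an American Astronaut",
                "Neil Armstrong is dead",
                "Neil Armstrng is fake",
            ],
            "Who is Neil Armstrong?",
        )
    )

A sentence survives the filter when is_answer() clears the similarity threshold (default 0.3) and, when the question has a detectable main verb, when that verb also appears in the sentence.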
@@ -1,4 +1,4 @@
-from typing import List
+from typing import Any, List

 import logging

@@ -26,7 +26,7 @@ try:
 except LookupError:
     nltk.download("words")

-ENGLISH_WORDS = set(nltk.corpus.words.words())
+ENGLISH_WORDS: Any = set(nltk.corpus.words.words())


 def convert_to_english(text: str) -> str:
@@ -54,7 +54,7 @@ def sentencizer(text: str) -> list[str]:
     for future in concurrent.futures.as_completed(futures):
         english_sentences.append(future.result())

-    if config.NLP_CONF_DEBUG:
+    if config.CONF_DEBUG:
         logging.info(f"sentences: {english_sentences}")
     return english_sentences
@@ -6,12 +6,11 @@ logging.basicConfig(
     level=logging.INFO,
     format="%(name)s - %(levelname)s - %(message)s",
 )
+# General
+CONF_DEBUG: bool = True
+# Google
 GOOGLE_API_KEY: str = ""
 GOOGLE_SEARCH_ENGINE_ID: str = ""

-# Global
-NLP_CONF_DEBUG: bool = True
 # NLP
 NLP_CONF_MODE: str = "default"

@@ -20,13 +19,17 @@ def API_CONFIG(_GOOGLE_API_KEY: str = "", _GOOGLE_SEARCH_ENGINE_ID: str = "") ->
     global GOOGLE_SEARCH_ENGINE_ID, GOOGLE_API_KEY
     GOOGLE_API_KEY = _GOOGLE_API_KEY
     GOOGLE_SEARCH_ENGINE_ID = _GOOGLE_SEARCH_ENGINE_ID
+    if CONF_DEBUG and _GOOGLE_API_KEY != "":
+        logging.info(f"API_KEY set")
+    if CONF_DEBUG and _GOOGLE_SEARCH_ENGINE_ID != "":
+        logging.info(f"SEARCH_ENGINE_ID set")


 def NLP_config(mode: str = "default", debug: bool = True) -> None:
     global conf_MODE, conf_DEBUG
-    NLP_CONF_DEBUG = debug
+    CONF_DEBUG = debug
     if mode == "accuracy" or mode == "speed":
         NLP_CONF_MODE = mode
     else:
-        if NLP_CONF_DEBUG:
+        if CONF_DEBUG:
             logging.warn(f"mode: {mode} does not exist")
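A sketch of how the two configuration helpers are intended to be called (placeholder values, not from the commit):

    import config
    config.API_CONFIG(_GOOGLE_API_KEY="<key>", _GOOGLE_SEARCH_ENGINE_ID="<id>")
    config.NLP_config(mode="speed", debug=False)

Note that NLP_config declares global conf_MODE, conf_DEBUG while assigning to CONF_DEBUG and NLP_CONF_MODE, so those assignments appear to create locals rather than update the module-level flags; that mismatch predates this commit and is not addressed by it.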
@@ -5,7 +5,7 @@ build-backend = "poetry.core.masonry.api"

 [tool.poetry]
 name = "internet_ml"
-version = "0.1.2"
+version = "0.1.3"
 description = "Internet-ML: Allowing ML to connect to the internet"
 readme = "./.github/README.md"
 authors = ["Thamognya Kodi <contact@thamognya.com>"]
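The version bump from 0.1.2 to 0.1.3 matches what Poetry's own helper produces (one plausible way to generate it; the commit does not record the command used):

    poetry version patch  # 0.1.2 -> 0.1.3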