updates to code via mypy

main
Thamognya Kodi 2022-12-27 13:38:47 +07:00
parent 16aa8cea4a
commit 0ab358d6ce
7 changed files with 187 additions and 61 deletions

View File

@@ -1,17 +1,36 @@
-from typing import Any
+from typing import Any, List, Tuple
+import logging
 import sys
 from pathlib import Path
 from transformers import pipeline
+logging.basicConfig(
+    filename="QA.log",
+    filemode="w",
+    level=logging.INFO,
+    format="%(name)s - %(levelname)s - %(message)s",
+)
 sys.path.append(str(Path(__file__).parent.parent.parent) + "/tools/NLP/data")
+sys.path.append(str(Path(__file__).parent.parent.parent) + "/utils")
+import config
 import internet
-QA_MODEL = pipeline("question-answering")
+QA_MODEL: Any = pipeline("question-answering")
-def answer(query: str) -> Any:
+def answer(query: str) -> tuple[Any, list[str]]:
     global QA_MODEL
-    results = internet.google(query)
-    return (QA_MODEL(question=query, context=str(results[0])), results[1])
+    results: tuple[list[str], list[str]] = internet.google(query)
+    answer: tuple[Any, list[str]] = (
+        QA_MODEL(question=query, context=str(results[0])),
+        results[1],
+    )
+    if config.CONF_DEBUG:
+        logging.info(f"Answer: {answer}")
+    return answer
+# def custom_answer
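
For context, a minimal driver for the new answer() signature might look like the sketch below; the module name "QA" and the config path wiring are assumptions for illustration, not part of this commit.

    # Hypothetical usage of answer(); "QA" as the import name is an assumption.
    import config
    import QA

    config.API_CONFIG(
        _GOOGLE_API_KEY="YOUR_API_KEY",              # placeholder
        _GOOGLE_SEARCH_ENGINE_ID="YOUR_ENGINE_ID",   # placeholder
    )

    # answer() now returns (pipeline output, source urls); the transformers
    # question-answering pipeline yields a dict with "answer"/"score"/"start"/"end"
    prediction, sources = QA.answer("Who was the first person on the Moon?")
    print(prediction["answer"], sources)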

View File

@@ -1,5 +1,6 @@
 from typing import Any, List, Tuple
+import logging
 import os
 import sys
 from pathlib import Path
@@ -7,6 +8,13 @@ from pathlib import Path
 import dotenv
 import requests
+logging.basicConfig(
+    filename="internet.log",
+    filemode="w",
+    level=logging.INFO,
+    format="%(name)s - %(levelname)s - %(message)s",
+)
 sys.path.append(str(Path(__file__).parent.parent.parent.parent) + "/utils/NLP")
 sys.path.append(str(Path(__file__).parent.parent.parent.parent) + "/utils")
 sys.path.append(str(Path(__file__).parent.parent))
@@ -19,20 +27,26 @@ import re
 import aiohttp
 import config
 from bs4 import BeautifulSoup
-from is_relevant import filter_irrelevant
 from normalize import normalizer
+from relevancy import filter_irrelevant
 from sentencize import sentencizer
 from urlextract import URLExtract
 dotenv.load_dotenv()
 HTTP_USERAGENT: dict[str, str] = {
     "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
 }
 def google_urls(query: str, links: list[str]) -> list[str]:
+    try:
         # Send the request to the Google Search API
+        if config.GOOGLE_API_KEY == "":
+            exit("ERROR: Google API Key not found")
+        if config.GOOGLE_SEARCH_ENGINE_ID == "":
+            exit("ERROR: Google Search Engine Id not found")
         response = requests.get(
             "https://www.googleapis.com/customsearch/v1",
             params={
@@ -44,26 +58,45 @@ def google_urls(query: str, links: list[str]) -> list[str]:
         results = response.json()["items"]
         # Print the search results
         num_of_res: int = (
-            5 if config.CONF_MODE == "speed" else (20 if config.CONF_MODE else 10)
+            5
+            if config.NLP_CONF_MODE == "speed"
+            else (20 if config.NLP_CONF_MODE else 10)
         )
         for result in results:
             links.append(result["link"])
             if len(links) == num_of_res:
                 break
+        if config.CONF_DEBUG:
+            logging.info(f"Links: {links}")
         return links
+    except Exception:
+        if config.CONF_DEBUG:
+            logging.info(f"Error: {Exception}")
+        exit(
+            f"There is an unknown exception: {Exception}. Since no links are scraped, nothing further can continue. Please report it at https://github.com/thamognya/internet_ml/issues or mail me at contact@thamognya.com"
+        )
-async def fetch_url(session, url, question):
+async def fetch_url(session: Any, url: str, question: Any) -> list[str]:
+    try:
         async with session.get(url, headers=HTTP_USERAGENT) as response:
             html = await response.text()
             soup = BeautifulSoup(html, "html.parser")
             text = soup.get_text()
             normalized_text = normalizer(text)
-            sentences = sentencizer(normalized_text)
+            sentences: list[str] = sentencizer(normalized_text)
+            if config.CONF_DEBUG:
+                logging.info(f"Sentences: {sentences}")
             return sentences
+    except aiohttp.ClientConnectorError:
+        if config.CONF_DEBUG:
+            logging.info(f"ClientConnector Error: Likely a connection issue with wifi")
+        return [""]
+    except Exception:
+        return [""]
-async def fetch_urls(urls, question):
+async def fetch_urls(urls: list[str], question: str) -> Any:
     async with aiohttp.ClientSession() as session:
         tasks = [asyncio.create_task(fetch_url(session, url, question)) for url in urls]
         results = await asyncio.gather(*tasks)
@@ -74,7 +107,7 @@ def flatten(a: list[list[Any]]) -> list[Any]:
     return list(itertools.chain(*a))
-def get_url_contents(urls, question):
+def get_url_contents(urls: list[str], question: str) -> list[str]:
     loop = asyncio.new_event_loop()
     asyncio.set_event_loop(loop)
     contents = loop.run_until_complete(fetch_urls(urls, question))
@@ -82,15 +115,54 @@ def get_url_contents(urls, question):
     return flatten(contents)
-URL_EXTRACTOR = URLExtract()
+URL_EXTRACTOR: URLExtract = URLExtract()
 def google(query: str) -> tuple[list[str], list[str]]:
     global URL_EXTRACTOR
+    # Hard coded exceptions - START
     if "Thamognya" in query or "thamognya" in query:
         return (["The smartest person in the world"], ["I decided it"])
+    if "modi" in query or "Modi" in query:
+        return (
+            ["Prime Minister of India"],
+            [
+                "https://www.narendramodi.in/",
+                "https://en.wikipedia.org/wiki/Narendra_Modi",
+                "https://twitter.com/narendramodi?ref_src=twsrc%5Egoogle%7Ctwcamp%5Eserp%7Ctwgr%5Eauthor",
+                "https://www.instagram.com/narendramodi/?hl=en",
+                "https://www.facebook.com/narendramodi/",
+                "http://www.pmindia.gov.in/en/",
+                "https://timesofindia.indiatimes.com/topic/Narendra-Modi",
+                "https://www.britannica.com/biography/Narendra-Modi",
+                "https://indianexpress.com/article/india/zelenskky-dials-pm-modi-wishes-new-delhi-successful-g20-presidency-8345365/",
+                "https://economictimes.indiatimes.com/news/narendra-modi",
+            ],
+        )
+    # Hard coded exceptions - END
     links_in_text: list[str] = URL_EXTRACTOR.find_urls(query)
     query = re.sub(r"\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*", "", query)
     urls = google_urls(query, links_in_text)
     content = get_url_contents(urls, query)
+    if config.CONF_DEBUG:
+        logging.info(f"Urls: {urls}")
+        logging.info(f"Content: {content}")
     return (content, urls)
+"""
+Timing:
+import time
+start_time = time.time()
+google("Who is Elon Musk")
+print("--- %s seconds ---" % (time.time() - start_time))
+# Results:
+# --- 2.2230100631713867 seconds ---
+# ________________________________________________________
+# Executed in    4.73 secs      fish           external
+#    usr time    3.35 secs     85.00 micros    3.35 secs
+#    sys time    1.86 secs    956.00 micros    1.86 secs
+"""

View File

@@ -67,7 +67,7 @@ def normalizer(text: str) -> str:
         .replace(" ", " ")
     )
     text = remove_non_ascii(text)
-    if config.NLP_CONF_DEBUG:
+    if config.CONF_DEBUG:
         logging.info(text)
     return text
@@ -81,4 +81,6 @@ def normalize_sentences(sentences: list[str]) -> list[str]:
     ):
         if future.result():
             normalized_sentences.append(sentence)
+    if config.CONF_DEBUG:
+        logging.info(f"Normalized Sentences: {normalized_sentences}")
     return normalized_sentences

View File

@@ -1,6 +1,9 @@
 from typing import Any
 import concurrent.futures
+import logging
+import sys
+from pathlib import Path
 import nltk
 import numpy as np
@@ -12,6 +15,18 @@ from nltk.tokenize import word_tokenize
 # from scipy.spatial.distance import jaccard
 from sklearn.feature_extraction.text import TfidfVectorizer
+sys.path.append(str(Path(__file__).parent.parent.parent.parent) + "/utils/NLP")
+sys.path.append(str(Path(__file__).parent.parent.parent.parent) + "/utils")
+import config
+logging.basicConfig(
+    filename="relevancy.log",
+    filemode="w",
+    level=logging.INFO,
+    format="%(name)s - %(levelname)s - %(message)s",
+)
 nltk.download("punkt")
 nltk.download("stopwords")
 nltk.download("wordnet")
@@ -64,12 +79,27 @@ def is_answer(sentence: str, question: str, threshold: float = 0.3) -> bool:
     answer: bool
     if main_verb is None:
         answer = similarity >= threshold
-        return answer
     else:
         answer = main_verb in sentence_tokens and similarity >= threshold
+    if config.CONF_DEBUG:
+        logging.info(
+            f"Is Relevant -> Sentence: {sentence}, Question: {question} -> Relevancy: {answer}"
+        )
     return answer
+def filter_irrelevant(sentences: list[str], question: str) -> list[str]:
+    # Create a list to store the relevant sentences
+    relevant_sentences = []
+    for sentence in sentences:
+        if is_answer(sentence, question):
+            relevant_sentences.append(sentence)
+            print(sentence)
+    if config.CONF_DEBUG:
+        logging.info(f"Relevant Sentences: {relevant_sentences}")
+    return relevant_sentences
 # # Test the is_answer function
 # sentence = "Neil Armstrong was the first person to walk on the Moon."
 # question = "Who was the first person to walk on the Moon?"
@@ -81,15 +111,15 @@ def is_answer(sentence: str, question: str, threshold: float = 0.3) -> bool:
 # from concurrent.futures import ThreadPoolExecutor
 # import concurrent.futures
 """
-def filter_irrelevant(sentences: list[str], question: str) -> list[str]:
-    # Create a list to store the relevant sentences
-    relevant_sentences = []
-    for sentence in sentences:
-        if is_answer(sentence, question):
-            relevant_sentences.append(sentence)
-            print(sentence)
-    return relevant_sentences
-# print(filter_irrelevant_(["Neil Armstrong is an American Astronaut", "Neil Armstrong is dead", "Neil Armstrng is fake"], "Who is Neil Armstrong?"))
+print(
+    filter_irrelevant(
+        [
+            "Neil Armstrong is an American Astronaut",
+            "Neil Armstrong is dead",
+            "Neil Armstrng is fake",
+        ],
+        "Who is Neil Armstrong?",
+    )
+)
 """
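
is_answer()'s scoring body sits outside these hunks; judging from the TfidfVectorizer import, the similarity term is presumably a TF-IDF cosine along the lines of this sketch (an assumption, not the committed implementation):

    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.metrics.pairwise import cosine_similarity

    def tfidf_similarity(sentence: str, question: str) -> float:
        # vectorize both strings in one shared vocabulary, then compare the rows
        vectors = TfidfVectorizer().fit_transform([sentence, question])
        return float(cosine_similarity(vectors[0], vectors[1])[0][0])

    # e.g. compared against threshold=0.3 as in is_answer()
    print(tfidf_similarity("Neil Armstrong walked on the Moon.",
                           "Who walked on the Moon?"))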

View File

@@ -1,4 +1,4 @@
-from typing import List
+from typing import Any, List
 import logging
@@ -26,7 +26,7 @@ try:
 except LookupError:
     nltk.download("words")
-ENGLISH_WORDS = set(nltk.corpus.words.words())
+ENGLISH_WORDS: Any = set(nltk.corpus.words.words())
 def convert_to_english(text: str) -> str:
@@ -54,7 +54,7 @@ def sentencizer(text: str) -> list[str]:
     for future in concurrent.futures.as_completed(futures):
         english_sentences.append(future.result())
-    if config.NLP_CONF_DEBUG:
+    if config.CONF_DEBUG:
         logging.info(f"sentences: {english_sentences}")
     return english_sentences
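
A quick usage sketch of sentencizer() as typed here (sample text is illustrative; sys.path wiring omitted):

    from sentencize import sentencizer

    # splits the text into sentences, passing each through convert_to_english,
    # and logs the result when config.CONF_DEBUG is on
    for sentence in sentencizer("One small step for man. Un pas de géant."):
        print(sentence)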

View File

@@ -6,12 +6,11 @@ logging.basicConfig(
     level=logging.INFO,
     format="%(name)s - %(levelname)s - %(message)s",
 )
+# General
+CONF_DEBUG: bool = True
+# Google
 GOOGLE_API_KEY: str = ""
 GOOGLE_SEARCH_ENGINE_ID: str = ""
-# Global
-NLP_CONF_DEBUG: bool = True
 # NLP
 NLP_CONF_MODE: str = "default"
@@ -20,13 +19,17 @@ def API_CONFIG(_GOOGLE_API_KEY: str = "", _GOOGLE_SEARCH_ENGINE_ID: str = "") ->
     global GOOGLE_SEARCH_ENGINE_ID, GOOGLE_API_KEY
     GOOGLE_API_KEY = _GOOGLE_API_KEY
     GOOGLE_SEARCH_ENGINE_ID = _GOOGLE_SEARCH_ENGINE_ID
+    if CONF_DEBUG and _GOOGLE_API_KEY != "":
+        logging.info(f"API_KEY set")
+    if CONF_DEBUG and _GOOGLE_SEARCH_ENGINE_ID != "":
+        logging.info(f"SEARCH_ENGINE_ID set")
 def NLP_config(mode: str = "default", debug: bool = True) -> None:
     global conf_MODE, conf_DEBUG
-    NLP_CONF_DEBUG = debug
+    CONF_DEBUG = debug
     if mode == "accuracy" or mode == "speed":
         NLP_CONF_MODE = mode
     else:
-        if NLP_CONF_DEBUG:
+        if CONF_DEBUG:
             logging.warn(f"mode: {mode} does not exist")
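
A short sketch of driving these hooks, assuming config.py is importable directly (credentials are placeholders). Note that NLP_config() declares global conf_MODE, conf_DEBUG, which does not match the CONF_DEBUG / NLP_CONF_MODE names it assigns, so as committed those assignments bind locals only:

    import config

    config.API_CONFIG(
        _GOOGLE_API_KEY="YOUR_API_KEY",
        _GOOGLE_SEARCH_ENGINE_ID="YOUR_ENGINE_ID",
    )
    config.NLP_config(mode="speed", debug=True)
    # prints the module-level defaults unless the global declaration is corrected
    print(config.NLP_CONF_MODE, config.CONF_DEBUG)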

View File

@@ -5,7 +5,7 @@ build-backend = "poetry.core.masonry.api"
 [tool.poetry]
 name = "internet_ml"
-version = "0.1.2"
+version = "0.1.3"
 description = "Internet-ML: Allowing ML to connect to the internet"
 readme = "./.github/README.md"
 authors = ["Thamognya Kodi <contact@thamognya.com>"]