From 0ab358d6ce8ce546653b6f220117a3199ff3ca42 Mon Sep 17 00:00:00 2001
From: Thamognya Kodi
Date: Tue, 27 Dec 2022 13:38:47 +0700
Subject: [PATCH] updates to code via mypy

---
 internet_ml/NLP/no_context/QA.py         |  29 +++-
 internet_ml/tools/NLP/data/internet.py   | 134 ++++++++++++++----
 internet_ml/tools/NLP/normalize.py       |   4 +-
 .../NLP/{is_relevant.py => relevancy.py} |  58 ++++++--
 internet_ml/tools/NLP/sentencize.py      |   6 +-
 internet_ml/utils/config.py              |  15 +-
 pyproject.toml                           |   2 +-
 7 files changed, 187 insertions(+), 61 deletions(-)
 rename internet_ml/tools/NLP/{is_relevant.py => relevancy.py} (79%)

diff --git a/internet_ml/NLP/no_context/QA.py b/internet_ml/NLP/no_context/QA.py
index 02a8c07..ee8e82e 100644
--- a/internet_ml/NLP/no_context/QA.py
+++ b/internet_ml/NLP/no_context/QA.py
@@ -1,17 +1,36 @@
-from typing import Any
+from typing import Any, List, Tuple
 
+import logging
 import sys
 from pathlib import Path
 
 from transformers import pipeline
 
+logging.basicConfig(
+    filename="QA.log",
+    filemode="w",
+    level=logging.INFO,
+    format="%(name)s - %(levelname)s - %(message)s",
+)
+
 sys.path.append(str(Path(__file__).parent.parent.parent) + "/tools/NLP/data")
+sys.path.append(str(Path(__file__).parent.parent.parent) + "/utils")
 
+import config
 import internet
 
-QA_MODEL = pipeline("question-answering")
+QA_MODEL: Any = pipeline("question-answering")
 
 
-def answer(query: str) -> Any:
+def answer(query: str) -> tuple[Any, list[str]]:
     global QA_MODEL
-    results = internet.google(query)
-    return (QA_MODEL(question=query, context=str(results[0])), results[1])
+    results: tuple[list[str], list[str]] = internet.google(query)
+    answer: tuple[Any, list[str]] = (
+        QA_MODEL(question=query, context=str(results[0])),
+        results[1],
+    )
+    if config.CONF_DEBUG:
+        logging.info(f"Answer: {answer}")
+    return answer
+
+
+# def custom_answer
diff --git a/internet_ml/tools/NLP/data/internet.py b/internet_ml/tools/NLP/data/internet.py
index 8a67b7c..957530d 100644
--- a/internet_ml/tools/NLP/data/internet.py
+++ b/internet_ml/tools/NLP/data/internet.py
@@ -1,5 +1,6 @@
 from typing import Any, List, Tuple
 
+import logging
 import os
 import sys
 from pathlib import Path
@@ -7,6 +8,13 @@ from pathlib import Path
 import dotenv
 import requests
 
+logging.basicConfig(
+    filename="internet.log",
+    filemode="w",
+    level=logging.INFO,
+    format="%(name)s - %(levelname)s - %(message)s",
+)
+
 sys.path.append(str(Path(__file__).parent.parent.parent.parent) + "/utils/NLP")
 sys.path.append(str(Path(__file__).parent.parent.parent.parent) + "/utils")
 sys.path.append(str(Path(__file__).parent.parent))
@@ -19,51 +27,76 @@ import re
 import aiohttp
 import config
 from bs4 import BeautifulSoup
-from is_relevant import filter_irrelevant
 from normalize import normalizer
+from relevancy import filter_irrelevant
 from sentencize import sentencizer
 from urlextract import URLExtract
 
 dotenv.load_dotenv()
 
+
 HTTP_USERAGENT: dict[str, str] = {
     "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
 }
 
 
 def google_urls(query: str, links: list[str]) -> list[str]:
-    # Send the request to the Google Search API
-    response = requests.get(
-        "https://www.googleapis.com/customsearch/v1",
-        params={
-            "key": config.GOOGLE_API_KEY,
-            "q": query,
-            "cx": config.GOOGLE_SEARCH_ENGINE_ID,
-        },
-    )
-    results = response.json()["items"]
-    # Print the search results
-    num_of_res: int = (
-        5 if config.CONF_MODE == "speed" else (20 if config.CONF_MODE else 10)
-    )
-    for result in results:
links.append(result["link"]) - if len(links) == num_of_res: - break - return links + try: + # Send the request to the Google Search API + if config.GOOGLE_API_KEY == "": + exit("ERROR: Google API Key not found") + if config.GOOGLE_SEARCH_ENGINE_ID == "": + exit("ERROR: Google Search Engine Id not found") + response = requests.get( + "https://www.googleapis.com/customsearch/v1", + params={ + "key": config.GOOGLE_API_KEY, + "q": query, + "cx": config.GOOGLE_SEARCH_ENGINE_ID, + }, + ) + results = response.json()["items"] + # Print the search results + num_of_res: int = ( + 5 + if config.NLP_CONF_MODE == "speed" + else (20 if config.NLP_CONF_MODE else 10) + ) + for result in results: + links.append(result["link"]) + if len(links) == num_of_res: + break + if config.CONF_DEBUG: + logging.info(f"Links: {links}") + return links + except Exception: + if config.CONF_DEBUG: + logging.info(f"Error: {Exception}") + exit( + f"There is an unknown excpetion: {Exception}. Since no links are scraped, nothing futher can continue. Please report it at https://github.com/thamognya/internet_ml/issues or mail me at contact@thamognya.com" + ) -async def fetch_url(session, url, question): - async with session.get(url, headers=HTTP_USERAGENT) as response: - html = await response.text() - soup = BeautifulSoup(html, "html.parser") - text = soup.get_text() - normalized_text = normalizer(text) - sentences = sentencizer(normalized_text) - return sentences +async def fetch_url(session: Any, url: str, question: Any) -> list[str]: + try: + async with session.get(url, headers=HTTP_USERAGENT) as response: + html = await response.text() + soup = BeautifulSoup(html, "html.parser") + text = soup.get_text() + normalized_text = normalizer(text) + sentences: list[str] = sentencizer(normalized_text) + if config.CONF_DEBUG: + logging.info(f"Sentences: {sentences}") + return sentences + except aiohttp.ClientConnectorError: + if config.CONF_DEBUG: + logging.info(f"ClientConnector Error: Likely a connection issue with wifi") + return [""] + except Exception: + return [""] -async def fetch_urls(urls, question): +async def fetch_urls(urls: list[str], question: str) -> Any: async with aiohttp.ClientSession() as session: tasks = [asyncio.create_task(fetch_url(session, url, question)) for url in urls] results = await asyncio.gather(*tasks) @@ -74,7 +107,7 @@ def flatten(a: list[list[Any]]) -> list[Any]: return list(itertools.chain(*a)) -def get_url_contents(urls, question): +def get_url_contents(urls: list[str], question: str) -> list[str]: loop = asyncio.new_event_loop() asyncio.set_event_loop(loop) contents = loop.run_until_complete(fetch_urls(urls, question)) @@ -82,15 +115,54 @@ def get_url_contents(urls, question): return flatten(contents) -URL_EXTRACTOR = URLExtract() +URL_EXTRACTOR: URLExtract = URLExtract() def google(query: str) -> tuple[list[str], list[str]]: global URL_EXTRACTOR + # Hard coded exceptions - START if "Thamognya" in query or "thamognya" in query: return (["The smartest person in the world"], ["I decided it"]) + if "modi" in query or "Modi" in query: + return ( + ["Prime Minister of India"], + [ + "https://www.narendramodi.in/", + "https://en.wikipedia.org/wiki/Narendra_Modi", + "https://twitter.com/narendramodi?ref_src=twsrc%5Egoogle%7Ctwcamp%5Eserp%7Ctwgr%5Eauthor", + "https://www.instagram.com/narendramodi/?hl=en", + "https://www.facebook.com/narendramodi/", + "http://www.pmindia.gov.in/en/", + "https://timesofindia.indiatimes.com/topic/Narendra-Modi", + "https://www.britannica.com/biography/Narendra-Modi", + 
"https://indianexpress.com/article/india/zelenskky-dials-pm-modi-wishes-new-delhi-successful-g20-presidency-8345365/", + "https://economictimes.indiatimes.com/news/narendra-modi", + ], + ) + # Hard coded exceptions - END links_in_text: list[str] = URL_EXTRACTOR.find_urls(query) query = re.sub(r"\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*", "", query) urls = google_urls(query, links_in_text) content = get_url_contents(urls, query) + if config.CONF_DEBUG: + logging.info(f"Urls: {urls}") + logging.info(f"Content: {content}") return (content, urls) + + +""" +Timing: +import time +start_time = time.time() +google("Who is Elon Musk") +print("--- %s seconds ---" % (time.time() - start_time)) + +# Results: + +# --- 2.2230100631713867 seconds --- + +# ________________________________________________________ +# Executed in 4.73 secs fish external +# usr time 3.35 secs 85.00 micros 3.35 secs +# sys time 1.86 secs 956.00 micros 1.86 secs +""" diff --git a/internet_ml/tools/NLP/normalize.py b/internet_ml/tools/NLP/normalize.py index 1e216f9..3b6857d 100644 --- a/internet_ml/tools/NLP/normalize.py +++ b/internet_ml/tools/NLP/normalize.py @@ -67,7 +67,7 @@ def normalizer(text: str) -> str: .replace(" ", " ") ) text = remove_non_ascii(text) - if config.NLP_CONF_DEBUG: + if config.CONF_DEBUG: logging.info(text) return text @@ -81,4 +81,6 @@ def normalize_sentences(sentences: list[str]) -> list[str]: ): if future.result(): normalized_sentences.append(sentence) + if config.CONF_DEBUG: + logging.info(f"Normalized Sentences: {normalize_sentences}") return normalized_sentences diff --git a/internet_ml/tools/NLP/is_relevant.py b/internet_ml/tools/NLP/relevancy.py similarity index 79% rename from internet_ml/tools/NLP/is_relevant.py rename to internet_ml/tools/NLP/relevancy.py index a2e315d..82fc9c9 100644 --- a/internet_ml/tools/NLP/is_relevant.py +++ b/internet_ml/tools/NLP/relevancy.py @@ -1,6 +1,9 @@ from typing import Any import concurrent.futures +import logging +import sys +from pathlib import Path import nltk import numpy as np @@ -12,6 +15,18 @@ from nltk.tokenize import word_tokenize # from scipy.spatial.distance import jaccard from sklearn.feature_extraction.text import TfidfVectorizer +sys.path.append(str(Path(__file__).parent.parent.parent.parent) + "/utils/NLP") +sys.path.append(str(Path(__file__).parent.parent.parent.parent) + "/utils") + +import config + +logging.basicConfig( + filename="relevancy.log", + filemode="w", + level=logging.INFO, + format="%(name)s - %(levelname)s - %(message)s", +) + nltk.download("punkt") nltk.download("stopwords") nltk.download("wordnet") @@ -64,10 +79,25 @@ def is_answer(sentence: str, question: str, threshold: float = 0.3) -> bool: answer: bool if main_verb is None: answer = similarity >= threshold - return answer else: answer = main_verb in sentence_tokens and similarity >= threshold - return answer + if config.CONF_DEBUG: + logging.info( + f"Is Relevant -> Sentence: {sentence}, Question: {question} -> Relevancy: {answer}" + ) + return answer + + +def filter_irrelevant(sentences: list[str], question: str) -> list[str]: + # Create a list to store the relevant sentences + relevant_sentences = [] + for sentence in sentences: + if is_answer(sentence, question): + relevant_sentences.append(sentence) + print(sentence) + if config.CONF_DEBUG: + logging.info(f"Relevant Sentences: {relevant_sentences}") + return relevant_sentences # # Test the is_answer function @@ -81,15 +111,15 @@ def is_answer(sentence: str, question: str, threshold: float = 0.3) -> bool: # from 
 # from concurrent.futures import ThreadPoolExecutor
 # import concurrent.futures
-
-def filter_irrelevant(sentences: list[str], question: str) -> list[str]:
-    # Create a list to store the relevant sentences
-    relevant_sentences = []
-    for sentence in sentences:
-        if is_answer(sentence, question):
-            relevant_sentences.append(sentence)
-            print(sentence)
-    return relevant_sentences
-
-
-# print(filter_irrelevant_(["Neil Armstrong is an American Astronaut", "Neil Armstrong is dead", "Neil Armstrng is fake"], "Who is Neil Armstrong?"))
+"""
+print(
+    filter_irrelevant(
+        [
+            "Neil Armstrong is an American Astronaut",
+            "Neil Armstrong is dead",
+            "Neil Armstrng is fake",
+        ],
+        "Who is Neil Armstrong?",
+    )
+)
+"""
diff --git a/internet_ml/tools/NLP/sentencize.py b/internet_ml/tools/NLP/sentencize.py
index 93f0487..67236d8 100644
--- a/internet_ml/tools/NLP/sentencize.py
+++ b/internet_ml/tools/NLP/sentencize.py
@@ -1,4 +1,4 @@
-from typing import List
+from typing import Any, List
 
 import logging
 
@@ -26,7 +26,7 @@ try:
 except LookupError:
     nltk.download("words")
 
-ENGLISH_WORDS = set(nltk.corpus.words.words())
+ENGLISH_WORDS: Any = set(nltk.corpus.words.words())
 
 
 def convert_to_english(text: str) -> str:
@@ -54,7 +54,7 @@ def sentencizer(text: str) -> list[str]:
         for future in concurrent.futures.as_completed(futures):
             english_sentences.append(future.result())
 
-    if config.NLP_CONF_DEBUG:
+    if config.CONF_DEBUG:
         logging.info(f"sentences: {english_sentences}")
     return english_sentences
diff --git a/internet_ml/utils/config.py b/internet_ml/utils/config.py
index 1a0dced..9054bab 100644
--- a/internet_ml/utils/config.py
+++ b/internet_ml/utils/config.py
@@ -6,12 +6,11 @@ logging.basicConfig(
     level=logging.INFO,
     format="%(name)s - %(levelname)s - %(message)s",
 )
-
+# General
+CONF_DEBUG: bool = True
+# Google
 GOOGLE_API_KEY: str = ""
 GOOGLE_SEARCH_ENGINE_ID: str = ""
-
-# Global
-NLP_CONF_DEBUG: bool = True
 # NLP
 NLP_CONF_MODE: str = "default"
 
@@ -20,13 +19,17 @@ def API_CONFIG(_GOOGLE_API_KEY: str = "", _GOOGLE_SEARCH_ENGINE_ID: str = "") ->
     global GOOGLE_SEARCH_ENGINE_ID, GOOGLE_API_KEY
     GOOGLE_API_KEY = _GOOGLE_API_KEY
     GOOGLE_SEARCH_ENGINE_ID = _GOOGLE_SEARCH_ENGINE_ID
+    if CONF_DEBUG and _GOOGLE_API_KEY != "":
+        logging.info("API_KEY set")
+    if CONF_DEBUG and _GOOGLE_SEARCH_ENGINE_ID != "":
+        logging.info("SEARCH_ENGINE_ID set")
 
 
 def NLP_config(mode: str = "default", debug: bool = True) -> None:
-    global conf_MODE, conf_DEBUG
-    NLP_CONF_DEBUG = debug
+    global NLP_CONF_MODE, CONF_DEBUG
+    CONF_DEBUG = debug
     if mode == "accuracy" or mode == "speed":
         NLP_CONF_MODE = mode
     else:
-        if NLP_CONF_DEBUG:
+        if CONF_DEBUG:
             logging.warn(f"mode: {mode} does not exist")
diff --git a/pyproject.toml b/pyproject.toml
index 557cd68..8109918 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -5,7 +5,7 @@ build-backend = "poetry.core.masonry.api"
 
 [tool.poetry]
 name = "internet_ml"
-version = "0.1.2"
+version = "0.1.3"
 description = "Internet-ML: Allowing ML to connect to the internet"
 readme = "./.github/README.md"
 authors = ["Thamognya Kodi"]
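
For reference, a minimal usage sketch of the patched API follows; it is not part of the diff above. It assumes internet_ml and its dependencies (transformers, aiohttp, nltk, scikit-learn, etc.) are installed, that the internet_ml/ package directory sits next to the script, and that you have a valid Google Custom Search API key and search engine ID. The file name usage_example.py and the placeholder credentials are hypothetical.

# usage_example.py -- illustrative sketch only, not part of this patch
import sys
from pathlib import Path

# Mirror the sys.path style used inside the package so that `config` and `QA`
# resolve to the same top-level module names the package itself imports.
ROOT = Path(__file__).parent / "internet_ml"
sys.path.append(str(ROOT / "utils"))
sys.path.append(str(ROOT / "NLP" / "no_context"))

import config  # internet_ml/utils/config.py
import QA      # internet_ml/NLP/no_context/QA.py

# Placeholder credentials; replace with real Google Custom Search values.
config.API_CONFIG("YOUR_GOOGLE_API_KEY", "YOUR_SEARCH_ENGINE_ID")

# After this patch, answer() returns a (model output, list of source URLs) tuple.
prediction, sources = QA.answer("Who is Neil Armstrong?")
print(prediction)
print(sources)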