From 0ab358d6ce8ce546653b6f220117a3199ff3ca42 Mon Sep 17 00:00:00 2001
From: Thamognya Kodi
Date: Tue, 27 Dec 2022 13:38:47 +0700
Subject: [PATCH] updates to code via mypy

---
 internet_ml/NLP/no_context/QA.py         |  29 +++-
 internet_ml/tools/NLP/data/internet.py   | 134 ++++++++++++++----
 internet_ml/tools/NLP/normalize.py       |   4 +-
 .../NLP/{is_relevant.py => relevancy.py} |  58 ++++++--
 internet_ml/tools/NLP/sentencize.py      |   6 +-
 internet_ml/utils/config.py              |  15 +-
 pyproject.toml                           |   2 +-
 7 files changed, 187 insertions(+), 61 deletions(-)
 rename internet_ml/tools/NLP/{is_relevant.py => relevancy.py} (79%)

diff --git a/internet_ml/NLP/no_context/QA.py b/internet_ml/NLP/no_context/QA.py
index 02a8c07..ee8e82e 100644
--- a/internet_ml/NLP/no_context/QA.py
+++ b/internet_ml/NLP/no_context/QA.py
@@ -1,17 +1,36 @@
-from typing import Any
+from typing import Any, List, Tuple
 
+import logging
 import sys
 from pathlib import Path
 
 from transformers import pipeline
 
+logging.basicConfig(
+    filename="QA.log",
+    filemode="w",
+    level=logging.INFO,
+    format="%(name)s - %(levelname)s - %(message)s",
+)
+
 sys.path.append(str(Path(__file__).parent.parent.parent) + "/tools/NLP/data")
+sys.path.append(str(Path(__file__).parent.parent.parent) + "/utils")
 
+import config
 import internet
 
-QA_MODEL = pipeline("question-answering")
+QA_MODEL: Any = pipeline("question-answering")
 
 
-def answer(query: str) -> Any:
+def answer(query: str) -> tuple[Any, list[str]]:
     global QA_MODEL
-    results = internet.google(query)
-    return (QA_MODEL(question=query, context=str(results[0])), results[1])
+    results: tuple[list[str], list[str]] = internet.google(query)
+    answer: tuple[Any, list[str]] = (
+        QA_MODEL(question=query, context=str(results[0])),
+        results[1],
+    )
+    if config.CONF_DEBUG:
+        logging.info(f"Answer: {answer}")
+    return answer
+
+
+# def custom_answer
diff --git a/internet_ml/tools/NLP/data/internet.py b/internet_ml/tools/NLP/data/internet.py
index 8a67b7c..957530d 100644
--- a/internet_ml/tools/NLP/data/internet.py
+++ b/internet_ml/tools/NLP/data/internet.py
@@ -1,5 +1,6 @@
 from typing import Any, List, Tuple
 
+import logging
 import os
 import sys
 from pathlib import Path
@@ -7,6 +8,13 @@ from pathlib import Path
 import dotenv
 import requests
 
+logging.basicConfig(
+    filename="internet.log",
+    filemode="w",
+    level=logging.INFO,
+    format="%(name)s - %(levelname)s - %(message)s",
+)
+
 sys.path.append(str(Path(__file__).parent.parent.parent.parent) + "/utils/NLP")
 sys.path.append(str(Path(__file__).parent.parent.parent.parent) + "/utils")
 sys.path.append(str(Path(__file__).parent.parent))
@@ -19,51 +27,76 @@ import re
 import aiohttp
 import config
 from bs4 import BeautifulSoup
-from is_relevant import filter_irrelevant
 from normalize import normalizer
+from relevancy import filter_irrelevant
 from sentencize import sentencizer
 from urlextract import URLExtract
 
 dotenv.load_dotenv()
 
+
 HTTP_USERAGENT: dict[str, str] = {
     "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
 }
 
 
 def google_urls(query: str, links: list[str]) -> list[str]:
-    # Send the request to the Google Search API
-    response = requests.get(
-        "https://www.googleapis.com/customsearch/v1",
-        params={
-            "key": config.GOOGLE_API_KEY,
-            "q": query,
-            "cx": config.GOOGLE_SEARCH_ENGINE_ID,
-        },
-    )
-    results = response.json()["items"]
-    # Print the search results
-    num_of_res: int = (
-        5 if config.CONF_MODE == "speed" else (20 if config.CONF_MODE else 10)
-    )
-    for result in results:
links.append(result["link"]) - if len(links) == num_of_res: - break - return links + try: + # Send the request to the Google Search API + if config.GOOGLE_API_KEY == "": + exit("ERROR: Google API Key not found") + if config.GOOGLE_SEARCH_ENGINE_ID == "": + exit("ERROR: Google Search Engine Id not found") + response = requests.get( + "https://www.googleapis.com/customsearch/v1", + params={ + "key": config.GOOGLE_API_KEY, + "q": query, + "cx": config.GOOGLE_SEARCH_ENGINE_ID, + }, + ) + results = response.json()["items"] + # Print the search results + num_of_res: int = ( + 5 + if config.NLP_CONF_MODE == "speed" + else (20 if config.NLP_CONF_MODE else 10) + ) + for result in results: + links.append(result["link"]) + if len(links) == num_of_res: + break + if config.CONF_DEBUG: + logging.info(f"Links: {links}") + return links + except Exception: + if config.CONF_DEBUG: + logging.info(f"Error: {Exception}") + exit( + f"There is an unknown excpetion: {Exception}. Since no links are scraped, nothing futher can continue. Please report it at https://github.com/thamognya/internet_ml/issues or mail me at contact@thamognya.com" + ) -async def fetch_url(session, url, question): - async with session.get(url, headers=HTTP_USERAGENT) as response: - html = await response.text() - soup = BeautifulSoup(html, "html.parser") - text = soup.get_text() - normalized_text = normalizer(text) - sentences = sentencizer(normalized_text) - return sentences +async def fetch_url(session: Any, url: str, question: Any) -> list[str]: + try: + async with session.get(url, headers=HTTP_USERAGENT) as response: + html = await response.text() + soup = BeautifulSoup(html, "html.parser") + text = soup.get_text() + normalized_text = normalizer(text) + sentences: list[str] = sentencizer(normalized_text) + if config.CONF_DEBUG: + logging.info(f"Sentences: {sentences}") + return sentences + except aiohttp.ClientConnectorError: + if config.CONF_DEBUG: + logging.info(f"ClientConnector Error: Likely a connection issue with wifi") + return [""] + except Exception: + return [""] -async def fetch_urls(urls, question): +async def fetch_urls(urls: list[str], question: str) -> Any: async with aiohttp.ClientSession() as session: tasks = [asyncio.create_task(fetch_url(session, url, question)) for url in urls] results = await asyncio.gather(*tasks) @@ -74,7 +107,7 @@ def flatten(a: list[list[Any]]) -> list[Any]: return list(itertools.chain(*a)) -def get_url_contents(urls, question): +def get_url_contents(urls: list[str], question: str) -> list[str]: loop = asyncio.new_event_loop() asyncio.set_event_loop(loop) contents = loop.run_until_complete(fetch_urls(urls, question)) @@ -82,15 +115,54 @@ def get_url_contents(urls, question): return flatten(contents) -URL_EXTRACTOR = URLExtract() +URL_EXTRACTOR: URLExtract = URLExtract() def google(query: str) -> tuple[list[str], list[str]]: global URL_EXTRACTOR + # Hard coded exceptions - START if "Thamognya" in query or "thamognya" in query: return (["The smartest person in the world"], ["I decided it"]) + if "modi" in query or "Modi" in query: + return ( + ["Prime Minister of India"], + [ + "https://www.narendramodi.in/", + "https://en.wikipedia.org/wiki/Narendra_Modi", + "https://twitter.com/narendramodi?ref_src=twsrc%5Egoogle%7Ctwcamp%5Eserp%7Ctwgr%5Eauthor", + "https://www.instagram.com/narendramodi/?hl=en", + "https://www.facebook.com/narendramodi/", + "http://www.pmindia.gov.in/en/", + "https://timesofindia.indiatimes.com/topic/Narendra-Modi", + "https://www.britannica.com/biography/Narendra-Modi", + 
"https://indianexpress.com/article/india/zelenskky-dials-pm-modi-wishes-new-delhi-successful-g20-presidency-8345365/", + "https://economictimes.indiatimes.com/news/narendra-modi", + ], + ) + # Hard coded exceptions - END links_in_text: list[str] = URL_EXTRACTOR.find_urls(query) query = re.sub(r"\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*", "", query) urls = google_urls(query, links_in_text) content = get_url_contents(urls, query) + if config.CONF_DEBUG: + logging.info(f"Urls: {urls}") + logging.info(f"Content: {content}") return (content, urls) + + +""" +Timing: +import time +start_time = time.time() +google("Who is Elon Musk") +print("--- %s seconds ---" % (time.time() - start_time)) + +# Results: + +# --- 2.2230100631713867 seconds --- + +# ________________________________________________________ +# Executed in 4.73 secs fish external +# usr time 3.35 secs 85.00 micros 3.35 secs +# sys time 1.86 secs 956.00 micros 1.86 secs +""" diff --git a/internet_ml/tools/NLP/normalize.py b/internet_ml/tools/NLP/normalize.py index 1e216f9..3b6857d 100644 --- a/internet_ml/tools/NLP/normalize.py +++ b/internet_ml/tools/NLP/normalize.py @@ -67,7 +67,7 @@ def normalizer(text: str) -> str: .replace(" ", " ") ) text = remove_non_ascii(text) - if config.NLP_CONF_DEBUG: + if config.CONF_DEBUG: logging.info(text) return text @@ -81,4 +81,6 @@ def normalize_sentences(sentences: list[str]) -> list[str]: ): if future.result(): normalized_sentences.append(sentence) + if config.CONF_DEBUG: + logging.info(f"Normalized Sentences: {normalize_sentences}") return normalized_sentences diff --git a/internet_ml/tools/NLP/is_relevant.py b/internet_ml/tools/NLP/relevancy.py similarity index 79% rename from internet_ml/tools/NLP/is_relevant.py rename to internet_ml/tools/NLP/relevancy.py index a2e315d..82fc9c9 100644 --- a/internet_ml/tools/NLP/is_relevant.py +++ b/internet_ml/tools/NLP/relevancy.py @@ -1,6 +1,9 @@ from typing import Any import concurrent.futures +import logging +import sys +from pathlib import Path import nltk import numpy as np @@ -12,6 +15,18 @@ from nltk.tokenize import word_tokenize # from scipy.spatial.distance import jaccard from sklearn.feature_extraction.text import TfidfVectorizer +sys.path.append(str(Path(__file__).parent.parent.parent.parent) + "/utils/NLP") +sys.path.append(str(Path(__file__).parent.parent.parent.parent) + "/utils") + +import config + +logging.basicConfig( + filename="relevancy.log", + filemode="w", + level=logging.INFO, + format="%(name)s - %(levelname)s - %(message)s", +) + nltk.download("punkt") nltk.download("stopwords") nltk.download("wordnet") @@ -64,10 +79,25 @@ def is_answer(sentence: str, question: str, threshold: float = 0.3) -> bool: answer: bool if main_verb is None: answer = similarity >= threshold - return answer else: answer = main_verb in sentence_tokens and similarity >= threshold - return answer + if config.CONF_DEBUG: + logging.info( + f"Is Relevant -> Sentence: {sentence}, Question: {question} -> Relevancy: {answer}" + ) + return answer + + +def filter_irrelevant(sentences: list[str], question: str) -> list[str]: + # Create a list to store the relevant sentences + relevant_sentences = [] + for sentence in sentences: + if is_answer(sentence, question): + relevant_sentences.append(sentence) + print(sentence) + if config.CONF_DEBUG: + logging.info(f"Relevant Sentences: {relevant_sentences}") + return relevant_sentences # # Test the is_answer function @@ -81,15 +111,15 @@ def is_answer(sentence: str, question: str, threshold: float = 0.3) -> bool: # from 
 # from concurrent.futures import ThreadPoolExecutor
 # import concurrent.futures
-
-def filter_irrelevant(sentences: list[str], question: str) -> list[str]:
-    # Create a list to store the relevant sentences
-    relevant_sentences = []
-    for sentence in sentences:
-        if is_answer(sentence, question):
-            relevant_sentences.append(sentence)
-            print(sentence)
-    return relevant_sentences
-
-
-# print(filter_irrelevant_(["Neil Armstrong is an American Astronaut", "Neil Armstrong is dead", "Neil Armstrng is fake"], "Who is Neil Armstrong?"))
+"""
+print(
+    filter_irrelevant(
+        [
+            "Neil Armstrong is an American Astronaut",
+            "Neil Armstrong is dead",
+            "Neil Armstrng is fake",
+        ],
+        "Who is Neil Armstrong?",
+    )
+)
+"""
diff --git a/internet_ml/tools/NLP/sentencize.py b/internet_ml/tools/NLP/sentencize.py
index 93f0487..67236d8 100644
--- a/internet_ml/tools/NLP/sentencize.py
+++ b/internet_ml/tools/NLP/sentencize.py
@@ -1,4 +1,4 @@
-from typing import List
+from typing import Any, List
 
 import logging
 
@@ -26,7 +26,7 @@ try:
 except LookupError:
     nltk.download("words")
 
-ENGLISH_WORDS = set(nltk.corpus.words.words())
+ENGLISH_WORDS: Any = set(nltk.corpus.words.words())
 
 
 def convert_to_english(text: str) -> str:
@@ -54,7 +54,7 @@ def sentencizer(text: str) -> list[str]:
         for future in concurrent.futures.as_completed(futures):
             english_sentences.append(future.result())
 
-    if config.NLP_CONF_DEBUG:
+    if config.CONF_DEBUG:
         logging.info(f"sentences: {english_sentences}")
     return english_sentences
diff --git a/internet_ml/utils/config.py b/internet_ml/utils/config.py
index 1a0dced..9054bab 100644
--- a/internet_ml/utils/config.py
+++ b/internet_ml/utils/config.py
@@ -6,12 +6,11 @@ logging.basicConfig(
     level=logging.INFO,
     format="%(name)s - %(levelname)s - %(message)s",
 )
-
+# General
+CONF_DEBUG: bool = True
+# Google
 GOOGLE_API_KEY: str = ""
 GOOGLE_SEARCH_ENGINE_ID: str = ""
-
-# Global
-NLP_CONF_DEBUG: bool = True
 # NLP
 NLP_CONF_MODE: str = "default"
 
@@ -20,13 +19,17 @@ def API_CONFIG(_GOOGLE_API_KEY: str = "", _GOOGLE_SEARCH_ENGINE_ID: str = "") ->
     global GOOGLE_SEARCH_ENGINE_ID, GOOGLE_API_KEY
     GOOGLE_API_KEY = _GOOGLE_API_KEY
     GOOGLE_SEARCH_ENGINE_ID = _GOOGLE_SEARCH_ENGINE_ID
+    if CONF_DEBUG and _GOOGLE_API_KEY != "":
+        logging.info("API_KEY set")
+    if CONF_DEBUG and _GOOGLE_SEARCH_ENGINE_ID != "":
+        logging.info("SEARCH_ENGINE_ID set")
 
 
 def NLP_config(mode: str = "default", debug: bool = True) -> None:
-    global conf_MODE, conf_DEBUG
-    NLP_CONF_DEBUG = debug
+    global NLP_CONF_MODE, CONF_DEBUG
+    CONF_DEBUG = debug
     if mode == "accuracy" or mode == "speed":
         NLP_CONF_MODE = mode
     else:
-        if NLP_CONF_DEBUG:
+        if CONF_DEBUG:
             logging.warn(f"mode: {mode} does not exist")
diff --git a/pyproject.toml b/pyproject.toml
index 557cd68..8109918 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -5,7 +5,7 @@ build-backend = "poetry.core.masonry.api"
 
 [tool.poetry]
 name = "internet_ml"
-version = "0.1.2"
+version = "0.1.3"
 description = "Internet-ML: Allowing ML to connect to the internet"
 readme = "./.github/README.md"
 authors = ["Thamognya Kodi"]
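
For reference, a minimal usage sketch of the patched API follows; it is not part of the diff above. It assumes internet_ml and its dependencies (transformers, aiohttp, nltk, scikit-learn, etc.) are installed, that the internet_ml/ package directory sits next to the script, and that you have a valid Google Custom Search API key and search engine ID. The file name usage_example.py and the placeholder credentials are hypothetical.

# usage_example.py -- illustrative sketch only, not part of this patch
import sys
from pathlib import Path

# Mirror the sys.path style used inside the package so that `config` and `QA`
# resolve to the same top-level module names the package itself imports.
ROOT = Path(__file__).parent / "internet_ml"
sys.path.append(str(ROOT / "utils"))
sys.path.append(str(ROOT / "NLP" / "no_context"))

import config  # internet_ml/utils/config.py
import QA      # internet_ml/NLP/no_context/QA.py

# Placeholder credentials; replace with real Google Custom Search values.
config.API_CONFIG("YOUR_GOOGLE_API_KEY", "YOUR_SEARCH_ENGINE_ID")

# After this patch, answer() returns a (model output, list of source URLs) tuple.
prediction, sources = QA.answer("Who is Neil Armstrong?")
print(prediction)
print(sources)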