diff --git a/internet_ml/NLP/no_context/QA.py b/internet_ml/NLP/no_context/QA.py
index 3a5d1c0..fef033e 100644
--- a/internet_ml/NLP/no_context/QA.py
+++ b/internet_ml/NLP/no_context/QA.py
@@ -21,15 +21,9 @@ import internet
 QA_MODEL: Any = pipeline("question-answering")
 
 
-def answer(
-    query: str,
-    GOOGLE_API_KEY: str = config.GOOGLE_API_KEY,
-    GOOGLE_SEARCH_ENGINE_ID: str = config.GOOGLE_SEARCH_ENGINE_ID,
-) -> tuple[Any, list[str]]:
+def answer(query: str) -> tuple[Any, list[str]]:
     global QA_MODEL
-    results: tuple[list[str], list[str]] = internet.google(
-        query, GOOGLE_API_KEY, GOOGLE_SEARCH_ENGINE_ID
-    )
+    results: tuple[list[str], list[str]] = internet.google(query)
     answer: tuple[Any, list[str]] = (
         QA_MODEL(question=query, context=str(results[0])),
         results[1],
@@ -39,4 +33,6 @@ def answer(
     return answer
 
 
+print(answer("Who is Rishi Sunak"))
+
 # def custom_answer
diff --git a/internet_ml/tools/NLP/data/internet.py b/internet_ml/tools/NLP/data/internet.py
index 02875d2..cb2f314 100644
--- a/internet_ml/tools/NLP/data/internet.py
+++ b/internet_ml/tools/NLP/data/internet.py
@@ -41,124 +41,130 @@ HTTP_USERAGENT: dict[str, str] = {
 }
 
 
-def google_urls(
-    query: str, links: list[str], GOOGLE_API_KEY: str, GOOGLE_SEARCH_ENGINE_ID: str
-) -> list[str]:
-    try:
-        # Send the request to the Google Search API
-        if GOOGLE_API_KEY == "":
-            exit("ERROR: Google API Key not found")
-        if GOOGLE_SEARCH_ENGINE_ID == "":
-            exit("ERROR: Google Search Engine Id not found")
-        response = requests.get(
-            "https://www.googleapis.com/customsearch/v1",
-            params={
-                "key": config.GOOGLE_API_KEY,
-                "q": query,
-                "cx": config.GOOGLE_SEARCH_ENGINE_ID,
-            },
-        )
-        results = response.json()["items"]
-        # Print the search results
-        num_of_res: int = (
+class Google:
+    def __init__(self: Any, query: str) -> None:
+        self.GOOGLE_SEARCH_API_KEY: str = ""
+        self.GOOGLE_SEARCH_ENGINE_ID: str = ""
+        self.__num_res: int = (
             5
             if config.NLP_CONF_MODE == "speed"
-            else (20 if config.NLP_CONF_MODE else 10)
+            else (20 if config.NLP_CONF_MODE == "accuracy" else 10)
         )
+        self.__query = query
+        self.__URL_EXTRACTOR: URLExtract = URLExtract()
+        self.__urls: list[str] = self.__URL_EXTRACTOR.find_urls(query)
+        self.__query = re.sub(
+            r"\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*", "", self.__query
+        )
+
+    @property
+    def google_search_api_key(self: Any) -> str:
+        val: str = self.GOOGLE_SEARCH_API_KEY
+        return val
+
+    @google_search_api_key.setter
+    def google_search_api_key(self: Any, val: str) -> None:
+        self.GOOGLE_SEARCH_API_KEY = val
+
+    @property
+    def google_search_engine_id(self: Any) -> str:
+        val: str = self.GOOGLE_SEARCH_ENGINE_ID
+        return val
+
+    @google_search_engine_id.setter
+    def google_search_engine_id(self: Any, val: str) -> None:
+        self.GOOGLE_SEARCH_ENGINE_ID = val
+
+    def __get_urls(self: Any) -> None:
+        # Send the request to the Google Search API
+        if self.GOOGLE_SEARCH_API_KEY == "":
+            exit("ERROR: Google API Key not found")
+        if self.GOOGLE_SEARCH_ENGINE_ID == "":
+            exit("ERROR: Google Search Engine ID not found")
+        response = requests.get(
+            "https://www.googleapis.com/customsearch/v1",
+            params={
+                "key": self.GOOGLE_SEARCH_API_KEY,
+                "q": self.__query,
+                "cx": self.GOOGLE_SEARCH_ENGINE_ID,
+            },
+        )
+        results = response.json()["items"]
         for result in results:
-            links.append(result["link"])
-            if len(links) == num_of_res:
+            self.__urls.append(result["link"])
+            if len(self.__urls) == self.__num_res:
                 break
         if config.CONF_DEBUG:
-            logging.info(f"Links: {links}")
-        return links
-    except Exception:
-        if config.CONF_DEBUG:
-            logging.info(f"Error: {Exception}")
-        exit(
-            f"There is an unknown excpetion: {Exception}. Since no links are scraped, nothing futher can continue. Please report it at https://github.com/thamognya/internet_ml/issues or mail me at contact@thamognya.com"
-        )
+            logging.info(f"Links: {self.__urls}")
 
-
-async def fetch_url(session: Any, url: str, question: Any) -> list[str]:
-    try:
-        async with session.get(url, headers=HTTP_USERAGENT) as response:
-            html = await response.text()
-            soup = BeautifulSoup(html, "html.parser")
-            text = soup.get_text()
-            normalized_text = normalizer(text)
-            sentences: list[str] = sentencizer(normalized_text)
+    async def __fetch_url(self: Any, session: Any, url: str) -> list[str]:
+        try:
+            async with session.get(url, headers=HTTP_USERAGENT) as response:
+                html = await response.text()
+                soup = BeautifulSoup(html, "html.parser")
+                text = soup.get_text()
+                normalized_text = normalizer(text)
+                sentences: list[str] = sentencizer(normalized_text)
+                if config.CONF_DEBUG:
+                    logging.info(f"Sentences: {sentences}")
+                return sentences
+        except aiohttp.ClientConnectorError:
             if config.CONF_DEBUG:
-                logging.info(f"Sentences: {sentences}")
-            return sentences
-    except aiohttp.ClientConnectorError:
-        if config.CONF_DEBUG:
-            logging.info(f"ClientConnector Error: Likely a connection issue with wifi")
-        return [""]
-    except Exception:
-        return [""]
+                logging.info(
+                    "ClientConnector Error: Likely a connection issue with wifi"
+                )
+            return [""]
+        except Exception:
+            return [""]
+
+    async def __fetch_urls(self: Any, urls: list[str]) -> Any:
+        async with aiohttp.ClientSession() as session:
+            tasks = [
+                asyncio.create_task(self.__fetch_url(session, url)) for url in urls
+            ]
+            results = await asyncio.gather(*tasks)
+            return results
+
+    def __flatten(self: Any, a: list[list[Any]]) -> list[Any]:
+        return list(itertools.chain(*a))
+
+    def __get_urls_contents(self: Any) -> None:
+        loop = asyncio.new_event_loop()
+        asyncio.set_event_loop(loop)
+        contents = loop.run_until_complete(self.__fetch_urls(self.__urls))
+        loop.close()
+        self.__content = self.__flatten(contents)
+
+    def google(self: Any) -> tuple[list[str], list[str]]:
+        # Hard coded exceptions - START
+        if "Thamognya" in self.__query or "thamognya" in self.__query:
+            return (["The smartest person in the world"], ["I decided it"])
+        if "modi" in self.__query or "Modi" in self.__query:
+            return (
+                ["Prime Minister of India"],
+                [
+                    "https://www.narendramodi.in/",
+                    "https://en.wikipedia.org/wiki/Narendra_Modi",
+                    "https://twitter.com/narendramodi?ref_src=twsrc%5Egoogle%7Ctwcamp%5Eserp%7Ctwgr%5Eauthor",
+                    "https://www.instagram.com/narendramodi/?hl=en",
+                    "https://www.facebook.com/narendramodi/",
+                    "http://www.pmindia.gov.in/en/",
+                    "https://timesofindia.indiatimes.com/topic/Narendra-Modi",
+                    "https://www.britannica.com/biography/Narendra-Modi",
+                    "https://indianexpress.com/article/india/zelenskky-dials-pm-modi-wishes-new-delhi-successful-g20-presidency-8345365/",
+                    "https://economictimes.indiatimes.com/news/narendra-modi",
+                ],
+            )
+        self.__get_urls()
+        self.__get_urls_contents()
+        return (self.__content, self.__urls)
 
 
-async def fetch_urls(urls: list[str], question: str) -> Any:
-    async with aiohttp.ClientSession() as session:
-        tasks = [asyncio.create_task(fetch_url(session, url, question)) for url in urls]
-        results = await asyncio.gather(*tasks)
-        return results
-
-
-def flatten(a: list[list[Any]]) -> list[Any]:
-    return list(itertools.chain(*a))
-
-
-def get_url_contents(urls: list[str], question: str) -> list[str]:
-    loop = asyncio.new_event_loop()
-    asyncio.set_event_loop(loop)
-    contents = loop.run_until_complete(fetch_urls(urls, question))
-    loop.close()
-    return flatten(contents)
-
-
-URL_EXTRACTOR: URLExtract = URLExtract()
-
-
-def google(
-    query: str, API_KEY: str, SEARCH_ENGINE_ID: str
-) -> tuple[list[str], list[str]]:
-    reload(config)
-    global URL_EXTRACTOR
-    # Hard coded exceptions - START
-    if "Thamognya" in query or "thamognya" in query:
-        return (["The smartest person in the world"], ["I decided it"])
-    if "modi" in query or "Modi" in query:
-        return (
-            ["Prime Minister of India"],
-            [
-                "https://www.narendramodi.in/",
-                "https://en.wikipedia.org/wiki/Narendra_Modi",
-                "https://twitter.com/narendramodi?ref_src=twsrc%5Egoogle%7Ctwcamp%5Eserp%7Ctwgr%5Eauthor",
-                "https://www.instagram.com/narendramodi/?hl=en",
-                "https://www.facebook.com/narendramodi/",
-                "http://www.pmindia.gov.in/en/",
-                "https://timesofindia.indiatimes.com/topic/Narendra-Modi",
-                "https://www.britannica.com/biography/Narendra-Modi",
-                "https://indianexpress.com/article/india/zelenskky-dials-pm-modi-wishes-new-delhi-successful-g20-presidency-8345365/",
-                "https://economictimes.indiatimes.com/news/narendra-modi",
-            ],
-        )
-    # Hard coded exceptions - END
-    links_in_text: list[str] = URL_EXTRACTOR.find_urls(query)
-    query = re.sub(r"\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*", "", query)
-    urls = google_urls(
-        query,
-        links_in_text,
-        GOOGLE_API_KEY=API_KEY,
-        GOOGLE_SEARCH_ENGINE_ID=SEARCH_ENGINE_ID,
-    )
-    content = get_url_contents(urls, query)
-    if config.CONF_DEBUG:
-        logging.info(f"Urls: {urls}")
-        logging.info(f"Content: {content}")
-    return (content, urls)
+def google(query: str) -> tuple[list[str], list[str]]:
+    _google = Google(query)
+    _google.google_search_api_key = config.GET_GOOGLE_API_CONFIG()[0]
+    _google.google_search_engine_id = config.GET_GOOGLE_API_CONFIG()[1]
+    return _google.google()
 
 
 """
diff --git a/internet_ml/tools/NLP/relevancy.py b/internet_ml/tools/NLP/relevancy.py
index 82fc9c9..120470e 100644
--- a/internet_ml/tools/NLP/relevancy.py
+++ b/internet_ml/tools/NLP/relevancy.py
@@ -27,10 +27,10 @@ logging.basicConfig(
     format="%(name)s - %(levelname)s - %(message)s",
 )
 
-nltk.download("punkt")
-nltk.download("stopwords")
-nltk.download("wordnet")
-nltk.download("omw-1.4")
+nltk.download("punkt", quiet=True)
+nltk.download("stopwords", quiet=True)
+nltk.download("wordnet", quiet=True)
+nltk.download("omw-1.4", quiet=True)
 
 nlp = spacy.load("en_core_web_sm")  # Load the English language model
 lemmatizer = WordNetLemmatizer()  # Initialize the WordNet lemmatizer
diff --git a/internet_ml/tools/NLP/sentencize.py b/internet_ml/tools/NLP/sentencize.py
index 67236d8..812892a 100644
--- a/internet_ml/tools/NLP/sentencize.py
+++ b/internet_ml/tools/NLP/sentencize.py
@@ -21,10 +21,7 @@ import concurrent.futures
 import config
 import nltk
 
-try:
-    nltk.data.find("words")
-except LookupError:
-    nltk.download("words")
+nltk.download("words", quiet=True)
 
 
 ENGLISH_WORDS: Any = set(nltk.corpus.words.words())
diff --git a/internet_ml/utils/config.py b/internet_ml/utils/config.py
index 9054bab..7971d97 100644
--- a/internet_ml/utils/config.py
+++ b/internet_ml/utils/config.py
@@ -1,3 +1,5 @@
+from typing import Tuple
+
 import logging
 
 logging.basicConfig(
@@ -15,7 +17,7 @@ GOOGLE_SEARCH_ENGINE_ID: str = ""
 NLP_CONF_MODE: str = "default"
 
 
-def API_CONFIG(_GOOGLE_API_KEY: str = "", _GOOGLE_SEARCH_ENGINE_ID: str = "") -> None:
+def GOOGLE_API_CONFIG(_GOOGLE_API_KEY: str, _GOOGLE_SEARCH_ENGINE_ID: str) -> None:
     global GOOGLE_SEARCH_ENGINE_ID, GOOGLE_API_KEY
     GOOGLE_API_KEY = _GOOGLE_API_KEY
     GOOGLE_SEARCH_ENGINE_ID = _GOOGLE_SEARCH_ENGINE_ID
@@ -25,8 +27,13 @@ def API_CONFIG(_GOOGLE_API_KEY: str = "", _GOOGLE_SEARCH_ENGINE_ID: str = "") ->
         logging.info(f"SEARCH_ENGINE_ID set")
 
 
+def GET_GOOGLE_API_CONFIG() -> Tuple[str, str]:
+    global GOOGLE_SEARCH_ENGINE_ID, GOOGLE_API_KEY
+    return (GOOGLE_API_KEY, GOOGLE_SEARCH_ENGINE_ID)
+
+
 def NLP_config(mode: str = "default", debug: bool = True) -> None:
-    global conf_MODE, conf_DEBUG
+    global NLP_CONF_MODE, CONF_DEBUG
     CONF_DEBUG = debug
     if mode == "accuracy" or mode == "speed":
         NLP_CONF_MODE = mode
diff --git a/pyproject.toml b/pyproject.toml
index 0aae1b8..e3d611b 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -5,7 +5,7 @@ build-backend = "poetry.core.masonry.api"
 
 [tool.poetry]
 name = "internet_ml"
-version = "0.1.7"
+version = "0.2.0"
 description = "Internet-ML: Allowing ML to connect to the internet"
 readme = "./.github/README.md"
 authors = ["Thamognya Kodi <contact@thamognya.com>"]
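
Usage sketch: after this patch, the Google Custom Search credentials are configured once through config and answer() takes only the query. The snippet below is a minimal illustration of the new call flow; the module paths and the placeholder key strings are assumptions (the package manipulates sys.path internally, so in practice the imports may be plain `import config` and `import QA`).

from internet_ml.utils import config          # assumed import path
from internet_ml.NLP.no_context import QA     # assumed import path

# Store the credentials in config's module-level globals; internet.google()
# reads them back through config.GET_GOOGLE_API_CONFIG().
config.GOOGLE_API_CONFIG("YOUR_GOOGLE_API_KEY", "YOUR_SEARCH_ENGINE_ID")

# answer() now takes only the query and returns a tuple of
# (question-answering pipeline output, list of source URLs).
prediction, sources = QA.answer("Who is Rishi Sunak")
print(prediction["answer"], sources)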