update: look at todo

main
Thamognya Kodi 2022-12-27 19:19:01 +07:00
parent c766fe1c8a
commit 960219f412
6 changed files with 132 additions and 126 deletions

View File

@@ -21,15 +21,9 @@ import internet
 QA_MODEL: Any = pipeline("question-answering")
 
 
-def answer(
-    query: str,
-    GOOGLE_API_KEY: str = config.GOOGLE_API_KEY,
-    GOOGLE_SEARCH_ENGINE_ID: str = config.GOOGLE_SEARCH_ENGINE_ID,
-) -> tuple[Any, list[str]]:
+def answer(query: str) -> tuple[Any, list[str]]:
     global QA_MODEL
-    results: tuple[list[str], list[str]] = internet.google(
-        query, GOOGLE_API_KEY, GOOGLE_SEARCH_ENGINE_ID
-    )
+    results: tuple[list[str], list[str]] = internet.google(query)
     answer: tuple[Any, list[str]] = (
         QA_MODEL(question=query, context=str(results[0])),
         results[1],
@@ -39,4 +33,6 @@ def answer(
     return answer
 
 
+print(answer("Who is Rishi Sunack"))
 
 # def custom_answer
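With this change, answer() no longer threads the Google credentials through its parameters; internet.google() now reads them from config at call time. A minimal usage sketch, assuming the two modules import as QA and config (the import paths are assumptions, not shown in this commit):

    import config
    import QA  # hypothetical import path for the file above

    # Register credentials once, then query.
    config.GOOGLE_API_CONFIG("YOUR_GOOGLE_API_KEY", "YOUR_SEARCH_ENGINE_ID")
    result, sources = QA.answer("Who is Rishi Sunack")
    print(result)   # the QA pipeline's answer dict (answer, score, start, end)
    print(sources)  # URLs the context was scraped from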

View File

@@ -41,46 +41,63 @@ HTTP_USERAGENT: dict[str, str] = {
 }
 
 
-def google_urls(
-    query: str, links: list[str], GOOGLE_API_KEY: str, GOOGLE_SEARCH_ENGINE_ID: str
-) -> list[str]:
-    try:
-        # Send the request to the Google Search API
-        if GOOGLE_API_KEY == "":
-            exit("ERROR: Google API Key not found")
-        if GOOGLE_SEARCH_ENGINE_ID == "":
-            exit("ERROR: Google Search Engine Id not found")
-        response = requests.get(
-            "https://www.googleapis.com/customsearch/v1",
-            params={
-                "key": config.GOOGLE_API_KEY,
-                "q": query,
-                "cx": config.GOOGLE_SEARCH_ENGINE_ID,
-            },
-        )
-        results = response.json()["items"]
-        # Print the search results
-        num_of_res: int = (
+class Google:
+    def __init__(self: Any, query: str) -> None:
+        self.GOOGLE_SEARCH_API_KEY: str = ""
+        self.GOOGLE_SEARCH_ENGINE_ID: str = ""
+        self.__num_res: int = (
             5
             if config.NLP_CONF_MODE == "speed"
             else (20 if config.NLP_CONF_MODE else 10)
         )
-        for result in results:
-            links.append(result["link"])
-            if len(links) == num_of_res:
-                break
-        if config.CONF_DEBUG:
-            logging.info(f"Links: {links}")
-        return links
-    except Exception:
-        if config.CONF_DEBUG:
-            logging.info(f"Error: {Exception}")
-        exit(
-            f"There is an unknown excpetion: {Exception}. Since no links are scraped, nothing futher can continue. Please report it at https://github.com/thamognya/internet_ml/issues or mail me at contact@thamognya.com"
+        self.__query = query
+        self.__URL_EXTRACTOR: URLExtract = URLExtract()
+        self.__urls: list[str] = self.__URL_EXTRACTOR.find_urls(query)
+        self.__query = re.sub(
+            r"\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*", "", self.__query
         )
 
+    @property
+    def google_search_api_key(self: Any) -> str:
+        val: str = self.GOOGLE_SEARCH_API_KEY
+        return val
+
-async def fetch_url(session: Any, url: str, question: Any) -> list[str]:
+    @google_search_api_key.setter
+    def google_search_api_key(self: Any, val: str) -> None:
+        self.GOOGLE_SEARCH_API_KEY = val
+
+    @property
+    def google_search_engine_id(self: Any) -> str:
+        val: str = self.GOOGLE_SEARCH_ENGINE_ID
+        return val
+
+    @google_search_engine_id.setter
+    def google_search_engine_id(self: Any, val: str) -> None:
+        self.GOOGLE_SEARCH_ENGINE_ID = val
+
+    def __get_urls(self: Any) -> None:
+        # Send the request to the Google Search API
+        if self.GOOGLE_SEARCH_API_KEY == "":
+            exit("ERROR: Google API Key not found")
+        if self.GOOGLE_SEARCH_ENGINE_ID == "":
+            exit("ERROR: Google Search Engine Id not found")
+        response = requests.get(
+            "https://www.googleapis.com/customsearch/v1",
+            params={
+                "key": self.GOOGLE_SEARCH_API_KEY,
+                "q": self.__query,
+                "cx": self.GOOGLE_SEARCH_ENGINE_ID,
+            },
+        )
+        results = response.json()["items"]
+        for result in results:
+            self.__urls.append(result["link"])
+            if len(self.__urls) == self.__num_res:
+                break
+        if config.CONF_DEBUG:
+            logging.info(f"Links: {self.__urls}")
+
+    async def __fetch_url(self: Any, session: Any, url: str) -> list[str]:
         try:
             async with session.get(url, headers=HTTP_USERAGENT) as response:
                 html = await response.text()
@@ -93,43 +110,36 @@ async def fetch_url(session: Any, url: str, question: Any) -> list[str]:
             return sentences
         except aiohttp.ClientConnectorError:
             if config.CONF_DEBUG:
-                logging.info(f"ClientConnector Error: Likely a connection issue with wifi")
+                logging.info(
+                    f"ClientConnector Error: Likely a connection issue with wifi"
+                )
             return [""]
         except Exception:
             return [""]
 
-async def fetch_urls(urls: list[str], question: str) -> Any:
+    async def __fetch_urls(self: Any, urls: list[str]) -> Any:
         async with aiohttp.ClientSession() as session:
-            tasks = [asyncio.create_task(fetch_url(session, url, question)) for url in urls]
+            tasks = [
+                asyncio.create_task(self.__fetch_url(session, url)) for url in urls
+            ]
             results = await asyncio.gather(*tasks)
             return results
 
-def flatten(a: list[list[Any]]) -> list[Any]:
+    def __flatten(self: Any, a: list[list[Any]]) -> list[Any]:
         return list(itertools.chain(*a))
 
-def get_url_contents(urls: list[str], question: str) -> list[str]:
+    def __get_urls_contents(self: Any) -> None:
         loop = asyncio.new_event_loop()
         asyncio.set_event_loop(loop)
-        contents = loop.run_until_complete(fetch_urls(urls, question))
+        contents = loop.run_until_complete(self.__fetch_urls(self.__urls))
         loop.close()
-        return flatten(contents)
+        self.__content = self.__flatten(contents)
 
-URL_EXTRACTOR: URLExtract = URLExtract()
-
-def google(
-    query: str, API_KEY: str, SEARCH_ENGINE_ID: str
-) -> tuple[list[str], list[str]]:
-    reload(config)
-    global URL_EXTRACTOR
+    def google(self: Any) -> tuple[list[str], list[str]]:
         # Hard coded exceptions - START
-    if "Thamognya" in query or "thamognya" in query:
+        if "Thamognya" in self.__query or "thamognya" in self.__query:
             return (["The smartest person in the world"], ["I decided it"])
-    if "modi" in query or "Modi" in query:
+        if "modi" in self.__query or "Modi" in self.__query:
             return (
                 ["Prime Minister of India"],
                 [
@@ -145,20 +155,16 @@ def google(
                     "https://economictimes.indiatimes.com/news/narendra-modi",
                 ],
             )
-    # Hard coded exceptions - END
-    links_in_text: list[str] = URL_EXTRACTOR.find_urls(query)
-    query = re.sub(r"\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*", "", query)
-    urls = google_urls(
-        query,
-        links_in_text,
-        GOOGLE_API_KEY=API_KEY,
-        GOOGLE_SEARCH_ENGINE_ID=SEARCH_ENGINE_ID,
-    )
-    content = get_url_contents(urls, query)
-    if config.CONF_DEBUG:
-        logging.info(f"Urls: {urls}")
-        logging.info(f"Content: {content}")
-    return (content, urls)
+        self.__get_urls()
+        self.__get_urls_contents()
+        return (self.__content, self.__urls)
+
+
+def google(query: str) -> tuple[list[str], list[str]]:
+    _google = Google(query)
+    _google.google_search_api_key = config.GET_GOOGLE_API_CONFIG()[0]
+    _google.google_search_engine_id = config.GET_GOOGLE_API_CONFIG()[1]
+    return _google.google()
 """

View File

@@ -27,10 +27,10 @@ logging.basicConfig(
     format="%(name)s - %(levelname)s - %(message)s",
 )
 
-nltk.download("punkt")
-nltk.download("stopwords")
-nltk.download("wordnet")
-nltk.download("omw-1.4")
+nltk.download("punkt", quiet=True)
+nltk.download("stopwords", quiet=True)
+nltk.download("wordnet", quiet=True)
+nltk.download("omw-1.4", quiet=True)
 
 nlp = spacy.load("en_core_web_sm")  # Load the English language model
 lemmatizer = WordNetLemmatizer()  # Initialize the WordNet lemmatizer

View File

@@ -21,10 +21,7 @@ import concurrent.futures
 import config
 import nltk
 
-try:
-    nltk.data.find("words")
-except LookupError:
-    nltk.download("words")
+nltk.download("words", quiet=True)
 
 ENGLISH_WORDS: Any = set(nltk.corpus.words.words())
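The try/except around nltk.data.find() is no longer needed: nltk.download() already skips corpora that are present locally, and quiet=True silences the progress and up-to-date messages that would otherwise print on every import. A minimal sketch of the resulting behaviour:

    import nltk

    # Safe to run on every import: downloads the corpus once,
    # then no-ops (quietly) on subsequent calls.
    nltk.download("words", quiet=True)

    ENGLISH_WORDS = set(nltk.corpus.words.words())
    print("hello" in ENGLISH_WORDS)  # True once the corpus is installed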

View File

@@ -1,3 +1,5 @@
+from typing import List, Tuple
+
 import logging
 
 logging.basicConfig(
@@ -15,7 +17,7 @@ GOOGLE_SEARCH_ENGINE_ID: str = ""
 NLP_CONF_MODE: str = "default"
 
-def API_CONFIG(_GOOGLE_API_KEY: str = "", _GOOGLE_SEARCH_ENGINE_ID: str = "") -> None:
+def GOOGLE_API_CONFIG(_GOOGLE_API_KEY: str, _GOOGLE_SEARCH_ENGINE_ID: str) -> None:
     global GOOGLE_SEARCH_ENGINE_ID, GOOGLE_API_KEY
     GOOGLE_API_KEY = _GOOGLE_API_KEY
     GOOGLE_SEARCH_ENGINE_ID = _GOOGLE_SEARCH_ENGINE_ID
@@ -25,8 +27,13 @@ def API_CONFIG(_GOOGLE_API_KEY: str = "", _GOOGLE_SEARCH_ENGINE_ID: str = "") -> None:
     logging.info(f"SEARCH_ENGINE_ID set")
 
 
+def GET_GOOGLE_API_CONFIG() -> tuple[str, str]:
+    global GOOGLE_SEARCH_ENGINE_ID, GOOGLE_API_KEY
+    return (GOOGLE_API_KEY, GOOGLE_SEARCH_ENGINE_ID)
+
+
 def NLP_config(mode: str = "default", debug: bool = True) -> None:
-    global conf_MODE, conf_DEBUG
+    global NLP_CONF_MODE, CONF_DEBUG
     CONF_DEBUG = debug
     if mode == "accuracy" or mode == "speed":
         NLP_CONF_MODE = mode
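A quick round-trip of the renamed setter and the new getter, as a sanity check (a sketch; assumes config.py imports as config):

    import config

    config.GOOGLE_API_CONFIG("key-123", "engine-456")
    api_key, engine_id = config.GET_GOOGLE_API_CONFIG()
    assert (api_key, engine_id) == ("key-123", "engine-456")

Note that GOOGLE_API_CONFIG no longer defaults its arguments to empty strings, so both values must now be passed explicitly.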

View File

@@ -5,7 +5,7 @@ build-backend = "poetry.core.masonry.api"
 
 [tool.poetry]
 name = "internet_ml"
-version = "0.1.7"
+version = "0.2.0"
 description = "Internet-ML: Allowing ML to connect to the internet"
 readme = "./.github/README.md"
 authors = ["Thamognya Kodi <contact@thamognya.com>"]