update: look at todo
parent
c766fe1c8a
commit
960219f412
|
@ -21,15 +21,9 @@ import internet
|
||||||
QA_MODEL: Any = pipeline("question-answering")
|
QA_MODEL: Any = pipeline("question-answering")
|
||||||
|
|
||||||
|
|
||||||
def answer(
|
def answer(query: str) -> tuple[Any, list[str]]:
|
||||||
query: str,
|
|
||||||
GOOGLE_API_KEY: str = config.GOOGLE_API_KEY,
|
|
||||||
GOOGLE_SEARCH_ENGINE_ID: str = config.GOOGLE_SEARCH_ENGINE_ID,
|
|
||||||
) -> tuple[Any, list[str]]:
|
|
||||||
global QA_MODEL
|
global QA_MODEL
|
||||||
results: tuple[list[str], list[str]] = internet.google(
|
results: tuple[list[str], list[str]] = internet.google(query)
|
||||||
query, GOOGLE_API_KEY, GOOGLE_SEARCH_ENGINE_ID
|
|
||||||
)
|
|
||||||
answer: tuple[Any, list[str]] = (
|
answer: tuple[Any, list[str]] = (
|
||||||
QA_MODEL(question=query, context=str(results[0])),
|
QA_MODEL(question=query, context=str(results[0])),
|
||||||
results[1],
|
results[1],
|
||||||
|
@ -39,4 +33,6 @@ def answer(
|
||||||
return answer
|
return answer
|
||||||
|
|
||||||
|
|
||||||
|
print(answer("Who is Rishi Sunack"))
|
||||||
|
|
||||||
# def custom_answer
|
# def custom_answer
|
||||||
|
|
|
@ -41,124 +41,130 @@ HTTP_USERAGENT: dict[str, str] = {
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
def google_urls(
|
class Google:
|
||||||
query: str, links: list[str], GOOGLE_API_KEY: str, GOOGLE_SEARCH_ENGINE_ID: str
|
def __init__(self: Any, query: str) -> None:
|
||||||
) -> list[str]:
|
self.GOOGLE_SEARCH_API_KEY: str = ""
|
||||||
try:
|
self.GOOGLE_SEARCH_ENGINE_ID: str = ""
|
||||||
# Send the request to the Google Search API
|
self.__num_res: int = (
|
||||||
if GOOGLE_API_KEY == "":
|
|
||||||
exit("ERROR: Google API Key not found")
|
|
||||||
if GOOGLE_SEARCH_ENGINE_ID == "":
|
|
||||||
exit("ERROR: Google Search Engine Id not found")
|
|
||||||
response = requests.get(
|
|
||||||
"https://www.googleapis.com/customsearch/v1",
|
|
||||||
params={
|
|
||||||
"key": config.GOOGLE_API_KEY,
|
|
||||||
"q": query,
|
|
||||||
"cx": config.GOOGLE_SEARCH_ENGINE_ID,
|
|
||||||
},
|
|
||||||
)
|
|
||||||
results = response.json()["items"]
|
|
||||||
# Print the search results
|
|
||||||
num_of_res: int = (
|
|
||||||
5
|
5
|
||||||
if config.NLP_CONF_MODE == "speed"
|
if config.NLP_CONF_MODE == "speed"
|
||||||
else (20 if config.NLP_CONF_MODE else 10)
|
else (20 if config.NLP_CONF_MODE else 10)
|
||||||
)
|
)
|
||||||
|
self.__query = query
|
||||||
|
self.__URL_EXTRACTOR: URLExtract = URLExtract()
|
||||||
|
self.__urls: list[str] = self.__URL_EXTRACTOR.find_urls(query)
|
||||||
|
self.__query = re.sub(
|
||||||
|
r"\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*", "", self.__query
|
||||||
|
)
|
||||||
|
|
||||||
|
@property
|
||||||
|
def google_search_api_key(self: Any) -> str:
|
||||||
|
val: str = self.GOOGLE_SEARCH_API_KEY
|
||||||
|
return val
|
||||||
|
|
||||||
|
@google_search_api_key.setter
|
||||||
|
def google_search_api_key(self: Any, val: str) -> None:
|
||||||
|
self.GOOGLE_SEARCH_API_KEY = val
|
||||||
|
|
||||||
|
@property
|
||||||
|
def google_search_engine_id(self: Any) -> str:
|
||||||
|
val: str = self.GOOGLE_SEARCH_ENGINE_ID
|
||||||
|
return val
|
||||||
|
|
||||||
|
@google_search_engine_id.setter
|
||||||
|
def google_search_engine_id(self: Any, val: str) -> None:
|
||||||
|
self.GOOGLE_SEARCH_ENGINE_ID = val
|
||||||
|
|
||||||
|
def __get_urls(self: Any) -> None:
|
||||||
|
# Send the request to the Google Search API
|
||||||
|
if self.GOOGLE_SEARCH_API_KEY == "":
|
||||||
|
exit("ERROR: Google API Key not found")
|
||||||
|
if self.GOOGLE_SEARCH_ENGINE_ID == "":
|
||||||
|
exit("ERROR: Google Search Engine Id not found")
|
||||||
|
response = requests.get(
|
||||||
|
"https://www.googleapis.com/customsearch/v1",
|
||||||
|
params={
|
||||||
|
"key": self.GOOGLE_SEARCH_API_KEY,
|
||||||
|
"q": self.__query,
|
||||||
|
"cx": self.GOOGLE_SEARCH_ENGINE_ID,
|
||||||
|
},
|
||||||
|
)
|
||||||
|
results = response.json()["items"]
|
||||||
for result in results:
|
for result in results:
|
||||||
links.append(result["link"])
|
self.__urls.append(result["link"])
|
||||||
if len(links) == num_of_res:
|
if len(self.__urls) == self.__num_res:
|
||||||
break
|
break
|
||||||
if config.CONF_DEBUG:
|
if config.CONF_DEBUG:
|
||||||
logging.info(f"Links: {links}")
|
logging.info(f"Links: {self.__urls}")
|
||||||
return links
|
|
||||||
except Exception:
|
|
||||||
if config.CONF_DEBUG:
|
|
||||||
logging.info(f"Error: {Exception}")
|
|
||||||
exit(
|
|
||||||
f"There is an unknown excpetion: {Exception}. Since no links are scraped, nothing futher can continue. Please report it at https://github.com/thamognya/internet_ml/issues or mail me at contact@thamognya.com"
|
|
||||||
)
|
|
||||||
|
|
||||||
|
async def __fetch_url(self: Any, session: Any, url: str) -> list[str]:
|
||||||
async def fetch_url(session: Any, url: str, question: Any) -> list[str]:
|
try:
|
||||||
try:
|
async with session.get(url, headers=HTTP_USERAGENT) as response:
|
||||||
async with session.get(url, headers=HTTP_USERAGENT) as response:
|
html = await response.text()
|
||||||
html = await response.text()
|
soup = BeautifulSoup(html, "html.parser")
|
||||||
soup = BeautifulSoup(html, "html.parser")
|
text = soup.get_text()
|
||||||
text = soup.get_text()
|
normalized_text = normalizer(text)
|
||||||
normalized_text = normalizer(text)
|
sentences: list[str] = sentencizer(normalized_text)
|
||||||
sentences: list[str] = sentencizer(normalized_text)
|
if config.CONF_DEBUG:
|
||||||
|
logging.info(f"Sentences: {sentences}")
|
||||||
|
return sentences
|
||||||
|
except aiohttp.ClientConnectorError:
|
||||||
if config.CONF_DEBUG:
|
if config.CONF_DEBUG:
|
||||||
logging.info(f"Sentences: {sentences}")
|
logging.info(
|
||||||
return sentences
|
f"ClientConnector Error: Likely a connection issue with wifi"
|
||||||
except aiohttp.ClientConnectorError:
|
)
|
||||||
if config.CONF_DEBUG:
|
return [""]
|
||||||
logging.info(f"ClientConnector Error: Likely a connection issue with wifi")
|
except Exception:
|
||||||
return [""]
|
return [""]
|
||||||
except Exception:
|
|
||||||
return [""]
|
async def __fetch_urls(self: Any, urls: list[str]) -> Any:
|
||||||
|
async with aiohttp.ClientSession() as session:
|
||||||
|
tasks = [
|
||||||
|
asyncio.create_task(self.__fetch_url(session, url)) for url in urls
|
||||||
|
]
|
||||||
|
results = await asyncio.gather(*tasks)
|
||||||
|
return results
|
||||||
|
|
||||||
|
def __flatten(self: Any, a: list[list[Any]]) -> list[Any]:
|
||||||
|
return list(itertools.chain(*a))
|
||||||
|
|
||||||
|
def __get_urls_contents(self: Any) -> None:
|
||||||
|
loop = asyncio.new_event_loop()
|
||||||
|
asyncio.set_event_loop(loop)
|
||||||
|
contents = loop.run_until_complete(self.__fetch_urls(self.__urls))
|
||||||
|
loop.close()
|
||||||
|
self.__content = self.__flatten(contents)
|
||||||
|
|
||||||
|
def google(self: Any) -> tuple[list[str], list[str]]:
|
||||||
|
# Hard coded exceptions - START
|
||||||
|
if "Thamognya" in self.__query or "thamognya" in self.__query:
|
||||||
|
return (["The smartest person in the world"], ["I decided it"])
|
||||||
|
if "modi" in self.__query or "Modi" in self.__query:
|
||||||
|
return (
|
||||||
|
["Prime Minister of India"],
|
||||||
|
[
|
||||||
|
"https://www.narendramodi.in/",
|
||||||
|
"https://en.wikipedia.org/wiki/Narendra_Modi",
|
||||||
|
"https://twitter.com/narendramodi?ref_src=twsrc%5Egoogle%7Ctwcamp%5Eserp%7Ctwgr%5Eauthor",
|
||||||
|
"https://www.instagram.com/narendramodi/?hl=en",
|
||||||
|
"https://www.facebook.com/narendramodi/",
|
||||||
|
"http://www.pmindia.gov.in/en/",
|
||||||
|
"https://timesofindia.indiatimes.com/topic/Narendra-Modi",
|
||||||
|
"https://www.britannica.com/biography/Narendra-Modi",
|
||||||
|
"https://indianexpress.com/article/india/zelenskky-dials-pm-modi-wishes-new-delhi-successful-g20-presidency-8345365/",
|
||||||
|
"https://economictimes.indiatimes.com/news/narendra-modi",
|
||||||
|
],
|
||||||
|
)
|
||||||
|
self.__get_urls()
|
||||||
|
self.__get_urls_contents()
|
||||||
|
return (self.__content, self.__urls)
|
||||||
|
|
||||||
|
|
||||||
async def fetch_urls(urls: list[str], question: str) -> Any:
|
def google(query: str) -> tuple[list[str], list[str]]:
|
||||||
async with aiohttp.ClientSession() as session:
|
_google = Google(query)
|
||||||
tasks = [asyncio.create_task(fetch_url(session, url, question)) for url in urls]
|
_google.google_search_api_key = config.GET_GOOGLE_API_CONFIG()[0]
|
||||||
results = await asyncio.gather(*tasks)
|
_google.google_search_engine_id = config.GET_GOOGLE_API_CONFIG()[1]
|
||||||
return results
|
return _google.google()
|
||||||
|
|
||||||
|
|
||||||
def flatten(a: list[list[Any]]) -> list[Any]:
|
|
||||||
return list(itertools.chain(*a))
|
|
||||||
|
|
||||||
|
|
||||||
def get_url_contents(urls: list[str], question: str) -> list[str]:
|
|
||||||
loop = asyncio.new_event_loop()
|
|
||||||
asyncio.set_event_loop(loop)
|
|
||||||
contents = loop.run_until_complete(fetch_urls(urls, question))
|
|
||||||
loop.close()
|
|
||||||
return flatten(contents)
|
|
||||||
|
|
||||||
|
|
||||||
URL_EXTRACTOR: URLExtract = URLExtract()
|
|
||||||
|
|
||||||
|
|
||||||
def google(
|
|
||||||
query: str, API_KEY: str, SEARCH_ENGINE_ID: str
|
|
||||||
) -> tuple[list[str], list[str]]:
|
|
||||||
reload(config)
|
|
||||||
global URL_EXTRACTOR
|
|
||||||
# Hard coded exceptions - START
|
|
||||||
if "Thamognya" in query or "thamognya" in query:
|
|
||||||
return (["The smartest person in the world"], ["I decided it"])
|
|
||||||
if "modi" in query or "Modi" in query:
|
|
||||||
return (
|
|
||||||
["Prime Minister of India"],
|
|
||||||
[
|
|
||||||
"https://www.narendramodi.in/",
|
|
||||||
"https://en.wikipedia.org/wiki/Narendra_Modi",
|
|
||||||
"https://twitter.com/narendramodi?ref_src=twsrc%5Egoogle%7Ctwcamp%5Eserp%7Ctwgr%5Eauthor",
|
|
||||||
"https://www.instagram.com/narendramodi/?hl=en",
|
|
||||||
"https://www.facebook.com/narendramodi/",
|
|
||||||
"http://www.pmindia.gov.in/en/",
|
|
||||||
"https://timesofindia.indiatimes.com/topic/Narendra-Modi",
|
|
||||||
"https://www.britannica.com/biography/Narendra-Modi",
|
|
||||||
"https://indianexpress.com/article/india/zelenskky-dials-pm-modi-wishes-new-delhi-successful-g20-presidency-8345365/",
|
|
||||||
"https://economictimes.indiatimes.com/news/narendra-modi",
|
|
||||||
],
|
|
||||||
)
|
|
||||||
# Hard coded exceptions - END
|
|
||||||
links_in_text: list[str] = URL_EXTRACTOR.find_urls(query)
|
|
||||||
query = re.sub(r"\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*", "", query)
|
|
||||||
urls = google_urls(
|
|
||||||
query,
|
|
||||||
links_in_text,
|
|
||||||
GOOGLE_API_KEY=API_KEY,
|
|
||||||
GOOGLE_SEARCH_ENGINE_ID=SEARCH_ENGINE_ID,
|
|
||||||
)
|
|
||||||
content = get_url_contents(urls, query)
|
|
||||||
if config.CONF_DEBUG:
|
|
||||||
logging.info(f"Urls: {urls}")
|
|
||||||
logging.info(f"Content: {content}")
|
|
||||||
return (content, urls)
|
|
||||||
|
|
||||||
|
|
||||||
"""
|
"""
|
||||||
|
|
|
@ -27,10 +27,10 @@ logging.basicConfig(
|
||||||
format="%(name)s - %(levelname)s - %(message)s",
|
format="%(name)s - %(levelname)s - %(message)s",
|
||||||
)
|
)
|
||||||
|
|
||||||
nltk.download("punkt")
|
nltk.download("punkt", quiet=True)
|
||||||
nltk.download("stopwords")
|
nltk.download("stopwords", quiet=True)
|
||||||
nltk.download("wordnet")
|
nltk.download("wordnet", quiet=True)
|
||||||
nltk.download("omw-1.4")
|
nltk.download("omw-1.4", quiet=True)
|
||||||
|
|
||||||
nlp = spacy.load("en_core_web_sm") # Load the English language model
|
nlp = spacy.load("en_core_web_sm") # Load the English language model
|
||||||
lemmatizer = WordNetLemmatizer() # Initialize the WordNet lemmatizer
|
lemmatizer = WordNetLemmatizer() # Initialize the WordNet lemmatizer
|
||||||
|
|
|
@ -21,10 +21,7 @@ import concurrent.futures
|
||||||
import config
|
import config
|
||||||
import nltk
|
import nltk
|
||||||
|
|
||||||
try:
|
nltk.download("words", quiet=True)
|
||||||
nltk.data.find("words")
|
|
||||||
except LookupError:
|
|
||||||
nltk.download("words")
|
|
||||||
|
|
||||||
ENGLISH_WORDS: Any = set(nltk.corpus.words.words())
|
ENGLISH_WORDS: Any = set(nltk.corpus.words.words())
|
||||||
|
|
||||||
|
|
|
@ -1,3 +1,5 @@
|
||||||
|
from typing import List, Tuple
|
||||||
|
|
||||||
import logging
|
import logging
|
||||||
|
|
||||||
logging.basicConfig(
|
logging.basicConfig(
|
||||||
|
@ -15,7 +17,7 @@ GOOGLE_SEARCH_ENGINE_ID: str = ""
|
||||||
NLP_CONF_MODE: str = "default"
|
NLP_CONF_MODE: str = "default"
|
||||||
|
|
||||||
|
|
||||||
def API_CONFIG(_GOOGLE_API_KEY: str = "", _GOOGLE_SEARCH_ENGINE_ID: str = "") -> None:
|
def GOOGLE_API_CONFIG(_GOOGLE_API_KEY: str, _GOOGLE_SEARCH_ENGINE_ID: str) -> None:
|
||||||
global GOOGLE_SEARCH_ENGINE_ID, GOOGLE_API_KEY
|
global GOOGLE_SEARCH_ENGINE_ID, GOOGLE_API_KEY
|
||||||
GOOGLE_API_KEY = _GOOGLE_API_KEY
|
GOOGLE_API_KEY = _GOOGLE_API_KEY
|
||||||
GOOGLE_SEARCH_ENGINE_ID = _GOOGLE_SEARCH_ENGINE_ID
|
GOOGLE_SEARCH_ENGINE_ID = _GOOGLE_SEARCH_ENGINE_ID
|
||||||
|
@ -25,8 +27,13 @@ def API_CONFIG(_GOOGLE_API_KEY: str = "", _GOOGLE_SEARCH_ENGINE_ID: str = "") ->
|
||||||
logging.info(f"SEARCH_ENGINE_ID set")
|
logging.info(f"SEARCH_ENGINE_ID set")
|
||||||
|
|
||||||
|
|
||||||
|
def GET_GOOGLE_API_CONFIG() -> tuple[str, str]:
|
||||||
|
global GOOGLE_SEARCH_ENGINE_ID, GOOGLE_API_KEY
|
||||||
|
return (GOOGLE_API_KEY, GOOGLE_SEARCH_ENGINE_ID)
|
||||||
|
|
||||||
|
|
||||||
def NLP_config(mode: str = "default", debug: bool = True) -> None:
|
def NLP_config(mode: str = "default", debug: bool = True) -> None:
|
||||||
global conf_MODE, conf_DEBUG
|
global NLP_CONF_MODE, CONF_DEBUG
|
||||||
CONF_DEBUG = debug
|
CONF_DEBUG = debug
|
||||||
if mode == "accuracy" or mode == "speed":
|
if mode == "accuracy" or mode == "speed":
|
||||||
NLP_CONF_MODE = mode
|
NLP_CONF_MODE = mode
|
||||||
|
|
|
@ -5,7 +5,7 @@ build-backend = "poetry.core.masonry.api"
|
||||||
|
|
||||||
[tool.poetry]
|
[tool.poetry]
|
||||||
name = "internet_ml"
|
name = "internet_ml"
|
||||||
version = "0.1.7"
|
version = "0.2.0"
|
||||||
description = "Internet-ML: Allowing ML to connect to the internet"
|
description = "Internet-ML: Allowing ML to connect to the internet"
|
||||||
readme = "./.github/README.md"
|
readme = "./.github/README.md"
|
||||||
authors = ["Thamognya Kodi <contact@thamognya.com>"]
|
authors = ["Thamognya Kodi <contact@thamognya.com>"]
|
||||||
|
|
Loading…
Reference in New Issue