update: look at todo
parent 5f0faa77a4
commit 5261d734de
@@ -1,4 +1,13 @@
 # type: ignore
+
+"""
+model naming convention
+# Open-AI models:
+include prefix openai-*
+# HuggingFace
+include prefix hf-*
+"""
+
 from typing import Any, List, Tuple

 import os
@@ -31,11 +40,6 @@ def answer(
     CHATGPT_CONVERSATION_ID: str = "",
     CHATGPT_PARENT_ID: str = "",
 ) -> tuple[Any, list[str]]:
-    # if environment keys are not given, assume it is in env
-    if GOOGLE_SEARCH_API_KEY == "":
-        GOOGLE_SEARCH_API_KEY = str(os.environ.get("GOOGLE_SEARCH_API_KEY"))
-    if GOOGLE_SEARCH_ENGINE_ID == "":
-        GOOGLE_SEARCH_ENGINE_ID = str(os.environ.get("GOOGLE_SEARCH_ENGINE_ID"))
     if OPENAI_API_KEY == "":
         OPENAI_API_KEY = str(os.environ.get("OPENAI_API_KEY"))
     openai.api_key = OPENAI_API_KEY
@@ -45,25 +49,20 @@ def answer(
         CHATGPT_CONVERSATION_ID = str(os.environ.get("CHATGPT_CONVERSATION_ID"))
     if CHATGPT_PARENT_ID == "":
         CHATGPT_PARENT_ID = str(os.environ.get("CHATGPT_PARENT_ID"))
-    """
-    model naming convention
-    # Open-AI models:
-    include prefix openai-*
-    # HuggingFace
-    include prefix hf-*
-    #
-    """
     if not (model.startswith("openai-") or model.startswith("hf-")):
         model = "openai-chatgpt"  # Default

+    results: tuple[list[str], list[str]] = internet.Google(
+        query, GOOGLE_SEARCH_API_KEY, GOOGLE_SEARCH_ENGINE_ID
+    ).google()
+
     if model.startswith("openai-"):
         if model == "openai-chatgpt":
             # ChatGPT
-            results: tuple[list[str], list[str]] = internet.Google(
-                query, GOOGLE_SEARCH_API_KEY, GOOGLE_SEARCH_ENGINE_ID
-            ).google()
-            # print(' '.join(filter(lambda x: isinstance(x, str), results[0]))[:4000])
             prompt = f"Using the context: {' '.join(filter(lambda x: isinstance(x, str), results[0]))[:3000]} and answer the question with the context above and previous knowledge: \"{query}\". Also write long answers or essays if asked."
             print(prompt)
+            exit(1)
             chatbot = Chatbot(
                 {"session_token": CHATGPT_SESSION_TOKEN},
                 conversation_id=None,
@@ -77,14 +76,11 @@ def answer(
             return (response["message"], results[1])
         else:
             if model == "openai-text-davinci-003":
-                results: tuple[list[str], list[str]] = internet.Google(
-                    query, GOOGLE_SEARCH_API_KEY, GOOGLE_SEARCH_ENGINE_ID
-                ).google()
-                context = " ".join(results[0])
-                context[: (4097 - len(query) - 10)]
+                # text-davinci-003
+                prompt = f"Using the context: {' '.join(filter(lambda x: isinstance(x, str), results[0]))[:3000]} and answer the question with the context above and previous knowledge: \"{query}\". Also write long answers or essays if asked."
                 response = openai.Completion.create(
                     model="text-davinci-003",
-                    prompt=f"{context} Q: {query}",
+                    prompt=prompt,
                     max_tokens=len(context),
                     n=1,
                     stop=None,
@@ -94,9 +90,6 @@ def answer(
             # TODO: add suport later
     else:
         model = model.replace("hf-", "", 1)
-        results: tuple[list[str], list[str]] = internet.Google(
-            query, GOOGLE_SEARCH_API_KEY, GOOGLE_SEARCH_ENGINE_ID
-        ).google()
         qa_model = pipeline("question-answering", model=model)
         response = qa_model(question=query, context=" ".join(results[0]))
         return (response["answer"], results[1])
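Review note: two things stand out in the new `answer()`. The added `exit(1)` directly after `print(prompt)` makes the `Chatbot` call unreachable, which is presumably the "todo" the commit title points at. Separately, the text-davinci-003 branch still passes `max_tokens=len(context)` even though this commit deletes the `context` variable, so that path now raises a NameError. Below is a minimal sketch of one way to size the completion instead, assuming a rough character-based token estimate; `MAX_WINDOW` and `completion_budget` are hypothetical names, not part of the commit (4097 comes from the deleted `4097 - len(query) - 10` arithmetic).

    MAX_WINDOW = 4097  # text-davinci-003 window, prompt + completion tokens (assumption)

    def completion_budget(prompt: str) -> int:
        # Very rough heuristic: ~4 characters per token for English text.
        prompt_tokens = len(prompt) // 4
        return max(16, MAX_WINDOW - prompt_tokens)

    # Then, in the davinci branch:
    # response = openai.Completion.create(
    #     model="text-davinci-003",
    #     prompt=prompt,
    #     max_tokens=completion_budget(prompt),
    #     n=1,
    #     stop=None,
    # )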
@@ -1,6 +1,142 @@
-from typing import Any, List, Tuple
+# from typing import Any, List, Tuple

+# import os
+# import sys
+# from importlib import reload
+# from pathlib import Path
+
+# import dotenv
+# import requests
+
+# HTTP_USERAGENT: dict[str, str] = {
+#     "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
+# }
+
+# sys.path.append(str(Path(__file__).parent.parent.parent.parent) + "/utils/NLP")
+# sys.path.append(str(Path(__file__).parent.parent.parent.parent) + "/utils")
+# sys.path.append(str(Path(__file__).parent.parent))
+
+# import asyncio
+# import itertools
+# import re
+
+# import aiohttp
+# import config
+# from bs4 import BeautifulSoup
+# from normalize import normalizer
+
+# # from relevancy import filter_irrelevant
+# from sentencize import sentencizer
+# from urlextract import URLExtract
+# from adremover import AdRemover
+
+
+# class Google:
+#     def __init__(
+#         self: "Google",
+#         query: str,
+#         GOOGLE_SEARCH_API_KEY: str,
+#         GOOGLE_SEARCH_ENGINE_ID: str,
+#     ) -> None:
+#         self.__GOOGLE_SEARCH_API_KEY: str = GOOGLE_SEARCH_API_KEY
+#         self.__GOOGLE_SEARCH_ENGINE_ID: str = GOOGLE_SEARCH_ENGINE_ID
+
+#         # if environment keys are not given, assume it is in env
+#         if GOOGLE_SEARCH_API_KEY == "":
+#             self.__GOOGLE_SEARCH_API_KEY = str(os.environ.get("GOOGLE_SEARCH_API_KEY"))
+#         if GOOGLE_SEARCH_ENGINE_ID == "":
+#             self.__GOOGLE_SEARCH_ENGINE_ID = str(os.environ.get("GOOGLE_SEARCH_ENGINE_ID"))
+
+#         self.__num_res: int = 10
+#         self.__query = query
+#         self.__URL_EXTRACTOR: URLExtract = URLExtract()
+#         self.__urls: list[str] = self.__URL_EXTRACTOR.find_urls(query)
+#         self.__query = str(
+#             re.sub(
+#                 r"\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*",
+#                 "",
+#                 str(self.__query),
+#             )
+#         )
+
+#     def __get_urls(self: "Google") -> None:
+#         if self.__GOOGLE_SEARCH_API_KEY == "":
+#             exit("ERROR: Google API Key not found")
+#         if self.__GOOGLE_SEARCH_ENGINE_ID == "":
+#             exit("ERROR: Google Search Engine Id not found")
+#         response = requests.get(
+#             "https://www.googleapis.com/customsearch/v1",
+#             params={
+#                 "key": self.__GOOGLE_SEARCH_API_KEY,
+#                 "q": self.__query,
+#                 "cx": self.__GOOGLE_SEARCH_ENGINE_ID,
+#             },
+#         )
+#         results = response.json()["items"]
+#         for result in results:
+#             self.__urls.append(result["link"])
+#             if len(self.__urls) == self.__num_res:
+#                 break
+
+#     async def __fetch_url(self: "Google", session: Any, url: str) -> list[str]:
+#         try:
+#             async with session.get(url, headers=HTTP_USERAGENT) as response:
+#                 html = await response.text()
+#                 soup = BeautifulSoup(html, "html.parser")
+#                 text = soup.get_text()
+#                 normalized_text = normalizer(text)
+#                 sentences: list[str] = sentencizer(normalized_text)
+#                 return sentences
+#         except aiohttp.ClientConnectorError:
+#             return [""]
+#         except Exception:
+#             return [""]
+
+#     async def __fetch_urls(self: "Google", urls: list[str]) -> Any:
+#         async with aiohttp.ClientSession() as session:
+#             tasks = [
+#                 asyncio.create_task(self.__fetch_url(session, url)) for url in urls
+#             ]
+#             results = await asyncio.gather(*tasks)
+#             return results
+
+#     def __flatten(self: Any, a: list[list[Any]]) -> list[Any]:
+#         return list(itertools.chain(*a))
+
+#     def __get_urls_contents(self: "Google") -> None:
+#         loop = asyncio.new_event_loop()
+#         asyncio.set_event_loop(loop)
+#         contents = loop.run_until_complete(self.__fetch_urls(self.__urls))
+#         loop.close()
+#         self.__content = self.__flatten(contents)
+
+#     def google(self: "Google") -> tuple[list[str], list[str]]:
+#         self.__get_urls()
+#         self.__get_urls_contents()
+#         return (self.__content, self.__urls)
+
+
+# """
+# Timing:
+# import time
+# start_time = time.time()
+# google("Who is Elon Musk")
+# print("--- %s seconds ---" % (time.time() - start_time))
+
+# # Results:
+
+# # --- 2.2230100631713867 seconds ---
+
+# # ________________________________________________________
+# # Executed in 4.73 secs fish external
+# # usr time 3.35 secs 85.00 micros 3.35 secs
+# # sys time 1.86 secs 956.00 micros 1.86 secs
+# """
+
+from typing import Any, Dict, List, Tuple
+
 import os
+import pickle
 import sys
 from importlib import reload
 from pathlib import Path
@@ -17,18 +153,22 @@ sys.path.append(str(Path(__file__).parent.parent.parent.parent) + "/utils")
 sys.path.append(str(Path(__file__).parent.parent))

 import asyncio
+import concurrent.futures
 import itertools
 import re

 import aiohttp
 import config
+from adremover import AdRemover
 from bs4 import BeautifulSoup
+from keywords import get_keywords
 from normalize import normalizer
-# from relevancy import filter_irrelevant
+from relevancy import filter_relevant
 from sentencize import sentencizer
 from urlextract import URLExtract

+dotenv.load_dotenv()
+

 class Google:
     def __init__(
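Review note: `dotenv.load_dotenv()` runs at import time, so the `os.environ.get(...)` fallbacks in this module can now be satisfied from a local `.env` file. A minimal sketch of the pattern, assuming the key names used in this diff:

    import os

    import dotenv

    dotenv.load_dotenv()  # loads KEY=value pairs from ./.env into os.environ

    # .env would contain, e.g.:
    #   GOOGLE_SEARCH_API_KEY=<your key>
    #   GOOGLE_SEARCH_ENGINE_ID=<your engine id>
    api_key = str(os.environ.get("GOOGLE_SEARCH_API_KEY"))
    engine_id = str(os.environ.get("GOOGLE_SEARCH_ENGINE_ID"))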
@@ -37,9 +177,18 @@ class Google:
         GOOGLE_SEARCH_API_KEY: str,
         GOOGLE_SEARCH_ENGINE_ID: str,
     ) -> None:
-        self.__GOOGLE_SEARCH_API_KEY: str = GOOGLE_SEARCH_API_KEY
-        self.__GOOGLE_SEARCH_ENGINE_ID: str = GOOGLE_SEARCH_ENGINE_ID
-        self.__num_res: int = 10
+        # if environment keys are not given, assume it is in env
+        if GOOGLE_SEARCH_API_KEY == "":
+            self.__GOOGLE_SEARCH_API_KEY = str(os.environ.get("GOOGLE_SEARCH_API_KEY"))
+        if GOOGLE_SEARCH_ENGINE_ID == "":
+            self.__GOOGLE_SEARCH_ENGINE_ID = str(
+                os.environ.get("GOOGLE_SEARCH_ENGINE_ID")
+            )
+        self.__num_res: int = (
+            5
+            if config.NLP_CONF_MODE == "speed"
+            else (20 if config.NLP_CONF_MODE else 10)
+        )
         self.__query = query
         self.__URL_EXTRACTOR: URLExtract = URLExtract()
         self.__urls: list[str] = self.__URL_EXTRACTOR.find_urls(query)
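Review note: with the unconditional assignments deleted, `self.__GOOGLE_SEARCH_API_KEY` and `self.__GOOGLE_SEARCH_ENGINE_ID` are only bound when the constructor arguments are empty strings, so passing real keys would leave the attributes unset. Also, if `config.NLP_CONF_MODE` is a string, the bare `if config.NLP_CONF_MODE` test means the default of 10 results is reachable only when the mode is empty. A standalone sketch of the presumably intended behaviour; "accuracy" is a guessed mode name, only "speed" appears in the diff:

    import os

    def resolve_key(param: str, env_name: str) -> str:
        # Keep the explicit argument when given; otherwise fall back to the
        # environment, mirroring the deleted assignments plus the new env fallback.
        return param if param else str(os.environ.get(env_name))

    def resolve_num_res(mode: str) -> int:
        if mode == "speed":
            return 5
        if mode == "accuracy":  # hypothetical mode name, not confirmed by the diff
            return 20
        return 10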
@@ -50,8 +199,15 @@ class Google:
                 str(self.__query),
             )
         )
+        self.__content: list[str] = []
+        ADBLOCK_RULES = [
+            "https://easylist-downloads.adblockplus.org/ruadlist+easylist.txt",
+            "https://filters.adtidy.org/extension/chromium/filters/1.txt",
+        ]
+        self.__ad_remover = AdRemover(ADBLOCK_RULES)

     def __get_urls(self: "Google") -> None:
+        # Send the request to the Google Search API
         if self.__GOOGLE_SEARCH_API_KEY == "":
             exit("ERROR: Google API Key not found")
         if self.__GOOGLE_SEARCH_ENGINE_ID == "":
@@ -74,6 +230,7 @@ class Google:
         try:
             async with session.get(url, headers=HTTP_USERAGENT) as response:
                 html = await response.text()
+                html = self.__ad_remover.remove_ads(html)
                 soup = BeautifulSoup(html, "html.parser")
                 text = soup.get_text()
                 normalized_text = normalizer(text)
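Review note: the new `remove_ads()` pass runs on the raw HTML before BeautifulSoup parses it, so filtered ad markup never reaches `get_text()`. A self-contained sketch of the same pipeline shape; `AdRemover`, `normalizer`, and `sentencizer` are the repo's own helpers and are stubbed here so the sketch runs on its own:

    from bs4 import BeautifulSoup

    def remove_ads_stub(html: str) -> str:
        # Stand-in for AdRemover.remove_ads(), which applies adblock filter rules.
        return html

    def html_to_sentences(html: str) -> list[str]:
        html = remove_ads_stub(html)                          # 1. strip ad markup first
        text = BeautifulSoup(html, "html.parser").get_text()  # 2. flatten to plain text
        # 3. naive stand-in for the repo's normalizer/sentencizer pair
        return [s.strip() for s in text.split(".") if s.strip()]

    print(html_to_sentences("<p>First sentence. Second sentence.</p>"))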
@@ -101,11 +258,26 @@ class Google:
         contents = loop.run_until_complete(self.__fetch_urls(self.__urls))
         loop.close()
         self.__content = self.__flatten(contents)
+        self.__content = [str(x) for x in self.__content]

-    def google(self: "Google") -> tuple[list[str], list[str]]:
+    def __filter_irrelevant_processing(self: "Google") -> None:
+        with concurrent.futures.ThreadPoolExecutor(max_workers=500) as executor:
+            futures = [executor.submit(filter_relevant, self.__content, self.__query)]
+            concurrent.futures.wait(futures)
+            content: list[str] = []
+            for future in futures:
+                content.append(future.result())
+            self.__content = content
+
+    def google(
+        self: "Google", filter_irrelevant: bool = True
+    ) -> tuple[list[str], list[str]]:
         self.__get_urls()
         self.__get_urls_contents()
-        return (self.__content, self.__urls)
+        if filter_irrelevant:
+            self.__filter_irrelevant_processing()
+        results: tuple[list[str], list[str]] = (self.__content, self.__urls)
+        return results


 """
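Review note: `__filter_irrelevant_processing` submits a single future covering the whole content list, so the 500-worker pool adds no parallelism, and the loop leaves `self.__content` as a one-element list wrapping whatever `filter_relevant` returns; a direct `self.__content = filter_relevant(self.__content, self.__query)` would likely be simpler and equivalent. With the new signature, callers get the relevancy pass by default. A usage sketch, assuming the module is `internet.py` (as `answer()` imports it) and credentials come from the environment, where empty strings trigger the env fallback:

    import internet

    # Filtered results (the default):
    content, urls = internet.Google("Who is Elon Musk", "", "").google()

    # Raw results, skipping the relevancy pass:
    raw_content, raw_urls = internet.Google("Who is Elon Musk", "", "").google(
        filter_irrelevant=False
    )
    print(urls)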
|