From 5261d734ded6423d09a1bdd849783b5cc376dfc9 Mon Sep 17 00:00:00 2001
From: Thamognya Kodi
Date: Sat, 14 Jan 2023 21:20:23 +0700
Subject: [PATCH] update: look at todo

---
 internet_ml/NLP/no_context/QA.py       |  45 +++---
 internet_ml/tools/NLP/data/internet.py | 188 +++++++++++++++++++++++--
 2 files changed, 199 insertions(+), 34 deletions(-)

diff --git a/internet_ml/NLP/no_context/QA.py b/internet_ml/NLP/no_context/QA.py
index bc40ffe..64befad 100644
--- a/internet_ml/NLP/no_context/QA.py
+++ b/internet_ml/NLP/no_context/QA.py
@@ -1,4 +1,13 @@
 # type: ignore
+
+"""
+model naming convention
+# Open-AI models:
+include prefix openai-*
+# HuggingFace models:
+include prefix hf-*
+"""
+
 from typing import Any, List, Tuple
 
 import os
@@ -31,11 +40,6 @@ def answer(
     CHATGPT_CONVERSATION_ID: str = "",
     CHATGPT_PARENT_ID: str = "",
 ) -> tuple[Any, list[str]]:
-    # if environment keys are not given, assume it is in env
-    if GOOGLE_SEARCH_API_KEY == "":
-        GOOGLE_SEARCH_API_KEY = str(os.environ.get("GOOGLE_SEARCH_API_KEY"))
-    if GOOGLE_SEARCH_ENGINE_ID == "":
-        GOOGLE_SEARCH_ENGINE_ID = str(os.environ.get("GOOGLE_SEARCH_ENGINE_ID"))
     if OPENAI_API_KEY == "":
         OPENAI_API_KEY = str(os.environ.get("OPENAI_API_KEY"))
     openai.api_key = OPENAI_API_KEY
@@ -45,25 +49,20 @@ def answer(
         CHATGPT_CONVERSATION_ID = str(os.environ.get("CHATGPT_CONVERSATION_ID"))
     if CHATGPT_PARENT_ID == "":
         CHATGPT_PARENT_ID = str(os.environ.get("CHATGPT_PARENT_ID"))
-    """
-    model naming convention
-    # Open-AI models:
-    include prefix openai-*
-    # HuggingFace
-    include prefix hf-*
-    #
-    """
+
     if not (model.startswith("openai-") or model.startswith("hf-")):
         model = "openai-chatgpt"  # Default
+
+    # Search once up front so every model branch reuses the same results.
+    results: tuple[list[str], list[str]] = internet.Google(
+        query, GOOGLE_SEARCH_API_KEY, GOOGLE_SEARCH_ENGINE_ID
+    ).google()
+
     if model.startswith("openai-"):
         if model == "openai-chatgpt":
             # ChatGPT
-            results: tuple[list[str], list[str]] = internet.Google(
-                query, GOOGLE_SEARCH_API_KEY, GOOGLE_SEARCH_ENGINE_ID
-            ).google()
-            # print(' '.join(filter(lambda x: isinstance(x, str), results[0]))[:4000])
             prompt = f"Using the context: {' '.join(filter(lambda x: isinstance(x, str), results[0]))[:3000]} and answer the question with the context above and previous knowledge: \"{query}\". Also write long answers or essays if asked."
             print(prompt)
             chatbot = Chatbot(
                 {"session_token": CHATGPT_SESSION_TOKEN},
                 conversation_id=None,
@@ -77,14 +76,17 @@ def answer(
             return (response["message"], results[1])
         else:
             if model == "openai-text-davinci-003":
-                results: tuple[list[str], list[str]] = internet.Google(
-                    query, GOOGLE_SEARCH_API_KEY, GOOGLE_SEARCH_ENGINE_ID
-                ).google()
-                context = " ".join(results[0])
-                context[: (4097 - len(query) - 10)]
+                # text-davinci-003
+                prompt = f"Using the context: {' '.join(filter(lambda x: isinstance(x, str), results[0]))[:3000]} and answer the question with the context above and previous knowledge: \"{query}\". Also write long answers or essays if asked."
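+                # text-davinci-003 allows 4097 tokens shared between prompt
+                # and completion (the removed code above derived its cap from
+                # 4097), hence the ~3000-character cap on the prompt; the
+                # max_tokens below must leave room for the prompt itself.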
                 response = openai.Completion.create(
                     model="text-davinci-003",
-                    prompt=f"{context} Q: {query}",
-                    max_tokens=len(context),
+                    prompt=prompt,
+                    # `context` no longer exists in this scope; size the
+                    # completion budget off the prompt instead
+                    max_tokens=len(prompt),
                     n=1,
                     stop=None,
@@ -94,9 +96,10 @@ def answer(
-    # TODO: add suport later
+    # TODO: add support later
     else:
         model = model.replace("hf-", "", 1)
-        results: tuple[list[str], list[str]] = internet.Google(
-            query, GOOGLE_SEARCH_API_KEY, GOOGLE_SEARCH_ENGINE_ID
-        ).google()
         qa_model = pipeline("question-answering", model=model)
         response = qa_model(question=query, context=" ".join(results[0]))
         return (response["answer"], results[1])

diff --git a/internet_ml/tools/NLP/data/internet.py b/internet_ml/tools/NLP/data/internet.py
index 6789e39..bf53f4b 100644
--- a/internet_ml/tools/NLP/data/internet.py
+++ b/internet_ml/tools/NLP/data/internet.py
@@ -1,6 +1,7 @@
-from typing import Any, List, Tuple
+from typing import Any, Dict, List, Tuple
 
 import os
+import pickle
 import sys
 from importlib import reload
 from pathlib import Path
@@ -17,18 +18,22 @@
 sys.path.append(str(Path(__file__).parent.parent.parent.parent) + "/utils")
 sys.path.append(str(Path(__file__).parent.parent))
 
 import asyncio
+import concurrent.futures
 import itertools
 import re
 
 import aiohttp
 import config
+from adremover import AdRemover
 from bs4 import BeautifulSoup
+from keywords import get_keywords
 from normalize import normalizer
-
-# from relevancy import filter_irrelevant
+from relevancy import filter_relevant
 from sentencize import sentencizer
 from urlextract import URLExtract
 
+dotenv.load_dotenv()
+
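+# Rough usage sketch (hypothetical values; empty keys fall back to the
+# environment):
+#
+#   sentences, urls = Google("Who is Elon Musk?", "", "").google()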
 
 class Google:
@@ -37,9 +42,21 @@ class Google:
         GOOGLE_SEARCH_API_KEY: str,
         GOOGLE_SEARCH_ENGINE_ID: str,
     ) -> None:
         self.__GOOGLE_SEARCH_API_KEY: str = GOOGLE_SEARCH_API_KEY
         self.__GOOGLE_SEARCH_ENGINE_ID: str = GOOGLE_SEARCH_ENGINE_ID
-        self.__num_res: int = 10
+        # if environment keys are not given, assume they are set in the env
+        if GOOGLE_SEARCH_API_KEY == "":
+            self.__GOOGLE_SEARCH_API_KEY = str(os.environ.get("GOOGLE_SEARCH_API_KEY"))
+        if GOOGLE_SEARCH_ENGINE_ID == "":
+            self.__GOOGLE_SEARCH_ENGINE_ID = str(
+                os.environ.get("GOOGLE_SEARCH_ENGINE_ID")
+            )
+        # 5 results in speed mode; 20 when any other mode is set, else 10
+        self.__num_res: int = (
+            5
+            if config.NLP_CONF_MODE == "speed"
+            else (20 if config.NLP_CONF_MODE else 10)
+        )
         self.__query = query
         self.__URL_EXTRACTOR: URLExtract = URLExtract()
         self.__urls: list[str] = self.__URL_EXTRACTOR.find_urls(query)
@@ -50,8 +67,15 @@ class Google:
                 str(self.__query),
             )
         )
+        self.__content: list[str] = []
+        ADBLOCK_RULES = [
+            "https://easylist-downloads.adblockplus.org/ruadlist+easylist.txt",
+            "https://filters.adtidy.org/extension/chromium/filters/1.txt",
+        ]
+        self.__ad_remover = AdRemover(ADBLOCK_RULES)
 
     def __get_urls(self: "Google") -> None:
+        # Send the request to the Google Search API
         if self.__GOOGLE_SEARCH_API_KEY == "":
             exit("ERROR: Google API Key not found")
         if self.__GOOGLE_SEARCH_ENGINE_ID == "":
             exit("ERROR: Google Search Engine Id not found")
@@ -74,6 +98,7 @@ class Google:
         try:
             async with session.get(url, headers=HTTP_USERAGENT) as response:
                 html = await response.text()
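+                # AdRemover is assumed to apply the EasyList/AdGuard filter
+                # rules downloaded in __init__, stripping ad markup before
+                # the text is parsed and sentencized.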
+                html = self.__ad_remover.remove_ads(html)
                 soup = BeautifulSoup(html, "html.parser")
                 text = soup.get_text()
                 normalized_text = normalizer(text)
@@ -101,11 +126,24 @@ class Google:
         contents = loop.run_until_complete(self.__fetch_urls(self.__urls))
         loop.close()
         self.__content = self.__flatten(contents)
+        self.__content = [str(x) for x in self.__content]
 
-    def google(self: "Google") -> tuple[list[str], list[str]]:
+    def __filter_irrelevant_processing(self: "Google") -> None:
+        # filter_relevant is assumed to take the scraped sentences plus the
+        # query and return only the relevant ones; a single task is
+        # submitted, so the executor merely runs it off the calling thread
+        with concurrent.futures.ThreadPoolExecutor(max_workers=500) as executor:
+            future = executor.submit(filter_relevant, self.__content, self.__query)
+            concurrent.futures.wait([future])
+            self.__content = future.result()
+
+    def google(
+        self: "Google", filter_irrelevant: bool = True
+    ) -> tuple[list[str], list[str]]:
         self.__get_urls()
         self.__get_urls_contents()
-        return (self.__content, self.__urls)
+        if filter_irrelevant:
+            self.__filter_irrelevant_processing()
+        results: tuple[list[str], list[str]] = (self.__content, self.__urls)
+        return results
 """