diff --git a/internet_ml/NLP/no_context/QA.py b/internet_ml/NLP/no_context/QA.py
index c884a54..bc40ffe 100644
--- a/internet_ml/NLP/no_context/QA.py
+++ b/internet_ml/NLP/no_context/QA.py
@@ -60,24 +60,26 @@ def answer(
         # ChatGPT
         results: tuple[list[str], list[str]] = internet.Google(
             query, GOOGLE_SEARCH_API_KEY, GOOGLE_SEARCH_ENGINE_ID
-        ).google(filter_irrelevant=False)
+        ).google()
+        # print(' '.join(filter(lambda x: isinstance(x, str), results[0]))[:4000])
+        prompt = f"Using the context: {' '.join(filter(lambda x: isinstance(x, str), results[0]))[:3000]} and answer the question with the context above and previous knowledge: \"{query}\". Also write long answers or essays if asked."
+        print(prompt)
         chatbot = Chatbot(
             {"session_token": CHATGPT_SESSION_TOKEN},
-            conversation_id=CHATGPT_CONVERSATION_ID,
-            parent_id=CHATGPT_PARENT_ID,
+            conversation_id=None,
+            parent_id=None,
         )
-        prompt = f"Utilize the following context: {' '.join(filter(lambda x: isinstance(x, str), results[0]))[:4000]} and answer the question only with the given context: {query}"
         response = chatbot.ask(
             prompt=prompt,
-            conversation_id=CHATGPT_CONVERSATION_ID,
-            parent_id=CHATGPT_PARENT_ID,
+            conversation_id=None,
+            parent_id=None,
         )
         return (response["message"], results[1])
     else:
         if model == "openai-text-davinci-003":
             results: tuple[list[str], list[str]] = internet.Google(
                 query, GOOGLE_SEARCH_API_KEY, GOOGLE_SEARCH_ENGINE_ID
-            ).google(filter_irrelevant=False)
+            ).google()
             context = " ".join(results[0])
             context[: (4097 - len(query) - 10)]
             response = openai.Completion.create(
@@ -94,17 +96,15 @@ def answer(
             model = model.replace("hf-", "", 1)
             results: tuple[list[str], list[str]] = internet.Google(
                 query, GOOGLE_SEARCH_API_KEY, GOOGLE_SEARCH_ENGINE_ID
-            ).google(filter_irrelevant=False)
+            ).google()
             qa_model = pipeline("question-answering", model=model)
             response = qa_model(question=query, context=" ".join(results[0]))
             return (response["answer"], results[1])


-# print(os.environ)
 print(
     answer(
-        query="What is Club is Crisitano Ronaldo in 2023?",
-        model="openai-text-davinci-003",
+        query="Best original song in 80th Golden Globe award 2023?",
+        model="openai-chatgpt",
     )
 )
-# def custom_answer
diff --git a/internet_ml/tools/NLP/data/internet.py b/internet_ml/tools/NLP/data/internet.py
index 0df7c06..6789e39 100644
--- a/internet_ml/tools/NLP/data/internet.py
+++ b/internet_ml/tools/NLP/data/internet.py
@@ -1,7 +1,6 @@
-from typing import Any, Dict, List, Tuple
+from typing import Any, List, Tuple

 import os
-import pickle
 import sys
 from importlib import reload
 from pathlib import Path
@@ -18,22 +17,18 @@ sys.path.append(str(Path(__file__).parent.parent.parent.parent) + "/utils")
 sys.path.append(str(Path(__file__).parent.parent))

 import asyncio
-import concurrent.futures
 import itertools
 import re

 import aiohttp
 import config
-from adremover import AdRemover
 from bs4 import BeautifulSoup
-from keywords import get_keywords
 from normalize import normalizer
-from relevancy import filter_relevant
+
+# from relevancy import filter_irrelevant
 from sentencize import sentencizer
 from urlextract import URLExtract

-dotenv.load_dotenv()
-

 class Google:
     def __init__(
@@ -44,11 +39,7 @@ class Google:
     ) -> None:
         self.__GOOGLE_SEARCH_API_KEY: str = GOOGLE_SEARCH_API_KEY
         self.__GOOGLE_SEARCH_ENGINE_ID: str = GOOGLE_SEARCH_ENGINE_ID
-        self.__num_res: int = (
-            5
-            if config.NLP_CONF_MODE == "speed"
-            else (20 if config.NLP_CONF_MODE else 10)
-        )
+        self.__num_res: int = 10
         self.__query = query
         self.__URL_EXTRACTOR: URLExtract = URLExtract()
         self.__urls: list[str] = self.__URL_EXTRACTOR.find_urls(query)
@@ -59,15 +50,8 @@ class Google:
                 str(self.__query),
             )
         )
-        self.__content: list[str] = []
-        ADBLOCK_RULES = [
-            "https://easylist-downloads.adblockplus.org/ruadlist+easylist.txt",
-            "https://filters.adtidy.org/extension/chromium/filters/1.txt",
-        ]
-        self.__ad_remover = AdRemover(ADBLOCK_RULES)

     def __get_urls(self: "Google") -> None:
-        # Send the request to the Google Search API
         if self.__GOOGLE_SEARCH_API_KEY == "":
             exit("ERROR: Google API Key not found")
         if self.__GOOGLE_SEARCH_ENGINE_ID == "":
@@ -90,7 +74,6 @@ class Google:
             try:
                 async with session.get(url, headers=HTTP_USERAGENT) as response:
                     html = await response.text()
-                    html = self.__ad_remover.remove_ads(html)
                     soup = BeautifulSoup(html, "html.parser")
                     text = soup.get_text()
                     normalized_text = normalizer(text)
@@ -118,26 +101,11 @@ class Google:
         contents = loop.run_until_complete(self.__fetch_urls(self.__urls))
         loop.close()
         self.__content = self.__flatten(contents)
-        self.__content = [str(x) for x in self.__content]

-    def __filter_irrelevant_processing(self: "Google") -> None:
-        with concurrent.futures.ThreadPoolExecutor(max_workers=500) as executor:
-            futures = [executor.submit(filter_relevant, self.__content, self.__query)]
-            concurrent.futures.wait(futures)
-            content: list[str] = []
-            for future in futures:
-                content.append(future.result())
-            self.__content = content
-
-    def google(
-        self: "Google", filter_irrelevant: bool = True
-    ) -> tuple[list[str], list[str]]:
+    def google(self: "Google") -> tuple[list[str], list[str]]:
         self.__get_urls()
         self.__get_urls_contents()
-        if filter_irrelevant:
-            self.__filter_irrelevant_processing()
-        results: tuple[list[str], list[str]] = (self.__content, self.__urls)
-        return results
+        return (self.__content, self.__urls)


"""