update: look at todo

parent 581ec23b2a
commit 5f0faa77a4

@@ -60,24 +60,26 @@ def answer(
         # ChatGPT
         results: tuple[list[str], list[str]] = internet.Google(
             query, GOOGLE_SEARCH_API_KEY, GOOGLE_SEARCH_ENGINE_ID
-        ).google(filter_irrelevant=False)
+        ).google()
+        # print(' '.join(filter(lambda x: isinstance(x, str), results[0]))[:4000])
+        prompt = f"Using the context: {' '.join(filter(lambda x: isinstance(x, str), results[0]))[:3000]} and answer the question with the context above and previous knowledge: \"{query}\". Also write long answers or essays if asked."
+        print(prompt)
         chatbot = Chatbot(
             {"session_token": CHATGPT_SESSION_TOKEN},
-            conversation_id=CHATGPT_CONVERSATION_ID,
-            parent_id=CHATGPT_PARENT_ID,
+            conversation_id=None,
+            parent_id=None,
         )
-        prompt = f"Utilize the following context: {' '.join(filter(lambda x: isinstance(x, str), results[0]))[:4000]} and answer the question only with the given context: {query}"
         response = chatbot.ask(
             prompt=prompt,
-            conversation_id=CHATGPT_CONVERSATION_ID,
-            parent_id=CHATGPT_PARENT_ID,
+            conversation_id=None,
+            parent_id=None,
         )
         return (response["message"], results[1])
     else:
         if model == "openai-text-davinci-003":
             results: tuple[list[str], list[str]] = internet.Google(
                 query, GOOGLE_SEARCH_API_KEY, GOOGLE_SEARCH_ENGINE_ID
-            ).google(filter_irrelevant=False)
+            ).google()
            context = " ".join(results[0])
            context[: (4097 - len(query) - 10)]
            response = openai.Completion.create(
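
In the text-davinci-003 branch above, `context[: (4097 - len(query) - 10)]` produces a sliced copy without assigning it, so the untrimmed context is still what feeds `openai.Completion.create`, and 4097 is that model's limit in tokens rather than characters. A minimal sketch of capping the context before building the prompt; the helper name, character budget, and prompt wording are illustrative assumptions, not the repository's code:

    def build_davinci_prompt(query: str, snippets: list[str], char_budget: int = 8000) -> str:
        """Join search snippets and cap their length before prompting the model.

        char_budget is a rough character cap chosen for illustration; a tokenizer
        (e.g. tiktoken) would be needed to respect the real 4097-token limit.
        """
        context = " ".join(s for s in snippets if isinstance(s, str))
        context = context[: max(char_budget - len(query), 0)]  # keep room for the query
        return f"Answer the question using only this context: {context}\nQuestion: {query}"
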
@@ -94,17 +96,15 @@ def answer(
             model = model.replace("hf-", "", 1)
             results: tuple[list[str], list[str]] = internet.Google(
                 query, GOOGLE_SEARCH_API_KEY, GOOGLE_SEARCH_ENGINE_ID
-            ).google(filter_irrelevant=False)
+            ).google()
             qa_model = pipeline("question-answering", model=model)
             response = qa_model(question=query, context=" ".join(results[0]))
             return (response["answer"], results[1])


 # print(os.environ)
 print(
     answer(
-        query="What is Club is Crisitano Ronaldo in 2023?",
-        model="openai-text-davinci-003",
+        query="Best original song in 80th Golden Globe award 2023?",
+        model="openai-chatgpt",
     )
 )
 # def custom_answer
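
The Hugging Face branch feeds the joined search results straight into a `question-answering` pipeline, which extracts an answer span from that context. A minimal standalone sketch of the same call; the model name and inputs here are placeholders, not the project's configuration:

    from transformers import pipeline

    # Extractive QA: the model selects a span from the supplied context.
    qa_model = pipeline("question-answering", model="distilbert-base-cased-distilled-squad")
    result = qa_model(
        question="Who won best original song?",
        context="The award for best original song went to ...",
    )
    print(result["answer"], result["score"])  # answer span plus a confidence score
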
@@ -1,7 +1,6 @@
-from typing import Any, Dict, List, Tuple
+from typing import Any, List, Tuple

 import os
 import pickle
 import sys
 from importlib import reload
 from pathlib import Path

@@ -18,22 +17,18 @@ sys.path.append(str(Path(__file__).parent.parent.parent.parent) + "/utils")
 sys.path.append(str(Path(__file__).parent.parent))

 import asyncio
 import concurrent.futures
 import itertools
 import re

 import aiohttp
 import config
-from adremover import AdRemover
 from bs4 import BeautifulSoup
 from keywords import get_keywords
 from normalize import normalizer
-from relevancy import filter_relevant

+# from relevancy import filter_irrelevant
 from sentencize import sentencizer
 from urlextract import URLExtract

 dotenv.load_dotenv()


 class Google:
     def __init__(

@@ -44,11 +39,7 @@ class Google:
     ) -> None:
         self.__GOOGLE_SEARCH_API_KEY: str = GOOGLE_SEARCH_API_KEY
         self.__GOOGLE_SEARCH_ENGINE_ID: str = GOOGLE_SEARCH_ENGINE_ID
-        self.__num_res: int = (
-            5
-            if config.NLP_CONF_MODE == "speed"
-            else (20 if config.NLP_CONF_MODE else 10)
-        )
+        self.__num_res: int = 10
         self.__query = query
         self.__URL_EXTRACTOR: URLExtract = URLExtract()
         self.__urls: list[str] = self.__URL_EXTRACTOR.find_urls(query)

@@ -59,15 +50,8 @@ class Google:
                 str(self.__query),
             )
         )
         self.__content: list[str] = []
-        ADBLOCK_RULES = [
-            "https://easylist-downloads.adblockplus.org/ruadlist+easylist.txt",
-            "https://filters.adtidy.org/extension/chromium/filters/1.txt",
-        ]
-        self.__ad_remover = AdRemover(ADBLOCK_RULES)

     def __get_urls(self: "Google") -> None:
         # Send the request to the Google Search API
         if self.__GOOGLE_SEARCH_API_KEY == "":
             exit("ERROR: Google API Key not found")
         if self.__GOOGLE_SEARCH_ENGINE_ID == "":
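
The body of `__get_urls` is not fully shown here beyond the credential checks; presumably it queries the Google Custom Search JSON API for result links. A sketch of what that request typically looks like; the function name and the use of `requests` are illustrative assumptions, the class itself may use a different HTTP client:

    import requests

    def search_urls(query: str, api_key: str, engine_id: str, num_res: int = 10) -> list[str]:
        # Google Custom Search JSON API; returns up to `num_res` result links (API max is 10 per request).
        response = requests.get(
            "https://www.googleapis.com/customsearch/v1",
            params={"key": api_key, "cx": engine_id, "q": query, "num": num_res},
            timeout=10,
        )
        response.raise_for_status()
        return [item["link"] for item in response.json().get("items", [])]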
|
@@ -90,7 +74,6 @@ class Google:
             try:
                 async with session.get(url, headers=HTTP_USERAGENT) as response:
                     html = await response.text()
-                    html = self.__ad_remover.remove_ads(html)
                     soup = BeautifulSoup(html, "html.parser")
                     text = soup.get_text()
                     normalized_text = normalizer(text)
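
With the ad-removal step dropped, each page is simply downloaded with aiohttp and reduced to its visible text with BeautifulSoup. A self-contained sketch of that fetch loop; the user-agent header is a placeholder, and the normalization and sentencizing steps the class applies afterwards are omitted:

    import asyncio
    import aiohttp
    from bs4 import BeautifulSoup

    HTTP_USERAGENT = {"User-Agent": "Mozilla/5.0"}  # placeholder header

    async def fetch_text(session: aiohttp.ClientSession, url: str) -> str:
        # Download a page and strip it down to its visible text.
        async with session.get(url, headers=HTTP_USERAGENT) as response:
            html = await response.text()
        return BeautifulSoup(html, "html.parser").get_text()

    async def fetch_all(urls: list[str]) -> list[str]:
        # Fetch all pages concurrently on a shared session.
        async with aiohttp.ClientSession() as session:
            return await asyncio.gather(*(fetch_text(session, u) for u in urls))

    # texts = asyncio.run(fetch_all(["https://example.com"]))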
|
@@ -118,26 +101,11 @@ class Google:
         contents = loop.run_until_complete(self.__fetch_urls(self.__urls))
         loop.close()
         self.__content = self.__flatten(contents)
         self.__content = [str(x) for x in self.__content]
-
-    def __filter_irrelevant_processing(self: "Google") -> None:
-        with concurrent.futures.ThreadPoolExecutor(max_workers=500) as executor:
-            futures = [executor.submit(filter_relevant, self.__content, self.__query)]
-            concurrent.futures.wait(futures)
-        content: list[str] = []
-        for future in futures:
-            content.append(future.result())
-        self.__content = content

-    def google(
-        self: "Google", filter_irrelevant: bool = True
-    ) -> tuple[list[str], list[str]]:
+    def google(self: "Google") -> tuple[list[str], list[str]]:
         self.__get_urls()
         self.__get_urls_contents()
-        if filter_irrelevant:
-            self.__filter_irrelevant_processing()
-        results: tuple[list[str], list[str]] = (self.__content, self.__urls)
-        return results
+        return (self.__content, self.__urls)


 """