update: look at todo

main
Thamognya Kodi 2023-01-14 21:20:23 +07:00
parent 5f0faa77a4
commit 5261d734de
2 changed files with 199 additions and 34 deletions

View File

@@ -1,4 +1,13 @@
 # type: ignore
+"""
+model naming convention
+# Open-AI models:
+include prefix openai-*
+# HuggingFace
+include prefix hf-*
+"""
 from typing import Any, List, Tuple
 import os
@@ -31,11 +40,6 @@ def answer(
     CHATGPT_CONVERSATION_ID: str = "",
     CHATGPT_PARENT_ID: str = "",
 ) -> tuple[Any, list[str]]:
-    # if environment keys are not given, assume it is in env
-    if GOOGLE_SEARCH_API_KEY == "":
-        GOOGLE_SEARCH_API_KEY = str(os.environ.get("GOOGLE_SEARCH_API_KEY"))
-    if GOOGLE_SEARCH_ENGINE_ID == "":
-        GOOGLE_SEARCH_ENGINE_ID = str(os.environ.get("GOOGLE_SEARCH_ENGINE_ID"))
     if OPENAI_API_KEY == "":
         OPENAI_API_KEY = str(os.environ.get("OPENAI_API_KEY"))
     openai.api_key = OPENAI_API_KEY
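Note: the Google-key fallback removed here reappears inside `Google.__init__` in the second file. The general pattern, sketched with a hypothetical helper; worth noting that `str(os.environ.get(...))` turns a missing variable into the literal string `"None"` rather than leaving it empty:

```python
import os

def env_fallback(value: str, var: str) -> str:
    """Sketch: prefer the explicit argument, else read the environment.

    Unlike str(os.environ.get(var)), a missing variable stays "" here
    instead of becoming the string "None".
    """
    return value if value else os.environ.get(var, "")
```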
@@ -45,25 +49,20 @@ def answer(
         CHATGPT_CONVERSATION_ID = str(os.environ.get("CHATGPT_CONVERSATION_ID"))
     if CHATGPT_PARENT_ID == "":
         CHATGPT_PARENT_ID = str(os.environ.get("CHATGPT_PARENT_ID"))
-    """
-    model naming convention
-    # Open-AI models:
-    include prefix openai-*
-    # HuggingFace
-    include prefix hf-*
-    #
-    """
     if not (model.startswith("openai-") or model.startswith("hf-")):
         model = "openai-chatgpt"  # Default
+    results: tuple[list[str], list[str]] = internet.Google(
+        query, GOOGLE_SEARCH_API_KEY, GOOGLE_SEARCH_ENGINE_ID
+    ).google()
     if model.startswith("openai-"):
         if model == "openai-chatgpt":
             # ChatGPT
-            results: tuple[list[str], list[str]] = internet.Google(
-                query, GOOGLE_SEARCH_API_KEY, GOOGLE_SEARCH_ENGINE_ID
-            ).google()
-            # print(' '.join(filter(lambda x: isinstance(x, str), results[0]))[:4000])
             prompt = f"Using the context: {' '.join(filter(lambda x: isinstance(x, str), results[0]))[:3000]} and answer the question with the context above and previous knowledge: \"{query}\". Also write long answers or essays if asked."
             print(prompt)
-            exit(1)
             chatbot = Chatbot(
                 {"session_token": CHATGPT_SESSION_TOKEN},
                 conversation_id=None,
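Note: the Google search now runs once before the model branch instead of once per branch, and the debugging `exit(1)` after `print(prompt)` is gone. The prompt construction both OpenAI branches share, extracted as a sketch (hypothetical helper name, not from the commit):

```python
def build_prompt(sentences: list, query: str, limit: int = 3000) -> str:
    """Sketch: join string results and cap the context at `limit` chars."""
    context = " ".join(filter(lambda x: isinstance(x, str), sentences))[:limit]
    return (
        f"Using the context: {context} and answer the question with the "
        f'context above and previous knowledge: "{query}". '
        "Also write long answers or essays if asked."
    )
```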
@@ -77,14 +76,11 @@ def answer(
             return (response["message"], results[1])
         else:
             if model == "openai-text-davinci-003":
-                results: tuple[list[str], list[str]] = internet.Google(
-                    query, GOOGLE_SEARCH_API_KEY, GOOGLE_SEARCH_ENGINE_ID
-                ).google()
-                context = " ".join(results[0])
-                context[: (4097 - len(query) - 10)]
+                # text-davinci-003
+                prompt = f"Using the context: {' '.join(filter(lambda x: isinstance(x, str), results[0]))[:3000]} and answer the question with the context above and previous knowledge: \"{query}\". Also write long answers or essays if asked."
                 response = openai.Completion.create(
                     model="text-davinci-003",
-                    prompt=f"{context} Q: {query}",
+                    prompt=prompt,
                     max_tokens=len(context),
                     n=1,
                     stop=None,
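Note: this hunk keeps `max_tokens=len(context)` even though the `context` variable it removes is no longer defined anywhere in the branch, so the davinci path would raise a `NameError` as committed. A sketch of the call with an explicit output budget instead; the fixed number is an assumption, not from the commit:

```python
import openai

def davinci_answer(prompt: str, max_output_tokens: int = 1024) -> str:
    """Sketch: text-davinci-003 completion with a fixed output budget."""
    response = openai.Completion.create(
        model="text-davinci-003",
        prompt=prompt,
        max_tokens=max_output_tokens,  # assumption: replaces len(context)
        n=1,
        stop=None,
    )
    return response.choices[0].text
```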
@@ -94,9 +90,6 @@ def answer(
             # TODO: add support later
     else:
         model = model.replace("hf-", "", 1)
-        results: tuple[list[str], list[str]] = internet.Google(
-            query, GOOGLE_SEARCH_API_KEY, GOOGLE_SEARCH_ENGINE_ID
-        ).google()
         qa_model = pipeline("question-answering", model=model)
         response = qa_model(question=query, context=" ".join(results[0]))
         return (response["answer"], results[1])

View File

@@ -1,6 +1,142 @@
-from typing import Any, List, Tuple
+# from typing import Any, List, Tuple
+# import os
+# import sys
+# from importlib import reload
+# from pathlib import Path
+# import dotenv
+# import requests
+# HTTP_USERAGENT: dict[str, str] = {
+#     "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
+# }
+# sys.path.append(str(Path(__file__).parent.parent.parent.parent) + "/utils/NLP")
+# sys.path.append(str(Path(__file__).parent.parent.parent.parent) + "/utils")
+# sys.path.append(str(Path(__file__).parent.parent))
+# import asyncio
+# import itertools
+# import re
+# import aiohttp
+# import config
+# from bs4 import BeautifulSoup
+# from normalize import normalizer
+# # from relevancy import filter_irrelevant
+# from sentencize import sentencizer
+# from urlextract import URLExtract
+# from adremover import AdRemover
+# class Google:
+#     def __init__(
+#         self: "Google",
+#         query: str,
+#         GOOGLE_SEARCH_API_KEY: str,
+#         GOOGLE_SEARCH_ENGINE_ID: str,
+#     ) -> None:
+#         self.__GOOGLE_SEARCH_API_KEY: str = GOOGLE_SEARCH_API_KEY
+#         self.__GOOGLE_SEARCH_ENGINE_ID: str = GOOGLE_SEARCH_ENGINE_ID
+#         # if environment keys are not given, assume it is in env
+#         if GOOGLE_SEARCH_API_KEY == "":
+#             self.__GOOGLE_SEARCH_API_KEY = str(os.environ.get("GOOGLE_SEARCH_API_KEY"))
+#         if GOOGLE_SEARCH_ENGINE_ID == "":
+#             self.__GOOGLE_SEARCH_ENGINE_ID = str(os.environ.get("GOOGLE_SEARCH_ENGINE_ID"))
+#         self.__num_res: int = 10
+#         self.__query = query
+#         self.__URL_EXTRACTOR: URLExtract = URLExtract()
+#         self.__urls: list[str] = self.__URL_EXTRACTOR.find_urls(query)
+#         self.__query = str(
+#             re.sub(
+#                 r"\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*",
+#                 "",
+#                 str(self.__query),
+#             )
+#         )
+#     def __get_urls(self: "Google") -> None:
+#         if self.__GOOGLE_SEARCH_API_KEY == "":
+#             exit("ERROR: Google API Key not found")
+#         if self.__GOOGLE_SEARCH_ENGINE_ID == "":
+#             exit("ERROR: Google Search Engine Id not found")
+#         response = requests.get(
+#             "https://www.googleapis.com/customsearch/v1",
+#             params={
+#                 "key": self.__GOOGLE_SEARCH_API_KEY,
+#                 "q": self.__query,
+#                 "cx": self.__GOOGLE_SEARCH_ENGINE_ID,
+#             },
+#         )
+#         results = response.json()["items"]
+#         for result in results:
+#             self.__urls.append(result["link"])
+#             if len(self.__urls) == self.__num_res:
+#                 break
+#     async def __fetch_url(self: "Google", session: Any, url: str) -> list[str]:
+#         try:
+#             async with session.get(url, headers=HTTP_USERAGENT) as response:
+#                 html = await response.text()
+#                 soup = BeautifulSoup(html, "html.parser")
+#                 text = soup.get_text()
+#                 normalized_text = normalizer(text)
+#                 sentences: list[str] = sentencizer(normalized_text)
+#                 return sentences
+#         except aiohttp.ClientConnectorError:
+#             return [""]
+#         except Exception:
+#             return [""]
+#     async def __fetch_urls(self: "Google", urls: list[str]) -> Any:
+#         async with aiohttp.ClientSession() as session:
+#             tasks = [
+#                 asyncio.create_task(self.__fetch_url(session, url)) for url in urls
+#             ]
+#             results = await asyncio.gather(*tasks)
+#             return results
+#     def __flatten(self: Any, a: list[list[Any]]) -> list[Any]:
+#         return list(itertools.chain(*a))
+#     def __get_urls_contents(self: "Google") -> None:
+#         loop = asyncio.new_event_loop()
+#         asyncio.set_event_loop(loop)
+#         contents = loop.run_until_complete(self.__fetch_urls(self.__urls))
+#         loop.close()
+#         self.__content = self.__flatten(contents)
+#     def google(self: "Google") -> tuple[list[str], list[str]]:
+#         self.__get_urls()
+#         self.__get_urls_contents()
+#         return (self.__content, self.__urls)
+# """
+# Timing:
+# import time
+# start_time = time.time()
+# google("Who is Elon Musk")
+# print("--- %s seconds ---" % (time.time() - start_time))
+# # Results:
+# # --- 2.2230100631713867 seconds ---
+# # ________________________________________________________
+# # Executed in 4.73 secs fish external
+# # usr time 3.35 secs 85.00 micros 3.35 secs
+# # sys time 1.86 secs 956.00 micros 1.86 secs
+# """
+from typing import Any, Dict, List, Tuple
 import os
+import pickle
 import sys
 from importlib import reload
 from pathlib import Path
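Note: rather than deleting the old implementation, the commit keeps it commented out, timing notes included (about 2.2 s for a ten-result crawl). A sketch of that measurement against the new class in this file; the empty strings defer the credentials to environment variables:

```python
import time

start_time = time.time()
content, urls = Google("Who is Elon Musk", "", "").google()  # Google: this file's class
print("--- %s seconds ---" % (time.time() - start_time))
```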
@@ -17,18 +153,22 @@ sys.path.append(str(Path(__file__).parent.parent.parent.parent) + "/utils")
 sys.path.append(str(Path(__file__).parent.parent))
 import asyncio
+import concurrent.futures
 import itertools
 import re
 import aiohttp
 import config
+from adremover import AdRemover
 from bs4 import BeautifulSoup
+from keywords import get_keywords
 from normalize import normalizer
-# from relevancy import filter_irrelevant
+from relevancy import filter_relevant
 from sentencize import sentencizer
 from urlextract import URLExtract
+dotenv.load_dotenv()
 class Google:
     def __init__(
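Note: `dotenv.load_dotenv()` now runs at import time, so the `os.environ` fallbacks in `__init__` below can be satisfied from a local `.env` file:

```python
import os
import dotenv

dotenv.load_dotenv()  # reads .env from the working directory, if present
api_key = os.environ.get("GOOGLE_SEARCH_API_KEY", "")  # may now come from .env
```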
@@ -37,9 +177,18 @@ class Google:
         GOOGLE_SEARCH_API_KEY: str,
         GOOGLE_SEARCH_ENGINE_ID: str,
     ) -> None:
-        self.__GOOGLE_SEARCH_API_KEY: str = GOOGLE_SEARCH_API_KEY
-        self.__GOOGLE_SEARCH_ENGINE_ID: str = GOOGLE_SEARCH_ENGINE_ID
-        self.__num_res: int = 10
+        # if environment keys are not given, assume it is in env
+        if GOOGLE_SEARCH_API_KEY == "":
+            self.__GOOGLE_SEARCH_API_KEY = str(os.environ.get("GOOGLE_SEARCH_API_KEY"))
+        if GOOGLE_SEARCH_ENGINE_ID == "":
+            self.__GOOGLE_SEARCH_ENGINE_ID = str(
+                os.environ.get("GOOGLE_SEARCH_ENGINE_ID")
+            )
+        self.__num_res: int = (
+            5
+            if config.NLP_CONF_MODE == "speed"
+            else (20 if config.NLP_CONF_MODE else 10)
+        )
         self.__query = query
         self.__URL_EXTRACTOR: URLExtract = URLExtract()
         self.__urls: list[str] = self.__URL_EXTRACTOR.find_urls(query)
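Note: the result count now depends on `config.NLP_CONF_MODE`: 5 in "speed" mode, 20 for any other truthy mode, 10 when the mode is unset. The nested conditional, unrolled as a sketch:

```python
def num_results(mode) -> int:
    """Unrolled form of the conditional on config.NLP_CONF_MODE."""
    if mode == "speed":
        return 5
    if mode:  # any other truthy mode, e.g. "accuracy"
        return 20
    return 10  # falsy mode keeps the old default

assert num_results("speed") == 5
assert num_results("accuracy") == 20
assert num_results(None) == 10
```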
@@ -50,8 +199,15 @@ class Google:
                 str(self.__query),
             )
         )
+        self.__content: list[str] = []
+        ADBLOCK_RULES = [
+            "https://easylist-downloads.adblockplus.org/ruadlist+easylist.txt",
+            "https://filters.adtidy.org/extension/chromium/filters/1.txt",
+        ]
+        self.__ad_remover = AdRemover(ADBLOCK_RULES)
     def __get_urls(self: "Google") -> None:
+        # Send the request to the Google Search API
         if self.__GOOGLE_SEARCH_API_KEY == "":
             exit("ERROR: Google API Key not found")
         if self.__GOOGLE_SEARCH_ENGINE_ID == "":
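Note: `AdRemover` is the repo's local `adremover` module, seeded here with two public filter lists. Its interface, sketched only as far as this diff exercises it (construct with rule URLs, call `remove_ads` on raw HTML):

```python
from adremover import AdRemover  # local module in this repo

ADBLOCK_RULES = [
    "https://easylist-downloads.adblockplus.org/ruadlist+easylist.txt",
    "https://filters.adtidy.org/extension/chromium/filters/1.txt",
]
remover = AdRemover(ADBLOCK_RULES)  # presumably fetches/compiles the rules
clean_html = remover.remove_ads("<html>...</html>")  # strips matching nodes
```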
@@ -74,6 +230,7 @@ class Google:
         try:
             async with session.get(url, headers=HTTP_USERAGENT) as response:
                 html = await response.text()
+                html = self.__ad_remover.remove_ads(html)
                 soup = BeautifulSoup(html, "html.parser")
                 text = soup.get_text()
                 normalized_text = normalizer(text)
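Note: ads are now stripped from the raw HTML before BeautifulSoup parses it. The per-URL path after this change, sketched without the repo's `normalizer`/`sentencizer` helpers:

```python
import aiohttp
from bs4 import BeautifulSoup

async def fetch_text(session: aiohttp.ClientSession, url: str) -> list[str]:
    """Sketch: fetch, strip ads, parse, split into rough sentences."""
    try:
        async with session.get(url) as response:
            html = await response.text()
            # html = ad_remover.remove_ads(html)  # the step this hunk adds
            soup = BeautifulSoup(html, "html.parser")
            return soup.get_text().split(". ")  # stand-in for sentencizer()
    except Exception:
        return [""]
```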
@@ -101,11 +258,26 @@ class Google:
         contents = loop.run_until_complete(self.__fetch_urls(self.__urls))
         loop.close()
         self.__content = self.__flatten(contents)
+        self.__content = [str(x) for x in self.__content]
-    def google(self: "Google") -> tuple[list[str], list[str]]:
+    def __filter_irrelevant_processing(self: "Google") -> None:
+        with concurrent.futures.ThreadPoolExecutor(max_workers=500) as executor:
+            futures = [executor.submit(filter_relevant, self.__content, self.__query)]
+            concurrent.futures.wait(futures)
+            content: list[str] = []
+            for future in futures:
+                content.append(future.result())
+            self.__content = content
+    def google(
+        self: "Google", filter_irrelevant: bool = True
+    ) -> tuple[list[str], list[str]]:
         self.__get_urls()
         self.__get_urls_contents()
-        return (self.__content, self.__urls)
+        if filter_irrelevant:
+            self.__filter_irrelevant_processing()
+        results: tuple[list[str], list[str]] = (self.__content, self.__urls)
+        return results
 """