internet_ml/internet_ml/tools/NLP/data/internet.py

# from typing import Any, List, Tuple

# import os
# import sys
# from importlib import reload
# from pathlib import Path

# import dotenv
# import requests

# HTTP_USERAGENT: dict[str, str] = {
#     "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
# }

# sys.path.append(str(Path(__file__).parent.parent.parent.parent) + "/utils/NLP")
# sys.path.append(str(Path(__file__).parent.parent.parent.parent) + "/utils")
# sys.path.append(str(Path(__file__).parent.parent))

# import asyncio
# import itertools
# import re

# import aiohttp
# import config
# from bs4 import BeautifulSoup
# from normalize import normalizer

# # from relevancy import filter_irrelevant
# from sentencize import sentencizer
# from urlextract import URLExtract
# from adremover import AdRemover


# class Google:
#     def __init__(
#         self: "Google",
#         query: str,
#         GOOGLE_SEARCH_API_KEY: str,
#         GOOGLE_SEARCH_ENGINE_ID: str,
#     ) -> None:
#         self.__GOOGLE_SEARCH_API_KEY: str = GOOGLE_SEARCH_API_KEY
#         self.__GOOGLE_SEARCH_ENGINE_ID: str = GOOGLE_SEARCH_ENGINE_ID

#         # if environment keys are not given, assume it is in env
#         if GOOGLE_SEARCH_API_KEY == "":
#             self.__GOOGLE_SEARCH_API_KEY = str(os.environ.get("GOOGLE_SEARCH_API_KEY"))
#         if GOOGLE_SEARCH_ENGINE_ID == "":
#             self.__GOOGLE_SEARCH_ENGINE_ID = str(os.environ.get("GOOGLE_SEARCH_ENGINE_ID"))

#         self.__num_res: int = 10
#         self.__query = query
#         self.__URL_EXTRACTOR: URLExtract = URLExtract()
#         self.__urls: list[str] = self.__URL_EXTRACTOR.find_urls(query)
#         self.__query = str(
#             re.sub(
#                 r"\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*",
#                 "",
#                 str(self.__query),
#             )
#         )

#     def __get_urls(self: "Google") -> None:
#         if self.__GOOGLE_SEARCH_API_KEY == "":
#             exit("ERROR: Google API Key not found")
#         if self.__GOOGLE_SEARCH_ENGINE_ID == "":
#             exit("ERROR: Google Search Engine Id not found")
#         response = requests.get(
#             "https://www.googleapis.com/customsearch/v1",
#             params={
#                 "key": self.__GOOGLE_SEARCH_API_KEY,
#                 "q": self.__query,
#                 "cx": self.__GOOGLE_SEARCH_ENGINE_ID,
#             },
#         )
#         results = response.json()["items"]
#         for result in results:
#             self.__urls.append(result["link"])
#             if len(self.__urls) == self.__num_res:
#                 break

#     async def __fetch_url(self: "Google", session: Any, url: str) -> list[str]:
#         try:
#             async with session.get(url, headers=HTTP_USERAGENT) as response:
#                 html = await response.text()
#                 soup = BeautifulSoup(html, "html.parser")
#                 text = soup.get_text()
#                 normalized_text = normalizer(text)
#                 sentences: list[str] = sentencizer(normalized_text)
#                 return sentences
#         except aiohttp.ClientConnectorError:
#             return [""]
#         except Exception:
#             return [""]

#     async def __fetch_urls(self: "Google", urls: list[str]) -> Any:
#         async with aiohttp.ClientSession() as session:
#             tasks = [
#                 asyncio.create_task(self.__fetch_url(session, url)) for url in urls
#             ]
#             results = await asyncio.gather(*tasks)
#             return results

#     def __flatten(self: Any, a: list[list[Any]]) -> list[Any]:
#         return list(itertools.chain(*a))

#     def __get_urls_contents(self: "Google") -> None:
#         loop = asyncio.new_event_loop()
#         asyncio.set_event_loop(loop)
#         contents = loop.run_until_complete(self.__fetch_urls(self.__urls))
#         loop.close()
#         self.__content = self.__flatten(contents)

#     def google(self: "Google") -> tuple[list[str], list[str]]:
#         self.__get_urls()
#         self.__get_urls_contents()
#         return (self.__content, self.__urls)


# """
# Timing:
# import time
# start_time = time.time()
# google("Who is Elon Musk")
# print("--- %s seconds ---" % (time.time() - start_time))

# # Results:

# # --- 2.2230100631713867 seconds ---

# # ________________________________________________________
# # Executed in    4.73 secs    fish           external
# #    usr time    3.35 secs   85.00 micros    3.35 secs
# #    sys time    1.86 secs  956.00 micros    1.86 secs
# """

from typing import Any, Dict, List, Tuple

import os
import pickle
import sys
from importlib import reload
from pathlib import Path

import dotenv
import requests

# Browser-like User-Agent header sent with every page request in this module.
HTTP_USERAGENT: dict[str, str] = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
}

# Extend the import search path so the project-local modules imported below
# can be found. This must run before those imports.
sys.path.extend(
    [
        str(Path(__file__).parent.parent.parent.parent) + "/utils/NLP",
        str(Path(__file__).parent.parent.parent.parent) + "/utils",
        str(Path(__file__).parent.parent),
    ]
)

import asyncio
import concurrent.futures
import itertools
import re

import aiohttp
import config
from adremover import AdRemover
from bs4 import BeautifulSoup
from keywords import get_keywords
from normalize import normalizer
from relevancy import filter_relevant
from sentencize import sentencizer
from urlextract import URLExtract

# Load variables from a `.env` file (if one is found) into the process
# environment at import time, so the os.environ lookups in this module can
# see them.
dotenv.load_dotenv()


class Google:
    """Search Google for a query and collect sentence-level page text.

    The query is sent to the Google Custom Search JSON API, the result pages
    (plus any URLs written directly in the query) are downloaded concurrently,
    and their text is passed through ``normalizer`` and ``sentencizer``.
    The only public entry point is :meth:`google`.
    """

    def __init__(
        self: "Google",
        query: str,
        GOOGLE_SEARCH_API_KEY: str,
        GOOGLE_SEARCH_ENGINE_ID: str,
    ) -> None:
        """Prepare a search for ``query``.

        Args:
            query: Free-text query. URLs found in it are fetched directly and
                removed from the text sent to the search API.
            GOOGLE_SEARCH_API_KEY: API key; pass ``""`` to read the
                ``GOOGLE_SEARCH_API_KEY`` environment variable instead.
            GOOGLE_SEARCH_ENGINE_ID: Search engine id; pass ``""`` to read the
                ``GOOGLE_SEARCH_ENGINE_ID`` environment variable instead.
        """
        # Always assign both credentials. Previously they were only set when
        # the argument was empty, so an explicit key caused AttributeError.
        self.__GOOGLE_SEARCH_API_KEY: str = self.__resolve_credential(
            GOOGLE_SEARCH_API_KEY, "GOOGLE_SEARCH_API_KEY"
        )
        self.__GOOGLE_SEARCH_ENGINE_ID: str = self.__resolve_credential(
            GOOGLE_SEARCH_ENGINE_ID, "GOOGLE_SEARCH_ENGINE_ID"
        )
        # NOTE(review): the inner condition is a plain truthiness test, so any
        # non-empty mode other than "speed" yields 20 and 10 is only reached
        # for an empty/None mode. Confirm whether a comparison against a
        # specific mode name was intended.
        self.__num_res: int = (
            5
            if config.NLP_CONF_MODE == "speed"
            else (20 if config.NLP_CONF_MODE else 10)
        )
        self.__URL_EXTRACTOR: URLExtract = URLExtract()
        # URLs written in the query are fetched as-is, ahead of search results.
        self.__urls: list[str] = self.__URL_EXTRACTOR.find_urls(query)
        self.__query: str = self.__strip_urls(query)
        self.__content: list[str] = []
        ADBLOCK_RULES = [
            "https://easylist-downloads.adblockplus.org/ruadlist+easylist.txt",
            "https://filters.adtidy.org/extension/chromium/filters/1.txt",
        ]
        self.__ad_remover = AdRemover(ADBLOCK_RULES)

    @staticmethod
    def __resolve_credential(value: str, env_name: str) -> str:
        """Return ``value``, or the ``env_name`` environment variable if empty.

        A missing variable yields ``""`` (not the string ``"None"``), so the
        emptiness checks in ``__get_urls`` can actually detect it.
        """
        if value != "":
            return value
        return os.environ.get(env_name, "")

    @staticmethod
    def __strip_urls(text: str) -> str:
        """Return ``text`` with every ``scheme://host/path`` URL removed."""
        return re.sub(
            r"\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*",
            "",
            str(text),
        )

    def __get_urls(self: "Google") -> None:
        """Append search-result links to the URL list, up to ``__num_res``.

        Raises:
            SystemExit: If the API key or the search engine id is empty.
            requests.HTTPError: If the search API answers with an error status.
        """
        # sys.exit raises the same SystemExit as the `exit` helper, but does
        # not depend on the `site` module being loaded.
        if self.__GOOGLE_SEARCH_API_KEY == "":
            sys.exit("ERROR: Google API Key not found")
        if self.__GOOGLE_SEARCH_ENGINE_ID == "":
            sys.exit("ERROR: Google Search Engine Id not found")
        response = requests.get(
            "https://www.googleapis.com/customsearch/v1",
            params={
                "key": self.__GOOGLE_SEARCH_API_KEY,
                "q": self.__query,
                "cx": self.__GOOGLE_SEARCH_ENGINE_ID,
            },
        )
        # Surface API failures explicitly instead of as a KeyError below.
        response.raise_for_status()
        # A response without an "items" key is treated as zero results.
        for result in response.json().get("items", []):
            # Check the cap before appending so it also holds when the query
            # itself already supplied `__num_res` or more URLs.
            if len(self.__urls) >= self.__num_res:
                break
            self.__urls.append(result["link"])

    async def __fetch_url(self: "Google", session: Any, url: str) -> list[str]:
        """Download ``url`` and return its text as a list of sentences.

        Fetching is best-effort: any failure (connection error, decoding
        error, processing error) yields ``[""]`` so one bad page does not
        abort the whole batch.
        """
        try:
            async with session.get(url, headers=HTTP_USERAGENT) as response:
                html = await response.text()
                html = self.__ad_remover.remove_ads(html)
                text = BeautifulSoup(html, "html.parser").get_text()
                sentences: list[str] = sentencizer(normalizer(text))
                return sentences
        except Exception:
            return [""]

    async def __fetch_urls(self: "Google", urls: list[str]) -> list[list[str]]:
        """Fetch all ``urls`` concurrently; results keep the order of ``urls``."""
        async with aiohttp.ClientSession() as session:
            tasks = [
                asyncio.create_task(self.__fetch_url(session, url)) for url in urls
            ]
            return await asyncio.gather(*tasks)

    def __flatten(self: Any, a: list[list[Any]]) -> list[Any]:
        """Concatenate the inner lists of ``a`` into one flat list."""
        return list(itertools.chain.from_iterable(a))

    def __get_urls_contents(self: "Google") -> None:
        """Fetch every collected URL and store the sentences in ``__content``."""
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
        try:
            contents = loop.run_until_complete(self.__fetch_urls(self.__urls))
        finally:
            # Close the loop even if fetching raised, so it is not leaked.
            loop.close()
        self.__content = [str(sentence) for sentence in self.__flatten(contents)]

    def __filter_irrelevant_processing(self: "Google") -> None:
        """Run ``filter_relevant`` over the content in a worker thread."""
        # Only one task is ever submitted, so a single worker is sufficient.
        with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
            future = executor.submit(filter_relevant, self.__content, self.__query)
            # NOTE(review): the value returned by filter_relevant is stored as
            # the single element of a new list (unchanged behavior). If it
            # returns a list of sentences, `__content` becomes nested.
            # Confirm what callers expect before flattening.
            self.__content = [future.result()]

    def google(
        self: "Google", filter_irrelevant: bool = True
    ) -> tuple[list[str], list[str]]:
        """Run the search and return ``(content, urls)``.

        Args:
            filter_irrelevant: When true, pass the fetched content through
                ``filter_relevant`` before returning it.

        Returns:
            A tuple of the collected content and the list of URLs used.
        """
        self.__get_urls()
        self.__get_urls_contents()
        if filter_irrelevant:
            self.__filter_irrelevant_processing()
        results: tuple[list[str], list[str]] = (self.__content, self.__urls)
        return results


"""
Timing:
import time
start_time = time.time()
google("Who is Elon Musk")
print("--- %s seconds ---" % (time.time() - start_time))

# Results:

# --- 2.2230100631713867 seconds ---

# ________________________________________________________
# Executed in    4.73 secs    fish           external
#    usr time    3.35 secs   85.00 micros    3.35 secs
#    sys time    1.86 secs  956.00 micros    1.86 secs
"""