internet_ml/internet_ml/tools/NLP/data/internet.py

299 lines
10 KiB
Python
Raw Normal View History

2023-01-14 14:20:23 +00:00
# from typing import Any, List, Tuple
# import os
# import sys
# from importlib import reload
# from pathlib import Path
# import dotenv
# import requests
# HTTP_USERAGENT: dict[str, str] = {
# "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
# }
# sys.path.append(str(Path(__file__).parent.parent.parent.parent) + "/utils/NLP")
# sys.path.append(str(Path(__file__).parent.parent.parent.parent) + "/utils")
# sys.path.append(str(Path(__file__).parent.parent))
# import asyncio
# import itertools
# import re
# import aiohttp
# import config
# from bs4 import BeautifulSoup
# from normalize import normalizer
# # from relevancy import filter_irrelevant
# from sentencize import sentencizer
# from urlextract import URLExtract
# from adremover import AdRemover
# class Google:
# def __init__(
# self: "Google",
# query: str,
# GOOGLE_SEARCH_API_KEY: str,
# GOOGLE_SEARCH_ENGINE_ID: str,
# ) -> None:
# self.__GOOGLE_SEARCH_API_KEY: str = GOOGLE_SEARCH_API_KEY
# self.__GOOGLE_SEARCH_ENGINE_ID: str = GOOGLE_SEARCH_ENGINE_ID
# # if environment keys are not given, assume it is in env
# if GOOGLE_SEARCH_API_KEY == "":
# self.__GOOGLE_SEARCH_API_KEY = str(os.environ.get("GOOGLE_SEARCH_API_KEY"))
# if GOOGLE_SEARCH_ENGINE_ID == "":
# self.__GOOGLE_SEARCH_ENGINE_ID = str(os.environ.get("GOOGLE_SEARCH_ENGINE_ID"))
# self.__num_res: int = 10
# self.__query = query
# self.__URL_EXTRACTOR: URLExtract = URLExtract()
# self.__urls: list[str] = self.__URL_EXTRACTOR.find_urls(query)
# self.__query = str(
# re.sub(
# r"\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*",
# "",
# str(self.__query),
# )
# )
# def __get_urls(self: "Google") -> None:
# if self.__GOOGLE_SEARCH_API_KEY == "":
# exit("ERROR: Google API Key not found")
# if self.__GOOGLE_SEARCH_ENGINE_ID == "":
# exit("ERROR: Google Search Engine Id not found")
# response = requests.get(
# "https://www.googleapis.com/customsearch/v1",
# params={
# "key": self.__GOOGLE_SEARCH_API_KEY,
# "q": self.__query,
# "cx": self.__GOOGLE_SEARCH_ENGINE_ID,
# },
# )
# results = response.json()["items"]
# for result in results:
# self.__urls.append(result["link"])
# if len(self.__urls) == self.__num_res:
# break
# async def __fetch_url(self: "Google", session: Any, url: str) -> list[str]:
# try:
# async with session.get(url, headers=HTTP_USERAGENT) as response:
# html = await response.text()
# soup = BeautifulSoup(html, "html.parser")
# text = soup.get_text()
# normalized_text = normalizer(text)
# sentences: list[str] = sentencizer(normalized_text)
# return sentences
# except aiohttp.ClientConnectorError:
# return [""]
# except Exception:
# return [""]
# async def __fetch_urls(self: "Google", urls: list[str]) -> Any:
# async with aiohttp.ClientSession() as session:
# tasks = [
# asyncio.create_task(self.__fetch_url(session, url)) for url in urls
# ]
# results = await asyncio.gather(*tasks)
# return results
# def __flatten(self: Any, a: list[list[Any]]) -> list[Any]:
# return list(itertools.chain(*a))
# def __get_urls_contents(self: "Google") -> None:
# loop = asyncio.new_event_loop()
# asyncio.set_event_loop(loop)
# contents = loop.run_until_complete(self.__fetch_urls(self.__urls))
# loop.close()
# self.__content = self.__flatten(contents)
# def google(self: "Google") -> tuple[list[str], list[str]]:
# self.__get_urls()
# self.__get_urls_contents()
# return (self.__content, self.__urls)
# """
# Timing:
# import time
# start_time = time.time()
# google("Who is Elon Musk")
# print("--- %s seconds ---" % (time.time() - start_time))
# # Results:
# # --- 2.2230100631713867 seconds ---
# # ________________________________________________________
# # Executed in 4.73 secs fish external
# # usr time 3.35 secs 85.00 micros 3.35 secs
# # sys time 1.86 secs 956.00 micros 1.86 secs
# """
from typing import Any, Dict, List, Tuple
2022-12-25 17:15:24 +00:00
import os
2023-01-14 14:20:23 +00:00
import pickle
2022-12-26 06:07:39 +00:00
import sys
2022-12-27 06:56:53 +00:00
from importlib import reload
2022-12-26 06:07:39 +00:00
from pathlib import Path
2022-12-25 17:15:24 +00:00
import dotenv
import requests
2022-12-28 11:50:22 +00:00
# Browser-like User-Agent header passed to the HTTP session when fetching pages.
HTTP_USERAGENT: dict[str, str] = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/58.0.3029.110 Safari/537.3"
    )
}
2022-12-26 06:07:39 +00:00
# Extend the import search path with the project's utility directories
# (presumably so the flat, non-package imports that follow can resolve —
# verify against the repository layout).
_PACKAGE_ROOT = str(Path(__file__).parent.parent.parent.parent)
for _suffix in ("/utils/NLP", "/utils"):
    sys.path.append(_PACKAGE_ROOT + _suffix)
# Also expose the directory one level above this module's own folder.
sys.path.append(str(Path(__file__).parent.parent))
import asyncio
2023-01-14 14:20:23 +00:00
import concurrent.futures
2022-12-26 06:07:39 +00:00
import itertools
import re
import aiohttp
import config
2023-01-14 14:20:23 +00:00
from adremover import AdRemover
2022-12-26 06:07:39 +00:00
from bs4 import BeautifulSoup
2023-01-14 14:20:23 +00:00
from keywords import get_keywords
2022-12-26 06:07:39 +00:00
from normalize import normalizer
2023-01-14 14:20:23 +00:00
from relevancy import filter_relevant
2022-12-26 06:07:39 +00:00
from sentencize import sentencizer
from urlextract import URLExtract
2023-01-14 14:20:23 +00:00
dotenv.load_dotenv()
2022-12-24 08:20:54 +00:00
2022-12-27 12:19:01 +00:00
class Google:
    """Collect sentences from web pages relevant to a query.

    URLs embedded in the query are extracted and used directly; the remaining
    query text is sent to the Google Custom Search JSON API to obtain more
    URLs. Every URL is then downloaded, stripped of ads, converted to text,
    normalized and split into sentences.
    """

    def __init__(
        self: "Google",
        query: str,
        GOOGLE_SEARCH_API_KEY: str,
        GOOGLE_SEARCH_ENGINE_ID: str,
    ) -> None:
        """Prepare a search for ``query``.

        Args:
            query: Free-text query; any URLs it contains are pulled out and
                fetched directly, and removed from the text sent to Google.
            GOOGLE_SEARCH_API_KEY: API key, or ``""`` to read it from the
                ``GOOGLE_SEARCH_API_KEY`` environment variable.
            GOOGLE_SEARCH_ENGINE_ID: Search engine id, or ``""`` to read it
                from the ``GOOGLE_SEARCH_ENGINE_ID`` environment variable.
        """
        # Always assign both credentials: previously they were only set when
        # the argument was empty, so explicit values caused an AttributeError
        # later in __get_urls.
        self.__GOOGLE_SEARCH_API_KEY: str = self._resolve_credential(
            GOOGLE_SEARCH_API_KEY, "GOOGLE_SEARCH_API_KEY"
        )
        self.__GOOGLE_SEARCH_ENGINE_ID: str = self._resolve_credential(
            GOOGLE_SEARCH_ENGINE_ID, "GOOGLE_SEARCH_ENGINE_ID"
        )
        # NOTE(review): the inner condition is truthy for every non-empty
        # mode, so any mode other than "speed" yields 20 and only an
        # empty/falsy mode yields 10. It looks like a comparison against a
        # specific mode name is missing — confirm the intended value.
        self.__num_res: int = (
            5
            if config.NLP_CONF_MODE == "speed"
            else (20 if config.NLP_CONF_MODE else 10)
        )
        self.__query = query
        self.__URL_EXTRACTOR: URLExtract = URLExtract()
        # URLs given in the query are fetched in addition to search results.
        self.__urls: list[str] = self.__URL_EXTRACTOR.find_urls(query)
        # Remove scheme-qualified URLs from the text that is sent to Google.
        self.__query = str(
            re.sub(
                r"\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*",
                "",
                str(self.__query),
            )
        )
        self.__content: list[str] = []
        ADBLOCK_RULES = [
            "https://easylist-downloads.adblockplus.org/ruadlist+easylist.txt",
            "https://filters.adtidy.org/extension/chromium/filters/1.txt",
        ]
        self.__ad_remover = AdRemover(ADBLOCK_RULES)

    @staticmethod
    def _resolve_credential(value: str, env_name: str) -> str:
        """Return ``value``, or the environment variable ``env_name`` if empty.

        A missing environment variable yields ``""`` (not the string
        ``"None"``), so the "not found" checks in ``__get_urls`` can fire.
        """
        if value != "":
            return value
        return os.environ.get(env_name, "")

    def __get_urls(self: "Google") -> None:
        """Query the Google Custom Search API and append result links.

        Raises:
            SystemExit: If the API key or the search engine id is empty.
        """
        if self.__GOOGLE_SEARCH_API_KEY == "":
            exit("ERROR: Google API Key not found")
        if self.__GOOGLE_SEARCH_ENGINE_ID == "":
            exit("ERROR: Google Search Engine Id not found")
        response = requests.get(
            "https://www.googleapis.com/customsearch/v1",
            params={
                "key": self.__GOOGLE_SEARCH_API_KEY,
                "q": self.__query,
                "cx": self.__GOOGLE_SEARCH_ENGINE_ID,
            },
        )
        # A response without "items" (no hits, or an error payload) is
        # treated as "no results" instead of raising KeyError.
        results = response.json().get("items", [])
        for result in results:
            self.__urls.append(result["link"])
            # ">=" so the cap still applies when URLs taken from the query
            # already pushed the list past the limit.
            if len(self.__urls) >= self.__num_res:
                break

    async def __fetch_url(self: "Google", session: Any, url: str) -> list[str]:
        """Download ``url`` and return its text as a list of sentences.

        Fetching is best-effort: any failure (connection error, decoding
        error, parsing error, ...) yields ``[""]`` so that one bad page does
        not abort the whole batch.
        """
        try:
            async with session.get(url, headers=HTTP_USERAGENT) as response:
                html = await response.text()
                html = self.__ad_remover.remove_ads(html)
                soup = BeautifulSoup(html, "html.parser")
                text = soup.get_text()
                normalized_text = normalizer(text)
                sentences: list[str] = sentencizer(normalized_text)
                return sentences
        except Exception:
            return [""]

    async def __fetch_urls(self: "Google", urls: list[str]) -> Any:
        """Fetch all ``urls`` concurrently; returns one result per URL, in order."""
        async with aiohttp.ClientSession() as session:
            tasks = [
                asyncio.create_task(self.__fetch_url(session, url)) for url in urls
            ]
            results = await asyncio.gather(*tasks)
            return results

    def __flatten(self: Any, a: list[list[Any]]) -> list[Any]:
        """Concatenate a list of lists into a single flat list."""
        return list(itertools.chain.from_iterable(a))

    def __get_urls_contents(self: "Google") -> None:
        """Download every collected URL and store the sentences as strings."""
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
        try:
            contents = loop.run_until_complete(self.__fetch_urls(self.__urls))
        finally:
            # Close the loop even when the download step raises.
            loop.close()
        self.__content = [str(x) for x in self.__flatten(contents)]

    def __filter_irrelevant_processing(self: "Google") -> None:
        """Run ``filter_relevant`` over the collected content in a worker thread."""
        # Only a single task is ever submitted, so one worker is sufficient.
        with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
            futures = [executor.submit(filter_relevant, self.__content, self.__query)]
            concurrent.futures.wait(futures)
            # NOTE(review): each future's result is stored as ONE element of
            # the new content list. If filter_relevant returns a list of
            # sentences this nests it one level deep — confirm whether the
            # result should be used directly instead.
            content: list[str] = [future.result() for future in futures]
            self.__content = content

    def google(
        self: "Google", filter_irrelevant: bool = True
    ) -> tuple[list[str], list[str]]:
        """Run the search pipeline.

        Args:
            filter_irrelevant: When true, pass the downloaded content through
                ``filter_relevant`` before returning it.

        Returns:
            A ``(content, urls)`` tuple: the collected content and the list of
            URLs that were fetched.
        """
        self.__get_urls()
        self.__get_urls_contents()
        if filter_irrelevant:
            self.__filter_irrelevant_processing()
        results: tuple[list[str], list[str]] = (self.__content, self.__urls)
        return results
2022-12-27 12:19:01 +00:00
2022-12-27 06:38:47 +00:00
"""
Timing:
import time
start_time = time.time()
Google("Who is Elon Musk", "", "").google()
print("--- %s seconds ---" % (time.time() - start_time))
# Results:
# --- 2.2230100631713867 seconds ---
# ________________________________________________________
# Executed in 4.73 secs fish external
# usr time 3.35 secs 85.00 micros 3.35 secs
# sys time 1.86 secs 956.00 micros 1.86 secs
"""