update: look at todo

main
Thamognya Kodi 2022-12-28 18:50:22 +07:00
parent f8f2e26d40
commit f3f672dfc2
4 changed files with 33 additions and 24 deletions

.github/CHANGELOG.md vendored 100644
View File

@@ -0,0 +1,3 @@
+# 1.0.0 (Stable release)
+This is the first stable release of Internet-ML (specifically Internet-NLP).

View File

@@ -9,6 +9,17 @@ from pathlib import Path
 import dotenv
 import requests
+dotenv.load_dotenv()
+GOOGLE_SEARCH_API_KEY = str(os.environ["INTERNET_ML_GOOGLE_API"])
+GOOGLE_SEARCH_ENGINE_ID = str(os.environ["INTERNET_ML_GOOGLE_SEARCH_ENGINE_ID"])
+HTTP_USERAGENT: dict[str, str] = {
+    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
+}
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.INFO)
 logging.basicConfig(
     filename="internet.log",
     filemode="w",
@@ -21,7 +32,6 @@ sys.path.append(str(Path(__file__).parent.parent.parent.parent) + "/utils")
 sys.path.append(str(Path(__file__).parent.parent))
 import asyncio
-import concurrent.futures
 import itertools
 import re
@@ -29,22 +39,14 @@ import aiohttp
 import config
 from bs4 import BeautifulSoup
 from normalize import normalizer
-from relevancy import filter_irrelevant
+# from relevancy import filter_irrelevant
 from sentencize import sentencizer
 from urlextract import URLExtract
-dotenv.load_dotenv()
-HTTP_USERAGENT: dict[str, str] = {
-    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
-}
 class Google:
-    def __init__(
-        self: Any, query: str, GOOGLE_SEARCH_API_KEY: str, GOOGLE_SEARCH_ENGINE_ID: str
-    ) -> None:
+    def __init__(self: "Google", query: str) -> None:
         self.__GOOGLE_SEARCH_API_KEY: str = GOOGLE_SEARCH_API_KEY
         self.__GOOGLE_SEARCH_ENGINE_ID: str = GOOGLE_SEARCH_ENGINE_ID
         self.__num_res: int = (
@@ -59,7 +61,7 @@ class Google:
             r"\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*", "", self.__query
         )
-    def __get_urls(self: Any) -> None:
+    def __get_urls(self: "Google") -> None:
         # Send the request to the Google Search API
         if self.__GOOGLE_SEARCH_API_KEY == "":
             exit("ERROR: Google API Key not found")
@@ -81,7 +83,7 @@ class Google:
         if config.CONF_DEBUG:
             logging.info(f"Links: {self.__urls}")
-    async def __fetch_url(self: Any, session: Any, url: str) -> list[str]:
+    async def __fetch_url(self: "Google", session: Any, url: str) -> list[str]:
         try:
             async with session.get(url, headers=HTTP_USERAGENT) as response:
                 html = await response.text()
@@ -101,7 +103,7 @@ class Google:
         except Exception:
             return [""]
-    async def __fetch_urls(self: Any, urls: list[str]) -> Any:
+    async def __fetch_urls(self: "Google", urls: list[str]) -> Any:
         async with aiohttp.ClientSession() as session:
             tasks = [
                 asyncio.create_task(self.__fetch_url(session, url)) for url in urls
@@ -112,14 +114,14 @@ class Google:
     def __flatten(self: Any, a: list[list[Any]]) -> list[Any]:
         return list(itertools.chain(*a))
-    def __get_urls_contents(self: Any) -> None:
+    def __get_urls_contents(self: "Google") -> None:
         loop = asyncio.new_event_loop()
         asyncio.set_event_loop(loop)
         contents = loop.run_until_complete(self.__fetch_urls(self.__urls))
         loop.close()
         self.__content = self.__flatten(contents)
-    def google(self: Any) -> tuple[list[str], list[str]]:
+    def google(self: "Google") -> tuple[list[str], list[str]]:
         # Hard coded exceptions - START
         if "Thamognya" in self.__query or "thamognya" in self.__query:
             return (["The smartest person in the world"], ["I decided it"])
@@ -139,18 +141,14 @@
                 "https://economictimes.indiatimes.com/news/narendra-modi",
             ],
         )
+        # Hard coded exceptions - End
         self.__get_urls()
         self.__get_urls_contents()
         return (self.__content, self.__urls)
 def google(query: str) -> tuple[list[str], list[str]]:
-    _google = Google(
-        query,
-        os.environ["INTERNET_ML_GOOGLE_API"],
-        os.environ["INTERNET_ML_GOOGLE_SEARCH_ENGINE_ID"],
-    )
-    return _google.google()
+    return Google(query).google()
 """

View File

@@ -9,6 +9,14 @@ logging.basicConfig(
     format="%(name)s - %(levelname)s - %(message)s",
 )
+# General
+CONF_DEBUG: bool = True
+# Google
+GOOGLE_API_KEY: str = ""
+GOOGLE_SEARCH_ENGINE_ID: str = ""
+# NLP
+NLP_CONF_MODE: str = "default"
 class FullConfig:
     def __init__(self: Any) -> None:
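
These module-level constants give dependent code a single place to read defaults from; the search module above already follows this pattern with its "if config.CONF_DEBUG:" guard around logging. A small sketch of that pattern, using only names that appear in this commit (the log message itself is illustrative):

import logging

import config  # the configuration module modified above

# level=INFO is needed for logging.info to emit; the diff does not show whether config sets it.
logging.basicConfig(format="%(name)s - %(levelname)s - %(message)s", level=logging.INFO)

if config.CONF_DEBUG:  # defaults to True after this change
    logging.info(
        "NLP mode: %s; Google key configured: %s",
        config.NLP_CONF_MODE,
        bool(config.GOOGLE_API_KEY),
    )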

View File

@@ -5,7 +5,7 @@ build-backend = "poetry.core.masonry.api"
 [tool.poetry]
 name = "internet-ml"
-version = "0.2.11"
+version = "1.0.0"
 description = "Internet-ML: Allowing ML to connect to the internet"
 readme = "./.github/README.md"
 authors = ["Thamognya Kodi <contact@thamognya.com>"]