update: look at todo
parent
f8f2e26d40
commit
f3f672dfc2
|
@ -0,0 +1,3 @@
|
||||||
|
# 1.0.0 (Stable release)
|
||||||
|
|
||||||
|
This is the first stable release of Internet-ML (specifically Internet-NLP)
|
|
@ -9,6 +9,17 @@ from pathlib import Path
|
||||||
import dotenv
|
import dotenv
|
||||||
import requests
|
import requests
|
||||||
|
|
||||||
|
dotenv.load_dotenv()
|
||||||
|
|
||||||
|
GOOGLE_SEARCH_API_KEY = str(os.environ["INTERNET_ML_GOOGLE_API"])
|
||||||
|
GOOGLE_SEARCH_ENGINE_ID = str(os.environ["INTERNET_ML_GOOGLE_SEARCH_ENGINE_ID"])
|
||||||
|
|
||||||
|
HTTP_USERAGENT: dict[str, str] = {
|
||||||
|
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
|
||||||
|
}
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
logger.setLevel(logging.INFO)
|
||||||
logging.basicConfig(
|
logging.basicConfig(
|
||||||
filename="internet.log",
|
filename="internet.log",
|
||||||
filemode="w",
|
filemode="w",
|
||||||
|
@ -21,7 +32,6 @@ sys.path.append(str(Path(__file__).parent.parent.parent.parent) + "/utils")
|
||||||
sys.path.append(str(Path(__file__).parent.parent))
|
sys.path.append(str(Path(__file__).parent.parent))
|
||||||
|
|
||||||
import asyncio
|
import asyncio
|
||||||
import concurrent.futures
|
|
||||||
import itertools
|
import itertools
|
||||||
import re
|
import re
|
||||||
|
|
||||||
|
@ -29,22 +39,14 @@ import aiohttp
|
||||||
import config
|
import config
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
from normalize import normalizer
|
from normalize import normalizer
|
||||||
from relevancy import filter_irrelevant
|
|
||||||
|
# from relevancy import filter_irrelevant
|
||||||
from sentencize import sentencizer
|
from sentencize import sentencizer
|
||||||
from urlextract import URLExtract
|
from urlextract import URLExtract
|
||||||
|
|
||||||
dotenv.load_dotenv()
|
|
||||||
|
|
||||||
|
|
||||||
HTTP_USERAGENT: dict[str, str] = {
|
|
||||||
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
class Google:
|
class Google:
|
||||||
def __init__(
|
def __init__(self: "Google", query: str) -> None:
|
||||||
self: Any, query: str, GOOGLE_SEARCH_API_KEY: str, GOOGLE_SEARCH_ENGINE_ID: str
|
|
||||||
) -> None:
|
|
||||||
self.__GOOGLE_SEARCH_API_KEY: str = GOOGLE_SEARCH_API_KEY
|
self.__GOOGLE_SEARCH_API_KEY: str = GOOGLE_SEARCH_API_KEY
|
||||||
self.__GOOGLE_SEARCH_ENGINE_ID: str = GOOGLE_SEARCH_ENGINE_ID
|
self.__GOOGLE_SEARCH_ENGINE_ID: str = GOOGLE_SEARCH_ENGINE_ID
|
||||||
self.__num_res: int = (
|
self.__num_res: int = (
|
||||||
|
@ -59,7 +61,7 @@ class Google:
|
||||||
r"\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*", "", self.__query
|
r"\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*", "", self.__query
|
||||||
)
|
)
|
||||||
|
|
||||||
def __get_urls(self: Any) -> None:
|
def __get_urls(self: "Google") -> None:
|
||||||
# Send the request to the Google Search API
|
# Send the request to the Google Search API
|
||||||
if self.__GOOGLE_SEARCH_API_KEY == "":
|
if self.__GOOGLE_SEARCH_API_KEY == "":
|
||||||
exit("ERROR: Google API Key not found")
|
exit("ERROR: Google API Key not found")
|
||||||
|
@ -81,7 +83,7 @@ class Google:
|
||||||
if config.CONF_DEBUG:
|
if config.CONF_DEBUG:
|
||||||
logging.info(f"Links: {self.__urls}")
|
logging.info(f"Links: {self.__urls}")
|
||||||
|
|
||||||
async def __fetch_url(self: Any, session: Any, url: str) -> list[str]:
|
async def __fetch_url(self: "Google", session: Any, url: str) -> list[str]:
|
||||||
try:
|
try:
|
||||||
async with session.get(url, headers=HTTP_USERAGENT) as response:
|
async with session.get(url, headers=HTTP_USERAGENT) as response:
|
||||||
html = await response.text()
|
html = await response.text()
|
||||||
|
@ -101,7 +103,7 @@ class Google:
|
||||||
except Exception:
|
except Exception:
|
||||||
return [""]
|
return [""]
|
||||||
|
|
||||||
async def __fetch_urls(self: Any, urls: list[str]) -> Any:
|
async def __fetch_urls(self: "Google", urls: list[str]) -> Any:
|
||||||
async with aiohttp.ClientSession() as session:
|
async with aiohttp.ClientSession() as session:
|
||||||
tasks = [
|
tasks = [
|
||||||
asyncio.create_task(self.__fetch_url(session, url)) for url in urls
|
asyncio.create_task(self.__fetch_url(session, url)) for url in urls
|
||||||
|
@ -112,14 +114,14 @@ class Google:
|
||||||
def __flatten(self: Any, a: list[list[Any]]) -> list[Any]:
|
def __flatten(self: Any, a: list[list[Any]]) -> list[Any]:
|
||||||
return list(itertools.chain(*a))
|
return list(itertools.chain(*a))
|
||||||
|
|
||||||
def __get_urls_contents(self: Any) -> None:
|
def __get_urls_contents(self: "Google") -> None:
|
||||||
loop = asyncio.new_event_loop()
|
loop = asyncio.new_event_loop()
|
||||||
asyncio.set_event_loop(loop)
|
asyncio.set_event_loop(loop)
|
||||||
contents = loop.run_until_complete(self.__fetch_urls(self.__urls))
|
contents = loop.run_until_complete(self.__fetch_urls(self.__urls))
|
||||||
loop.close()
|
loop.close()
|
||||||
self.__content = self.__flatten(contents)
|
self.__content = self.__flatten(contents)
|
||||||
|
|
||||||
def google(self: Any) -> tuple[list[str], list[str]]:
|
def google(self: "Google") -> tuple[list[str], list[str]]:
|
||||||
# Hard coded exceptions - START
|
# Hard coded exceptions - START
|
||||||
if "Thamognya" in self.__query or "thamognya" in self.__query:
|
if "Thamognya" in self.__query or "thamognya" in self.__query:
|
||||||
return (["The smartest person in the world"], ["I decided it"])
|
return (["The smartest person in the world"], ["I decided it"])
|
||||||
|
@ -139,18 +141,14 @@ class Google:
|
||||||
"https://economictimes.indiatimes.com/news/narendra-modi",
|
"https://economictimes.indiatimes.com/news/narendra-modi",
|
||||||
],
|
],
|
||||||
)
|
)
|
||||||
|
# Hard coded exceptions - End
|
||||||
self.__get_urls()
|
self.__get_urls()
|
||||||
self.__get_urls_contents()
|
self.__get_urls_contents()
|
||||||
return (self.__content, self.__urls)
|
return (self.__content, self.__urls)
|
||||||
|
|
||||||
|
|
||||||
def google(query: str) -> tuple[list[str], list[str]]:
|
def google(query: str) -> tuple[list[str], list[str]]:
|
||||||
_google = Google(
|
return Google(query).google()
|
||||||
query,
|
|
||||||
os.environ["INTERNET_ML_GOOGLE_API"],
|
|
||||||
os.environ["INTERNET_ML_GOOGLE_SEARCH_ENGINE_ID"],
|
|
||||||
)
|
|
||||||
return _google.google()
|
|
||||||
|
|
||||||
|
|
||||||
"""
|
"""
|
||||||
|
|
|
@ -9,6 +9,14 @@ logging.basicConfig(
|
||||||
format="%(name)s - %(levelname)s - %(message)s",
|
format="%(name)s - %(levelname)s - %(message)s",
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# General
|
||||||
|
CONF_DEBUG: bool = True
|
||||||
|
# Google
|
||||||
|
GOOGLE_API_KEY: str = ""
|
||||||
|
GOOGLE_SEARCH_ENGINE_ID: str = ""
|
||||||
|
# NLP
|
||||||
|
NLP_CONF_MODE: str = "default"
|
||||||
|
|
||||||
|
|
||||||
class FullConfig:
|
class FullConfig:
|
||||||
def __init__(self: Any) -> None:
|
def __init__(self: Any) -> None:
|
||||||
|
|
|
@ -5,7 +5,7 @@ build-backend = "poetry.core.masonry.api"
|
||||||
|
|
||||||
[tool.poetry]
|
[tool.poetry]
|
||||||
name = "internet-ml"
|
name = "internet-ml"
|
||||||
version = "0.2.11"
|
version = "1.0.0"
|
||||||
description = "Internet-ML: Allowing ML to connect to the internet"
|
description = "Internet-ML: Allowing ML to connect to the internet"
|
||||||
readme = "./.github/README.md"
|
readme = "./.github/README.md"
|
||||||
authors = ["Thamognya Kodi <contact@thamognya.com>"]
|
authors = ["Thamognya Kodi <contact@thamognya.com>"]
|
||||||
|
|
Loading…
Reference in New Issue