main
Thamognya Kodi 2022-12-26 00:15:24 +07:00
parent 0c12e6b0ff
commit dbf476bd35
19 changed files with 243 additions and 622 deletions

2
.gitignore vendored
View File

@ -1,4 +1,4 @@
.env
# Created by https://www.gitignore.io/api/osx,python,pycharm,windows,visualstudio,visualstudiocode # Created by https://www.gitignore.io/api/osx,python,pycharm,windows,visualstudio,visualstudiocode
# Edit at https://www.gitignore.io/?templates=osx,python,pycharm,windows,visualstudio,visualstudiocode # Edit at https://www.gitignore.io/?templates=osx,python,pycharm,windows,visualstudio,visualstudiocode

View File

@ -0,0 +1,17 @@
import sys
from pathlib import Path

from transformers import pipeline

# Make the project's tools/NLP/data directory importable so the local
# `internet` module below resolves.
sys.path.append(str(Path(__file__).parent.parent.parent) + "/tools/NLP/data")
import internet

qa_model = pipeline("question-answering")
question = "Who is Elon Musk?"

# First element of internet.google(...) holds the scraped text for the query.
a = internet.google(question)[0]
print(a)

# Fold every retrieved item into one context string for the QA model.
context = "".join(str(i) for i in a)
print(qa_model(question=question, context=context))
## example output shape: {'answer': 'İstanbul', 'end': 39, 'score': 0.953, 'start': 31}

Binary file not shown.

View File

@ -0,0 +1,4 @@
# Stub script: currently only emits a blank line.
# NOTE(review): `sys` and `Path` are imported but unused here — presumably
# placeholders for sys.path setup like the sibling scripts; confirm intent.
import sys
from pathlib import Path
print()  # placeholder output (prints an empty line)

View File

@ -1,256 +1,61 @@
# type: ignore # type: ignore
from typing import Any, Dict, List, Tuple from typing import List
import asyncio import asyncio
import logging import functools
import re import multiprocessing
import time import os
import urllib
import aiohttp import aiohttp
from bs4 import BeautifulSoup import dotenv
import requests
# Set up logging dotenv.load_dotenv()
logging.basicConfig(
filename="internet.log",
filemode="w",
level=logging.INFO,
format="%(name)s - %(levelname)s - %(message)s",
)
# import concurrent.futures
# Import the config module
import sys
from pathlib import Path
sys.path.append(str(Path(__file__).parent.parent.parent.parent) + "/utils/NLP")
sys.path.append(str(Path(__file__).parent.parent.parent.parent) + "/utils")
import config
sys.path.append(str(Path(__file__).parent.parent))
import pickle
from is_relevant import filter_irrelevant
from normalize import normalizer
from sentencize import sentencizer
from urlextract import URLExtract
# Define the user agent
HTTP_USERAGENT: dict[str, str] = { HTTP_USERAGENT: dict[str, str] = {
"User-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3" "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
}
# Define the google domains
UNWANTED_DOMAINS = {
"https://www.google.",
"https://google.",
"https://webcache.googleusercontent.",
"http://webcache.googleusercontent.",
"https://policies.google.",
"https://support.google.",
"https://maps.google.",
"https://youtube.",
"https://translate.google.",
} }
CACHE_FILE_PATH: str = "./internet_cache.pkl"
CACHE_TIME: int = 86400 # one day
URL_EXTRACTOR = URLExtract() def google_urls(query: str, links: list[str]) -> list[str]:
# Send the request to the Google Search API
# Load the cache from the file (if it exists) response = requests.get(
try: "https://www.googleapis.com/customsearch/v1",
with open(CACHE_FILE_PATH, "rb") as f: params={
cache: Any = pickle.load(f) "key": os.environ["API_KEY"],
except FileNotFoundError: "q": query,
cache: Any = {} "cx": os.environ["SEARCH_ENGINE_ID"],
},
)
results = response.json()["items"]
# Print the search results
for result in results:
links.append(result["link"])
return links
# Define the fetch_url function class LinkFetcher:
async def fetch_url(session: aiohttp.ClientSession, url: str) -> str: def __init__(self, urls):
global HTTP_USERAGENT self.urls = urls
async def fetch(self, session, url):
async with session.get(url, headers=HTTP_USERAGENT) as response: async with session.get(url, headers=HTTP_USERAGENT) as response:
return await response.text() return await response.text()
async def main(self, session):
# Define the google_urls function tasks = [asyncio.ensure_future(self.fetch(session, url)) for url in self.urls]
async def google_urls(query: str, links: list[str]) -> list[str]: responses = await asyncio.gather(*tasks)
""" return responses
Asynchronously search Google for the given query and retrieve the URLs of the top results.
Parameters:
query (str): The query to search for.
Returns:
List[str]: A list of the URLs of the top search results.
"""
global UNWANTED_DOMAINS
# Initialize an empty list to store the URLs
urls: list[str] = links
# Determine the number of results to retrieve based on the configuration mode
num_of_res: int = (
5
if config.NLP_CONF_MODE == "speed"
else (20 if config.NLP_CONF_MODE == "accuracy" else 10)
)
# Log the number of results wanted (if debugging is enabled)
if config.NLP_CONF_DEBUG:
logging.info(f"number of results wanted: {num_of_res}")
# Construct the search URL
search_url: str = (
"https://www.google.com/search?q="
+ str(urllib.parse.quote_plus(query))
+ "&num="
+ str(num_of_res)
)
# Log the search URL (if debugging is enabled)
if config.NLP_CONF_DEBUG:
logging.info(f"url: {search_url}")
# Create an aiohttp session and use it to fetch the search results
async with aiohttp.ClientSession() as session:
response: str = await fetch_url(session, search_url)
# Wait 10 seconds before parsing the results (to avoid being rate-limited)
await asyncio.sleep(10.0)
# Parse the search results using BeautifulSoup
soup: BeautifulSoup = BeautifulSoup(response, "html.parser")
# Iterate over the links in the search results
for link in list(soup.select("a[href]")):
# Extract the URL from the link
url = str(link["href"])
# Check if the URL is valid and not a Google or YouTube link
if ("http" in url) and (
not any(url.startswith(s) for s in UNWANTED_DOMAINS)
):
urls.append(url)
if config.NLP_CONF_DEBUG:
logging.info(f"added {url}")
if len(urls) == num_of_res:
break
return urls
async def fetch_url_text( def fetch_content(urls: list[str]):
session: aiohttp.ClientSession, url: str, query: str fetcher = LinkFetcher(urls)
) -> list[str]: with aiohttp.ClientSession() as session:
""" with multiprocessing.Pool(processes=5) as pool:
Extract the text from the given HTML content. contents = list(pool.map(functools.partial(fetcher.main), [session]))
return contents
Parameters:
session (aiohttp.ClientSession): aiohttp session
url (str): The url content to get text from.
Returns:
str: The extracted text.
"""
global HTTP_USERAGENT
try:
async with session.get(url, headers=HTTP_USERAGENT) as response:
soup: BeautifulSoup = BeautifulSoup(await response.text(), "html.parser")
text = normalizer(soup.get_text())
if config.NLP_CONF_DEBUG:
logging.info(f"Text: {text}")
sentences: list[str] = sentencizer(text)
sentences = filter_irrelevant(sentences, query)
return sentences
except Exception as e:
# Log the error and continue execution
logging.error(f"Error occurred: {e}")
return []
def flatten(l): a = google_urls("Who is Neil Armstrong", [])
return [item for sublist in l for item in sublist] print(a)
print(fetch_content(a))
async def get_text_content(urls: list[str], query: str) -> list[str]:
# Create a list to store the results
results: list[str] = []
# Create an aiohttp session
async with aiohttp.ClientSession() as session:
# Create a list of tasks to run concurrently
tasks: list[Any] = [
asyncio.create_task(fetch_url_text(session, url, query)) for url in urls
]
# Use asyncio.gather to run the tasks concurrently
results = await asyncio.gather(*tasks)
sentences: list[str] = flatten(results)
return sentences
def google(query: str) -> Tuple[List[str], str]:
global cache, CACHE_FILE_PATH, CACHE_TIME, URL_EXTRACTOR
links_in_text: list[str] = URL_EXTRACTOR.find_urls(query)
query = re.sub(r"\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*", "", query)
entry = cache.get(query)
if entry is None:
# no query exists, so add a new entry to the cache
urls: List[str] = asyncio.run(google_urls(query, links_in_text))
text: str = str(asyncio.run(get_text_content(urls, query)))
cache[query]: Tuple[Tuple[List[str], str], int] = (
(text, urls),
time.time() + CACHE_TIME,
) # cache expires in one hour
elif entry[1] < time.time():
# update as it expired
urls: List[str] = asyncio.run(google_urls(query, links_in_text))
text: str = str(asyncio.run(get_text_content(urls, query)))
cache[query]: Tuple[Tuple[List[str], str], int] = (
(text, urls),
time.time() + CACHE_TIME,
) # cache expires in one hour
else:
# available so return it
text: List[str] = entry[0][0]
urls: str = entry[0][1]
# Save the cache to the file
with open(CACHE_FILE_PATH, "wb") as f:
pickle.dump(cache, f)
# Return the text
return (text, urls)
"""
async + multithreading since web scraping is I/O bound
https://stackoverflow.com/questions/27435284/multiprocessing-vs-multithreading-vs-asyncio
normal
________________________________________________________
Executed in 1.67 secs fish external
usr time 137.29 millis 0.11 millis 137.18 millis
sys time 38.39 millis 1.25 millis 37.13 millis
Async
________________________________________________________
Executed in 624.82 millis fish external
usr time 141.92 millis 0.11 millis 141.81 millis
sys time 38.00 millis 1.45 millis 36.55 millis
concurrent
________________________________________________________
Executed in 629.67 millis fish external
usr time 136.72 millis 0.12 millis 136.60 millis
sys time 36.86 millis 1.32 millis 35.54 millis
multiprocessing
________________________________________________________
Executed in 754.61 millis fish external
usr time 399.25 millis 0.11 millis 399.14 millis
sys time 164.39 millis 1.49 millis 162.90 millis
multiprocessing
OVERALL
multithreading bs4
________________________________________________________
Executed in 14.67 secs fish external
usr time 1.81 secs 0.12 millis 1.81 secs
sys time 0.14 secs 1.50 millis 0.14 secs
multiprocessing bs4
"""

400
poetry.lock generated
View File

@ -153,18 +153,6 @@ files = [
[package.dependencies] [package.dependencies]
frozenlist = ">=1.1.0" frozenlist = ">=1.1.0"
[[package]]
name = "appdirs"
version = "1.4.4"
description = "A small Python module for determining appropriate platform-specific dirs, e.g. a \"user data dir\"."
category = "main"
optional = false
python-versions = "*"
files = [
{file = "appdirs-1.4.4-py2.py3-none-any.whl", hash = "sha256:a841dacd6b99318a741b166adb07e19ee71a274450e68237b4650ca1055ab128"},
{file = "appdirs-1.4.4.tar.gz", hash = "sha256:7d5d0167b2b1ba821647616af46a749d1c653740dd0d2415100fe26e27afdf41"},
]
[[package]] [[package]]
name = "astroid" name = "astroid"
version = "2.12.13" version = "2.12.13"
@ -215,17 +203,6 @@ docs = ["furo", "myst-parser", "sphinx", "sphinx-notfound-page", "sphinxcontrib-
tests = ["attrs[tests-no-zope]", "zope.interface"] tests = ["attrs[tests-no-zope]", "zope.interface"]
tests-no-zope = ["cloudpickle", "cloudpickle", "hypothesis", "hypothesis", "mypy (>=0.971,<0.990)", "mypy (>=0.971,<0.990)", "pympler", "pympler", "pytest (>=4.3.0)", "pytest (>=4.3.0)", "pytest-mypy-plugins", "pytest-mypy-plugins", "pytest-xdist[psutil]", "pytest-xdist[psutil]"] tests-no-zope = ["cloudpickle", "cloudpickle", "hypothesis", "hypothesis", "mypy (>=0.971,<0.990)", "mypy (>=0.971,<0.990)", "pympler", "pympler", "pytest (>=4.3.0)", "pytest (>=4.3.0)", "pytest-mypy-plugins", "pytest-mypy-plugins", "pytest-xdist[psutil]", "pytest-xdist[psutil]"]
[[package]]
name = "audioread"
version = "3.0.0"
description = "multi-library, cross-platform audio decoding"
category = "main"
optional = false
python-versions = ">=3.6"
files = [
{file = "audioread-3.0.0.tar.gz", hash = "sha256:121995bd207eb1fda3d566beb851d3534275925bc35a4fb6da0cb11de0f7251a"},
]
[[package]] [[package]]
name = "bandit" name = "bandit"
version = "1.7.4" version = "1.7.4"
@ -296,83 +273,6 @@ files = [
{file = "certifi-2022.12.7.tar.gz", hash = "sha256:35824b4c3a97115964b408844d64aa14db1cc518f6562e8d7261699d1350a9e3"}, {file = "certifi-2022.12.7.tar.gz", hash = "sha256:35824b4c3a97115964b408844d64aa14db1cc518f6562e8d7261699d1350a9e3"},
] ]
[[package]]
name = "cffi"
version = "1.15.1"
description = "Foreign Function Interface for Python calling C code."
category = "main"
optional = false
python-versions = "*"
files = [
{file = "cffi-1.15.1-cp27-cp27m-macosx_10_9_x86_64.whl", hash = "sha256:a66d3508133af6e8548451b25058d5812812ec3798c886bf38ed24a98216fab2"},
{file = "cffi-1.15.1-cp27-cp27m-manylinux1_i686.whl", hash = "sha256:470c103ae716238bbe698d67ad020e1db9d9dba34fa5a899b5e21577e6d52ed2"},
{file = "cffi-1.15.1-cp27-cp27m-manylinux1_x86_64.whl", hash = "sha256:9ad5db27f9cabae298d151c85cf2bad1d359a1b9c686a275df03385758e2f914"},
{file = "cffi-1.15.1-cp27-cp27m-win32.whl", hash = "sha256:b3bbeb01c2b273cca1e1e0c5df57f12dce9a4dd331b4fa1635b8bec26350bde3"},
{file = "cffi-1.15.1-cp27-cp27m-win_amd64.whl", hash = "sha256:e00b098126fd45523dd056d2efba6c5a63b71ffe9f2bbe1a4fe1716e1d0c331e"},
{file = "cffi-1.15.1-cp27-cp27mu-manylinux1_i686.whl", hash = "sha256:d61f4695e6c866a23a21acab0509af1cdfd2c013cf256bbf5b6b5e2695827162"},
{file = "cffi-1.15.1-cp27-cp27mu-manylinux1_x86_64.whl", hash = "sha256:ed9cb427ba5504c1dc15ede7d516b84757c3e3d7868ccc85121d9310d27eed0b"},
{file = "cffi-1.15.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:39d39875251ca8f612b6f33e6b1195af86d1b3e60086068be9cc053aa4376e21"},
{file = "cffi-1.15.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:285d29981935eb726a4399badae8f0ffdff4f5050eaa6d0cfc3f64b857b77185"},
{file = "cffi-1.15.1-cp310-cp310-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3eb6971dcff08619f8d91607cfc726518b6fa2a9eba42856be181c6d0d9515fd"},
{file = "cffi-1.15.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:21157295583fe8943475029ed5abdcf71eb3911894724e360acff1d61c1d54bc"},
{file = "cffi-1.15.1-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5635bd9cb9731e6d4a1132a498dd34f764034a8ce60cef4f5319c0541159392f"},
{file = "cffi-1.15.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2012c72d854c2d03e45d06ae57f40d78e5770d252f195b93f581acf3ba44496e"},
{file = "cffi-1.15.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dd86c085fae2efd48ac91dd7ccffcfc0571387fe1193d33b6394db7ef31fe2a4"},
{file = "cffi-1.15.1-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:fa6693661a4c91757f4412306191b6dc88c1703f780c8234035eac011922bc01"},
{file = "cffi-1.15.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:59c0b02d0a6c384d453fece7566d1c7e6b7bae4fc5874ef2ef46d56776d61c9e"},
{file = "cffi-1.15.1-cp310-cp310-win32.whl", hash = "sha256:cba9d6b9a7d64d4bd46167096fc9d2f835e25d7e4c121fb2ddfc6528fb0413b2"},
{file = "cffi-1.15.1-cp310-cp310-win_amd64.whl", hash = "sha256:ce4bcc037df4fc5e3d184794f27bdaab018943698f4ca31630bc7f84a7b69c6d"},
{file = "cffi-1.15.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:3d08afd128ddaa624a48cf2b859afef385b720bb4b43df214f85616922e6a5ac"},
{file = "cffi-1.15.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:3799aecf2e17cf585d977b780ce79ff0dc9b78d799fc694221ce814c2c19db83"},
{file = "cffi-1.15.1-cp311-cp311-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a591fe9e525846e4d154205572a029f653ada1a78b93697f3b5a8f1f2bc055b9"},
{file = "cffi-1.15.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3548db281cd7d2561c9ad9984681c95f7b0e38881201e157833a2342c30d5e8c"},
{file = "cffi-1.15.1-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:91fc98adde3d7881af9b59ed0294046f3806221863722ba7d8d120c575314325"},
{file = "cffi-1.15.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:94411f22c3985acaec6f83c6df553f2dbe17b698cc7f8ae751ff2237d96b9e3c"},
{file = "cffi-1.15.1-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:03425bdae262c76aad70202debd780501fabeaca237cdfddc008987c0e0f59ef"},
{file = "cffi-1.15.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:cc4d65aeeaa04136a12677d3dd0b1c0c94dc43abac5860ab33cceb42b801c1e8"},
{file = "cffi-1.15.1-cp311-cp311-win32.whl", hash = "sha256:a0f100c8912c114ff53e1202d0078b425bee3649ae34d7b070e9697f93c5d52d"},
{file = "cffi-1.15.1-cp311-cp311-win_amd64.whl", hash = "sha256:04ed324bda3cda42b9b695d51bb7d54b680b9719cfab04227cdd1e04e5de3104"},
{file = "cffi-1.15.1-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:50a74364d85fd319352182ef59c5c790484a336f6db772c1a9231f1c3ed0cbd7"},
{file = "cffi-1.15.1-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e263d77ee3dd201c3a142934a086a4450861778baaeeb45db4591ef65550b0a6"},
{file = "cffi-1.15.1-cp36-cp36m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:cec7d9412a9102bdc577382c3929b337320c4c4c4849f2c5cdd14d7368c5562d"},
{file = "cffi-1.15.1-cp36-cp36m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:4289fc34b2f5316fbb762d75362931e351941fa95fa18789191b33fc4cf9504a"},
{file = "cffi-1.15.1-cp36-cp36m-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:173379135477dc8cac4bc58f45db08ab45d228b3363adb7af79436135d028405"},
{file = "cffi-1.15.1-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:6975a3fac6bc83c4a65c9f9fcab9e47019a11d3d2cf7f3c0d03431bf145a941e"},
{file = "cffi-1.15.1-cp36-cp36m-win32.whl", hash = "sha256:2470043b93ff09bf8fb1d46d1cb756ce6132c54826661a32d4e4d132e1977adf"},
{file = "cffi-1.15.1-cp36-cp36m-win_amd64.whl", hash = "sha256:30d78fbc8ebf9c92c9b7823ee18eb92f2e6ef79b45ac84db507f52fbe3ec4497"},
{file = "cffi-1.15.1-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:198caafb44239b60e252492445da556afafc7d1e3ab7a1fb3f0584ef6d742375"},
{file = "cffi-1.15.1-cp37-cp37m-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:5ef34d190326c3b1f822a5b7a45f6c4535e2f47ed06fec77d3d799c450b2651e"},
{file = "cffi-1.15.1-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8102eaf27e1e448db915d08afa8b41d6c7ca7a04b7d73af6514df10a3e74bd82"},
{file = "cffi-1.15.1-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5df2768244d19ab7f60546d0c7c63ce1581f7af8b5de3eb3004b9b6fc8a9f84b"},
{file = "cffi-1.15.1-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a8c4917bd7ad33e8eb21e9a5bbba979b49d9a97acb3a803092cbc1133e20343c"},
{file = "cffi-1.15.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0e2642fe3142e4cc4af0799748233ad6da94c62a8bec3a6648bf8ee68b1c7426"},
{file = "cffi-1.15.1-cp37-cp37m-win32.whl", hash = "sha256:e229a521186c75c8ad9490854fd8bbdd9a0c9aa3a524326b55be83b54d4e0ad9"},
{file = "cffi-1.15.1-cp37-cp37m-win_amd64.whl", hash = "sha256:a0b71b1b8fbf2b96e41c4d990244165e2c9be83d54962a9a1d118fd8657d2045"},
{file = "cffi-1.15.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:320dab6e7cb2eacdf0e658569d2575c4dad258c0fcc794f46215e1e39f90f2c3"},
{file = "cffi-1.15.1-cp38-cp38-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1e74c6b51a9ed6589199c787bf5f9875612ca4a8a0785fb2d4a84429badaf22a"},
{file = "cffi-1.15.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a5c84c68147988265e60416b57fc83425a78058853509c1b0629c180094904a5"},
{file = "cffi-1.15.1-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3b926aa83d1edb5aa5b427b4053dc420ec295a08e40911296b9eb1b6170f6cca"},
{file = "cffi-1.15.1-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:87c450779d0914f2861b8526e035c5e6da0a3199d8f1add1a665e1cbc6fc6d02"},
{file = "cffi-1.15.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4f2c9f67e9821cad2e5f480bc8d83b8742896f1242dba247911072d4fa94c192"},
{file = "cffi-1.15.1-cp38-cp38-win32.whl", hash = "sha256:8b7ee99e510d7b66cdb6c593f21c043c248537a32e0bedf02e01e9553a172314"},
{file = "cffi-1.15.1-cp38-cp38-win_amd64.whl", hash = "sha256:00a9ed42e88df81ffae7a8ab6d9356b371399b91dbdf0c3cb1e84c03a13aceb5"},
{file = "cffi-1.15.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:54a2db7b78338edd780e7ef7f9f6c442500fb0d41a5a4ea24fff1c929d5af585"},
{file = "cffi-1.15.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:fcd131dd944808b5bdb38e6f5b53013c5aa4f334c5cad0c72742f6eba4b73db0"},
{file = "cffi-1.15.1-cp39-cp39-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7473e861101c9e72452f9bf8acb984947aa1661a7704553a9f6e4baa5ba64415"},
{file = "cffi-1.15.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6c9a799e985904922a4d207a94eae35c78ebae90e128f0c4e521ce339396be9d"},
{file = "cffi-1.15.1-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3bcde07039e586f91b45c88f8583ea7cf7a0770df3a1649627bf598332cb6984"},
{file = "cffi-1.15.1-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:33ab79603146aace82c2427da5ca6e58f2b3f2fb5da893ceac0c42218a40be35"},
{file = "cffi-1.15.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5d598b938678ebf3c67377cdd45e09d431369c3b1a5b331058c338e201f12b27"},
{file = "cffi-1.15.1-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:db0fbb9c62743ce59a9ff687eb5f4afbe77e5e8403d6697f7446e5f609976f76"},
{file = "cffi-1.15.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:98d85c6a2bef81588d9227dde12db8a7f47f639f4a17c9ae08e773aa9c697bf3"},
{file = "cffi-1.15.1-cp39-cp39-win32.whl", hash = "sha256:40f4774f5a9d4f5e344f31a32b5096977b5d48560c5592e2f3d2c4374bd543ee"},
{file = "cffi-1.15.1-cp39-cp39-win_amd64.whl", hash = "sha256:70df4e3b545a17496c9b3f41f5115e69a4f2e77e94e1d2a8e1070bc0c38c8a3c"},
{file = "cffi-1.15.1.tar.gz", hash = "sha256:d400bfb9a37b1351253cb402671cea7e89bdecc294e8016a707f6d1d8ac934f9"},
]
[package.dependencies]
pycparser = "*"
[[package]] [[package]]
name = "cfgv" name = "cfgv"
version = "3.3.1" version = "3.3.1"
@ -553,7 +453,6 @@ aiohttp = "*"
dill = "<0.3.7" dill = "<0.3.7"
fsspec = {version = ">=2021.11.1", extras = ["http"]} fsspec = {version = ">=2021.11.1", extras = ["http"]}
huggingface-hub = ">=0.2.0,<1.0.0" huggingface-hub = ">=0.2.0,<1.0.0"
librosa = {version = "*", optional = true, markers = "extra == \"audio\""}
multiprocess = "*" multiprocess = "*"
numpy = ">=1.17" numpy = ">=1.17"
packaging = "*" packaging = "*"
@ -580,18 +479,6 @@ tests = ["Pillow (>=6.2.1)", "absl-py", "apache-beam (>=2.26.0)", "elasticsearch
torch = ["torch"] torch = ["torch"]
vision = ["Pillow (>=6.2.1)"] vision = ["Pillow (>=6.2.1)"]
[[package]]
name = "decorator"
version = "5.1.1"
description = "Decorators for Humans"
category = "main"
optional = false
python-versions = ">=3.5"
files = [
{file = "decorator-5.1.1-py3-none-any.whl", hash = "sha256:b8c3f85900b9dc423225913c5aace94729fe1fa9763b38939a95226f02d37186"},
{file = "decorator-5.1.1.tar.gz", hash = "sha256:637996211036b6385ef91435e4fae22989472f9d571faba8927ba8253acbc330"},
]
[[package]] [[package]]
name = "diffusers" name = "diffusers"
version = "0.11.1" version = "0.11.1"
@ -1002,18 +889,6 @@ MarkupSafe = ">=2.0"
[package.extras] [package.extras]
i18n = ["Babel (>=2.7)"] i18n = ["Babel (>=2.7)"]
[[package]]
name = "joblib"
version = "1.2.0"
description = "Lightweight pipelining with Python functions"
category = "main"
optional = false
python-versions = ">=3.7"
files = [
{file = "joblib-1.2.0-py3-none-any.whl", hash = "sha256:091138ed78f800342968c523bdde947e7a305b8594b910a0fea2ab83c3c6d385"},
{file = "joblib-1.2.0.tar.gz", hash = "sha256:e1cee4a79e4af22881164f218d4311f60074197fb707e082e803b61f6d137018"},
]
[[package]] [[package]]
name = "lazy-object-proxy" name = "lazy-object-proxy"
version = "1.8.0" version = "1.8.0"
@ -1043,62 +918,6 @@ files = [
{file = "lazy_object_proxy-1.8.0-pp39-pypy39_pp73-any.whl", hash = "sha256:ce58b2b3734c73e68f0e30e4e725264d4d6be95818ec0a0be4bb6bf9a7e79aa8"}, {file = "lazy_object_proxy-1.8.0-pp39-pypy39_pp73-any.whl", hash = "sha256:ce58b2b3734c73e68f0e30e4e725264d4d6be95818ec0a0be4bb6bf9a7e79aa8"},
] ]
[[package]]
name = "librosa"
version = "0.9.2"
description = "Python module for audio and music processing"
category = "main"
optional = false
python-versions = ">=3.6"
files = [
{file = "librosa-0.9.2-py3-none-any.whl", hash = "sha256:322a813e6d37af9fbc369e6a637dcf5fdc5c6925ce806a0d27c68de61a81350f"},
{file = "librosa-0.9.2.tar.gz", hash = "sha256:5b576b5efdce428e90bc988bdd5a953d12a727e5f931f30d74c53b63abbe3c89"},
]
[package.dependencies]
audioread = ">=2.1.9"
decorator = ">=4.0.10"
joblib = ">=0.14"
numba = ">=0.45.1"
numpy = ">=1.17.0"
packaging = ">=20.0"
pooch = ">=1.0"
resampy = ">=0.2.2"
scikit-learn = ">=0.19.1"
scipy = ">=1.2.0"
soundfile = ">=0.10.2"
[package.extras]
display = ["matplotlib (>=3.3.0)"]
docs = ["ipython (>=7.0)", "matplotlib (>=3.3.0)", "mir-eval (>=0.5)", "numba (<0.50)", "numpydoc", "presets", "sphinx (!=1.3.1)", "sphinx-gallery (>=0.7)", "sphinx-multiversion (>=0.2.3)", "sphinx-rtd-theme (>=1.0.0,<2.0.0)", "sphinxcontrib-svg2pdfconverter"]
tests = ["contextlib2", "matplotlib (>=3.3.0)", "pytest", "pytest-cov", "pytest-mpl", "samplerate", "soxr"]
[[package]]
name = "llvmlite"
version = "0.34.0"
description = "lightweight wrapper around basic LLVM functionality"
category = "main"
optional = false
python-versions = ">=3.6"
files = [
{file = "llvmlite-0.34.0-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:11342e5ac320c953590bdd9d0dec8c52f4b5252c4c6335ba25f1e7b9f91f9325"},
{file = "llvmlite-0.34.0-cp36-cp36m-manylinux2010_i686.whl", hash = "sha256:5bdf0ce430adfaf938ced5844d12f80616eb8321b5b9edfc45ef84ada5c5242c"},
{file = "llvmlite-0.34.0-cp36-cp36m-manylinux2010_x86_64.whl", hash = "sha256:e08d9d2dc5a31636bfc6b516d2d7daba95632afa3419eb8730dc76a7951e9558"},
{file = "llvmlite-0.34.0-cp36-cp36m-win32.whl", hash = "sha256:9ff1dcdad03be0cf953aca5fc8cffdca25ccee2ec9e8ec7e95571722cdc02d55"},
{file = "llvmlite-0.34.0-cp36-cp36m-win_amd64.whl", hash = "sha256:5acdc3c3c7ea0ef7a1a6b442272e05d695bc8492e5b07666135ed1cfbf4ab9d2"},
{file = "llvmlite-0.34.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:bb96989bc57a1ccb131e7a0e061d07b68139b6f81a98912345d53d9239e231e1"},
{file = "llvmlite-0.34.0-cp37-cp37m-manylinux2010_i686.whl", hash = "sha256:6d3f81992f52a94077e7b9b16497029daf5b5eebb2cce56f3c8345bbc9c6308e"},
{file = "llvmlite-0.34.0-cp37-cp37m-manylinux2010_x86_64.whl", hash = "sha256:d841248d1c630426c93e3eb3f8c45bca0dab77c09faeb7553b1a500220e362ce"},
{file = "llvmlite-0.34.0-cp37-cp37m-win32.whl", hash = "sha256:408b15ffec30696406e821c89da010f1bb1eb0aa572be4561c98eb2536d610ab"},
{file = "llvmlite-0.34.0-cp37-cp37m-win_amd64.whl", hash = "sha256:5d1f370bf150db7239204f09cf6a0603292ea28bac984e69b167e16fe160d803"},
{file = "llvmlite-0.34.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:132322bc084abf336c80dd106f9357978c8c085911fb656898d3be0d9ff057ea"},
{file = "llvmlite-0.34.0-cp38-cp38-manylinux2010_i686.whl", hash = "sha256:8f344102745fceba6eb5bf03c228bb290e9bc79157e9506a4a72878d636f9b3c"},
{file = "llvmlite-0.34.0-cp38-cp38-manylinux2010_x86_64.whl", hash = "sha256:05253f3f44fab0148276335b2c1b2c4a78143dfa78e6bafd7f937d6248f297cc"},
{file = "llvmlite-0.34.0-cp38-cp38-win32.whl", hash = "sha256:28264f9e2b3df4135cbcfca5a91c5b0b31dd3fc02fa623b4bb13327f0cd4fc80"},
{file = "llvmlite-0.34.0-cp38-cp38-win_amd64.whl", hash = "sha256:964f8f7a2184963cb3617d057c2382575953e488b7bb061b632ee014cfef110a"},
{file = "llvmlite-0.34.0.tar.gz", hash = "sha256:f03ee0d19bca8f2fe922bb424a909d05c28411983b0c2bc58b020032a0d11f63"},
]
[[package]] [[package]]
name = "markdown" name = "markdown"
version = "3.3.7" version = "3.3.7"
@ -1405,37 +1224,6 @@ files = [
[package.dependencies] [package.dependencies]
setuptools = "*" setuptools = "*"
[[package]]
name = "numba"
version = "0.51.2"
description = "compiling Python code using LLVM"
category = "main"
optional = false
python-versions = ">=3.6"
files = [
{file = "numba-0.51.2-cp36-cp36m-macosx_10_14_x86_64.whl", hash = "sha256:af798310eeb318c56cdb83254abbe9a938cc0182d08671d7f9f032dc817e064d"},
{file = "numba-0.51.2-cp36-cp36m-manylinux2014_i686.whl", hash = "sha256:93e18350f2094e7432321c1275730a3143b94af012fb609cc180fa376c44867f"},
{file = "numba-0.51.2-cp36-cp36m-manylinux2014_x86_64.whl", hash = "sha256:9e2bb1f129bfadd757ad7a9c18ab79c3ab25ce6d6a68e58565d6c52ad07b3566"},
{file = "numba-0.51.2-cp36-cp36m-win32.whl", hash = "sha256:31cdf6b6d1301d5fb6c4fcb8b4c711ba5c9f60ba2fca008b550da9b56185367c"},
{file = "numba-0.51.2-cp36-cp36m-win_amd64.whl", hash = "sha256:df6edca13c04a31fdb5addf5205199478a7da372712829157ef491e8a6e7031f"},
{file = "numba-0.51.2-cp37-cp37m-macosx_10_14_x86_64.whl", hash = "sha256:a628122dacfcba9a3ea68a9e95578c6b6391016e34962c46550ea8e189e0412e"},
{file = "numba-0.51.2-cp37-cp37m-manylinux2014_i686.whl", hash = "sha256:106736d5a8dab6bebce989d4ab1b3f169c264582598f172e6e5b736210d2e834"},
{file = "numba-0.51.2-cp37-cp37m-manylinux2014_x86_64.whl", hash = "sha256:a12f16fdb4ca5edc94e2ef412e4e768c29217ef9b6fdfc237d064ebe30acfe14"},
{file = "numba-0.51.2-cp37-cp37m-win32.whl", hash = "sha256:025b033fd31c44bba17802293c81270084b5454b5b055b8c10c394385c232f00"},
{file = "numba-0.51.2-cp37-cp37m-win_amd64.whl", hash = "sha256:081788f584fa500339e9b74bf02e3c5029d408c114e555ada19cae0b92721416"},
{file = "numba-0.51.2-cp38-cp38-macosx_10_14_x86_64.whl", hash = "sha256:5416b584183fd599afda11b947b64f89450fcf26a9c15b408167f412b98a3a94"},
{file = "numba-0.51.2-cp38-cp38-manylinux2014_i686.whl", hash = "sha256:05da65dca2ac28a192c9d8f20e9e477eb1237205cfc4d131c414f5f8092c6639"},
{file = "numba-0.51.2-cp38-cp38-manylinux2014_x86_64.whl", hash = "sha256:aee435e3b7e465dd49971f8ea76aa414532a87736916cb399534e017334d1138"},
{file = "numba-0.51.2-cp38-cp38-win32.whl", hash = "sha256:bbbe2432433b11d3fadab0226a84c1a81918cb905ba1aeb022249e8d2ba8856c"},
{file = "numba-0.51.2-cp38-cp38-win_amd64.whl", hash = "sha256:259e7c15b24feec4a99fb41eb8c47b5ad49b544d1a5ad40ad0252ef531ba06fd"},
{file = "numba-0.51.2.tar.gz", hash = "sha256:16bd59572114adbf5f600ea383880d7b2071ae45477e84a24994e089ea390768"},
]
[package.dependencies]
llvmlite = ">=0.34.0.dev0,<0.35"
numpy = ">=1.15"
setuptools = "*"
[[package]] [[package]]
name = "numpy" name = "numpy"
version = "1.24.0" version = "1.24.0"
@ -1733,28 +1521,6 @@ files = [
dev = ["pre-commit", "tox"] dev = ["pre-commit", "tox"]
testing = ["pytest", "pytest-benchmark"] testing = ["pytest", "pytest-benchmark"]
[[package]]
name = "pooch"
version = "1.6.0"
description = "\"Pooch manages your Python library's sample data files: it automatically downloads and stores them in a local directory, with support for versioning and corruption checks.\""
category = "main"
optional = false
python-versions = ">=3.6"
files = [
{file = "pooch-1.6.0-py3-none-any.whl", hash = "sha256:3bf0e20027096836b8dbce0152dbb785a269abeb621618eb4bdd275ff1e23c9c"},
{file = "pooch-1.6.0.tar.gz", hash = "sha256:57d20ec4b10dd694d2b05bb64bc6b109c6e85a6c1405794ce87ed8b341ab3f44"},
]
[package.dependencies]
appdirs = ">=1.3.0"
packaging = ">=20.0"
requests = ">=2.19.0"
[package.extras]
progress = ["tqdm (>=4.41.0,<5.0.0)"]
sftp = ["paramiko (>=2.7.0)"]
xxhash = ["xxhash (>=1.4.3)"]
[[package]] [[package]]
name = "pre-commit" name = "pre-commit"
version = "2.20.0" version = "2.20.0"
@ -1852,18 +1618,6 @@ files = [
[package.dependencies] [package.dependencies]
numpy = ">=1.16.6" numpy = ">=1.16.6"
[[package]]
name = "pycparser"
version = "2.21"
description = "C parser in Python"
category = "main"
optional = false
python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*"
files = [
{file = "pycparser-2.21-py2.py3-none-any.whl", hash = "sha256:8ee45429555515e1f6b185e78100aea234072576aa43ab53aefcae078162fca9"},
{file = "pycparser-2.21.tar.gz", hash = "sha256:e644fdec12f7872f86c58ff790da456218b10f863970249516d60a5eaca77206"},
]
[[package]] [[package]]
name = "pydocstyle" name = "pydocstyle"
version = "6.1.1" version = "6.1.1"
@ -2031,6 +1785,21 @@ files = [
[package.dependencies] [package.dependencies]
six = ">=1.5" six = ">=1.5"
[[package]]
name = "python-dotenv"
version = "0.21.0"
description = "Read key-value pairs from a .env file and set them as environment variables"
category = "main"
optional = false
python-versions = ">=3.7"
files = [
{file = "python-dotenv-0.21.0.tar.gz", hash = "sha256:b77d08274639e3d34145dfa6c7008e66df0f04b7be7a75fd0d5292c191d79045"},
{file = "python_dotenv-0.21.0-py3-none-any.whl", hash = "sha256:1684eb44636dd462b66c3ee016599815514527ad99965de77f43e0944634a7e5"},
]
[package.extras]
cli = ["click (>=5.0)"]
[[package]] [[package]]
name = "pytz" name = "pytz"
version = "2022.7" version = "2022.7"
@ -2243,27 +2012,6 @@ urllib3 = ">=1.21.1,<1.27"
socks = ["PySocks (>=1.5.6,!=1.5.7)"] socks = ["PySocks (>=1.5.6,!=1.5.7)"]
use-chardet-on-py3 = ["chardet (>=3.0.2,<6)"] use-chardet-on-py3 = ["chardet (>=3.0.2,<6)"]
[[package]]
name = "resampy"
version = "0.3.1"
description = "Efficient signal resampling"
category = "main"
optional = false
python-versions = "*"
files = [
{file = "resampy-0.3.1-py3-none-any.whl", hash = "sha256:b09066c8f0eb0418d59963aca23b79b52d0ace0b8a6de0fce082e6771f7c3f68"},
{file = "resampy-0.3.1.tar.gz", hash = "sha256:7ed185b0912e1d913902a6a1ff7e3b8d977eb5bdd5663012276512ef09a8f701"},
]
[package.dependencies]
numba = ">=0.47"
numpy = ">=1.17"
[package.extras]
design = ["optuna (>=2.10.0)"]
docs = ["numpydoc", "sphinx (!=1.3.1)"]
tests = ["pytest (<8)", "pytest-cov", "scipy (>=1.0)"]
[[package]] [[package]]
name = "responses" name = "responses"
version = "0.18.0" version = "0.18.0"
@ -2390,88 +2138,6 @@ setuptools = ">=19.3"
github = ["jinja2 (>=3.1.0)", "pygithub (>=1.43.3)"] github = ["jinja2 (>=3.1.0)", "pygithub (>=1.43.3)"]
gitlab = ["python-gitlab (>=1.3.0)"] gitlab = ["python-gitlab (>=1.3.0)"]
[[package]]
name = "scikit-learn"
version = "1.2.0"
description = "A set of python modules for machine learning and data mining"
category = "main"
optional = false
python-versions = ">=3.8"
files = [
{file = "scikit-learn-1.2.0.tar.gz", hash = "sha256:680b65b3caee469541385d2ca5b03ff70408f6c618c583948312f0d2125df680"},
{file = "scikit_learn-1.2.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:1beaa631434d1f17a20b1eef5d842e58c195875d2bc11901a1a70b5fe544745b"},
{file = "scikit_learn-1.2.0-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:d395730f26d8fc752321f1953ddf72647c892d8bed74fad4d7c816ec9b602dfa"},
{file = "scikit_learn-1.2.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fd3480c982b9e616b9f76ad8587804d3f4e91b4e2a6752e7dafb8a2e1f541098"},
{file = "scikit_learn-1.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:184a42842a4e698ffa4d849b6019de50a77a0aa24d26afa28fa49c9190bb144b"},
{file = "scikit_learn-1.2.0-cp310-cp310-win_amd64.whl", hash = "sha256:867023a044fdfe59e5014a7fec7a3086a8928f10b5dce9382eedf4135f6709a2"},
{file = "scikit_learn-1.2.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:5546a8894a0616e92489ef995b39a0715829f3df96e801bb55cbf196be0d9649"},
{file = "scikit_learn-1.2.0-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:bc7073e025b62c1067cbfb76e69d08650c6b9d7a0e7afdfa20cb92d4afe516f6"},
{file = "scikit_learn-1.2.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fc0a72237f0c56780cf550df87201a702d3bdcbbb23c6ef7d54c19326fa23f19"},
{file = "scikit_learn-1.2.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4e1ea0bc1706da45589bcf2490cde6276490a1b88f9af208dbb396fdc3a0babf"},
{file = "scikit_learn-1.2.0-cp311-cp311-win_amd64.whl", hash = "sha256:f17420a8e3f40129aeb7e0f5ee35822d6178617007bb8f69521a2cefc20d5f00"},
{file = "scikit_learn-1.2.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:25ba705ee1600ffc5df1dccd8fae129d7c6836e44ffcbb52d78536c9eaf8fcf9"},
{file = "scikit_learn-1.2.0-cp38-cp38-macosx_12_0_arm64.whl", hash = "sha256:6b63ca2b0643d30fbf9d25d93017ed3fb8351f31175d82d104bfec60cba7bb87"},
{file = "scikit_learn-1.2.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:83c772fa8c64776ad769fd764752c8452844307adcf10dee3adcc43988260f21"},
{file = "scikit_learn-1.2.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0834e4cec2a2e0d8978f39cb8fe1cad3be6c27a47927e1774bf5737ea65ec228"},
{file = "scikit_learn-1.2.0-cp38-cp38-win_amd64.whl", hash = "sha256:da29d2e379c396a63af5ed4b671ad2005cd690ac373a23bee5a0f66504e05272"},
{file = "scikit_learn-1.2.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:23a88883ca60c571a06278e4726b3b51b3709cfa4c93cacbf5568b22ba960899"},
{file = "scikit_learn-1.2.0-cp39-cp39-macosx_12_0_arm64.whl", hash = "sha256:40f3ff68c505cb9d1f3693397c73991875d609da905087e00e7b4477645ec67b"},
{file = "scikit_learn-1.2.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e9535e867281ae6987bb80620ba14cf1649e936bfe45f48727b978b7a2dbe835"},
{file = "scikit_learn-1.2.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:de897720173b26842e21bed54362f5294e282422116b61cd931d4f5d870b9855"},
{file = "scikit_learn-1.2.0-cp39-cp39-win_amd64.whl", hash = "sha256:ceb0008f345188aa236e49c973dc160b9ed504a3abd7b321a0ecabcb669be0bd"},
]
[package.dependencies]
joblib = ">=1.1.1"
numpy = ">=1.17.3"
scipy = ">=1.3.2"
threadpoolctl = ">=2.0.0"
[package.extras]
benchmark = ["matplotlib (>=3.1.3)", "memory-profiler (>=0.57.0)", "pandas (>=1.0.5)"]
docs = ["Pillow (>=7.1.2)", "matplotlib (>=3.1.3)", "memory-profiler (>=0.57.0)", "numpydoc (>=1.2.0)", "pandas (>=1.0.5)", "plotly (>=5.10.0)", "pooch (>=1.6.0)", "scikit-image (>=0.16.2)", "seaborn (>=0.9.0)", "sphinx (>=4.0.1)", "sphinx-gallery (>=0.7.0)", "sphinx-prompt (>=1.3.0)", "sphinxext-opengraph (>=0.4.2)"]
examples = ["matplotlib (>=3.1.3)", "pandas (>=1.0.5)", "plotly (>=5.10.0)", "pooch (>=1.6.0)", "scikit-image (>=0.16.2)", "seaborn (>=0.9.0)"]
tests = ["black (>=22.3.0)", "flake8 (>=3.8.2)", "matplotlib (>=3.1.3)", "mypy (>=0.961)", "numpydoc (>=1.2.0)", "pandas (>=1.0.5)", "pooch (>=1.6.0)", "pyamg (>=4.0.0)", "pytest (>=5.3.1)", "pytest-cov (>=2.9.0)", "scikit-image (>=0.16.2)"]
[[package]]
name = "scipy"
version = "1.9.3"
description = "Fundamental algorithms for scientific computing in Python"
category = "main"
optional = false
python-versions = ">=3.8"
files = [
{file = "scipy-1.9.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:1884b66a54887e21addf9c16fb588720a8309a57b2e258ae1c7986d4444d3bc0"},
{file = "scipy-1.9.3-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:83b89e9586c62e787f5012e8475fbb12185bafb996a03257e9675cd73d3736dd"},
{file = "scipy-1.9.3-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1a72d885fa44247f92743fc20732ae55564ff2a519e8302fb7e18717c5355a8b"},
{file = "scipy-1.9.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d01e1dd7b15bd2449c8bfc6b7cc67d630700ed655654f0dfcf121600bad205c9"},
{file = "scipy-1.9.3-cp310-cp310-win_amd64.whl", hash = "sha256:68239b6aa6f9c593da8be1509a05cb7f9efe98b80f43a5861cd24c7557e98523"},
{file = "scipy-1.9.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:b41bc822679ad1c9a5f023bc93f6d0543129ca0f37c1ce294dd9d386f0a21096"},
{file = "scipy-1.9.3-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:90453d2b93ea82a9f434e4e1cba043e779ff67b92f7a0e85d05d286a3625df3c"},
{file = "scipy-1.9.3-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:83c06e62a390a9167da60bedd4575a14c1f58ca9dfde59830fc42e5197283dab"},
{file = "scipy-1.9.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:abaf921531b5aeaafced90157db505e10345e45038c39e5d9b6c7922d68085cb"},
{file = "scipy-1.9.3-cp311-cp311-win_amd64.whl", hash = "sha256:06d2e1b4c491dc7d8eacea139a1b0b295f74e1a1a0f704c375028f8320d16e31"},
{file = "scipy-1.9.3-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:5a04cd7d0d3eff6ea4719371cbc44df31411862b9646db617c99718ff68d4840"},
{file = "scipy-1.9.3-cp38-cp38-macosx_12_0_arm64.whl", hash = "sha256:545c83ffb518094d8c9d83cce216c0c32f8c04aaf28b92cc8283eda0685162d5"},
{file = "scipy-1.9.3-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0d54222d7a3ba6022fdf5773931b5d7c56efe41ede7f7128c7b1637700409108"},
{file = "scipy-1.9.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cff3a5295234037e39500d35316a4c5794739433528310e117b8a9a0c76d20fc"},
{file = "scipy-1.9.3-cp38-cp38-win_amd64.whl", hash = "sha256:2318bef588acc7a574f5bfdff9c172d0b1bf2c8143d9582e05f878e580a3781e"},
{file = "scipy-1.9.3-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:d644a64e174c16cb4b2e41dfea6af722053e83d066da7343f333a54dae9bc31c"},
{file = "scipy-1.9.3-cp39-cp39-macosx_12_0_arm64.whl", hash = "sha256:da8245491d73ed0a994ed9c2e380fd058ce2fa8a18da204681f2fe1f57f98f95"},
{file = "scipy-1.9.3-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4db5b30849606a95dcf519763dd3ab6fe9bd91df49eba517359e450a7d80ce2e"},
{file = "scipy-1.9.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c68db6b290cbd4049012990d7fe71a2abd9ffbe82c0056ebe0f01df8be5436b0"},
{file = "scipy-1.9.3-cp39-cp39-win_amd64.whl", hash = "sha256:5b88e6d91ad9d59478fafe92a7c757d00c59e3bdc3331be8ada76a4f8d683f58"},
{file = "scipy-1.9.3.tar.gz", hash = "sha256:fbc5c05c85c1a02be77b1ff591087c83bc44579c6d2bd9fb798bb64ea5e1a027"},
]
[package.dependencies]
numpy = ">=1.18.5,<1.26.0"
[package.extras]
dev = ["flake8", "mypy", "pycodestyle", "typing_extensions"]
doc = ["matplotlib (>2)", "numpydoc", "pydata-sphinx-theme (==0.9.0)", "sphinx (!=4.1.0)", "sphinx-panels (>=0.5.2)", "sphinx-tabs"]
test = ["asv", "gmpy2", "mpmath", "pytest", "pytest-cov", "pytest-xdist", "scikit-umfpack", "threadpoolctl"]
[[package]] [[package]]
name = "setuptools" name = "setuptools"
version = "65.6.3" version = "65.6.3"
@ -2537,28 +2203,6 @@ files = [
{file = "snowballstemmer-2.2.0.tar.gz", hash = "sha256:09b16deb8547d3412ad7b590689584cd0fe25ec8db3be37788be3810cbf19cb1"}, {file = "snowballstemmer-2.2.0.tar.gz", hash = "sha256:09b16deb8547d3412ad7b590689584cd0fe25ec8db3be37788be3810cbf19cb1"},
] ]
[[package]]
name = "soundfile"
version = "0.11.0"
description = "An audio library based on libsndfile, CFFI and NumPy"
category = "main"
optional = false
python-versions = "*"
files = [
{file = "soundfile-0.11.0-py2.py3-none-any.whl", hash = "sha256:f4e4f832b1958403fb9726eeea54e0ebf1c7fc2599ff296a7ab1ac062f8048c9"},
{file = "soundfile-0.11.0-py2.py3-none-macosx_10_9_arm64.macosx_11_0_arm64.whl", hash = "sha256:9e6a62eefad0a7f856cc8f5ede7f1a0c196b65d2901c00fffc74a3d7e81d89c8"},
{file = "soundfile-0.11.0-py2.py3-none-macosx_10_9_x86_64.macosx_11_0_x86_64.whl", hash = "sha256:12f66fe9dcddedaa6c808bc3e104fc67fcee59dc64214bf7f43605e69836c497"},
{file = "soundfile-0.11.0-py2.py3-none-win32.whl", hash = "sha256:08d9636815692f332e042990d449e79b888d288f0752226d8602e91523a0a29b"},
{file = "soundfile-0.11.0-py2.py3-none-win_amd64.whl", hash = "sha256:a4ab6f66ad222d8e144dcb6abc73fbb867c11da2934b677f9b129778a6c65112"},
{file = "soundfile-0.11.0.tar.gz", hash = "sha256:931738a1c93e8684c2d3e1d514ac63440ce827ec783ea0a2d3e4730e3dc58c18"},
]
[package.dependencies]
cffi = ">=1.0"
[package.extras]
numpy = ["numpy"]
[[package]] [[package]]
name = "stevedore" name = "stevedore"
version = "4.1.1" version = "4.1.1"
@ -2574,18 +2218,6 @@ files = [
[package.dependencies] [package.dependencies]
pbr = ">=2.0.0,<2.1.0 || >2.1.0" pbr = ">=2.0.0,<2.1.0 || >2.1.0"
[[package]]
name = "threadpoolctl"
version = "3.1.0"
description = "threadpoolctl"
category = "main"
optional = false
python-versions = ">=3.6"
files = [
{file = "threadpoolctl-3.1.0-py3-none-any.whl", hash = "sha256:8b99adda265feb6773280df41eece7b2e6561b772d21ffd52e372f999024907b"},
{file = "threadpoolctl-3.1.0.tar.gz", hash = "sha256:a335baacfaa4400ae1f0d8e3a58d6674d2f8828e3716bb2802c44955ad391380"},
]
[[package]] [[package]]
name = "timm" name = "timm"
version = "0.6.12" version = "0.6.12"
@ -3235,4 +2867,4 @@ testing = ["flake8 (<5)", "func-timeout", "jaraco.functools", "jaraco.itertools"
[metadata] [metadata]
lock-version = "2.0" lock-version = "2.0"
python-versions = "^3.10" python-versions = "^3.10"
content-hash = "1bbe73b603795c02917f6cb6c74998bde1bfc6ad54964649c84c7df2e32d8cb0" content-hash = "429ce050fd9e14f457f545b675da882677fcb5d8e955475cb4d41e92e704f526"

View File

@ -45,6 +45,7 @@ diffusers = {extras = ["torch"], version = "^0.11.1"}
timm = "^0.6.12" timm = "^0.6.12"
torchvision = "^0.14.1" torchvision = "^0.14.1"
torchaudio = "^0.13.1" torchaudio = "^0.13.1"
python-dotenv = "^0.21.0"
[tool.poetry.group.dev.dependencies] [tool.poetry.group.dev.dependencies]
bandit = "^1.7.4" bandit = "^1.7.4"

View File

@ -0,0 +1,15 @@
\begin{tikzpicture}[auto,
node distance = 12mm,
start chain = going below,
box/.style = {draw,rounded corners,blur shadow,fill=white, on chain,align=center}]
\node[box] (b1) {$x_1\leftarrow0$\\ $y_1\leftarrow0$};
\node[box] (b2) {$x_2\leftarrow\phi(x_1,x_3)$\\
$y_2\leftarrow\phi(y_1,y_3)$\\
$(x_2<10)$?};
\node[box] (b3) {$y_3\leftarrow y_2+x_2$\\ $x_3\leftarrow x_2+1$};
\node[box] (b4) {print($y_2$)};
\begin{scope}[rounded corners,-latex]
\path (b2.-40) edge[bend left=50] (b4.40) (b1) edge (b2) (b2) edge (b3);
\draw (b3.230) -- ++(0,-0.3) -| ([xshift=-5mm]b2.west) |- ([yshift=3mm]b2.130) -- (b2.130);
\end{scope}
\end{tikzpicture}

View File

@ -0,0 +1,69 @@
\section{Internet-NLP}
This publication will introduce Internet-NLP and its control flow for allowing NLPs to connect to internet, which will replace traditional knowledge bases with the resources on the internet.
\begin{figure}
\begin{center}
\input{control_flow}
\caption{This is an illustration of how Internet-NLP's control flow works.}
\label{fig:ControlFlow}
\end{center}
\end{figure}
The control flow diagram \ref{fig:ControlFlow} shows how Internet-NLP gains its data for NLP tasks and ensures that the scraped data is accurate and not offensive for the NLP task it is being asked to perform; Internet-NLP does this by utilizing several different NLP and NLI models in combination to enable this data collection system. This allows other NLP models to utilize the data for the other NLP tasks that were requested.
Internet-NLP's control flow diagram \ref{fig:ControlFlow} will be explained in the following subsections.
\subsection{NLP Tasks Applicable}
Internet-NLP currently allows for the following NLP tasks without context:
\begin{itemize}[leftmargin=1em]
\item Question Answering
\item Zero-Shot Classification
\item Natural Language Inference
\item Text2Text Generation
\item Conversational (this still in beta and does not completely work)
\end{itemize}
\subsection{Disclaimers}
\subsubsection{Types of English}
At this point in time, Internet-NLP can only fully understand "formal" English \cite{FormalInformal}. Additionally, idioms, similes, and other figures of speech are not understood by Internet-NLP or its models.
\subsubsection{Output of Internet-NLP}
The accuracy of the output of Internet-NLP depends on the data it scrapes, which may not be completely accurate (the chances of this are minimized to an extent by utilizing multiple resources) and may contain profanity or abrasive language, which may or may not affect the output.
\subsection{Common Components of Internet-NLP's Process}
\subsubsection{Answer To Question Text2Text-generator\label{subsubsection:AnswerToQuestion}}
\subsubsection{Search Queries Text2Text-generator \label{subsubsection:search-query}}
The search query generator that enables converting questions into viable search queries utilizes a fastT5 model \cite{2019t5}. It is trained on Reddit and Quora questions (that are non-mathematical, i.e., do not require logical computation) and then passed through a parts-of-speech tagging model and normalizer, wherein the question is optimized for search engines by removing specific details and punctuation \cite{BetterWebSearches}.
The reason for utilizing fastT5 models rather than the parts-of-speech tagging model comes down to efficiency, as fastT5 outperforms the parts-of-speech tagging model \cite{inproceedings, 2019t5}.
\subsubsection{Data Collection \label{subsubsection:DataCollection}}
\subsection{Question Answering}
\subsubsection{Answer to Question Text2Text-generator}
In the case of question answering without context, Internet-NLP only needs one of following:
\begin{itemize}
\item Question
\begin{itemize}
\item In this case, Internet-NLP passes the question through the Search Query Text2Text-generator \ref{subsubsection:search-query}, wherein an optimized search query for search engines is returned. This optimized question will be used for data collection \ref{subsubsection:DataCollection}.
\label{subsubsection:itemize:question}
\end{itemize}
\item Answer
\begin{itemize}
\item In this case, the answer is passed through the Answer to Question Text2Text-generator \ref{subsubsection:AnswerToQuestion}, after which it follows the same process of question optimization explained above in the Question case \ref{subsubsection:itemize:question}.
\end{itemize}
\end{itemize}
\subsubsection{Natural Language Inference Without Premise}

Binary file not shown.

Binary file not shown.

View File

@ -0,0 +1,22 @@
%auto-ignore
\section{Preliminaries}
The preliminaries listed below are the NLP tasks for which Internet-NLP benefits from access to the internet:
\subsection{Question Answering}
The task of training an NLP model is to utilize a question and context (or, in the case of a closed-book ODQA LM, just a question) to create a logical answer. The currently most popular variants are context-needing question answering models, wherein the answer is provided in the context. These models utilize reading comprehension to use the context to construct an answer based on the question \cite{https://doi.org/10.48550/arxiv.2002.08910}.
Closed-book ODQA LMs are a type of question-answering model where no context is provided; these are usually the hardest variant to train, and they result in large sizes, low efficiency, and low accuracy. These models can only be asked context-independent questions such as facts \cite{https://doi.org/10.48550/arxiv.2002.08910}.
The alternative to a closed-book ODQA LM would be utilizing a knowledge base and a retriever for getting the required context from the knowledge base, and then utilizing a context-needing question-answering NLP model; this is known as an open-book question-answering model \cite{https://doi.org/10.48550/arxiv.2002.08910}. This, however, requires a knowledge base, which would be static and hence would not contain the latest information; it additionally requires a large database, and hence the overall solution is also large.
In this publication, Internet-NLP applies a question answering open-book LM with the constraints of not utilizing a knowledge base and keeping the overall solution size low, with high efficiency and high accuracy, while retaining the ability to be asked both context-dependent (by giving the optional context) and context-independent questions. Internet-NLP utilizes the internet to replace the knowledge base, utilizes a retriever to get the required information from the internet data, and then an open-book Text2Text-generation model to create an answer from the information, the question, and any extra context given.
\subsection{Natural Language Inference}
NLI models require a premise (similar to context) and a hypothesis (a prediction) to give one of the following: entailment (the hypothesis is correct based on the premise), neutral (the hypothesis is neither correct nor wrong based on the premise), or contradiction (the hypothesis is wrong based on the premise).
Current no-premise NLI models utilize a knowledge base to reproduce the premise via a retriever and then utilize an NLI model to give the output.
In this publication, Internet-NLP produces the premise based on the hypothesis by converting the hypothesis into a search query (via a Text2Text-generation LM), which will then be scraped for results; these are individually compared to the hypothesis to select only the ones that have either contradiction or entailment, in order to give an output on whether the hypothesis is an entailment or a contradiction. This allows a hypothesis to be checked for being either correct or wrong without a large knowledge base or model.

Binary file not shown.

After

Width:  |  Height:  |  Size: 13 KiB

View File

@ -0,0 +1,56 @@
%auto-ignore
\section{Related Work}
\subsection{Internet-NLP}
\subsubsection{NLP Models with Knowledge Base and Retriver}
These approaches are one of the two most popular current solutions for performing NLP tasks without context. They utilize a knowledge base, a retriever for this data, and an LM depending on the use case, for example (this list is not extensive):
\begin{itemize}[leftmargin=1em]
\item question answering: LinkBERT or T5 \cite{https://doi.org/10.48550/arxiv.2203.15827, https://doi.org/10.48550/arxiv.1910.10683}
\item NLI: CrossEncoder models BERT or DeBERTa \cite{thakur-2020-AugSBERT, https://doi.org/10.48550/arxiv.1810.04805, https://doi.org/10.48550/arxiv.2006.03654}
\end{itemize}
This allows no-context NLP applications (especially question answering) to function without any context given, due to the knowledge base and retriever providing the context. A representation of this is shown in illustration \ref{fig:CurrSolTwoImg} \cite{https://doi.org/10.48550/arxiv.2201.09651}.
\subsection{Internet-NLP's NLP models}
\subsubsection{LinkBERT}
\begin{figure}
\includegraphics[width=1.0\columnwidth]{fig_motivation_v8.pdf}
\caption{This is an illustration of example of how LinkBERT utilizes hyperlinks to make a graph corpus \cite{https://doi.org/10.48550/arxiv.2203.15827}.}
\label{fig:LinkBERTGraphExample}
\end{figure}
\begin{figure}
\includegraphics[width=1.0\columnwidth]{fig_overview_v13.pdf}
\caption{This is an illustration of example of how LinkBERT makes a graph corpus \cite{https://doi.org/10.48550/arxiv.2203.15827}.}
\label{fig:LinkBERTGraphIllustration}
\end{figure}
LinkBERT is an NLP model that is a pre-trained BERT \cite{https://doi.org/10.48550/arxiv.1810.04805} model trained on a graph-based corpus built not only from documents but also from the hyperlinks in those documents. It utilizes a "fusion of graph-based and language-based self-supervised learning" \cite{https://doi.org/10.48550/arxiv.2203.15827}. It achieves better performance on graph-based data corpora than other pre-trained NLP models due to being trained with graph-based self-supervised learning.
These are illustrations that explain LinkBERT's graph-based and language-based fusion:
\begin{itemize}[leftmargin=1em]
\item This illustration shows how hyperlinks can contain crucial information: \ref{fig:LinkBERTGraphExample}.
\item This illustration shows how LinkBERT \cite{https://doi.org/10.48550/arxiv.2203.15827} makes a graph from links: \ref{fig:LinkBERTGraphIllustration}.
\end{itemize}
For training the Internet-NLP and LM for Text2Text-generation for question answering would be utilizing the fusion of graph-based and language-based learning LinkBERT revolutionized \cite{https://doi.org/10.48550/arxiv.2203.15827}.
\subsection{Internet-NLP's NLI models}
\subsubsection{Cross-Encoder NLI Models}
\begin{figure}
\includegraphics[width=1.0\columnwidth]{Bi_vs_Cross-Encoder.png}
\caption{This is an illustration of how NLI using Cross-Encoders vs Bi-Encoder work like \cite{thakur-2020-AugSBERT}.}
\label{fig:CrossEncoderNLI}
\end{figure}
NLI compares two sentences to give an output of entailment (true), neutral, or contradiction (false).
NLI applications can utilize a Cross-Encoder (an illustration of Cross-Encoders is shown in \ref{fig:CrossEncoderNLI}), where two sentences are passed simultaneously, and a classifier then produces an output from 0 to 1, ranging from contradiction to entailment \cite{thakur-2020-AugSBERT, https://doi.org/10.48550/arxiv.1908.10084}.