diff --git a/.gitignore b/.gitignore index 0df3095..3f67b9d 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,4 @@ - +.env # Created by https://www.gitignore.io/api/osx,python,pycharm,windows,visualstudio,visualstudiocode # Edit at https://www.gitignore.io/?templates=osx,python,pycharm,windows,visualstudio,visualstudiocode diff --git a/internet_ml/NLP/no_context/QA.py b/internet_ml/NLP/no_context/QA.py index e69de29..b21602b 100644 --- a/internet_ml/NLP/no_context/QA.py +++ b/internet_ml/NLP/no_context/QA.py @@ -0,0 +1,17 @@ +import sys +from pathlib import Path + +from transformers import pipeline + +sys.path.append(str(Path(__file__).parent.parent.parent) + "/tools/NLP/data") +import internet + +qa_model = pipeline("question-answering") +question = "Who is Elon Musk?" +a = internet.google(question)[0] +print(a) +context = "" +for i in a: + context += str(i) +print(qa_model(question=question, context=context)) +## {'answer': 'İstanbul', 'end': 39, 'score': 0.953, 'start': 31} diff --git a/internet_ml/NLP/no_context/internet_cache.pkl b/internet_ml/NLP/no_context/internet_cache.pkl new file mode 100644 index 0000000..be8b024 Binary files /dev/null and b/internet_ml/NLP/no_context/internet_cache.pkl differ diff --git a/internet_ml/NLP/no_context/is_relevant_cache.pkl b/internet_ml/NLP/no_context/is_relevant_cache.pkl new file mode 100644 index 0000000..ce99a77 Binary files /dev/null and b/internet_ml/NLP/no_context/is_relevant_cache.pkl differ diff --git a/internet_ml/NLP/no_context/test.py b/internet_ml/NLP/no_context/test.py new file mode 100644 index 0000000..1ec666a --- /dev/null +++ b/internet_ml/NLP/no_context/test.py @@ -0,0 +1,4 @@ +import sys +from pathlib import Path + +print() diff --git a/internet_ml/tools/NLP/data/internet.py b/internet_ml/tools/NLP/data/internet.py index 3b35073..ad3a7c5 100644 --- a/internet_ml/tools/NLP/data/internet.py +++ b/internet_ml/tools/NLP/data/internet.py @@ -1,256 +1,61 @@ -#type: ignore -from typing import Any, Dict, 
List, Tuple +# type: ignore +from typing import List import asyncio -import logging -import re -import time -import urllib +import functools +import multiprocessing +import os import aiohttp -from bs4 import BeautifulSoup +import dotenv +import requests -# Set up logging -logging.basicConfig( - filename="internet.log", - filemode="w", - level=logging.INFO, - format="%(name)s - %(levelname)s - %(message)s", -) +dotenv.load_dotenv() -# import concurrent.futures - -# Import the config module -import sys -from pathlib import Path - -sys.path.append(str(Path(__file__).parent.parent.parent.parent) + "/utils/NLP") -sys.path.append(str(Path(__file__).parent.parent.parent.parent) + "/utils") -import config - -sys.path.append(str(Path(__file__).parent.parent)) -import pickle - -from is_relevant import filter_irrelevant -from normalize import normalizer -from sentencize import sentencizer -from urlextract import URLExtract - -# Define the user agent HTTP_USERAGENT: dict[str, str] = { - "User-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3" -} -# Define the google domains -UNWANTED_DOMAINS = { - "https://www.google.", - "https://google.", - "https://webcache.googleusercontent.", - "http://webcache.googleusercontent.", - "https://policies.google.", - "https://support.google.", - "https://maps.google.", - "https://youtube.", - "https://translate.google.", + "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3" } -CACHE_FILE_PATH: str = "./internet_cache.pkl" -CACHE_TIME: int = 86400 # one day -URL_EXTRACTOR = URLExtract() - -# Load the cache from the file (if it exists) -try: - with open(CACHE_FILE_PATH, "rb") as f: - cache: Any = pickle.load(f) -except FileNotFoundError: - cache: Any = {} - - -# Define the fetch_url function -async def fetch_url(session: aiohttp.ClientSession, url: str) -> str: - global HTTP_USERAGENT - async 
with session.get(url, headers=HTTP_USERAGENT) as response: - return await response.text() - - -# Define the google_urls function -async def google_urls(query: str, links: list[str]) -> list[str]: - """ - Asynchronously search Google for the given query and retrieve the URLs of the top results. - - Parameters: - query (str): The query to search for. - - Returns: - List[str]: A list of the URLs of the top search results. - """ - global UNWANTED_DOMAINS - # Initialize an empty list to store the URLs - urls: list[str] = links - - # Determine the number of results to retrieve based on the configuration mode - num_of_res: int = ( - 5 - if config.NLP_CONF_MODE == "speed" - else (20 if config.NLP_CONF_MODE == "accuracy" else 10) +def google_urls(query: str, links: list[str]) -> list[str]: + # Send the request to the Google Search API + response = requests.get( + "https://www.googleapis.com/customsearch/v1", + params={ + "key": os.environ["API_KEY"], + "q": query, + "cx": os.environ["SEARCH_ENGINE_ID"], + }, ) - - # Log the number of results wanted (if debugging is enabled) - if config.NLP_CONF_DEBUG: - logging.info(f"number of results wanted: {num_of_res}") - - # Construct the search URL - search_url: str = ( - "https://www.google.com/search?q=" - + str(urllib.parse.quote_plus(query)) - + "&num=" - + str(num_of_res) - ) - - # Log the search URL (if debugging is enabled) - if config.NLP_CONF_DEBUG: - logging.info(f"url: {search_url}") - - # Create an aiohttp session and use it to fetch the search results - async with aiohttp.ClientSession() as session: - response: str = await fetch_url(session, search_url) - - # Wait 10 seconds before parsing the results (to avoid being rate-limited) - await asyncio.sleep(10.0) - - # Parse the search results using BeautifulSoup - soup: BeautifulSoup = BeautifulSoup(response, "html.parser") - - # Iterate over the links in the search results - for link in list(soup.select("a[href]")): - # Extract the URL from the link - url = 
str(link["href"]) - - # Check if the URL is valid and not a Google or YouTube link - if ("http" in url) and ( - not any(url.startswith(s) for s in UNWANTED_DOMAINS) - ): - urls.append(url) - if config.NLP_CONF_DEBUG: - logging.info(f"added {url}") - if len(urls) == num_of_res: - break - return urls + results = response.json()["items"] + # Print the search results + for result in results: + links.append(result["link"]) + return links -async def fetch_url_text( - session: aiohttp.ClientSession, url: str, query: str -) -> list[str]: - """ - Extract the text from the given HTML content. +class LinkFetcher: + def __init__(self, urls): + self.urls = urls - Parameters: - session (aiohttp.ClientSession): aiohttp session - url (str): The url content to get text from. - - Returns: - str: The extracted text. - """ - global HTTP_USERAGENT - try: + async def fetch(self, session, url): async with session.get(url, headers=HTTP_USERAGENT) as response: - soup: BeautifulSoup = BeautifulSoup(await response.text(), "html.parser") - text = normalizer(soup.get_text()) - if config.NLP_CONF_DEBUG: - logging.info(f"Text: {text}") - sentences: list[str] = sentencizer(text) - sentences = filter_irrelevant(sentences, query) - return sentences - except Exception as e: - # Log the error and continue execution - logging.error(f"Error occurred: {e}") - return [] + return await response.text() + + async def main(self, session): + tasks = [asyncio.ensure_future(self.fetch(session, url)) for url in self.urls] + responses = await asyncio.gather(*tasks) + return responses -def flatten(l): - return [item for sublist in l for item in sublist] +def fetch_content(urls: list[str]): + fetcher = LinkFetcher(urls) + with aiohttp.ClientSession() as session: + with multiprocessing.Pool(processes=5) as pool: + contents = list(pool.map(functools.partial(fetcher.main), [session])) + return contents -async def get_text_content(urls: list[str], query: str) -> list[str]: - # Create a list to store the results - 
results: list[str] = [] - # Create an aiohttp session - async with aiohttp.ClientSession() as session: - # Create a list of tasks to run concurrently - tasks: list[Any] = [ - asyncio.create_task(fetch_url_text(session, url, query)) for url in urls - ] - # Use asyncio.gather to run the tasks concurrently - results = await asyncio.gather(*tasks) - sentences: list[str] = flatten(results) - return sentences - - -def google(query: str) -> Tuple[List[str], str]: - global cache, CACHE_FILE_PATH, CACHE_TIME, URL_EXTRACTOR - links_in_text: list[str] = URL_EXTRACTOR.find_urls(query) - query = re.sub(r"\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*", "", query) - entry = cache.get(query) - if entry is None: - # no query exists, so add a new entry to the cache - urls: List[str] = asyncio.run(google_urls(query, links_in_text)) - text: str = str(asyncio.run(get_text_content(urls, query))) - cache[query]: Tuple[Tuple[List[str], str], int] = ( - (text, urls), - time.time() + CACHE_TIME, - ) # cache expires in one hour - elif entry[1] < time.time(): - # update as it expired - urls: List[str] = asyncio.run(google_urls(query, links_in_text)) - text: str = str(asyncio.run(get_text_content(urls, query))) - cache[query]: Tuple[Tuple[List[str], str], int] = ( - (text, urls), - time.time() + CACHE_TIME, - ) # cache expires in one hour - else: - # available so return it - text: List[str] = entry[0][0] - urls: str = entry[0][1] - # Save the cache to the file - with open(CACHE_FILE_PATH, "wb") as f: - pickle.dump(cache, f) - # Return the text - return (text, urls) - - -""" -async + multithreading since web scraping is I/O bound -https://stackoverflow.com/questions/27435284/multiprocessing-vs-multithreading-vs-asyncio -normal -________________________________________________________ -Executed in 1.67 secs fish external - usr time 137.29 millis 0.11 millis 137.18 millis - sys time 38.39 millis 1.25 millis 37.13 millis -Async -________________________________________________________ -Executed 
in 624.82 millis fish external - usr time 141.92 millis 0.11 millis 141.81 millis - sys time 38.00 millis 1.45 millis 36.55 millis - -concurrent -________________________________________________________ -Executed in 629.67 millis fish external - usr time 136.72 millis 0.12 millis 136.60 millis - sys time 36.86 millis 1.32 millis 35.54 millis - -multiprocessing -________________________________________________________ -Executed in 754.61 millis fish external - usr time 399.25 millis 0.11 millis 399.14 millis - sys time 164.39 millis 1.49 millis 162.90 millis - -multiprocessing - -OVERALL -multithreading bs4 -________________________________________________________ -Executed in 14.67 secs fish external - usr time 1.81 secs 0.12 millis 1.81 secs - sys time 0.14 secs 1.50 millis 0.14 secs -multiprocessing bs4 -""" +a = google_urls("Who is Neil Armstrong", []) +print(a) +print(fetch_content(a)) diff --git a/internet_ml/tools/NLP/data/internet_cache.pkl b/internet_ml/tools/NLP/data/internet_cache.pkl index b90d6a1..ce443c8 100644 Binary files a/internet_ml/tools/NLP/data/internet_cache.pkl and b/internet_ml/tools/NLP/data/internet_cache.pkl differ diff --git a/internet_ml/tools/NLP/data/is_relevant_cache.pkl b/internet_ml/tools/NLP/data/is_relevant_cache.pkl index b0ffe68..7f23ad0 100644 Binary files a/internet_ml/tools/NLP/data/is_relevant_cache.pkl and b/internet_ml/tools/NLP/data/is_relevant_cache.pkl differ diff --git a/poetry.lock b/poetry.lock index 958b0c2..b076395 100644 --- a/poetry.lock +++ b/poetry.lock @@ -153,18 +153,6 @@ files = [ [package.dependencies] frozenlist = ">=1.1.0" -[[package]] -name = "appdirs" -version = "1.4.4" -description = "A small Python module for determining appropriate platform-specific dirs, e.g. a \"user data dir\"." 
-category = "main" -optional = false -python-versions = "*" -files = [ - {file = "appdirs-1.4.4-py2.py3-none-any.whl", hash = "sha256:a841dacd6b99318a741b166adb07e19ee71a274450e68237b4650ca1055ab128"}, - {file = "appdirs-1.4.4.tar.gz", hash = "sha256:7d5d0167b2b1ba821647616af46a749d1c653740dd0d2415100fe26e27afdf41"}, -] - [[package]] name = "astroid" version = "2.12.13" @@ -215,17 +203,6 @@ docs = ["furo", "myst-parser", "sphinx", "sphinx-notfound-page", "sphinxcontrib- tests = ["attrs[tests-no-zope]", "zope.interface"] tests-no-zope = ["cloudpickle", "cloudpickle", "hypothesis", "hypothesis", "mypy (>=0.971,<0.990)", "mypy (>=0.971,<0.990)", "pympler", "pympler", "pytest (>=4.3.0)", "pytest (>=4.3.0)", "pytest-mypy-plugins", "pytest-mypy-plugins", "pytest-xdist[psutil]", "pytest-xdist[psutil]"] -[[package]] -name = "audioread" -version = "3.0.0" -description = "multi-library, cross-platform audio decoding" -category = "main" -optional = false -python-versions = ">=3.6" -files = [ - {file = "audioread-3.0.0.tar.gz", hash = "sha256:121995bd207eb1fda3d566beb851d3534275925bc35a4fb6da0cb11de0f7251a"}, -] - [[package]] name = "bandit" version = "1.7.4" @@ -296,83 +273,6 @@ files = [ {file = "certifi-2022.12.7.tar.gz", hash = "sha256:35824b4c3a97115964b408844d64aa14db1cc518f6562e8d7261699d1350a9e3"}, ] -[[package]] -name = "cffi" -version = "1.15.1" -description = "Foreign Function Interface for Python calling C code." 
-category = "main" -optional = false -python-versions = "*" -files = [ - {file = "cffi-1.15.1-cp27-cp27m-macosx_10_9_x86_64.whl", hash = "sha256:a66d3508133af6e8548451b25058d5812812ec3798c886bf38ed24a98216fab2"}, - {file = "cffi-1.15.1-cp27-cp27m-manylinux1_i686.whl", hash = "sha256:470c103ae716238bbe698d67ad020e1db9d9dba34fa5a899b5e21577e6d52ed2"}, - {file = "cffi-1.15.1-cp27-cp27m-manylinux1_x86_64.whl", hash = "sha256:9ad5db27f9cabae298d151c85cf2bad1d359a1b9c686a275df03385758e2f914"}, - {file = "cffi-1.15.1-cp27-cp27m-win32.whl", hash = "sha256:b3bbeb01c2b273cca1e1e0c5df57f12dce9a4dd331b4fa1635b8bec26350bde3"}, - {file = "cffi-1.15.1-cp27-cp27m-win_amd64.whl", hash = "sha256:e00b098126fd45523dd056d2efba6c5a63b71ffe9f2bbe1a4fe1716e1d0c331e"}, - {file = "cffi-1.15.1-cp27-cp27mu-manylinux1_i686.whl", hash = "sha256:d61f4695e6c866a23a21acab0509af1cdfd2c013cf256bbf5b6b5e2695827162"}, - {file = "cffi-1.15.1-cp27-cp27mu-manylinux1_x86_64.whl", hash = "sha256:ed9cb427ba5504c1dc15ede7d516b84757c3e3d7868ccc85121d9310d27eed0b"}, - {file = "cffi-1.15.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:39d39875251ca8f612b6f33e6b1195af86d1b3e60086068be9cc053aa4376e21"}, - {file = "cffi-1.15.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:285d29981935eb726a4399badae8f0ffdff4f5050eaa6d0cfc3f64b857b77185"}, - {file = "cffi-1.15.1-cp310-cp310-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3eb6971dcff08619f8d91607cfc726518b6fa2a9eba42856be181c6d0d9515fd"}, - {file = "cffi-1.15.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:21157295583fe8943475029ed5abdcf71eb3911894724e360acff1d61c1d54bc"}, - {file = "cffi-1.15.1-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5635bd9cb9731e6d4a1132a498dd34f764034a8ce60cef4f5319c0541159392f"}, - {file = "cffi-1.15.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = 
"sha256:2012c72d854c2d03e45d06ae57f40d78e5770d252f195b93f581acf3ba44496e"}, - {file = "cffi-1.15.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dd86c085fae2efd48ac91dd7ccffcfc0571387fe1193d33b6394db7ef31fe2a4"}, - {file = "cffi-1.15.1-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:fa6693661a4c91757f4412306191b6dc88c1703f780c8234035eac011922bc01"}, - {file = "cffi-1.15.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:59c0b02d0a6c384d453fece7566d1c7e6b7bae4fc5874ef2ef46d56776d61c9e"}, - {file = "cffi-1.15.1-cp310-cp310-win32.whl", hash = "sha256:cba9d6b9a7d64d4bd46167096fc9d2f835e25d7e4c121fb2ddfc6528fb0413b2"}, - {file = "cffi-1.15.1-cp310-cp310-win_amd64.whl", hash = "sha256:ce4bcc037df4fc5e3d184794f27bdaab018943698f4ca31630bc7f84a7b69c6d"}, - {file = "cffi-1.15.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:3d08afd128ddaa624a48cf2b859afef385b720bb4b43df214f85616922e6a5ac"}, - {file = "cffi-1.15.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:3799aecf2e17cf585d977b780ce79ff0dc9b78d799fc694221ce814c2c19db83"}, - {file = "cffi-1.15.1-cp311-cp311-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a591fe9e525846e4d154205572a029f653ada1a78b93697f3b5a8f1f2bc055b9"}, - {file = "cffi-1.15.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3548db281cd7d2561c9ad9984681c95f7b0e38881201e157833a2342c30d5e8c"}, - {file = "cffi-1.15.1-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:91fc98adde3d7881af9b59ed0294046f3806221863722ba7d8d120c575314325"}, - {file = "cffi-1.15.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:94411f22c3985acaec6f83c6df553f2dbe17b698cc7f8ae751ff2237d96b9e3c"}, - {file = "cffi-1.15.1-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:03425bdae262c76aad70202debd780501fabeaca237cdfddc008987c0e0f59ef"}, - {file = "cffi-1.15.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = 
"sha256:cc4d65aeeaa04136a12677d3dd0b1c0c94dc43abac5860ab33cceb42b801c1e8"}, - {file = "cffi-1.15.1-cp311-cp311-win32.whl", hash = "sha256:a0f100c8912c114ff53e1202d0078b425bee3649ae34d7b070e9697f93c5d52d"}, - {file = "cffi-1.15.1-cp311-cp311-win_amd64.whl", hash = "sha256:04ed324bda3cda42b9b695d51bb7d54b680b9719cfab04227cdd1e04e5de3104"}, - {file = "cffi-1.15.1-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:50a74364d85fd319352182ef59c5c790484a336f6db772c1a9231f1c3ed0cbd7"}, - {file = "cffi-1.15.1-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e263d77ee3dd201c3a142934a086a4450861778baaeeb45db4591ef65550b0a6"}, - {file = "cffi-1.15.1-cp36-cp36m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:cec7d9412a9102bdc577382c3929b337320c4c4c4849f2c5cdd14d7368c5562d"}, - {file = "cffi-1.15.1-cp36-cp36m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:4289fc34b2f5316fbb762d75362931e351941fa95fa18789191b33fc4cf9504a"}, - {file = "cffi-1.15.1-cp36-cp36m-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:173379135477dc8cac4bc58f45db08ab45d228b3363adb7af79436135d028405"}, - {file = "cffi-1.15.1-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:6975a3fac6bc83c4a65c9f9fcab9e47019a11d3d2cf7f3c0d03431bf145a941e"}, - {file = "cffi-1.15.1-cp36-cp36m-win32.whl", hash = "sha256:2470043b93ff09bf8fb1d46d1cb756ce6132c54826661a32d4e4d132e1977adf"}, - {file = "cffi-1.15.1-cp36-cp36m-win_amd64.whl", hash = "sha256:30d78fbc8ebf9c92c9b7823ee18eb92f2e6ef79b45ac84db507f52fbe3ec4497"}, - {file = "cffi-1.15.1-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:198caafb44239b60e252492445da556afafc7d1e3ab7a1fb3f0584ef6d742375"}, - {file = "cffi-1.15.1-cp37-cp37m-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:5ef34d190326c3b1f822a5b7a45f6c4535e2f47ed06fec77d3d799c450b2651e"}, - {file = "cffi-1.15.1-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:8102eaf27e1e448db915d08afa8b41d6c7ca7a04b7d73af6514df10a3e74bd82"}, - {file = "cffi-1.15.1-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5df2768244d19ab7f60546d0c7c63ce1581f7af8b5de3eb3004b9b6fc8a9f84b"}, - {file = "cffi-1.15.1-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a8c4917bd7ad33e8eb21e9a5bbba979b49d9a97acb3a803092cbc1133e20343c"}, - {file = "cffi-1.15.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0e2642fe3142e4cc4af0799748233ad6da94c62a8bec3a6648bf8ee68b1c7426"}, - {file = "cffi-1.15.1-cp37-cp37m-win32.whl", hash = "sha256:e229a521186c75c8ad9490854fd8bbdd9a0c9aa3a524326b55be83b54d4e0ad9"}, - {file = "cffi-1.15.1-cp37-cp37m-win_amd64.whl", hash = "sha256:a0b71b1b8fbf2b96e41c4d990244165e2c9be83d54962a9a1d118fd8657d2045"}, - {file = "cffi-1.15.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:320dab6e7cb2eacdf0e658569d2575c4dad258c0fcc794f46215e1e39f90f2c3"}, - {file = "cffi-1.15.1-cp38-cp38-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1e74c6b51a9ed6589199c787bf5f9875612ca4a8a0785fb2d4a84429badaf22a"}, - {file = "cffi-1.15.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a5c84c68147988265e60416b57fc83425a78058853509c1b0629c180094904a5"}, - {file = "cffi-1.15.1-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3b926aa83d1edb5aa5b427b4053dc420ec295a08e40911296b9eb1b6170f6cca"}, - {file = "cffi-1.15.1-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:87c450779d0914f2861b8526e035c5e6da0a3199d8f1add1a665e1cbc6fc6d02"}, - {file = "cffi-1.15.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4f2c9f67e9821cad2e5f480bc8d83b8742896f1242dba247911072d4fa94c192"}, - {file = "cffi-1.15.1-cp38-cp38-win32.whl", hash = "sha256:8b7ee99e510d7b66cdb6c593f21c043c248537a32e0bedf02e01e9553a172314"}, - {file = 
"cffi-1.15.1-cp38-cp38-win_amd64.whl", hash = "sha256:00a9ed42e88df81ffae7a8ab6d9356b371399b91dbdf0c3cb1e84c03a13aceb5"}, - {file = "cffi-1.15.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:54a2db7b78338edd780e7ef7f9f6c442500fb0d41a5a4ea24fff1c929d5af585"}, - {file = "cffi-1.15.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:fcd131dd944808b5bdb38e6f5b53013c5aa4f334c5cad0c72742f6eba4b73db0"}, - {file = "cffi-1.15.1-cp39-cp39-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7473e861101c9e72452f9bf8acb984947aa1661a7704553a9f6e4baa5ba64415"}, - {file = "cffi-1.15.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6c9a799e985904922a4d207a94eae35c78ebae90e128f0c4e521ce339396be9d"}, - {file = "cffi-1.15.1-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3bcde07039e586f91b45c88f8583ea7cf7a0770df3a1649627bf598332cb6984"}, - {file = "cffi-1.15.1-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:33ab79603146aace82c2427da5ca6e58f2b3f2fb5da893ceac0c42218a40be35"}, - {file = "cffi-1.15.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5d598b938678ebf3c67377cdd45e09d431369c3b1a5b331058c338e201f12b27"}, - {file = "cffi-1.15.1-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:db0fbb9c62743ce59a9ff687eb5f4afbe77e5e8403d6697f7446e5f609976f76"}, - {file = "cffi-1.15.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:98d85c6a2bef81588d9227dde12db8a7f47f639f4a17c9ae08e773aa9c697bf3"}, - {file = "cffi-1.15.1-cp39-cp39-win32.whl", hash = "sha256:40f4774f5a9d4f5e344f31a32b5096977b5d48560c5592e2f3d2c4374bd543ee"}, - {file = "cffi-1.15.1-cp39-cp39-win_amd64.whl", hash = "sha256:70df4e3b545a17496c9b3f41f5115e69a4f2e77e94e1d2a8e1070bc0c38c8a3c"}, - {file = "cffi-1.15.1.tar.gz", hash = "sha256:d400bfb9a37b1351253cb402671cea7e89bdecc294e8016a707f6d1d8ac934f9"}, -] - -[package.dependencies] -pycparser = "*" - [[package]] name = "cfgv" 
version = "3.3.1" @@ -553,7 +453,6 @@ aiohttp = "*" dill = "<0.3.7" fsspec = {version = ">=2021.11.1", extras = ["http"]} huggingface-hub = ">=0.2.0,<1.0.0" -librosa = {version = "*", optional = true, markers = "extra == \"audio\""} multiprocess = "*" numpy = ">=1.17" packaging = "*" @@ -580,18 +479,6 @@ tests = ["Pillow (>=6.2.1)", "absl-py", "apache-beam (>=2.26.0)", "elasticsearch torch = ["torch"] vision = ["Pillow (>=6.2.1)"] -[[package]] -name = "decorator" -version = "5.1.1" -description = "Decorators for Humans" -category = "main" -optional = false -python-versions = ">=3.5" -files = [ - {file = "decorator-5.1.1-py3-none-any.whl", hash = "sha256:b8c3f85900b9dc423225913c5aace94729fe1fa9763b38939a95226f02d37186"}, - {file = "decorator-5.1.1.tar.gz", hash = "sha256:637996211036b6385ef91435e4fae22989472f9d571faba8927ba8253acbc330"}, -] - [[package]] name = "diffusers" version = "0.11.1" @@ -1002,18 +889,6 @@ MarkupSafe = ">=2.0" [package.extras] i18n = ["Babel (>=2.7)"] -[[package]] -name = "joblib" -version = "1.2.0" -description = "Lightweight pipelining with Python functions" -category = "main" -optional = false -python-versions = ">=3.7" -files = [ - {file = "joblib-1.2.0-py3-none-any.whl", hash = "sha256:091138ed78f800342968c523bdde947e7a305b8594b910a0fea2ab83c3c6d385"}, - {file = "joblib-1.2.0.tar.gz", hash = "sha256:e1cee4a79e4af22881164f218d4311f60074197fb707e082e803b61f6d137018"}, -] - [[package]] name = "lazy-object-proxy" version = "1.8.0" @@ -1043,62 +918,6 @@ files = [ {file = "lazy_object_proxy-1.8.0-pp39-pypy39_pp73-any.whl", hash = "sha256:ce58b2b3734c73e68f0e30e4e725264d4d6be95818ec0a0be4bb6bf9a7e79aa8"}, ] -[[package]] -name = "librosa" -version = "0.9.2" -description = "Python module for audio and music processing" -category = "main" -optional = false -python-versions = ">=3.6" -files = [ - {file = "librosa-0.9.2-py3-none-any.whl", hash = "sha256:322a813e6d37af9fbc369e6a637dcf5fdc5c6925ce806a0d27c68de61a81350f"}, - {file = 
"librosa-0.9.2.tar.gz", hash = "sha256:5b576b5efdce428e90bc988bdd5a953d12a727e5f931f30d74c53b63abbe3c89"}, -] - -[package.dependencies] -audioread = ">=2.1.9" -decorator = ">=4.0.10" -joblib = ">=0.14" -numba = ">=0.45.1" -numpy = ">=1.17.0" -packaging = ">=20.0" -pooch = ">=1.0" -resampy = ">=0.2.2" -scikit-learn = ">=0.19.1" -scipy = ">=1.2.0" -soundfile = ">=0.10.2" - -[package.extras] -display = ["matplotlib (>=3.3.0)"] -docs = ["ipython (>=7.0)", "matplotlib (>=3.3.0)", "mir-eval (>=0.5)", "numba (<0.50)", "numpydoc", "presets", "sphinx (!=1.3.1)", "sphinx-gallery (>=0.7)", "sphinx-multiversion (>=0.2.3)", "sphinx-rtd-theme (>=1.0.0,<2.0.0)", "sphinxcontrib-svg2pdfconverter"] -tests = ["contextlib2", "matplotlib (>=3.3.0)", "pytest", "pytest-cov", "pytest-mpl", "samplerate", "soxr"] - -[[package]] -name = "llvmlite" -version = "0.34.0" -description = "lightweight wrapper around basic LLVM functionality" -category = "main" -optional = false -python-versions = ">=3.6" -files = [ - {file = "llvmlite-0.34.0-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:11342e5ac320c953590bdd9d0dec8c52f4b5252c4c6335ba25f1e7b9f91f9325"}, - {file = "llvmlite-0.34.0-cp36-cp36m-manylinux2010_i686.whl", hash = "sha256:5bdf0ce430adfaf938ced5844d12f80616eb8321b5b9edfc45ef84ada5c5242c"}, - {file = "llvmlite-0.34.0-cp36-cp36m-manylinux2010_x86_64.whl", hash = "sha256:e08d9d2dc5a31636bfc6b516d2d7daba95632afa3419eb8730dc76a7951e9558"}, - {file = "llvmlite-0.34.0-cp36-cp36m-win32.whl", hash = "sha256:9ff1dcdad03be0cf953aca5fc8cffdca25ccee2ec9e8ec7e95571722cdc02d55"}, - {file = "llvmlite-0.34.0-cp36-cp36m-win_amd64.whl", hash = "sha256:5acdc3c3c7ea0ef7a1a6b442272e05d695bc8492e5b07666135ed1cfbf4ab9d2"}, - {file = "llvmlite-0.34.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:bb96989bc57a1ccb131e7a0e061d07b68139b6f81a98912345d53d9239e231e1"}, - {file = "llvmlite-0.34.0-cp37-cp37m-manylinux2010_i686.whl", hash = "sha256:6d3f81992f52a94077e7b9b16497029daf5b5eebb2cce56f3c8345bbc9c6308e"}, 
- {file = "llvmlite-0.34.0-cp37-cp37m-manylinux2010_x86_64.whl", hash = "sha256:d841248d1c630426c93e3eb3f8c45bca0dab77c09faeb7553b1a500220e362ce"}, - {file = "llvmlite-0.34.0-cp37-cp37m-win32.whl", hash = "sha256:408b15ffec30696406e821c89da010f1bb1eb0aa572be4561c98eb2536d610ab"}, - {file = "llvmlite-0.34.0-cp37-cp37m-win_amd64.whl", hash = "sha256:5d1f370bf150db7239204f09cf6a0603292ea28bac984e69b167e16fe160d803"}, - {file = "llvmlite-0.34.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:132322bc084abf336c80dd106f9357978c8c085911fb656898d3be0d9ff057ea"}, - {file = "llvmlite-0.34.0-cp38-cp38-manylinux2010_i686.whl", hash = "sha256:8f344102745fceba6eb5bf03c228bb290e9bc79157e9506a4a72878d636f9b3c"}, - {file = "llvmlite-0.34.0-cp38-cp38-manylinux2010_x86_64.whl", hash = "sha256:05253f3f44fab0148276335b2c1b2c4a78143dfa78e6bafd7f937d6248f297cc"}, - {file = "llvmlite-0.34.0-cp38-cp38-win32.whl", hash = "sha256:28264f9e2b3df4135cbcfca5a91c5b0b31dd3fc02fa623b4bb13327f0cd4fc80"}, - {file = "llvmlite-0.34.0-cp38-cp38-win_amd64.whl", hash = "sha256:964f8f7a2184963cb3617d057c2382575953e488b7bb061b632ee014cfef110a"}, - {file = "llvmlite-0.34.0.tar.gz", hash = "sha256:f03ee0d19bca8f2fe922bb424a909d05c28411983b0c2bc58b020032a0d11f63"}, -] - [[package]] name = "markdown" version = "3.3.7" @@ -1405,37 +1224,6 @@ files = [ [package.dependencies] setuptools = "*" -[[package]] -name = "numba" -version = "0.51.2" -description = "compiling Python code using LLVM" -category = "main" -optional = false -python-versions = ">=3.6" -files = [ - {file = "numba-0.51.2-cp36-cp36m-macosx_10_14_x86_64.whl", hash = "sha256:af798310eeb318c56cdb83254abbe9a938cc0182d08671d7f9f032dc817e064d"}, - {file = "numba-0.51.2-cp36-cp36m-manylinux2014_i686.whl", hash = "sha256:93e18350f2094e7432321c1275730a3143b94af012fb609cc180fa376c44867f"}, - {file = "numba-0.51.2-cp36-cp36m-manylinux2014_x86_64.whl", hash = "sha256:9e2bb1f129bfadd757ad7a9c18ab79c3ab25ce6d6a68e58565d6c52ad07b3566"}, - {file = 
"numba-0.51.2-cp36-cp36m-win32.whl", hash = "sha256:31cdf6b6d1301d5fb6c4fcb8b4c711ba5c9f60ba2fca008b550da9b56185367c"}, - {file = "numba-0.51.2-cp36-cp36m-win_amd64.whl", hash = "sha256:df6edca13c04a31fdb5addf5205199478a7da372712829157ef491e8a6e7031f"}, - {file = "numba-0.51.2-cp37-cp37m-macosx_10_14_x86_64.whl", hash = "sha256:a628122dacfcba9a3ea68a9e95578c6b6391016e34962c46550ea8e189e0412e"}, - {file = "numba-0.51.2-cp37-cp37m-manylinux2014_i686.whl", hash = "sha256:106736d5a8dab6bebce989d4ab1b3f169c264582598f172e6e5b736210d2e834"}, - {file = "numba-0.51.2-cp37-cp37m-manylinux2014_x86_64.whl", hash = "sha256:a12f16fdb4ca5edc94e2ef412e4e768c29217ef9b6fdfc237d064ebe30acfe14"}, - {file = "numba-0.51.2-cp37-cp37m-win32.whl", hash = "sha256:025b033fd31c44bba17802293c81270084b5454b5b055b8c10c394385c232f00"}, - {file = "numba-0.51.2-cp37-cp37m-win_amd64.whl", hash = "sha256:081788f584fa500339e9b74bf02e3c5029d408c114e555ada19cae0b92721416"}, - {file = "numba-0.51.2-cp38-cp38-macosx_10_14_x86_64.whl", hash = "sha256:5416b584183fd599afda11b947b64f89450fcf26a9c15b408167f412b98a3a94"}, - {file = "numba-0.51.2-cp38-cp38-manylinux2014_i686.whl", hash = "sha256:05da65dca2ac28a192c9d8f20e9e477eb1237205cfc4d131c414f5f8092c6639"}, - {file = "numba-0.51.2-cp38-cp38-manylinux2014_x86_64.whl", hash = "sha256:aee435e3b7e465dd49971f8ea76aa414532a87736916cb399534e017334d1138"}, - {file = "numba-0.51.2-cp38-cp38-win32.whl", hash = "sha256:bbbe2432433b11d3fadab0226a84c1a81918cb905ba1aeb022249e8d2ba8856c"}, - {file = "numba-0.51.2-cp38-cp38-win_amd64.whl", hash = "sha256:259e7c15b24feec4a99fb41eb8c47b5ad49b544d1a5ad40ad0252ef531ba06fd"}, - {file = "numba-0.51.2.tar.gz", hash = "sha256:16bd59572114adbf5f600ea383880d7b2071ae45477e84a24994e089ea390768"}, -] - -[package.dependencies] -llvmlite = ">=0.34.0.dev0,<0.35" -numpy = ">=1.15" -setuptools = "*" - [[package]] name = "numpy" version = "1.24.0" @@ -1733,28 +1521,6 @@ files = [ dev = ["pre-commit", "tox"] testing = ["pytest", 
"pytest-benchmark"] -[[package]] -name = "pooch" -version = "1.6.0" -description = "\"Pooch manages your Python library's sample data files: it automatically downloads and stores them in a local directory, with support for versioning and corruption checks.\"" -category = "main" -optional = false -python-versions = ">=3.6" -files = [ - {file = "pooch-1.6.0-py3-none-any.whl", hash = "sha256:3bf0e20027096836b8dbce0152dbb785a269abeb621618eb4bdd275ff1e23c9c"}, - {file = "pooch-1.6.0.tar.gz", hash = "sha256:57d20ec4b10dd694d2b05bb64bc6b109c6e85a6c1405794ce87ed8b341ab3f44"}, -] - -[package.dependencies] -appdirs = ">=1.3.0" -packaging = ">=20.0" -requests = ">=2.19.0" - -[package.extras] -progress = ["tqdm (>=4.41.0,<5.0.0)"] -sftp = ["paramiko (>=2.7.0)"] -xxhash = ["xxhash (>=1.4.3)"] - [[package]] name = "pre-commit" version = "2.20.0" @@ -1852,18 +1618,6 @@ files = [ [package.dependencies] numpy = ">=1.16.6" -[[package]] -name = "pycparser" -version = "2.21" -description = "C parser in Python" -category = "main" -optional = false -python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" -files = [ - {file = "pycparser-2.21-py2.py3-none-any.whl", hash = "sha256:8ee45429555515e1f6b185e78100aea234072576aa43ab53aefcae078162fca9"}, - {file = "pycparser-2.21.tar.gz", hash = "sha256:e644fdec12f7872f86c58ff790da456218b10f863970249516d60a5eaca77206"}, -] - [[package]] name = "pydocstyle" version = "6.1.1" @@ -2031,6 +1785,21 @@ files = [ [package.dependencies] six = ">=1.5" +[[package]] +name = "python-dotenv" +version = "0.21.0" +description = "Read key-value pairs from a .env file and set them as environment variables" +category = "main" +optional = false +python-versions = ">=3.7" +files = [ + {file = "python-dotenv-0.21.0.tar.gz", hash = "sha256:b77d08274639e3d34145dfa6c7008e66df0f04b7be7a75fd0d5292c191d79045"}, + {file = "python_dotenv-0.21.0-py3-none-any.whl", hash = "sha256:1684eb44636dd462b66c3ee016599815514527ad99965de77f43e0944634a7e5"}, +] + +[package.extras] 
+cli = ["click (>=5.0)"] + [[package]] name = "pytz" version = "2022.7" @@ -2243,27 +2012,6 @@ urllib3 = ">=1.21.1,<1.27" socks = ["PySocks (>=1.5.6,!=1.5.7)"] use-chardet-on-py3 = ["chardet (>=3.0.2,<6)"] -[[package]] -name = "resampy" -version = "0.3.1" -description = "Efficient signal resampling" -category = "main" -optional = false -python-versions = "*" -files = [ - {file = "resampy-0.3.1-py3-none-any.whl", hash = "sha256:b09066c8f0eb0418d59963aca23b79b52d0ace0b8a6de0fce082e6771f7c3f68"}, - {file = "resampy-0.3.1.tar.gz", hash = "sha256:7ed185b0912e1d913902a6a1ff7e3b8d977eb5bdd5663012276512ef09a8f701"}, -] - -[package.dependencies] -numba = ">=0.47" -numpy = ">=1.17" - -[package.extras] -design = ["optuna (>=2.10.0)"] -docs = ["numpydoc", "sphinx (!=1.3.1)"] -tests = ["pytest (<8)", "pytest-cov", "scipy (>=1.0)"] - [[package]] name = "responses" version = "0.18.0" @@ -2390,88 +2138,6 @@ setuptools = ">=19.3" github = ["jinja2 (>=3.1.0)", "pygithub (>=1.43.3)"] gitlab = ["python-gitlab (>=1.3.0)"] -[[package]] -name = "scikit-learn" -version = "1.2.0" -description = "A set of python modules for machine learning and data mining" -category = "main" -optional = false -python-versions = ">=3.8" -files = [ - {file = "scikit-learn-1.2.0.tar.gz", hash = "sha256:680b65b3caee469541385d2ca5b03ff70408f6c618c583948312f0d2125df680"}, - {file = "scikit_learn-1.2.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:1beaa631434d1f17a20b1eef5d842e58c195875d2bc11901a1a70b5fe544745b"}, - {file = "scikit_learn-1.2.0-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:d395730f26d8fc752321f1953ddf72647c892d8bed74fad4d7c816ec9b602dfa"}, - {file = "scikit_learn-1.2.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fd3480c982b9e616b9f76ad8587804d3f4e91b4e2a6752e7dafb8a2e1f541098"}, - {file = "scikit_learn-1.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:184a42842a4e698ffa4d849b6019de50a77a0aa24d26afa28fa49c9190bb144b"}, - 
{file = "scikit_learn-1.2.0-cp310-cp310-win_amd64.whl", hash = "sha256:867023a044fdfe59e5014a7fec7a3086a8928f10b5dce9382eedf4135f6709a2"}, - {file = "scikit_learn-1.2.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:5546a8894a0616e92489ef995b39a0715829f3df96e801bb55cbf196be0d9649"}, - {file = "scikit_learn-1.2.0-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:bc7073e025b62c1067cbfb76e69d08650c6b9d7a0e7afdfa20cb92d4afe516f6"}, - {file = "scikit_learn-1.2.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fc0a72237f0c56780cf550df87201a702d3bdcbbb23c6ef7d54c19326fa23f19"}, - {file = "scikit_learn-1.2.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4e1ea0bc1706da45589bcf2490cde6276490a1b88f9af208dbb396fdc3a0babf"}, - {file = "scikit_learn-1.2.0-cp311-cp311-win_amd64.whl", hash = "sha256:f17420a8e3f40129aeb7e0f5ee35822d6178617007bb8f69521a2cefc20d5f00"}, - {file = "scikit_learn-1.2.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:25ba705ee1600ffc5df1dccd8fae129d7c6836e44ffcbb52d78536c9eaf8fcf9"}, - {file = "scikit_learn-1.2.0-cp38-cp38-macosx_12_0_arm64.whl", hash = "sha256:6b63ca2b0643d30fbf9d25d93017ed3fb8351f31175d82d104bfec60cba7bb87"}, - {file = "scikit_learn-1.2.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:83c772fa8c64776ad769fd764752c8452844307adcf10dee3adcc43988260f21"}, - {file = "scikit_learn-1.2.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0834e4cec2a2e0d8978f39cb8fe1cad3be6c27a47927e1774bf5737ea65ec228"}, - {file = "scikit_learn-1.2.0-cp38-cp38-win_amd64.whl", hash = "sha256:da29d2e379c396a63af5ed4b671ad2005cd690ac373a23bee5a0f66504e05272"}, - {file = "scikit_learn-1.2.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:23a88883ca60c571a06278e4726b3b51b3709cfa4c93cacbf5568b22ba960899"}, - {file = "scikit_learn-1.2.0-cp39-cp39-macosx_12_0_arm64.whl", hash = 
"sha256:40f3ff68c505cb9d1f3693397c73991875d609da905087e00e7b4477645ec67b"}, - {file = "scikit_learn-1.2.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e9535e867281ae6987bb80620ba14cf1649e936bfe45f48727b978b7a2dbe835"}, - {file = "scikit_learn-1.2.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:de897720173b26842e21bed54362f5294e282422116b61cd931d4f5d870b9855"}, - {file = "scikit_learn-1.2.0-cp39-cp39-win_amd64.whl", hash = "sha256:ceb0008f345188aa236e49c973dc160b9ed504a3abd7b321a0ecabcb669be0bd"}, -] - -[package.dependencies] -joblib = ">=1.1.1" -numpy = ">=1.17.3" -scipy = ">=1.3.2" -threadpoolctl = ">=2.0.0" - -[package.extras] -benchmark = ["matplotlib (>=3.1.3)", "memory-profiler (>=0.57.0)", "pandas (>=1.0.5)"] -docs = ["Pillow (>=7.1.2)", "matplotlib (>=3.1.3)", "memory-profiler (>=0.57.0)", "numpydoc (>=1.2.0)", "pandas (>=1.0.5)", "plotly (>=5.10.0)", "pooch (>=1.6.0)", "scikit-image (>=0.16.2)", "seaborn (>=0.9.0)", "sphinx (>=4.0.1)", "sphinx-gallery (>=0.7.0)", "sphinx-prompt (>=1.3.0)", "sphinxext-opengraph (>=0.4.2)"] -examples = ["matplotlib (>=3.1.3)", "pandas (>=1.0.5)", "plotly (>=5.10.0)", "pooch (>=1.6.0)", "scikit-image (>=0.16.2)", "seaborn (>=0.9.0)"] -tests = ["black (>=22.3.0)", "flake8 (>=3.8.2)", "matplotlib (>=3.1.3)", "mypy (>=0.961)", "numpydoc (>=1.2.0)", "pandas (>=1.0.5)", "pooch (>=1.6.0)", "pyamg (>=4.0.0)", "pytest (>=5.3.1)", "pytest-cov (>=2.9.0)", "scikit-image (>=0.16.2)"] - -[[package]] -name = "scipy" -version = "1.9.3" -description = "Fundamental algorithms for scientific computing in Python" -category = "main" -optional = false -python-versions = ">=3.8" -files = [ - {file = "scipy-1.9.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:1884b66a54887e21addf9c16fb588720a8309a57b2e258ae1c7986d4444d3bc0"}, - {file = "scipy-1.9.3-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:83b89e9586c62e787f5012e8475fbb12185bafb996a03257e9675cd73d3736dd"}, - {file = 
"scipy-1.9.3-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1a72d885fa44247f92743fc20732ae55564ff2a519e8302fb7e18717c5355a8b"}, - {file = "scipy-1.9.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d01e1dd7b15bd2449c8bfc6b7cc67d630700ed655654f0dfcf121600bad205c9"}, - {file = "scipy-1.9.3-cp310-cp310-win_amd64.whl", hash = "sha256:68239b6aa6f9c593da8be1509a05cb7f9efe98b80f43a5861cd24c7557e98523"}, - {file = "scipy-1.9.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:b41bc822679ad1c9a5f023bc93f6d0543129ca0f37c1ce294dd9d386f0a21096"}, - {file = "scipy-1.9.3-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:90453d2b93ea82a9f434e4e1cba043e779ff67b92f7a0e85d05d286a3625df3c"}, - {file = "scipy-1.9.3-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:83c06e62a390a9167da60bedd4575a14c1f58ca9dfde59830fc42e5197283dab"}, - {file = "scipy-1.9.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:abaf921531b5aeaafced90157db505e10345e45038c39e5d9b6c7922d68085cb"}, - {file = "scipy-1.9.3-cp311-cp311-win_amd64.whl", hash = "sha256:06d2e1b4c491dc7d8eacea139a1b0b295f74e1a1a0f704c375028f8320d16e31"}, - {file = "scipy-1.9.3-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:5a04cd7d0d3eff6ea4719371cbc44df31411862b9646db617c99718ff68d4840"}, - {file = "scipy-1.9.3-cp38-cp38-macosx_12_0_arm64.whl", hash = "sha256:545c83ffb518094d8c9d83cce216c0c32f8c04aaf28b92cc8283eda0685162d5"}, - {file = "scipy-1.9.3-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0d54222d7a3ba6022fdf5773931b5d7c56efe41ede7f7128c7b1637700409108"}, - {file = "scipy-1.9.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cff3a5295234037e39500d35316a4c5794739433528310e117b8a9a0c76d20fc"}, - {file = "scipy-1.9.3-cp38-cp38-win_amd64.whl", hash = "sha256:2318bef588acc7a574f5bfdff9c172d0b1bf2c8143d9582e05f878e580a3781e"}, - {file = 
"scipy-1.9.3-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:d644a64e174c16cb4b2e41dfea6af722053e83d066da7343f333a54dae9bc31c"}, - {file = "scipy-1.9.3-cp39-cp39-macosx_12_0_arm64.whl", hash = "sha256:da8245491d73ed0a994ed9c2e380fd058ce2fa8a18da204681f2fe1f57f98f95"}, - {file = "scipy-1.9.3-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4db5b30849606a95dcf519763dd3ab6fe9bd91df49eba517359e450a7d80ce2e"}, - {file = "scipy-1.9.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c68db6b290cbd4049012990d7fe71a2abd9ffbe82c0056ebe0f01df8be5436b0"}, - {file = "scipy-1.9.3-cp39-cp39-win_amd64.whl", hash = "sha256:5b88e6d91ad9d59478fafe92a7c757d00c59e3bdc3331be8ada76a4f8d683f58"}, - {file = "scipy-1.9.3.tar.gz", hash = "sha256:fbc5c05c85c1a02be77b1ff591087c83bc44579c6d2bd9fb798bb64ea5e1a027"}, -] - -[package.dependencies] -numpy = ">=1.18.5,<1.26.0" - -[package.extras] -dev = ["flake8", "mypy", "pycodestyle", "typing_extensions"] -doc = ["matplotlib (>2)", "numpydoc", "pydata-sphinx-theme (==0.9.0)", "sphinx (!=4.1.0)", "sphinx-panels (>=0.5.2)", "sphinx-tabs"] -test = ["asv", "gmpy2", "mpmath", "pytest", "pytest-cov", "pytest-xdist", "scikit-umfpack", "threadpoolctl"] - [[package]] name = "setuptools" version = "65.6.3" @@ -2537,28 +2203,6 @@ files = [ {file = "snowballstemmer-2.2.0.tar.gz", hash = "sha256:09b16deb8547d3412ad7b590689584cd0fe25ec8db3be37788be3810cbf19cb1"}, ] -[[package]] -name = "soundfile" -version = "0.11.0" -description = "An audio library based on libsndfile, CFFI and NumPy" -category = "main" -optional = false -python-versions = "*" -files = [ - {file = "soundfile-0.11.0-py2.py3-none-any.whl", hash = "sha256:f4e4f832b1958403fb9726eeea54e0ebf1c7fc2599ff296a7ab1ac062f8048c9"}, - {file = "soundfile-0.11.0-py2.py3-none-macosx_10_9_arm64.macosx_11_0_arm64.whl", hash = "sha256:9e6a62eefad0a7f856cc8f5ede7f1a0c196b65d2901c00fffc74a3d7e81d89c8"}, - {file = 
"soundfile-0.11.0-py2.py3-none-macosx_10_9_x86_64.macosx_11_0_x86_64.whl", hash = "sha256:12f66fe9dcddedaa6c808bc3e104fc67fcee59dc64214bf7f43605e69836c497"}, - {file = "soundfile-0.11.0-py2.py3-none-win32.whl", hash = "sha256:08d9636815692f332e042990d449e79b888d288f0752226d8602e91523a0a29b"}, - {file = "soundfile-0.11.0-py2.py3-none-win_amd64.whl", hash = "sha256:a4ab6f66ad222d8e144dcb6abc73fbb867c11da2934b677f9b129778a6c65112"}, - {file = "soundfile-0.11.0.tar.gz", hash = "sha256:931738a1c93e8684c2d3e1d514ac63440ce827ec783ea0a2d3e4730e3dc58c18"}, -] - -[package.dependencies] -cffi = ">=1.0" - -[package.extras] -numpy = ["numpy"] - [[package]] name = "stevedore" version = "4.1.1" @@ -2574,18 +2218,6 @@ files = [ [package.dependencies] pbr = ">=2.0.0,<2.1.0 || >2.1.0" -[[package]] -name = "threadpoolctl" -version = "3.1.0" -description = "threadpoolctl" -category = "main" -optional = false -python-versions = ">=3.6" -files = [ - {file = "threadpoolctl-3.1.0-py3-none-any.whl", hash = "sha256:8b99adda265feb6773280df41eece7b2e6561b772d21ffd52e372f999024907b"}, - {file = "threadpoolctl-3.1.0.tar.gz", hash = "sha256:a335baacfaa4400ae1f0d8e3a58d6674d2f8828e3716bb2802c44955ad391380"}, -] - [[package]] name = "timm" version = "0.6.12" @@ -3235,4 +2867,4 @@ testing = ["flake8 (<5)", "func-timeout", "jaraco.functools", "jaraco.itertools" [metadata] lock-version = "2.0" python-versions = "^3.10" -content-hash = "1bbe73b603795c02917f6cb6c74998bde1bfc6ad54964649c84c7df2e32d8cb0" +content-hash = "429ce050fd9e14f457f545b675da882677fcb5d8e955475cb4d41e92e704f526" diff --git a/pyproject.toml b/pyproject.toml index 7fb29d3..7716dd6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -45,6 +45,7 @@ diffusers = {extras = ["torch"], version = "^0.11.1"} timm = "^0.6.12" torchvision = "^0.14.1" torchaudio = "^0.13.1" +python-dotenv = "^0.21.0" [tool.poetry.group.dev.dependencies] bandit = "^1.7.4" diff --git a/research/Internet-NLP/paper/Internet-NLP/control_flow.tex 
b/research/Internet-NLP/paper/Internet-NLP/control_flow.tex new file mode 100644 index 0000000..7d17d2c --- /dev/null +++ b/research/Internet-NLP/paper/Internet-NLP/control_flow.tex @@ -0,0 +1,15 @@ +\begin{tikzpicture}[auto, + node distance = 12mm, + start chain = going below, + box/.style = {draw,rounded corners,blur shadow,fill=white, on chain,align=center}] + \node[box] (b1) {$x_1\leftarrow0$\\ $y_1\leftarrow0$}; + \node[box] (b2) {$x_2\leftarrow\phi(x_1,x_3)$\\ + $y_2\leftarrow\phi(y_1,y_3)$\\ + $(x_2<10)$?}; + \node[box] (b3) {$y_3\leftarrow y_2+x_2$\\ $x_3\leftarrow x_2+1$}; + \node[box] (b4) {print($y_2$)}; + \begin{scope}[rounded corners,-latex] + \path (b2.-40) edge[bend left=50] (b4.40) (b1) edge (b2) (b2) edge (b3); + \draw (b3.230) -- ++(0,-0.3) -| ([xshift=-5mm]b2.west) |- ([yshift=3mm]b2.130) -- (b2.130); + \end{scope} +\end{tikzpicture} diff --git a/research/Internet-NLP/paper/Internet-NLP/main.tex b/research/Internet-NLP/paper/Internet-NLP/main.tex index e69de29..3a67d65 100644 --- a/research/Internet-NLP/paper/Internet-NLP/main.tex +++ b/research/Internet-NLP/paper/Internet-NLP/main.tex @@ -0,0 +1,69 @@ +\section{Internet-NLP} + +This publication will introduce Internet-NLP and its control flow for allowing NLP models to connect to the internet, which will replace traditional knowledge bases with the resources on the internet. + +\begin{figure} + \begin{center} + \input{control_flow} + \caption{This is an illustration of how Internet-NLP's control flow works.} + \label{fig:ControlFlow} + \end{center} +\end{figure} + +The control flow diagram \ref{fig:ControlFlow} shows how Internet-NLP gains its data for NLP tasks and also makes sure that the data scraped is accurate and not offensive for the NLP task it is being asked to do; Internet-NLP does this by utilizing several different NLP and NLI models in combination to enable this data collection system. This allows other NLP models to utilize the data to perform the NLP tasks that were requested.
+ +Internet-NLP's control flow diagram \ref{fig:ControlFlow} will be explained in the following subsections. + +\subsection{NLP Tasks Applicable} + +Internet-NLP currently allows for the following NLP tasks without context: + +\begin{itemize}[leftmargin=1em] + \item Question Answering + \item Zero-Shot Classification + \item Natural Language Inference + \item Text2Text Generation + \item Conversational (this is still in beta and does not completely work) +\end{itemize} + +\subsection{Disclaimers} + +\subsubsection{Types of English} + +Internet-NLP at this point in time can only fully understand ``formal'' English \cite{FormalInformal}. Additionally, idioms, similes, and other figures of speech are not understood by Internet-NLP or its models. + +\subsubsection{Output of Internet-NLP} + +The accuracy of the output of Internet-NLP depends on the data it scrapes, which may not be completely accurate (the chances of which are minimized to an extent by utilizing multiple resources) and may contain profanity or abrasive language, which may or may not affect the output. + +\subsection{Common Components of Internet-NLP's Process} + +\subsubsection{Answer To Question Text2Text-generator\label{subsubsection:AnswerToQuestion}} + +\subsubsection{Search Queries Text2Text-generator \label{subsubsection:search-query}} + +The search query generator that enables converting questions into viable search queries utilizes a fastT5 model \cite{2019t5}. It is trained on Reddit and Quora questions (that are non-mathematical, i.e. do not require logical computation) and then passed through a part-of-speech tagging model and normalizer wherein the question is optimized for search engines by removing specific details and punctuation \cite{BetterWebSearches}. + +The reason for utilizing fastT5 models rather than the part-of-speech tagging model comes down to efficiency, as fastT5 outperforms the part-of-speech tagging model \cite{inproceedings, 2019t5}.
+ +\subsubsection{Data Collection \label{subsubsection:DataCollection}} + +\subsection{Question Answering} + +\subsubsection{Answer to Question Text2Text-generator} + +In the case of question answering without context, Internet-NLP only needs one of the following: + +\begin{itemize} + \item Question + \begin{itemize} + \item In this case Internet-NLP passes the question through the Search Query Text2Text-generator \ref{subsubsection:search-query} wherein an output of an optimized search question for search engines is returned. This optimized question will be used for data collection \ref{subsubsection:DataCollection}. + \label{subsubsection:itemize:question} + \end{itemize} + \item Answer + \begin{itemize} + \item In this case the answer is passed through the Answer to Question Text2Text-generator \ref{subsubsection:AnswerToQuestion}, after which it follows the same process of question optimization explained above in the Question case \ref{subsubsection:itemize:question}. + \end{itemize} +\end{itemize} + +\subsubsection{Natural Language Inference Without Premise} diff --git a/research/Internet-NLP/paper/acl_latex.pdf b/research/Internet-NLP/paper/acl_latex.pdf new file mode 100644 index 0000000..3290e88 Binary files /dev/null and b/research/Internet-NLP/paper/acl_latex.pdf differ diff --git a/research/Internet-NLP/paper/main.pdf b/research/Internet-NLP/paper/main.pdf index 5302412..effd8bf 100644 Binary files a/research/Internet-NLP/paper/main.pdf and b/research/Internet-NLP/paper/main.pdf differ diff --git a/research/Internet-NLP/paper/preliminaries/main.tex b/research/Internet-NLP/paper/preliminaries/main.tex index e69de29..58ddcd7 100644 --- a/research/Internet-NLP/paper/preliminaries/main.tex +++ b/research/Internet-NLP/paper/preliminaries/main.tex @@ -0,0 +1,22 @@ +%auto-ignore +\section{Preliminaries} + +The preliminaries listed are the NLP tasks in which Internet-NLP benefits from access to the internet: + +\subsection{Question Answering} + +The tasks of training an NLP model to utilize question, and
context (or in the case of ODQA closed-book LM just question) to create a logical answer. The currently most popular are context-needing question answering models, wherein the answer is provided in the context. These models utilize reading comprehension to utilize the context to make an answer based on the question \cite{https://doi.org/10.48550/arxiv.2002.08910}. + +Closed-book ODQA LMs are a type of question-answering model where no context is provided; these are usually the hardest variant to train, and result in large sizes, low efficiency, and low accuracy. These models can only be asked context-independent questions such as facts \cite{https://doi.org/10.48550/arxiv.2002.08910}. + +The alternative to ODQA LM would be utilizing a knowledge base and retriever for getting the required context from a knowledge base and then utilizing a context-needing question-answering NLP model, which would be known as an open-book question-answering model \cite{https://doi.org/10.48550/arxiv.2002.08910}. This, however, requires a knowledge base, which would be static and hence would not contain the latest information; additionally, it requires a large database, and hence the overall solution is also large. + +In this publication, Internet-NLP applies question answering open-book LM with the constraints of not utilizing a knowledge base and keeping the overall solution size low, with high efficiency and high accuracy, and with the ability to also be asked context-dependent (by giving the optional context) and context-independent questions. Internet-NLP utilizes the internet to replace the knowledge base, utilizes a retriever to get the required information from the internet data and then an open-book Text2Text-generation model to create an answer from the information, question and any extra context given.
+ +\subsection{Natural Language Inference} + +NLI models require a premise (similar to context) and hypothesis (a prediction) to give one of the following: entailment (hypothesis is correct based on premise), neutral (hypothesis is neither correct nor wrong based on premise) and contradiction (hypothesis is wrong based on premise). + +Current no-premise NLI models utilize a knowledge base to reproduce the premise via a retriever and then utilize an NLI model to give an output. + +In this publication, Internet-NLP produces the premise based on the hypothesis by converting the hypothesis into a search query (via a Text2Text-generation LM) which will then be scraped for results; these results are then individually compared to the hypothesis to select only the ones that have either contradiction or entailment, to then give an output on whether it is an entailment or contradiction. This allows hypotheses to be checked for whether they are correct or wrong without a large knowledge base or model. diff --git a/research/Internet-NLP/paper/related_work/Bi_vs_Cross-Encoder.png b/research/Internet-NLP/paper/related_work/Bi_vs_Cross-Encoder.png new file mode 100644 index 0000000..ed064a4 Binary files /dev/null and b/research/Internet-NLP/paper/related_work/Bi_vs_Cross-Encoder.png differ diff --git a/research/Internet-NLP/paper/related_work/fig_motivation_v8.pdf b/research/Internet-NLP/paper/related_work/fig_motivation_v8.pdf new file mode 100644 index 0000000..5452f8d Binary files /dev/null and b/research/Internet-NLP/paper/related_work/fig_motivation_v8.pdf differ diff --git a/research/Internet-NLP/paper/related_work/fig_overview_v13.pdf b/research/Internet-NLP/paper/related_work/fig_overview_v13.pdf new file mode 100644 index 0000000..353a59c Binary files /dev/null and b/research/Internet-NLP/paper/related_work/fig_overview_v13.pdf differ diff --git a/research/Internet-NLP/paper/related_work/main.tex b/research/Internet-NLP/paper/related_work/main.tex index e69de29..f81ff29 100644
--- a/research/Internet-NLP/paper/related_work/main.tex +++ b/research/Internet-NLP/paper/related_work/main.tex @@ -0,0 +1,56 @@ +%auto-ignore +\section{Related Work} + +\subsection{Internet-NLP} + +\subsubsection{NLP Models with Knowledge Base and Retriever} + +This approach is one of the two most popular current solutions for NLP tasks to be done without context. It utilizes a knowledge base, a retriever for this data, and an LM chosen depending on the use case, for example (this list is not exhaustive): + +\begin{itemize}[leftmargin=1em] + \item question answering: LinkBERT or T5 \cite{https://doi.org/10.48550/arxiv.2203.15827, https://doi.org/10.48550/arxiv.1910.10683} + \item NLI: CrossEncoder models BERT or DeBERTa \cite{thakur-2020-AugSBERT, https://doi.org/10.48550/arxiv.1810.04805, https://doi.org/10.48550/arxiv.2006.03654} +\end{itemize} + +This allows for no-context NLP applications (especially question answering) to function without any context given, due to the knowledge base and retriever providing the context. A representation of this is shown in illustration \ref{fig:CurrSolTwoImg} \cite{https://doi.org/10.48550/arxiv.2201.09651}.
+ +\subsection{Internet-NLP's NLP models} + +\subsubsection{LinkBERT} + +\begin{figure} + \includegraphics[width=1.0\columnwidth]{fig_motivation_v8.pdf} + \caption{This is an illustrative example of how LinkBERT utilizes hyperlinks to make a graph corpus \cite{https://doi.org/10.48550/arxiv.2203.15827}.} + \label{fig:LinkBERTGraphExample} +\end{figure} + +\begin{figure} + \includegraphics[width=1.0\columnwidth]{fig_overview_v13.pdf} + \caption{This is an illustrative example of how LinkBERT makes a graph corpus \cite{https://doi.org/10.48550/arxiv.2203.15827}.} + \label{fig:LinkBERTGraphIllustration} +\end{figure} + +LinkBERT is an NLP model that is a pre-trained BERT \cite{https://doi.org/10.48550/arxiv.1810.04805} model that is trained on a graph-based corpus built not only from documents but also from the hyperlinks in those documents. It utilizes a ``fusion of graph-based and language-based self-supervised learning'' \cite{https://doi.org/10.48550/arxiv.2203.15827}. It gains better performance on graph-based data corpora than other pre-trained NLP models because it is trained using graph-based self-supervised learning. + +These are illustrations that explain LinkBERT's graph-based and language-based fusion: + +\begin{itemize}[leftmargin=1em] + \item This illustration shows how hyperlinks can contain crucial information: \ref{fig:LinkBERTGraphExample}. + \item This illustration shows how LinkBERT \cite{https://doi.org/10.48550/arxiv.2203.15827} makes a graph from links: \ref{fig:LinkBERTGraphIllustration}. +\end{itemize} + +Training Internet-NLP and its LM for Text2Text-generation for question answering would utilize the fusion of graph-based and language-based learning that LinkBERT pioneered \cite{https://doi.org/10.48550/arxiv.2203.15827}.
+ +\subsection{Internet-NLP's NLI models} + +\subsubsection{Cross-Encoder NLI Models} + +\begin{figure} + \includegraphics[width=1.0\columnwidth]{Bi_vs_Cross-Encoder.png} + \caption{This is an illustration of how NLI using Cross-Encoders vs Bi-Encoders works \cite{thakur-2020-AugSBERT}.} + \label{fig:CrossEncoderNLI} +\end{figure} + +NLI compares two sentences to give an output of entailment (true), neutral or contradiction (false). + +NLI applications that allow for it utilize a Cross-Encoder (an illustration of Cross-Encoders: \ref{fig:CrossEncoderNLI}), where two sentences are passed simultaneously, and then utilize a classifier to get an output from 0 to 1, which goes from contradiction to entailment \cite{thakur-2020-AugSBERT, https://doi.org/10.48550/arxiv.1908.10084}.