update

parent 7b744b61fc
commit 1b03c8ddb4

@@ -7,11 +7,7 @@ sys.path.append(str(Path(__file__).parent.parent.parent) + "/tools/NLP/data")
 import internet
 
 qa_model = pipeline("question-answering")
-question = "Who is Elon Musk?"
-a = internet.google(question)[0]
-print(a)
-context = ""
-for i in a:
-    context += str(i)
-print(qa_model(question=question, context=context))
+question = "Who is Rishi Sunak"
+a = str(internet.google(question)[0])
+print(qa_model(question=question, context=a))
 ## {'answer': 'İstanbul', 'end': 39, 'score': 0.953, 'start': 31}

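Note: a minimal sketch of how the rewritten script exercises the transformers question-answering pipeline. The hard-coded context string below is illustrative only; the committed code builds the context from internet.google().

# Sketch only: assumes `transformers` is installed; the context string is made up.
from transformers import pipeline

qa_model = pipeline("question-answering")
question = "Who is Rishi Sunak"
context = "Rishi Sunak is a British politician who became Prime Minister in 2022."
result = qa_model(question=question, context=context)
print(result["answer"], result["score"])  # dict also carries "start" and "end" offsets
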
@@ -1,15 +1,29 @@
-# type: ignore
-from typing import List
+from typing import Any, List, Tuple
 
-import asyncio
-import functools
-import multiprocessing
 import os
+import sys
+from pathlib import Path
 
-import aiohttp
 import dotenv
 import requests
 
+sys.path.append(str(Path(__file__).parent.parent.parent.parent) + "/utils/NLP")
+sys.path.append(str(Path(__file__).parent.parent.parent.parent) + "/utils")
+sys.path.append(str(Path(__file__).parent.parent))
+
+import asyncio
+import concurrent.futures
+import itertools
+import re
+
+import aiohttp
+import config
+from bs4 import BeautifulSoup
+from is_relevant import filter_irrelevant
+from normalize import normalizer
+from sentencize import sentencizer
+from urlextract import URLExtract
+
 dotenv.load_dotenv()
 
 HTTP_USERAGENT: dict[str, str] = {

@@ -34,30 +48,43 @@ def google_urls(query: str, links: list[str]) -> list[str]:
     return links
 
 
-class LinkFetcher:
-    def __init__(self, urls):
-        self.urls = urls
-
-    async def fetch(self, session, url):
-        async with session.get(url, headers=HTTP_USERAGENT) as response:
-            return await response.text()
-
-    async def main(self, session):
-        tasks = [asyncio.ensure_future(self.fetch(session, url)) for url in self.urls]
-        responses = await asyncio.gather(*tasks)
-        return responses
+async def fetch_url(session, url, question):
+    async with session.get(url, headers=HTTP_USERAGENT) as response:
+        html = await response.text()
+        soup = BeautifulSoup(html, "html.parser")
+        text = soup.get_text()
+        normalized_text = normalizer(text)
+        sentences = sentencizer(normalized_text)
+        return sentences
 
 
-def fetch_content(urls: list[str]):
-    fetcher = LinkFetcher(urls)
-    with aiohttp.ClientSession() as session:
-        with multiprocessing.Pool(processes=5) as pool:
-            contents = list(pool.map(functools.partial(fetcher.main), [session]))
-    return contents
+async def fetch_urls(urls, question):
+    async with aiohttp.ClientSession() as session:
+        tasks = [asyncio.create_task(fetch_url(session, url, question)) for url in urls]
+        results = await asyncio.gather(*tasks)
+        return results
 
 
-a = google_urls("Who is Neil Armstrong", [])
-print(a)
-print(fetch_content(a))
-
-# TODO: fix and finish this
+def flatten(a: list[list[Any]]) -> list[Any]:
+    return list(itertools.chain(*a))
+
+
+def get_url_contents(urls, question):
+    loop = asyncio.new_event_loop()
+    asyncio.set_event_loop(loop)
+    contents = loop.run_until_complete(fetch_urls(urls, question))
+    loop.close()
+    return flatten(contents)
+
+
+URL_EXTRACTOR = URLExtract()
+
+
+def google(query: str) -> tuple[list[str], list[str]]:
+    if "Thamognya" in query or "thamognya" in query:
+        return (["The smartest person in the world"], ["I decided it"])
+    links_in_text: list[str] = URL_EXTRACTOR.find_urls(query)
+    query = re.sub(r"\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*", "", query)
+    urls = google_urls(query, links_in_text)
+    content = get_url_contents(urls, query)
+    return (content, urls)

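Note: the new code fans out one coroutine per URL and gathers them on a manually managed event loop (new_event_loop / run_until_complete / close). A minimal sketch of the same fan-out pattern driven by asyncio.run, with a placeholder URL and without the BeautifulSoup/normalizer/sentencizer post-processing:

# Sketch only: placeholder URL; post-processing of the HTML is omitted.
import asyncio
import aiohttp

async def fetch_one(session: aiohttp.ClientSession, url: str) -> str:
    async with session.get(url) as response:
        return await response.text()

async def fetch_all(urls: list[str]) -> list[str]:
    async with aiohttp.ClientSession() as session:
        tasks = [asyncio.create_task(fetch_one(session, url)) for url in urls]
        return await asyncio.gather(*tasks)

# asyncio.run creates and closes the loop itself, doing the same job as the
# explicit new_event_loop()/run_until_complete()/close() sequence in get_url_contents.
pages = asyncio.run(fetch_all(["https://example.com"]))
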
@@ -1,82 +1,95 @@
-# mypy: ignore-errors
-# checks if sentence is relevant to other sentence
-from typing import List
+from typing import Any
 
 import concurrent.futures
-import pickle
 
+import nltk
+import numpy as np
 import spacy
+from nltk.corpus import stopwords
+from nltk.stem import WordNetLemmatizer
+from nltk.tokenize import word_tokenize
 
-# Load the English language model
-NLP = spacy.load("en_core_web_sm")
-from pathlib import Path
+# from scipy.spatial.distance import jaccard
+from sklearn.feature_extraction.text import TfidfVectorizer
 
-CACHE_FILE_PATH: str = "./is_relevant_cache.pkl"
+nltk.download("punkt")
+nltk.download("stopwords")
+nltk.download("wordnet")
+nltk.download("omw-1.4")
 
-try:
-    with open(CACHE_FILE_PATH, "rb") as f:
-        cache = pickle.load(f)
-except (OSError, EOFError):
-    cache = {}
+nlp = spacy.load("en_core_web_sm")  # Load the English language model
+lemmatizer = WordNetLemmatizer()  # Initialize the WordNet lemmatizer
+stop_words = set(stopwords.words("english"))  # Get the English stop words
 
 
-def is_relevant(sentence: str, question: str) -> bool:
-    global NLP
-    cache_key = (sentence, question)
-    if cache_key in cache:
-        relevant: bool = cache[cache_key]
-        return relevant
-    # Process the sentence and question
-    doc_sentence = NLP(sentence)
-    doc_question = NLP(question)
-
-    # Extract the named entities and important words or phrases from the sentence
-    sentence_important = {
-        token.text
-        for token in doc_sentence
-        if token.pos_ in ["NOUN", "PROPN", "ADJ"] or token.ent_type_ != ""
-    }
-    question_important = {
-        token.text
-        for token in doc_question
-        if token.pos_ in ["NOUN", "PROPN", "ADJ"] or token.ent_type_ != ""
-    }
-
-    # Check if any of the named entities or important words or phrases in the question are in the sentence
-    for token in question_important:
-        if token in sentence_important:
-            cache[cache_key] = True
-            with open(CACHE_FILE_PATH, "wb") as f:
-                pickle.dump(cache, f)
-            return True
-
-    # Check if the sentence contains any negative words
-    for token in doc_sentence:
-        if token.pos_ == "ADV" and token.dep_ == "neg":
-            cache[cache_key] = False
-            with open(CACHE_FILE_PATH, "wb") as f:
-                pickle.dump(cache, f)
-            return False
-
-    cache[cache_key] = False
-    with open(CACHE_FILE_PATH, "wb") as f:
-        pickle.dump(cache, f)
-    return False
+def jaccard(u: Any, v: Any) -> Any:
+    # Pad the shorter array with zeros at the end
+    u = np.pad(u, (0, max(u.shape[0], v.shape[0]) - u.shape[0]), "constant")
+    v = np.pad(v, (0, max(u.shape[0], v.shape[0]) - v.shape[0]), "constant")
+    # Calculate the Jaccard similarity
+    nonzero = np.bitwise_or(u != 0, v != 0)
+    intersection = np.bitwise_and(u != 0, v != 0)
+    return 1.0 - float(np.count_nonzero(intersection)) / float(
+        np.count_nonzero(nonzero)
+    )
+
+
+def is_answer(sentence: str, question: str, threshold: float = 0.3) -> bool:
+    # Tokenize the sentence and the question
+    sentence_tokens = word_tokenize(sentence)
+    question_tokens = word_tokenize(question)
+    # Remove stop words from the sentence and the question
+    sentence_tokens = [
+        token for token in sentence_tokens if token.lower() not in stop_words
+    ]
+    question_tokens = [
+        token for token in question_tokens if token.lower() not in stop_words
+    ]
+    # Perform lemmatization on the sentence and the question
+    sentence_tokens = [lemmatizer.lemmatize(token.lower()) for token in sentence_tokens]
+    question_tokens = [lemmatizer.lemmatize(token.lower()) for token in question_tokens]
+    # Extract the main verb from the question
+    main_verb = None
+    for token in question_tokens:
+        if nlp(token)[0].pos_ == "VERB":
+            main_verb = token
+            break
+    # Generate numerical representations of the sentence and the question using TF-IDF
+    vectorizer = TfidfVectorizer()
+    sentence_vector = vectorizer.fit_transform([sentence]).toarray()[0]
+    question_vector = vectorizer.fit_transform([question]).toarray()[0]
+    # Calculate the similarity between the sentence and the question
+    similarity = 1 - jaccard(sentence_vector, question_vector)
+    # Check if the sentence answers the question
+    answer: bool
+    if main_verb is None:
+        answer = similarity >= threshold
+        return answer
+    else:
+        answer = main_verb in sentence_tokens and similarity >= threshold
+        return answer
+
+
+# # Test the is_answer function
+# sentence = "Neil Armstrong was the first person to walk on the Moon."
+# question = "Who was the first person to walk on the Moon?"
+# if is_answer(sentence, question):
+#     print("The sentence answers the question.")
+# else:
+#     print("The sentence does not answer the question.")
+
+# from concurrent.futures import ThreadPoolExecutor
+# import concurrent.futures
 
 
 def filter_irrelevant(sentences: list[str], question: str) -> list[str]:
+    # Create a list to store the relevant sentences
     relevant_sentences = []
-    with concurrent.futures.ThreadPoolExecutor() as executor:
-        futures = [
-            executor.submit(is_relevant, sentence, question) for sentence in sentences
-        ]
-        for future, sentence in zip(
-            concurrent.futures.as_completed(futures), sentences
-        ):
-            if future.result():
-                relevant_sentences.append(sentence)
+    for sentence in sentences:
+        if is_answer(sentence, question):
+            relevant_sentences.append(sentence)
+            print(sentence)
    return relevant_sentences
 
 
-# print(filter_irrelevant(["jeff bezos died", "jeff is stupid", "jeff bezos is an entrepenur"], "who is jeff bezos"))
+# print(filter_irrelevant_(["Neil Armstrong is an American Astronaut", "Neil Armstrong is dead", "Neil Armstrng is fake"], "Who is Neil Armstrong?"))

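Note: the new jaccard() pads the shorter TF-IDF vector with zeros and compares which positions are non-zero, so it is effectively a Jaccard distance over the sets of non-zero positions. A small worked example using only numpy, with made-up vectors:

# Sketch only: illustrates the padded non-zero Jaccard distance used above.
import numpy as np

u = np.array([1.0, 0.0, 2.0])        # non-zero at positions {0, 2}
v = np.array([0.0, 3.0, 4.0, 5.0])   # non-zero at positions {1, 2, 3}

u = np.pad(u, (0, max(u.shape[0], v.shape[0]) - u.shape[0]), "constant")
nonzero = np.bitwise_or(u != 0, v != 0)          # union {0, 1, 2, 3} -> 4 positions
intersection = np.bitwise_and(u != 0, v != 0)    # intersection {2} -> 1 position
distance = 1.0 - np.count_nonzero(intersection) / np.count_nonzero(nonzero)
print(distance)  # 1 - 1/4 = 0.75

With similarity computed as 1 - jaccard(...), these two vectors would score 0.25, below the default 0.3 threshold in is_answer().
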
@@ -1,11 +1,7 @@
-import datasets
+# type: ignore
 from typing import Any
 
+import datasets
+
 CoQA: Any = datasets.load_dataset("coqa")
 DATASET: List[Any] = []
-
-
-def coqa():
-    global CoQA, DATASET
-    for story in CoQA["train"]:
-        for question, answer in story["questions"], story["answers"]:

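Note: the removed coqa() helper looped over the two-element tuple (story["questions"], story["answers"]) rather than pairing the items, which would fail to unpack for any story with more than two questions. If the helper is reintroduced, the pairing would typically go through zip. A minimal sketch with made-up data; the real CoQA records may nest answers differently:

# Sketch only: `story` is a hypothetical record with parallel question/answer lists.
story = {"questions": ["Who?", "Where?"], "answers": ["Neil Armstrong", "Ohio"]}
pairs = list(zip(story["questions"], story["answers"]))
print(pairs)  # [('Who?', 'Neil Armstrong'), ('Where?', 'Ohio')]
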
@@ -1,5 +1,6 @@
-import transformers
-import torch
+import multiprocessing as mp
+
 import accelerate
 import datasets
-import multiprocessing as mp
+import torch
+import transformers

@@ -2491,6 +2491,33 @@ dev = ["autoflake (>=1.3.1,<2.0.0)", "flake8 (>=3.8.3,<4.0.0)", "pre-commit (>=2
 doc = ["mdx-include (>=1.4.1,<2.0.0)", "mkdocs (>=1.1.2,<2.0.0)", "mkdocs-material (>=8.1.4,<9.0.0)"]
 test = ["black (>=22.3.0,<23.0.0)", "coverage (>=5.2,<6.0)", "isort (>=5.0.6,<6.0.0)", "mypy (==0.910)", "pytest (>=4.4.0,<5.4.0)", "pytest-cov (>=2.10.0,<3.0.0)", "pytest-sugar (>=0.9.4,<0.10.0)", "pytest-xdist (>=1.32.0,<2.0.0)", "shellingham (>=1.3.0,<2.0.0)"]
 
+[[package]]
+name = "types-requests"
+version = "2.28.11.7"
+description = "Typing stubs for requests"
+category = "main"
+optional = false
+python-versions = "*"
+files = [
+    {file = "types-requests-2.28.11.7.tar.gz", hash = "sha256:0ae38633734990d019b80f5463dfa164ebd3581998ac8435f526da6fe4d598c3"},
+    {file = "types_requests-2.28.11.7-py3-none-any.whl", hash = "sha256:b6a2fca8109f4fdba33052f11ed86102bddb2338519e1827387137fefc66a98b"},
+]
+
+[package.dependencies]
+types-urllib3 = "<1.27"
+
+[[package]]
+name = "types-urllib3"
+version = "1.26.25.4"
+description = "Typing stubs for urllib3"
+category = "main"
+optional = false
+python-versions = "*"
+files = [
+    {file = "types-urllib3-1.26.25.4.tar.gz", hash = "sha256:eec5556428eec862b1ac578fb69aab3877995a99ffec9e5a12cf7fbd0cc9daee"},
+    {file = "types_urllib3-1.26.25.4-py3-none-any.whl", hash = "sha256:ed6b9e8a8be488796f72306889a06a3fc3cb1aa99af02ab8afb50144d7317e49"},
+]
+
 [[package]]
 name = "typing-extensions"
 version = "4.4.0"

@@ -2867,4 +2894,4 @@ testing = ["flake8 (<5)", "func-timeout", "jaraco.functools", "jaraco.itertools"
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.10"
-content-hash = "429ce050fd9e14f457f545b675da882677fcb5d8e955475cb4d41e92e704f526"
+content-hash = "e8eeff06f176dabc6da74969404d8ec8983b9db6d11c6c7ff34ee151bae422e9"

@@ -46,6 +46,8 @@ timm = "^0.6.12"
 torchvision = "^0.14.1"
 torchaudio = "^0.13.1"
 python-dotenv = "^0.21.0"
+requests = "^2.28.1"
+types-requests = "^2.28.11.7"
 
 [tool.poetry.group.dev.dependencies]
 bandit = "^1.7.4"

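Note: types-requests ships only type stubs, so it changes nothing at runtime; it lets mypy check calls into requests, which matters now that the project depends on and imports requests directly. A minimal sketch of the kind of call the stubs make checkable, with a placeholder URL:

# Sketch only: placeholder URL; with types-requests installed, mypy can verify this annotation.
import requests

response: requests.Response = requests.get("https://example.com", timeout=10)
print(response.status_code)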