update
parent
7b744b61fc
commit
1b03c8ddb4
@@ -7,11 +7,7 @@ sys.path.append(str(Path(__file__).parent.parent.parent) + "/tools/NLP/data")
import internet

qa_model = pipeline("question-answering")
question = "Who is Elon Musk?"
a = internet.google(question)[0]
print(a)
context = ""
for i in a:
    context += str(i)
print(qa_model(question=question, context=context))
question = "Who is Rishi Sunak"
a = str(internet.google(question)[0])
print(qa_model(question=question, context=a))
## {'answer': 'İstanbul', 'end': 39, 'score': 0.953, 'start': 31}
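For reference, a minimal self-contained sketch of the same transformers question-answering pipeline call, but with a hard-coded context string instead of googled text; the model is the pipeline default and the context sentence is an illustrative placeholder, not project data.

# Hedged sketch: the transformers QA pipeline on a fixed context string.
from transformers import pipeline

qa_model = pipeline("question-answering")
context = "Rishi Sunak has served as Prime Minister of the United Kingdom since October 2022."
result = qa_model(question="Who is Rishi Sunak?", context=context)
# The pipeline returns a dict like {'answer': ..., 'score': ..., 'start': ..., 'end': ...}.
print(result["answer"], round(result["score"], 3))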
@@ -1,15 +1,29 @@
# type: ignore
from typing import List
from typing import Any, List, Tuple

import asyncio
import functools
import multiprocessing
import os
import sys
from pathlib import Path

import aiohttp
import dotenv
import requests

sys.path.append(str(Path(__file__).parent.parent.parent.parent) + "/utils/NLP")
sys.path.append(str(Path(__file__).parent.parent.parent.parent) + "/utils")
sys.path.append(str(Path(__file__).parent.parent))

import asyncio
import concurrent.futures
import itertools
import re

import aiohttp
import config
from bs4 import BeautifulSoup
from is_relevant import filter_irrelevant
from normalize import normalizer
from sentencize import sentencizer
from urlextract import URLExtract

dotenv.load_dotenv()

HTTP_USERAGENT: dict[str, str] = {
@@ -34,30 +48,43 @@ def google_urls(query: str, links: list[str]) -> list[str]:
    return links


class LinkFetcher:
    def __init__(self, urls):
        self.urls = urls

    async def fetch(self, session, url):
        async with session.get(url, headers=HTTP_USERAGENT) as response:
            return await response.text()

    async def main(self, session):
        tasks = [asyncio.ensure_future(self.fetch(session, url)) for url in self.urls]
        responses = await asyncio.gather(*tasks)
        return responses
async def fetch_url(session, url, question):
    async with session.get(url, headers=HTTP_USERAGENT) as response:
        html = await response.text()
        soup = BeautifulSoup(html, "html.parser")
        text = soup.get_text()
        normalized_text = normalizer(text)
        sentences = sentencizer(normalized_text)
        return sentences


def fetch_content(urls: list[str]):
    fetcher = LinkFetcher(urls)
    with aiohttp.ClientSession() as session:
        with multiprocessing.Pool(processes=5) as pool:
            contents = list(pool.map(functools.partial(fetcher.main), [session]))
    return contents
async def fetch_urls(urls, question):
    async with aiohttp.ClientSession() as session:
        tasks = [asyncio.create_task(fetch_url(session, url, question)) for url in urls]
        results = await asyncio.gather(*tasks)
        return results


a = google_urls("Who is Neil Armstrong", [])
print(a)
print(fetch_content(a))
def flatten(a: list[list[Any]]) -> list[Any]:
    return list(itertools.chain(*a))

# TODO: fix and finish this

def get_url_contents(urls, question):
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    contents = loop.run_until_complete(fetch_urls(urls, question))
    loop.close()
    return flatten(contents)


URL_EXTRACTOR = URLExtract()


def google(query: str) -> tuple[list[str], list[str]]:
    if "Thamognya" in query or "thamognya" in query:
        return (["The smartest person in the world"], ["I decided it"])
    links_in_text: list[str] = URL_EXTRACTOR.find_urls(query)
    query = re.sub(r"\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*", "", query)
    urls = google_urls(query, links_in_text)
    content = get_url_contents(urls, query)
    return (content, urls)
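A small standalone sketch of the aiohttp + asyncio.gather pattern that fetch_url and fetch_urls use above, with the BeautifulSoup text extraction inlined; the example URLs and the 5-second timeout are illustrative assumptions, and the project's normalizer/sentencizer steps are omitted.

# Hedged sketch: concurrent page fetch + plain-text extraction, assuming only
# aiohttp and beautifulsoup4 are installed. URLs and timeout are placeholders.
import asyncio

import aiohttp
from bs4 import BeautifulSoup

EXAMPLE_URLS = ["https://example.com", "https://example.org"]  # illustrative only


async def fetch_text(session: aiohttp.ClientSession, url: str) -> str:
    # Download one page and strip its markup down to visible text.
    async with session.get(url, timeout=aiohttp.ClientTimeout(total=5)) as response:
        html = await response.text()
    return BeautifulSoup(html, "html.parser").get_text()


async def fetch_all(urls: list[str]) -> list[str]:
    # One shared session; gather() runs the downloads concurrently.
    async with aiohttp.ClientSession() as session:
        return await asyncio.gather(*(fetch_text(session, u) for u in urls))


if __name__ == "__main__":
    for text in asyncio.run(fetch_all(EXAMPLE_URLS)):
        print(text[:200])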
@@ -1,82 +1,95 @@
# mypy: ignore-errors
# checks if sentence is relevant to other sentence
from typing import List
from typing import Any

import concurrent.futures
import pickle

import nltk
import numpy as np
import spacy
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Load the English language model
NLP = spacy.load("en_core_web_sm")
from pathlib import Path
# from scipy.spatial.distance import jaccard
from sklearn.feature_extraction.text import TfidfVectorizer

CACHE_FILE_PATH: str = "./is_relevant_cache.pkl"
nltk.download("punkt")
nltk.download("stopwords")
nltk.download("wordnet")
nltk.download("omw-1.4")

try:
    with open(CACHE_FILE_PATH, "rb") as f:
        cache = pickle.load(f)
except (OSError, EOFError):
    cache = {}
nlp = spacy.load("en_core_web_sm")  # Load the English language model
lemmatizer = WordNetLemmatizer()  # Initialize the WordNet lemmatizer
stop_words = set(stopwords.words("english"))  # Get the English stop words


def is_relevant(sentence: str, question: str) -> bool:
    global NLP
    def jaccard(u: Any, v: Any) -> Any:
        # Pad the shorter array with zeros at the end
        u = np.pad(u, (0, max(u.shape[0], v.shape[0]) - u.shape[0]), "constant")
        v = np.pad(v, (0, max(u.shape[0], v.shape[0]) - v.shape[0]), "constant")
        # Calculate the Jaccard similarity
        nonzero = np.bitwise_or(u != 0, v != 0)
        intersection = np.bitwise_and(u != 0, v != 0)
        return 1.0 - float(np.count_nonzero(intersection)) / float(
            np.count_nonzero(nonzero)
        )

    cache_key = (sentence, question)
    if cache_key in cache:
        relevant: bool = cache[cache_key]
        return relevant
    # Process the sentence and question
    doc_sentence = NLP(sentence)
    doc_question = NLP(question)

    # Extract the named entities and important words or phrases from the sentence
    sentence_important = {
        token.text
        for token in doc_sentence
        if token.pos_ in ["NOUN", "PROPN", "ADJ"] or token.ent_type_ != ""
    }
    question_important = {
        token.text
        for token in doc_question
        if token.pos_ in ["NOUN", "PROPN", "ADJ"] or token.ent_type_ != ""
    }
def is_answer(sentence: str, question: str, threshold: float = 0.3) -> bool:
    # Tokenize the sentence and the question
    sentence_tokens = word_tokenize(sentence)
    question_tokens = word_tokenize(question)
    # Remove stop words from the sentence and the question
    sentence_tokens = [
        token for token in sentence_tokens if token.lower() not in stop_words
    ]
    question_tokens = [
        token for token in question_tokens if token.lower() not in stop_words
    ]
    # Perform lemmatization on the sentence and the question
    sentence_tokens = [lemmatizer.lemmatize(token.lower()) for token in sentence_tokens]
    question_tokens = [lemmatizer.lemmatize(token.lower()) for token in question_tokens]
    # Extract the main verb from the question
    main_verb = None
    for token in question_tokens:
        if nlp(token)[0].pos_ == "VERB":
            main_verb = token
            break
    # Generate numerical representations of the sentence and the question using TF-IDF
    vectorizer = TfidfVectorizer()
    sentence_vector = vectorizer.fit_transform([sentence]).toarray()[0]
    question_vector = vectorizer.fit_transform([question]).toarray()[0]
    # Calculate the similarity between the sentence and the question
    similarity = 1 - jaccard(sentence_vector, question_vector)
    # Check if the sentence answers the question
    answer: bool
    if main_verb is None:
        answer = similarity >= threshold
        return answer
    else:
        answer = main_verb in sentence_tokens and similarity >= threshold
        return answer

    # Check if any of the named entities or important words or phrases in the question are in the sentence
    for token in question_important:
        if token in sentence_important:
            cache[cache_key] = True
            with open(CACHE_FILE_PATH, "wb") as f:
                pickle.dump(cache, f)
            return True

    # Check if the sentence contains any negative words
    for token in doc_sentence:
        if token.pos_ == "ADV" and token.dep_ == "neg":
            cache[cache_key] = False
            with open(CACHE_FILE_PATH, "wb") as f:
                pickle.dump(cache, f)
            return False
# # Test the is_answer function
# sentence = "Neil Armstrong was the first person to walk on the Moon."
# question = "Who was the first person to walk on the Moon?"
# if is_answer(sentence, question):
#     print("The sentence answers the question.")
# else:
#     print("The sentence does not answer the question.")

    cache[cache_key] = False
    with open(CACHE_FILE_PATH, "wb") as f:
        pickle.dump(cache, f)
    return False
# from concurrent.futures import ThreadPoolExecutor
# import concurrent.futures


def filter_irrelevant(sentences: list[str], question: str) -> list[str]:
    # Create a list to store the relevant sentences
    relevant_sentences = []
    with concurrent.futures.ThreadPoolExecutor() as executor:
        futures = [
            executor.submit(is_relevant, sentence, question) for sentence in sentences
        ]
        for future, sentence in zip(
            concurrent.futures.as_completed(futures), sentences
        ):
            if future.result():
                relevant_sentences.append(sentence)
    for sentence in sentences:
        if is_answer(sentence, question):
            relevant_sentences.append(sentence)
            print(sentence)
    return relevant_sentences


# print(filter_irrelevant(["jeff bezos died", "jeff is stupid", "jeff bezos is an entrepenur"], "who is jeff bezos"))
# print(filter_irrelevant_(["Neil Armstrong is an American Astronaut", "Neil Armstrong is dead", "Neil Armstrng is fake"], "Who is Neil Armstrong?"))
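A small self-contained sketch of the TF-IDF plus Jaccard check that is_answer performs above; unlike the code in the hunk it fits a single TfidfVectorizer on both texts so the two vectors share one vocabulary, which is a simplification chosen here so the overlap is meaningful, and the 0.3 threshold is likewise only illustrative.

# Hedged sketch of the relevance check: treat the nonzero TF-IDF terms of the
# sentence and the question as sets and compare their Jaccard overlap.
# The shared-vocabulary fit and the 0.3 threshold are illustrative choices.
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer


def jaccard_similarity(u: np.ndarray, v: np.ndarray) -> float:
    # |terms in both| / |terms in either|, computed over the nonzero entries.
    union = np.count_nonzero(np.bitwise_or(u != 0, v != 0))
    intersection = np.count_nonzero(np.bitwise_and(u != 0, v != 0))
    return intersection / union if union else 0.0


def looks_relevant(sentence: str, question: str, threshold: float = 0.3) -> bool:
    vectorizer = TfidfVectorizer()
    # Fit once on both texts so the vectors are directly comparable.
    matrix = vectorizer.fit_transform([sentence, question]).toarray()
    return jaccard_similarity(matrix[0], matrix[1]) >= threshold


print(looks_relevant("Neil Armstrong was the first person to walk on the Moon.",
                     "Who was the first person to walk on the Moon?"))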
@@ -1,11 +1,7 @@
import datasets
# type: ignore
from typing import Any

import datasets

CoQA: Any = datasets.load_dataset("coqa")
DATASET: List[Any] = []

def coqa():
    global CoQA, DATASET
    for story in CoQA["train"]:
        for question, answer in story["questions"], story["answers"]:
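The hunk ends mid-loop. Below is a hedged sketch of one way the CoQA question/answer pairs could be iterated with the datasets library; the zip pairing and the nested "input_text" field are assumptions taken from the Hugging Face "coqa" dataset card, not something this commit specifies.

# Hedged sketch, not the commit's implementation: pair each CoQA question with
# its answer text. Field names ("story", "questions", "answers", "input_text")
# are assumed from the Hugging Face "coqa" dataset card.
from typing import Any

import datasets

coqa_data: Any = datasets.load_dataset("coqa")
dataset: list[tuple[str, str, str]] = []

for story in coqa_data["train"]:
    for question, answer in zip(story["questions"], story["answers"]["input_text"]):
        # Keep (context, question, answer) triples for later QA training/eval.
        dataset.append((story["story"], question, answer))

print(dataset[0])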
@@ -1,5 +1,6 @@
import transformers
import torch
import multiprocessing as mp

import accelerate
import datasets
import multiprocessing as mp
import torch
import transformers
@@ -2491,6 +2491,33 @@ dev = ["autoflake (>=1.3.1,<2.0.0)", "flake8 (>=3.8.3,<4.0.0)", "pre-commit (>=2
doc = ["mdx-include (>=1.4.1,<2.0.0)", "mkdocs (>=1.1.2,<2.0.0)", "mkdocs-material (>=8.1.4,<9.0.0)"]
test = ["black (>=22.3.0,<23.0.0)", "coverage (>=5.2,<6.0)", "isort (>=5.0.6,<6.0.0)", "mypy (==0.910)", "pytest (>=4.4.0,<5.4.0)", "pytest-cov (>=2.10.0,<3.0.0)", "pytest-sugar (>=0.9.4,<0.10.0)", "pytest-xdist (>=1.32.0,<2.0.0)", "shellingham (>=1.3.0,<2.0.0)"]

[[package]]
name = "types-requests"
version = "2.28.11.7"
description = "Typing stubs for requests"
category = "main"
optional = false
python-versions = "*"
files = [
    {file = "types-requests-2.28.11.7.tar.gz", hash = "sha256:0ae38633734990d019b80f5463dfa164ebd3581998ac8435f526da6fe4d598c3"},
    {file = "types_requests-2.28.11.7-py3-none-any.whl", hash = "sha256:b6a2fca8109f4fdba33052f11ed86102bddb2338519e1827387137fefc66a98b"},
]

[package.dependencies]
types-urllib3 = "<1.27"

[[package]]
name = "types-urllib3"
version = "1.26.25.4"
description = "Typing stubs for urllib3"
category = "main"
optional = false
python-versions = "*"
files = [
    {file = "types-urllib3-1.26.25.4.tar.gz", hash = "sha256:eec5556428eec862b1ac578fb69aab3877995a99ffec9e5a12cf7fbd0cc9daee"},
    {file = "types_urllib3-1.26.25.4-py3-none-any.whl", hash = "sha256:ed6b9e8a8be488796f72306889a06a3fc3cb1aa99af02ab8afb50144d7317e49"},
]

[[package]]
name = "typing-extensions"
version = "4.4.0"
@@ -2867,4 +2894,4 @@ testing = ["flake8 (<5)", "func-timeout", "jaraco.functools", "jaraco.itertools"
[metadata]
lock-version = "2.0"
python-versions = "^3.10"
content-hash = "429ce050fd9e14f457f545b675da882677fcb5d8e955475cb4d41e92e704f526"
content-hash = "e8eeff06f176dabc6da74969404d8ec8983b9db6d11c6c7ff34ee151bae422e9"
@@ -46,6 +46,8 @@ timm = "^0.6.12"
torchvision = "^0.14.1"
torchaudio = "^0.13.1"
python-dotenv = "^0.21.0"
requests = "^2.28.1"
types-requests = "^2.28.11.7"

[tool.poetry.group.dev.dependencies]
bandit = "^1.7.4"