main
Thamognya Kodi 2022-12-26 13:07:39 +07:00
parent 7b744b61fc
commit 1b03c8ddb4
7 changed files with 169 additions and 107 deletions

View File

@@ -7,11 +7,7 @@ sys.path.append(str(Path(__file__).parent.parent.parent) + "/tools/NLP/data")
import internet
qa_model = pipeline("question-answering")
question = "Who is Elon Musk?"
a = internet.google(question)[0]
print(a)
context = ""
for i in a:
context += str(i)
print(qa_model(question=question, context=context))
question = "Who is Rishi Sunak"
a = str(internet.google(question)[0])
print(qa_model(question=question, context=a))
## example output format: {'answer': 'İstanbul', 'end': 39, 'score': 0.953, 'start': 31}
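For reference, a minimal sketch of how this smoke test is meant to flow, assuming internet.google(query) returns a (sentences, urls) tuple as in the updated internet.py below; joining the sentences into one context string (rather than calling str() on the whole list) is an assumption about the intent, not part of the commit.

from transformers import pipeline

import internet  # local module under tools/NLP/data (sys.path is extended above)

qa_model = pipeline("question-answering")  # default extractive QA pipeline

question = "Who is Rishi Sunak?"
sentences, urls = internet.google(question)    # (content, urls)
context = " ".join(str(s) for s in sentences)  # one context string for the model
print(qa_model(question=question, context=context))
# -> a dict of the form {'answer': ..., 'score': ..., 'start': ..., 'end': ...}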

View File

@@ -1,15 +1,29 @@
# type: ignore
from typing import List
from typing import Any, List, Tuple
import asyncio
import functools
import multiprocessing
import os
import sys
from pathlib import Path
import aiohttp
import dotenv
import requests
sys.path.append(str(Path(__file__).parent.parent.parent.parent) + "/utils/NLP")
sys.path.append(str(Path(__file__).parent.parent.parent.parent) + "/utils")
sys.path.append(str(Path(__file__).parent.parent))
import asyncio
import concurrent.futures
import itertools
import re
import aiohttp
import config
from bs4 import BeautifulSoup
from is_relevant import filter_irrelevant
from normalize import normalizer
from sentencize import sentencizer
from urlextract import URLExtract
dotenv.load_dotenv()
HTTP_USERAGENT: dict[str, str] = {
@@ -34,30 +48,43 @@ def google_urls(query: str, links: list[str]) -> list[str]:
return links
class LinkFetcher:
def __init__(self, urls):
self.urls = urls
async def fetch(self, session, url):
async def fetch_url(session, url, question):
async with session.get(url, headers=HTTP_USERAGENT) as response:
return await response.text()
async def main(self, session):
tasks = [asyncio.ensure_future(self.fetch(session, url)) for url in self.urls]
responses = await asyncio.gather(*tasks)
return responses
html = await response.text()
soup = BeautifulSoup(html, "html.parser")
text = soup.get_text()
normalized_text = normalizer(text)
sentences = sentencizer(normalized_text)
return sentences
def fetch_content(urls: list[str]):
fetcher = LinkFetcher(urls)
with aiohttp.ClientSession() as session:
with multiprocessing.Pool(processes=5) as pool:
contents = list(pool.map(functools.partial(fetcher.main), [session]))
return contents
async def fetch_urls(urls, question):
async with aiohttp.ClientSession() as session:
tasks = [asyncio.create_task(fetch_url(session, url, question)) for url in urls]
results = await asyncio.gather(*tasks)
return results
a = google_urls("Who is Neil Armstrong", [])
print(a)
print(fetch_content(a))
def flatten(a: list[list[Any]]) -> list[Any]:
return list(itertools.chain(*a))
# TODO: fix and finish this
def get_url_contents(urls, question):
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
contents = loop.run_until_complete(fetch_urls(urls, question))
loop.close()
return flatten(contents)
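Regarding the TODO above: a minimal sketch of the same helper written with asyncio.run, which creates and closes the event loop itself; it assumes the fetch_urls and flatten helpers defined in this file.

def get_url_contents(urls: list[str], question: str) -> list[str]:
    # asyncio.run() owns loop setup and teardown, replacing the manual
    # new_event_loop()/set_event_loop()/close() sequence above.
    sentence_lists = asyncio.run(fetch_urls(urls, question))
    return flatten(sentence_lists)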
URL_EXTRACTOR = URLExtract()
def google(query: str) -> tuple[list[str], list[str]]:
if "Thamognya" in query or "thamognya" in query:
return (["The smartest person in the world"], ["I decided it"])
links_in_text: list[str] = URL_EXTRACTOR.find_urls(query)
query = re.sub(r"\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*", "", query)
urls = google_urls(query, links_in_text)
content = get_url_contents(urls, query)
return (content, urls)
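A short usage sketch of the new entry point, assuming the module is run as a script; the query string is a placeholder.

if __name__ == "__main__":
    # google() strips any URLs out of the query, searches for the rest,
    # and returns (sentences scraped from the result pages, source URLs).
    sentences, urls = google("Who is Neil Armstrong?")
    print(urls)
    print(sentences[:5])  # first few normalized, sentencized snippets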

View File

@@ -1,82 +1,95 @@
# mypy: ignore-errors
# checks if a sentence is relevant to another sentence
from typing import List
from typing import Any
import concurrent.futures
import pickle
import nltk
import numpy as np
import spacy
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
# Load the English language model
NLP = spacy.load("en_core_web_sm")
from pathlib import Path
# from scipy.spatial.distance import jaccard
from sklearn.feature_extraction.text import TfidfVectorizer
CACHE_FILE_PATH: str = "./is_relevant_cache.pkl"
nltk.download("punkt")
nltk.download("stopwords")
nltk.download("wordnet")
nltk.download("omw-1.4")
try:
with open(CACHE_FILE_PATH, "rb") as f:
cache = pickle.load(f)
except (OSError, EOFError):
cache = {}
nlp = spacy.load("en_core_web_sm") # Load the English language model
lemmatizer = WordNetLemmatizer() # Initialize the WordNet lemmatizer
stop_words = set(stopwords.words("english")) # Get the English stop words
def is_relevant(sentence: str, question: str) -> bool:
global NLP
def jaccard(u: Any, v: Any) -> Any:
# Pad the shorter array with zeros at the end
u = np.pad(u, (0, max(u.shape[0], v.shape[0]) - u.shape[0]), "constant")
v = np.pad(v, (0, max(u.shape[0], v.shape[0]) - v.shape[0]), "constant")
# Calculate the Jaccard similarity
nonzero = np.bitwise_or(u != 0, v != 0)
intersection = np.bitwise_and(u != 0, v != 0)
return 1.0 - float(np.count_nonzero(intersection)) / float(
np.count_nonzero(nonzero)
)
cache_key = (sentence, question)
if cache_key in cache:
relevant: bool = cache[cache_key]
return relevant
# Process the sentence and question
doc_sentence = NLP(sentence)
doc_question = NLP(question)
# Extract the named entities and important words or phrases from the sentence
sentence_important = {
token.text
for token in doc_sentence
if token.pos_ in ["NOUN", "PROPN", "ADJ"] or token.ent_type_ != ""
}
question_important = {
token.text
for token in doc_question
if token.pos_ in ["NOUN", "PROPN", "ADJ"] or token.ent_type_ != ""
}
def is_answer(sentence: str, question: str, threshold: float = 0.3) -> bool:
# Tokenize the sentence and the question
sentence_tokens = word_tokenize(sentence)
question_tokens = word_tokenize(question)
# Remove stop words from the sentence and the question
sentence_tokens = [
token for token in sentence_tokens if token.lower() not in stop_words
]
question_tokens = [
token for token in question_tokens if token.lower() not in stop_words
]
# Perform lemmatization on the sentence and the question
sentence_tokens = [lemmatizer.lemmatize(token.lower()) for token in sentence_tokens]
question_tokens = [lemmatizer.lemmatize(token.lower()) for token in question_tokens]
# Extract the main verb from the question
main_verb = None
for token in question_tokens:
if nlp(token)[0].pos_ == "VERB":
main_verb = token
break
# Generate numerical representations of the sentence and the question using TF-IDF
vectorizer = TfidfVectorizer()
sentence_vector = vectorizer.fit_transform([sentence]).toarray()[0]
question_vector = vectorizer.fit_transform([question]).toarray()[0]
# Calculate the similarity between the sentence and the question
similarity = 1 - jaccard(sentence_vector, question_vector)
# Check if the sentence answers the question
answer: bool
if main_verb is None:
answer = similarity >= threshold
return answer
else:
answer = main_verb in sentence_tokens and similarity >= threshold
return answer
# Check if any of the named entities or important words or phrases in the question are in the sentence
for token in question_important:
if token in sentence_important:
cache[cache_key] = True
with open(CACHE_FILE_PATH, "wb") as f:
pickle.dump(cache, f)
return True
# Check if the sentence contains any negative words
for token in doc_sentence:
if token.pos_ == "ADV" and token.dep_ == "neg":
cache[cache_key] = False
with open(CACHE_FILE_PATH, "wb") as f:
pickle.dump(cache, f)
return False
# # Test the is_answer function
# sentence = "Neil Armstrong was the first person to walk on the Moon."
# question = "Who was the first person to walk on the Moon?"
# if is_answer(sentence, question):
# print("The sentence answers the question.")
# else:
# print("The sentence does not answer the question.")
cache[cache_key] = False
with open(CACHE_FILE_PATH, "wb") as f:
pickle.dump(cache, f)
return False
# from concurrent.futures import ThreadPoolExecutor
# import concurrent.futures
def filter_irrelevant(sentences: list[str], question: str) -> list[str]:
# Create a list to store the relevant sentences
relevant_sentences = []
with concurrent.futures.ThreadPoolExecutor() as executor:
futures = [
executor.submit(is_relevant, sentence, question) for sentence in sentences
]
for future, sentence in zip(
concurrent.futures.as_completed(futures), sentences
):
if future.result():
for sentence in sentences:
if is_answer(sentence, question):
relevant_sentences.append(sentence)
print(sentence)
return relevant_sentences
# print(filter_irrelevant(["jeff bezos died", "jeff is stupid", "jeff bezos is an entrepenur"], "who is jeff bezos"))
# print(filter_irrelevant_(["Neil Armstrong is an American Astronaut", "Neil Armstrong is dead", "Neil Armstrng is fake"], "Who is Neil Armstrong?"))
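One detail worth flagging in is_answer: fit_transform is called separately on the sentence and the question, so the two vectors are built over different vocabularies and their components do not line up. A minimal sketch of the shared-vocabulary variant, using cosine similarity as an illustrative stand-in for the padded Jaccard measure above:

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


def tfidf_similarity(sentence: str, question: str) -> float:
    # One fit_transform over both texts -> both rows share the same feature columns.
    vectorizer = TfidfVectorizer()
    matrix = vectorizer.fit_transform([sentence, question])
    return float(cosine_similarity(matrix[0], matrix[1])[0][0])


print(tfidf_similarity(
    "Neil Armstrong was the first person to walk on the Moon.",
    "Who was the first person to walk on the Moon?",
))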

View File

@@ -1,11 +1,7 @@
import datasets
# type: ignore
from typing import Any, List
import datasets
CoQA: Any = datasets.load_dataset("coqa")
DATASET: List[Any] = []
def coqa():
global CoQA, DATASET
for story in CoQA["train"]:
for question, answer in zip(story["questions"], story["answers"]):
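As a sketch of where coqa() seems to be heading (the loop body is truncated in this hunk): pairing each question with its answer, assuming the Hugging Face "coqa" split exposes parallel question/answer lists per story; the "input_text" field access is an assumption about the dataset layout, not part of the commit.

from typing import Any

import datasets

CoQA: Any = datasets.load_dataset("coqa")
DATASET: list[tuple[str, str, str]] = []

for story in CoQA["train"]:
    # zip() pairs questions with answers element-wise; the "input_text"
    # key is assumed to hold the answer strings.
    for question, answer in zip(story["questions"], story["answers"]["input_text"]):
        DATASET.append((story["story"], question, answer))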

View File

@@ -1,5 +1,6 @@
import transformers
import torch
import multiprocessing as mp
import accelerate
import datasets
import multiprocessing as mp
import torch
import transformers

poetry.lock (generated)
View File

@@ -2491,6 +2491,33 @@ dev = ["autoflake (>=1.3.1,<2.0.0)", "flake8 (>=3.8.3,<4.0.0)", "pre-commit (>=2
doc = ["mdx-include (>=1.4.1,<2.0.0)", "mkdocs (>=1.1.2,<2.0.0)", "mkdocs-material (>=8.1.4,<9.0.0)"]
test = ["black (>=22.3.0,<23.0.0)", "coverage (>=5.2,<6.0)", "isort (>=5.0.6,<6.0.0)", "mypy (==0.910)", "pytest (>=4.4.0,<5.4.0)", "pytest-cov (>=2.10.0,<3.0.0)", "pytest-sugar (>=0.9.4,<0.10.0)", "pytest-xdist (>=1.32.0,<2.0.0)", "shellingham (>=1.3.0,<2.0.0)"]
[[package]]
name = "types-requests"
version = "2.28.11.7"
description = "Typing stubs for requests"
category = "main"
optional = false
python-versions = "*"
files = [
{file = "types-requests-2.28.11.7.tar.gz", hash = "sha256:0ae38633734990d019b80f5463dfa164ebd3581998ac8435f526da6fe4d598c3"},
{file = "types_requests-2.28.11.7-py3-none-any.whl", hash = "sha256:b6a2fca8109f4fdba33052f11ed86102bddb2338519e1827387137fefc66a98b"},
]
[package.dependencies]
types-urllib3 = "<1.27"
[[package]]
name = "types-urllib3"
version = "1.26.25.4"
description = "Typing stubs for urllib3"
category = "main"
optional = false
python-versions = "*"
files = [
{file = "types-urllib3-1.26.25.4.tar.gz", hash = "sha256:eec5556428eec862b1ac578fb69aab3877995a99ffec9e5a12cf7fbd0cc9daee"},
{file = "types_urllib3-1.26.25.4-py3-none-any.whl", hash = "sha256:ed6b9e8a8be488796f72306889a06a3fc3cb1aa99af02ab8afb50144d7317e49"},
]
[[package]]
name = "typing-extensions"
version = "4.4.0"
@@ -2867,4 +2894,4 @@ testing = ["flake8 (<5)", "func-timeout", "jaraco.functools", "jaraco.itertools"
[metadata]
lock-version = "2.0"
python-versions = "^3.10"
content-hash = "429ce050fd9e14f457f545b675da882677fcb5d8e955475cb4d41e92e704f526"
content-hash = "e8eeff06f176dabc6da74969404d8ec8983b9db6d11c6c7ff34ee151bae422e9"

View File

@@ -46,6 +46,8 @@ timm = "^0.6.12"
torchvision = "^0.14.1"
torchaudio = "^0.13.1"
python-dotenv = "^0.21.0"
requests = "^2.28.1"
types-requests = "^2.28.11.7"
[tool.poetry.group.dev.dependencies]
bandit = "^1.7.4"
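The new requests / types-requests entries pair the runtime HTTP library with its type stubs, so mypy can check call sites like the one sketched below (URL, headers, and timeout are placeholders):

import requests

response: requests.Response = requests.get(
    "https://www.google.com/search",         # placeholder URL
    headers={"User-Agent": "internet-nlp"},  # placeholder header
    timeout=10,
)
print(response.status_code)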