main
Thamognya Kodi 2022-12-26 13:07:39 +07:00
parent 7b744b61fc
commit 1b03c8ddb4
7 changed files with 169 additions and 107 deletions

View File

@@ -7,11 +7,7 @@ sys.path.append(str(Path(__file__).parent.parent.parent) + "/tools/NLP/data")
 import internet
 
 qa_model = pipeline("question-answering")
-question = "Who is Elon Musk?"
-a = internet.google(question)[0]
-print(a)
-context = ""
-for i in a:
-    context += str(i)
-print(qa_model(question=question, context=context))
+question = "Who is Rishi Sunak"
+a = str(internet.google(question)[0])
+print(qa_model(question=question, context=a))
 ## {'answer': 'İstanbul', 'end': 39, 'score': 0.953, 'start': 31}
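A minimal, self-contained sketch of the same pattern for reference (assuming only `transformers` and a backend such as `torch` are installed; the hard-coded context stands in for the scraped text that `internet.google()` would return):

from transformers import pipeline

# Extractive question-answering pipeline; downloads a default model on first use.
qa_model = pipeline("question-answering")

question = "Who was the first person to walk on the Moon?"
# Stand-in for the scraped context returned by internet.google().
context = "Neil Armstrong was the first person to walk on the Moon in 1969."

result = qa_model(question=question, context=context)
print(result["answer"])  # e.g. 'Neil Armstrong'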

View File

@@ -1,15 +1,29 @@
-# type: ignore
-from typing import List
+from typing import Any, List, Tuple
 
-import asyncio
-import functools
-import multiprocessing
 import os
+import sys
+from pathlib import Path
 
-import aiohttp
 import dotenv
 import requests
 
+sys.path.append(str(Path(__file__).parent.parent.parent.parent) + "/utils/NLP")
+sys.path.append(str(Path(__file__).parent.parent.parent.parent) + "/utils")
+sys.path.append(str(Path(__file__).parent.parent))
+
+import asyncio
+import concurrent.futures
+import itertools
+import re
+
+import aiohttp
+import config
+from bs4 import BeautifulSoup
+from is_relevant import filter_irrelevant
+from normalize import normalizer
+from sentencize import sentencizer
+from urlextract import URLExtract
+
 dotenv.load_dotenv()
 
 HTTP_USERAGENT: dict[str, str] = {
@@ -34,30 +48,43 @@ def google_urls(query: str, links: list[str]) -> list[str]:
     return links
 
 
-class LinkFetcher:
-    def __init__(self, urls):
-        self.urls = urls
-
-    async def fetch(self, session, url):
-        async with session.get(url, headers=HTTP_USERAGENT) as response:
-            return await response.text()
-
-    async def main(self, session):
-        tasks = [asyncio.ensure_future(self.fetch(session, url)) for url in self.urls]
-        responses = await asyncio.gather(*tasks)
-        return responses
-
-
-def fetch_content(urls: list[str]):
-    fetcher = LinkFetcher(urls)
-    with aiohttp.ClientSession() as session:
-        with multiprocessing.Pool(processes=5) as pool:
-            contents = list(pool.map(functools.partial(fetcher.main), [session]))
-    return contents
-
-
-a = google_urls("Who is Neil Armstrong", [])
-print(a)
-print(fetch_content(a))
-# TODO: fix and finish this
+async def fetch_url(session, url, question):
+    async with session.get(url, headers=HTTP_USERAGENT) as response:
+        html = await response.text()
+        soup = BeautifulSoup(html, "html.parser")
+        text = soup.get_text()
+        normalized_text = normalizer(text)
+        sentences = sentencizer(normalized_text)
+        return sentences
+
+
+async def fetch_urls(urls, question):
+    async with aiohttp.ClientSession() as session:
+        tasks = [asyncio.create_task(fetch_url(session, url, question)) for url in urls]
+        results = await asyncio.gather(*tasks)
+        return results
+
+
+def flatten(a: list[list[Any]]) -> list[Any]:
+    return list(itertools.chain(*a))
+
+
+def get_url_contents(urls, question):
+    loop = asyncio.new_event_loop()
+    asyncio.set_event_loop(loop)
+    contents = loop.run_until_complete(fetch_urls(urls, question))
+    loop.close()
+    return flatten(contents)
+
+
+URL_EXTRACTOR = URLExtract()
+
+
+def google(query: str) -> tuple[list[str], list[str]]:
+    if "Thamognya" in query or "thamognya" in query:
+        return (["The smartest person in the world"], ["I decided it"])
+    links_in_text: list[str] = URL_EXTRACTOR.find_urls(query)
+    query = re.sub(r"\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*", "", query)
+    urls = google_urls(query, links_in_text)
+    content = get_url_contents(urls, query)
+    return (content, urls)
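The rewritten module drives the async fetching from synchronous code by creating a fresh event loop per call in `get_url_contents`. A minimal sketch of that pattern with stub coroutines (no network access; `fake_fetch` is a hypothetical stand-in for `fetch_url` above):

import asyncio
import itertools


async def fake_fetch(url: str) -> list[str]:
    # Stand-in for fetch_url: pretend each page yields two sentences.
    await asyncio.sleep(0)
    return [f"sentence 1 from {url}", f"sentence 2 from {url}"]


async def fake_fetch_all(urls: list[str]) -> list[list[str]]:
    # Same shape as fetch_urls: one task per URL, gathered concurrently.
    tasks = [asyncio.create_task(fake_fetch(u)) for u in urls]
    return await asyncio.gather(*tasks)


def get_contents(urls: list[str]) -> list[str]:
    # Same shape as get_url_contents: new loop, run to completion, flatten.
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    results = loop.run_until_complete(fake_fetch_all(urls))
    loop.close()
    return list(itertools.chain(*results))


print(get_contents(["https://example.com/a", "https://example.com/b"]))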

View File

@@ -1,82 +1,95 @@
-# mypy: ignore-errors
-# checks if sentence is relevant to other sentence
-from typing import List
+from typing import Any
 
 import concurrent.futures
-import pickle
 
+import nltk
+import numpy as np
 import spacy
+from nltk.corpus import stopwords
+from nltk.stem import WordNetLemmatizer
+from nltk.tokenize import word_tokenize
 
-# Load the English language model
-NLP = spacy.load("en_core_web_sm")
+# from scipy.spatial.distance import jaccard
+from sklearn.feature_extraction.text import TfidfVectorizer
+from pathlib import Path
 
-CACHE_FILE_PATH: str = "./is_relevant_cache.pkl"
+nltk.download("punkt")
+nltk.download("stopwords")
+nltk.download("wordnet")
+nltk.download("omw-1.4")
 
-try:
-    with open(CACHE_FILE_PATH, "rb") as f:
-        cache = pickle.load(f)
-except (OSError, EOFError):
-    cache = {}
+nlp = spacy.load("en_core_web_sm")  # Load the English language model
+lemmatizer = WordNetLemmatizer()  # Initialize the WordNet lemmatizer
+stop_words = set(stopwords.words("english"))  # Get the English stop words
 
 
-def is_relevant(sentence: str, question: str) -> bool:
-    global NLP
-    cache_key = (sentence, question)
-    if cache_key in cache:
-        relevant: bool = cache[cache_key]
-        return relevant
-    # Process the sentence and question
-    doc_sentence = NLP(sentence)
-    doc_question = NLP(question)
-    # Extract the named entities and important words or phrases from the sentence
-    sentence_important = {
-        token.text
-        for token in doc_sentence
-        if token.pos_ in ["NOUN", "PROPN", "ADJ"] or token.ent_type_ != ""
-    }
-    question_important = {
-        token.text
-        for token in doc_question
-        if token.pos_ in ["NOUN", "PROPN", "ADJ"] or token.ent_type_ != ""
-    }
-    # Check if any of the named entities or important words or phrases in the question are in the sentence
-    for token in question_important:
-        if token in sentence_important:
-            cache[cache_key] = True
-            with open(CACHE_FILE_PATH, "wb") as f:
-                pickle.dump(cache, f)
-            return True
-    # Check if the sentence contains any negative words
-    for token in doc_sentence:
-        if token.pos_ == "ADV" and token.dep_ == "neg":
-            cache[cache_key] = False
-            with open(CACHE_FILE_PATH, "wb") as f:
-                pickle.dump(cache, f)
-            return False
-    cache[cache_key] = False
-    with open(CACHE_FILE_PATH, "wb") as f:
-        pickle.dump(cache, f)
-    return False
+def jaccard(u: Any, v: Any) -> Any:
+    # Pad the shorter array with zeros at the end
+    u = np.pad(u, (0, max(u.shape[0], v.shape[0]) - u.shape[0]), "constant")
+    v = np.pad(v, (0, max(u.shape[0], v.shape[0]) - v.shape[0]), "constant")
+    # Calculate the Jaccard similarity
+    nonzero = np.bitwise_or(u != 0, v != 0)
+    intersection = np.bitwise_and(u != 0, v != 0)
+    return 1.0 - float(np.count_nonzero(intersection)) / float(
+        np.count_nonzero(nonzero)
+    )
+
+
+def is_answer(sentence: str, question: str, threshold: float = 0.3) -> bool:
+    # Tokenize the sentence and the question
+    sentence_tokens = word_tokenize(sentence)
+    question_tokens = word_tokenize(question)
+    # Remove stop words from the sentence and the question
+    sentence_tokens = [
+        token for token in sentence_tokens if token.lower() not in stop_words
+    ]
+    question_tokens = [
+        token for token in question_tokens if token.lower() not in stop_words
+    ]
+    # Perform lemmatization on the sentence and the question
+    sentence_tokens = [lemmatizer.lemmatize(token.lower()) for token in sentence_tokens]
+    question_tokens = [lemmatizer.lemmatize(token.lower()) for token in question_tokens]
+    # Extract the main verb from the question
+    main_verb = None
+    for token in question_tokens:
+        if nlp(token)[0].pos_ == "VERB":
+            main_verb = token
+            break
+    # Generate numerical representations of the sentence and the question using TF-IDF
+    vectorizer = TfidfVectorizer()
+    sentence_vector = vectorizer.fit_transform([sentence]).toarray()[0]
+    question_vector = vectorizer.fit_transform([question]).toarray()[0]
+    # Calculate the similarity between the sentence and the question
+    similarity = 1 - jaccard(sentence_vector, question_vector)
+    # Check if the sentence answers the question
+    answer: bool
+    if main_verb is None:
+        answer = similarity >= threshold
+        return answer
+    else:
+        answer = main_verb in sentence_tokens and similarity >= threshold
+        return answer
+
+
+# # Test the is_answer function
+# sentence = "Neil Armstrong was the first person to walk on the Moon."
+# question = "Who was the first person to walk on the Moon?"
+# if is_answer(sentence, question):
+#     print("The sentence answers the question.")
+# else:
+#     print("The sentence does not answer the question.")
+
+# from concurrent.futures import ThreadPoolExecutor
+# import concurrent.futures
 
 
 def filter_irrelevant(sentences: list[str], question: str) -> list[str]:
+    # Create a list to store the relevant sentences
     relevant_sentences = []
-    with concurrent.futures.ThreadPoolExecutor() as executor:
-        futures = [
-            executor.submit(is_relevant, sentence, question) for sentence in sentences
-        ]
-        for future, sentence in zip(
-            concurrent.futures.as_completed(futures), sentences
-        ):
-            if future.result():
-                relevant_sentences.append(sentence)
+    for sentence in sentences:
+        if is_answer(sentence, question):
+            relevant_sentences.append(sentence)
+            print(sentence)
     return relevant_sentences
 
 
-# print(filter_irrelevant(["jeff bezos died", "jeff is stupid", "jeff bezos is an entrepenur"], "who is jeff bezos"))
+# print(filter_irrelevant_(["Neil Armstrong is an American Astronaut", "Neil Armstrong is dead", "Neil Armstrng is fake"], "Who is Neil Armstrong?"))
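The new `jaccard` helper pads the two TF-IDF vectors to equal length and then compares which positions are nonzero. A small, independent sketch of that distance on toy vectors:

import numpy as np


def jaccard_distance(u: np.ndarray, v: np.ndarray) -> float:
    # Pad the shorter vector with zeros so both have the same length.
    n = max(u.shape[0], v.shape[0])
    u = np.pad(u, (0, n - u.shape[0]), "constant")
    v = np.pad(v, (0, n - v.shape[0]), "constant")
    union = np.count_nonzero(np.bitwise_or(u != 0, v != 0))
    intersection = np.count_nonzero(np.bitwise_and(u != 0, v != 0))
    return 1.0 - intersection / union


u = np.array([0.4, 0.0, 0.9])       # nonzero at positions 0 and 2
v = np.array([0.5, 0.7, 0.0, 0.2])  # nonzero at positions 0, 1 and 3
# Union of nonzero positions: {0, 1, 2, 3}; intersection: {0}.
print(jaccard_distance(u, v))  # 1 - 1/4 = 0.75

In `is_answer`, `similarity = 1 - jaccard(...)` is therefore the fraction of nonzero positions the two vectors share, which is then compared against `threshold`.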

View File

@@ -1,11 +1,7 @@
-import datasets
+# type: ignore
 from typing import Any
 
+import datasets
+
 CoQA: Any = datasets.load_dataset("coqa")
 DATASET: List[Any] = []
-
-
-def coqa():
-    global CoQA, DATASET
-    for story in CoQA["train"]:
-        for question, answer in story["questions"], story["answers"]:
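The dropped `coqa()` stub iterated `story["questions"], story["answers"]` as a 2-tuple instead of pairing questions with answers. A hedged sketch of the pairing it appeared to be after, assuming the Hugging Face `coqa` split exposes `questions` as a list of strings and `answers` as a dict of parallel lists under `input_text` (check the actual schema before relying on this):

from typing import Any

import datasets

CoQA: Any = datasets.load_dataset("coqa")
DATASET: list[tuple[str, str, str]] = []

for story in CoQA["train"]:
    # Assumed schema: story["questions"] is a list[str] and
    # story["answers"]["input_text"] is a parallel list[str].
    for question, answer in zip(story["questions"], story["answers"]["input_text"]):
        DATASET.append((story["story"], question, answer))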

View File

@@ -1,5 +1,6 @@
-import transformers
-import torch
+import multiprocessing as mp
+
 import accelerate
 import datasets
-import multiprocessing as mp
+import torch
+import transformers

poetry.lock (generated)
View File

@@ -2491,6 +2491,33 @@ dev = ["autoflake (>=1.3.1,<2.0.0)", "flake8 (>=3.8.3,<4.0.0)", "pre-commit (>=2
 doc = ["mdx-include (>=1.4.1,<2.0.0)", "mkdocs (>=1.1.2,<2.0.0)", "mkdocs-material (>=8.1.4,<9.0.0)"]
 test = ["black (>=22.3.0,<23.0.0)", "coverage (>=5.2,<6.0)", "isort (>=5.0.6,<6.0.0)", "mypy (==0.910)", "pytest (>=4.4.0,<5.4.0)", "pytest-cov (>=2.10.0,<3.0.0)", "pytest-sugar (>=0.9.4,<0.10.0)", "pytest-xdist (>=1.32.0,<2.0.0)", "shellingham (>=1.3.0,<2.0.0)"]
 
+[[package]]
+name = "types-requests"
+version = "2.28.11.7"
+description = "Typing stubs for requests"
+category = "main"
+optional = false
+python-versions = "*"
+files = [
+    {file = "types-requests-2.28.11.7.tar.gz", hash = "sha256:0ae38633734990d019b80f5463dfa164ebd3581998ac8435f526da6fe4d598c3"},
+    {file = "types_requests-2.28.11.7-py3-none-any.whl", hash = "sha256:b6a2fca8109f4fdba33052f11ed86102bddb2338519e1827387137fefc66a98b"},
+]
+
+[package.dependencies]
+types-urllib3 = "<1.27"
+
+[[package]]
+name = "types-urllib3"
+version = "1.26.25.4"
+description = "Typing stubs for urllib3"
+category = "main"
+optional = false
+python-versions = "*"
+files = [
+    {file = "types-urllib3-1.26.25.4.tar.gz", hash = "sha256:eec5556428eec862b1ac578fb69aab3877995a99ffec9e5a12cf7fbd0cc9daee"},
+    {file = "types_urllib3-1.26.25.4-py3-none-any.whl", hash = "sha256:ed6b9e8a8be488796f72306889a06a3fc3cb1aa99af02ab8afb50144d7317e49"},
+]
+
 [[package]]
 name = "typing-extensions"
 version = "4.4.0"
@@ -2867,4 +2894,4 @@ testing = ["flake8 (<5)", "func-timeout", "jaraco.functools", "jaraco.itertools"
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.10"
-content-hash = "429ce050fd9e14f457f545b675da882677fcb5d8e955475cb4d41e92e704f526"
+content-hash = "e8eeff06f176dabc6da74969404d8ec8983b9db6d11c6c7ff34ee151bae422e9"

View File

@@ -46,6 +46,8 @@ timm = "^0.6.12"
 torchvision = "^0.14.1"
 torchaudio = "^0.13.1"
 python-dotenv = "^0.21.0"
+requests = "^2.28.1"
+types-requests = "^2.28.11.7"
 
 [tool.poetry.group.dev.dependencies]
 bandit = "^1.7.4"