83 lines
2.5 KiB
Python
83 lines
2.5 KiB
Python
# mypy: ignore-errors
|
|
# Checks whether a sentence is relevant to a question.
|
|
import concurrent.futures
import pickle
import threading
from typing import List

import spacy
|
|
|
|
# Load spaCy's `en_core_web_sm` English pipeline once, at import time.
# is_relevant() reuses this module-level instance to parse both the sentence
# and the question on every uncached call.
NLP = spacy.load("en_core_web_sm")
|
|
from pathlib import Path
|
|
|
|
# Location of the pickled result cache, relative to the current working
# directory of the process that imports this module.
CACHE_FILE_PATH: str = "./is_relevant_cache.pkl"

# In-memory cache mapping (sentence, question) -> relevance verdict. It is
# seeded from CACHE_FILE_PATH at import time when that file is usable.
cache: dict[tuple[str, str], bool]

# NOTE(security): pickle.load() can execute arbitrary code embedded in the
# file. Only keep this cache in a location that untrusted parties cannot
# write to.
try:
    with open(CACHE_FILE_PATH, "rb") as f:
        cache = pickle.load(f)
except (OSError, EOFError, pickle.UnpicklingError):
    # Missing, unreadable, truncated or corrupt cache file: start empty
    # instead of failing the import.
    cache = {}
else:
    # A file that unpickles to something other than a dict cannot serve as
    # the cache; discard it rather than crash on first use.
    if not isinstance(cache, dict):
        cache = {}
|
|
|
|
|
|
# Guards every update of `cache` and of the pickle file behind it.
# filter_irrelevant() runs is_relevant() on worker threads; without the lock
# two threads could rewrite the file at the same time, or one could be
# pickling the dict while another inserts into it.
_CACHE_LOCK = threading.Lock()

# Part-of-speech tags whose tokens count as "important" content words.
_IMPORTANT_POS = frozenset({"NOUN", "PROPN", "ADJ"})


def _important_words(doc) -> set:
    """Return the texts of nouns, proper nouns, adjectives and entity tokens in *doc*."""
    return {
        token.text
        for token in doc
        if token.pos_ in _IMPORTANT_POS or token.ent_type_ != ""
    }


def _store_result(cache_key, relevant: bool) -> bool:
    """Record *relevant* under *cache_key*, rewrite the cache file, return *relevant*."""
    with _CACHE_LOCK:
        cache[cache_key] = relevant
        with open(CACHE_FILE_PATH, "wb") as f:
            pickle.dump(cache, f)
    return relevant


def is_relevant(sentence: str, question: str) -> bool:
    """Decide whether *sentence* is relevant to *question*.

    The sentence is relevant when it shares at least one "important" token
    with the question: a noun, proper noun, adjective, or any token that is
    part of a named entity. Tokens are compared by exact text, so the match
    is case-sensitive.

    Results are memoised in the module-level ``cache`` and the whole cache
    is re-pickled to ``CACHE_FILE_PATH`` after every newly computed result.

    Args:
        sentence: Candidate sentence to test.
        question: Question the sentence is tested against.

    Returns:
        True if the two texts share an important token, otherwise False.

    Raises:
        OSError: If the cache file cannot be written.
    """
    cache_key = (sentence, question)
    try:
        return cache[cache_key]
    except KeyError:
        pass  # Not computed yet; fall through to the spaCy analysis.

    sentence_important = _important_words(NLP(sentence))
    question_important = _important_words(NLP(question))

    # One shared important word is enough. An earlier scan for negation
    # adverbs returned False exactly like the no-overlap fall-through, so it
    # had no effect on the result and is not repeated here.
    relevant = not question_important.isdisjoint(sentence_important)
    return _store_result(cache_key, relevant)
|
|
|
|
|
|
def filter_irrelevant(sentences: list[str], question: str) -> list[str]:
    """Return the sentences that is_relevant() judges relevant to *question*.

    Each sentence is checked on a thread pool; the result keeps the input
    order of the sentences.

    Args:
        sentences: Candidate sentences to filter.
        question: Question each sentence is tested against.

    Returns:
        The relevant sentences, in their original order.
    """
    if not sentences:
        # Nothing to check; avoid spinning up a thread pool.
        return []

    with concurrent.futures.ThreadPoolExecutor() as executor:
        futures = [
            executor.submit(is_relevant, sentence, question)
            for sentence in sentences
        ]
        # Pair every sentence with the future that was submitted for it.
        # Iterating as_completed() here would yield futures in completion
        # order and attach one sentence's verdict to another sentence.
        return [
            sentence
            for sentence, future in zip(sentences, futures)
            if future.result()
        ]
|
|
|
|
|
|
# print(filter_irrelevant(["jeff bezos died", "jeff is stupid", "jeff bezos is an entrepenur"], "who is jeff bezos"))
|