# internet_ml/internet_ml/tools/NLP/is_relevant.py
# mypy: ignore-errors
# checks if sentence is relevant to other sentence
from typing import List
from pathlib import Path
import concurrent.futures
import pickle

import spacy

# Load the English language model once at import time (expensive, ~seconds).
NLP = spacy.load("en_core_web_sm")

# On-disk memoization cache: maps (sentence, question) -> bool.
CACHE_FILE_PATH: str = "./is_relevant_cache.pkl"
try:
    with open(CACHE_FILE_PATH, "rb") as f:
        cache = pickle.load(f)
except (OSError, EOFError, pickle.UnpicklingError):
    # Missing, truncated, or corrupt cache file: start fresh rather than crash.
    cache = {}
def _persist_cache() -> None:
    # Write-through: dump the full in-memory cache to disk after each update.
    with open(CACHE_FILE_PATH, "wb") as f:
        pickle.dump(cache, f)


def is_relevant(sentence: str, question: str) -> bool:
    """Return True if *sentence* shares an important word or named entity
    with *question*.

    "Important" tokens are nouns, proper nouns, adjectives, and any token
    that is part of a named entity.  Results are memoized in the
    module-level ``cache`` and persisted to ``CACHE_FILE_PATH``.
    """
    cache_key = (sentence, question)
    if cache_key in cache:
        relevant: bool = cache[cache_key]
        return relevant

    doc_sentence = NLP(sentence)
    doc_question = NLP(question)

    def _important(doc) -> set:
        # Nouns, proper nouns, adjectives, plus any named-entity token.
        return {
            token.text
            for token in doc
            if token.pos_ in ("NOUN", "PROPN", "ADJ") or token.ent_type_ != ""
        }

    # Relevant iff any important token appears in both texts.
    # NOTE: the original also scanned the sentence for negation adverbs, but
    # both that branch and the fall-through returned False — dead logic with
    # no effect on the result, so it is dropped here.
    relevant = not _important(doc_question).isdisjoint(_important(doc_sentence))
    cache[cache_key] = relevant
    _persist_cache()
    return relevant
def filter_irrelevant(sentences: list[str], question: str) -> list[str]:
    """Return the subset of *sentences* relevant to *question*, in the
    original order.

    Relevance checks run concurrently in a thread pool.

    Bug fixed: the original zipped ``as_completed(futures)`` (completion
    order) against ``sentences`` (submission order), so results could be
    paired with the wrong sentence.  ``futures[i]`` corresponds to
    ``sentences[i]``; zipping the futures list directly keeps that pairing,
    and ``future.result()`` blocks until each check finishes.
    """
    with concurrent.futures.ThreadPoolExecutor() as executor:
        futures = [
            executor.submit(is_relevant, sentence, question)
            for sentence in sentences
        ]
        return [
            sentence
            for future, sentence in zip(futures, sentences)
            if future.result()
        ]
# print(filter_irrelevant(["jeff bezos died", "jeff is stupid", "jeff bezos is an entrepenur"], "who is jeff bezos"))