# internet_ml/internet_ml/tools/NLP/is_relevant.py
# mypy: ignore-errors
# checks if sentence is relevant to other sentence
from typing import List
from pathlib import Path
import concurrent.futures
import pickle

import spacy

# Load the English language model once at import time (expensive, ~seconds).
NLP = spacy.load("en_core_web_sm")

# On-disk memoization cache: maps (sentence, question) -> bool.
CACHE_FILE_PATH: str = "./is_relevant_cache.pkl"
try:
    with open(CACHE_FILE_PATH, "rb") as f:
        cache = pickle.load(f)
except (OSError, EOFError, pickle.UnpicklingError):
    # Missing, truncated, or corrupt cache file: start fresh rather than crash.
    cache = {}
def _persist_cache() -> None:
    # Write-through: dump the full in-memory cache to disk after each update.
    with open(CACHE_FILE_PATH, "wb") as f:
        pickle.dump(cache, f)


def is_relevant(sentence: str, question: str) -> bool:
    """Return True if *sentence* shares an important word or named entity
    with *question*.

    "Important" tokens are nouns, proper nouns, adjectives, and any token
    that is part of a named entity.  Results are memoized in the
    module-level ``cache`` and persisted to ``CACHE_FILE_PATH``.
    """
    cache_key = (sentence, question)
    if cache_key in cache:
        relevant: bool = cache[cache_key]
        return relevant

    doc_sentence = NLP(sentence)
    doc_question = NLP(question)

    def _important(doc) -> set:
        # Nouns, proper nouns, adjectives, plus any named-entity token.
        return {
            token.text
            for token in doc
            if token.pos_ in ("NOUN", "PROPN", "ADJ") or token.ent_type_ != ""
        }

    # Relevant iff any important token appears in both texts.
    # NOTE: the original also scanned the sentence for negation adverbs, but
    # both that branch and the fall-through returned False — dead logic with
    # no effect on the result, so it is dropped here.
    relevant = not _important(doc_question).isdisjoint(_important(doc_sentence))
    cache[cache_key] = relevant
    _persist_cache()
    return relevant
def filter_irrelevant(sentences: list[str], question: str) -> list[str]:
    """Return the subset of *sentences* relevant to *question*, in the
    original order.

    Relevance checks run concurrently in a thread pool.

    Bug fixed: the original zipped ``as_completed(futures)`` (completion
    order) against ``sentences`` (submission order), so results could be
    paired with the wrong sentence.  ``futures[i]`` corresponds to
    ``sentences[i]``; zipping the futures list directly keeps that pairing,
    and ``future.result()`` blocks until each check finishes.
    """
    with concurrent.futures.ThreadPoolExecutor() as executor:
        futures = [
            executor.submit(is_relevant, sentence, question)
            for sentence in sentences
        ]
        return [
            sentence
            for future, sentence in zip(futures, sentences)
            if future.result()
        ]
# print(filter_irrelevant(["jeff bezos died", "jeff is stupid", "jeff bezos is an entrepenur"], "who is jeff bezos"))