# internet_ml/internet_ml/tools/NLP/sentencize.py

from typing import Any, List

import sys
from pathlib import Path

# Add utils directory to path
sys.path.append(str(Path(__file__).parent.parent.parent) + "/utils/NLP")
sys.path.append(str(Path(__file__).parent.parent.parent) + "/utils")
import concurrent.futures

import config
import nltk

# Download the NLTK "words" corpus when this module is imported; quiet=True
# silences the downloader's console output.
nltk.download("words", quiet=True)

# Set built directly from the corpus word list (no case normalisation is
# applied here). convert_to_english tests membership using the lowercased token.
ENGLISH_WORDS: Any = set(nltk.corpus.words.words())


def convert_to_english(text: str) -> str:
    """Drop alphabetic tokens that are not found in ``ENGLISH_WORDS``.

    The text is split with ``nltk.wordpunct_tokenize``. A token is kept when
    it is not purely alphabetic (digits, punctuation, mixed tokens) or when
    its lowercased form is a member of ``ENGLISH_WORDS``. The surviving
    tokens are joined back together with single spaces.

    Args:
        text: The text to filter.

    Returns:
        The kept tokens, in their original order, separated by single spaces.
    """
    kept_tokens: list[str] = []
    for token in nltk.wordpunct_tokenize(text):
        # Non-alphabetic tokens always pass; alphabetic ones must be known words.
        if not token.isalpha() or token.lower() in ENGLISH_WORDS:
            kept_tokens.append(token)
    return " ".join(kept_tokens)


def sentencizer(text: str) -> list[str]:
    """Split *text* into sentences and filter each one with convert_to_english.

    Sentences are detected with ``nltk.tokenize.sent_tokenize`` and each one
    is passed through ``convert_to_english`` on a thread pool.

    Args:
        text: The text to split and filter.

    Returns:
        One filtered string per detected sentence, in the same order in which
        the sentences appear in *text*. An empty list when no sentence is
        detected.
    """
    initial_sentences: list[str] = nltk.tokenize.sent_tokenize(text)
    if not initial_sentences:
        # Nothing to process; avoid creating a thread pool for no work.
        return []

    with concurrent.futures.ThreadPoolExecutor(max_workers=500) as executor:
        # executor.map yields results in the order of its input, so the output
        # keeps the original sentence order. Collecting futures with
        # as_completed would instead return them in whatever order the worker
        # threads happen to finish, scrambling the text.
        return list(executor.map(convert_to_english, initial_sentences))


# print(sentencizer("hello gdfjsfkjd. i amf dfjdslf the greatest efe ve every"))