48 lines
1.4 KiB
Python
48 lines
1.4 KiB
Python
from typing import Any, List
|
|
|
|
import sys
|
|
from pathlib import Path
|
|
|
|
# Make the project's helper directories importable. They live three directory
# levels above this file, under ``utils`` and ``utils/NLP``. This has to run
# before any import that is resolved through these directories (presumably
# the ``config`` import further down -- confirm).
# The paths are joined with pathlib instead of concatenating "/"-separated
# strings, so the entries use the platform's native separator.
_BASE_DIR = Path(__file__).parent.parent.parent
sys.path.append(str(_BASE_DIR / "utils" / "NLP"))
sys.path.append(str(_BASE_DIR / "utils"))
|
|
import concurrent.futures
|
|
|
|
import config
|
|
import nltk
|
|
|
|
# Fetch the NLTK "words" corpus at import time (quiet=True suppresses the
# downloader's console output). This must run before the corpus is read
# below, otherwise the lookup raises on a machine without the data.
nltk.download("words", quiet=True)
|
|
|
|
# Vocabulary used by convert_to_english() for membership tests; built once as
# a set so each lookup is O(1).
# NOTE(review): entries keep the corpus's original casing, while
# convert_to_english() looks up ``w.lower()``. If the corpus contains
# capitalised-only entries they can never match -- confirm this is intended.
ENGLISH_WORDS: set[str] = set(nltk.corpus.words.words())
|
|
|
|
|
|
def convert_to_english(text: str) -> str:
    """Return *text* with alphabetic tokens not found in ENGLISH_WORDS removed.

    The text is split with ``nltk.wordpunct_tokenize``. A token is kept when
    it is not purely alphabetic (numbers, punctuation, ...) or when its
    lowercased form is in ``ENGLISH_WORDS``. The surviving tokens are joined
    with single spaces, so the original spacing is not preserved.

    Args:
        text: The string to filter.

    Returns:
        The kept tokens joined by ``" "``; an empty string if none survive.
    """
    kept_tokens = []
    for token in nltk.wordpunct_tokenize(text):
        # Non-alphabetic tokens always pass through; alphabetic ones must be
        # in the vocabulary (compared case-insensitively via lower()).
        if not token.isalpha() or token.lower() in ENGLISH_WORDS:
            kept_tokens.append(token)
    return " ".join(kept_tokens)
|
|
|
|
|
|
def sentencizer(text: str) -> list[str]:
    """Split *text* into sentences and filter each through convert_to_english.

    Sentences are produced by ``nltk.tokenize.sent_tokenize`` and each one is
    passed to ``convert_to_english`` on a thread pool.

    Args:
        text: The raw text to split and filter.

    Returns:
        One filtered string per detected sentence, in the same order as the
        sentences appear in *text*. A sentence whose tokens are all filtered
        out is returned as an empty string rather than dropped.

    Raises:
        Any exception raised by ``convert_to_english`` for a sentence is
        re-raised here when its result is collected.
    """
    sentences: list[str] = nltk.tokenize.sent_tokenize(text)

    with concurrent.futures.ThreadPoolExecutor(max_workers=500) as executor:
        # executor.map yields results in input order. Collecting futures with
        # as_completed would return them in completion order and scramble the
        # sentence sequence.
        return list(executor.map(convert_to_english, sentences))
|
|
|
|
|
|
# print(sentencizer("hello gdfjsfkjd. i amf dfjdslf the greatest efe ve every"))
|