import concurrent.futures
import string
import sys
from pathlib import Path

import contractions
import tokenizers
from tokenizers.normalizers import NFKD, Lowercase, Strip, StripAccents

# Add utils directory to path
sys.path.append(str(Path(__file__).parent.parent.parent) + "/utils/NLP")
sys.path.append(str(Path(__file__).parent.parent.parent) + "/utils")

# NOTE(review): presumably resolved through the paths appended above, which is
# why this import cannot move to the top of the file -- confirm.
import config  # noqa: E402

# Define normalization sequence
NORMALIZER_SEQ: tokenizers.normalizers.Sequence = tokenizers.normalizers.Sequence(
    [Lowercase(), NFKD(), Strip(), StripAccents()]
)

# Translation table that deletes every character in ``string.punctuation``.
# Built once at import time instead of on every ``normalizer`` call.
_PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation)

# Translation table that turns line breaks, tabs, quotes and backslashes into
# a single space each.
_TO_SPACE_TABLE = str.maketrans({char: " " for char in "\n\t\r'\"\\"})


def remove_non_ascii(string: str) -> str:
    """Return *string* with every non-ASCII character dropped.

    Args:
        string (str): Text to filter. (The parameter name shadows the
            ``string`` module inside this function only; it is kept so that
            keyword callers continue to work.)

    Returns:
        str: The input with all characters outside the ASCII range removed.
    """
    return string.encode("ascii", errors="ignore").decode()


def normalizer(text: str) -> str:
    """Normalize input text.

    The steps are applied in this order:

    1. Expand contractions with ``contractions.fix``.
    2. Delete every character in ``string.punctuation``.
    3. Apply ``NORMALIZER_SEQ`` (lowercase, NFKD, strip, strip accents).
    4. Replace a literal backslash followed by ``x``, and any remaining line
       break, tab, quote or backslash, with a space.
    5. Drop any character that is not ASCII.

    Args:
        text (str): Input text to normalize.

    Returns:
        str: Normalized text.
    """
    # ``contractions.fix`` returns the expanded text; the result must be kept,
    # and it must run before punctuation removal deletes the apostrophes.
    text = contractions.fix(text)

    # Remove punctuation
    text = text.translate(_PUNCTUATION_TABLE)

    # Normalize string
    text = NORMALIZER_SEQ.normalize_str(text)

    # NFKD can reintroduce ASCII quotes/backslashes from compatibility forms,
    # so this clean-up runs after normalization. The two-character sequence
    # backslash + "x" is collapsed first so that it yields one space.
    # Earlier revisions also chained replacements for backslash + r/f/a and
    # similar after all backslashes were already gone; those could never
    # match and are omitted here.
    text = text.replace("\\x", " ").translate(_TO_SPACE_TABLE)

    return remove_non_ascii(text)


def normalize_sentences(sentences: list[str]) -> list[str]:
    """Normalize every sentence and return the non-empty results.

    Sentences are normalized with ``normalizer`` on a thread pool. Results are
    returned in the same order as the input; any sentence whose normalized
    form is an empty string is dropped, so the output may be shorter than the
    input.

    Args:
        sentences (list[str]): Sentences to normalize.

    Returns:
        list[str]: Normalized, non-empty sentences in input order.
    """
    with concurrent.futures.ThreadPoolExecutor() as executor:
        # ``executor.map`` yields results in submission order, unlike
        # ``as_completed``, so each result lines up with its input sentence.
        normalized = list(executor.map(normalizer, sentences))
    return [text for text in normalized if text]