73 lines
2.0 KiB
Python
73 lines
2.0 KiB
Python
import concurrent.futures
|
|
import string
|
|
import sys
|
|
from pathlib import Path
|
|
|
|
import contractions
|
|
import tokenizers
|
|
from tokenizers.normalizers import NFKD, Lowercase, Strip, StripAccents
|
|
|
|
# Make the project's utility directories importable. The root is the
# directory three levels above this file; the `config` import that follows
# presumably resolves from one of these directories (verify against the
# repository layout).
_PROJECT_ROOT = str(Path(__file__).parent.parent.parent)
sys.path.append(_PROJECT_ROOT + "/utils/NLP")
sys.path.append(_PROJECT_ROOT + "/utils")
|
|
import config
|
|
|
|
# Shared normalization pipeline. The normalizers run in list order:
# lowercase, NFKD decomposition, whitespace stripping, then accent removal.
NORMALIZER_SEQ: tokenizers.normalizers.Sequence = tokenizers.normalizers.Sequence(
    [
        Lowercase(),
        NFKD(),
        Strip(),
        StripAccents(),
    ]
)
|
|
|
|
|
|
def remove_non_ascii(string: str) -> str:
    """Return ``string`` with every non-ASCII character dropped.

    Args:
        string (str): Text that may contain non-ASCII characters.

    Returns:
        str: The input with only its ASCII characters, in original order.
    """
    # Encoding with errors="ignore" silently discards anything outside ASCII.
    ascii_bytes = string.encode("ascii", errors="ignore")
    return ascii_bytes.decode()
|
|
|
|
|
|
def normalizer(text: str) -> str:
    """Normalize input text.

    The steps run in this order: expand contractions, delete ASCII
    punctuation, apply ``NORMALIZER_SEQ``, replace leftover line breaks,
    tabs, quotes and backslashes with spaces, and finally drop every
    non-ASCII character.

    Args:
        text (str): Input text to normalize.

    Returns:
        str: Normalized text.
    """
    # Expand contractions. This must happen before punctuation removal
    # (apostrophes are still present) and the result must be kept:
    # contractions.fix returns a new string rather than mutating its input.
    text = contractions.fix(text)
    # Remove punctuation
    text = text.translate(str.maketrans("", "", string.punctuation))
    # Normalize string
    text = NORMALIZER_SEQ.normalize_str(text)
    # Replace remaining whitespace controls, quotes and backslashes with a
    # space. ASCII quotes/backslashes were already deleted above, so this
    # only catches ones introduced by normalization (e.g. NFKD compatibility
    # mappings). The "\\x" replace has to precede the bare-backslash replace
    # so that the two-character sequence collapses to a single space.
    # NOTE(review): the original chain also replaced "\\r", "\\f", "\\a",
    # "\/a", "\/f" and "\/b" after all backslashes were gone (never matching)
    # and ended with a single-space -> single-space replace (a no-op); those
    # were dropped. If collapsing repeated spaces was intended, add it here.
    text = (
        text.replace("\n", " ")
        .replace("\t", " ")
        .replace("\r", " ")
        .replace("'", " ")
        .replace("\\x", " ")
        .replace('"', " ")
        .replace("\\", " ")
    )
    return remove_non_ascii(text)
|
|
|
|
|
|
def normalize_sentences(sentences: list[str]) -> list[str]:
    """Normalize each sentence with ``normalizer`` using a thread pool.

    Args:
        sentences (list[str]): Sentences to normalize.

    Returns:
        list[str]: The normalized sentences in input order. Sentences whose
        normalized form is empty are omitted, so the result may be shorter
        than the input.
    """
    with concurrent.futures.ThreadPoolExecutor() as executor:
        # executor.map yields results in submission order, so every result
        # stays aligned with the sentence that produced it. Pairing
        # as_completed() with the input list would match results to the
        # wrong sentences, because completion order is arbitrary.
        normalized = list(executor.map(normalizer, sentences))
    # Return the normalized text itself (not the raw input), skipping
    # sentences that normalized to an empty string.
    return [sentence for sentence in normalized if sentence]
|