internet_ml/internet_ml/tools/NLP/normalize.py

import concurrent.futures
import string
import sys
from pathlib import Path
import contractions
import tokenizers
from tokenizers.normalizers import NFKD, Lowercase, Strip, StripAccents
# Add utils directories to the import path
sys.path.append(str(Path(__file__).parent.parent.parent / "utils" / "NLP"))
sys.path.append(str(Path(__file__).parent.parent.parent / "utils"))
import config
# Define normalization sequence
NORMALIZER_SEQ: tokenizers.normalizers.Sequence = tokenizers.normalizers.Sequence(
    [Lowercase(), NFKD(), Strip(), StripAccents()]
)
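# Illustrative example (my own, not from the source): the sequence lowercases,
# applies NFKD, strips edge whitespace, then removes accents, so
# NORMALIZER_SEQ.normalize_str("  Café  ") should yield "cafe".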
def remove_non_ascii(text: str) -> str:
    """Drop any characters that cannot be encoded as ASCII."""
    return text.encode("ascii", errors="ignore").decode()
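# Illustrative example (my own): remove_non_ascii("naïve café") -> "nave caf";
# non-ASCII code points are dropped outright rather than transliterated, which
# is presumably why the pipeline strips accents before calling this.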
def normalizer(text: str) -> str:
    """Normalize input text.

    Args:
        text (str): Input text to normalize.

    Returns:
        str: Normalized text.
    """
    # Expand contractions (e.g. "don't" -> "do not") before punctuation is
    # stripped; contractions.fix returns the expanded string
    text = contractions.fix(text)
    # Remove punctuation
    text = text.translate(str.maketrans("", "", string.punctuation))
    # Lowercase, NFKD-normalize, strip edge whitespace, and drop accents
    text = NORMALIZER_SEQ.normalize_str(text)
    # Replace whitespace escapes, quotes, and literal backslash sequences with
    # spaces; the bare "\\" rule also covers remnants such as "\r" or "\/a",
    # so only "\x" needs its own rule, applied before the bare backslash one
    text = (
        text.replace("\n", " ")
        .replace("\t", " ")
        .replace("\r", " ")
        .replace("'", " ")
        .replace("\\x", " ")
        .replace('"', " ")
        .replace("\\", " ")
    )
    # Collapse the double spaces introduced by the replacements above
    while "  " in text:
        text = text.replace("  ", " ")
    text = remove_non_ascii(text)
    return text
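# Illustrative example (my own, not from the source): normalizer("Don't SHOUT!\n")
# should come out roughly as "do not shout": contractions expanded, punctuation
# stripped, case folded, accents and extra whitespace removed.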
def normalize_sentences(sentences: list[str]) -> list[str]:
    """Normalize each sentence concurrently, preserving input order."""
    with concurrent.futures.ThreadPoolExecutor() as executor:
        # executor.map fans the work out across threads and yields the
        # normalized strings in the same order as the input
        results = list(executor.map(normalizer, sentences))
    # Keep only sentences that normalized to a non-empty string
    return [result for result in results if result]
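
# Minimal smoke test (a sketch of my own, not part of the original module;
# assumes `contractions` and `tokenizers` are installed and that `config`
# resolves via the sys.path entries above):
if __name__ == "__main__":
    samples = ["I'm testing this!", "  Crème brûlée isn't bad.  "]
    print(normalize_sentences(samples))
    # Expected output, roughly: ['i am testing this', 'creme brulee is not bad']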