# Source code for cltk.alphabet.processes
"""This module holds the ``Process`` for normalizing text strings, usually
before the text is sent to other processes.
"""
from dataclasses import dataclass
from boltons.cacheutils import cachedproperty # type: ignore
from cltk.alphabet.grc.grc import normalize_grc
from cltk.alphabet.lat import normalize_lat
from cltk.core.data_types import Doc, Process
@dataclass
class NormalizeProcess(Process):
    """Generic process for text normalization.

    The normalizer is chosen from ``language``: ``"grc"`` selects
    ``normalize_grc`` and ``"lat"`` selects ``normalize_lat``.
    """

    language: str = None

    @cachedproperty
    def algorithm(self):
        """Return the normalization callable matching ``self.language``.

        Raises:
            ValueError: If ``self.language`` is neither ``"grc"`` nor
                ``"lat"``.
        """
        if self.language == "grc":
            return normalize_grc
        if self.language == "lat":
            return normalize_lat
        # Fail loudly here: falling through would return ``None`` and only
        # surface later in ``run`` as "'NoneType' object is not callable".
        raise ValueError(
            f"No text normalization algorithm for language "
            f"{self.language!r}; expected 'grc' or 'lat'."
        )

    def run(self, input_doc: Doc) -> Doc:
        """Normalize the raw text of ``input_doc``.

        Applies ``self.algorithm`` to ``input_doc.raw`` and stores the result
        in ``input_doc.normalized_text``. The document is modified in place.

        Args:
            input_doc: Document whose ``raw`` attribute is to be normalized.

        Returns:
            The same ``input_doc`` object, with ``normalized_text`` set.

        Raises:
            ValueError: If no normalizer exists for ``self.language``.
        """
        input_doc.normalized_text = self.algorithm(input_doc.raw)
        return input_doc
@dataclass
class GreekNormalizeProcess(NormalizeProcess):
    """Text normalization for Ancient Greek.

    ``language`` is declared as an annotated dataclass field so that the
    generated ``__init__`` defaults it to ``"grc"``. A bare, unannotated
    class attribute is not a dataclass field, so the inherited default of
    ``None`` would still be assigned on instantiation.

    >>> from cltk.core.data_types import Doc
    >>> from cltk.languages.example_texts import get_example_text
    >>> lang = "grc"
    >>> orig_text = get_example_text(lang)
    >>> non_normed_doc = Doc(raw=orig_text)
    >>> normalize_proc = GreekNormalizeProcess(language=lang)
    >>> normalized_doc = normalize_proc.run(input_doc=non_normed_doc)
    >>> normalized_doc is non_normed_doc
    True
    >>> isinstance(normalized_doc.normalized_text, str)
    True
    >>> GreekNormalizeProcess().language
    'grc'
    """

    language: str = "grc"
@dataclass
class LatinNormalizeProcess(NormalizeProcess):
    """Text normalization for Latin.

    ``language`` is declared as an annotated dataclass field so that the
    generated ``__init__`` defaults it to ``"lat"``. A bare, unannotated
    class attribute is not a dataclass field, so the inherited default of
    ``None`` would still be assigned on instantiation.

    >>> from cltk.core.data_types import Doc
    >>> from cltk.languages.example_texts import get_example_text
    >>> lang = "lat"
    >>> orig_text = get_example_text(lang)
    >>> non_normed_doc = Doc(raw=orig_text)
    >>> normalize_proc = LatinNormalizeProcess(language=lang)
    >>> normalized_doc = normalize_proc.run(input_doc=non_normed_doc)
    >>> normalized_doc is non_normed_doc
    True
    >>> isinstance(normalized_doc.normalized_text, str)
    True
    >>> LatinNormalizeProcess().language
    'lat'
    """

    language: str = "lat"