Skip to content

processes

Holds the Process classes for normalizing text strings.

Usually used before the text is sent to other processes.

NormalizeProcess

Bases: Process

Generic process for text normalization.

process_id class-attribute

process_id: str = 'normalize'

language_code class-attribute instance-attribute

language_code: Optional[str] = None

algorithm cached property

algorithm: Callable[[str], str]

Return the normalization function used for this process.

glottolog_id class-attribute instance-attribute

glottolog_id: Optional[str] = None

run

run(input_doc: Doc) -> Doc

Invoke language-appropriate normalization code for text in a given language.

Source code in cltk/text/processes.py
def run(self, input_doc: Doc) -> Doc:
    """Normalize the raw text of ``input_doc`` and store the result on it.

    The callable exposed as ``self.algorithm`` is applied to
    ``input_doc.raw`` and its return value is written to
    ``input_doc.normalized_text``.

    Args:
        input_doc: Document whose ``raw`` text should be normalized.

    Returns:
        The same ``input_doc`` object, with ``normalized_text`` assigned.

    Raises:
        ValueError: If ``self.algorithm`` is ``None`` or if
            ``input_doc.raw`` is ``None``.
    """
    logger = bind_from_doc(input_doc)
    logger.debug(f"Running normalization for language: {self.language_code}")

    # Guard: a normalization callable must be available.
    normalize = self.algorithm
    if normalize is None:
        missing_algorithm = (
            f"No normalization algorithm found for language '{self.language_code}'"
        )
        logger.error(missing_algorithm)
        raise ValueError(missing_algorithm)

    # Guard: there must be raw text to work on.
    raw_text = input_doc.raw
    if raw_text is None:
        missing_raw = "input_doc.raw must not be None"
        logger.error(missing_raw)
        raise ValueError(missing_raw)

    input_doc.normalized_text = normalize(raw_text)

    # Log at most the first 50 characters so long texts do not flood the log.
    if input_doc.normalized_text:
        summary = f"Normalized text: {input_doc.normalized_text[:50]}..."
    else:
        summary = "Normalized text is empty."
    logger.info(summary)
    return input_doc

MultilingualNormalizeProcess

Bases: NormalizeProcess

Text normalization for multiple languages.

process_id class-attribute

process_id: str = 'normalize'

glottolog_id class-attribute instance-attribute

glottolog_id: Optional[str] = None

language_code class-attribute instance-attribute

language_code: Optional[str] = None

algorithm cached property

algorithm: Callable[[str], str]

Return the normalization function used for this process.

run

run(input_doc: Doc) -> Doc

Invoke language-appropriate normalization code for text in a given language.

Source code in cltk/text/processes.py
def run(self, input_doc: Doc) -> Doc:
    """Normalize the raw text of ``input_doc`` and store the result on it.

    The callable exposed as ``self.algorithm`` is applied to
    ``input_doc.raw`` and its return value is written to
    ``input_doc.normalized_text``.

    Args:
        input_doc: Document whose ``raw`` text should be normalized.

    Returns:
        The same ``input_doc`` object, with ``normalized_text`` assigned.

    Raises:
        ValueError: If ``self.algorithm`` is ``None`` or if
            ``input_doc.raw`` is ``None``.
    """
    logger = bind_from_doc(input_doc)
    logger.debug(f"Running normalization for language: {self.language_code}")

    # Guard: a normalization callable must be available.
    normalize = self.algorithm
    if normalize is None:
        missing_algorithm = (
            f"No normalization algorithm found for language '{self.language_code}'"
        )
        logger.error(missing_algorithm)
        raise ValueError(missing_algorithm)

    # Guard: there must be raw text to work on.
    raw_text = input_doc.raw
    if raw_text is None:
        missing_raw = "input_doc.raw must not be None"
        logger.error(missing_raw)
        raise ValueError(missing_raw)

    input_doc.normalized_text = normalize(raw_text)

    # Log at most the first 50 characters so long texts do not flood the log.
    if input_doc.normalized_text:
        summary = f"Normalized text: {input_doc.normalized_text[:50]}..."
    else:
        summary = "Normalized text is empty."
    logger.info(summary)
    return input_doc