Hold the Process for normalizing text strings.
Usually used before the text is sent to other processes.
NormalizeProcess
Bases: Process
Generic process for text normalization.
process_id
class-attribute
process_id: str = 'normalize'
language_code
class-attribute
instance-attribute
language_code: Optional[str] = None
algorithm
cached
property
algorithm: Callable[[str], str]
Return the normalization function used for this process.
glottolog_id
class-attribute
instance-attribute
glottolog_id: Optional[str] = None
run
run(input_doc: Doc) -> Doc
Invoke language-appropriate normalization code for text in a given language.
Source code in cltk/text/processes.py
def run(self, input_doc: Doc) -> Doc:
    """Normalize ``input_doc.raw`` and store the result on the document.

    Args:
        input_doc: Document whose ``raw`` text is passed to ``self.algorithm``.

    Returns:
        The same ``input_doc`` object, with ``normalized_text`` assigned.

    Raises:
        ValueError: If ``self.algorithm`` is ``None`` or if
            ``input_doc.raw`` is ``None``.
    """
    logger = bind_from_doc(input_doc)
    logger.debug(f"Running normalization for language: {self.language_code}")

    # Guard: a normalization callable must be available.
    normalizer = self.algorithm
    if normalizer is None:
        missing_algorithm = (
            f"No normalization algorithm found for language '{self.language_code}'"
        )
        logger.error(missing_algorithm)
        raise ValueError(missing_algorithm)

    # Guard: there must be raw text to normalize.
    raw_text = input_doc.raw
    if raw_text is None:
        missing_raw = "input_doc.raw must not be None"
        logger.error(missing_raw)
        raise ValueError(missing_raw)

    input_doc.normalized_text = normalizer(raw_text)

    # Read the value back from the document so the log reflects what was stored;
    # only the first 50 characters are included in the message.
    stored = input_doc.normalized_text
    if stored:
        summary = f"Normalized text: {stored[:50]}..."
    else:
        summary = "Normalized text is empty."
    logger.info(summary)
    return input_doc
|
MultilingualNormalizeProcess
Bases: NormalizeProcess
Text normalization for multiple languages.
process_id
class-attribute
process_id: str = 'normalize'
glottolog_id
class-attribute
instance-attribute
glottolog_id: Optional[str] = None
language_code
class-attribute
instance-attribute
language_code: Optional[str] = None
algorithm
cached
property
algorithm: Callable[[str], str]
Return the normalization function used for this process.
run
run(input_doc: Doc) -> Doc
Invoke language-appropriate normalization code for text in a given language.
Source code in cltk/text/processes.py
def run(self, input_doc: Doc) -> Doc:
    """Normalize ``input_doc.raw`` and store the result on the document.

    Args:
        input_doc: Document whose ``raw`` text is passed to ``self.algorithm``.

    Returns:
        The same ``input_doc`` object, with ``normalized_text`` assigned.

    Raises:
        ValueError: If ``self.algorithm`` is ``None`` or if
            ``input_doc.raw`` is ``None``.
    """
    logger = bind_from_doc(input_doc)
    logger.debug(f"Running normalization for language: {self.language_code}")

    # Guard: a normalization callable must be available.
    normalizer = self.algorithm
    if normalizer is None:
        missing_algorithm = (
            f"No normalization algorithm found for language '{self.language_code}'"
        )
        logger.error(missing_algorithm)
        raise ValueError(missing_algorithm)

    # Guard: there must be raw text to normalize.
    raw_text = input_doc.raw
    if raw_text is None:
        missing_raw = "input_doc.raw must not be None"
        logger.error(missing_raw)
        raise ValueError(missing_raw)

    input_doc.normalized_text = normalizer(raw_text)

    # Read the value back from the document so the log reflects what was stored;
    # only the first 50 characters are included in the message.
    stored = input_doc.normalized_text
    if stored:
        summary = f"Normalized text: {stored[:50]}..."
    else:
        summary = "Normalized text is empty."
    logger.info(summary)
    return input_doc
|