Source code for cltk.text.processes

"""Processes for processing text."""

from copy import copy
from dataclasses import dataclass

from boltons.cacheutils import cachedproperty

from cltk.core import Doc, Process
from cltk.text.non import OldNorsePunctuationRemover


@dataclass
class PunctuationRemovalProcess(Process):
    """Base process that strips punctuation words out of a document.

    Concrete subclasses provide ``algorithm``: a callable that receives a
    word and returns a truthy value when that word must be dropped.
    """

    def run(self, input_doc: Doc) -> Doc:
        """Return a copy of ``input_doc`` whose ``words`` exclude punctuation.

        The document is duplicated with ``copy.copy`` and the duplicate's
        ``words`` attribute is rebound to a newly built list, so the
        ``words`` attribute of ``input_doc`` itself is never reassigned here.

        Args:
            input_doc: Document whose ``words`` are to be filtered.

        Returns:
            The copied document holding only the words that ``algorithm``
            did not flag.
        """
        # The algorithm is a predicate: truthy means "remove this word".
        should_drop = self.algorithm
        result = copy(input_doc)

        kept_words = []
        for token in result.words:
            if should_drop(token):
                continue
            kept_words.append(token)

        result.words = kept_words
        return result
class DefaultPunctuationRemovalProcess(PunctuationRemovalProcess):
    """Punctuation removal process backed by ``DefaultPunctuationRemover``."""

    description: str = "Default punctuation removal algorithm"

    @cachedproperty
    def algorithm(self) -> "DefaultPunctuationRemover":
        """Return the remover that decides which words are punctuation.

        ``cachedproperty`` evaluates this once per instance and reuses the
        result on later accesses. The return annotation is written as a
        string because ``DefaultPunctuationRemover`` is defined further down
        in this module and is not yet bound when this class body executes.
        """
        return DefaultPunctuationRemover()
# Strings treated as punctuation by DefaultPunctuationRemover.
DEFAULT_PUNCTUATION: list[str] = [
    ".",
    ",",
    ";",
    ":",
    '"',
    "'",
    "!",
    "?",
]


class DefaultPunctuationRemover:
    """Callable predicate that recognises default punctuation words.

    Calling an instance with a word is the same as calling ``filter`` on it.
    """

    def __init__(self):
        """Create a remover; there is nothing to configure."""

    def __call__(self, word) -> bool:
        """Delegate to ``filter`` so the instance can be used as a function."""
        return self.filter(word)

    def __repr__(self) -> str:
        return "<DefaultPunctuationRemover>"

    def filter(self, word) -> bool:
        """Tell whether ``word`` is a punctuation mark.

        Args:
            word: Object exposing a ``string`` attribute.

        Returns:
            ``True`` when ``word.string`` is an element of
            ``DEFAULT_PUNCTUATION``, ``False`` otherwise.
        """
        token_text = word.string
        return token_text in DEFAULT_PUNCTUATION
class OldNorsePunctuationRemovalProcess(PunctuationRemovalProcess):
    """Punctuation removal process backed by ``OldNorsePunctuationRemover``."""

    description: str = "Default Old Norse punctuation removal algorithm"

    @cachedproperty
    def algorithm(self) -> OldNorsePunctuationRemover:
        """Return the Old Norse remover used to flag punctuation words.

        ``cachedproperty`` evaluates this once per instance and reuses the
        result on later accesses.
        """
        return OldNorsePunctuationRemover()