# Source code for cltk.text.processes

"""

"""

from copy import deepcopy
from dataclasses import dataclass

from boltons.cacheutils import cachedproperty

from cltk.core import Doc, Process
from cltk.text.non import OldNorsePunctuationRemover


@dataclass
class PunctuationRemovalProcess(Process):
    """Base process that drops punctuation words from a document.

    ``run`` reads ``self.algorithm`` and uses it as a predicate: it is called
    with each word and a truthy result means the word is removed. The
    subclasses in this module provide ``algorithm``.
    """

    def run(self, input_doc: Doc) -> Doc:
        """Return a copy of ``input_doc`` without its punctuation words.

        Args:
            input_doc: Document whose ``words`` are to be filtered.

        Returns:
            A deep copy of ``input_doc`` whose ``words`` list keeps only the
            words for which ``self.algorithm`` returns a falsy value.
        """
        punctuation_remover = self.algorithm
        # Filter a deep copy so the caller's document is left unmodified.
        output_doc = deepcopy(input_doc)
        output_doc.words = [
            word for word in output_doc.words if not punctuation_remover(word)
        ]
        return output_doc
class DefaultPunctuationRemovalProcess(PunctuationRemovalProcess):
    """Punctuation removal process that uses ``DefaultPunctuationRemover``."""

    description = "Default punctuation removal algorithm"

    @cachedproperty
    def algorithm(self):
        """Return a ``DefaultPunctuationRemover`` instance."""
        return DefaultPunctuationRemover()
# Token strings that ``DefaultPunctuationRemover`` treats as punctuation.
DEFAULT_PUNCTUATION = [".", ",", ";", ":", '"', "'", "!", "?"]


class DefaultPunctuationRemover:
    """Callable predicate that recognises the ``DEFAULT_PUNCTUATION`` tokens.

    Calling an instance with a word delegates to :meth:`filter`.
    """

    def filter(self, word):
        """Tell whether ``word`` is a punctuation token.

        Args:
            word: Object exposing a ``string`` attribute with the token text.

        Returns:
            ``True`` if ``word.string`` is listed in ``DEFAULT_PUNCTUATION``,
            ``False`` otherwise.
        """
        return word.string in DEFAULT_PUNCTUATION

    def __repr__(self):
        # Plain literal: there is nothing to interpolate, so no f-string.
        return "<DefaultPunctuationRemover>"

    def __call__(self, word):
        """Apply :meth:`filter` to ``word``."""
        return self.filter(word)
class OldNorsePunctuationRemovalProcess(PunctuationRemovalProcess):
    """Punctuation removal process that uses ``OldNorsePunctuationRemover``."""

    description = "Default Old Norse punctuation removal algorithm"

    @cachedproperty
    def algorithm(self):
        """Return an ``OldNorsePunctuationRemover`` instance."""
        return OldNorsePunctuationRemover()