Source code for cltk.text.processes
"""Processes for processing text."""
from copy import copy
from dataclasses import dataclass
from boltons.cacheutils import cachedproperty
from cltk.core import Doc, Process
from cltk.text.non import OldNorsePunctuationRemover
@dataclass
class PunctuationRemovalProcess(Process):
    """Base process that drops punctuation words from a ``Doc``.

    Subclasses provide ``algorithm``: a callable that receives a word and
    returns a truthy value when that word must be removed.
    """

    def run(self, input_doc: Doc) -> Doc:
        """Return a copy of ``input_doc`` without the words flagged by ``algorithm``.

        Args:
            input_doc: Document whose ``words`` are to be filtered.

        Returns:
            A shallow copy of ``input_doc`` whose ``words`` attribute is a new
            list holding only the words for which ``self.algorithm(word)`` is
            falsy. The ``words`` list of ``input_doc`` itself is not modified.
        """
        # ``self.algorithm`` is the remover callable (a word -> bool predicate),
        # not a process, so it is deliberately not annotated as one.
        is_punctuation = self.algorithm
        output_doc = copy(input_doc)
        # NOTE(review): assumes ``words`` is already populated (iterable) at
        # this point in the pipeline -- confirm against the callers.
        output_doc.words = [
            word for word in output_doc.words if not is_punctuation(word)
        ]
        return output_doc
class DefaultPunctuationRemovalProcess(PunctuationRemovalProcess):
    """Punctuation removal process backed by ``DefaultPunctuationRemover``."""

    description: str = "Default punctuation removal algorithm"

    @cachedproperty
    def algorithm(self) -> "DefaultPunctuationRemover":
        """Return the ``DefaultPunctuationRemover`` this process applies.

        The return annotation is a string because ``DefaultPunctuationRemover``
        is defined further down in this module and is not yet bound when this
        method is created.
        """
        return DefaultPunctuationRemover()
# Strings that ``DefaultPunctuationRemover.filter`` treats as punctuation:
# a word is flagged when its ``string`` attribute equals one of these.
DEFAULT_PUNCTUATION: list[str] = [
    ".",
    ",",
    ";",
    ":",
    '"',
    "'",
    "!",
    "?",
]
class DefaultPunctuationRemover:
    """Callable predicate that recognises punctuation words.

    Calling an instance with a word returns ``True`` when the word's
    ``string`` attribute is listed in ``DEFAULT_PUNCTUATION`` and ``False``
    otherwise. The class holds no state, so no ``__init__`` is needed.
    """

    def filter(self, word) -> bool:
        """Tell whether ``word`` is a punctuation mark.

        Args:
            word: Object exposing a ``string`` attribute.

        Returns:
            ``True`` if ``word.string`` is one of ``DEFAULT_PUNCTUATION``,
            ``False`` otherwise.
        """
        return word.string in DEFAULT_PUNCTUATION

    def __repr__(self) -> str:
        # Plain literal: there is nothing to interpolate, so no f-string.
        return "<DefaultPunctuationRemover>"

    def __call__(self, word) -> bool:
        """Delegate to ``filter`` so the instance can be used as a function."""
        return self.filter(word)
class OldNorsePunctuationRemovalProcess(PunctuationRemovalProcess):
    """Punctuation removal process that uses ``OldNorsePunctuationRemover``."""

    description: str = "Default Old Norse punctuation removal algorithm"

    @cachedproperty
    def algorithm(self) -> OldNorsePunctuationRemover:
        """Build the ``OldNorsePunctuationRemover`` this process applies."""
        remover = OldNorsePunctuationRemover()
        return remover