from copy import copy
from dataclasses import dataclass

from boltons.cacheutils import cachedproperty
from boltons.strutils import split_punct_ws

from cltk.core.data_types import Doc, Process
from cltk.stops.words import Stops

@dataclass
class StopsProcess(Process):
    """Flag each word of a ``Doc`` as a stop word or not.

    >>> from cltk.core.data_types import Doc, Word
    >>> from cltk.stops.processes import StopsProcess
    >>> from cltk.languages.example_texts import get_example_text
    >>> lang = "lat"
    >>> words = [Word(string=token) for token in split_punct_ws(get_example_text(lang))]
    >>> stops_process = StopsProcess(language=lang)
    >>> output_doc = stops_process.run(Doc(raw=get_example_text(lang), words=words))
    >>> output_doc.words[1].string
    'est'
    >>> output_doc.words[1].stop
    True
    """

    @cachedproperty
    def algorithm(self):
        """Return the stop words for ``self.language``.

        The value comes from ``Stops(iso_code=self.language).get_stopwords()``
        and is cached on the instance by ``cachedproperty``.
        """
        return Stops(iso_code=self.language).get_stopwords()

    def run(self, input_doc: Doc) -> Doc:
        """Set ``Word.stop`` on every word of a copy of ``input_doc``.

        Note this marks a word a stop if there is a match on either
        the inflected form (``Word.string``) or the lemma (``Word.lemma``).

        Args:
            input_doc: Document whose ``words`` are to be checked.

        Returns:
            A shallow copy of ``input_doc`` whose words have ``stop`` set
            to ``True`` or ``False``.
        """
        # NOTE: ``copy`` is shallow, so the ``Word`` objects are shared with
        # ``input_doc`` and their ``stop`` attribute is updated in place.
        output_doc: Doc = copy(input_doc)

        # Build the lookup once so each membership test is O(1) instead of
        # scanning the whole stop list twice per word.
        # Assumes stop words are hashable (strings) -- TODO confirm.
        stop_words = frozenset(self.algorithm)

        for word_obj in output_doc.words:
            word_obj.stop = (
                word_obj.string in stop_words or word_obj.lemma in stop_words
            )
        return output_doc