Source code for cltk.lemmatize.processes

"""Processes for lemmatization."""

from copy import copy, deepcopy
from dataclasses import dataclass

from boltons.cacheutils import cachedproperty

from cltk.core.data_types import Doc, Process
from cltk.lemmatize.ang import OldEnglishDictionaryLemmatizer
from cltk.lemmatize.fro import OldFrenchDictionaryLemmatizer
from cltk.lemmatize.grc import GreekBackoffLemmatizer
from cltk.lemmatize.lat import LatinBackoffLemmatizer


@dataclass
class LemmatizationProcess(Process):
    """To be inherited for each language's lemmatization declarations.

    Subclasses are expected to provide an ``algorithm`` attribute: a callable
    that is given a word's string and whose return value is stored as that
    word's lemma.

    Example: ``LemmatizationProcess`` -> ``LatinLemmatizationProcess``

    >>> from cltk.lemmatize.processes import LemmatizationProcess
    >>> from cltk.core.data_types import Process
    >>> issubclass(LemmatizationProcess, Process)
    True
    """

    def run(self, input_doc: Doc) -> Doc:
        """Return a lemmatized copy of ``input_doc``.

        Args:
            input_doc: The document whose ``words`` are to be lemmatized.

        Returns:
            A new ``Doc`` in which every word's ``lemma`` has been set to the
            result of calling ``self.algorithm`` on the word's ``string``.
        """
        lemmatizer = self.algorithm
        # A deep copy is required here: a shallow copy would share the very
        # same word objects with ``input_doc``, so assigning ``word.lemma``
        # below would silently modify the caller's document as well.
        output_doc = deepcopy(input_doc)
        for word in output_doc.words:
            word.lemma = lemmatizer(word.string)
        return output_doc
@dataclass
class GreekLemmatizationProcess(LemmatizationProcess):
    """The default Ancient Greek lemmatization algorithm.

    >>> from cltk.core.data_types import Process, Pipeline
    >>> from cltk.tokenizers import MultilingualTokenizationProcess
    >>> from cltk.languages.utils import get_lang
    >>> from cltk.languages.example_texts import get_example_text
    >>> from cltk.nlp import NLP
    >>> pipe = Pipeline(description="A custom Greek pipeline",
    ...     processes=[MultilingualTokenizationProcess, GreekLemmatizationProcess],
    ...     language=get_lang("grc"))
    >>> nlp = NLP(language='grc', custom_pipeline=pipe, suppress_banner=True)
    >>> nlp(get_example_text("grc")).lemmata[30:40]
    ['ἔλεγον.', 'καίτοι', 'ἀληθές', 'γε', 'ὡς', 'ἔπος', 'εἰπεῖν', 'οὐδὲν', 'εἰρήκασιν.', 'μάλιστα']
    """

    # Left unannotated on purpose: an annotated attribute on a dataclass
    # would become an ``__init__`` field rather than a plain class attribute.
    description = "Lemmatization process for Ancient Greek"

    @cachedproperty
    def algorithm(self) -> GreekBackoffLemmatizer:
        """Return a ``GreekBackoffLemmatizer`` instance (wrapped in ``cachedproperty``)."""
        return GreekBackoffLemmatizer()
@dataclass
class LatinLemmatizationProcess(LemmatizationProcess):
    """The default Latin lemmatization algorithm.

    >>> from cltk.core.data_types import Process, Pipeline
    >>> from cltk.tokenizers import LatinTokenizationProcess
    >>> from cltk.languages.utils import get_lang
    >>> from cltk.languages.example_texts import get_example_text
    >>> from cltk.nlp import NLP
    >>> pipe = Pipeline(description="A custom Latin pipeline",
    ...     processes=[LatinTokenizationProcess, LatinLemmatizationProcess],
    ...     language=get_lang("lat"))
    >>> nlp = NLP(language='lat', custom_pipeline=pipe, suppress_banner=True)
    >>> nlp(get_example_text("lat")).lemmata[30:40]
    ['institutis', ',', 'legibus', 'inter', 'se', 'differunt', '.', 'Gallos', 'ab', 'Aquitanis']
    """

    # Left unannotated on purpose: an annotated attribute on a dataclass
    # would become an ``__init__`` field rather than a plain class attribute.
    description = "Lemmatization process for Latin"

    @cachedproperty
    def algorithm(self) -> LatinBackoffLemmatizer:
        """Return a ``LatinBackoffLemmatizer`` instance (wrapped in ``cachedproperty``)."""
        return LatinBackoffLemmatizer()
@dataclass
class OldEnglishLemmatizationProcess(LemmatizationProcess):
    """Lemmatization process backed by the Old English dictionary lemmatizer.

    >>> from cltk.core.data_types import Process, Pipeline
    >>> from cltk.tokenizers import MultilingualTokenizationProcess
    >>> from cltk.languages.utils import get_lang
    >>> from cltk.languages.example_texts import get_example_text
    >>> from cltk.nlp import NLP
    >>> pipe = Pipeline(description="A custom Old English pipeline",
    ...     processes=[MultilingualTokenizationProcess, OldEnglishLemmatizationProcess],
    ...     language=get_lang("ang"))
    >>> nlp = NLP(language='ang', custom_pipeline=pipe, suppress_banner=True)
    >>> nlp(get_example_text("ang")).lemmata[30:40]
    ['siððan', 'ær', 'weorþan', 'feasceaft', 'findan', ',', 'he', 'se', 'frofre', 'gebidan']
    """

    # Plain (unannotated) class attribute, so it is not a dataclass field.
    description = "Lemmatization process for Old English"

    @cachedproperty
    def algorithm(self):
        """Return an ``OldEnglishDictionaryLemmatizer`` instance."""
        dictionary_lemmatizer = OldEnglishDictionaryLemmatizer()
        return dictionary_lemmatizer
@dataclass
class OldFrenchLemmatizationProcess(LemmatizationProcess):
    """Lemmatization process backed by the Old French dictionary lemmatizer.

    >>> from cltk.core.data_types import Process, Pipeline
    >>> from cltk.tokenizers import MultilingualTokenizationProcess
    >>> from cltk.languages.utils import get_lang
    >>> from cltk.languages.example_texts import get_example_text
    >>> from cltk.nlp import NLP
    >>> pipe = Pipeline(description="A custom Old French pipeline",
    ...     processes=[MultilingualTokenizationProcess, OldFrenchLemmatizationProcess],
    ...     language=get_lang("fro"))
    >>> nlp = NLP(language='fro', custom_pipeline=pipe, suppress_banner=True)
    >>> nlp(get_example_text("fro")).lemmata[30:40]
    ['avenir', 'jadis', 'en', 'bretaingne', 'avoir', '.I.', 'molt', 'riche', 'chevalier', 'PUNK']
    """

    # Plain (unannotated) class attribute, so it is not a dataclass field.
    description = "Lemmatization process for Old French"

    @cachedproperty
    def algorithm(self):
        """Return an ``OldFrenchDictionaryLemmatizer`` instance."""
        dictionary_lemmatizer = OldFrenchDictionaryLemmatizer()
        return dictionary_lemmatizer