Source code for cltk.lexicon.processes

"""
Processes for dictionary lookup.
"""

from copy import deepcopy
from dataclasses import dataclass

from boltons.cacheutils import cachedproperty

from cltk.core.data_types import Doc, Process
from cltk.core.exceptions import CLTKException
from cltk.lexicon.lat import LatinLewisLexicon
from cltk.lexicon.non import OldNorseZoegaLexicon

__author__ = ["Clément Besnier <clem@clementbesnier.fr>"]


[docs]@dataclass class LexiconProcess(Process): """To be inherited for each language's dictionary declarations. Example: ``LexiconProcess`` -> ``LatinLexiconProcess`` >>> from cltk.lexicon.processes import LexiconProcess >>> from cltk.lemmatize.processes import LemmatizationProcess >>> from cltk.core.data_types import Process >>> issubclass(LexiconProcess, Process) True """ language: str = None @cachedproperty def algorithm(self): if self.language == "lat": lex_class = LatinLewisLexicon() else: raise CLTKException(f"No lookup algorithm for language '{self.language}'.") return lex_class
[docs] def run(self, input_doc: Doc) -> Doc: lookup_algo = self.algorithm output_doc = deepcopy(input_doc) for word in output_doc.words: if self.language == "lat": word.definition = lookup_algo.lookup(word.lemma) elif self.language == "non": word.definition = lookup_algo.lookup(word.string) else: raise CLTKException( f"``LexiconProcess()`` not available for language '{self.language}' This should never happen." ) return output_doc
[docs]class LatinLexiconProcess(LexiconProcess): """The default Latin dictionary lookup algorithm. >>> from cltk.lexicon.processes import LexiconProcess >>> from cltk.core.data_types import Process, Pipeline >>> from cltk.tokenizers import LatinTokenizationProcess >>> from cltk.lemmatize.processes import LatinLemmatizationProcess >>> from cltk.languages.utils import get_lang >>> from cltk.languages.example_texts import get_example_text >>> from cltk.nlp import NLP >>> pipe = Pipeline(description="A custom Latin pipeline", \ processes=[LatinTokenizationProcess, LatinLemmatizationProcess, LatinLexiconProcess], \ language=get_lang("lat")) >>> nlp = NLP(language='lat', custom_pipeline=pipe, suppress_banner=True) >>> cltk_doc = nlp.analyze(text=get_example_text("lat")) >>> [word.definition[:10] for word in cltk_doc.words][:5] ['', 'est\\n\\n\\n see', 'omnis e (o', '', 'in old in'] """ description = "Dictionary lookup process for Latin" language = "lat" @cachedproperty def algorithm(self): return LatinLewisLexicon()
[docs]class OldNorseLexiconProcess(LexiconProcess): """The default Latin dictionary lookup algorithm. >>> from cltk.core.data_types import Process, Pipeline >>> from cltk.tokenizers import OldNorseTokenizationProcess >>> from cltk.languages.utils import get_lang >>> from cltk.languages.example_texts import get_example_text >>> from cltk.nlp import NLP >>> pipe = Pipeline(description="A custom Old Norse pipeline", \ processes=[OldNorseTokenizationProcess, OldNorseLexiconProcess], \ language=get_lang("non")) >>> nlp = NLP(language='non', custom_pipeline=pipe, suppress_banner=True) >>> cltk_doc = nlp.analyze(text=get_example_text("non")) #>>> [word.definition[:10] for word in cltk_doc.words][:5] # TODO check this #['', '(-s, -ar),', '', 'adv.\n1) th', ''] """ description = "Dictionary lookup process for Old Norse" language = "non" @cachedproperty def algorithm(self): return OldNorseZoegaLexicon()