"""Processes for stemming.
"""
from copy import copy
from dataclasses import dataclass
from typing import Any
import cltk.stem.akk
import cltk.stem.enm
import cltk.stem.fro
import cltk.stem.gmh
import cltk.stem.lat
from cltk.core.data_types import Doc, Process
@dataclass
class StemmingProcess(Process):
    """To be inherited for each language's stemming declarations.

    Example: ``StemmingProcess`` -> ``LatinStemmingProcess``

    ``run`` reads ``self.algorithm`` and calls it with each word's string,
    so a concrete subclass is expected to provide an ``algorithm`` callable
    that maps a word string to its stem.

    >>> from cltk.stem.processes import StemmingProcess
    >>> from cltk.core.data_types import Process
    >>> issubclass(StemmingProcess, Process)
    True
    """

    def run(self, input_doc: Doc) -> Doc:
        """Set ``stem`` on every word of a copy of ``input_doc``.

        :param input_doc: Document whose ``words`` are iterated; each word's
            ``string`` attribute is passed to ``self.algorithm``.
        :return: The copied document, with ``word.stem`` assigned for each
            word.
        """
        stem: Any = self.algorithm
        # NOTE: ``copy`` is a shallow copy. Unless ``Doc`` customizes copying,
        # the word objects are shared with ``input_doc``, so the assigned
        # stems are visible through the input document as well.
        output_doc: Doc = copy(input_doc)
        for word in output_doc.words:
            word.stem = stem(word.string)
        return output_doc
class LatinStemmingProcess(StemmingProcess):
    """The default Latin stemming algorithm.

    >>> from cltk.core.data_types import Process, Pipeline
    >>> from cltk.tokenizers import LatinTokenizationProcess
    >>> from cltk.languages.example_texts import get_example_text
    >>> from cltk.languages.utils import get_lang
    >>> from cltk.nlp import NLP
    >>> pipe = Pipeline(description="A custom Latin pipeline", processes=[LatinTokenizationProcess, LatinStemmingProcess], language=get_lang("lat"))
    >>> nlp = NLP(language='lat', custom_pipeline = pipe, suppress_banner=True)
    >>> nlp(get_example_text("lat")[:23]).stems
    ['Gall', 'est', 'omn', 'divis']
    """

    description = "Default stemmer for the Latin language."

    @staticmethod
    def algorithm(word: str) -> str:
        """Return the stem of ``word`` as computed by ``cltk.stem.lat.stem``."""
        return cltk.stem.lat.stem(word)
class MiddleEnglishStemmingProcess(StemmingProcess):
    """The default Middle English stemming algorithm.

    >>> from cltk.core.data_types import Process, Pipeline
    >>> from cltk.tokenizers import MiddleEnglishTokenizationProcess
    >>> from cltk.languages.example_texts import get_example_text
    >>> from cltk.languages.utils import get_lang
    >>> from cltk.nlp import NLP
    >>> pipe = Pipeline(description="A custom Middle English pipeline", \
    processes=[MiddleEnglishTokenizationProcess, MiddleEnglishStemmingProcess], \
    language=get_lang("enm"))
    >>> nlp = NLP(language='enm', custom_pipeline = pipe, suppress_banner=True)
    >>> nlp(get_example_text("enm")[:29]).stems
    ['Whil', ',', 'as', 'olde', 'stor', 'lle']
    """

    description = "Default stemmer for the Middle English language."

    @staticmethod
    def algorithm(word: str) -> str:
        """Return the stem of ``word`` as computed by ``cltk.stem.enm.stem``."""
        return cltk.stem.enm.stem(word)
class MiddleHighGermanStemmingProcess(StemmingProcess):
    """The default Middle High German stemming algorithm.

    >>> from cltk.core.data_types import Process, Pipeline
    >>> from cltk.tokenizers import MiddleHighGermanTokenizationProcess
    >>> from cltk.languages.example_texts import get_example_text
    >>> from cltk.languages.utils import get_lang
    >>> from cltk.nlp import NLP
    >>> pipe = Pipeline(description="A custom MHG pipeline", \
    processes=[MiddleHighGermanTokenizationProcess, MiddleHighGermanStemmingProcess], \
    language=get_lang("gmh"))
    >>> nlp = NLP(language='gmh', custom_pipeline = pipe, suppress_banner=True)
    >>> nlp(get_example_text("gmh")[:29]).stems
    ['uns', 'ist', 'in', 'alten', 'mær', 'wund']
    """

    description = "Default stemmer for the Middle High German language."

    @staticmethod
    def algorithm(word: str) -> str:
        """Return the stem of ``word`` as computed by ``cltk.stem.gmh.stem``."""
        return cltk.stem.gmh.stem(word)
class OldFrenchStemmingProcess(StemmingProcess):
    """The default Old French stemming algorithm.

    >>> from cltk.core.data_types import Process, Pipeline
    >>> from cltk.tokenizers import OldFrenchTokenizationProcess
    >>> from cltk.languages.example_texts import get_example_text
    >>> from cltk.languages.utils import get_lang
    >>> from cltk.nlp import NLP
    >>> pipe = Pipeline(description="A custom Old French pipeline", \
    processes=[OldFrenchTokenizationProcess, OldFrenchStemmingProcess], \
    language=get_lang("fro"))
    >>> nlp = NLP(language='fro', custom_pipeline = pipe, suppress_banner=True)
    >>> nlp(get_example_text("fro")[:29]).stems
    ['un', 'aventu', 'vos', 'voil', 'di', 'mo']
    """

    description = "Default stemmer for the Old French language."

    @staticmethod
    def algorithm(word: str) -> str:
        """Return the stem of ``word`` as computed by ``cltk.stem.fro.stem``."""
        return cltk.stem.fro.stem(word)