Source code for cltk.stem.enm

from typing import Dict, List, Optional

# Module metadata: author list and license notice.
# NOTE(review): the author entry ends in an empty "<>" — presumably a contact
# address was stripped; confirm against the upstream file.
__author__ = ["Eleftheria Chatziargyriou <>"]
__license__ = "MIT License. See LICENSE."

Stemming presents a significant challenge in Middle English (ME), as it is
exceptionally difficult to account for the orthographical variations that
sometimes occur even within a single text. The affix algorithm attempts to
account for variations in spelling, but still mostly relies on a relatively
narrow hard-coded list (Middle English Dictionary (MED)).

  TODO: Improve on the affix stemmer by implementing an accurate spell checker
  TODO: Implement a stochastic algorithm/Implement overarching stemmer class



# Word endings that stem() appends to a word before testing it against each
# suffix, so that spelling variants of one suffix (e.g. 'ir', 'ire') are caught.
ENDS = [
    "",
    "s",
    "e",
    "en",
    "es",
]

def stem(
    word: str,
    exception_list: Optional[Dict[str, str]] = None,
    strip_pref: bool = True,
    strip_suf: bool = True,
) -> str:
    """Strip Middle English affixes from ``word`` by rule-based matching.

    The affix stemmer works by rule-based stripping. It can work on prefixes,

    >>> stem('yesterday')
    'day'

    suffixes,

    >>> stem('likingnes')
    'liking'

    or both

    >>> stem('yisterdayes')
    'day'

    You can also define whether the stemmer will strip suffixes

    >>> stem('yisterdayes', strip_suf = False)
    'dayes'

    or prefixes

    >>> stem('yisterdayes', strip_pref = False)
    'yisterday'

    The stemmer also accepts a user-defined dictionary, that essentially
    serves the function of a dictionary look-up stemmer

    >>> stem('arisnesse', exception_list = {'arisnesse':'rise'})
    'rise'

    :param word: the word to stem.
    :param exception_list: optional mapping from a word to its stem; when
        ``word`` is a key, the mapped value is returned without any
        stripping. ``None`` (the default) means no exceptions.
    :param strip_pref: whether to strip the first matching entry of the
        module-level ``PREFIXES``.
    :param strip_suf: whether to strip entries of the module-level
        ``SUFFIXES``, tried once for each ending in ``ENDS``.
    :return: the looked-up or stripped stem. Words of four characters or
        fewer are returned unchanged.
    """
    if exception_list is not None and word in exception_list:
        return exception_list[word]

    # Very short words are left alone.
    if len(word) <= 4:
        return word

    if strip_pref:
        # Only the first matching prefix is removed.
        for prefix in PREFIXES:
            if word.startswith(prefix):
                word = word[len(prefix):]
                break

    if strip_suf:
        for ending in ENDS:
            # Stop as soon as the word has been reduced to four characters
            # or fewer (possibly already by the prefix step above).
            if len(word) <= 4:
                break

            for suffix in SUFFIXES:
                # A suffix no longer than the ending would leave nothing of
                # the word itself to strip. The scan stops here rather than
                # skipping this entry -- presumably SUFFIXES is ordered
                # longest-first; TODO confirm against its definition.
                if len(suffix) <= len(ending):
                    break

                # Match against the word with the ending appended, so a
                # spelling that lacks the ending still hits the suffix.
                if (word + ending).endswith(suffix):
                    # Remove only the part of the suffix that lies inside
                    # the word; the appended ending was never part of it.
                    overlap = len(suffix) - len(ending)
                    word = word[:-overlap]
                    break

    return word