# Source code for cltk.stem.enm

# Module metadata: original author contact and licensing terms.
__author__ = ["Eleftheria Chatziargyriou <ele.hatzy@gmail.com>"]
__license__ = "MIT License. See LICENSE."

"""
Stemming present a significant challenge in ME, as it is exceptionally
difficult to account for the orthographical variations sometimes even
occurring within a single text. The affix algorithm attempts to account
for variations in spelling, but still Mostly relies on a relatively narrow
hard-coded list (Middle English Dictionary(MED) https://quod.lib.umich.edu/m/med/)

  TODO: Improve on the affix stemmer by implementing an accurate spell checker
  TODO: Implement a stochastic algorithm/Implement overarching stemmer class
"""

# Known Middle English suffixes, drawn from the MED.
# ORDER MATTERS: entries are sorted longest-first. The stemmer's inner
# loop stops scanning as soon as a suffix is no longer than the ending
# currently being tried, and the first match wins, so do not reorder.
SUFFIXES: list[str] = [
    "rightes",
    "eresse",
    "kinnes",
    "lechen",
    "licher",
    "linges",
    "lokest",
    "longes",
    "wardes",
    "atour",
    "aunce",
    "enger",
    "estre",
    "evous",
    "iende",
    "iinde",
    "istre",
    "ivous",
    "lesse",
    "liche",
    "liece",
    "liest",
    "lyese",
    "nesce",
    "neshe",
    "nissa",
    "nisse",
    "omlie",
    "right",
    "somes",
    "trice",
    "eren",
    "erie",
    "acle",
    "ager",
    "aten",
    "atif",
    "aunt",
    "cund",
    "elet",
    "ende",
    "erel",
    "esse",
    "fold",
    "ible",
    "ical",
    "ieth",
    "inde",
    "ioun",
    "ious",
    "iple",
    "laes",
    "laus",
    "leas",
    "lech",
    "lese",
    "lice",
    "ling",
    "long",
    "lous",
    "lyas",
    "ment",
    "most",
    "nece",
    "rede",
    "ship",
    "soum",
    "uous",
    "ward",
    "ade",
    "age",
    "ail",
    "ain",
    "air",
    "and",
    "ard",
    "ari",
    "dom",
    "ede",
    "els",
    "eon",
    "ere",
    "est",
    "eth",
    "eur",
    "ful",
    "gat",
    "hed",
    "ial",
    "ien",
    "ier",
    "ild",
    "ing",
    "ise",
    "ish",
    "ist",
    "ith",
    "kin",
    "lac",
    "les",
    "leu",
    "lez",
    "læs",
    "mel",
    "mor",
    "nes",
    "nez",
    "oir",
    "orn",
    "oun",
    "our",
    "ous",
    "som",
    "ure",
    "wil",
    "al",
    "an",
    "ar",
    "at",
    "ed",
    "el",
    "en",
    "er",
    "es",
    "et",
    "fi",
    "if",
    "ik",
    "il",
    "in",
    "ir",
    "it",
    "li",
    "ok",
    "om",
    "on",
    "ot",
    "re",
    "se",
    "te",
    "th",
    "ti",
    "ur",
]

# Known Middle English prefixes, longest-first so that e.g. 'yester' is
# tried before 'eth'. Only the first matching prefix is stripped.
# (A duplicate 'yistyr' entry has been removed; it was harmless but dead.)
PREFIXES: list[str] = [
    "yester",
    "yister",
    "yistyr",
    "yuster",
    "forth",
    "yond",
    "eth",
    "toe",
    "too",
    "tou",
    "tow",
    "tuo",
    "two",
    "at",
    "ef",
    "et",
    "ex",
    "ta",
    "te",
    "th",
    "to",
    "tu",
]

# Used for attaching endings to suffixes, catches more orthographical variations (e.g 'ir', 'ire')
ENDS: list[str] = ["", "s", "e", "en", "es"]


def stem(
    word: str,
    exception_list: "dict[str, str] | None" = None,
    strip_pref: bool = True,
    strip_suf: bool = True,
) -> str:
    """Stem a Middle English word by rule-based affix stripping.

    :param word: the word to be stemmed
    :param exception_list: optional mapping of words to pre-defined
        stems; a word found here is returned immediately, so the
        mapping acts as a dictionary look-up stemmer. ``None`` (the
        default) means no exceptions.
    :param strip_pref: whether prefixes are stripped
    :param strip_suf: whether suffixes are stripped
    :return: the stemmed word

    The affix stemmer works by rule-based stripping. It can work on
    prefixes,

    >>> stem('yesterday')
    'day'

    suffixes,

    >>> stem('likingnes')
    'liking'

    or both

    >>> stem('yisterdayes')
    'day'

    You can also define whether the stemmer will strip suffixes

    >>> stem('yisterdayes', strip_suf = False)
    'dayes'

    or prefixes

    >>> stem('yisterdayes', strip_pref = False)
    'yisterday'

    The stemmer also accepts a user-defined dictionary, that essentially
    serves the function of a dictionary look-up stemmer

    >>> stem('arisnesse', exception_list = {'arisnesse':'rise'})
    'rise'
    """
    # None sentinel instead of a mutable default argument.
    if exception_list is None:
        exception_list = {}

    # Dictionary look-up takes precedence over rule-based stripping.
    if word in exception_list:
        return exception_list[word]

    # Very short words are left untouched: stripping an affix would
    # likely destroy the stem itself.
    if len(word) <= 4:
        return word

    if strip_pref:
        for prefix in PREFIXES:
            if word.startswith(prefix):
                word = word[len(prefix):]
                break  # strip at most one prefix

    if strip_suf:
        # A suffix may appear with an extra orthographic ending attached
        # (e.g. 'ir' vs 'ire'), so the word is tentatively extended by
        # each ending before matching.
        for ending in ENDS:
            if len(word) <= 4:
                break
            for suffix in SUFFIXES:
                # SUFFIXES is sorted longest-first, so once suffixes are
                # no longer than the appended ending nothing can match.
                if len(suffix) <= len(ending):
                    break
                if (word + ending).endswith(suffix):
                    # Remove only the part of the suffix actually present
                    # in the word (the suffix minus the added ending).
                    word = word[: len(ending) - len(suffix)]
                    break
            if len(word) <= 4:
                break

    return word