Source code for cltk.stem.fro

"""Stemmer for Old French."""
import re

__author__ = ["Natasha Voake <natashavoake@gmail.com>"]
__license__ = "MIT License. See LICENSE."

EXCEPTIONS: list[str] = [
    "mer",
    "certes",
    "mais",
    "vos",
    "onques",
    "nos",
    "talent",
    "omnipotent",
    "jadis",
    "et",
    "je",
    "certes",
    "que",
    "mais",
    "ne",
    "me",
    "de",
    "ce",
    "qui",
    "la",
    "li",
    "le",
    "ma",
    "mon",
    "si",
    "sa",
    "son",
    "chambre",
    "quant",
    "ki",
    "tut",
]


[docs]def _matchremove_noun_endings(word: str) -> tuple[str, bool]:
    """Remove the noun and adverb word endings"""

    was_stemmed: bool = False

    """common and proper noun and adjective word endings sorted by charlen, then alph"""
    noun_endings: list[str] = [
        "arons",
        "ains",
        "aron",
        "ment",
        "ain",
        "age",
        "on",
        "es",
        "ée",
        "ee",
        "ie",
        "s",
    ]

    for ending in noun_endings:
        """ignore exceptions"""
        if word in EXCEPTIONS:
            word = word
            was_stemmed = True
            break
        if word == ending:
            word = word
            was_stemmed = True
            break
        """removes noun endings"""
        if word.endswith(ending):
            word = re.sub(r"{0}$".format(ending), "", word)
            was_stemmed = True
            break

    return word, was_stemmed


[docs]def _matchremove_verb_endings(word: str) -> str:
    """Remove the verb endings"""
    """verb endings sorted by charlen then alph"""
    verb_endings: list[str] = [
        "issiiens",
        "isseient",
        "issiiez",
        "issons",
        "issent",
        "issant",
        "isseie",
        "isseit",
        "issons",
        "isseiz",
        "assent",
        "issons",
        "isseiz",
        "issent",
        "iiens",
        "eient",
        "issez",
        "oient",
        "istes",
        "ïstes",
        "istes",
        "astes",
        "erent",
        "istes",
        "irent",
        "ustes",
        "urent",
        "âmes",
        "âtes",
        "èrent",
        "asses",
        "isses",
        "issez",
        "ssons",
        "sseiz",
        "ssent",
        "erent",
        "eies",
        "iiez",
        "oies",
        "iens",
        "ions",
        "oint",
        "eret",
        "imes",
        "rent",
        "ümes",
        "ütes",
        "ïmes",
        "imes",
        "asse",
        "isse",
        "usse",
        "ames",
        "imes",
        "umes",
        "asse",
        "isse",
        "sses",
        "ssez",
        "ons",
        "ent",
        "ant",
        "eie",
        "eit",
        "int",
        "ist",
        "eiz",
        "oie",
        "oit",
        "iez",
        "ois",
        "oit",
        "iez",
        "res",
        "ert",
        "ast",
        "ist",
        "sse",
        "mes",
        "er",
        "es",
        "et",
        "ez",
        "is",
        "re",
        "oi",
        "ïs",
        "üs",
        "ai",
        "as",
        "at",
        "is",
        "it",
        "ui",
        "us",
        "ut",
        "st",
        "s",
        "t",
        "e",
        "é",
        "z",
        "u",
        "a",
        "i",
    ]

    for ending in verb_endings:
        if word == ending:
            word = word
            break
        if word.endswith(ending):
            word = re.sub(r"{0}$".format(ending), "", word)
            break

    return word


[docs]def stem(word: str) -> str:
    """
    Stem a word of Old French.

    >>> stem('departissent')
    'depart'
    >>> stem('talent')
    'talent'

    """
    word = word.lower()

    """remove the simple endings from the target word"""
    stemmed_word, was_stemmed = _matchremove_noun_endings(word)

    """if word didn't match the simple endings, try verb endings"""
    if not was_stemmed:
        stemmed_word = _matchremove_verb_endings(word)

    return stemmed_word
Source code for cltk.stem.fro

The Classical Language Toolkit

Navigation

Related Topics