"""Stem Latin words with an implementation of the Schinke Latin stemming
algorithm (Schinke R, Greengrass M, Robertson AM and Willett P. (1996). 'A
stemming algorithm for Latin text databases'. Journal of Documentation, 52:
172-187).

.. todo::
   Make this stemmer like lemma, with import from ``stem`` dir.
"""
__author__ = ["Luke Hollis <lukehollis@gmail.com>"]
__license__ = "MIT License. See LICENSE."
import re
from cltk.stops.lat import STOPS
# Words ending in "-que" that must be left untouched by ``_checkremove_que``.
# Kept at module level as a frozenset so it is built once and membership
# tests are O(1) instead of a linear scan of a freshly built list per call.
_QUE_PASS_LIST: frozenset[str] = frozenset(
    {
        "atque",
        "quoque",
        "neque",
        "itaque",
        "absque",
        "apsque",
        "abusque",
        "adaeque",
        "adusque",
        "denique",
        "deque",
        "susque",
        "oblique",
        "peraeque",
        "plenisque",
        "quandoque",
        "quisque",
        "quaeque",
        "cuiusque",
        "cuique",
        "quemque",
        "quamque",
        "quaque",
        "quique",
        "quorumque",
        "quarumque",
        "quibusque",
        "quosque",
        "quasque",
        "quotusquisque",
        "quousque",
        "ubique",
        "undique",
        "usque",
        "uterque",
        "utique",
        "utroque",
        "utribique",
        "torque",
        "coque",
        "concoque",
        "contorque",
        "detorque",
        "decoque",
        "excoque",
        "extorque",
        "obtorque",
        "optorque",
        "retorque",
        "recoque",
        "attorque",
        "incoque",
        "intorque",
        "praetorque",
    }
)


def _checkremove_que(word: str) -> tuple[str, bool]:
    """Strip a trailing ``-que`` unless the word is on the pass list.

    :param word: the word to inspect.
    :return: a ``(word, in_que_pass_list)`` pair. When ``word`` is on the
        pass list it is returned unchanged with ``True``; otherwise the word
        is returned with a final ``que`` removed (if it had one) and
        ``False``.

    Note that a word consisting only of ``que`` is reduced to the empty
    string.
    """
    if word in _QUE_PASS_LIST:
        return word, True
    # Same pattern as before so results are unchanged for every input.
    return re.sub(r"que$", "", word), False
def _matchremove_simple_endings(word: str) -> tuple[str, bool]:
    """Remove the first matching noun, adjective or adverb ending.

    The endings are tried in the order listed (longest first, then
    alphabetical), so the first hit is also the longest matching ending.

    :param word: the word to stem.
    :return: a ``(word, was_stemmed)`` pair. ``was_stemmed`` is ``True`` when
        an ending matched and was cut off, ``False`` when the word is
        returned unchanged.

    No minimum stem length is enforced: a word that consists solely of an
    ending (e.g. ``"um"``) is reduced to the empty string.
    """
    # noun, adjective, and adverb word endings sorted by charlen, then alph
    simple_endings: tuple[str, ...] = (
        "ibus",
        "ius",
        "ae",
        "am",
        "as",
        "em",
        "es",
        "ia",
        "is",
        "nt",
        "os",
        "ud",
        "um",
        "us",
        "a",
        "e",
        "i",
        "o",
        "u",
    )
    for ending in simple_endings:
        if word.endswith(ending):
            # The suffix is a known literal, so slicing is equivalent to an
            # anchored regex substitution and avoids building a pattern.
            return word[: -len(ending)], True
    return word, False
def _matchremove_verb_endings(word: str) -> str:
    """Remove or normalise the first matching verb ending.

    Ending groups are tried in a fixed order and, within a group, in the
    order listed. The first ending that the word ends with is replaced by the
    group's replacement and the result is returned immediately:

    * ``iuntur``, ``erunt``, ``untur``, ``iunt``, ``unt`` become ``i``
    * ``beris``, ``bor``, ``bo`` become ``bi``
    * ``ero`` becomes ``eri``
    * the general verb endings are removed outright

    :param word: the word to stem.
    :return: the stemmed word, or ``word`` unchanged when no ending matches.
    """
    # (endings, replacement) pairs; the order of both the groups and the
    # endings inside each group determines which ending wins.
    ending_groups: tuple[tuple[tuple[str, ...], str], ...] = (
        (("iuntur", "erunt", "untur", "iunt", "unt"), "i"),
        (("beris", "bor", "bo"), "bi"),
        (("ero",), "eri"),
        (
            (
                "mini",
                "ntur",
                "stis",
                "mur",
                "mus",
                "ris",
                "sti",
                "tis",
                "tur",
                "ns",
                "nt",
                "ri",
                "m",
                "r",
                "s",
                "t",
            ),
            "",
        ),
    )
    for endings, replacement in ending_groups:
        for ending in endings:
            if word.endswith(ending):
                # Literal suffix: slice it off rather than compiling a regex.
                return word[: -len(ending)] + replacement
    return word
def stem(word: str) -> str:
    """
    Stem a single Latin word.

    Words found in ``STOPS`` and words on the ``-que`` pass list are returned
    unchanged. Otherwise a trailing ``-que`` is stripped, then the simple
    (noun, adjective, adverb) endings are tried; only if none of those
    matched are the verb endings tried.

    :param word: the word to stem.
    :return: the stemmed word.

    >>> stem('interdum')
    'interd'
    >>> stem('mercaturis')
    'mercatur'
    """
    if word in STOPS:
        return word

    # remove '-que' suffix
    word, in_que_pass_list = _checkremove_que(word)
    if in_que_pass_list:
        return word

    # remove the simple endings from the target word
    word, was_stemmed = _matchremove_simple_endings(word)
    # if word didn't match the simple endings, try verb endings
    if not was_stemmed:
        word = _matchremove_verb_endings(word)
    return word