Source code for cltk.morphology.lat

""" This module provides a Decliner for Latin. Given a lemma, the Decliner will provide each grammatically valid form.

This work is based on the lexical and linguistic data built for and by the Collatinus Team ( https://github.com/biblissima/collatinus ).
This module hence inherits the license from the original project. The objective of this module is to port part of Collatinus to CLTK.

"""

__author__ = ["Thibault Clerice"]
# credits also to "Yves Ouvrard" and "Philippe Verkerk"
__license__ = "GPL v3"


import json
import os
import re

from cltk.core.exceptions import CLTKException
from cltk.utils import CLTK_DATA_DIR


class CollatinusDecliner:
    """Latin Decliner based on Collatinus data and approach to declining words for Latin

    .. code-block:: python

        # Ensure you have downloaded the lat_models_cltk data before running this
        from cltk.morphology.lat import CollatinusDecliner

        decliner = CollatinusDecliner()
        print(decliner.decline("via"))
        [
            ('via', '--s----n-'), ('via', '--s----v-'), ('viam', '--s----a-'),
            ('viae', '--s----g-'), ('viae', '--s----d-'), ('via', '--s----b-'),
            ('viae', '--p----n-'), ('viae', '--p----v-'), ('vias', '--p----a-'),
            ('viarum', '--p----g-'), ('viis', '--p----d-'), ('viis', '--p----b-')
        ]

    """

    # Matches the numeric disambiguation index attached to a lemma.
    _dism = re.compile(r"(\d+)")

    def __init__(self):
        """Load the Collatinus JSON data from the CLTK data directory.

        The file is read from
        ``<CLTK_DATA_DIR>/lat/model/lat_models_cltk/lemmata/collatinus/collected.json``
        and its ``models``, ``lemmas`` and ``maps`` sections are kept on the
        instance.
        """
        path = os.path.join(
            CLTK_DATA_DIR,
            "lat",
            "model",
            "lat_models_cltk",
            "lemmata",
            "collatinus",
            "collected.json",
        )
        path = os.path.expanduser(path)
        # JSON is UTF-8 by specification; do not depend on the platform's
        # default encoding.
        with open(path, encoding="utf-8") as data_file:
            self._data = json.load(data_file)

        self._models = self._data["models"]
        self._lemmas = self._data["lemmas"]
        self._mapped = self._data["maps"]

    def __getPOS(self, key):
        """Get POS tag for key

        :param key: Key Index of Collatinus Morphos
        :return: Part-Of-Speech tag
        """
        # The "pos" table is keyed by strings, while callers pass integers.
        return self._data["pos"][str(key)]

    def _remove_disambiguation(self, root):
        """Remove disambiguation index from lemma root

        :param root: Root in Collatinus
        :return: Cleaned root (every run of digits removed)
        """
        return CollatinusDecliner._dism.sub("", root)

    def _getRoots(self, lemma, model):
        """Retrieve the known roots of a lemma

        :param lemma: Canonical form of the word (lemma)
        :type lemma: str
        :param model: Model data from the loaded data. Can be passed by
            decline(); when falsy, the model named by the lemma entry is used
        :type model: dict
        :return: Dictionary of roots with their root identifier as key
        :rtype: dict
        :raise CLTKException: When the lemma is unknown to our data
        """
        if lemma not in self._lemmas:
            raise CLTKException(f"{lemma} is unknown")

        # Root identifiers whose values can be declared explicitly on the
        # lemma entry, mapped to the entry field holding them.
        explicit_root_fields = {"1": "geninf", "2": "perf"}

        lemma_entry = self._lemmas[lemma]
        # Prefer the "quantity" variant of the lemma when the entry has one.
        if lemma_entry.get("quantity"):
            canonical = lemma_entry["quantity"]
        else:
            canonical = self._remove_disambiguation(lemma_entry["lemma"])

        original_roots = {
            root_id: lemma_entry[field].split(",")
            for root_id, field in explicit_root_fields.items()
            if lemma_entry.get(field)
        }
        returned_roots = {}

        if not model:
            model = self._models[lemma_entry["model"]]

        # For each registered root in the model,
        for model_root_id, model_root_data in model["R"].items():
            # If we have K, it's equivalent to canonical form
            if model_root_data[0] == "K":
                returned_roots[model_root_id] = canonical.split(",")
                continue

            # Otherwise we have a deletion count and the characters to add
            deletion, addition = int(model_root_data[0]), model_root_data[1] or ""

            # If the root is declared already, we retrieve the information
            if model_root_id != "1" and model_root_id in returned_roots:
                lemma_roots = returned_roots[model_root_id]
            else:
                lemma_roots = canonical.split(",")

            # We construct the roots. A deletion of 0 must keep the whole
            # root: ``root[:-0]`` would evaluate to the empty string.
            computed = [
                (lemma_root[:-deletion] if deletion else lemma_root) + addition
                for lemma_root in lemma_roots
            ]

            # NOTE(review): the nesting of this merge was reconstructed from a
            # flattened listing -- confirm it belongs to the computed-root
            # branch only.
            if model_root_id in original_roots:
                computed.extend(original_roots[model_root_id])
                # De-duplicate while keeping a deterministic order.
                computed = list(dict.fromkeys(computed))

            returned_roots[model_root_id] = computed

        # Roots derived from the model take precedence over the explicit ones.
        original_roots.update(returned_roots)
        return original_roots

    def decline(
        self, lemma: str, flatten: bool = False, collatinus_dict: bool = False
    ) -> list[tuple[str, str]]:
        """Decline a lemma

        .. warning:: POS are incomplete as we do not detect the type outside of verbs, participle and adjective.

        :raise CLTKException: When the lemma is unknown to our data

        :param lemma: Lemma (Canonical form) to decline
        :type lemma: str
        :param flatten: If set to True, returns a list of forms without natural language information about them
        :type flatten: bool
        :param collatinus_dict: If set to True (and ``flatten`` is False), returns a dictionary of grammatically
            valid forms, including variants, with keys corresponding to morpho information.
        :type collatinus_dict: bool
        :return: List of tuples where the first value is the form and the second the pos,
            ie [("sum", "v1ppip---")]; a flat list of forms when ``flatten`` is set;
            a dict of forms when ``collatinus_dict`` is set
        :rtype: list or dict
        """
        if lemma not in self._lemmas:
            mapped = self._mapped.get(lemma)
            if mapped is None or mapped not in self._lemmas:
                raise CLTKException(f"{lemma} is unknown")
            # Continue with the canonical lemma. The entry must be fetched
            # with the canonical form itself, not by mapping it a second time.
            lemma = mapped
        lemma_entry = self._lemmas[lemma]

        model = self._models[lemma_entry["model"]]

        # Get the roots
        roots = self._getRoots(lemma, model=model)

        # Get the known forms in order ("des" is keyed by numeric strings)
        keys = sorted(int(key) for key in model["des"])

        # Generate the return dict: every root combined with every ending
        forms = {
            key: [
                root + ending
                for root_id, endings in model["des"][str(key)]
                for root in roots[root_id]
                for ending in endings
            ]
            for key in keys
        }

        # sufd means we have the original forms of the parent but we add a suffix
        if model["sufd"]:
            forms = {
                key: [form + sufd for sufd in model["sufd"] for form in key_forms]
                for key, key_forms in forms.items()
            }

        # Optional suffixes add variants next to the existing forms.
        # The format is [suffix characters, [modified forms]]
        if model["suf"]:
            # Snapshot, so that each suffix is applied to the unsuffixed forms
            # only and not to variants created by a previous suffix.
            cached_forms = {key: list(key_forms) for key, key_forms in forms.items()}
            for suffix, modified_forms in (
                (entry[0], entry[1]) for entry in model["suf"]
            ):
                for modified_form in modified_forms:
                    forms[modified_form] += [
                        form + suffix for form in cached_forms[modified_form]
                    ]

        # If some forms do not exist, we delete them preventively
        for abs_form in model["abs"]:
            forms.pop(abs_form, None)

        if flatten:
            return [form for case_forms in forms.values() for form in case_forms]
        if collatinus_dict:
            return forms
        return [
            (form, self.__getPOS(key))
            for key, case_forms in forms.items()
            for form in case_forms
        ]

    @property
    def lemmas(self) -> dict[str, dict[str, str]]:
        """Lemma entries loaded from the Collatinus data, keyed by lemma."""
        return self._lemmas