# Source code for cltk.morphology.morphosyntax

"""A module for representing universal morphosyntactic feature bundles."""

from typing import Optional, Type, Union

from cltk.core.exceptions import CLTKException
from cltk.morphology.universal_dependencies_features import *

__author__ = ["John Stewart <free-variation>"]


class MorphosyntacticFeatureBundle:
    """A representation of a set of features, usually associated with a word form.

    The bundle stores its content in ``self.features``, a dict keyed by
    feature class. Each value is either a list of members of that class or
    the ``Underspecified`` marker (imported from elsewhere in the package).
    """

    def __init__(
        self, *features: Union[MorphosyntacticFeature, Type[MorphosyntacticFeature]]
    ) -> None:
        """Build a bundle from feature values and/or bare feature classes.

        A feature value is appended to the list kept for its class; a bare
        feature class marks that feature as underspecified.

        >>> f1 = MorphosyntacticFeatureBundle(F.neg, N.pos, V.neg, Case.accusative)
        >>> f1.features
        {F: [neg], N: [pos], V: [neg], Case: [accusative]}
        """
        self.features = {}
        for feature in features:
            if isinstance(feature, type) and issubclass(
                feature, MorphosyntacticFeature
            ):
                self.features[feature] = Underspecified
                continue

            feature_type = type(feature)
            current = self.features.get(feature_type, Underspecified)
            if current is Underspecified:
                # Either nothing is stored yet, or the class was previously
                # given bare; the marker is not a list we can append to, so a
                # concrete value starts a fresh list.
                self.features[feature_type] = [feature]
            else:
                current.append(feature)

    @staticmethod
    def _resolve_feature_type(
        feature_name: Union[str, Type[MorphosyntacticFeature]]
    ) -> Type[MorphosyntacticFeature]:
        """Turn a feature class, or the name of one, into the feature class.

        Strings are looked up among this module's globals.

        Raises:
            TypeError: if the name is unknown or the resolved object is not a
                subclass of ``MorphosyntacticFeature``.
        """
        if isinstance(feature_name, str):
            if feature_name not in globals():
                raise TypeError(feature_name + " is not a morphosyntactic feature")
            feature_name = globals()[feature_name]

        # ``issubclass`` itself raises on non-class arguments, so check for a
        # class first to report a meaningful message.
        if not (
            isinstance(feature_name, type)
            and issubclass(feature_name, MorphosyntacticFeature)
        ):
            raise TypeError(str(feature_name) + " is not a morphosyntactic feature")
        return feature_name

    def __getitem__(
        self, feature_name: Union[str, Type[MorphosyntacticFeature]]
    ) -> list[MorphosyntacticFeature]:
        """
        Use dict-type syntax for accessing the values of features.

        Raises ``TypeError`` when ``feature_name`` is not a feature class (or
        the name of one) and ``CLTKException`` when the bundle holds nothing
        for that feature.

        >>> f1 = f(F.pos, N.pos)
        >>> f1[F]
        [pos]
        >>> f1[V]
        Traceback (most recent call last):
        cltk.core.exceptions.CLTKException: {F: [pos], N: [pos]} unspecified for V
        >>> f1['F']
        [pos]
        """
        feature_type = self._resolve_feature_type(feature_name)
        if feature_type in self.features:
            return self.features[feature_type]
        raise CLTKException(f"{self} unspecified for {feature_type}")

    def __setitem__(
        self,
        feature_name: Union[str, Type[MorphosyntacticFeature]],
        feature_values: Union[MorphosyntacticFeature, list[MorphosyntacticFeature]],
    ) -> "MorphosyntacticFeatureBundle":
        """
        Use dict-type syntax to set the value of features.

        A single value is wrapped in a list. Every value other than ``None``
        must be a member of exactly the given feature class, otherwise
        ``TypeError`` is raised. The bundle is returned so the method can
        also be called explicitly and chained.

        >>> f1 = f(F.pos)
        >>> f1[N] = N.neg
        >>> f1
        {F: [pos], N: [neg]}
        >>> f1['V'] = V.pos
        >>> f1
        {F: [pos], N: [neg], V: [pos]}
        """
        feature_type = self._resolve_feature_type(feature_name)

        if type(feature_values) is not list:
            feature_values = [feature_values]
        for value in feature_values:
            if value is not None and type(value) is not feature_type:
                raise TypeError(str(value) + " is not a " + str(feature_type))

        self.features[feature_type] = feature_values
        return self

    def all(
        self,
    ) -> list[tuple[Type[MorphosyntacticFeature], list[MorphosyntacticFeature]]]:
        """Return the (feature class, values) pairs held by the bundle.

        NOTE: the object returned is the items view of the underlying dict
        (same as ``items()``), not a materialized list.
        """
        return self.features.items()

    def underspecify(self, feature_name: Type[MorphosyntacticFeature]) -> None:
        """
        Underspecify the given feature in the bundle.

        Raises ``TypeError`` if ``feature_name`` is not a feature class (or
        the name of one).

        >>> f1 = f(F.pos, N.pos, V.neg)
        >>> f1.underspecify(F)
        >>> f1[F] is Underspecified
        True
        """
        feature_type = self._resolve_feature_type(feature_name)
        self.features[feature_type] = Underspecified

    def matches(self, other: "MorphosyntacticFeatureBundle") -> bool:
        """
        This feature bundle matches other if other contains all the features of this bundle,
        i.e. if this bundle is an improper subset of other.
        Underspecified features will match.

        >>> f1 = f(F, N.pos, V.neg)
        >>> f2 = f(F.neg, N.pos, V.neg)
        >>> f3 = f(F.pos, N.neg, V.pos)
        >>> f1.matches(f2)
        True
        >>> f1.matches(f3)
        False
        """
        if other is None:
            return False

        for feature_type, own_values in self.features.items():
            if feature_type not in other.features:
                return False
            other_values = other.features[feature_type]
            # An underspecified side is compatible with anything; otherwise
            # the stored value lists must be equal.
            if (
                own_values is not Underspecified
                and other_values is not Underspecified
                and not (own_values == other_values)
            ):
                return False

        return True

    def __str__(self) -> str:
        """Render the bundle as the string form of its feature dict."""
        return str(self.features)

    def __iter__(self):
        """Iterate over the feature classes present in the bundle."""
        return iter(self.features)

    __repr__ = __str__

    def keys(self):
        """Return the feature classes present in the bundle."""
        return self.features.keys()

    def values(self):
        """Return the stored values, one entry per feature class."""
        return self.features.values()

    def items(self):
        """Return the (feature class, values) pairs of the bundle."""
        return self.features.items()

    def __len__(self):
        """Return the number of feature classes present in the bundle."""
        return len(self.features)

    def __contains__(self, item: MorphosyntacticFeature):
        """Tell whether ``item`` is one of the feature values in the bundle.

        Anything that is not a ``MorphosyntacticFeature`` instance is simply
        reported as absent rather than raising.
        """
        if not isinstance(item, MorphosyntacticFeature):
            return False
        # Underspecified entries hold the marker instead of a list of values,
        # so they must be skipped before the membership test.
        return any(
            values is not Underspecified and item in values
            for values in self.features.values()
        )
# Short alias for the bundle class; the doctests in this module and
# ``to_categorial`` build bundles through it, e.g. ``f(F.pos, N.neg)``.
f = MorphosyntacticFeatureBundle
def to_categorial(pos: int) -> "MorphosyntacticFeatureBundle":
    """Translate a UD part of speech into a bundle of binary categorial features.

    The result may be underspecified: conjunctions and particles only carry
    ``F``, and any part of speech without a rule (e.g. interjections) yields
    an empty bundle.

    >>> to_categorial(POS.adjective)
    {F: [neg], N: [pos], V: [pos]}
    >>> to_categorial(POS.particle)
    {F: [pos]}
    >>> to_categorial(POS.interjection)
    {}
    """
    # Ordered rules: (parts of speech sharing a bundle, features of the bundle).
    rules = (
        ((POS.adjective, POS.adverb), (F.neg, N.pos, V.pos)),
        ((POS.adposition,), (F.pos, N.neg, V.neg)),
        ((POS.auxiliary,), (F.pos, N.neg, V.pos)),
        (
            (
                POS.coordinating_conjunction,
                POS.subordinating_conjunction,
                POS.particle,
            ),
            (F.pos,),
        ),
        ((POS.determiner, POS.pronoun, POS.numeral), (F.pos, N.pos, V.neg)),
        ((POS.noun, POS.proper_noun), (F.neg, N.pos, V.neg)),
        ((POS.verb,), (F.neg, N.neg, V.pos)),
    )

    for tags, categorial_features in rules:
        if any(pos == tag for tag in tags):
            return f(*categorial_features)

    # No rule applies: empty bundle.
    return f()
# Lookup table from Universal Dependencies notation to feature members.
# Outer key: UD feature name (e.g. "Case"); inner key: UD value abbreviation
# (e.g. "Abl"); inner value: the corresponding feature member.
# ``from_ud`` consults this table to translate (name, value) pairs.
FORM_UD_MAP: dict[str, dict[str, MorphosyntacticFeature]] = {
    # parts of speech
    "POS": {
        "ADJ": POS.adjective,
        "ADP": POS.adposition,
        "ADV": POS.adverb,
        "AUX": POS.auxiliary,
        "CCONJ": POS.coordinating_conjunction,
        "DET": POS.determiner,
        "INTJ": POS.interjection,
        "NOUN": POS.noun,
        "NUM": POS.numeral,
        "PART": POS.particle,
        "PRON": POS.pronoun,
        "PROPN": POS.proper_noun,
        "PUNCT": POS.punctuation,
        "SCONJ": POS.subordinating_conjunction,
        "SYM": POS.symbol,
        "VERB": POS.verb,
        "X": POS.other,
    },
    # verbal features
    "VerbForm": {
        "Conv": VerbForm.converb,
        "Fin": VerbForm.finite,
        "Gdv": VerbForm.gerundive,
        "Ger": VerbForm.gerund,
        "Inf": VerbForm.infinitive,
        "Part": VerbForm.participle,
        "Sup": VerbForm.supine,
        "Vnoun": VerbForm.masdar,
    },
    # https://universaldependencies.org/u/feat/Mood.html
    "Mood": {
        "Adm": Mood.admirative,
        "Cnd": Mood.conditional,
        "Des": Mood.desiderative,
        "Imp": Mood.imperative,
        "Ind": Mood.indicative,
        "Jus": Mood.jussive,
        "Nec": Mood.necessitative,
        "Opt": Mood.optative,
        "Pot": Mood.potential,
        "Prp": Mood.purposive,
        "Qot": Mood.quotative,
        "Sub": Mood.subjunctive,
    },
    "Tense": {
        "Fut": Tense.future,
        "Imp": Tense.imperfect,
        "Past": Tense.past,
        "Pqp": Tense.pluperfect,
        "Pres": Tense.present,
    },
    "Aspect": {
        "Hab": Aspect.habitual,
        "Imp": Aspect.imperfective,
        "Iter": Aspect.iterative,
        "Perf": Aspect.perfective,
        "Prog": Aspect.progressive,
        "Prosp": Aspect.prospective,
    },
    "Voice": {
        "Act": Voice.active,
        "Antip": Voice.antipassive,
        "Bfoc": Voice.beneficiary_focus,
        "Lfoc": Voice.location_focus,
        "Caus": Voice.causative,
        "Dir": Voice.direct,
        "Inv": Voice.inverse,
        "Mid": Voice.middle,
        "Pass": Voice.passive,
        "Rcp": Voice.reciprocal,
    },
    "Evident": {"Fh": Evidentiality.first_hand, "Nfh": Evidentiality.non_first_hand},
    "Polarity": {"Pos": Polarity.pos, "Neg": Polarity.neg},
    "Person": {
        "0": Person.zeroth,
        "1": Person.first,
        "2": Person.second,
        "3": Person.third,
        "4": Person.fourth,
        "Psor": Person.psor,
        "Subj": Person.subj,
    },
    "Polite": {
        "Elev": Politeness.elevated,
        "Form": Politeness.formal,
        "Humb": Politeness.humble,
        "Infm": Politeness.informal,
    },
    "Clusivity": {"Ex": Clusivity.exclusive, "In": Clusivity.inclusive},
    # nominal
    "Gender": {
        "Com": Gender.common,
        "Fem": Gender.feminine,
        "Masc": Gender.masculine,
        "Neut": Gender.neuter,
        "Psor": Gender.psor,
    },
    "Animacy": {
        "Anim": Animacy.animate,
        "Hum": Animacy.human,
        "Inan": Animacy.inanimate,
        "Nhum": Animacy.non_human,
    },
    "Number": {
        "Coll": Number.collective,
        "Count": Number.count_plural,
        "Dual": Number.dual,
        "Grpa": Number.greater_paucal,
        "Grpl": Number.greater_plural,
        "Inv": Number.inverse_number,
        "Pauc": Number.paucal,
        "Plur": Number.plural,
        "Ptan": Number.plurale_tantum,
        "Sing": Number.singular,
        "Tri": Number.trial,
        "Psor": Number.psor,
    },
    "NumForm": {
        "Word": NumForm.word,
        "Digit": NumForm.digit,
        "Roman": NumForm.roman,
        "Reference": NumForm.reference,
    },
    "Case": {
        # structural cases
        "Nom": Case.nominative,
        "Acc": Case.accusative,
        "Erg": Case.ergative,
        "Abs": Case.absolutive,
        # oblique cases
        "Abe": Case.abessive,
        # NOTE(review): the attribute is spelled ``befefactive`` here;
        # presumably it mirrors the name declared on ``Case`` -- confirm.
        "Ben": Case.befefactive,
        "Caus": Case.causative,
        "Cmp": Case.comparative,
        "Cns": Case.considerative,
        "Com": Case.comitative,
        "Dat": Case.dative,
        "Dis": Case.distributive,
        "Equ": Case.equative,
        "Gen": Case.genitive,
        "Ins": Case.instrumental,
        "Par": Case.partitive,
        "Voc": Case.vocative,
        # spatiotemporal cases
        "Abl": Case.ablative,
        "Add": Case.additive,
        "Ade": Case.adessive,
        "All": Case.allative,
        "Del": Case.delative,
        "Ela": Case.elative,
        "Ess": Case.essive,
        "Ill": Case.illative,
        "Ine": Case.inessive,
        "Lat": Case.lative,
        "Loc": Case.locative,
        "Per": Case.perlative,
        "Sub": Case.sublative,
        "Sup": Case.superessive,
        "Ter": Case.terminative,
        "Tem": Case.temporal,
        "Tra": Case.translative,
    },
    "Definite": {
        "Com": Definiteness.complex,
        "Cons": Definiteness.construct_state,
        "Def": Definiteness.definite,
        "Ind": Definiteness.indefinite,
        "Spec": Definiteness.specific_indefinite,
    },
    "Degree": {
        "Abs": Degree.absolute_superlative,
        "Cmp": Degree.comparative,
        "Equ": Degree.equative,
        "Pos": Degree.positive,
        "Sup": Degree.superlative,
    },
    # other lexical
    "PronType": {
        "Art": PronominalType.article,
        "Con": PronominalType.contrastive,
        "Dem": PronominalType.demonstrative,
        "Emp": PronominalType.emphatic,
        "Exc": PronominalType.exclamative,
        "Ind": PronominalType.indefinite,
        "Int": PronominalType.interrogative,
        "Neg": PronominalType.negative,
        "Prs": PronominalType.personal,
        "Rcp": PronominalType.reciprocal,
        "Rel": PronominalType.relative,
        "Tot": PronominalType.total,
    },
    "AdpType": {
        "Prep": AdpositionalType.preposition,
        "Post": AdpositionalType.postposition,
        "Circ": AdpositionalType.circumposition,
        "Voc": AdpositionalType.vocalized_adposition,
    },
    "AdvType": {
        "Man": AdverbialType.manner,
        "Loc": AdverbialType.location,
        "Tim": AdverbialType.time,
        "Deg": AdverbialType.degree,
        "Cau": AdverbialType.cause,
        "Mod": AdverbialType.modality,
    },
    "VerbType": {
        "Aux": VerbType.auxiliary,
        "Cop": VerbType.copula,
        "Mod": VerbType.modal,
        "Light": VerbType.light,
    },
    "NumType": {
        "Card": Numeral.cardinal,
        "Dist": Numeral.distributive,
        "Frac": Numeral.fractional,
        "Mult": Numeral.multiplicative,
        "Ord": Numeral.ordinal,
        "Range": Numeral.range,
        "Sets": Numeral.sets,
    },
    "NumValue": {"1": NumValue.pos},
    "Form": {"Emp": Form.pos},
    "NameType": {
        "Geo": NameType.place,
        "Prs": NameType.person,
        "Giv": NameType.person_given_name,
        "Sur": NameType.person_surname,
        "Nat": NameType.nationality,
        "Com": NameType.company,
        "Pro": NameType.product,
        "Oth": NameType.other,
    },
    "Strength": {"Strong": Strength.strong, "Weak": Strength.weak},
    "Poss": {"Yes": Possessive.pos},
    "Reflex": {"Yes": Reflexive.pos},
    "Foreign": {"Yes": Foreign.pos},
    "Abbr": {"Yes": Abbreviation.pos},
    "Typo": {"Yes": Typo.pos},
    "InflClass": {
        "IndEurA": InflClass.ind_eur_a,
        "IndEurE": InflClass.ind_eur_e,
        "IndEurI": InflClass.ind_eur_i,
        "IndEurO": InflClass.ind_eur_o,
        "IndEurU": InflClass.ind_eur_u,
        "IndEurX": InflClass.ind_eur_x,
        "LatA": InflClass.lat_a,
        "LatAnom": InflClass.lat_anom,
        "LatE": InflClass.lat_e,
        "LatI": InflClass.lat_i,
        "LatI2": InflClass.lat_i2,
        "LatPron": InflClass.lat_pron,
        "LatX": InflClass.lat_x,
        "Nominal": InflClass.nominal,
    },
    "Proper": {"Yes": Proper.yes},
}
def _postprocess_latincy_ud_types(
    feature_name: str, feature_value: str
) -> tuple[str, str]:
    """Normalize (name, value) pairs that LatinCy emits with invalid UD types.

    Two corrections are applied:

    * the feature name ``"Verbform"`` is respelled ``"VerbForm"``;
    * ``"Mood"`` paired with ``"Ger"``, ``"Gdv"`` or ``"Inf"`` is re-filed
      under ``"VerbForm"``.

    Every other pair is returned unchanged.
    """
    # Verb forms that arrive mislabelled as moods.
    if feature_name == "Mood" and feature_value in ("Ger", "Gdv", "Inf"):
        return "VerbForm", feature_value

    corrected_name = "VerbForm" if feature_name == "Verbform" else feature_name
    return corrected_name, feature_value
def from_ud(
    feature_name: str, feature_value: Optional[str]
) -> Optional[MorphosyntacticFeature]:
    """Look up the feature member for a Universal Dependencies name/value pair.

    ``feature_value`` may hold several comma-separated values; the first one
    found in the table for ``feature_name`` is returned. Unrecognized names
    or values are reported on standard output and yield ``None``. An empty or
    missing value for a recognized name raises ``CLTKException``.

    >>> from_ud('Case', 'Abl')
    ablative
    >>> from_ud('Abbr', 'Yes')
    pos
    >>> from_ud('PronType', 'Ind')
    indefinite
    """
    issue_hint: str = "If you believe this is not an error in the dependency parser, please raise an issue at <https://github.com/cltk/cltk/issues> and include a short text to reproduce the error."

    def _report(headline: str) -> None:
        # Diagnostics go to stdout, followed by an empty line.
        print(headline)
        print(issue_hint)
        print("")

    # Layered names such as ``Number[psor]`` are rewritten so that the
    # feature name becomes ``Number`` and the bracketed part, title-cased
    # (``Psor``), becomes the feature value.
    # Was this for Stanza or LatinCy?
    if "[" in feature_name and "]" in feature_name:
        base_name, bracketed = feature_name.split("[", maxsplit=1)
        feature_name = base_name
        feature_value = bracketed[:-1].title()

    feature_name, feature_value = _postprocess_latincy_ud_types(
        feature_name=feature_name, feature_value=feature_value
    )

    if feature_name not in FORM_UD_MAP:
        _report(
            f"Unrecognized UD feature '{feature_name}' with value '{feature_value}'."
        )
        return None
    known_values = FORM_UD_MAP[feature_name]

    if not feature_value:
        raise CLTKException(f"{feature_name} is None")

    for candidate in feature_value.split(","):
        if candidate in known_values:
            return known_values[candidate]
        _report(f"Unrecognized value '{candidate}' for UD feature '{feature_name}'.")

    # None of the supplied values is known for this feature.
    return None