# Source code for cltk.core.data_types

"""Custom data types for the CLTK. These types form the building blocks
of the NLP pipeline.


>>> from cltk.core.data_types import Language
>>> from cltk.core.data_types import Word
>>> from cltk.core.data_types import Process
>>> from cltk.core.data_types import Doc
>>> from cltk.core.data_types import Pipeline
"""

import importlib
from abc import ABC, abstractmethod
from collections import defaultdict
from dataclasses import dataclass, field
from typing import Optional, Type, Union

import numpy as np
import stringcase as sc

from cltk.morphology.morphosyntax import MorphosyntacticFeatureBundle
from cltk.morphology.universal_dependencies_features import MorphosyntacticFeature

ud_mod = importlib.import_module("cltk.morphology.universal_dependencies_features")


@dataclass(frozen=True)
class Language:
    """For holding information about any given language. Used to
    encode data from ISO 639-3 and Glottolog at
    ``cltk.languages.glottolog.LANGUAGES``. May be extended by user
    for dialects or languages not documented by ISO 639-3.

    >>> from cltk.core.data_types import Language
    >>> from cltk.languages.utils import get_lang
    >>> lat = get_lang("lat")
    >>> isinstance(lat, Language)
    True
    >>> lat
    Language(name='Latin', glottolog_id='lati1261', latitude=41.9026, longitude=12.4502, family_id='indo1319', parent_id='impe1234', level='language', iso_639_3_code='lat', type='a', dates=[])
    """

    name: str  # Glottolog description
    glottolog_id: str
    latitude: float
    longitude: float
    family_id: str  # from Glottolog
    parent_id: str  # from Glottolog
    level: str  # a language or a dialect
    iso_639_3_code: str
    type: str  # "a" for ancient and "h" for historical; this is from Glottolog
    dates: Optional[list[int]] = None  # add later; not available from Glottolog or ISO list
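
# Usage sketch (illustrative, not part of the module): since ``Language`` is a
# plain frozen dataclass, a user can describe a dialect missing from ISO 639-3
# by constructing an instance directly. All field values below are assumptions
# for demonstration, not Glottolog data.
demo_dialect = Language(
    name="Demo Latin Dialect",
    glottolog_id="",
    latitude=41.9,
    longitude=12.45,
    family_id="indo1319",
    parent_id="lati1261",
    level="dialect",
    iso_639_3_code="lat",
    type="h",
)
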
@dataclass
class Word:
    """Contains attributes of each processed word in a list of
    words. Designed to be used in the ``Doc.words`` attribute.

    >>> from cltk.core.data_types import Word
    >>> Word(index_char_start=0, index_char_stop=6, string="Gallia")
    Word(index_char_start=0, index_char_stop=6, index_token=None, index_sentence=None, string='Gallia', pos=None, lemma=None, stem=None, scansion=None, xpos=None, upos=None, dependency_relation=None, governor=None, features={}, category={}, stop=None, named_entity=None, syllables=None, phonetic_transcription=None, definition=None)
    """

    index_char_start: Optional[int] = None
    index_char_stop: Optional[int] = None
    index_token: Optional[int] = None
    index_sentence: Optional[int] = None
    string: Optional[str] = None
    pos: Optional[MorphosyntacticFeature] = None
    lemma: Optional[str] = None
    stem: Optional[str] = None
    scansion: Optional[str] = None
    xpos: Optional[str] = None  # treebank-specific POS tag (from Stanza or Spacy)
    upos: Optional[str] = None  # universal POS tag (from Stanza or Spacy)
    dependency_relation: Optional[str] = None  # (from Stanza or Spacy)
    governor: Optional[int] = None
    # default_factory avoids sharing one mutable bundle across all instances
    features: MorphosyntacticFeatureBundle = field(
        default_factory=MorphosyntacticFeatureBundle
    )
    category: MorphosyntacticFeatureBundle = field(
        default_factory=MorphosyntacticFeatureBundle
    )
    embedding: Optional[np.ndarray] = field(repr=False, default=None)
    stop: Optional[bool] = None
    named_entity: Optional[bool] = None
    syllables: Optional[list[str]] = None
    phonetic_transcription: Optional[str] = None
    definition: Optional[str] = None

    def __getitem__(
        self, feature_name: Union[str, Type[MorphosyntacticFeature]]
    ) -> list[MorphosyntacticFeature]:
        """Accessor to help get morphosyntactic features from a word object."""
        return self.features[feature_name]

    def __getattr__(self, item: str):
        """Accessor to help get morphosyntactic features from a word object."""
        feature_name = sc.pascalcase(item)
        if feature_name in ud_mod.__dict__:
            return self.features[feature_name]
        else:
            raise AttributeError(item)
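
# Usage sketch (illustrative) of the two feature accessors above. It assumes
# the Latin models used by ``NLP`` have been downloaded; the exact feature
# values depend on the model's analysis.
from cltk import NLP

demo_word = NLP(language="lat", suppress_banner=True).analyze(text="Gallia est omnis")[0]
print(demo_word["Case"])  # __getitem__, keyed by feature name or type, e.g. [nominative]
print(demo_word.case)     # __getattr__ pascal-cases 'case' to 'Case'; same result
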
@dataclass
class Sentence:
    """The data container for sentences."""

    words: Optional[list[Word]] = None
    index: Optional[int] = None
    embedding: Optional[np.ndarray] = field(repr=False, default=None)

    def __getitem__(self, item: int) -> Word:
        """This indexing operation descends into the word list structure."""
        return self.words[item]

    def __len__(self) -> int:
        """Returns the number of tokens in the sentence."""
        return len(self.words)
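
# Usage sketch (illustrative): ``Sentence`` delegates indexing and length to
# its underlying word list.
demo_sentence = Sentence(words=[Word(string="Gallia"), Word(string="est")], index=0)
assert demo_sentence[1].string == "est"
assert len(demo_sentence) == 2
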
@dataclass
class Doc:
    """The object returned to the user from the ``NLP()`` class.
    Contains overall attributes of submitted texts, plus most
    importantly the processed tokenized text ``words``, being
    a list of ``Word`` types.

    >>> from cltk import NLP
    >>> from cltk.languages.example_texts import get_example_text
    >>> cltk_nlp = NLP(language="lat", suppress_banner=True)
    >>> cltk_doc = cltk_nlp.analyze(text=get_example_text("lat"))
    >>> cltk_doc.raw[:38]
    'Gallia est omnis divisa in partes tres'
    >>> isinstance(cltk_doc.raw, str)
    True
    >>> cltk_doc.tokens[:10]
    ['Gallia', 'est', 'omnis', 'divisa', 'in', 'partes', 'tres', ',', 'quarum', 'unam']
    >>> cltk_doc.tokens_stops_filtered[:10]
    ['Gallia', 'omnis', 'divisa', 'partes', 'tres', ',', 'incolunt', 'Belgae', ',', 'aliam']
    >>> cltk_doc.pos[:3]
    ['NOUN', 'AUX', 'DET']
    >>> cltk_doc.morphosyntactic_features[:3]
    [{Case: [nominative], Gender: [feminine], InflClass: [ind_eur_a], Number: [singular]}, {InflClass: [lat_anom], Number: [singular], VerbForm: [finite]}, {Case: [nominative], Gender: [feminine], InflClass: [ind_eur_i], Number: [singular], PronominalType: [indefinite]}]
    >>> cltk_doc[0].gender
    [feminine]
    >>> cltk_doc[0]['Case']
    [nominative]
    >>> cltk_doc.lemmata[:5]
    ['Gallia', 'sum', 'omnis', 'divisa', 'in']
    >>> len(cltk_doc.sentences)
    9
    >>> len(cltk_doc.sentences[0])
    26
    >>> type(cltk_doc.sentences[0][2])
    <class 'cltk.core.data_types.Word'>
    >>> cltk_doc.sentences[0][2].string
    'omnis'
    >>> len(cltk_doc.sentences_tokens)
    9
    >>> len(cltk_doc.sentences_tokens[0])
    26
    >>> isinstance(cltk_doc.sentences_tokens[0][2], str)
    True
    >>> cltk_doc.sentences_tokens[0][2]
    'omnis'
    >>> len(cltk_doc.sentences_strings)
    9
    >>> len(cltk_doc.sentences_strings[0])
    150
    >>> isinstance(cltk_doc.sentences_strings[0], str)
    True
    >>> cltk_doc.sentences_strings[1]
    'Hi omnes lingua , institutis , legibus inter se differunt .'
    >>> import numpy as np
    >>> isinstance(cltk_doc.embeddings[1], np.ndarray)
    True
    """

    language: Optional[str] = None
    words: Optional[list[Word]] = None
    pipeline: Optional["Pipeline"] = None  # forward reference; ``Pipeline`` is defined below
    raw: Optional[str] = None
    normalized_text: Optional[str] = None
    embeddings_model = None
    sentence_embeddings: Optional[dict[int, np.ndarray]] = field(repr=False, default=None)

    @property
    def sentences(self) -> list[Sentence]:
        """Returns a list of ``Sentence``s, with each ``Sentence``
        being a container for a list of ``Word`` objects."""
        sents: dict[int, list[Word]] = defaultdict(list)
        for word in self.words:
            sents[word.index_sentence].append(word)
        for key in sents:
            for w in sents[key]:
                if w.index_token is None:
                    raise ValueError(f"Index token is not defined for {w.string}")
        for key in sents:
            sents[key].sort(key=lambda x: x.index_token)
        # Sometimes not available nor initialized, e.g., with Stanza
        if not self.sentence_embeddings:
            self.sentence_embeddings = dict()
        return [
            Sentence(words=val, index=key, embedding=self.sentence_embeddings.get(key))
            for key, val in sorted(sents.items(), key=lambda x: x[0])
        ]

    @property
    def sentences_tokens(self) -> list[list[str]]:
        """Returns a list of lists, with the inner list being a
        list of word token strings.
        """
        sentences_tokens: list[list[str]] = list()
        for sentence in self.sentences:
            sentence_tokens: list[str] = [word.string for word in sentence]
            sentences_tokens.append(sentence_tokens)
        return sentences_tokens

    @property
    def sentences_strings(self) -> list[str]:
        """Returns a list of strings, with each string being
        a sentence reconstructed from the word tokens.
        """
        sentences_list: list[list[str]] = self.sentences_tokens
        sentences_str: list[str] = list()
        for sentence_tokens in sentences_list:  # type: list[str]
            if self.language == "akk":
                # 'akk' produces list[tuple[str, str]]
                sentence_tokens_str = " ".join([tup[0] for tup in sentence_tokens])
            else:
                sentence_tokens_str: str = " ".join(sentence_tokens)
            sentences_str.append(sentence_tokens_str)
        return sentences_str

    def _get_words_attribute(self, attribute):
        return [getattr(word, attribute) for word in self.words]

    @property
    def tokens(self) -> list[str]:
        """Returns a list of string word tokens of all words in the doc."""
        tokens = self._get_words_attribute("string")
        return tokens

    @property
    def tokens_stops_filtered(self) -> list[str]:
        """Returns a list of string word tokens of all words in the
        doc, but with stopwords removed.
        """
        tokens: list[str] = self._get_words_attribute("string")
        # create equal-length list of True & False/None values
        is_token_stop: list[bool] = self._get_words_attribute("stop")
        # remove from the token list any whose index in ``is_token_stop`` is True
        tokens_no_stops: list[str] = [
            token for index, token in enumerate(tokens) if not is_token_stop[index]
        ]
        return tokens_no_stops

    @property
    def pos(self) -> list[str]:
        """Returns a list of the POS tags of all words in the doc."""
        return self._get_words_attribute("upos")

    @property
    def morphosyntactic_features(self) -> list[MorphosyntacticFeatureBundle]:
        """Returns a list of ``MorphosyntacticFeatureBundle``s containing
        the morphosyntactic features of each word (when available).
        """
        return self._get_words_attribute("features")

    @property
    def lemmata(self) -> list[str]:
        """Returns a list of lemmata, indexed to the word tokens
        provided by ``Doc.tokens``.
        """
        return self._get_words_attribute("lemma")

    @property
    def stems(self) -> list[str]:
        """Returns a list of word stems, indexed to the word tokens
        provided by ``Doc.tokens``.
        """
        stems = self._get_words_attribute("stem")
        return stems

    def __getitem__(self, word_index: int) -> Word:
        """Indexing operator overloaded to return the ``Word`` at index ``word_index``."""
        return self.words[word_index]

    @property
    def embeddings(self):
        """Returns an embedding for each word.

        TODO: Consider option to use lemma
        """
        return self._get_words_attribute("embedding")
@dataclass
class Process(ABC):
    """For each type of NLP process there needs to be a definition.
    It includes the type of data it expects (``str``, ``list[str]``,
    ``Word``, etc.) and what field within ``Word`` it will populate.
    This base class is intended to be inherited by NLP process
    types (e.g., ``TokenizationProcess`` or ``DependencyProcess``).
    """

    language: Optional[str] = None

    @abstractmethod
    def run(self, input_doc: Doc) -> Doc:
        pass
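
# Sketch (illustrative, not part of the module) of a minimal concrete
# ``Process``; real implementations such as ``LatinTokenizationProcess`` live
# in their own modules and typically wrap a language-specific algorithm.
@dataclass
class WhitespaceTokenizationProcess(Process):
    """Toy process: split ``Doc.raw`` on whitespace into ``Word`` objects."""

    def run(self, input_doc: Doc) -> Doc:
        output_doc = Doc(language=self.language, raw=input_doc.raw)
        output_doc.words = [
            Word(string=token, index_token=i, index_sentence=0)
            for i, token in enumerate(input_doc.raw.split())
        ]
        return output_doc
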
@dataclass
class Pipeline:
    """Abstract ``Pipeline`` class to be inherited.

    # TODO: Consider adding a Unicode normalization as a default first Process

    >>> from cltk.core.data_types import Process, Pipeline
    >>> from cltk.languages.utils import get_lang
    >>> from cltk.tokenizers import LatinTokenizationProcess
    >>> a_pipeline = Pipeline(description="A custom Latin pipeline", processes=[LatinTokenizationProcess], language=get_lang("lat"))
    >>> a_pipeline.description
    'A custom Latin pipeline'
    >>> issubclass(a_pipeline.processes[0], Process)
    True
    """

    description: str
    processes: list[Type[Process]]
    language: Language

    def add_process(self, process: Type[Process]):
        """Append a ``Process`` type to the pipeline's list of processes."""
        self.processes.append(process)
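
# Usage sketch (illustrative): extending a pipeline after construction with
# ``add_process``, here using the hypothetical ``WhitespaceTokenizationProcess``
# sketched above.
from cltk.languages.utils import get_lang

demo_pipeline = Pipeline(
    description="A demo Latin pipeline",
    processes=[],
    language=get_lang("lat"),
)
demo_pipeline.add_process(WhitespaceTokenizationProcess)
assert demo_pipeline.processes == [WhitespaceTokenizationProcess]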