"""Custom data types for the CLTK. These types form the building blocks
of the NLP pipeline.
>>> from cltk.core.data_types import Language
>>> from cltk.core.data_types import Word
>>> from cltk.core.data_types import Process
>>> from cltk.core.data_types import Doc
>>> from cltk.core.data_types import Pipeline
"""
import importlib
from abc import ABC, abstractmethod
from collections import defaultdict
from dataclasses import dataclass, field
from typing import Optional, Type, Union
import numpy as np
import stringcase as sc
from cltk.morphology.morphosyntax import MorphosyntacticFeatureBundle
from cltk.morphology.universal_dependencies_features import MorphosyntacticFeature
ud_mod = importlib.import_module("cltk.morphology.universal_dependencies_features")
@dataclass(frozen=True)
class Language:
    """Immutable record describing a single language.

    Used to encode data from ISO 639-3 and Glottolog at
    ``cltk.languages.glottolog.LANGUAGES``. May be extended by the
    user for dialects or languages not documented by ISO 639-3.

    ``dates`` takes part in equality and ``repr`` but is left out of the
    generated hash, so an instance stays hashable (usable as a dict key
    or set member) even when ``dates`` holds a list.

    >>> from cltk.core.data_types import Language
    >>> from cltk.languages.utils import get_lang
    >>> lat = get_lang("lat")
    >>> isinstance(lat, Language)
    True
    >>> lat
    Language(name='Latin', glottolog_id='lati1261', latitude=41.9026, longitude=12.4502, family_id='indo1319', parent_id='impe1234', level='language', iso_639_3_code='lat', type='a', dates=[])
    """

    name: str  # Glottolog description
    glottolog_id: str
    latitude: float
    longitude: float
    family_id: str  # from Glottolog
    parent_id: str  # from Glottolog
    level: str  # a language or a dialect
    iso_639_3_code: str
    type: str  # "a" for ancient and "h" for historical; this from Glottolog
    # Add later; not available from Glottolog or ISO list. A list is
    # unhashable, so this field is excluded from ``__hash__``.
    dates: Optional[list[int]] = field(default=None, hash=False)
@dataclass
class Word:
    """Contains attributes of each processed word in a list of
    words. Designed to be used in the ``Doc.words`` dataclass.

    Every field is optional. ``features`` and ``category`` are created
    fresh for each instance, so changing one word's bundle never affects
    another word.

    Morphosyntactic features can be read either by subscription
    (``word["Case"]``) or as snake_case attributes (``word.case``); both
    delegate to ``features``.

    >>> from cltk.core.data_types import Word
    >>> Word(index_char_start=0, index_char_stop=6, string="Gallia")
    Word(index_char_start=0, index_char_stop=6, index_token=None, index_sentence=None, string='Gallia', pos=None, lemma=None, stem=None, scansion=None, xpos=None, upos=None, dependency_relation=None, governor=None, features={}, category={}, stop=None, named_entity=None, syllables=None, phonetic_transcription=None, definition=None)
    """

    index_char_start: Optional[int] = None
    index_char_stop: Optional[int] = None
    index_token: Optional[int] = None
    index_sentence: Optional[int] = None
    string: Optional[str] = None
    pos: Optional[MorphosyntacticFeature] = None
    lemma: Optional[str] = None
    stem: Optional[str] = None
    scansion: Optional[str] = None
    xpos: Optional[str] = None  # treebank-specific POS tag (from Stanza or Spacy)
    upos: Optional[str] = None  # universal POS tag (from Stanza or Spacy)
    dependency_relation: Optional[str] = None  # (from Stanza or Spacy)
    governor: Optional[int] = None
    # ``default_factory`` builds a new bundle per instance; a plain default
    # would be one object shared by every ``Word``.
    features: MorphosyntacticFeatureBundle = field(
        default_factory=MorphosyntacticFeatureBundle
    )
    category: MorphosyntacticFeatureBundle = field(
        default_factory=MorphosyntacticFeatureBundle
    )
    embedding: Optional[np.ndarray] = field(repr=False, default=None)
    stop: Optional[bool] = None
    named_entity: Optional[bool] = None
    syllables: Optional[list[str]] = None
    phonetic_transcription: Optional[str] = None
    definition: Optional[str] = None

    def __getitem__(
        self, feature_name: Union[str, Type[MorphosyntacticFeature]]
    ) -> list[MorphosyntacticFeature]:
        """Accessor to help get morphosyntactic features from a word object.

        Delegates to ``self.features[feature_name]``.
        """
        return self.features[feature_name]

    def __getattr__(self, item: str):
        """Accessor to help get morphosyntactic features from a word object.

        Python calls this only after normal attribute lookup has failed.
        ``item`` is converted to PascalCase; if a name of that form exists
        in the universal-dependencies features module, the lookup is
        delegated to ``self.features``.

        Raises:
            AttributeError: if the converted name is not defined in the
                universal-dependencies features module.
        """
        feature_name = sc.pascalcase(item)
        if feature_name not in vars(ud_mod):
            raise AttributeError(item)
        return self.features[feature_name]
@dataclass
class Sentence:
    """The data container for sentences.

    Holds the ``Word`` objects of one sentence together with the
    sentence's index and an optional sentence embedding. A sentence
    whose ``words`` has not been set reports a length of zero.
    """

    words: Optional[list[Word]] = None
    index: Optional[int] = None
    embedding: Optional[np.ndarray] = field(repr=False, default=None)

    def __getitem__(self, item: int) -> Word:
        """This indexing operation descends into the word list structure."""
        return self.words[item]

    def __len__(self) -> int:
        """Return the number of tokens in the sentence.

        Returns 0 when ``words`` is still ``None``, so ``len()`` and
        truth-testing work on a default-constructed ``Sentence``.
        """
        if self.words is None:
            return 0
        return len(self.words)
@dataclass
class Doc:
    """Result object handed back to the user by the ``NLP()`` class.

    Stores document-level attributes of the submitted text and, most
    importantly, ``words``: the list of ``Word`` objects holding the
    processed, tokenized text. The properties defined below are views
    computed from ``words`` each time they are accessed.

    >>> from cltk import NLP
    >>> from cltk.languages.example_texts import get_example_text
    >>> cltk_nlp = NLP(language="lat", suppress_banner=True)
    >>> cltk_doc = cltk_nlp.analyze(text=get_example_text("lat"))
    >>> cltk_doc.raw[:38]
    'Gallia est omnis divisa in partes tres'
    >>> isinstance(cltk_doc.raw, str)
    True
    >>> cltk_doc.tokens[:10]
    ['Gallia', 'est', 'omnis', 'divisa', 'in', 'partes', 'tres', ',', 'quarum', 'unam']
    >>> cltk_doc.tokens_stops_filtered[:10]
    ['Gallia', 'omnis', 'divisa', 'partes', 'tres', ',', 'incolunt', 'Belgae', ',', 'aliam']
    >>> cltk_doc.pos[:3]
    ['NOUN', 'AUX', 'DET']
    >>> cltk_doc.morphosyntactic_features[:3]
    [{Case: [nominative], Gender: [feminine], InflClass: [ind_eur_a], Number: [singular]}, {InflClass: [lat_anom], Number: [singular], VerbForm: [finite]}, {Case: [nominative], Gender: [feminine], InflClass: [ind_eur_i], Number: [singular], PronominalType: [indefinite]}]
    >>> cltk_doc[0].gender
    [feminine]
    >>> cltk_doc[0]['Case']
    [nominative]
    >>> cltk_doc.lemmata[:5]
    ['Gallia', 'sum', 'omnis', 'divisa', 'in']
    >>> len(cltk_doc.sentences)
    9
    >>> len(cltk_doc.sentences[0])
    26
    >>> type(cltk_doc.sentences[0][2])
    <class 'cltk.core.data_types.Word'>
    >>> cltk_doc.sentences[0][2].string
    'omnis'
    >>> len(cltk_doc.sentences_tokens)
    9
    >>> len(cltk_doc.sentences_tokens[0])
    26
    >>> isinstance(cltk_doc.sentences_tokens[0][2], str)
    True
    >>> cltk_doc.sentences_tokens[0][2]
    'omnis'
    >>> len(cltk_doc.sentences_strings)
    9
    >>> len(cltk_doc.sentences_strings[0])
    150
    >>> isinstance(cltk_doc.sentences_strings[0], str)
    True
    >>> cltk_doc.sentences_strings[1]
    'Hi omnes lingua , institutis , legibus inter se differunt .'
    >>> import numpy as np
    >>> isinstance(cltk_doc.embeddings[1], np.ndarray)
    True
    """

    language: str = None
    words: list[Word] = None
    pipeline: "Pipeline" = None  # Note: type should be ``Pipeline`` w/o quotes
    raw: str = None
    normalized_text: str = None
    # No annotation, so this is a plain class attribute, not a dataclass field.
    embeddings_model = None
    sentence_embeddings: dict[int, np.ndarray] = field(repr=False, default=None)

    @property
    def sentences(self) -> list[Sentence]:
        """Group ``words`` into ``Sentence`` objects.

        Words are bucketed by ``index_sentence`` and ordered by
        ``index_token`` within each bucket; the sentences themselves are
        returned in ascending ``index_sentence`` order. As a side effect,
        a falsy ``sentence_embeddings`` is replaced by an empty dict.

        Raises:
            ValueError: if any word has no ``index_token``.
        """
        grouped: dict[int, list[Word]] = {}
        for token in self.words:
            grouped.setdefault(token.index_sentence, []).append(token)

        # Every word is validated before any sorting is attempted.
        unindexed = next(
            (
                token
                for members in grouped.values()
                for token in members
                if token.index_token is None
            ),
            None,
        )
        if unindexed is not None:
            raise ValueError(f"Index token is not defined for {unindexed.string}")

        ordered = {
            sent_index: sorted(members, key=lambda token: token.index_token)
            for sent_index, members in grouped.items()
        }

        # Sometimes not available, nor initialized; e.g. stanza
        if not self.sentence_embeddings:
            self.sentence_embeddings = {}

        return [
            Sentence(
                words=ordered[sent_index],
                index=sent_index,
                embedding=self.sentence_embeddings.get(sent_index),
            )
            for sent_index in sorted(ordered)
        ]

    @property
    def sentences_tokens(self) -> list[list[str]]:
        """Return one inner list per sentence, holding the ``string``
        of each word in that sentence.
        """
        return [[token.string for token in sentence] for sentence in self.sentences]

    @property
    def sentences_strings(self) -> list[str]:
        """Return one string per sentence, built by joining that
        sentence's word tokens with single spaces.
        """
        return [
            # 'akk' produces list[tuple[str, str]]; only the first item is joined
            " ".join(pair[0] for pair in tokens)
            if self.language == "akk"
            else " ".join(tokens)
            for tokens in self.sentences_tokens
        ]

    def _get_words_attribute(self, attribute):
        """Collect the named attribute from every word, in ``words`` order."""
        return [getattr(entry, attribute) for entry in self.words]

    @property
    def tokens(self) -> list[str]:
        """Return the ``string`` of every word in the doc."""
        return self._get_words_attribute("string")

    @property
    def tokens_stops_filtered(
        self,
    ) -> list[str]:
        """Return the ``string`` of every word in the doc whose ``stop``
        flag is not truthy, i.e. with stopwords removed.
        """
        strings: list[str] = self._get_words_attribute("string")
        # Parallel list of True / False / None flags, one per token.
        stop_flags: list[bool] = self._get_words_attribute("stop")
        return [text for text, is_stop in zip(strings, stop_flags) if not is_stop]

    @property
    def pos(self) -> list[str]:
        """Return the POS tag (the ``upos`` attribute) of every word in the doc."""
        return self._get_words_attribute("upos")

    @property
    def morphosyntactic_features(self) -> list[MorphosyntacticFeatureBundle]:
        """Return the ``features`` bundle of every word, in ``words`` order."""
        return self._get_words_attribute("features")

    @property
    def lemmata(self) -> list[str]:
        """Return the ``lemma`` of every word, aligned index-for-index
        with `Doc.tokens`.
        """
        return self._get_words_attribute("lemma")

    @property
    def stems(self) -> list[str]:
        """Return the ``stem`` of every word, aligned index-for-index
        with `Doc.tokens`.
        """
        return self._get_words_attribute("stem")

    def __getitem__(self, word_index: int) -> Word:
        """Return the `Word` stored at position `word_index` of ``words``."""
        return self.words[word_index]

    @property
    def embeddings(self):
        """Return the ``embedding`` of every word.

        TODO: Consider option to use lemma
        """
        return self._get_words_attribute("embedding")
@dataclass
class Process(ABC):
    """Abstract base for every kind of NLP process.

    Each type of NLP process needs a definition stating the type of
    data it expects (``str``, ``list[str]``, ``Word``, etc.) and which
    field within ``Word`` it will populate. Concrete process types
    (e.g., ``TokenizationProcess`` or ``DependencyProcess``) are meant
    to inherit from this class.
    """

    language: str = None

    @abstractmethod
    def run(self, input_doc: Doc) -> Doc:
        """Take ``input_doc`` and return a ``Doc``.

        Abstract: subclasses must override this method before they can
        be instantiated.
        """
@dataclass
class Pipeline:
    """Abstract ``Pipeline`` class to be inherited.

    Bundles a description, a list of ``Process`` classes and the
    ``Language`` they apply to.

    # TODO: Consider adding a Unicode normalization as a default first Process

    >>> from cltk.core.data_types import Process, Pipeline
    >>> from cltk.languages.utils import get_lang
    >>> from cltk.tokenizers import LatinTokenizationProcess
    >>> a_pipeline = Pipeline(description="A custom Latin pipeline", processes=[LatinTokenizationProcess], language=get_lang("lat"))
    >>> a_pipeline.description
    'A custom Latin pipeline'
    >>> issubclass(a_pipeline.processes[0], Process)
    True
    """

    description: str
    processes: list[Type[Process]]
    language: Language

    def add_process(self, process: Type[Process]):
        """Append ``process`` to the end of ``self.processes``."""
        self.processes.append(process)