8.1.2. cltk.core package

8.1.2.1. Submodules

8.1.2.2. cltk.core.cltk_logger module

CLTK’s logging module.

8.1.2.3. cltk.core.data_types module

Custom data types for the CLTK. These types form the building blocks of the NLP pipeline.

>>> from cltk.core.data_types import Language
>>> from cltk.core.data_types import Word
>>> from cltk.core.data_types import Process
>>> from cltk.core.data_types import Doc
>>> from cltk.core.data_types import Pipeline
class cltk.core.data_types.Language(name, glottolog_id, latitude, longitude, family_id, parent_id, level, iso_639_3_code, type, dates=None)[source]

Bases: object

For holding information about any given language. Used to encode data from ISO 639-3 and Glottolog at cltk.languages.glottolog.LANGUAGES. May be extended by user for dialects or languages not documented by ISO 639-3.

>>> from cltk.core.data_types import Language
>>> from cltk.languages.utils import get_lang
>>> lat = get_lang("lat")
>>> isinstance(lat, Language)
True
>>> lat
Language(name='Latin', glottolog_id='lati1261', latitude=41.9026, longitude=12.4502, family_id='indo1319', parent_id='impe1234', level='language', iso_639_3_code='lat', type='a', dates=[])
name: str
glottolog_id: str
latitude: float
longitude: float
family_id: str
parent_id: str
level: str
iso_639_3_code: str
type: str
dates: list[int] = None
class cltk.core.data_types.Word(index_char_start=None, index_char_stop=None, index_token=None, index_sentence=None, string=None, pos=None, lemma=None, stem=None, scansion=None, xpos=None, upos=None, dependency_relation=None, governor=None, features={}, category={}, embedding=None, stop=None, named_entity=None, syllables=None, phonetic_transcription=None, definition=None)[source]

Bases: object

Contains attributes of each processed word in a list of words. Designed to be used in the Doc.words dataclass.

>>> from cltk.core.data_types import Word
>>> Word(index_char_start=0, index_char_stop=6, string="Gallia")
Word(index_char_start=0, index_char_stop=6, index_token=None, index_sentence=None, string='Gallia', pos=None, lemma=None, stem=None, scansion=None, xpos=None, upos=None, dependency_relation=None, governor=None, features={}, category={}, stop=None, named_entity=None, syllables=None, phonetic_transcription=None, definition=None)
index_char_start: Optional[int] = None
index_char_stop: Optional[int] = None
index_token: Optional[int] = None
index_sentence: Optional[int] = None
string: Optional[str] = None
pos: Optional[MorphosyntacticFeature] = None
lemma: Optional[str] = None
stem: Optional[str] = None
scansion: Optional[str] = None
xpos: Optional[str] = None
upos: Optional[str] = None
dependency_relation: Optional[str] = None
governor: Optional[int] = None
features: MorphosyntacticFeatureBundle = {}
category: MorphosyntacticFeatureBundle = {}
embedding: ndarray = None
stop: Optional[bool] = None
named_entity: Optional[bool] = None
syllables: list[str] = None
phonetic_transcription: Optional[str] = None
definition: Optional[str] = None
class cltk.core.data_types.Sentence(words=None, index=None, embedding=None)[source]

Bases: object

The data container for sentences.

words: list[cltk.core.data_types.Word] = None
index: int = None
embedding: ndarray = None
class cltk.core.data_types.Doc(language=None, words=None, pipeline=None, raw=None, normalized_text=None, sentence_embeddings=None)[source]

Bases: object

The object returned to the user from the NLP() class. Contains overall attributes of the submitted text plus, most importantly, the processed, tokenized text in words, a list of Word objects.

>>> from cltk import NLP
>>> from cltk.languages.example_texts import get_example_text
>>> cltk_nlp = NLP(language="lat", suppress_banner=True)
>>> cltk_doc = cltk_nlp.analyze(text=get_example_text("lat"))
>>> cltk_doc.raw[:38]
'Gallia est omnis divisa in partes tres'
>>> isinstance(cltk_doc.raw, str)
True
>>> cltk_doc.tokens[:10]
['Gallia', 'est', 'omnis', 'divisa', 'in', 'partes', 'tres', ',', 'quarum', 'unam']
>>> cltk_doc.tokens_stops_filtered[:10]
['Gallia', 'omnis', 'divisa', 'partes', 'tres', ',', 'incolunt', 'Belgae', ',', 'aliam']
>>> cltk_doc.pos[:3]
['NOUN', 'AUX', 'DET']
>>> cltk_doc.morphosyntactic_features[:3]
[{Case: [nominative], Gender: [feminine], InflClass: [ind_eur_a], Number: [singular]}, {InflClass: [lat_anom], Number: [singular], VerbForm: [finite]}, {Case: [nominative], Gender: [feminine], InflClass: [ind_eur_i], Number: [singular], PronominalType: [indefinite]}]
>>> cltk_doc[0].gender
[feminine]
>>> cltk_doc[0]['Case']
[nominative]
>>> cltk_doc.lemmata[:5]
['Gallia', 'sum', 'omnis', 'divisa', 'in']
>>> len(cltk_doc.sentences)
9
>>> len(cltk_doc.sentences[0])
26
>>> type(cltk_doc.sentences[0][2])
<class 'cltk.core.data_types.Word'>
>>> cltk_doc.sentences[0][2].string
'omnis'
>>> len(cltk_doc.sentences_tokens)
9
>>> len(cltk_doc.sentences_tokens[0])
26
>>> isinstance(cltk_doc.sentences_tokens[0][2], str)
True
>>> cltk_doc.sentences_tokens[0][2]
'omnis'
>>> len(cltk_doc.sentences_strings)
9
>>> len(cltk_doc.sentences_strings[0])
150
>>> isinstance(cltk_doc.sentences_strings[0], str)
True
>>> cltk_doc.sentences_strings[1]
'Hi omnes lingua , institutis , legibus inter se differunt .'
>>> import numpy as np
>>> isinstance(cltk_doc.embeddings[1], np.ndarray)
True
language: str = None
words: list[cltk.core.data_types.Word] = None
pipeline: Pipeline = None
raw: str = None
normalized_text: str = None
embeddings_model = None
sentence_embeddings: dict[int, numpy.ndarray] = None
property sentences: list[cltk.core.data_types.Sentence]

Returns a list of Sentence objects, with each Sentence being a container for a list of Word objects.

Return type:

list[Sentence]

property sentences_tokens: list[list[str]]

Returns a list of lists, with the inner list being a list of word token strings.

Return type:

list[list[str]]

property sentences_strings: list[str]

Returns a list of strings, with each string being a sentence reconstructed from the word tokens.

Return type:

list[str]

property tokens: list[str]

Returns a list of string word tokens of all words in the doc.

Return type:

list[str]

property tokens_stops_filtered: list[str]

Returns a list of string word tokens of all words in the doc, but with stopwords removed.

Return type:

list[str]

property pos: list[str]

Returns a list of the POS tags of all words in the doc.

Return type:

list[str]

property morphosyntactic_features: list[cltk.morphology.morphosyntax.MorphosyntacticFeatureBundle]

Returns a list of MorphosyntacticFeatureBundle containing the morphosyntactic features of each word (when available).

Return type:

list[MorphosyntacticFeatureBundle]

property lemmata: list[str]

Returns a list of lemmata, indexed to the word tokens provided by Doc.tokens.

Return type:

list[str]

property stems: list[str]

Returns a list of word stems, indexed to the word tokens provided by Doc.tokens.

Return type:

list[str]

property embeddings

Returns an embedding for each word.

TODO: Consider option to use lemma

class cltk.core.data_types.Process(language=None)[source]

Bases: ABC

For each type of NLP process there needs to be a definition. It includes the type of data it expects (str, list[str], Word, etc.) and what field within Word it will populate. This base class is intended to be inherited by NLP process types (e.g., TokenizationProcess or DependencyProcess).

language: str = None
abstract run(input_doc)[source]
Return type:

Doc

class cltk.core.data_types.Pipeline(description, processes, language)[source]

Bases: object

Abstract Pipeline class to be inherited.

# TODO: Consider adding a Unicode normalization as a default first Process

>>> from cltk.core.data_types import Process, Pipeline
>>> from cltk.languages.utils import get_lang
>>> from cltk.tokenizers import LatinTokenizationProcess
>>> a_pipeline = Pipeline(description="A custom Latin pipeline", processes=[LatinTokenizationProcess], language=get_lang("lat"))
>>> a_pipeline.description
'A custom Latin pipeline'
>>> issubclass(a_pipeline.processes[0], Process)
True
description: str
processes: list[Type[cltk.core.data_types.Process]]
language: Language
add_process(process)[source]

8.1.2.4. cltk.core.exceptions module

Custom exceptions for cltk library.

exception cltk.core.exceptions.CLTKException[source]

Bases: Exception

Exception class for the cltk library.

>>> from cltk.core.exceptions import CLTKException
>>> raise CLTKException
Traceback (most recent call last):
  ...
  File "<doctest cltk.core.exceptions.CLTKException[1]>", line 1, in <module>
    raise CLTKException
cltk.core.exceptions.CLTKException
exception cltk.core.exceptions.UnimplementedAlgorithmError[source]

Bases: CLTKException

Exception for when a language is supported by the CLTK however a particular algorithm is not available for that language.

>>> from cltk.core.exceptions import UnimplementedAlgorithmError
>>> raise UnimplementedAlgorithmError
Traceback (most recent call last):
  ...
  File "<doctest cltk.core.exceptions.UnimplementedAlgorithmError[1]>", line 1, in <module>
    raise UnimplementedAlgorithmError
cltk.core.exceptions.UnimplementedAlgorithmError
exception cltk.core.exceptions.UnknownLanguageError[source]

Bases: CLTKException

Exception for when a user requests a language either not known to the CLTK or not yet implemented.

All known languages are listed at cltk.languages.glottolog.py. Implemented languages include those at cltk.languages.pipelines and some implemented miscellaneously throughout the library.

>>> from cltk.core.exceptions import UnknownLanguageError
>>> raise UnknownLanguageError
Traceback (most recent call last):
  ...
  File "<doctest cltk.core.exceptions.UnknownLanguageError[1]>", line 1, in <module>
    raise UnknownLanguageError
cltk.core.exceptions.UnknownLanguageError
exception cltk.core.exceptions.CorpusImportError[source]

Bases: Exception

CLTK exception to use when something goes wrong importing corpora.