8.1.2. cltk.core package

8.1.2.1. Submodules

8.1.2.2. cltk.core.cltk_logger module

CLTK’s logging module.

8.1.2.3. cltk.core.data_types module

Custom data types for the CLTK. These types form the building blocks of the NLP pipeline.

>>> from cltk.core.data_types import Language
>>> from cltk.core.data_types import Word
>>> from cltk.core.data_types import Process
>>> from cltk.core.data_types import Doc
>>> from cltk.core.data_types import Pipeline
class cltk.core.data_types.Language(name: str, glottolog_id: str, latitude: float, longitude: float, dates: List[int], family_id: str, parent_id: str, level: str, iso_639_3_code: str, type: str)[source]

Bases: object

Holds information about any given language. Used to encode data from ISO 639-3 and Glottolog at cltk.languages.glottolog.LANGUAGES. May be extended by the user for dialects or languages not documented by ISO 639-3.

>>> from cltk.core.data_types import Language
>>> from cltk.languages.utils import get_lang
>>> lat = get_lang("lat")
>>> isinstance(lat, Language)
True
>>> lat
Language(name='Latin', glottolog_id='lati1261', latitude=41.9026, longitude=12.4502, dates=[], family_id='indo1319', parent_id='impe1234', level='language', iso_639_3_code='lat', type='a')
name: str
glottolog_id: str
latitude: float
longitude: float
dates: List[int]
family_id: str
parent_id: str
level: str
iso_639_3_code: str
type: str
class cltk.core.data_types.Word(index_char_start: int = None, index_char_stop: int = None, index_token: int = None, index_sentence: int = None, string: str = None, pos: str = None, lemma: str = None, stem: str = None, scansion: str = None, xpos: str = None, upos: str = None, dependency_relation: str = None, governor: int = None, features: cltk.morphology.morphosyntax.MorphosyntacticFeatureBundle = {}, category: cltk.morphology.morphosyntax.MorphosyntacticFeatureBundle = {}, embedding: numpy.ndarray = None, stop: bool = None, named_entity: bool = None, syllables: List[str] = None, phonetic_transcription: str = None, definition: str = None)[source]

Bases: object

Contains attributes of each processed word in a list of words. Designed to be used in the words attribute of the Doc dataclass.

>>> from cltk.core.data_types import Word
>>> from cltk.languages.example_texts import get_example_text
>>> get_example_text("lat")[:25]
'Gallia est omnis divisa i'
>>> from cltk.languages.utils import get_lang
>>> lat = get_lang("lat")
>>> Word(index_char_start=0, index_char_stop=6, index_token=0, string=get_example_text("lat")[0:6], pos="nom")
Word(index_char_start=0, index_char_stop=6, index_token=0, index_sentence=None, string='Gallia', pos='nom', lemma=None, stem=None, scansion=None, xpos=None, upos=None, dependency_relation=None, governor=None, features={}, category={}, stop=None, named_entity=None, syllables=None, phonetic_transcription=None, definition=None)
index_char_start: int = None
index_char_stop: int = None
index_token: int = None
index_sentence: int = None
string: str = None
pos: str = None
lemma: str = None
stem: str = None
scansion: str = None
xpos: str = None
upos: str = None
dependency_relation: str = None
governor: int = None
features: cltk.morphology.morphosyntax.MorphosyntacticFeatureBundle = {}
category: cltk.morphology.morphosyntax.MorphosyntacticFeatureBundle = {}
embedding: numpy.ndarray = None
stop: bool = None
named_entity: bool = None
syllables: List[str] = None
phonetic_transcription: str = None
definition: str = None
class cltk.core.data_types.Doc(language: str = None, words: List[cltk.core.data_types.Word] = None, pipeline: Optional[cltk.core.data_types.Pipeline] = None, raw: str = None, normalized_text: str = None)[source]

Bases: object

The object returned to the user from the NLP() class. Contains overall attributes of submitted texts, plus, most importantly, the processed, tokenized text in words, which is a list of Word objects.

>>> from cltk import NLP
>>> from cltk.languages.example_texts import get_example_text
>>> cltk_nlp = NLP(language="lat", suppress_banner=True)
>>> cltk_doc = cltk_nlp.analyze(text=get_example_text("lat"))
>>> cltk_doc.raw[:38]
'Gallia est omnis divisa in partes tres'
>>> isinstance(cltk_doc.raw, str)
True
>>> cltk_doc.tokens[:10]
['Gallia', 'est', 'omnis', 'divisa', 'in', 'partes', 'tres', ',', 'quarum', 'unam']
>>> cltk_doc.tokens_stops_filtered[:10]
['Gallia', 'omnis', 'divisa', 'partes', 'tres', ',', 'incolunt', 'Belgae', ',', 'aliam']
>>> cltk_doc.pos[:3]
['NOUN', 'AUX', 'PRON']
>>> cltk_doc.morphosyntactic_features[:3]
[{Case: [nominative], Degree: [positive], Gender: [feminine], Number: [singular]}, {Mood: [indicative], Number: [singular], Person: [third], Tense: [present], VerbForm: [finite], Voice: [active]}, {Case: [nominative], Degree: [positive], Gender: [feminine], Number: [singular], PrononimalType: [indefinite]}]
>>> cltk_doc[0].gender
[feminine]
>>> cltk_doc[0]['Case']
[nominative]
>>> cltk_doc.lemmata[:5]
['mallis', 'sum', 'omnis', 'divido', 'in']
>>> len(cltk_doc.sentences)
9
>>> len(cltk_doc.sentences[0])
26
>>> type(cltk_doc.sentences[0][2])
<class 'cltk.core.data_types.Word'>
>>> cltk_doc.sentences[0][2].string
'omnis'
>>> len(cltk_doc.sentences_tokens)
9
>>> len(cltk_doc.sentences_tokens[0])
26
>>> isinstance(cltk_doc.sentences_tokens[0][2], str)
True
>>> cltk_doc.sentences_tokens[0][2]
'omnis'
>>> len(cltk_doc.sentences_strings)
9
>>> len(cltk_doc.sentences_strings[0])
150
>>> isinstance(cltk_doc.sentences_strings[0], str)
True
>>> cltk_doc.sentences_strings[1]
'Hi omnes lingua , institutis , legibus inter se differunt .'
>>> import numpy as np
>>> isinstance(cltk_doc.embeddings[1], np.ndarray)
True
language: str = None
words: List[cltk.core.data_types.Word] = None
pipeline: cltk.core.data_types.Pipeline = None
raw: str = None
normalized_text: str = None
embeddings_model = None
property sentences

Returns a list of lists, with the inner list being a list of Word objects.

Return type

List[List[Word]]

property sentences_tokens

Returns a list of lists, with the inner list being a list of word token strings.

Return type

List[List[str]]

property sentences_strings

Returns a list of strings, with each string being a sentence reconstructed from the word tokens.

Return type

List[str]

property tokens

Returns a list of string word tokens of all words in the doc.

Return type

List[str]

property tokens_stops_filtered

Returns a list of string word tokens of all words in the doc, but with stopwords removed.

Return type

List[str]

property pos

Returns a list of the POS tags of all words in the doc.

Return type

List[str]

property morphosyntactic_features

Returns a list of dictionaries containing the morphosyntactic features of each word (when available). Each dictionary specifies feature names as keys and feature values as values.

Return type

List[Dict[str, str]]

property lemmata

Returns a list of lemmata, indexed to the word tokens provided by Doc.tokens.

Return type

List[str]

property stems

Returns a list of word stems, indexed to the word tokens provided by Doc.tokens.

Return type

List[str]

property embeddings

Returns an embedding for each word.

TODO: Consider option to use lemma

class cltk.core.data_types.Process(language: str = None)[source]

Bases: abc.ABC

For each type of NLP process there needs to be a definition. The definition includes the type of data the process expects (str, List[str], Word, etc.) and which field within Word it will populate. This base class is intended to be inherited by NLP process types (e.g., TokenizationProcess or DependencyProcess).

language: str = None
abstract run(input_doc)[source]
Return type

Doc

class cltk.core.data_types.Pipeline(description: str, processes: List[Type[cltk.core.data_types.Process]], language: cltk.core.data_types.Language)[source]

Bases: object

Abstract Pipeline class to be inherited.

# TODO: Consider adding Unicode normalization as a default first Process

>>> from cltk.core.data_types import Process, Pipeline
>>> from cltk.languages.utils import get_lang
>>> from cltk.tokenizers import LatinTokenizationProcess
>>> a_pipeline = Pipeline(description="A custom Latin pipeline", processes=[LatinTokenizationProcess], language=get_lang("lat"))
>>> a_pipeline.description
'A custom Latin pipeline'
>>> issubclass(a_pipeline.processes[0], Process)
True
description: str
processes: List[Type[cltk.core.data_types.Process]]
language: cltk.core.data_types.Language
add_process(process)[source]

8.1.2.4. cltk.core.exceptions module

Custom exceptions for the cltk library.

exception cltk.core.exceptions.CLTKException[source]

Bases: Exception

Exception class for the cltk library.

>>> from cltk.core.exceptions import CLTKException
>>> raise CLTKException
Traceback (most recent call last):
  ...
  File "<doctest cltk.core.exceptions.CLTKException[1]>", line 1, in <module>
    raise CLTKException
cltk.core.exceptions.CLTKException
exception cltk.core.exceptions.UnimplementedAlgorithmError[source]

Bases: cltk.core.exceptions.CLTKException

Exception for when a language is supported by the CLTK but a particular algorithm is not available for that language.

>>> from cltk.core.exceptions import UnimplementedAlgorithmError
>>> raise UnimplementedAlgorithmError
Traceback (most recent call last):
  ...
  File "<doctest cltk.core.exceptions.UnimplementedAlgorithmError[1]>", line 1, in <module>
    raise UnimplementedAlgorithmError
cltk.core.exceptions.UnimplementedAlgorithmError
exception cltk.core.exceptions.UnknownLanguageError[source]

Bases: cltk.core.exceptions.CLTKException

Exception for when a user requests a language either not known to the CLTK or not yet implemented.

All known languages are listed in cltk.languages.glottolog. Implemented languages include those at cltk.languages.pipelines and some that are implemented in various other places throughout the library.

>>> from cltk.core.exceptions import UnknownLanguageError
>>> raise UnknownLanguageError
Traceback (most recent call last):
  ...
  File "<doctest cltk.core.exceptions.UnknownLanguageError[1]>", line 1, in <module>
    raise UnknownLanguageError
cltk.core.exceptions.UnknownLanguageError
exception cltk.core.exceptions.CorpusImportError[source]

Bases: Exception

CLTK exception to use when something goes wrong importing corpora.