Skip to content

data_types

Core data models used throughout CLTK.

This module defines small, typed Pydantic models for linguistic metadata and the main runtime containers (Word, Sentence, Doc), along with lightweight abstractions for Process and Pipeline. These types are the building blocks of the NLP pipeline and are designed to be simple to serialize and render well in documentation.

Level module-attribute

Level: TypeAlias = Literal['family', 'language', 'dialect']

Status module-attribute

Status: TypeAlias = Literal[
    "living",
    "extinct",
    "second language only",
    "artificial",
    "unattested",
    "unknown",
]

Macroarea module-attribute

Macroarea: TypeAlias = Literal[
    "Africa",
    "Eurasia",
    "Papunesia",
    "Australia",
    "North America",
    "South America",
    "Antarctica",
]

ISOType module-attribute

ISOType: TypeAlias = Literal['639-1', '639-2', '639-3']

ScriptDir module-attribute

ScriptDir: TypeAlias = Literal['ltr', 'rtl', 'ttb', 'btt']

BACKEND_TYPES module-attribute

BACKEND_TYPES: TypeAlias = Literal[
    "openai",
    "stanza",
    "spacy",
    "ollama",
    "ollama-cloud",
    "mistral",
]

AVAILABLE_OPENAI_MODELS module-attribute

AVAILABLE_OPENAI_MODELS: TypeAlias = Literal[
    "gpt-5-mini", "gpt-5"
]

AVAILABLE_MISTRAL_MODELS module-attribute

AVAILABLE_MISTRAL_MODELS: TypeAlias = Literal[
    "mistral-large-latest",
    "magistral-small-latest",
    "mistral-medium-latest",
    "mistral-large-latest",
]

IPA_PRONUNCIATION_MODE module-attribute

IPA_PRONUNCIATION_MODE: TypeAlias = Literal[
    "attic_5c_bce", "koine_1c_ce", "byzantine_medieval"
]

InflectionalDomain module-attribute

InflectionalDomain = Literal['Nominal', 'Verbal']

UDFeatureValue

Bases: BaseModel

Canonical value for a UD feature key.

Attributes:

  • code (str) –

    Short code for the value (e.g., "Masc").

  • label (str) –

    Human‑readable label (e.g., "Masculine").

  • description (str) –

    Longer explanation of the value.

  • inflectional_class (Optional[InflectionalDomain]) –

    Optional class of inflectional features this value belongs to (e.g., "Nominal", "Verbal").

  • is_deprecated (Optional[bool]) –

    Whether the value is deprecated in UD.

code instance-attribute

code: str

label instance-attribute

label: str

description instance-attribute

description: str

inflectional_class class-attribute instance-attribute

inflectional_class: Optional[InflectionalDomain] = None

is_deprecated class-attribute instance-attribute

is_deprecated: Optional[bool] = False

UDFeature

Bases: BaseModel

Canonical UD feature definition.

Attributes:

  • key (str) –

    Feature key (e.g., "Case").

  • category (Literal['Lexical', 'Inflectional', 'Other']) –

    High‑level category (lexical/inflectional/other).

  • description (str) –

    Description of the feature semantics.

  • values (dict[str, UDFeatureValue]) –

    Mapping from value codes to their definitions.

key instance-attribute

key: str

category instance-attribute

category: Literal['Lexical', 'Inflectional', 'Other']

description instance-attribute

description: str

values instance-attribute

values: dict[str, UDFeatureValue]

UDFeatureTag

Bases: BaseModel

A single UD feature key/value tag.

Validates a pair (key, value) against the registry, attempting to normalize known variants via normalize_ud_feature_pair.

Attributes:

  • key (str) –

    UD feature key (e.g., "Case").

  • value (str) –

    UD feature value code (e.g., "Nom").

  • value_label (str) –

    Human‑readable label resolved from the registry.

  • category (Literal['Lexical', 'Inflectional', 'Other']) –

    Feature category populated from the canonical definition.

  • inflectional_class (Optional[Literal['Nominal', 'Verbal']]) –

    Optional inflectional class for the feature.

key instance-attribute

key: str

value instance-attribute

value: str

value_label class-attribute instance-attribute

value_label: str = ''

category class-attribute instance-attribute

category: Literal["Lexical", "Inflectional", "Other"] = (
    "Lexical"
)

inflectional_class class-attribute instance-attribute

inflectional_class: Optional[
    Literal["Nominal", "Verbal"]
] = None

fill_fields classmethod

fill_fields(data: dict) -> dict

Pre-validate and enrich tag data using the feature registry.

Attempts to normalize (key, value) pairs that are not found. On success, populates category, inflectional_class, and value_label based on the canonical UD_FEATURES_MAP entry.

Parameters:

  • data (dict) –

    Input dictionary with at least key and value.

Raises:

  • ValueError

    If required fields are missing or normalization fails.

Returns:

  • dict

    The enriched data dictionary for model construction.

Source code in cltk/core/data_types.py
@model_validator(mode="before")
@classmethod
def fill_fields(cls, data: dict) -> dict:
    """Pre-validate and enrich tag data using the feature registry.

    Attempts to normalize ``(key, value)`` pairs that are not found. On
    success, populates ``category``, ``inflectional_class``, and
    ``value_label`` based on the canonical ``UD_FEATURES_MAP`` entry.

    The caller's dictionary is not modified; a shallow copy is enriched and
    returned. Non-dict input is returned unchanged so that the regular model
    validation can handle (or reject) it.

    Args:
        data: Input dictionary with at least ``key`` and ``value``.

    Raises:
        ValueError: If required fields are missing or normalization fails,
            including when the normalized key is not a registered feature.

    Returns:
        The enriched data dictionary for model construction.

    """
    from cltk.morphosyntax.normalization import normalize_ud_feature_pair
    from cltk.morphosyntax.ud_features import UD_FEATURES_MAP

    # A "before" validator sees the raw input, which is not guaranteed to be
    # a dict; leave anything else for the standard validation step.
    if not isinstance(data, dict):
        return data

    # Work on a copy so the caller's dictionary is never mutated.
    enriched = dict(data)
    key = enriched.get("key")
    value = enriched.get("value")
    if not isinstance(key, str) or not isinstance(value, str):
        msg = "UDFeatureTag requires 'key' and 'value' as strings."
        logger.error(msg)
        raise ValueError(msg)
    if key not in UD_FEATURES_MAP or value not in UD_FEATURES_MAP[key].values:
        # Try to normalize
        normalized = normalize_ud_feature_pair(key, value)
        if not normalized:
            msg = f"Invalid value '{value}' for feature key '{key}'"
            raise ValueError(msg)
        key, value = normalized
        enriched["key"] = key
        enriched["value"] = value
    # Re-check both key and value: the normalizer's output is not trusted to
    # be registered, and an unknown key must surface as ValueError rather
    # than a raw KeyError from the lookup below.
    if key not in UD_FEATURES_MAP or value not in UD_FEATURES_MAP[key].values:
        msg = f"Value '{value}' is not valid for feature key '{key}' even after normalization."
        raise ValueError(msg)
    feature = UD_FEATURES_MAP[key]
    canonical_value = feature.values[value]
    enriched["category"] = feature.category
    enriched["inflectional_class"] = canonical_value.inflectional_class
    enriched["value_label"] = canonical_value.label
    return enriched

UDFeatureTagSet

Bases: BaseModel

A collection of feature tags for a token.

Attributes:

  • features (list[UDFeatureTag]) –

    Feature tags in insertion order; defaults to an empty list.

Notes

This uses a list to retain insertion order. A dictionary keyed by feature "key" may be more efficient for lookups in some contexts.

features class-attribute instance-attribute

features: list[UDFeatureTag] = []

add_feature

add_feature(feature: UDFeatureTag) -> None

Add a feature to the set if the key is not already present.

Parameters:

  • feature (UDFeatureTag) –

    Feature tag to add.

Returns:

  • None

    None

Source code in cltk/core/data_types.py
def add_feature(self, feature: UDFeatureTag) -> None:
    """Append ``feature`` unless a tag with the same key is already stored.

    When a tag with an equal ``key`` exists, an error is logged and the set
    is left unchanged; otherwise the tag is appended and a debug message is
    logged.

    Args:
        feature: Feature tag to add.

    Returns:
        None

    """
    # Scan existing tags and bail out on the first key collision.
    for existing in self.features:
        if existing.key == feature.key:
            logger.error(
                f"Feature with key '{feature.key}' already exists in the tag set."
            )
            return None
    self.features.append(feature)
    logger.debug(f"Added feature {feature.key} to UDFeatureTagSet.")

CLTKGenAIResponse

Bases: BaseModel

Response model for generative backend interactions (OpenAI/Ollama).

Attributes:

  • response (str) –

    The generated text returned by the LLM.

  • usage (dict[str, int]) –

    Token usage information (input, output, total) when available.

response instance-attribute

response: str

usage instance-attribute

usage: dict[str, int]

ScoredText

Bases: BaseModel

Generic scored alternative, used for glosses and translations.

text instance-attribute

text: str

probability class-attribute instance-attribute

probability: Optional[float] = Field(
    default=None, ge=0.0, le=1.0
)

note class-attribute instance-attribute

note: Optional[str] = None

Gloss

Bases: BaseModel

Contextual and dictionary glosses plus alternatives.

dictionary class-attribute instance-attribute

dictionary: Optional[str] = None

context class-attribute instance-attribute

context: Optional[str] = None

alternatives class-attribute instance-attribute

alternatives: list[ScoredText] = Field(default_factory=list)

LemmaTranslationCandidate

Bases: BaseModel

Stable lemma-level translation candidate with an optional probability.

text instance-attribute

text: str

probability class-attribute instance-attribute

probability: Optional[float] = Field(
    default=None, ge=0.0, le=1.0
)

source class-attribute instance-attribute

source: Optional[str] = None

IPAEnrichment

Bases: BaseModel

IPA transcription with explicit pronunciation mode.

value instance-attribute

value: str

mode instance-attribute

mode: IPA_PRONUNCIATION_MODE

OrthographyHelper

Bases: BaseModel

Orthography/phonology helpers to expose syllables and accent.

syllables class-attribute instance-attribute

syllables: list[str] = Field(default_factory=list)

stress class-attribute instance-attribute

stress: Optional[str] = None

accent_class class-attribute instance-attribute

accent_class: Optional[str] = None

phonology_trace class-attribute instance-attribute

phonology_trace: list[str] = Field(default_factory=list)

PedagogicalNote

Bases: BaseModel

Short, learner-facing note tied to a token or dependency relation.

token_index class-attribute instance-attribute

token_index: Optional[int] = None

relation class-attribute instance-attribute

relation: Optional[str] = None

note class-attribute instance-attribute

note: Optional[str] = None

disambiguates class-attribute instance-attribute

disambiguates: Optional[str] = None

IdiomSpan

Bases: BaseModel

Span-level idiom/MWE annotation.

id class-attribute instance-attribute

id: Optional[str] = None

token_indices class-attribute instance-attribute

token_indices: list[int] = Field(default_factory=list)

phrase_gloss class-attribute instance-attribute

phrase_gloss: Optional[str] = None

kind class-attribute instance-attribute

kind: Optional[str] = None

confidence class-attribute instance-attribute

confidence: Optional[float] = Field(
    default=None, ge=0.0, le=1.0
)

WordEnrichment

Bases: BaseModel

Bundle of enrichment data layered on top of morph/dependency analysis.

gloss class-attribute instance-attribute

gloss: Optional[Gloss] = None

lemma_translations class-attribute instance-attribute

lemma_translations: list[LemmaTranslationCandidate] = Field(
    default_factory=list
)

ipa class-attribute instance-attribute

ipa: Optional[IPAEnrichment] = None

orthography class-attribute instance-attribute

orthography: Optional[OrthographyHelper] = None

idiom_span_ids class-attribute instance-attribute

idiom_span_ids: list[str] = Field(default_factory=list)

pedagogical_notes class-attribute instance-attribute

pedagogical_notes: list[PedagogicalNote] = Field(
    default_factory=list
)

Translation

Bases: BaseModel

Structured translation with language metadata and notes.

source_lang_id class-attribute instance-attribute

source_lang_id: Optional[str] = None

target_lang_id class-attribute instance-attribute

target_lang_id: Optional[str] = None

text instance-attribute

text: str

notes class-attribute instance-attribute

notes: Optional[str] = None

confidence class-attribute instance-attribute

confidence: Optional[float] = Field(
    default=None, ge=0.0, le=1.0
)

NameVariant

Bases: BaseModel

Alternative name or label for a language or dialect.

Attributes:

  • value (str) –

    The display string for the name.

  • source (Optional[str]) –

    Optional provenance or catalogue name.

  • script (Optional[str]) –

    Optional script tag (e.g., ISO 15924).

  • language (Optional[str]) –

    Optional language code for the label text.

value instance-attribute

value: str

source class-attribute instance-attribute

source: Optional[str] = None

script class-attribute instance-attribute

script: Optional[str] = None

language class-attribute instance-attribute

language: Optional[str] = None

Identifier

Bases: BaseModel

External identifier record.

Attributes:

  • scheme (str) –

    Identifier scheme (e.g., glottocode, iso639-3).

  • value (str) –

    Identifier value.

scheme instance-attribute

scheme: str

value instance-attribute

value: str

GeoPoint

Bases: BaseModel

Geographic point in decimal degrees.

lat instance-attribute

lat: float

lon instance-attribute

lon: float

GeoArea

Bases: BaseModel

Geographic coverage for a language or dialect.

centroid class-attribute instance-attribute

centroid: Optional[GeoPoint] = None

macroareas class-attribute instance-attribute

macroareas: list[Macroarea] = Field(default_factory=list)

countries class-attribute instance-attribute

countries: list[str] = Field(default_factory=list)

Timespan

Bases: BaseModel

Approximate temporal coverage for a resource or orthography.

start class-attribute instance-attribute

start: Optional[int] = None

end class-attribute instance-attribute

end: Optional[int] = None

note class-attribute instance-attribute

note: Optional[str] = None

SourceRef

Bases: BaseModel

Bibliographic source/citation reference.

key instance-attribute

key: str

pages class-attribute instance-attribute

pages: Optional[str] = None

note class-attribute instance-attribute

note: Optional[str] = None

url class-attribute instance-attribute

url: Optional[AnyUrl] = None

Classification

Bases: BaseModel

Taxonomic/phylogenetic information for a language.

level instance-attribute

level: Level

parent_glottocode class-attribute instance-attribute

parent_glottocode: Optional[str] = None

lineage class-attribute instance-attribute

lineage: list[str] = Field(default_factory=list)

children_glottocodes class-attribute instance-attribute

children_glottocodes: list[str] = Field(
    default_factory=list
)

Endangerment

Bases: BaseModel

Endangerment status summary (if available).

status class-attribute instance-attribute

status: Optional[str] = None

source class-attribute instance-attribute

source: Optional[str] = None

date_assessed class-attribute instance-attribute

date_assessed: Optional[date] = None

note class-attribute instance-attribute

note: Optional[str] = None

Link

Bases: BaseModel

External hyperlink with title.

title instance-attribute

title: str

url instance-attribute

url: AnyUrl

TransliterationSystem

Bases: BaseModel

Transliteration scheme description and provenance.

name instance-attribute

name: str

standard_body class-attribute instance-attribute

standard_body: Optional[str] = None

year class-attribute instance-attribute

year: Optional[int] = None

description class-attribute instance-attribute

description: Optional[str] = None

sources class-attribute instance-attribute

sources: list[SourceRef] = Field(default_factory=list)

Orthography

Bases: BaseModel

Orthography used for a language/dialect in a given period/region.

name instance-attribute

name: str

script instance-attribute

script: str

direction class-attribute instance-attribute

direction: Optional[ScriptDir] = None

period class-attribute instance-attribute

period: Optional[Timespan] = None

region class-attribute instance-attribute

region: Optional[str] = None

description class-attribute instance-attribute

description: Optional[str] = None

conventions class-attribute instance-attribute

conventions: list[str] = Field(default_factory=list)

transliteration class-attribute instance-attribute

transliteration: list[TransliterationSystem] = Field(
    default_factory=list
)

sample class-attribute instance-attribute

sample: Optional[str] = None

sources class-attribute instance-attribute

sources: list[SourceRef] = Field(default_factory=list)

links class-attribute instance-attribute

links: list[Link] = Field(default_factory=list)

Dialect

Bases: BaseModel

Dialect metadata record from Glottolog‑derived data.

glottolog_id class-attribute instance-attribute

glottolog_id: Optional[str] = None

language_code class-attribute instance-attribute

language_code: Optional[str] = None

name instance-attribute

name: str

status class-attribute instance-attribute

status: Optional[Status] = None

alt_names class-attribute instance-attribute

alt_names: list[NameVariant] = Field(default_factory=list)

identifiers class-attribute instance-attribute

identifiers: list[Identifier] = Field(default_factory=list)

geo class-attribute instance-attribute

geo: Optional[GeoArea] = None

timespan class-attribute instance-attribute

timespan: Optional[Timespan] = None

scripts class-attribute instance-attribute

scripts: list[str] = Field(default_factory=list)

orthographies class-attribute instance-attribute

orthographies: list[Orthography] = Field(
    default_factory=list
)

sources class-attribute instance-attribute

sources: list[SourceRef] = Field(default_factory=list)

links class-attribute instance-attribute

links: list[Link] = Field(default_factory=list)

Language

Bases: BaseModel

Language metadata record from Glottolog‑derived data.

name instance-attribute

name: str

glottolog_id class-attribute instance-attribute

glottolog_id: Optional[str] = None

identifiers class-attribute instance-attribute

identifiers: list[Identifier] = Field(default_factory=list)

level class-attribute instance-attribute

level: Optional[Level] = None

status class-attribute instance-attribute

status: Optional[Status] = None

type class-attribute instance-attribute

type: Optional[str] = None

geo class-attribute instance-attribute

geo: Optional[GeoArea] = None

timespan class-attribute instance-attribute

timespan: Optional[Timespan] = None

classification class-attribute instance-attribute

classification: Optional[Classification] = None

family_id class-attribute instance-attribute

family_id: Optional[str] = None

parent_id class-attribute instance-attribute

parent_id: Optional[str] = None

iso class-attribute instance-attribute

iso: Optional[str] = None

iso_set class-attribute instance-attribute

iso_set: dict[ISOType, str] = Field(default_factory=dict)

alt_names class-attribute instance-attribute

alt_names: list[NameVariant] = Field(default_factory=list)

scripts class-attribute instance-attribute

scripts: list[str] = Field(default_factory=list)

orthographies class-attribute instance-attribute

orthographies: list[Orthography] = Field(
    default_factory=list
)

sources class-attribute instance-attribute

sources: list[SourceRef] = Field(default_factory=list)

links class-attribute instance-attribute

links: list[Link] = Field(default_factory=list)

dialects class-attribute instance-attribute

dialects: list[Dialect] = Field(default_factory=list)

default_variety_id class-attribute instance-attribute

default_variety_id: Optional[str] = None

glottolog_version class-attribute instance-attribute

glottolog_version: Optional[str] = None

commit_sha class-attribute instance-attribute

commit_sha: Optional[str] = None

last_updated class-attribute instance-attribute

last_updated: Optional[date] = None

endangerment class-attribute instance-attribute

endangerment: Optional[Endangerment] = None

latitude class-attribute instance-attribute

latitude: Optional[float] = None

longitude class-attribute instance-attribute

longitude: Optional[float] = None

dates class-attribute instance-attribute

dates: list[int] = Field(default_factory=list)

newick class-attribute instance-attribute

newick: Optional[str] = None

CLTKBaseModel

Bases: BaseModel

Base Pydantic model for CLTK runtime containers.

model_config class-attribute instance-attribute

model_config = {'arbitrary_types_allowed': True}

Word

Bases: CLTKBaseModel

Contains attributes of each processed word in a list of words.

index_char_start class-attribute instance-attribute

index_char_start: Optional[int] = None

index_char_stop class-attribute instance-attribute

index_char_stop: Optional[int] = None

index_token class-attribute instance-attribute

index_token: Optional[int] = None

index_sentence class-attribute instance-attribute

index_sentence: Optional[int] = None

string class-attribute instance-attribute

string: Optional[str] = None

lemma class-attribute instance-attribute

lemma: Optional[str] = None

upos class-attribute instance-attribute

upos: Optional[UDPartOfSpeechTag] = None

features class-attribute instance-attribute

features: Optional[UDFeatureTagSet] = None

dependency_relation class-attribute instance-attribute

dependency_relation: Optional[UDDeprelTag] = None

governor class-attribute instance-attribute

governor: Optional[int] = None

stem class-attribute instance-attribute

stem: Optional[str] = None

scansion class-attribute instance-attribute

scansion: Optional[str] = None

xpos class-attribute instance-attribute

xpos: Optional[str] = None

embedding class-attribute instance-attribute

embedding: Optional[ndarray] = None

stop class-attribute instance-attribute

stop: Optional[bool] = None

named_entity class-attribute instance-attribute

named_entity: Optional[str] = None

syllables class-attribute instance-attribute

syllables: Optional[list[str]] = Field(default_factory=list)

phonetic_transcription class-attribute instance-attribute

phonetic_transcription: Optional[str] = None

definition class-attribute instance-attribute

definition: Optional[str] = None

enrichment class-attribute instance-attribute

enrichment: Optional[WordEnrichment] = None

annotation_sources class-attribute instance-attribute

annotation_sources: dict[str, str] = Field(
    default_factory=dict
)

confidence class-attribute instance-attribute

confidence: dict[str, float] = Field(default_factory=dict)

model_config class-attribute instance-attribute

model_config = {'arbitrary_types_allowed': True}

Sentence

Bases: CLTKBaseModel

A sentence containing words and optional embedding.

words class-attribute instance-attribute

words: Optional[list[Word]] = Field(default_factory=list)

index class-attribute instance-attribute

index: Optional[int] = None

embedding class-attribute instance-attribute

embedding: Optional[ndarray] = None

translation class-attribute instance-attribute

translation: Optional[Translation] = None

annotation_sources class-attribute instance-attribute

annotation_sources: dict[str, str] = Field(
    default_factory=dict
)

model_config class-attribute instance-attribute

model_config = {'arbitrary_types_allowed': True}

ModelConfig

Bases: BaseModel

Common base for backend configuration blocks.

model_config class-attribute instance-attribute

model_config = {'extra': 'forbid'}

StanzaBackendConfig

Bases: ModelConfig

Options specific to the Stanza backend.

model class-attribute instance-attribute

model: Optional[str] = Field(
    default=None,
    description="Optional non-default Stanza model/treebank name to load.",
)

model_config class-attribute instance-attribute

model_config = {'extra': 'forbid'}

OpenAIBackendConfig

Bases: ModelConfig

Options specific to the OpenAI/ChatGPT backend.

model class-attribute instance-attribute

model: Optional[Union[AVAILABLE_OPENAI_MODELS, str]] = None

temperature class-attribute instance-attribute

temperature: float = Field(default=1.0, ge=0, le=2)

max_output_tokens class-attribute instance-attribute

max_output_tokens: Optional[int] = Field(default=None, gt=0)

top_p class-attribute instance-attribute

top_p: Optional[float] = Field(default=None, ge=0, le=1)

presence_penalty class-attribute instance-attribute

presence_penalty: Optional[float] = Field(
    default=None, ge=-2, le=2
)

frequency_penalty class-attribute instance-attribute

frequency_penalty: Optional[float] = Field(
    default=None, ge=-2, le=2
)

max_retries class-attribute instance-attribute

max_retries: int = Field(default=2, ge=0)

api_key class-attribute instance-attribute

api_key: Optional[str] = None

model_config class-attribute instance-attribute

model_config = {'extra': 'forbid'}

MistralBackendConfig

Bases: ModelConfig

Options specific to the Mistral backend.

model class-attribute instance-attribute

model: Optional[Union[AVAILABLE_MISTRAL_MODELS, str]] = None

temperature class-attribute instance-attribute

temperature: float = Field(default=1.0, ge=0, le=2)

max_tokens class-attribute instance-attribute

max_tokens: Optional[int] = Field(default=None, gt=0)

top_p class-attribute instance-attribute

top_p: Optional[float] = Field(default=None, ge=0, le=1)

random_seed class-attribute instance-attribute

random_seed: Optional[int] = Field(default=None, ge=0)

max_retries class-attribute instance-attribute

max_retries: int = Field(default=2, ge=0)

api_key class-attribute instance-attribute

api_key: Optional[str] = None

model_config class-attribute instance-attribute

model_config = {'extra': 'forbid'}

OllamaBackendConfig

Bases: ModelConfig

Options specific to the Ollama backend (local or remote).

model class-attribute instance-attribute

model: Optional[str] = None

temperature class-attribute instance-attribute

temperature: float = Field(default=0.8, ge=0)

top_p class-attribute instance-attribute

top_p: Optional[float] = Field(default=None, ge=0, le=1)

num_ctx class-attribute instance-attribute

num_ctx: Optional[int] = Field(default=None, gt=0)

num_predict class-attribute instance-attribute

num_predict: Optional[int] = Field(default=None, gt=0)

host class-attribute instance-attribute

host: Optional[str] = Field(
    default="http://127.0.0.1",
    description="Base URL for the Ollama server, e.g., http://localhost or https://ollama.example.com.",
)

port class-attribute instance-attribute

port: Optional[int] = Field(default=11434, ge=1, le=65535)

use_cloud class-attribute instance-attribute

use_cloud: bool = False

api_key class-attribute instance-attribute

api_key: Optional[str] = None

options class-attribute instance-attribute

options: dict[str, Any] = Field(
    default_factory=dict,
    description="Additional model options passed directly to the Ollama client.",
)

max_retries class-attribute instance-attribute

max_retries: int = Field(default=2, ge=0)

base_url property

base_url: Optional[str]

Return a combined host:port string when both are provided.

model_config class-attribute instance-attribute

model_config = {'extra': 'forbid'}

CLTKConfig

Bases: BaseModel

Bundled configuration for initializing cltk.nlp.NLP.

language_code class-attribute instance-attribute

language_code: Optional[str] = None

language class-attribute instance-attribute

language: Optional[Language] = None

backend class-attribute instance-attribute

backend: BACKEND_TYPES = 'stanza'

model class-attribute instance-attribute

model: Optional[str] = None

custom_pipeline class-attribute instance-attribute

custom_pipeline: Optional[Pipeline] = None

suppress_banner class-attribute instance-attribute

suppress_banner: bool = False

stanza class-attribute instance-attribute

stanza: Optional[StanzaBackendConfig] = None

openai class-attribute instance-attribute

openai: Optional[OpenAIBackendConfig] = None

mistral class-attribute instance-attribute

mistral: Optional[MistralBackendConfig] = None

ollama class-attribute instance-attribute

ollama: Optional[OllamaBackendConfig] = None

model_config class-attribute instance-attribute

model_config = {'extra': 'forbid'}

active_backend_config property

active_backend_config: Optional[ModelConfig]

Return the config block matching backend.

Doc

Bases: CLTKBaseModel

Top‑level container returned from NLP() pipelines.

Attributes:

  • language (Language) –

    Language metadata associated with the text.

  • words (list[Word]) –

    Token‑level annotations (may be empty prior to analysis).

  • pipeline (Optional[Pipeline]) –

    Pipeline instance that produced this document, if any.

  • raw (Optional[str]) –

    Original raw text.

  • normalized_text (Optional[str]) –

    Normalized version of the text.

  • sentence_embeddings (dict[int, ndarray]) –

    Optional embeddings per sentence index.

  • sentence_translations (dict[int, Translation]) –

    Structured translations keyed by sentence index.

  • translation (Optional[str]) –

    Optional document-level translation string (usually aggregated).

  • translations (list[Translation]) –

    Collected structured translations (e.g., per sentence).

  • summary (Optional[str]) –

    Optional summary of the document.

  • topic (Optional[str]) –

    Optional topic classification.

  • discourse_relations (list[str]) –

    Discourse relation labels (if available).

  • coreferences (list[tuple[str, str, int, int]]) –

    Coreference links as (mention, antecedent, i, j).

  • sentence_boundaries (list[tuple[int, int]]) –

    List of (start, stop) character offsets.

  • genai_use (list[dict[str, Any]]) –

    List of usage/metadata dicts from model calls.

  • metadata (dict[str, Any]) –

    Arbitrary metadata about the document.

language instance-attribute

language: Language

words class-attribute instance-attribute

words: list[Word] = Field(default_factory=list)

pipeline class-attribute instance-attribute

pipeline: Optional[Pipeline] = None

raw class-attribute instance-attribute

raw: Optional[str] = None

normalized_text class-attribute instance-attribute

normalized_text: Optional[str] = None

embeddings_model class-attribute instance-attribute

embeddings_model: Optional[Any] = None

sentence_embeddings class-attribute instance-attribute

sentence_embeddings: dict[int, ndarray] = Field(
    default_factory=dict
)

sentence_translations class-attribute instance-attribute

sentence_translations: dict[int, Translation] = Field(
    default_factory=dict
)

translation class-attribute instance-attribute

translation: Optional[str] = None

translations class-attribute instance-attribute

translations: list[Translation] = Field(
    default_factory=list
)

summary class-attribute instance-attribute

summary: Optional[str] = None

topic class-attribute instance-attribute

topic: Optional[str] = None

discourse_relations class-attribute instance-attribute

discourse_relations: list[str] = Field(default_factory=list)

coreferences class-attribute instance-attribute

coreferences: list[tuple[str, str, int, int]] = Field(
    default_factory=list
)

idiom_spans class-attribute instance-attribute

idiom_spans: list[IdiomSpan] = Field(default_factory=list)

sentence_boundaries class-attribute instance-attribute

sentence_boundaries: list[tuple[int, int]] = Field(
    default_factory=list
)

genai_use class-attribute instance-attribute

genai_use: list[dict[str, Any]] = Field(
    default_factory=list
)

backend class-attribute instance-attribute

backend: Optional[BACKEND_TYPES] = None

model class-attribute instance-attribute

model: Optional[Union[BACKEND_TYPES, str]] = None

dialect class-attribute instance-attribute

dialect: Optional[Dialect] = None

metadata class-attribute instance-attribute

metadata: dict[str, Any] = Field(default_factory=dict)

provenance class-attribute instance-attribute

provenance: dict[str, ProvenanceRecord] = Field(
    default_factory=dict
)

default_provenance_id class-attribute instance-attribute

default_provenance_id: Optional[str] = None

sentence_annotation_sources class-attribute instance-attribute

sentence_annotation_sources: dict[int, dict[str, str]] = (
    Field(default_factory=dict)
)

sentence_strings property

sentence_strings: list[str]

Return sentence strings derived from boundaries and text.

Returns:

  • list[str]

    A list of substrings of normalized_text cut by sentence_boundaries. Returns an empty list if either field is missing.

sentences property

sentences: list[Sentence]

model_config class-attribute instance-attribute

model_config = {'arbitrary_types_allowed': True}

Process

Bases: BaseModel

Abstract base for NLP processes operating on a Doc.

Subclasses implement run() to transform or enrich a document.

Attributes:

  • glottolog_id (Optional[str]) –

    Optional target language code for language‑specific logic.

process_id class-attribute

process_id: str = ''

glottolog_id class-attribute instance-attribute

glottolog_id: Optional[str] = None

run abstractmethod

run(input_doc: Doc) -> Doc

Process input_doc and return an enriched/modified copy.

Source code in cltk/core/data_types.py
@abstractmethod
def run(self, input_doc: Doc) -> Doc:
    """Transform ``input_doc`` and hand back the resulting document.

    This is the abstract hook of a process: the base implementation does
    nothing and returns ``None``; subclasses supply the real behaviour.

    Args:
      input_doc: The document to process.

    Returns:
      The enriched or modified document.

    """
    ...

Pipeline

Bases: BaseModel

Composable set of processes to analyze a document.

Attributes:

  • description (Optional[str]) –

    Human‑readable description.

  • processes (Optional[list[Any]]) –

    Ordered list of process classes to apply.

  • language (Optional[Language]) –

    Resolved language metadata.

  • dialect (Optional[Dialect]) –

    Resolved dialect metadata (if applicable).

  • glottolog_id (Optional[str]) –

    Language code used for auto‑resolution.

description class-attribute instance-attribute

description: Optional[str] = None

processes class-attribute instance-attribute

processes: Optional[list[Any]] = Field(default_factory=list)

language class-attribute instance-attribute

language: Optional[Language] = None

dialect class-attribute instance-attribute

dialect: Optional[Dialect] = None

glottolog_id class-attribute instance-attribute

glottolog_id: Optional[str] = None

spec class-attribute instance-attribute

spec: Optional[Any] = None

add_process

add_process(process: Any) -> None

Append a process class to the pipeline order.

Parameters:

  • process (Any) –

    A Process subclass to add to the pipeline.

Source code in cltk/core/data_types.py
def add_process(self, process: Any) -> None:
    """Register ``process`` as the last entry of the pipeline.

    Args:
      process: A ``Process`` subclass to add to the pipeline.

    """
    # ``processes`` is Optional, so make sure a list exists before appending.
    if self.processes is None:
        self.processes = [process]
        return
    self.processes.append(process)

describe

describe() -> list[str]

Return a human-friendly list describing pipeline order.

Source code in cltk/core/data_types.py
def describe(self) -> list[str]:
    """Summarise the pipeline order as numbered, human-readable lines.

    When a spec with steps is attached, every step yields
    ``"<n>. <step id> (<class name>) [enabled|disabled]"``, where the class
    name falls back to the step id if the step is not found in the process
    registry. Otherwise every entry of ``self.processes`` yields
    ``"<n>. <process id> (<class name>)"``. In both forms a
    `` provides=...`` and/or `` requires=...`` suffix is added when the
    process declares a non-empty value for it.

    Returns:
      One formatted line per step or process, numbered from 1.

    """

    def _suffix(provides, requires) -> str:
        # Optional trailing part shared by both output formats.
        tail = ""
        if provides:
            tail += f" provides={_format_list(provides)}"
        if requires:
            tail += f" requires={_format_list(requires)}"
        return tail

    if self.spec and getattr(self.spec, "steps", None):
        try:
            from cltk.core.process_registry import ProcessRegistry

            known = ProcessRegistry.list_processes()
        except Exception:
            # Best effort: without a registry, steps are shown by id only.
            known = {}
        report: list[str] = []
        for number, step in enumerate(self.spec.steps, start=1):
            cls = known.get(step.id)
            if cls:
                label = cls.__name__
                provides = getattr(cls, "provides", None)
                requires = getattr(cls, "requires", None)
            else:
                label = step.id
                provides = requires = None
            state = "enabled" if step.enabled else "disabled"
            report.append(
                f"{number}. {step.id} ({label}) [{state}]"
                + _suffix(provides, requires)
            )
        return report

    # No spec steps: describe the plain process list instead.
    return [
        f"{number}. {_process_id(proc)} ({_process_name(proc)})"
        + _suffix(getattr(proc, "provides", None), getattr(proc, "requires", None))
        for number, proc in enumerate(self.processes or [], start=1)
    ]

enable

enable(process_id: str) -> None

Enable a step by process_id or class name.

Source code in cltk/core/data_types.py
def enable(self, process_id: str) -> None:
    """Switch on the step named by ``process_id`` (process id or class name).

    With a spec that has steps, the matching step is flagged as enabled and
    ``_sync_processes_from_spec()`` is called; an unmatched id is a no-op.
    Without such a spec, the process class is looked up in the process
    registry (by id first, then by class name) and appended to
    ``self.processes`` unless an equivalent entry is already there.

    Args:
      process_id: Process id or class name of the step to enable.

    """
    if self.spec and getattr(self.spec, "steps", None):
        position = _find_step_index(self.spec.steps, process_id)
        if position is None:
            return
        self.spec.steps[position].enabled = True
        self._sync_processes_from_spec()
        return

    if self.processes is None:
        self.processes = []

    # Nothing to do when the process is already part of the pipeline.
    for existing in self.processes:
        if (
            _matches_identifier(_process_id(existing), process_id)
            or _process_name(existing) == process_id
        ):
            return

    try:
        from cltk.core.process_registry import ProcessRegistry

        known = ProcessRegistry.list_processes()
    except Exception:
        # Best effort: an unavailable registry means nothing can be added.
        known = {}

    # Prefer a lookup by id, then fall back to matching the class name.
    target = known.get(process_id) or next(
        (cls for cls in known.values() if cls.__name__ == process_id),
        None,
    )
    if target:
        self.processes.append(target)

disable

disable(process_id: str) -> None

Disable a step by process_id or class name.

Source code in cltk/core/data_types.py
def disable(self, process_id: str) -> None:
    """Switch off the step named by ``process_id`` (process id or class name).

    With a spec that has steps, the matching step is flagged as disabled and
    ``_sync_processes_from_spec()`` is called; an unmatched id is a no-op.
    Without such a spec, the request is delegated to ``_remove_process()``.

    Args:
      process_id: Process id or class name of the step to disable.

    """
    if not (self.spec and getattr(self.spec, "steps", None)):
        self._remove_process(process_id)
        return

    position = _find_step_index(self.spec.steps, process_id)
    if position is None:
        return
    self.spec.steps[position].enabled = False
    self._sync_processes_from_spec()

remove

remove(process_id: str) -> None

Remove a step from the pipeline entirely.

Source code in cltk/core/data_types.py
def remove(self, process_id: str) -> None:
    """Drop the step named by ``process_id`` from the pipeline altogether.

    With a spec that has steps, every step matching ``process_id`` is
    filtered out of ``spec.steps`` and, if anything changed,
    ``_sync_processes_from_spec()`` is called. Without such a spec, the
    request is delegated to ``_remove_process()``.

    Args:
      process_id: Identifier of the step to remove.

    """
    if not (self.spec and getattr(self.spec, "steps", None)):
        self._remove_process(process_id)
        return

    try:
        from cltk.core.process_registry import ProcessRegistry

        known = ProcessRegistry.list_processes()
    except Exception:
        # Best effort: match steps without registry information.
        known = {}

    before = list(self.spec.steps)
    kept = []
    for step in self.spec.steps:
        if _step_matches(step.id, process_id, known):
            continue
        kept.append(step)
    self.spec.steps = kept

    # Only re-sync when the filter actually removed something.
    if self.spec.steps != before:
        self._sync_processes_from_spec()

move_before

move_before(
    process_id: str, before_process_id: str
) -> None

Move a step before another step.

Source code in cltk/core/data_types.py
def move_before(self, process_id: str, before_process_id: str) -> None:
    """Reposition a step so that it comes before another step.

    Dispatches to ``_move_step()`` when a spec with steps is attached and
    to ``_move_process()`` otherwise, passing ``before=True`` either way.

    Args:
      process_id: Identifier of the step to move.
      before_process_id: Identifier of the step it should precede.

    """
    has_steps = self.spec and getattr(self.spec, "steps", None)
    mover = self._move_step if has_steps else self._move_process
    mover(process_id, before_process_id, before=True)

move_after

move_after(process_id: str, after_process_id: str) -> None

Move a step after another step.

Source code in cltk/core/data_types.py
def move_after(self, process_id: str, after_process_id: str) -> None:
    """Reposition a step so that it comes after another step.

    Dispatches to ``_move_step()`` when a spec with steps is attached and
    to ``_move_process()`` otherwise, passing ``before=False`` either way.

    Args:
      process_id: Identifier of the step to move.
      after_process_id: Identifier of the step it should follow.

    """
    has_steps = self.spec and getattr(self.spec, "steps", None)
    mover = self._move_step if has_steps else self._move_process
    mover(process_id, after_process_id, before=False)

to_spec

to_spec() -> Any

Return a best-effort PipelineSpec from this pipeline.

Source code in cltk/core/data_types.py
def to_spec(self) -> Any:
    """Produce a ``PipelineSpec`` for this pipeline on a best-effort basis.

    An already attached spec is returned unchanged. Otherwise one enabled
    ``StepSpec`` is built per entry of ``self.processes``; ``Process``
    instances contribute their dumped fields (without ``None`` values and
    without ``glottolog_id``) as config, anything else gets an empty config.

    Returns:
      The attached or newly built spec, or ``None`` if building it raised.

    """
    existing = self.spec
    if existing is not None:
        return existing

    try:
        from cltk.pipeline.specs import PipelineSpec, StepSpec

        collected = []
        for proc in self.processes or []:
            ident = _process_id(proc)
            if isinstance(proc, Process):
                settings = proc.model_dump(
                    exclude_none=True, exclude={"glottolog_id"}
                )
            else:
                settings = {}
            collected.append(StepSpec(id=ident, enabled=True, config=settings))
        return PipelineSpec(language=self.glottolog_id, steps=collected)
    except Exception:
        # Best effort by contract: any failure yields no spec.
        return None

from_toml classmethod

from_toml(path: str) -> Pipeline

Build a Pipeline from a TOML spec file.

Source code in cltk/core/data_types.py
@classmethod
def from_toml(cls, path: str) -> "Pipeline":
    """Load a pipeline spec from a TOML file and compile it into a Pipeline.

    Args:
      path: Location of the TOML spec file.

    Returns:
      The pipeline produced by ``compile_pipeline()`` for the loaded spec.

    """
    # Imported locally, as in the other methods that reach into cltk.pipeline.
    from cltk.pipeline.compiler import compile_pipeline
    from cltk.pipeline.spec_io import load_pipeline_spec

    return compile_pipeline(load_pipeline_spec(path))