Core data models used throughout CLTK.
This module defines small, typed Pydantic models for linguistic metadata
and the main runtime containers (Word, Sentence, Doc), along with
lightweight abstractions for Process and Pipeline. These types are the
building blocks of the NLP pipeline and are designed to be simple to serialize
and render well in documentation.
Level
module-attribute
Level: TypeAlias = Literal['family', 'language', 'dialect']
Status
module-attribute
Status: TypeAlias = Literal[
"living",
"extinct",
"second language only",
"artificial",
"unattested",
"unknown",
]
Macroarea
module-attribute
Macroarea: TypeAlias = Literal[
"Africa",
"Eurasia",
"Papunesia",
"Australia",
"North America",
"South America",
"Antarctica",
]
ISOType
module-attribute
ISOType: TypeAlias = Literal['639-1', '639-2', '639-3']
ScriptDir
module-attribute
ScriptDir: TypeAlias = Literal['ltr', 'rtl', 'ttb', 'btt']
BACKEND_TYPES
module-attribute
BACKEND_TYPES: TypeAlias = Literal[
"openai",
"stanza",
"spacy",
"ollama",
"ollama-cloud",
"mistral",
]
AVAILABLE_OPENAI_MODELS
module-attribute
AVAILABLE_OPENAI_MODELS: TypeAlias = Literal[
"gpt-5-mini", "gpt-5"
]
AVAILABLE_MISTRAL_MODELS
module-attribute
AVAILABLE_MISTRAL_MODELS: TypeAlias = Literal[
"mistral-large-latest",
"magistral-small-latest",
"mistral-medium-latest",
"mistral-large-latest",
]
IPA_PRONUNCIATION_MODE
module-attribute
IPA_PRONUNCIATION_MODE: TypeAlias = Literal[
"attic_5c_bce", "koine_1c_ce", "byzantine_medieval"
]
InflectionalDomain
module-attribute
InflectionalDomain = Literal['Nominal', 'Verbal']
UDFeatureValue
Bases: BaseModel
Canonical value for a UD feature key.
Attributes:
-
code
(str)
–
Short code for the value (e.g., "Masc").
-
label
(str)
–
Human‑readable label (e.g., "Masculine").
-
description
(str)
–
Longer explanation of the value.
-
inflectional_class
(Optional[InflectionalDomain])
–
Optional class of inflectional features this value belongs to (e.g., "Nominal", "Verbal").
-
is_deprecated
(Optional[bool])
–
Whether the value is deprecated in UD.
description
instance-attribute
inflectional_class
class-attribute
instance-attribute
inflectional_class: Optional[InflectionalDomain] = None
is_deprecated
class-attribute
instance-attribute
is_deprecated: Optional[bool] = False
UDFeature
Bases: BaseModel
Canonical UD feature definition.
Attributes:
-
key
(str)
–
Feature key (e.g., "Case").
-
category
(Literal['Lexical', 'Inflectional', 'Other'])
–
High‑level category (lexical/inflectional/other).
-
description
(str)
–
Description of the feature semantics.
-
values
(dict[str, UDFeatureValue])
–
Mapping from value codes to their definitions.
category
instance-attribute
category: Literal['Lexical', 'Inflectional', 'Other']
description
instance-attribute
values
instance-attribute
values: dict[str, UDFeatureValue]
UDFeatureTag
Bases: BaseModel
A single UD feature key/value tag.
Validates a pair (key, value) against the registry, attempting to
normalize known variants via normalize_ud_feature_pair.
Attributes:
-
key
(str)
–
UD feature key (e.g., "Case").
-
value
(str)
–
UD feature value code (e.g., "Nom").
-
value_label
(str)
–
Human‑readable label resolved from the registry.
-
category
(Literal['Lexical', 'Inflectional', 'Other'])
–
Feature category populated from the canonical definition.
-
inflectional_class
(Optional[Literal['Nominal', 'Verbal']])
–
Optional inflectional class for the feature.
value_label
class-attribute
instance-attribute
category
class-attribute
instance-attribute
category: Literal["Lexical", "Inflectional", "Other"] = (
"Lexical"
)
inflectional_class
class-attribute
instance-attribute
inflectional_class: Optional[
Literal["Nominal", "Verbal"]
] = None
fill_fields
classmethod
fill_fields(data: dict) -> dict
Pre-validate and enrich tag data using the feature registry.
Attempts to normalize (key, value) pairs that are not found. On
success, populates category, inflectional_class, and
value_label based on the canonical UD_FEATURES_MAP entry.
Parameters:
-
data
(dict)
–
Input dictionary with at least key and value.
Raises:
-
ValueError
–
If required fields are missing or normalization fails.
Returns:
-
dict
–
The enriched data dictionary for model construction.
Source code in cltk/core/data_types.py
@model_validator(mode="before")
@classmethod
def fill_fields(cls, data: dict) -> dict:
    """Pre-validate and enrich tag data using the feature registry.

    Attempts to normalize ``(key, value)`` pairs that are not found. On
    success, populates ``category``, ``inflectional_class``, and
    ``value_label`` based on the canonical ``UD_FEATURES_MAP`` entry.

    The input mapping is not modified; an enriched shallow copy is returned.

    Args:
        data: Input dictionary with at least ``key`` and ``value``.

    Raises:
        ValueError: If required fields are missing or normalization fails,
            including when normalization yields a key that is not present
            in ``UD_FEATURES_MAP``.

    Returns:
        The enriched data dictionary for model construction.
    """
    # Imported lazily, as in the original implementation.
    from cltk.morphosyntax.normalization import normalize_ud_feature_pair
    from cltk.morphosyntax.ud_features import UD_FEATURES_MAP

    key = data.get("key")
    value = data.get("value")
    if not isinstance(key, str) or not isinstance(value, str):
        msg = "UDFeatureTag requires 'key' and 'value' as strings."
        logger.error(msg)
        raise ValueError(msg)

    feature = UD_FEATURES_MAP[key] if key in UD_FEATURES_MAP else None
    if feature is None or value not in feature.values:
        # Unknown pair: try to map it onto a canonical (key, value).
        normalized = normalize_ud_feature_pair(key, value)
        if not normalized:
            msg = f"Invalid value '{value}' for feature key '{key}'"
            raise ValueError(msg)
        key, value = normalized
        # Guard the lookup: a normalized key that is still unknown must
        # surface as ValueError, not as a KeyError from direct indexing.
        feature = UD_FEATURES_MAP[key] if key in UD_FEATURES_MAP else None
        if feature is None or value not in feature.values:
            msg = f"Value '{value}' is not valid for feature key '{key}' even after normalization."
            raise ValueError(msg)

    canonical = feature.values[value]
    # Work on a copy so the caller's dictionary is left untouched.
    enriched = dict(data)
    enriched["key"] = key
    enriched["value"] = value
    enriched["category"] = feature.category
    enriched["inflectional_class"] = canonical.inflectional_class
    enriched["value_label"] = canonical.label
    return enriched
UDFeatureTagSet
Bases: BaseModel
A collection of feature tags for a token.
Attributes:
-
features
(list[UDFeatureTag])
–
Feature tags for the token, kept in insertion order.
Notes
This uses a list to retain insertion order. A dictionary keyed by
feature "key" may be more efficient for lookups in some contexts.
features: list[UDFeatureTag] = []
add_feature(feature: UDFeatureTag) -> None
Add a feature to the set if the key is not already present.
Parameters:
-
feature
(UDFeatureTag)
–
Feature tag to add.
Returns:
-
None
–
None
Source code in cltk/core/data_types.py
def add_feature(self, feature: UDFeatureTag) -> None:
    """Append ``feature`` unless a tag with the same key is already stored.

    Args:
        feature: Feature tag to add.

    Returns:
        None. When the key already exists, an error is logged and the
        stored features are left unchanged.
    """
    for existing in self.features:
        if existing.key == feature.key:
            # Duplicate key: report it and keep the first tag.
            logger.error(
                f"Feature with key '{feature.key}' already exists in the tag set."
            )
            return None
    self.features.append(feature)
    logger.debug(f"Added feature {feature.key} to UDFeatureTagSet.")
CLTKGenAIResponse
Bases: BaseModel
Response model for generative backend interactions (OpenAI/Ollama).
Attributes:
-
response
(str)
–
The generated text returned by the LLM.
-
usage
(dict[str, int])
–
Token usage information (input, output, total) when available.
response
instance-attribute
ScoredText
Bases: BaseModel
Generic scored alternative, used for glosses and translations.
probability
class-attribute
instance-attribute
probability: Optional[float] = Field(
default=None, ge=0.0, le=1.0
)
note
class-attribute
instance-attribute
note: Optional[str] = None
Gloss
Bases: BaseModel
Contextual and dictionary glosses plus alternatives.
dictionary
class-attribute
instance-attribute
dictionary: Optional[str] = None
context
class-attribute
instance-attribute
context: Optional[str] = None
alternatives
class-attribute
instance-attribute
alternatives: list[ScoredText] = Field(default_factory=list)
LemmaTranslationCandidate
Bases: BaseModel
Stable lemma-level translation candidate with an optional probability.
probability
class-attribute
instance-attribute
probability: Optional[float] = Field(
default=None, ge=0.0, le=1.0
)
source
class-attribute
instance-attribute
source: Optional[str] = None
IPAEnrichment
Bases: BaseModel
IPA transcription with explicit pronunciation mode.
mode
instance-attribute
mode: IPA_PRONUNCIATION_MODE
OrthographyHelper
Bases: BaseModel
Orthography/phonology helpers to expose syllables and accent.
syllables
class-attribute
instance-attribute
syllables: list[str] = Field(default_factory=list)
stress
class-attribute
instance-attribute
stress: Optional[str] = None
accent_class
class-attribute
instance-attribute
accent_class: Optional[str] = None
phonology_trace
class-attribute
instance-attribute
phonology_trace: list[str] = Field(default_factory=list)
PedagogicalNote
Bases: BaseModel
Short, learner-facing note tied to a token or dependency relation.
token_index
class-attribute
instance-attribute
token_index: Optional[int] = None
relation
class-attribute
instance-attribute
relation: Optional[str] = None
note
class-attribute
instance-attribute
note: Optional[str] = None
disambiguates
class-attribute
instance-attribute
disambiguates: Optional[str] = None
IdiomSpan
Bases: BaseModel
Span-level idiom/MWE annotation.
id
class-attribute
instance-attribute
token_indices
class-attribute
instance-attribute
token_indices: list[int] = Field(default_factory=list)
phrase_gloss
class-attribute
instance-attribute
phrase_gloss: Optional[str] = None
kind
class-attribute
instance-attribute
kind: Optional[str] = None
confidence
class-attribute
instance-attribute
confidence: Optional[float] = Field(
default=None, ge=0.0, le=1.0
)
WordEnrichment
Bases: BaseModel
Bundle of enrichment data layered on top of morph/dependency analysis.
gloss
class-attribute
instance-attribute
gloss: Optional[Gloss] = None
lemma_translations
class-attribute
instance-attribute
lemma_translations: list[LemmaTranslationCandidate] = Field(
default_factory=list
)
ipa
class-attribute
instance-attribute
ipa: Optional[IPAEnrichment] = None
orthography
class-attribute
instance-attribute
orthography: Optional[OrthographyHelper] = None
idiom_span_ids
class-attribute
instance-attribute
idiom_span_ids: list[str] = Field(default_factory=list)
pedagogical_notes
class-attribute
instance-attribute
pedagogical_notes: list[PedagogicalNote] = Field(
default_factory=list
)
Translation
Bases: BaseModel
Structured translation with language metadata and notes.
source_lang_id
class-attribute
instance-attribute
source_lang_id: Optional[str] = None
target_lang_id
class-attribute
instance-attribute
target_lang_id: Optional[str] = None
notes
class-attribute
instance-attribute
notes: Optional[str] = None
confidence
class-attribute
instance-attribute
confidence: Optional[float] = Field(
default=None, ge=0.0, le=1.0
)
NameVariant
Bases: BaseModel
Alternative name or label for a language or dialect.
Attributes:
-
value
(str)
–
The display string for the name.
-
source
(Optional[str])
–
Optional provenance or catalogue name.
-
script
(Optional[str])
–
Optional script tag (e.g., ISO 15924).
-
language
(Optional[str])
–
Optional language code for the label text.
source
class-attribute
instance-attribute
source: Optional[str] = None
script
class-attribute
instance-attribute
script: Optional[str] = None
language
class-attribute
instance-attribute
language: Optional[str] = None
Identifier
Bases: BaseModel
External identifier record.
Attributes:
-
scheme
(str)
–
Identifier scheme (e.g., glottocode, iso639-3).
-
value
(str)
–
Identifier value within that scheme.
scheme
instance-attribute
GeoPoint
Bases: BaseModel
Geographic point in decimal degrees.
GeoArea
Bases: BaseModel
Geographic coverage for a language or dialect.
centroid
class-attribute
instance-attribute
centroid: Optional[GeoPoint] = None
macroareas
class-attribute
instance-attribute
macroareas: list[Macroarea] = Field(default_factory=list)
countries
class-attribute
instance-attribute
countries: list[str] = Field(default_factory=list)
Timespan
Bases: BaseModel
Approximate temporal coverage for a resource or orthography.
start
class-attribute
instance-attribute
start: Optional[int] = None
end
class-attribute
instance-attribute
end: Optional[int] = None
note
class-attribute
instance-attribute
note: Optional[str] = None
SourceRef
Bases: BaseModel
Bibliographic source/citation reference.
pages
class-attribute
instance-attribute
pages: Optional[str] = None
note
class-attribute
instance-attribute
note: Optional[str] = None
url
class-attribute
instance-attribute
url: Optional[AnyUrl] = None
Classification
Bases: BaseModel
Taxonomic/phylogenetic information for a language.
parent_glottocode
class-attribute
instance-attribute
parent_glottocode: Optional[str] = None
lineage
class-attribute
instance-attribute
lineage: list[str] = Field(default_factory=list)
children_glottocodes
class-attribute
instance-attribute
children_glottocodes: list[str] = Field(
default_factory=list
)
Endangerment
Bases: BaseModel
Endangerment status summary (if available).
status
class-attribute
instance-attribute
status: Optional[str] = None
source
class-attribute
instance-attribute
source: Optional[str] = None
date_assessed
class-attribute
instance-attribute
date_assessed: Optional[date] = None
note
class-attribute
instance-attribute
note: Optional[str] = None
Link
Bases: BaseModel
External hyperlink with title.
TransliterationSystem
Bases: BaseModel
Transliteration scheme description and provenance.
standard_body
class-attribute
instance-attribute
standard_body: Optional[str] = None
year
class-attribute
instance-attribute
year: Optional[int] = None
description
class-attribute
instance-attribute
description: Optional[str] = None
sources
class-attribute
instance-attribute
sources: list[SourceRef] = Field(default_factory=list)
Orthography
Bases: BaseModel
Orthography used for a language/dialect in a given period/region.
script
instance-attribute
direction
class-attribute
instance-attribute
direction: Optional[ScriptDir] = None
period
class-attribute
instance-attribute
period: Optional[Timespan] = None
region
class-attribute
instance-attribute
region: Optional[str] = None
description
class-attribute
instance-attribute
description: Optional[str] = None
conventions
class-attribute
instance-attribute
conventions: list[str] = Field(default_factory=list)
transliteration
class-attribute
instance-attribute
transliteration: list[TransliterationSystem] = Field(
default_factory=list
)
sample
class-attribute
instance-attribute
sample: Optional[str] = None
sources
class-attribute
instance-attribute
sources: list[SourceRef] = Field(default_factory=list)
links
class-attribute
instance-attribute
links: list[Link] = Field(default_factory=list)
Dialect
Bases: BaseModel
Dialect metadata record from Glottolog‑derived data.
glottolog_id
class-attribute
instance-attribute
glottolog_id: Optional[str] = None
language_code
class-attribute
instance-attribute
language_code: Optional[str] = None
status
class-attribute
instance-attribute
status: Optional[Status] = None
alt_names
class-attribute
instance-attribute
alt_names: list[NameVariant] = Field(default_factory=list)
identifiers
class-attribute
instance-attribute
identifiers: list[Identifier] = Field(default_factory=list)
geo
class-attribute
instance-attribute
geo: Optional[GeoArea] = None
timespan
class-attribute
instance-attribute
timespan: Optional[Timespan] = None
scripts
class-attribute
instance-attribute
scripts: list[str] = Field(default_factory=list)
orthographies
class-attribute
instance-attribute
orthographies: list[Orthography] = Field(
default_factory=list
)
sources
class-attribute
instance-attribute
sources: list[SourceRef] = Field(default_factory=list)
links
class-attribute
instance-attribute
links: list[Link] = Field(default_factory=list)
Language
Bases: BaseModel
Language metadata record from Glottolog‑derived data.
glottolog_id
class-attribute
instance-attribute
glottolog_id: Optional[str] = None
identifiers
class-attribute
instance-attribute
identifiers: list[Identifier] = Field(default_factory=list)
level
class-attribute
instance-attribute
level: Optional[Level] = None
status
class-attribute
instance-attribute
status: Optional[Status] = None
type
class-attribute
instance-attribute
type: Optional[str] = None
geo
class-attribute
instance-attribute
geo: Optional[GeoArea] = None
timespan
class-attribute
instance-attribute
timespan: Optional[Timespan] = None
classification
class-attribute
instance-attribute
classification: Optional[Classification] = None
family_id
class-attribute
instance-attribute
family_id: Optional[str] = None
parent_id
class-attribute
instance-attribute
parent_id: Optional[str] = None
iso
class-attribute
instance-attribute
iso: Optional[str] = None
iso_set
class-attribute
instance-attribute
iso_set: dict[ISOType, str] = Field(default_factory=dict)
alt_names
class-attribute
instance-attribute
alt_names: list[NameVariant] = Field(default_factory=list)
scripts
class-attribute
instance-attribute
scripts: list[str] = Field(default_factory=list)
orthographies
class-attribute
instance-attribute
orthographies: list[Orthography] = Field(
default_factory=list
)
sources
class-attribute
instance-attribute
sources: list[SourceRef] = Field(default_factory=list)
links
class-attribute
instance-attribute
links: list[Link] = Field(default_factory=list)
dialects
class-attribute
instance-attribute
dialects: list[Dialect] = Field(default_factory=list)
default_variety_id
class-attribute
instance-attribute
default_variety_id: Optional[str] = None
glottolog_version
class-attribute
instance-attribute
glottolog_version: Optional[str] = None
commit_sha
class-attribute
instance-attribute
commit_sha: Optional[str] = None
last_updated
class-attribute
instance-attribute
last_updated: Optional[date] = None
endangerment
class-attribute
instance-attribute
endangerment: Optional[Endangerment] = None
latitude
class-attribute
instance-attribute
latitude: Optional[float] = None
longitude
class-attribute
instance-attribute
longitude: Optional[float] = None
dates
class-attribute
instance-attribute
dates: list[int] = Field(default_factory=list)
newick
class-attribute
instance-attribute
newick: Optional[str] = None
CLTKBaseModel
Bases: BaseModel
Base Pydantic model for CLTK runtime containers.
model_config
class-attribute
instance-attribute
model_config = {'arbitrary_types_allowed': True}
Word
Bases: CLTKBaseModel
Contains attributes of each processed word in a list of words.
index_char_start
class-attribute
instance-attribute
index_char_start: Optional[int] = None
index_char_stop
class-attribute
instance-attribute
index_char_stop: Optional[int] = None
index_token
class-attribute
instance-attribute
index_token: Optional[int] = None
index_sentence
class-attribute
instance-attribute
index_sentence: Optional[int] = None
string
class-attribute
instance-attribute
string: Optional[str] = None
lemma
class-attribute
instance-attribute
lemma: Optional[str] = None
upos
class-attribute
instance-attribute
upos: Optional[UDPartOfSpeechTag] = None
features
class-attribute
instance-attribute
features: Optional[UDFeatureTagSet] = None
dependency_relation
class-attribute
instance-attribute
dependency_relation: Optional[UDDeprelTag] = None
governor
class-attribute
instance-attribute
governor: Optional[int] = None
stem
class-attribute
instance-attribute
stem: Optional[str] = None
scansion
class-attribute
instance-attribute
scansion: Optional[str] = None
xpos
class-attribute
instance-attribute
xpos: Optional[str] = None
embedding
class-attribute
instance-attribute
embedding: Optional[ndarray] = None
stop
class-attribute
instance-attribute
stop: Optional[bool] = None
named_entity
class-attribute
instance-attribute
named_entity: Optional[str] = None
syllables
class-attribute
instance-attribute
syllables: Optional[list[str]] = Field(default_factory=list)
phonetic_transcription
class-attribute
instance-attribute
phonetic_transcription: Optional[str] = None
definition
class-attribute
instance-attribute
definition: Optional[str] = None
enrichment
class-attribute
instance-attribute
enrichment: Optional[WordEnrichment] = None
annotation_sources
class-attribute
instance-attribute
annotation_sources: dict[str, str] = Field(
default_factory=dict
)
confidence
class-attribute
instance-attribute
confidence: dict[str, float] = Field(default_factory=dict)
model_config
class-attribute
instance-attribute
model_config = {'arbitrary_types_allowed': True}
Sentence
Bases: CLTKBaseModel
A sentence containing words and optional embedding.
words
class-attribute
instance-attribute
words: Optional[list[Word]] = Field(default_factory=list)
index
class-attribute
instance-attribute
index: Optional[int] = None
embedding
class-attribute
instance-attribute
embedding: Optional[ndarray] = None
translation
class-attribute
instance-attribute
translation: Optional[Translation] = None
annotation_sources
class-attribute
instance-attribute
annotation_sources: dict[str, str] = Field(
default_factory=dict
)
model_config
class-attribute
instance-attribute
model_config = {'arbitrary_types_allowed': True}
ModelConfig
Bases: BaseModel
Common base for backend configuration blocks.
model_config
class-attribute
instance-attribute
model_config = {'extra': 'forbid'}
StanzaBackendConfig
Bases: ModelConfig
Options specific to the Stanza backend.
model
class-attribute
instance-attribute
model: Optional[str] = Field(
default=None,
description="Optional non-default Stanza model/treebank name to load.",
)
model_config
class-attribute
instance-attribute
model_config = {'extra': 'forbid'}
OpenAIBackendConfig
Bases: ModelConfig
Options specific to the OpenAI/ChatGPT backend.
model
class-attribute
instance-attribute
model: Optional[Union[AVAILABLE_OPENAI_MODELS, str]] = None
temperature
class-attribute
instance-attribute
temperature: float = Field(default=1.0, ge=0, le=2)
max_output_tokens
class-attribute
instance-attribute
max_output_tokens: Optional[int] = Field(default=None, gt=0)
top_p
class-attribute
instance-attribute
top_p: Optional[float] = Field(default=None, ge=0, le=1)
presence_penalty
class-attribute
instance-attribute
presence_penalty: Optional[float] = Field(
default=None, ge=-2, le=2
)
frequency_penalty
class-attribute
instance-attribute
frequency_penalty: Optional[float] = Field(
default=None, ge=-2, le=2
)
max_retries
class-attribute
instance-attribute
max_retries: int = Field(default=2, ge=0)
api_key
class-attribute
instance-attribute
api_key: Optional[str] = None
model_config
class-attribute
instance-attribute
model_config = {'extra': 'forbid'}
MistralBackendConfig
Bases: ModelConfig
Options specific to the Mistral backend.
model
class-attribute
instance-attribute
model: Optional[Union[AVAILABLE_MISTRAL_MODELS, str]] = None
temperature
class-attribute
instance-attribute
temperature: float = Field(default=1.0, ge=0, le=2)
max_tokens
class-attribute
instance-attribute
max_tokens: Optional[int] = Field(default=None, gt=0)
top_p
class-attribute
instance-attribute
top_p: Optional[float] = Field(default=None, ge=0, le=1)
random_seed
class-attribute
instance-attribute
random_seed: Optional[int] = Field(default=None, ge=0)
max_retries
class-attribute
instance-attribute
max_retries: int = Field(default=2, ge=0)
api_key
class-attribute
instance-attribute
api_key: Optional[str] = None
model_config
class-attribute
instance-attribute
model_config = {'extra': 'forbid'}
OllamaBackendConfig
Bases: ModelConfig
Options specific to the Ollama backend (local or remote).
model
class-attribute
instance-attribute
model: Optional[str] = None
temperature
class-attribute
instance-attribute
temperature: float = Field(default=0.8, ge=0)
top_p
class-attribute
instance-attribute
top_p: Optional[float] = Field(default=None, ge=0, le=1)
num_ctx
class-attribute
instance-attribute
num_ctx: Optional[int] = Field(default=None, gt=0)
num_predict
class-attribute
instance-attribute
num_predict: Optional[int] = Field(default=None, gt=0)
host
class-attribute
instance-attribute
host: Optional[str] = Field(
default="http://127.0.0.1",
description="Base URL for the Ollama server, e.g., http://localhost or https://ollama.example.com.",
)
port
class-attribute
instance-attribute
port: Optional[int] = Field(default=11434, ge=1, le=65535)
use_cloud
class-attribute
instance-attribute
api_key
class-attribute
instance-attribute
api_key: Optional[str] = None
options
class-attribute
instance-attribute
options: dict[str, Any] = Field(
default_factory=dict,
description="Additional model options passed directly to the Ollama client.",
)
max_retries
class-attribute
instance-attribute
max_retries: int = Field(default=2, ge=0)
base_url
property
Return a combined host:port string when both are provided.
model_config
class-attribute
instance-attribute
model_config = {'extra': 'forbid'}
CLTKConfig
Bases: BaseModel
Bundled configuration for initializing cltk.nlp.NLP.
language_code
class-attribute
instance-attribute
language_code: Optional[str] = None
language
class-attribute
instance-attribute
language: Optional[Language] = None
backend
class-attribute
instance-attribute
backend: BACKEND_TYPES = 'stanza'
model
class-attribute
instance-attribute
model: Optional[str] = None
custom_pipeline
class-attribute
instance-attribute
custom_pipeline: Optional[Pipeline] = None
suppress_banner
class-attribute
instance-attribute
suppress_banner: bool = False
stanza
class-attribute
instance-attribute
stanza: Optional[StanzaBackendConfig] = None
openai
class-attribute
instance-attribute
openai: Optional[OpenAIBackendConfig] = None
mistral
class-attribute
instance-attribute
mistral: Optional[MistralBackendConfig] = None
ollama
class-attribute
instance-attribute
ollama: Optional[OllamaBackendConfig] = None
model_config
class-attribute
instance-attribute
model_config = {'extra': 'forbid'}
active_backend_config
property
active_backend_config: Optional[ModelConfig]
Return the config block matching backend.
Doc
Bases: CLTKBaseModel
Top‑level container returned from NLP() pipelines.
Attributes:
-
language
(Language)
–
Language metadata associated with the text.
-
words
(list[Word])
–
Token‑level annotations (may be empty prior to analysis).
-
pipeline
(Optional[Pipeline])
–
Pipeline instance that produced this document, if any.
-
raw
(Optional[str])
–
Raw input text, before normalization.
-
normalized_text
(Optional[str])
–
Normalized version of the text.
-
sentence_embeddings
(dict[int, ndarray])
–
Optional embeddings per sentence index.
-
sentence_translations
(dict[int, Translation])
–
Structured translations keyed by sentence index.
-
translation
(Optional[str])
–
Optional document-level translation string (usually aggregated).
-
translations
(list[Translation])
–
Collected structured translations (e.g., per sentence).
-
summary
(Optional[str])
–
Optional summary of the document.
-
topic
(Optional[str])
–
Optional topic classification.
-
discourse_relations
(list[str])
–
Discourse relation labels (if available).
-
coreferences
(list[tuple[str, str, int, int]])
–
Coreference links as (mention, antecedent, i, j).
-
sentence_boundaries
(list[tuple[int, int]])
–
List of (start, stop) character offsets.
-
genai_use
(list[dict[str, Any]])
–
List of usage/metadata dicts from model calls.
-
metadata
(dict[str, Any])
–
Arbitrary metadata about the document.
language
instance-attribute
words
class-attribute
instance-attribute
words: list[Word] = Field(default_factory=list)
pipeline
class-attribute
instance-attribute
pipeline: Optional[Pipeline] = None
raw
class-attribute
instance-attribute
raw: Optional[str] = None
normalized_text
class-attribute
instance-attribute
normalized_text: Optional[str] = None
embeddings_model
class-attribute
instance-attribute
embeddings_model: Optional[Any] = None
sentence_embeddings
class-attribute
instance-attribute
sentence_embeddings: dict[int, ndarray] = Field(
default_factory=dict
)
sentence_translations
class-attribute
instance-attribute
sentence_translations: dict[int, Translation] = Field(
default_factory=dict
)
translation
class-attribute
instance-attribute
translation: Optional[str] = None
translations
class-attribute
instance-attribute
translations: list[Translation] = Field(
default_factory=list
)
summary
class-attribute
instance-attribute
summary: Optional[str] = None
topic
class-attribute
instance-attribute
topic: Optional[str] = None
discourse_relations
class-attribute
instance-attribute
discourse_relations: list[str] = Field(default_factory=list)
coreferences
class-attribute
instance-attribute
coreferences: list[tuple[str, str, int, int]] = Field(
default_factory=list
)
idiom_spans
class-attribute
instance-attribute
idiom_spans: list[IdiomSpan] = Field(default_factory=list)
sentence_boundaries
class-attribute
instance-attribute
sentence_boundaries: list[tuple[int, int]] = Field(
default_factory=list
)
genai_use
class-attribute
instance-attribute
genai_use: list[dict[str, Any]] = Field(
default_factory=list
)
backend
class-attribute
instance-attribute
backend: Optional[BACKEND_TYPES] = None
model
class-attribute
instance-attribute
model: Optional[Union[BACKEND_TYPES, str]] = None
dialect
class-attribute
instance-attribute
dialect: Optional[Dialect] = None
metadata
class-attribute
instance-attribute
metadata: dict[str, Any] = Field(default_factory=dict)
provenance
class-attribute
instance-attribute
provenance: dict[str, ProvenanceRecord] = Field(
default_factory=dict
)
default_provenance_id
class-attribute
instance-attribute
default_provenance_id: Optional[str] = None
sentence_annotation_sources
class-attribute
instance-attribute
sentence_annotation_sources: dict[int, dict[str, str]] = (
Field(default_factory=dict)
)
sentence_strings
property
sentence_strings: list[str]
Return sentence strings derived from boundaries and text.
Returns:
-
list[str]
–
A list of substrings of normalized_text cut by sentence_boundaries. Returns an empty list if either field is missing.
sentences
property
sentences: list[Sentence]
model_config
class-attribute
instance-attribute
model_config = {'arbitrary_types_allowed': True}
Process
Bases: BaseModel
Abstract base for NLP processes operating on a Doc.
Subclasses implement run() to transform or enrich a document.
Attributes:
-
glottolog_id
(Optional[str])
–
Optional target language code for language‑specific logic.
process_id
class-attribute
glottolog_id
class-attribute
instance-attribute
glottolog_id: Optional[str] = None
run
abstractmethod
run(input_doc: Doc) -> Doc
Process input_doc and return an enriched/modified copy.
Source code in cltk/core/data_types.py
@abstractmethod
def run(self, input_doc: Doc) -> Doc:
    """Process ``input_doc`` and return an enriched/modified copy.

    Abstract placeholder: the body is intentionally empty and subclasses
    are expected to provide the actual implementation.

    Args:
        input_doc: Document to process.

    Returns:
        The processed document (this placeholder itself returns ``None``).
    """
    ...
Pipeline
Bases: BaseModel
Composable set of processes to analyze a document.
Attributes:
-
description
(Optional[str])
–
Human‑readable description.
-
processes
(Optional[list[Any]])
–
Ordered list of process classes to apply.
-
language
(Optional[Language])
–
Resolved language metadata.
-
dialect
(Optional[Dialect])
–
Resolved dialect metadata (if applicable).
-
glottolog_id
(Optional[str])
–
Language code used for auto‑resolution.
description
class-attribute
instance-attribute
description: Optional[str] = None
processes
class-attribute
instance-attribute
processes: Optional[list[Any]] = Field(default_factory=list)
language
class-attribute
instance-attribute
language: Optional[Language] = None
dialect
class-attribute
instance-attribute
dialect: Optional[Dialect] = None
glottolog_id
class-attribute
instance-attribute
glottolog_id: Optional[str] = None
spec
class-attribute
instance-attribute
spec: Optional[Any] = None
add_process
add_process(process: Any) -> None
Append a process class to the pipeline order.
Parameters:
-
process
(Any)
–
A Process subclass to add to the pipeline.
Source code in cltk/core/data_types.py
def add_process(self, process: Any) -> None:
    """Append a process class to the end of the pipeline order.

    Args:
        process: A ``Process`` subclass to add to the pipeline.
    """
    if self.processes is not None:
        # Existing list: extend it in place.
        self.processes.append(process)
        return
    # No list yet: create one, then add the process to it.
    self.processes = []
    self.processes.append(process)
describe
describe() -> list[str]
Return a human-friendly list describing pipeline order.
Source code in cltk/core/data_types.py
def describe(self) -> list[str]:
    """Return a human-friendly list describing pipeline order.

    When ``self.spec`` exposes non-empty ``steps``, one line is produced per
    step (with its enabled/disabled status); otherwise one line is produced
    per entry of ``self.processes``.

    Returns:
        Numbered description lines, starting at 1.
    """

    def _capabilities(provides: object, requires: object) -> str:
        # Optional " provides=..." / " requires=..." suffix for one line.
        suffix = ""
        if provides:
            suffix += f" provides={_format_list(provides)}"
        if requires:
            suffix += f" requires={_format_list(requires)}"
        return suffix

    described: list[str] = []

    if self.spec and getattr(self.spec, "steps", None):
        # Best effort: an unavailable registry degrades to plain step ids.
        try:
            from cltk.core.process_registry import ProcessRegistry

            known = ProcessRegistry.list_processes()
        except Exception:
            known = {}
        for number, step in enumerate(self.spec.steps, 1):
            proc_cls = known.get(step.id)
            if proc_cls:
                label = proc_cls.__name__
                provides = getattr(proc_cls, "provides", None)
                requires = getattr(proc_cls, "requires", None)
            else:
                label = step.id
                provides = None
                requires = None
            state = "enabled" if step.enabled else "disabled"
            described.append(
                f"{number}. {step.id} ({label}) [{state}]"
                + _capabilities(provides, requires)
            )
        return described

    for number, proc in enumerate(self.processes or [], 1):
        pid = _process_id(proc)
        label = _process_name(proc)
        provides = getattr(proc, "provides", None)
        requires = getattr(proc, "requires", None)
        described.append(
            f"{number}. {pid} ({label})" + _capabilities(provides, requires)
        )
    return described
enable
enable(process_id: str) -> None
Enable a step by process_id or class name.
Source code in cltk/core/data_types.py
def enable(self, process_id: str) -> None:
    """Enable a step by process_id or class name.

    If the pipeline has a spec with steps and one of them matches, that step
    is flagged as enabled and the process list is re-synced from the spec.
    Otherwise the process is looked up in the registry (by id, then by class
    name) and appended to ``self.processes`` unless it is already present.

    Args:
        process_id: Process identifier or class name of the step to enable.

    """
    if self.spec and getattr(self.spec, "steps", None):
        position = _find_step_index(self.spec.steps, process_id)
        if position is not None:
            self.spec.steps[position].enabled = True
            self._sync_processes_from_spec()
            return

    if self.processes is None:
        self.processes = []

    # Nothing to do when the process is already part of the pipeline.
    for existing in self.processes:
        if _matches_identifier(_process_id(existing), process_id):
            return
        if _process_name(existing) == process_id:
            return

    # Best-effort registry lookup; an unavailable registry means no match.
    try:
        from cltk.core.process_registry import ProcessRegistry

        known = ProcessRegistry.list_processes()
    except Exception:
        known = {}

    # Prefer a direct id hit, then fall back to the first class-name match.
    target = known.get(process_id) or next(
        (cls for cls in known.values() if cls.__name__ == process_id),
        None,
    )
    if target:
        self.processes.append(target)
disable
disable(process_id: str) -> None
Disable a step by process_id or class name.
Source code in cltk/core/data_types.py
def disable(self, process_id: str) -> None:
    """Disable a step by process_id or class name.

    A matching step in the spec is flagged as disabled and the process list
    is re-synced from the spec. When there is no spec with steps, or no step
    matches, the process is removed from the process list instead.

    Args:
        process_id: Process identifier or class name of the step to disable.

    """
    position = None
    if self.spec and getattr(self.spec, "steps", None):
        position = _find_step_index(self.spec.steps, process_id)

    if position is None:
        self._remove_process(process_id)
        return

    self.spec.steps[position].enabled = False
    self._sync_processes_from_spec()
remove
remove(process_id: str) -> None
Remove a step from the pipeline entirely.
Source code in cltk/core/data_types.py
def remove(self, process_id: str) -> None:
    """Remove a step from the pipeline entirely.

    With a spec that has steps, every step matching ``process_id`` is dropped
    from the spec and the process list is re-synced. If the spec is missing,
    has no steps, or nothing in it matched, the process is removed from the
    process list instead.

    Args:
        process_id: Process identifier or class name of the step to remove.

    """
    if not (self.spec and getattr(self.spec, "steps", None)):
        self._remove_process(process_id)
        return

    # Best-effort registry lookup, passed through to the step matcher.
    try:
        from cltk.core.process_registry import ProcessRegistry

        known = ProcessRegistry.list_processes()
    except Exception:
        known = {}

    before = list(self.spec.steps)
    self.spec.steps = [
        candidate
        for candidate in before
        if not _step_matches(candidate.id, process_id, known)
    ]
    if self.spec.steps != before:
        self._sync_processes_from_spec()
        return

    # Nothing matched in the spec; fall back to the plain process list.
    self._remove_process(process_id)
move_before
move_before(
process_id: str, before_process_id: str
) -> None
Move a step before another step.
Source code in cltk/core/data_types.py
def move_before(self, process_id: str, before_process_id: str) -> None:
    """Move a step before another step.

    The move is applied to the spec's steps when the pipeline has a spec with
    steps, and to the plain process list otherwise.

    Args:
        process_id: Identifier of the step to move.
        before_process_id: Identifier of the step it should precede.

    """
    uses_spec = self.spec and getattr(self.spec, "steps", None)
    mover = self._move_step if uses_spec else self._move_process
    mover(process_id, before_process_id, before=True)
move_after
move_after(process_id: str, after_process_id: str) -> None
Move a step after another step.
Source code in cltk/core/data_types.py
def move_after(self, process_id: str, after_process_id: str) -> None:
    """Move a step after another step.

    The move is applied to the spec's steps when the pipeline has a spec with
    steps, and to the plain process list otherwise.

    Args:
        process_id: Identifier of the step to move.
        after_process_id: Identifier of the step it should follow.

    """
    uses_spec = self.spec and getattr(self.spec, "steps", None)
    mover = self._move_step if uses_spec else self._move_process
    mover(process_id, after_process_id, before=False)
to_spec
to_spec() -> Any
Return a best-effort PipelineSpec from this pipeline.
Source code in cltk/core/data_types.py
def to_spec(self) -> Any:
    """Return a best-effort PipelineSpec from this pipeline.

    An existing ``self.spec`` is returned as-is. Otherwise a spec is built
    from ``self.processes`` with every step enabled; ``Process`` instances
    contribute their dumped fields (minus ``glottolog_id`` and ``None``
    values) as step config, anything else gets an empty config.

    Returns:
        The existing or newly built spec, or ``None`` if building it failed
        for any reason.

    """
    if self.spec is not None:
        return self.spec

    try:
        from cltk.pipeline.specs import PipelineSpec, StepSpec

        step_specs = []
        for process in self.processes or []:
            step_id = _process_id(process)
            if isinstance(process, Process):
                settings = process.model_dump(
                    exclude_none=True, exclude={"glottolog_id"}
                )
            else:
                settings = {}
            step_specs.append(StepSpec(id=step_id, enabled=True, config=settings))
        return PipelineSpec(language=self.glottolog_id, steps=step_specs)
    except Exception:
        # Deliberately best-effort: any failure yields no spec.
        return None
from_toml
classmethod
from_toml(path: str) -> Pipeline
Build a Pipeline from a TOML spec file.
Source code in cltk/core/data_types.py
@classmethod
def from_toml(cls, path: str) -> "Pipeline":
    """Build a Pipeline from a TOML spec file.

    Args:
        path: Location of the TOML pipeline spec to load.

    Returns:
        The pipeline compiled from the loaded spec.

    """
    # Imported locally, as in the rest of this class's spec-related methods.
    from cltk.pipeline.compiler import compile_pipeline
    from cltk.pipeline.spec_io import load_pipeline_spec

    return compile_pipeline(load_pipeline_spec(path))