Skip to content

cltk.stanza

Optional Stanza-backed processes for CLTK.

This subpackage is available when installing the stanza extra:

pip install "cltk[stanza]"

It exposes: StanzaAnalyzeProcess

StanzaAnalyzeProcess

Bases: Process

Run Stanza and populate a Doc with UD annotations.

Notes
  • If Stanza is not installed, raises an ImportError with guidance to install the optional extra.
  • Uses Stanza defaults; the language package must already be available locally, otherwise Stanza's own download mechanism is triggered.

glottolog_id class-attribute instance-attribute

glottolog_id: Optional[str] = None

process_id class-attribute

process_id: str = 'stanza.analyze'

run

run(input_doc: Doc) -> Doc

Run a Stanza pipeline and populate the Doc with UD annotations.

Source code in cltk/stanza/processes.py
def run(self, input_doc: Doc) -> Doc:
    """Run a Stanza pipeline and populate the Doc with UD annotations.

    The method works on a new ``Doc`` built from ``input_doc.model_dump()``
    and returns that new object. Stanza is run with the
    ``tokenize,pos,lemma,depparse`` processors over ``normalized_text``; each
    Stanza word becomes a ``Word`` carrying lemma, UPOS, UD features,
    dependency relation and governor. Sentence boundaries are derived from
    token character offsets when Stanza provides them.

    Args:
        input_doc: Document to annotate. Must have a non-empty
            ``normalized_text``. ``metadata["stanza_package"]``, when
            present, is forwarded to the pipeline as the package override.

    Returns:
        A new ``Doc`` with ``words`` set and, when offsets were available,
        ``sentence_boundaries`` and ``sentence_annotation_sources`` filled.

    Raises:
        ValueError: If ``normalized_text`` is empty, if no glottolog id is
            available on the process or the document's language, or if the
            glottolog id has no Stanza language mapping.
        ImportError: If the ``stanza`` package cannot be imported.
    """
    output_doc = Doc(**input_doc.model_dump())
    log = bind_context(
        glottolog_id=getattr(self, "glottolog_id", None), model="stanza"
    )

    if not output_doc.normalized_text:
        raise ValueError("Doc must have `normalized_text` to run Stanza.")

    # Optional dependency guard. The import is intentionally "unused": its
    # only job is to fail early with installation guidance. The `noqa` keeps
    # linters from stripping it (which would turn the guard into a no-op).
    try:
        import stanza  # noqa: F401
    except ImportError as e:  # pragma: no cover - only when stanza missing
        raise ImportError(
            "Stanza not installed. Install with: pip install 'cltk[stanza]'"
        ) from e

    # The process-level id takes precedence over the document's language.
    lang_id = (
        getattr(self, "glottolog_id", None) or output_doc.language.glottolog_id
    )
    if lang_id is None:
        raise ValueError("glottolog_id must be set to run Stanza.")
    lang = _GLOTTO_TO_STANZA_LANG.get(lang_id)
    if not lang:
        raise ValueError(
            f"No Stanza language mapping for glottolog_id='{lang_id}'."
        )

    # Optional per-treebank override (e.g., different UD package).
    # Best effort: any failure reading metadata means "no override".
    stanza_package: Optional[str] = None
    try:
        stanza_package = output_doc.metadata.get("stanza_package")
    except Exception:
        stanza_package = None

    # Single source of truth for the processor list used both in the
    # provenance notes and in the pipeline construction below.
    processors = "tokenize,pos,lemma,depparse"

    config_snapshot = extract_doc_config(output_doc)
    prov_record = build_provenance_record(
        language=lang_id,
        backend=output_doc.backend or "stanza",
        process=self.__class__.__name__,
        model=stanza_package or "stanza",
        provider="stanza",
        config=config_snapshot,
        notes={
            "stanza_lang": lang,
            "processors": processors,
            "stanza_package": stanza_package,
        },
    )
    prov_id = add_provenance_record(
        output_doc,
        prov_record,
        set_default=output_doc.default_provenance_id is None,
    )

    # Build Stanza pipeline; let it handle sentence splitting and tagging
    nlp = _get_stanza_pipeline(
        lang=lang,
        processors=processors,
        tokenize_no_ssplit=False,
        package=stanza_package,
    )
    sdoc = nlp(output_doc.normalized_text)

    words: list[Word] = []
    remap_report = UDFeatureRemapReport()
    sent_bounds: list[tuple[int, int]] = []
    token_counter = 0
    for s_idx, sent in enumerate(getattr(sdoc, "sentences", []) or []):
        # Sentence boundary from token char offsets when available:
        # smallest start and largest end over the sentence's tokens.
        s_tokens = getattr(sent, "tokens", []) or []
        s_start = None
        s_end = None
        for t in s_tokens:
            sc = getattr(t, "start_char", None)
            ec = getattr(t, "end_char", None)
            if isinstance(sc, int):
                s_start = sc if s_start is None else min(s_start, sc)
            if isinstance(ec, int):
                s_end = ec if s_end is None else max(s_end, ec)
        # NOTE(review): a sentence without usable offsets is skipped here, so
        # positions in `sent_bounds` can drift from `index_sentence` (s_idx)
        # for later sentences - confirm whether consumers rely on alignment.
        if s_start is not None and s_end is not None:
            sent_bounds.append((s_start, s_end))

        # Stanza words for dependency info (head/deprel) live under sent.words
        for w in getattr(sent, "words", []) or []:
            form = getattr(w, "text", None)
            lemma = getattr(w, "lemma", None)
            upos_s = getattr(w, "upos", None)
            feats_s = getattr(w, "feats", None)
            head_i = getattr(w, "head", None)
            deprel_s = getattr(w, "deprel", None)

            # UPOS: unknown/invalid tags degrade to None instead of failing.
            upos_obj: Optional[UDPartOfSpeechTag] = None
            if isinstance(upos_s, str) and upos_s:
                try:
                    upos_obj = UDPartOfSpeechTag(tag=upos_s)
                except Exception:
                    upos_obj = None

            # FEATS: "_" is the CoNLL-U placeholder for "no features".
            feats_obj: Optional[UDFeatureTagSet] = None
            if isinstance(feats_s, str) and feats_s and feats_s != "_":
                try:
                    feats_obj = convert_pos_features_to_ud(
                        feats_raw=feats_s,
                        remap_report=remap_report,
                        source_word=form if isinstance(form, str) else None,
                    )
                except Exception:
                    feats_obj = None

            # DEPREL: split "main:subtype"; subtype is None when there is no
            # colon at all (an empty subtype after a colon stays "").
            dep_obj: Optional[UDDeprelTag] = None
            if isinstance(deprel_s, str) and deprel_s:
                main, sep, rest = deprel_s.partition(":")
                subtype = rest if sep else None
                try:
                    dep_obj = get_ud_deprel_tag(main, subtype=subtype)
                except Exception:
                    dep_obj = None

            # HEAD (convert UD 1-based to 0-based); head 0 means root -> None
            gov: Optional[int] = None
            if isinstance(head_i, int) and head_i != 0:
                gov = head_i - 1

            # Character offsets at word-level (best effort from token grouping)
            # Stanza exposes start_char for tokens; words may not carry it directly
            start_char = getattr(w, "start_char", None)
            end_char = getattr(w, "end_char", None)

            word = Word(
                string=form,
                index_token=token_counter,
                index_sentence=s_idx,
                lemma=lemma,
                upos=upos_obj,
                features=feats_obj,
                dependency_relation=dep_obj,
                governor=gov,
            )
            if prov_id:
                # Attribute every annotation written above to this run.
                for field_name in (
                    "lemma",
                    "upos",
                    "features",
                    "dependency_relation",
                    "governor",
                ):
                    word.annotation_sources[field_name] = prov_id
            if isinstance(start_char, int) and isinstance(end_char, int):
                word.index_char_start = start_char
                word.index_char_stop = end_char

            words.append(word)
            token_counter += 1

    output_doc.words = words
    remap_report.log_summary(label="Unmapped UD feature pairs from stanza")
    if sent_bounds:
        output_doc.sentence_boundaries = sent_bounds
    if sent_bounds and prov_id:
        if not output_doc.sentence_annotation_sources:
            output_doc.sentence_annotation_sources = {}
        # Preserve any existing per-sentence entries; only set the "span" key.
        for idx in range(len(sent_bounds)):
            entry = output_doc.sentence_annotation_sources.get(idx, {})
            entry["span"] = prov_id
            output_doc.sentence_annotation_sources[idx] = entry
    log.info(
        "Stanza annotated %d sentences and %d tokens", len(sent_bounds), len(words)
    )
    return output_doc

Submodules