def run(self, input_doc: Doc) -> Doc:
    """Run a Stanza pipeline and populate the Doc with UD annotations.

    A new ``Doc`` is built from ``input_doc.model_dump()`` and every
    annotation is written to that new object, which is then returned.

    For each Stanza sentence, a ``(start, end)`` character span is derived
    from the min/max token offsets (when integer offsets are present), and
    each Stanza word becomes a ``Word`` carrying its form, lemma, UPOS,
    UD features, dependency relation and governor. A provenance record is
    registered on the document and, when an id is returned for it,
    referenced from every word's ``annotation_sources`` and from each
    sentence span.

    Args:
        input_doc: Document to annotate. Must have a non-empty
            ``normalized_text``. A language id is taken from
            ``self.glottolog_id`` when set, otherwise from
            ``input_doc.language.glottolog_id``. An optional
            ``metadata["stanza_package"]`` selects a specific Stanza
            package.

    Returns:
        The annotated copy, with ``words`` replaced and, when sentence
        offsets were found, ``sentence_boundaries`` set.

    Raises:
        ValueError: If ``normalized_text`` is empty, no glottolog id is
            available, or the id has no entry in the Stanza language map.
        ImportError: If building the pipeline fails with an
            ``ImportError`` (e.g. the optional Stanza dependency is
            missing).
    """
    output_doc = Doc(**input_doc.model_dump())
    log = bind_context(
        glottolog_id=getattr(self, "glottolog_id", None), model="stanza"
    )
    if not output_doc.normalized_text:
        raise ValueError("Doc must have `normalized_text` to run Stanza.")

    lang_id = (
        getattr(self, "glottolog_id", None) or output_doc.language.glottolog_id
    )
    if lang_id is None:
        raise ValueError("glottolog_id must be set to run Stanza.")
    lang = _GLOTTO_TO_STANZA_LANG.get(lang_id)
    if not lang:
        raise ValueError(
            f"No Stanza language mapping for glottolog_id='{lang_id}'."
        )

    # Optional per-treebank override (e.g., different UD package).
    # Deliberately best-effort: any failure reading metadata falls back to
    # Stanza's default package rather than aborting the run.
    stanza_package: Optional[str] = None
    try:
        stanza_package = output_doc.metadata.get("stanza_package")
    except Exception:
        stanza_package = None

    # Single source of truth so the provenance notes and the pipeline
    # configuration cannot drift apart.
    processors = "tokenize,pos,lemma,depparse"

    config_snapshot = extract_doc_config(output_doc)
    prov_record = build_provenance_record(
        language=lang_id,
        backend=output_doc.backend or "stanza",
        process=self.__class__.__name__,
        model=stanza_package or "stanza",
        provider="stanza",
        config=config_snapshot,
        notes={
            "stanza_lang": lang,
            "processors": processors,
            "stanza_package": stanza_package,
        },
    )
    prov_id = add_provenance_record(
        output_doc,
        prov_record,
        set_default=output_doc.default_provenance_id is None,
    )

    # Build Stanza pipeline; let it handle sentence splitting and tagging.
    # Optional dependency guard: an import failure while building the
    # pipeline is re-raised with the install hint (original error chained).
    try:
        nlp = _get_stanza_pipeline(
            lang=lang,
            processors=processors,
            tokenize_no_ssplit=False,
            package=stanza_package,
        )
    except ImportError as exc:  # pragma: no cover - only when stanza missing
        raise ImportError(
            "Stanza not installed. Install with: pip install 'cltk[stanza]'"
        ) from exc
    sdoc = nlp(output_doc.normalized_text)

    words: list[Word] = []
    remap_report = UDFeatureRemapReport()
    sent_bounds: list[tuple[int, int]] = []
    token_counter = 0  # document-wide running index across all sentences

    for s_idx, sent in enumerate(getattr(sdoc, "sentences", []) or []):
        # Sentence boundary from token char offsets when available:
        # smallest start and largest end among tokens with int offsets.
        s_start: Optional[int] = None
        s_end: Optional[int] = None
        for tok in getattr(sent, "tokens", []) or []:
            sc = getattr(tok, "start_char", None)
            ec = getattr(tok, "end_char", None)
            if isinstance(sc, int):
                s_start = sc if s_start is None else min(s_start, sc)
            if isinstance(ec, int):
                s_end = ec if s_end is None else max(s_end, ec)
        if s_start is not None and s_end is not None:
            sent_bounds.append((s_start, s_end))

        # Stanza words for dependency info (head/deprel) live under sent.words
        for w in getattr(sent, "words", []) or []:
            form = getattr(w, "text", None)
            lemma = getattr(w, "lemma", None)
            upos_s = getattr(w, "upos", None)
            feats_s = getattr(w, "feats", None)
            head_i = getattr(w, "head", None)
            deprel_s = getattr(w, "deprel", None)

            # UPOS: a tag that cannot be constructed is dropped (best effort).
            upos_obj: Optional[UDPartOfSpeechTag] = None
            if isinstance(upos_s, str) and upos_s:
                try:
                    upos_obj = UDPartOfSpeechTag(tag=upos_s)
                except Exception:
                    upos_obj = None

            # FEATS: "_" is treated as "no features"; conversion failures
            # are dropped rather than aborting the document.
            feats_obj: Optional[UDFeatureTagSet] = None
            if isinstance(feats_s, str) and feats_s and feats_s != "_":
                try:
                    feats_obj = convert_pos_features_to_ud(
                        feats_raw=feats_s,
                        remap_report=remap_report,
                        source_word=form if isinstance(form, str) else None,
                    )
                except Exception:
                    feats_obj = None

            # DEPREL: split "main:subtype" on the first colon; subtype is
            # None when there is no colon at all.
            dep_obj: Optional[UDDeprelTag] = None
            if isinstance(deprel_s, str) and deprel_s:
                main, sep, rest = deprel_s.partition(":")
                subtype = rest if sep else None
                try:
                    dep_obj = get_ud_deprel_tag(main, subtype=subtype)
                except Exception:
                    dep_obj = None

            # HEAD (convert UD 1-based to 0-based); head 0 means root -> None.
            # NOTE(review): the resulting governor is relative to the
            # sentence, whereas index_token below is a document-wide
            # counter -- confirm consumers expect that combination.
            gov: Optional[int] = None
            if isinstance(head_i, int):
                gov = None if head_i == 0 else head_i - 1

            # Character offsets at word level (best effort): only applied
            # below when the word object carries integer offsets itself.
            start_char = getattr(w, "start_char", None)
            end_char = getattr(w, "end_char", None)

            word = Word(
                string=form,
                index_token=token_counter,
                index_sentence=s_idx,
                lemma=lemma,
                upos=upos_obj,
                features=feats_obj,
                dependency_relation=dep_obj,
                governor=gov,
            )
            if prov_id:
                for field_name in (
                    "lemma",
                    "upos",
                    "features",
                    "dependency_relation",
                    "governor",
                ):
                    word.annotation_sources[field_name] = prov_id
            if isinstance(start_char, int) and isinstance(end_char, int):
                word.index_char_start = start_char
                word.index_char_stop = end_char
            words.append(word)
            token_counter += 1

    output_doc.words = words
    remap_report.log_summary(label="Unmapped UD feature pairs from stanza")

    if sent_bounds:
        output_doc.sentence_boundaries = sent_bounds
    if sent_bounds and prov_id:
        if not output_doc.sentence_annotation_sources:
            output_doc.sentence_annotation_sources = {}
        # Keys are positions within sent_bounds.
        for idx in range(len(sent_bounds)):
            entry = output_doc.sentence_annotation_sources.get(idx, {})
            entry["span"] = prov_id
            output_doc.sentence_annotation_sources[idx] = entry

    log.info(
        "Stanza annotated %d sentences and %d tokens", len(sent_bounds), len(words)
    )
    return output_doc