def doc_to_tei_xml(
    doc: Doc,
    *,
    tei_minimal: bool = True,
    include_morph: bool = True,
    include_lemma: bool = True,
    include_gloss: bool = False,
    include_translation: bool = False,
    max_sentences: Optional[int] = None,
) -> str:
    """Render a TEI-ish XML document with standoff dependency relations.

    The output has three top-level children under ``<TEI>``, in this order:
    a ``<teiHeader>``, a ``<text>`` holding one ``<s>`` per non-empty sentence
    (each containing ``<w>`` tokens), and a ``<standOff>`` whose
    ``<listRelation type="dependency">`` holds one ``<anchor>`` per sentence
    (the synthetic root) plus one ``<relation>`` per token that has a
    dependency label.

    Args:
        doc: Document to export. ``doc.metadata`` (if present) supplies the
            title via its ``"title"`` or ``"reference"`` key.
        tei_minimal: When true, the ``<encodingDesc>/<appInfo>`` block is
            emitted only if ``_app_version()`` returns a truthy value; when
            false it is always emitted.
        include_morph: Add an ``msd`` attribute to ``<w>`` when
            ``format_features`` returns a non-empty value.
        include_lemma: Add a ``lemma`` attribute to ``<w>`` when
            ``get_lemma`` returns a non-empty value.
        include_gloss: Add a ``gloss`` attribute to ``<w>`` when
            ``get_gloss`` returns a non-empty value.
        include_translation: Append a ``<note type="translation">`` to each
            ``<s>`` for which ``doc.sentence_translations`` has an entry
            (keyed by zero-based sentence index) with a non-empty ``text``.
        max_sentences: Forwarded to ``iter_sentences``; presumably caps the
            number of sentences exported (confirm against that helper).

    Returns:
        The serialized document as a ``str``, including the XML declaration.
    """

    def tei_tag(local_name: str) -> str:
        """Return ``local_name`` qualified with the TEI namespace."""
        return f"{{{_TEI_NS}}}{local_name}"

    xml_id = f"{{{_XML_NS}}}id"

    sentences = iter_sentences(doc, max_sentences)
    tei = ET.Element(tei_tag("TEI"))

    # --- teiHeader: fileDesc (title, publication, source) -----------------
    tei_header = ET.SubElement(tei, tei_tag("teiHeader"))
    file_desc = ET.SubElement(tei_header, tei_tag("fileDesc"))
    title_stmt = ET.SubElement(file_desc, tei_tag("titleStmt"))
    title = ET.SubElement(title_stmt, tei_tag("title"))
    metadata = getattr(doc, "metadata", None) or {}
    title.text = str(
        metadata.get("title") or metadata.get("reference") or "CLTK Document"
    )
    pub_stmt = ET.SubElement(file_desc, tei_tag("publicationStmt"))
    ET.SubElement(pub_stmt, tei_tag("p")).text = "Unpublished"
    source_desc = ET.SubElement(file_desc, tei_tag("sourceDesc"))
    ET.SubElement(source_desc, tei_tag("p")).text = "Generated by CLTK"

    # --- teiHeader: profileDesc (language), only when something is known --
    lang_id, lang_name = _doc_language(doc)
    if lang_id or lang_name:
        profile_desc = ET.SubElement(tei_header, tei_tag("profileDesc"))
        lang_usage = ET.SubElement(profile_desc, tei_tag("langUsage"))
        language = ET.SubElement(lang_usage, tei_tag("language"))
        if lang_id:
            language.set("ident", lang_id)
        language.text = lang_name or ""

    # --- teiHeader: encodingDesc (generating application) -----------------
    app_version = _app_version()
    if app_version or not tei_minimal:
        encoding_desc = ET.SubElement(tei_header, tei_tag("encodingDesc"))
        app_info = ET.SubElement(encoding_desc, tei_tag("appInfo"))
        app = ET.SubElement(app_info, tei_tag("application"))
        app.set("ident", "cltk")
        if app_version:
            app.set("version", app_version)
        app.text = "CLTK"

    # --- text/body/div/p: container for the sentences ---------------------
    text_elem = ET.SubElement(tei, tei_tag("text"))
    body = ET.SubElement(text_elem, tei_tag("body"))
    div = ET.SubElement(body, tei_tag("div"))
    _set_attributes(div, [("type", "document")])
    div.set(xml_id, _doc_id(doc))
    paragraph = ET.SubElement(div, tei_tag("p"))

    # --- standOff: container for dependency relations ---------------------
    stand_off = ET.SubElement(tei, tei_tag("standOff"))
    list_relation = ET.SubElement(stand_off, tei_tag("listRelation"))
    list_relation.set("type", "dependency")

    # The translation lookup table does not change per sentence, so fetch it
    # once (and not at all when translations are not requested).
    translation_map = (
        (getattr(doc, "sentence_translations", None) or {})
        if include_translation
        else {}
    )

    # Sentence numbers are 1-based and are consumed even by empty sentences,
    # so IDs stay aligned with the sentence's position in the document.
    for sent_num, sentence in enumerate(sentences, start=1):
        words = iter_words(sentence)
        if not words:
            continue

        s_elem = ET.SubElement(paragraph, tei_tag("s"))
        s_elem.set(xml_id, make_sentence_id(sent_num))

        # Synthetic root target so head-less tokens have something to point at.
        root_anchor = ET.SubElement(list_relation, tei_tag("anchor"))
        root_anchor.set(xml_id, make_root_id(sent_num))

        # Pass 1: emit the inline <w> tokens.
        for tok_num, word in enumerate(words, start=1):
            w_elem = ET.SubElement(s_elem, tei_tag("w"))
            attrs: list[tuple[str, str]] = [
                (xml_id, make_token_id(sent_num, tok_num)),
                ("n", str(tok_num)),
            ]
            if include_lemma:
                lemma = get_lemma(word)
                if lemma:
                    attrs.append(("lemma", lemma))
            pos = get_upos_tag(word)
            if pos:
                attrs.append(("pos", pos))
            if include_morph:
                feats = format_features(word)
                if feats:
                    attrs.append(("msd", feats))
            if include_gloss:
                gloss = get_gloss(word)
                if gloss:
                    attrs.append(("gloss", gloss))
            _set_attributes(w_elem, attrs)
            w_elem.text = get_token_text(word)

        # Optional sentence-level translation, placed after the tokens.
        if include_translation:
            # The map is keyed by zero-based sentence index.
            translation = translation_map.get(sent_num - 1)
            text_value = getattr(translation, "text", None) if translation else None
            if text_value:
                note = ET.SubElement(s_elem, tei_tag("note"))
                note.set("type", "translation")
                note.text = str(text_value)

        # Pass 2: emit one standoff <relation> per token with a dependency label.
        for tok_num, word in enumerate(words, start=1):
            dep = getattr(word, "dependency_relation", None)
            dep_code = getattr(dep, "code", None) or getattr(dep, "tag", None)
            if not dep_code:
                continue

            governor = getattr(word, "governor", None)
            active = make_token_id(sent_num, tok_num)
            if governor is None:
                # No head recorded: attach to the sentence's root anchor.
                passive = make_root_id(sent_num)
            else:
                try:
                    head_index = int(governor)
                except (TypeError, ValueError):
                    continue
                # The governor is used as a zero-based index into ``words``;
                # out-of-range heads are dropped rather than emitted as
                # dangling references.
                if not 0 <= head_index < len(words):
                    continue
                passive = make_token_id(sent_num, head_index + 1)

            rel = ET.SubElement(list_relation, tei_tag("relation"))
            _set_attributes(
                rel,
                [
                    (xml_id, f"r-{active}-{passive}"),
                    ("name", str(dep_code)),
                    ("active", f"#{active}"),
                    ("passive", f"#{passive}"),
                ],
            )

    _indent(tei)
    xml_bytes: bytes = ET.tostring(tei, encoding="utf-8", xml_declaration=True)
    return xml_bytes.decode("utf-8")