Skip to content

tei

TEI-ish XML export with standoff dependencies.

doc_to_tei_xml

doc_to_tei_xml(
    doc: Doc,
    *,
    tei_minimal: bool = True,
    include_morph: bool = True,
    include_lemma: bool = True,
    include_gloss: bool = False,
    include_translation: bool = False,
    max_sentences: Optional[int] = None
) -> str

Render a TEI-ish XML document with standoff dependency relations.

Source code in cltk/exports/tei.py
def doc_to_tei_xml(
    doc: Doc,
    *,
    tei_minimal: bool = True,
    include_morph: bool = True,
    include_lemma: bool = True,
    include_gloss: bool = False,
    include_translation: bool = False,
    max_sentences: Optional[int] = None,
) -> str:
    """Render a TEI-ish XML document with standoff dependency relations.

    The output has three parts: a ``teiHeader`` (title, publication and
    source statements, optional language and application info), a ``text``
    body holding one ``<s>`` per non-empty sentence with one ``<w>`` per
    word, and a ``standOff/listRelation`` holding one root ``<anchor>`` per
    sentence plus one ``<relation>`` per word that has a dependency label.

    Args:
        doc: Document to export. ``doc.metadata`` (optional mapping) supplies
            the title via its ``"title"`` or ``"reference"`` key.
        tei_minimal: When true, the ``encodingDesc``/``appInfo`` section is
            emitted only if an application version is available; when false
            it is always emitted.
        include_morph: Add an ``msd`` attribute to ``<w>`` when
            ``format_features`` returns a non-empty value.
        include_lemma: Add a ``lemma`` attribute to ``<w>`` when the word has
            a lemma.
        include_gloss: Add a ``gloss`` attribute to ``<w>`` when the word has
            a gloss.
        include_translation: Append a ``<note type="translation">`` to each
            ``<s>`` whose zero-based sentence index is a key of
            ``doc.sentence_translations`` and whose entry has a non-empty
            ``text`` attribute.
        max_sentences: Forwarded to ``iter_sentences``; presumably caps the
            number of sentences exported (confirm against that helper).

    Returns:
        The serialized XML, including the XML declaration, as a ``str``.
    """

    def tei_tag(local_name: str) -> str:
        # ElementTree's "{namespace}local" notation for a TEI element name.
        return f"{{{_TEI_NS}}}{local_name}"

    xml_id = f"{{{_XML_NS}}}id"

    sentences = iter_sentences(doc, max_sentences)
    tei = ET.Element(tei_tag("TEI"))

    # --- teiHeader: fileDesc (title / publication / source) ---------------
    tei_header = ET.SubElement(tei, tei_tag("teiHeader"))
    file_desc = ET.SubElement(tei_header, tei_tag("fileDesc"))
    title_stmt = ET.SubElement(file_desc, tei_tag("titleStmt"))
    title = ET.SubElement(title_stmt, tei_tag("title"))
    metadata = getattr(doc, "metadata", None) or {}
    title.text = str(
        metadata.get("title") or metadata.get("reference") or "CLTK Document"
    )
    pub_stmt = ET.SubElement(file_desc, tei_tag("publicationStmt"))
    pub_p = ET.SubElement(pub_stmt, tei_tag("p"))
    pub_p.text = "Unpublished"
    source_desc = ET.SubElement(file_desc, tei_tag("sourceDesc"))
    source_p = ET.SubElement(source_desc, tei_tag("p"))
    source_p.text = "Generated by CLTK"

    # --- teiHeader: profileDesc (only when a language is known) -----------
    lang_id, lang_name = _doc_language(doc)
    if lang_id or lang_name:
        profile_desc = ET.SubElement(tei_header, tei_tag("profileDesc"))
        lang_usage = ET.SubElement(profile_desc, tei_tag("langUsage"))
        language = ET.SubElement(lang_usage, tei_tag("language"))
        if lang_id:
            language.set("ident", lang_id)
        language.text = lang_name or ""

    # --- teiHeader: encodingDesc (application info) -----------------------
    app_version = _app_version()
    if app_version or not tei_minimal:
        encoding_desc = ET.SubElement(tei_header, tei_tag("encodingDesc"))
        app_info = ET.SubElement(encoding_desc, tei_tag("appInfo"))
        app = ET.SubElement(app_info, tei_tag("application"))
        app.set("ident", "cltk")
        if app_version:
            app.set("version", app_version)
        app.text = "CLTK"

    # --- text body: a single div/p that receives every sentence -----------
    text = ET.SubElement(tei, tei_tag("text"))
    body = ET.SubElement(text, tei_tag("body"))
    div = ET.SubElement(body, tei_tag("div"))
    _set_attributes(div, [("type", "document")])
    div.set(xml_id, _doc_id(doc))
    paragraph = ET.SubElement(div, tei_tag("p"))

    # --- standOff container for the dependency relations ------------------
    stand_off = ET.SubElement(tei, tei_tag("standOff"))
    list_relation = ET.SubElement(stand_off, tei_tag("listRelation"))
    list_relation.set("type", "dependency")

    # The translation mapping does not change per sentence; read it once.
    translation_map = {}
    if include_translation:
        translation_map = getattr(doc, "sentence_translations", None) or {}

    for sent_num, sentence in enumerate(sentences, start=1):
        words = iter_words(sentence)
        if not words:
            # Empty sentences produce no <s>, but still consume a sentence
            # number so IDs stay aligned with the document's sentence order.
            continue
        s_elem = ET.SubElement(paragraph, tei_tag("s"))
        s_elem.set(xml_id, make_sentence_id(sent_num))
        # Per-sentence root anchor: the target for words with no governor.
        root_anchor = ET.SubElement(list_relation, tei_tag("anchor"))
        root_anchor.set(xml_id, make_root_id(sent_num))

        # Inline tokens with their optional annotations.
        for tok_num, word in enumerate(words, start=1):
            w_elem = ET.SubElement(s_elem, tei_tag("w"))
            attrs: list[tuple[str, str]] = [
                (xml_id, make_token_id(sent_num, tok_num)),
                ("n", str(tok_num)),
            ]
            if include_lemma:
                lemma = get_lemma(word)
                if lemma:
                    attrs.append(("lemma", lemma))
            pos = get_upos_tag(word)
            if pos:
                attrs.append(("pos", pos))
            if include_morph:
                feats = format_features(word)
                if feats:
                    attrs.append(("msd", feats))
            if include_gloss:
                gloss = get_gloss(word)
                if gloss:
                    attrs.append(("gloss", gloss))
            _set_attributes(w_elem, attrs)
            w_elem.text = get_token_text(word)

        if include_translation:
            # Translations are keyed by zero-based sentence index.
            translation = translation_map.get(sent_num - 1)
            text_value = getattr(translation, "text", None) if translation else None
            if text_value:
                note = ET.SubElement(s_elem, tei_tag("note"))
                note.set("type", "translation")
                note.text = str(text_value)

        # Standoff relations: one per word that carries a dependency label.
        for tok_num, word in enumerate(words, start=1):
            dep = getattr(word, "dependency_relation", None)
            dep_code = getattr(dep, "code", None) or getattr(dep, "tag", None)
            if not dep_code:
                continue
            governor = getattr(word, "governor", None)
            active = make_token_id(sent_num, tok_num)
            if governor is None:
                # No governor: attach the word to the sentence root anchor.
                passive = make_root_id(sent_num)
            else:
                try:
                    head_index = int(governor)
                except (TypeError, ValueError):
                    continue
                # ``governor`` is used as a zero-based index into ``words``;
                # out-of-range heads are dropped rather than emitted as
                # dangling references.
                if head_index < 0 or head_index >= len(words):
                    continue
                passive = make_token_id(sent_num, head_index + 1)
            rel = ET.SubElement(list_relation, tei_tag("relation"))
            rel_id = f"r-{active}-{passive}"
            _set_attributes(
                rel,
                [
                    (xml_id, rel_id),
                    ("name", str(dep_code)),
                    ("active", f"#{active}"),
                    ("passive", f"#{passive}"),
                ],
            )

    _indent(tei)
    xml_bytes: bytes = ET.tostring(tei, encoding="utf-8", xml_declaration=True)
    return xml_bytes.decode("utf-8")