Skip to content

cltk.exports

Scholar-facing export helpers.

doc_to_igt_html

doc_to_igt_html(
    doc: Doc,
    *,
    include_translation: bool = True,
    include_gloss: bool = True,
    include_lemma: bool = False,
    include_morph: bool = False,
    max_sentences: Optional[int] = None
) -> str

Render a token-level IGT table in a self-contained HTML document.

Source code in cltk/exports/igt.py
def doc_to_igt_html(
    doc: Doc,
    *,
    include_translation: bool = True,
    include_gloss: bool = True,
    include_lemma: bool = False,
    include_morph: bool = False,
    max_sentences: Optional[int] = None,
) -> str:
    """Build a standalone HTML page of token-level interlinear tables.

    Each sentence that has words becomes one ``<section>`` holding the
    sentence text, a two-row table (tokens above, one annotation line below)
    and, optionally, its translation. Sentences without words are skipped but
    still count towards the sentence numbering used in ids and translation
    lookups.

    Args:
        doc: Document to export.
        include_translation: Append the value returned by
            ``_sentence_translation`` for the sentence when it is truthy.
        include_gloss: Forwarded to ``_select_second_line``.
        include_lemma: Forwarded to ``_select_second_line``.
        include_morph: Forwarded to ``_select_second_line``.
        max_sentences: Forwarded to ``iter_sentences``; presumably caps the
            number of sentences rendered.

    Returns:
        The complete HTML document as a string ending in one newline.

    """
    sentence_seq = iter_sentences(doc, max_sentences)
    second_line = _select_second_line(
        include_gloss=include_gloss,
        include_lemma=include_lemma,
        include_morph=include_morph,
    )

    # The annotation row uses the same accessor for every sentence, so pick
    # it once; any unrecognised mode falls back to repeating the token text.
    if second_line == "morph":
        annotate = format_morph
    elif second_line == "lemma":
        annotate = get_lemma
    elif second_line == "gloss":
        annotate = get_gloss
    else:
        annotate = get_token_text

    def cells(values) -> str:
        """Wrap each value in an escaped ``<td>`` and concatenate them."""
        return "".join(f"<td>{html_escape(value)}</td>" for value in values)

    sections: list[str] = []
    for number, sentence in enumerate(sentence_seq, start=1):
        tokens = iter_words(sentence)
        if not tokens:
            continue

        surface_row = cells(get_token_text(tok) for tok in tokens)
        annotation_row = cells(annotate(tok) for tok in tokens)

        # Translations are addressed by zero-based sentence position.
        translation = (
            _sentence_translation(doc, number - 1) if include_translation else None
        )
        translation_block = ""
        if translation:
            translation_block = f'<div class="igt-translation">{html_escape(str(translation))}</div>'

        surface = html_escape(sentence_text(doc, sentence, tokens))
        sections.append(
            f'<section class="igt-sentence" id="igt-s{number}">\n'
            f'<div class="igt-surface">{surface}</div>\n'
            '<table class="igt-table">\n'
            f"<tr>{surface_row}</tr>\n"
            f"<tr>{annotation_row}</tr>\n"
            "</table>\n"
            f"{translation_block}\n"
            "</section>"
        )

    head = (
        "<!doctype html>",
        '<html lang="en">',
        "<head>",
        '<meta charset="utf-8" />',
        "<title>CLTK Token-level IGT</title>",
        "<style>",
        ".igt-sentence{margin:1.5rem 0;}",
        ".igt-surface{font-weight:600;margin-bottom:0.25rem;}",
        ".igt-table{border-collapse:collapse;width:100%;}",
        ".igt-table td{border:1px solid #ccc;padding:0.35rem;text-align:center;}",
        ".igt-translation{margin-top:0.4rem;font-style:italic;}",
        "</style>",
        "</head>",
        "<body>",
        "<h1>Token-level IGT</h1>",
    )
    tail = ("</body>", "</html>")
    page = "\n".join([*head, "\n".join(sections), *tail])
    return page.strip() + "\n"

doc_to_igt_latex

doc_to_igt_latex(
    doc: Doc,
    *,
    include_translation: bool = True,
    include_gloss: bool = True,
    include_lemma: bool = False,
    include_morph: bool = False,
    max_sentences: Optional[int] = None
) -> str

Render a token-level IGT table in LaTeX.

Gloss selection priority: (1) word.enrichment.gloss; (2) the first word.enrichment.lemma_translations entry; (3) word.lemma; (4) the token string.

Source code in cltk/exports/igt.py
def doc_to_igt_latex(
    doc: Doc,
    *,
    include_translation: bool = True,
    include_gloss: bool = True,
    include_lemma: bool = False,
    include_morph: bool = False,
    max_sentences: Optional[int] = None,
) -> str:
    r"""Render a token-level IGT table in LaTeX.

    Every sentence that has words becomes one ``tabular`` environment with a
    left-aligned column per token: a token row, one annotation row and,
    optionally, a translation row spanning all columns. Each row is terminated
    with the LaTeX row separator ``\\``. Sentences without words are skipped
    but still count towards the index used for translation lookup.

    Gloss selection priority:
    1) ``word.enrichment.gloss``
    2) first ``word.enrichment.lemma_translations`` entry
    3) ``word.lemma``
    4) token string

    Args:
        doc: Document to export.
        include_translation: Add a ``\multicolumn`` row with the value returned
            by ``_sentence_translation`` when it is truthy.
        include_gloss: Forwarded to ``_select_second_line``.
        include_lemma: Forwarded to ``_select_second_line``.
        include_morph: Forwarded to ``_select_second_line``.
        max_sentences: Forwarded to ``iter_sentences``; presumably caps the
            number of sentences rendered.

    Returns:
        The LaTeX source as a string ending in one newline.

    """
    sentences = iter_sentences(doc, max_sentences)
    mode = _select_second_line(
        include_gloss=include_gloss,
        include_lemma=include_lemma,
        include_morph=include_morph,
    )

    # The annotation row uses the same accessor for every sentence, so pick
    # it once; any unrecognised mode falls back to repeating the token text.
    if mode == "morph":
        second_line_value = format_morph
    elif mode == "lemma":
        second_line_value = get_lemma
    elif mode == "gloss":
        second_line_value = get_gloss
    else:
        second_line_value = get_token_text

    lines: list[str] = ["% Token-level IGT generated by CLTK", ""]
    for sent_idx, sentence in enumerate(sentences, start=1):
        words = iter_words(sentence)
        if not words:
            continue
        columns = " ".join(["l"] * len(words))
        lines.append(f"\\begin{{tabular}}{{{columns}}}")

        # A tabular row must end with a double backslash; "\\\\" in the
        # (non-raw) f-string literal produces exactly those two characters.
        token_row = " & ".join(latex_escape(get_token_text(word)) for word in words)
        lines.append(f"{token_row} \\\\")
        gloss_row = " & ".join(
            latex_escape(second_line_value(word)) for word in words
        )
        lines.append(f"{gloss_row} \\\\")

        if include_translation:
            # Translations are addressed by zero-based sentence position.
            translation = _sentence_translation(doc, sent_idx - 1)
            if translation:
                escaped = latex_escape(str(translation))
                lines.append(
                    f"\\multicolumn{{{len(words)}}}{{l}}{{{escaped}}} \\\\"
                )
        lines.append("\\end{tabular}")
        lines.append("")
    return "\n".join(lines).strip() + "\n"

doc_to_readers_guide_html

doc_to_readers_guide_html(
    doc: Doc,
    *,
    title: Optional[str] = None,
    include_ipa: bool = True,
    include_gloss: bool = True,
    include_morph: bool = True,
    include_dependencies: bool = True,
    include_translation: bool = True,
    max_sentences: Optional[int] = None
) -> str

Render a self-contained HTML reader's guide for a Doc.

Source code in cltk/exports/readers_guide_html.py
def doc_to_readers_guide_html(
    doc: Doc,
    *,
    title: Optional[str] = None,
    include_ipa: bool = True,
    include_gloss: bool = True,
    include_morph: bool = True,
    include_dependencies: bool = True,
    include_translation: bool = True,
    max_sentences: Optional[int] = None,
) -> str:
    """Render a self-contained HTML reader's guide for a ``Doc``.

    For every sentence that has words the page contains the sentence text, an
    optional translation, a strip of tokens with hover tooltips, and one
    collapsible card per token listing the annotations that are available.
    A table of contents is added when more than one sentence is rendered.
    Sentences without words are skipped but still count towards the sentence
    numbering.

    Args:
        doc: Document to export.
        title: Page title. Falls back to the ``"title"`` then ``"reference"``
            entry of ``doc.metadata``, then to ``"Reader's Guide"``.
        include_ipa: Show the value and mode returned by ``get_ipa`` in the
            tooltip and card.
        include_gloss: Show the value returned by ``get_gloss`` in the tooltip
            and card.
        include_morph: Show the value returned by ``format_morph`` in the card.
        include_dependencies: Show the dependency relation and governor in the
            card.
        include_translation: Show the value returned by
            ``_sentence_translation`` when it is truthy.
        max_sentences: Forwarded to ``iter_sentences``; presumably caps the
            number of sentences rendered.

    Returns:
        The complete HTML document as a string ending in one newline.

    """
    sentences = iter_sentences(doc, max_sentences)
    # Read the metadata once; a missing or falsy attribute counts as empty.
    metadata = getattr(doc, "metadata", None) or {}
    doc_title = (
        title
        or metadata.get("title")
        or metadata.get("reference")
        or "Reader's Guide"
    )

    toc_items: list[str] = []
    sentence_blocks: list[str] = []
    for sent_idx, sentence in enumerate(sentences, start=1):
        words = iter_words(sentence)
        if not words:
            continue
        anchor = f"sentence-{sent_idx}"
        toc_items.append(f'<li><a href="#{anchor}">Sentence {sent_idx}</a></li>')
        sentence_value = html_escape(sentence_text(doc, sentence, words))
        translation_html = ""
        if include_translation:
            # Translations are addressed by zero-based sentence position.
            translation = _sentence_translation(doc, sent_idx - 1)
            if translation:
                translation_html = f'<div class="sentence-translation">{html_escape(str(translation))}</div>'

        token_strip_parts: list[str] = []
        card_blocks: list[str] = []
        for word in words:
            # Values shared by the tooltip and the card are fetched once.
            token = html_escape(get_token_text(word))
            gloss = get_gloss(word) if include_gloss else ""
            ipa_value, ipa_mode = get_ipa(word) if include_ipa else (None, None)

            # --- hover tooltip on the token strip ---
            tooltip_parts: list[str] = []
            if gloss:
                tooltip_parts.append(f"Gloss: {gloss}")
            if ipa_value:
                ipa_text = f"IPA: {ipa_value}"
                if ipa_mode:
                    ipa_text += f" ({ipa_mode})"
                tooltip_parts.append(ipa_text)
            tooltip_text = html_escape(" • ".join(tooltip_parts))
            data_attr = f' data-tooltip="{tooltip_text}"' if tooltip_text else ""
            token_strip_parts.append(f'<span class="token"{data_attr}>{token}</span>')

            # --- collapsible detail card ---
            lemma = get_lemma(word)
            pos = get_upos_tag(word)
            feats = format_features(word)
            morph = format_morph(word) if include_morph else ""
            dep = getattr(word, "dependency_relation", None)
            dep_code = getattr(dep, "code", None) or getattr(dep, "tag", None)
            dep_name = getattr(dep, "name", None)
            governor = getattr(word, "governor", None)

            rows: list[str] = []
            if lemma:
                rows.append(
                    "".join(
                        [
                            "<dt>Lemma</dt>",
                            f'<dd>{html_escape(str(lemma))} <button class="copy" data-copy="{html_escape(str(lemma))}">Copy</button></dd>',
                        ]
                    )
                )
            if pos:
                rows.append(f"<dt>POS</dt><dd>{html_escape(str(pos))}</dd>")
            # morph, gloss and ipa_value are already empty when their
            # include_* flag is off, so testing the value alone is enough.
            if morph:
                rows.append(f"<dt>Morphology</dt><dd>{html_escape(morph)}</dd>")
            if gloss:
                rows.append(
                    "".join(
                        [
                            "<dt>Gloss</dt>",
                            f'<dd>{html_escape(gloss)} <button class="copy" data-copy="{html_escape(gloss)}">Copy</button></dd>',
                        ]
                    )
                )
            if ipa_value:
                ipa_label = html_escape(ipa_value)
                if ipa_mode:
                    ipa_label += (
                        f' <span class="muted">({html_escape(ipa_mode)})</span>'
                    )
                rows.append(f"<dt>IPA</dt><dd>{ipa_label}</dd>")
            if feats:
                rows.append(f"<dt>Features</dt><dd>{html_escape(feats)}</dd>")
            if include_dependencies and (dep_code or governor is not None):
                # Coerce to str so a non-string code cannot break the join.
                dep_label = str(dep_code) if dep_code else ""
                if dep_name and dep_name != dep_code:
                    dep_label = (
                        f"{dep_name} ({dep_code})" if dep_code else str(dep_name)
                    )
                governor_label = ""
                if governor is not None:
                    try:
                        head_index = int(governor)
                    except (TypeError, ValueError):
                        head_index = None
                    # The governor is used as a zero-based index into this
                    # sentence's words and displayed one-based; an index
                    # outside the sentence is shown without the head's text.
                    if head_index is not None and 0 <= head_index < len(words):
                        head_word = get_token_text(words[head_index])
                        governor_label = f"token {head_index + 1} ({head_word})"
                    elif head_index is not None:
                        governor_label = f"token {head_index + 1}"
                dep_details = " · ".join(
                    part for part in [dep_label, governor_label] if part
                )
                if dep_details:
                    rows.append(
                        f"<dt>Dependencies</dt><dd>{html_escape(dep_details)}</dd>"
                    )

            # <dl> may only contain dt/dd groups, so the "nothing to show"
            # paragraph is emitted on its own rather than inside the list.
            if rows:
                details_html = "<dl>" + "".join(rows) + "</dl>"
            else:
                details_html = "<p>No details available.</p>"
            card_blocks.append(
                "\n".join(
                    [
                        '<details class="token-card">',
                        f"<summary>{token}</summary>",
                        details_html,
                        "</details>",
                    ]
                )
            )
        token_strip = " ".join(token_strip_parts)

        sentence_blocks.append(
            "\n".join(
                [
                    f'<section id="{anchor}" class="sentence">',
                    f"<h2>Sentence {sent_idx}</h2>",
                    f'<div class="sentence-text">{sentence_value}</div>',
                    translation_html,
                    f'<div class="token-strip">{token_strip}</div>',
                    '<div class="token-cards">',
                    "\n".join(card_blocks),
                    "</div>",
                    "</section>",
                ]
            )
        )

    # A table of contents is only useful with at least two sentences.
    toc_html = ""
    if len(toc_items) > 1:
        toc_html = (
            '<nav class="toc"><h2>Contents</h2><ol>'
            + "".join(toc_items)
            + "</ol></nav>"
        )

    body = "\n".join(sentence_blocks)
    return (
        "\n".join(
            [
                "<!doctype html>",
                '<html lang="en">',
                "<head>",
                '<meta charset="utf-8" />',
                f"<title>{html_escape(str(doc_title))}</title>",
                "<style>",
                "body{font-family:Georgia,serif;line-height:1.6;margin:2rem;color:#1f1f1f;}",
                "h1,h2{font-family:'Palatino Linotype',serif;}",
                ".toc{background:#f5f5f5;padding:1rem;border-radius:8px;margin-bottom:1.5rem;}",
                ".sentence{margin:2rem 0;padding-bottom:1.5rem;border-bottom:1px solid #ddd;}",
                ".sentence-text{font-weight:600;margin-bottom:0.25rem;}",
                ".sentence-translation{font-style:italic;margin-bottom:0.75rem;color:#444;}",
                ".token-strip{display:flex;flex-wrap:wrap;gap:0.5rem;margin:0.75rem 0;}",
                ".token{position:relative;padding:0.2rem 0.4rem;border-radius:4px;background:#eef2ff;}",
                ".token[data-tooltip]:hover::after{content:attr(data-tooltip);position:absolute;left:0;top:100%;background:#111;color:#fff;padding:0.4rem;border-radius:4px;white-space:nowrap;font-size:0.8rem;z-index:5;}",
                ".token-cards{display:grid;grid-template-columns:repeat(auto-fit,minmax(220px,1fr));gap:0.75rem;}",
                ".token-card{background:#fafafa;border:1px solid #e2e2e2;border-radius:8px;padding:0.5rem;}",
                ".token-card summary{cursor:pointer;font-weight:600;margin-bottom:0.4rem;}",
                ".token-card dl{margin:0;}",
                ".token-card dt{font-weight:600;margin-top:0.3rem;}",
                ".token-card dd{margin:0 0 0.3rem 0.5rem;}",
                ".copy{margin-left:0.4rem;font-size:0.75rem;}",
                ".muted{color:#666;font-size:0.85rem;}",
                "</style>",
                "</head>",
                "<body>",
                f"<h1>{html_escape(str(doc_title))}</h1>",
                toc_html,
                body,
                "<script>",
                "document.addEventListener('click', (event) => {",
                "  const target = event.target;",
                "  if (target && target.classList.contains('copy')) {",
                "    const text = target.getAttribute('data-copy') || '';",
                "    if (navigator.clipboard) {",
                "      navigator.clipboard.writeText(text);",
                "    } else {",
                "      const temp = document.createElement('textarea');",
                "      temp.value = text;",
                "      document.body.appendChild(temp);",
                "      temp.select();",
                "      document.execCommand('copy');",
                "      document.body.removeChild(temp);",
                "    }",
                "  }",
                "});",
                "</script>",
                "</body>",
                "</html>",
            ]
        ).strip()
        + "\n"
    )

doc_to_tei_xml

doc_to_tei_xml(
    doc: Doc,
    *,
    tei_minimal: bool = True,
    include_morph: bool = True,
    include_lemma: bool = True,
    include_gloss: bool = False,
    include_translation: bool = False,
    max_sentences: Optional[int] = None
) -> str

Render a TEI-ish XML document with standoff dependency relations.

Source code in cltk/exports/tei.py
def doc_to_tei_xml(
    doc: Doc,
    *,
    tei_minimal: bool = True,
    include_morph: bool = True,
    include_lemma: bool = True,
    include_gloss: bool = False,
    include_translation: bool = False,
    max_sentences: Optional[int] = None,
) -> str:
    """Render a TEI-ish XML document with standoff dependency relations.

    The document has a ``teiHeader`` (title, publication and source
    statements, plus language and application info when available), a
    ``text`` whose single paragraph holds one ``s`` element per sentence with
    one ``w`` element per token, and a ``standOff`` section whose
    ``listRelation`` holds a root anchor per sentence and one ``relation`` per
    token that has a dependency code. Sentences without words are skipped but
    still count towards the sentence numbering.

    Args:
        doc: Document to export.
        tei_minimal: When true, the ``encodingDesc`` application info is
            written only if ``_app_version()`` returns a value; when false it
            is always written.
        include_morph: Add the ``format_features`` value as an ``msd``
            attribute on each ``w``.
        include_lemma: Add the ``get_lemma`` value as a ``lemma`` attribute.
        include_gloss: Add the ``get_gloss`` value as a ``gloss`` attribute.
        include_translation: Add a ``note`` of type ``translation`` to each
            sentence, taken from the ``text`` attribute of the matching entry
            in ``doc.sentence_translations``.
        max_sentences: Forwarded to ``iter_sentences``; presumably caps the
            number of sentences rendered.

    Returns:
        The serialized XML, including the XML declaration, as a string.

    """

    def tei_name(local: str) -> str:
        """Qualify ``local`` with ``_TEI_NS`` in ElementTree's ``{uri}name`` form."""
        return f"{{{_TEI_NS}}}{local}"

    xml_id = f"{{{_XML_NS}}}id"

    sentences = iter_sentences(doc, max_sentences)
    tei = ET.Element(tei_name("TEI"))

    # --- header: file description ---
    tei_header = ET.SubElement(tei, tei_name("teiHeader"))
    file_desc = ET.SubElement(tei_header, tei_name("fileDesc"))
    title_stmt = ET.SubElement(file_desc, tei_name("titleStmt"))
    title = ET.SubElement(title_stmt, tei_name("title"))
    metadata = getattr(doc, "metadata", None) or {}
    title.text = str(
        metadata.get("title") or metadata.get("reference") or "CLTK Document"
    )
    pub_stmt = ET.SubElement(file_desc, tei_name("publicationStmt"))
    pub_p = ET.SubElement(pub_stmt, tei_name("p"))
    pub_p.text = "Unpublished"
    source_desc = ET.SubElement(file_desc, tei_name("sourceDesc"))
    source_p = ET.SubElement(source_desc, tei_name("p"))
    source_p.text = "Generated by CLTK"

    # --- header: language, only when something is known about it ---
    lang_id, lang_name = _doc_language(doc)
    if lang_id or lang_name:
        profile_desc = ET.SubElement(tei_header, tei_name("profileDesc"))
        lang_usage = ET.SubElement(profile_desc, tei_name("langUsage"))
        language = ET.SubElement(lang_usage, tei_name("language"))
        if lang_id:
            language.set("ident", lang_id)
        language.text = lang_name or ""

    # --- header: generating application ---
    app_version = _app_version()
    if app_version or not tei_minimal:
        encoding_desc = ET.SubElement(tei_header, tei_name("encodingDesc"))
        app_info = ET.SubElement(encoding_desc, tei_name("appInfo"))
        app = ET.SubElement(app_info, tei_name("application"))
        app.set("ident", "cltk")
        if app_version:
            app.set("version", app_version)
        app.text = "CLTK"

    # --- text body: a single div/p that receives every sentence ---
    text = ET.SubElement(tei, tei_name("text"))
    body = ET.SubElement(text, tei_name("body"))
    div = ET.SubElement(body, tei_name("div"))
    _set_attributes(div, [("type", "document")])
    div.set(xml_id, _doc_id(doc))
    paragraph = ET.SubElement(div, tei_name("p"))

    # --- standoff container for dependency relations (always present) ---
    stand_off = ET.SubElement(tei, tei_name("standOff"))
    list_relation = ET.SubElement(stand_off, tei_name("listRelation"))
    list_relation.set("type", "dependency")

    # The translation mapping does not change per sentence, so read it once.
    translation_map = (
        (getattr(doc, "sentence_translations", None) or {})
        if include_translation
        else {}
    )

    for sent_num, sentence in enumerate(sentences, start=1):
        words = iter_words(sentence)
        if not words:
            continue
        s_elem = ET.SubElement(paragraph, tei_name("s"))
        s_elem.set(xml_id, make_sentence_id(sent_num))
        # Relations of tokens without a governor point at this anchor.
        root_anchor = ET.SubElement(list_relation, tei_name("anchor"))
        root_anchor.set(xml_id, make_root_id(sent_num))

        for tok_num, word in enumerate(words, start=1):
            w_elem = ET.SubElement(s_elem, tei_name("w"))
            attrs: list[tuple[str, str]] = [
                (xml_id, make_token_id(sent_num, tok_num)),
                ("n", str(tok_num)),
            ]
            if include_lemma:
                lemma = get_lemma(word)
                if lemma:
                    attrs.append(("lemma", lemma))
            pos = get_upos_tag(word)
            if pos:
                attrs.append(("pos", pos))
            if include_morph:
                feats = format_features(word)
                if feats:
                    attrs.append(("msd", feats))
            if include_gloss:
                gloss = get_gloss(word)
                if gloss:
                    attrs.append(("gloss", gloss))
            _set_attributes(w_elem, attrs)
            w_elem.text = get_token_text(word)

        if include_translation:
            # Translations are keyed by zero-based sentence position.
            translation = translation_map.get(sent_num - 1)
            text_value = getattr(translation, "text", None) if translation else None
            if text_value:
                note = ET.SubElement(s_elem, tei_name("note"))
                note.set("type", "translation")
                note.text = str(text_value)

        for tok_num, word in enumerate(words, start=1):
            dep = getattr(word, "dependency_relation", None)
            dep_code = getattr(dep, "code", None) or getattr(dep, "tag", None)
            if not dep_code:
                continue
            governor = getattr(word, "governor", None)
            active = make_token_id(sent_num, tok_num)
            if governor is None:
                passive = make_root_id(sent_num)
            else:
                # The governor is used as a zero-based index into this
                # sentence's words; relations whose head is not an integer or
                # lies outside the sentence are dropped.
                try:
                    head_index = int(governor)
                except (TypeError, ValueError):
                    continue
                if head_index < 0 or head_index >= len(words):
                    continue
                passive = make_token_id(sent_num, head_index + 1)
            rel = ET.SubElement(list_relation, tei_name("relation"))
            rel_id = f"r-{active}-{passive}"
            _set_attributes(
                rel,
                [
                    (xml_id, rel_id),
                    ("name", str(dep_code)),
                    ("active", f"#{active}"),
                    ("passive", f"#{passive}"),
                ],
            )

    _indent(tei)
    xml_bytes: bytes = ET.tostring(tei, encoding="utf-8", xml_declaration=True)
    return xml_bytes.decode("utf-8")

Submodules