igt

Interlinear Glossed Text (IGT) exporters.

doc_to_igt_latex

doc_to_igt_latex(
    doc: Doc,
    *,
    include_translation: bool = True,
    include_gloss: bool = True,
    include_lemma: bool = False,
    include_morph: bool = False,
    max_sentences: Optional[int] = None
) -> str

Render a token-level IGT table in LaTeX.

Gloss selection priority: (1) `word.enrichment.gloss`; (2) the first entry of `word.enrichment.lemma_translations`; (3) `word.lemma`; (4) the token string.

Source code in cltk/exports/igt.py

def doc_to_igt_latex(
    doc: Doc,
    *,
    include_translation: bool = True,
    include_gloss: bool = True,
    include_lemma: bool = False,
    include_morph: bool = False,
    max_sentences: Optional[int] = None,
) -> str:
    """Render a token-level IGT table in LaTeX.

    Every sentence that has at least one word becomes its own ``tabular``
    environment with one left-aligned column per word: a row of token
    strings, a second annotation row, and optionally a translation row
    spanning all columns.

    The content of the second row is decided by ``_select_second_line``
    from the three ``include_*`` flags; the modes handled here are
    ``"morph"``, ``"lemma"`` and ``"gloss"``, and any other value repeats
    the token strings.

    Gloss selection priority (as documented for the gloss line; the lookup
    itself lives in ``get_gloss``):
    1) ``word.enrichment.gloss``
    2) first ``word.enrichment.lemma_translations`` entry
    3) ``word.lemma``
    4) token string

    Args:
        doc: Document to export.
        include_translation: When true, append the sentence translation
            (if one is available) as a final row spanning all columns.
        include_gloss: Forwarded to ``_select_second_line``.
        include_lemma: Forwarded to ``_select_second_line``.
        include_morph: Forwarded to ``_select_second_line``.
        max_sentences: Forwarded to ``iter_sentences``; presumably caps the
            number of sentences exported (``None`` for no cap) -- confirm
            against that helper.

    Returns:
        The LaTeX source, stripped of surrounding whitespace and terminated
        by exactly one newline.

    """
    sentences = iter_sentences(doc, max_sentences)
    mode = _select_second_line(
        include_gloss=include_gloss,
        include_lemma=include_lemma,
        include_morph=include_morph,
    )
    # The annotation source does not change between sentences, so resolve it
    # once; unknown modes fall back to the token text.
    second_line_sources = {
        "morph": format_morph,
        "lemma": get_lemma,
        "gloss": get_gloss,
    }
    second_line_text = second_line_sources.get(mode, get_token_text)

    # A tabular row must end with a double backslash. Emitting only one
    # backslash before the newline produces a control space instead, which
    # merges all rows into a single over-long row.
    row_end = " \\\\"

    lines: list[str] = ["% Token-level IGT generated by CLTK", ""]
    for sent_idx, sentence in enumerate(sentences, start=1):
        words = iter_words(sentence)
        if not words:
            continue
        columns = " ".join(["l"] * len(words))
        lines.append(f"\\begin{{tabular}}{{{columns}}}")
        token_row = " & ".join(latex_escape(get_token_text(word)) for word in words)
        lines.append(token_row + row_end)
        gloss_row = " & ".join(
            latex_escape(second_line_text(word)) for word in words
        )
        lines.append(gloss_row + row_end)
        if include_translation:
            # Translations are looked up by zero-based sentence position.
            translation = _sentence_translation(doc, sent_idx - 1)
            if translation:
                escaped = latex_escape(str(translation))
                lines.append(
                    f"\\multicolumn{{{len(words)}}}{{l}}{{{escaped}}}" + row_end
                )
        lines.append("\\end{tabular}")
        lines.append("")
    return "\n".join(lines).strip() + "\n"

doc_to_igt_html

doc_to_igt_html(
    doc: Doc,
    *,
    include_translation: bool = True,
    include_gloss: bool = True,
    include_lemma: bool = False,
    include_morph: bool = False,
    max_sentences: Optional[int] = None
) -> str

Render a token-level IGT table in a self-contained HTML document.

Source code in cltk/exports/igt.py

def doc_to_igt_html(
    doc: Doc,
    *,
    include_translation: bool = True,
    include_gloss: bool = True,
    include_lemma: bool = False,
    include_morph: bool = False,
    max_sentences: Optional[int] = None,
) -> str:
    """Render a token-level IGT table in a self-contained HTML document.

    Each sentence with at least one word is emitted as a ``<section>``
    holding the sentence text, a two-row table (token strings above the
    annotation chosen by ``_select_second_line``) and, when requested and
    available, the sentence translation. The sections are embedded in a
    complete HTML page with inline styles.

    Args:
        doc: Document to export.
        include_translation: When true, add the sentence translation (if
            one is available) below each table.
        include_gloss: Forwarded to ``_select_second_line``.
        include_lemma: Forwarded to ``_select_second_line``.
        include_morph: Forwarded to ``_select_second_line``.
        max_sentences: Forwarded to ``iter_sentences``.

    Returns:
        The HTML page as a string ending in exactly one newline.
    """
    sentences = iter_sentences(doc, max_sentences)
    second_line = _select_second_line(
        include_gloss=include_gloss,
        include_lemma=include_lemma,
        include_morph=include_morph,
    )
    # Map the selected mode onto the function that supplies the annotation
    # text; any unrecognised mode repeats the token string.
    annotation_text = {
        "morph": format_morph,
        "lemma": get_lemma,
        "gloss": get_gloss,
    }.get(second_line, get_token_text)

    def render_cells(words, text_of) -> str:
        """Return one escaped ``<td>`` per word using *text_of* for content."""
        return "".join(f"<td>{html_escape(text_of(word))}</td>" for word in words)

    sections: list[str] = []
    for number, sentence in enumerate(sentences, start=1):
        words = iter_words(sentence)
        if not words:
            continue

        surface_cells = render_cells(words, get_token_text)
        annotation_cells = render_cells(words, annotation_text)

        translation_block = ""
        if include_translation:
            # Translations are looked up by zero-based sentence position.
            translation = _sentence_translation(doc, number - 1)
            if translation:
                translation_block = (
                    '<div class="igt-translation">'
                    f"{html_escape(str(translation))}"
                    "</div>"
                )

        surface = html_escape(sentence_text(doc, sentence, words))
        section_lines = (
            f'<section class="igt-sentence" id="igt-s{number}">',
            f'<div class="igt-surface">{surface}</div>',
            '<table class="igt-table">',
            f"<tr>{surface_cells}</tr>",
            f"<tr>{annotation_cells}</tr>",
            "</table>",
            translation_block,
            "</section>",
        )
        sections.append("\n".join(section_lines))

    page_head = [
        "<!doctype html>",
        '<html lang="en">',
        "<head>",
        '<meta charset="utf-8" />',
        "<title>CLTK Token-level IGT</title>",
        "<style>",
        ".igt-sentence{margin:1.5rem 0;}",
        ".igt-surface{font-weight:600;margin-bottom:0.25rem;}",
        ".igt-table{border-collapse:collapse;width:100%;}",
        ".igt-table td{border:1px solid #ccc;padding:0.35rem;text-align:center;}",
        ".igt-translation{margin-top:0.4rem;font-style:italic;}",
        "</style>",
        "</head>",
        "<body>",
        "<h1>Token-level IGT</h1>",
    ]
    page_tail = ["</body>", "</html>"]

    document = "\n".join(page_head + ["\n".join(sections)] + page_tail)
    return document.strip() + "\n"