Skip to content

dispatch

Output dispatch helpers for the CLTK CLI (pure; no I/O).

OutputPayload module-attribute

# Type of value a renderer may return: text, a dict, or any other object.
OutputPayload = str | dict[str, Any] | Any

OUTPUT_TARGETS module-attribute

# Registry of output targets: maps each output name to a callable that takes a
# ``Doc`` and returns an ``OutputPayload``.
OUTPUT_TARGETS: dict[
    str, Callable[[Doc], OutputPayload]
] = {
    # The lambda defers the lookup of ``_render_raw`` until the entry is called.
    "raw": lambda doc: _render_raw(doc),
    "conllu": doc_to_conllu,
    "feature-table": doc_to_feature_table,
    "readers-guide": format_readers_guide,
    # Wrapped so only ``doc`` is passed; ``doc_to_json`` then uses its default
    # (``None``) sentence/token limits.
    "json": lambda doc: doc_to_json(doc),
}

OUTPUT_FORMATS module-attribute

# Allowed ``--format`` values per output target. ``resolve_format`` rejects any
# format given for a target that is missing from this mapping.
OUTPUT_FORMATS: dict[str, set[str]] = {
    "feature-table": {"csv", "tsv", "parquet"},
    "json": {"pretty", "min"},
}

DEFAULT_FORMATS module-attribute

# Format ``resolve_format`` falls back to when a target supports formats but
# the caller did not request one.
DEFAULT_FORMATS: dict[str, str] = {
    "feature-table": "csv",
    "json": "pretty",
}

normalize_output_name

normalize_output_name(name: str) -> str

Normalize output identifiers.

Source code in cltk/cli/dispatch.py
def normalize_output_name(name: str) -> str:
    """Return ``name`` with surrounding whitespace removed and lowercased."""
    trimmed = name.strip()
    return trimmed.lower()

normalize_format

normalize_format(name: Optional[str]) -> Optional[str]

Normalize format identifiers.

Source code in cltk/cli/dispatch.py
def normalize_format(name: Optional[str]) -> Optional[str]:
    """Return ``name`` trimmed and lowercased; ``None`` is passed through."""
    return None if name is None else name.strip().lower()

resolve_format

resolve_format(
    out: str, fmt: Optional[str]
) -> Optional[str]

Validate and resolve the output format for an output target.

Source code in cltk/cli/dispatch.py
def resolve_format(out: str, fmt: Optional[str]) -> Optional[str]:
    """Check ``fmt`` against the output target ``out`` and return the format to use.

    Both arguments are normalized first. A target with no entry in
    ``OUTPUT_FORMATS`` accepts no format and yields ``None``. A target that
    does have an entry yields the requested format, or its entry in
    ``DEFAULT_FORMATS`` when none was requested.

    Raises:
        ValueError: If a format is given for a target that supports none, or
            if the format is not among the target's allowed formats.
    """
    out_name = normalize_output_name(out)
    fmt_name = normalize_format(fmt)
    choices = OUTPUT_FORMATS.get(out_name)

    if choices is None:
        # This target takes no format at all.
        if fmt_name is None:
            return None
        raise ValueError(f"--format is not supported for output '{out_name}'.")

    if fmt_name is None:
        return DEFAULT_FORMATS.get(out_name)

    if fmt_name in choices:
        return fmt_name

    allowed_list = ", ".join(sorted(choices))
    raise ValueError(
        f"Unsupported format '{fmt_name}' for output '{out_name}'. "
        f"Choose from: {allowed_list}."
    )

render_output

render_output(
    doc: Doc,
    out: str,
    *,
    max_sentences: Optional[int] = None,
    max_tokens: Optional[int] = None
) -> OutputPayload

Render output for a document using the named output target.

Source code in cltk/cli/dispatch.py
def render_output(
    doc: Doc,
    out: str,
    *,
    max_sentences: Optional[int] = None,
    max_tokens: Optional[int] = None,
) -> OutputPayload:
    """Produce the payload for ``doc`` using the output target named ``out``.

    The ``"raw"`` and ``"json"`` targets receive the limits directly. Every
    other target is given a document already cut down by ``_slice_doc``.

    Raises:
        ValueError: If ``out`` does not name an entry in ``OUTPUT_TARGETS``.
    """
    out_name = normalize_output_name(out)
    renderer = OUTPUT_TARGETS.get(out_name)
    if renderer is None:
        allowed = ", ".join(sorted(OUTPUT_TARGETS))
        raise ValueError(f"Unsupported output '{out_name}'. Choose from: {allowed}.")

    limits = {"max_sentences": max_sentences, "max_tokens": max_tokens}

    # These targets apply the limits themselves, so they get the full document.
    limit_aware = {"raw": raw_summary, "json": doc_to_json}
    direct = limit_aware.get(out_name)
    if direct is not None:
        return direct(doc, **limits)

    return renderer(_slice_doc(doc, **limits))

doc_to_json

doc_to_json(
    doc: Doc,
    *,
    max_sentences: Optional[int] = None,
    max_tokens: Optional[int] = None
) -> dict[str, Any]

Return a JSON-serializable structure for a Doc.

Source code in cltk/cli/dispatch.py
def doc_to_json(
    doc: Doc,
    *,
    max_sentences: Optional[int] = None,
    max_tokens: Optional[int] = None,
) -> dict[str, Any]:
    """Build a JSON-serializable dictionary describing a ``Doc``.

    The document is first reduced with ``_slice_doc`` using the given limits.
    The result has three top-level keys: ``"meta"`` (language, dialect,
    backend, model, pipeline class name), ``"text"`` (raw and normalized
    text) and ``"sentences"`` (one entry per word group, each with its tokens).
    """
    sliced = _slice_doc(doc, max_sentences=max_sentences, max_tokens=max_tokens)
    groups = _group_words(sliced)

    language = sliced.language
    dialect = sliced.dialect

    language_meta = {
        "name": language.name,
        "glottolog_id": language.glottolog_id,
        "iso_639_3": language.iso,
    }
    dialect_meta = None
    if dialect:
        dialect_meta = {
            "name": dialect.name,
            "glottolog_id": dialect.glottolog_id,
            "language_code": dialect.language_code,
        }
    meta = {
        "language": language_meta,
        "dialect": dialect_meta,
        "backend": sliced.backend,
        "model": sliced.model,
        "pipeline": sliced.pipeline.__class__.__name__ if sliced.pipeline else None,
    }
    text = {
        "raw": sliced.raw,
        "normalized": sliced.normalized_text,
    }

    sentences: list[dict[str, Any]] = []
    for sent_idx, words in groups:
        # Sentence text is rebuilt from the non-empty token strings.
        sent_text = " ".join(w.string for w in words if w.string).strip()

        if sent_idx is None:
            translation = None
        else:
            translation = _serialize_translation(
                sliced.sentence_translations.get(sent_idx)
            )

        # ``token_index_sentence`` is the 1-based position within the sentence.
        tokens: list[dict[str, Any]] = [
            {
                "index_token": word.index_token,
                "token_index_sentence": position,
                "index_char_start": word.index_char_start,
                "index_char_stop": word.index_char_stop,
                "string": word.string,
                "lemma": word.lemma,
                "upos": _serialize_upos(word),
                "xpos": word.xpos,
                "features": _serialize_features(word),
                "dependency_relation": _serialize_deprel(word),
                "governor": word.governor,
            }
            for position, word in enumerate(words, start=1)
        ]

        sentences.append(
            {
                "index": sent_idx,
                "text": sent_text or None,
                "translation": translation,
                "tokens": tokens,
            }
        )

    data: dict[str, Any] = {"meta": meta, "text": text, "sentences": sentences}
    return data

ensure_text_payload

ensure_text_payload(
    payload: OutputPayload, out: str
) -> str

Ensure a rendered payload is text.

Source code in cltk/cli/dispatch.py
def ensure_text_payload(payload: OutputPayload, out: str) -> str:
    """Return ``payload`` unchanged if it is a string.

    Raises:
        ValueError: If ``payload`` is not a ``str``; the message names ``out``.
    """
    if not isinstance(payload, str):
        raise ValueError(f"Output '{out}' did not produce text.")
    return payload

ensure_json_payload

ensure_json_payload(
    payload: OutputPayload, out: str
) -> dict[str, Any]

Ensure a rendered payload is a JSON-serializable dict.

Source code in cltk/cli/dispatch.py
def ensure_json_payload(payload: OutputPayload, out: str) -> dict[str, Any]:
    """Return ``payload`` unchanged if it is a dict.

    Raises:
        ValueError: If ``payload`` is not a ``dict``; the message names ``out``.
    """
    if not isinstance(payload, dict):
        raise ValueError(f"Output '{out}' did not produce JSON.")
    return payload

raw_summary

raw_summary(
    doc: Doc,
    *,
    max_sentences: Optional[int] = None,
    max_tokens: Optional[int] = None
) -> str

Return a human-readable summary for a Doc.

Source code in cltk/cli/dispatch.py
def raw_summary(
    doc: Doc,
    *,
    max_sentences: Optional[int] = None,
    max_tokens: Optional[int] = None,
) -> str:
    """Build a short plain-text summary of ``doc``.

    The summary lists the language, the backend (and model when set), and the
    sentence and token counts after ``_slice_doc`` has applied the limits. A
    ``Limits:`` line is added when a limit was given and the counts changed.
    The returned text always ends with a single newline.
    """
    # Counts before slicing, for comparison against the sliced document.
    total_sentences = len(_group_words(doc))
    total_tokens = len(doc.words or [])

    sliced = _slice_doc(doc, max_sentences=max_sentences, max_tokens=max_tokens)
    out_sentences = len(_group_words(sliced))
    out_tokens = len(sliced.words or [])

    lang = doc.language
    backend = doc.backend or "unknown"
    model = doc.model

    backend_line = f"Backend: {backend}"
    if model:
        backend_line += f" (model: {model})"

    lines = [f"Language: {lang.name} ({lang.glottolog_id})"]
    lines.append(backend_line)
    lines.append(f"Sentences: {out_sentences}")
    lines.append(f"Tokens: {out_tokens}")

    limit_given = bool(max_sentences or max_tokens)
    counts_changed = total_sentences != out_sentences or total_tokens != out_tokens
    if limit_given and counts_changed:
        lines.append(
            "Limits: "
            f"sentences={max_sentences or '-'}, "
            f"tokens={max_tokens or '-'} "
            f"(original {total_sentences} sentences, {total_tokens} tokens)"
        )

    body = "\n".join(lines).rstrip()
    return body + "\n"