Skip to content

compare_backends

Compare CLTK NLP backends on the same text and report differences.

Example

from cltk.evaluation.compare_backends import compare_backends

report = compare_backends(
    "lati1261",
    "Amor vincit omnia.",
    ["stanza", "openai"],
)
print(report["summary"]["agreement_rates"]["upos"])

Report schema (high-level):

report = {
    "meta": {
        "language": str,
        "backends": list[str],
        "base_backend": str,
        "timestamp": str,
        "text_hash": str,
        "cltk_version": str | None,
    },
    "backends": {
        backend: {
            "model": str | None,
            "backend_config": dict | None,
            "metadata": dict,
        },
    },
    "sentences": [
        {
            "index": int,
            "text": str | None,
            "alignment": {
                "base_backend": str,
                "ops": {backend: list[dict]},
                "strategy": {backend: str},
                "edit_distance": {backend: int},
            },
            "tokens": [
                {
                    "row": int,
                    "base_index": int | None,
                    "by_backend": {
                        backend: {
                            "index": int | None,
                            "string": str | None,
                            "lemma": str | None,
                            "upos": str | None,
                            "feats": str | None,
                            "head": int | None,
                            "deprel": str | None,
                        } | None,
                    },
                    "diff": {
                        field: {
                            "agree": bool,
                            "values": {backend: str | int | None},
                        },
                    },
                },
            ],
            "metrics": {
                "agreement_rates": {field: {pair: dict}},
            },
        },
    ],
    "summary": {
        "agreement_rates": {field: {pair: dict}},
        "most_disagreed_tokens": list[dict],
        "confusion": {
            "upos": {pair: {tag_a: {tag_b: int}}},
            "deprel": {pair: {tag_a: {tag_b: int}}},
        },
    },
}

FieldName module-attribute

FieldName = str

COMPARE_FIELDS module-attribute

COMPARE_FIELDS: tuple[FieldName, ...] = (
    "tokenization",
    "lemma",
    "upos",
    "feats",
    "head",
    "deprel",
)

NormalizedToken dataclass

NormalizedToken(
    index: Optional[int],
    string: Optional[str],
    lemma: Optional[str],
    upos: Optional[str],
    feats: Optional[str],
    head: Optional[int],
    deprel: Optional[str],
)

Comparable token representation extracted from a CLTK word.

index instance-attribute

index: Optional[int]

string instance-attribute

string: Optional[str]

lemma instance-attribute

lemma: Optional[str]

upos instance-attribute

upos: Optional[str]

feats instance-attribute

feats: Optional[str]

head instance-attribute

head: Optional[int]

deprel instance-attribute

deprel: Optional[str]

NormalizedSentence dataclass

NormalizedSentence(
    index: int,
    text: Optional[str],
    tokens: list[NormalizedToken],
)

Comparable sentence representation with normalized tokens.

index instance-attribute

index: int

text instance-attribute

text: Optional[str]

tokens instance-attribute

tokens: list[NormalizedToken]

AlignmentOp dataclass

AlignmentOp(
    op: str,
    base_index: Optional[int],
    other_index: Optional[int],
    base_token: Optional[str],
    other_token: Optional[str],
)

Single alignment operation between base and other token lists.

op instance-attribute

op: str

base_index instance-attribute

base_index: Optional[int]

other_index instance-attribute

other_index: Optional[int]

base_token instance-attribute

base_token: Optional[str]

other_token instance-attribute

other_token: Optional[str]

AlignmentResult dataclass

AlignmentResult(
    strategy: str, cost: int, ops: list[AlignmentOp]
)

Alignment output with ops and edit cost metadata.

strategy instance-attribute

strategy: str

cost instance-attribute

cost: int

ops instance-attribute

ops: list[AlignmentOp]

AlignmentRow

Bases: TypedDict

Row structure for aligned token comparisons.

base_index instance-attribute

base_index: Optional[int]

by_backend instance-attribute

by_backend: dict[str, dict[str, Any] | None]

compare_backends

compare_backends(
    language: str,
    text: str,
    backends: list[str],
    *,
    configs: dict[str, dict[str, Any]] | None = None,
    max_sentences: int | None = None,
    max_tokens: int | None = None,
    top_n: int | None = None
) -> dict[str, Any]

Run multiple NLP backends on the same text and compare their outputs.

Parameters:

  • language (str) –

    Glottolog language id.

  • text (str) –

    Raw text to analyze.

  • backends (list[str]) –

    Backend names (e.g., ["stanza", "openai", "ollama"]).

  • configs (dict[str, dict[str, Any]] | None, default: None ) –

    Optional per-backend config overrides, keyed by backend name.

  • max_sentences (int | None, default: None ) –

    Optional cap on number of sentences to compare.

  • max_tokens (int | None, default: None ) –

    Optional cap on tokens per sentence.

  • top_n (int | None, default: None ) –

    Optional number of top disagreements to include in summary.

Returns:

  • dict[str, Any]

    A structured report dict. See module docstring for schema.

Source code in cltk/evaluation/compare_backends.py
def compare_backends(
    language: str,
    text: str,
    backends: list[str],
    *,
    configs: dict[str, dict[str, Any]] | None = None,
    max_sentences: int | None = None,
    max_tokens: int | None = None,
    top_n: int | None = None,
) -> dict[str, Any]:
    """Run multiple NLP backends on the same text and compare their outputs.

    Args:
        language: Glottolog language id.
        text: Raw text to analyze.
        backends: Backend names (e.g., ["stanza", "openai", "ollama"]).
        configs: Optional per-backend config overrides, keyed by backend name.
        max_sentences: Optional cap on number of sentences to compare.
        max_tokens: Optional cap on tokens per sentence.
        top_n: Optional number of top disagreements to include in summary.

    Returns:
        A structured report dict. See module docstring for schema.

    """
    if not backends:
        raise ValueError("At least one backend is required.")
    if not isinstance(text, str) or not text.strip():
        raise ValueError("Text must be a non-empty string.")

    configs = configs or {}
    docs_by_backend: dict[str, Doc] = {}
    backend_meta: dict[str, dict[str, Any]] = {}
    for backend in backends:
        cltk_config = _build_cltk_config(
            language=language,
            backend=backend,
            overrides=configs.get(backend),
        )
        logger.info("Running backend '%s' for comparison.", backend)
        nlp = NLP(cltk_config=cltk_config, suppress_banner=True)
        doc = nlp.analyze(text)
        docs_by_backend[backend] = doc
        backend_meta[backend] = _collect_backend_meta(doc)

    return _compare_docs(
        language=language,
        text=text,
        backends=backends,
        docs_by_backend=docs_by_backend,
        backend_meta=backend_meta,
        max_sentences=max_sentences,
        max_tokens=max_tokens,
        top_n=top_n,
    )

report_to_markdown

report_to_markdown(report: dict[str, Any]) -> str

Render a compare_backends report as Markdown.

Source code in cltk/evaluation/compare_backends.py
def report_to_markdown(report: dict[str, Any]) -> str:
    """Render a compare_backends report as Markdown.

    The document contains a metadata list, an agreement-rate table, a table
    of the most disagreed tokens and, when the report has sentences, one
    section per sentence listing up to 20 rows on which backends disagree.

    Args:
        report: Report dict as produced by ``compare_backends``. Missing
            sections are read with empty defaults rather than raising.

    Returns:
        The Markdown document as a single string ending with a newline.

    """
    max_rows_per_sentence = 20

    def cell(value: Any) -> str:
        # A literal "|" ends a table cell and a line break ends the row, so
        # both must be neutralized before a value is placed in a table.
        escaped = str(value).replace("|", "\\|")
        return escaped.replace("\r\n", " ").replace("\n", " ").replace("\r", " ")

    meta = report.get("meta", {})
    # Tolerate an explicit ``None`` for the backend list.
    backends = meta.get("backends") or []
    lines: list[str] = []
    lines.append("# Compare Backends Report")
    lines.append("")
    lines.append("## Metadata")
    lines.append("")
    lines.append(f"- Language: {meta.get('language')}")
    lines.append(f"- Backends: {', '.join(backends)}")
    lines.append(f"- Base backend: {meta.get('base_backend')}")
    lines.append(f"- Timestamp: {meta.get('timestamp')}")
    lines.append(f"- Text hash: {meta.get('text_hash')}")
    cltk_version = meta.get("cltk_version")
    if cltk_version:
        lines.append(f"- CLTK version: {cltk_version}")

    summary = report.get("summary", {})
    agreement_rates = summary.get("agreement_rates", {})
    lines.append("")
    lines.append("## Agreement Rates")
    lines.append("")
    lines.append("| Field | Backend Pair | Agree | Total | Rate |")
    lines.append("| --- | --- | --- | --- | --- |")
    for field in COMPARE_FIELDS:
        field_rates = agreement_rates.get(field, {})
        for pair, stats in field_rates.items():
            agree = stats.get("agree", 0)
            total = stats.get("total", 0)
            rate = stats.get("rate")
            # Integer rates (e.g. exactly 1) are numbers too; only a missing
            # or non-numeric rate is shown as "-".
            is_number = isinstance(rate, (int, float)) and not isinstance(rate, bool)
            rate_str = f"{rate:.3f}" if is_number else "-"
            lines.append(
                f"| {field} | {cell(pair)} | {agree} | {total} | {rate_str} |"
            )

    lines.append("")
    lines.append("## Top Disagreements")
    lines.append("")
    lines.append("| Sentence | Row | Fields | Tokenization |")
    lines.append("| --- | --- | --- | --- |")
    for item in summary.get("most_disagreed_tokens", []):
        sent_idx = item.get("sentence_index")
        row = item.get("row")
        fields = ", ".join(item.get("fields", []))
        token_pairs = item.get("tokenization", {})
        tokens_str = "; ".join(f"{k}={v}" for k, v in token_pairs.items())
        lines.append(
            f"| {sent_idx} | {row} | {cell(fields)} | {cell(tokens_str)} |"
        )

    sentences = report.get("sentences", [])
    if sentences:
        lines.append("")
        lines.append("## Per-Sentence Details")
        for sent in sentences:
            sent_idx = sent.get("index")
            sent_text = sent.get("text") or ""
            lines.append("")
            lines.append(f"### Sentence {sent_idx}")
            if sent_text:
                lines.append("")
                lines.append(sent_text)
            # A row is listed when any compared field is marked as not agreeing.
            disagreement_rows = [
                tok
                for tok in sent.get("tokens", [])
                if any(
                    not tok.get("diff", {}).get(field, {}).get("agree", True)
                    for field in COMPARE_FIELDS
                )
            ]
            if not disagreement_rows:
                lines.append("")
                lines.append("No disagreements found.")
                continue
            lines.append("")
            lines.append("| Row | Tokenization |")
            lines.append("| --- | --- |")
            for tok in disagreement_rows[:max_rows_per_sentence]:
                row = tok.get("row")
                token_values = (
                    tok.get("diff", {}).get("tokenization", {}).get("values", {})
                )
                tokens_str = "; ".join(f"{k}={v}" for k, v in token_values.items())
                lines.append(f"| {row} | {cell(tokens_str)} |")
            hidden_rows = len(disagreement_rows) - max_rows_per_sentence
            if hidden_rows > 0:
                lines.append("")
                lines.append(f"Truncated {hidden_rows} additional rows.")
    lines.append("")
    return "\n".join(lines)

write_report

write_report(
    report: dict[str, Any],
    out_dir: str,
    basename: str = "compare_backends",
) -> list[str]

Write report JSON, Markdown, and CSV tables to disk.

Source code in cltk/evaluation/compare_backends.py
def write_report(
    report: dict[str, Any],
    out_dir: str,
    basename: str = "compare_backends",
) -> list[str]:
    """Write report JSON, Markdown, and CSV tables to disk.

    Args:
        report: Report dict to serialize.
        out_dir: Target directory; created together with missing parents.
        basename: File name stem used for the JSON and Markdown files and
            passed on to the CSV table writer.

    Returns:
        The written paths as strings: the JSON file, the Markdown file, then
        the paths returned by the CSV table writer.

    """
    out_path = Path(out_dir)
    out_path.mkdir(parents=True, exist_ok=True)
    written: list[str] = []

    # Text files are written as UTF-8 explicitly: the Markdown embeds raw
    # token strings, which the platform's default encoding may not be able
    # to represent.
    json_path = out_path / f"{basename}.json"
    json_path.write_text(
        json.dumps(report, indent=2, sort_keys=True), encoding="utf-8"
    )
    written.append(str(json_path))

    md_path = out_path / f"{basename}.md"
    md_path.write_text(report_to_markdown(report), encoding="utf-8")
    written.append(str(md_path))

    csv_paths = _write_csv_tables(report, out_path, basename)
    written.extend(csv_paths)
    return written