Compare CLTK NLP backends on the same text and report differences.
Example:
>>> from cltk.evaluation.compare_backends import compare_backends
>>> report = compare_backends(
...     "lati1261",
...     "Amor vincit omnia.",
...     ["stanza", "openai"],
... )
>>> print(report["summary"]["agreement_rates"]["upos"])
Report schema (high-level):
report = {
"meta": {
"language": str,
"backends": list[str],
"base_backend": str,
"timestamp": str,
"text_hash": str,
"cltk_version": str | None,
},
"backends": {
backend: {
"model": str | None,
"backend_config": dict | None,
"metadata": dict,
},
},
"sentences": [
{
"index": int,
"text": str | None,
"alignment": {
"base_backend": str,
"ops": {backend: list[dict]},
"strategy": {backend: str},
"edit_distance": {backend: int},
},
"tokens": [
{
"row": int,
"base_index": int | None,
"by_backend": {
backend: {
"index": int | None,
"string": str | None,
"lemma": str | None,
"upos": str | None,
"feats": str | None,
"head": int | None,
"deprel": str | None,
} | None,
},
"diff": {
field: {
"agree": bool,
"values": {backend: str | int | None},
},
},
},
],
"metrics": {
"agreement_rates": {field: {pair: dict}},
},
},
],
"summary": {
"agreement_rates": {field: {pair: dict}},
"most_disagreed_tokens": list[dict],
"confusion": {
"upos": {pair: {tag_a: {tag_b: int}}},
"deprel": {pair: {tag_a: {tag_b: int}}},
},
},
}
FieldName
module-attribute
COMPARE_FIELDS
module-attribute
COMPARE_FIELDS: tuple[FieldName, ...] = (
"tokenization",
"lemma",
"upos",
"feats",
"head",
"deprel",
)
NormalizedToken
dataclass
NormalizedToken(
index: Optional[int],
string: Optional[str],
lemma: Optional[str],
upos: Optional[str],
feats: Optional[str],
head: Optional[int],
deprel: Optional[str],
)
Comparable token representation extracted from a CLTK word.
string
instance-attribute
deprel
instance-attribute
NormalizedSentence
dataclass
NormalizedSentence(
index: int,
text: Optional[str],
tokens: list[NormalizedToken],
)
Comparable sentence representation with normalized tokens.
tokens
instance-attribute
tokens: list[NormalizedToken]
AlignmentOp
dataclass
AlignmentOp(
op: str,
base_index: Optional[int],
other_index: Optional[int],
base_token: Optional[str],
other_token: Optional[str],
)
Single alignment operation between base and other token lists.
base_index
instance-attribute
base_index: Optional[int]
other_index
instance-attribute
other_index: Optional[int]
base_token
instance-attribute
base_token: Optional[str]
other_token
instance-attribute
other_token: Optional[str]
AlignmentResult
dataclass
AlignmentResult(
strategy: str, cost: int, ops: list[AlignmentOp]
)
Alignment output with ops and edit cost metadata.
strategy
instance-attribute
AlignmentRow
Bases: TypedDict
Row structure for aligned token comparisons.
base_index
instance-attribute
base_index: Optional[int]
by_backend
instance-attribute
by_backend: dict[str, dict[str, Any] | None]
compare_backends
compare_backends(
language: str,
text: str,
backends: list[str],
*,
configs: dict[str, dict[str, Any]] | None = None,
max_sentences: int | None = None,
max_tokens: int | None = None,
top_n: int | None = None
) -> dict[str, Any]
Run multiple NLP backends on the same text and compare their outputs.
Parameters:
-
language
(str)
–
Glottolog language id.
-
text
(str)
–
Raw text to analyze.
-
backends
(list[str])
–
Backend names (e.g., ["stanza", "openai", "ollama"]).
-
configs
(dict[str, dict[str, Any]] | None, default:
None
)
–
Optional per-backend config overrides, keyed by backend name.
-
max_sentences
(int | None, default:
None
)
–
Optional cap on number of sentences to compare.
-
max_tokens
(int | None, default:
None
)
–
Optional cap on tokens per sentence.
-
top_n
(int | None, default:
None
)
–
Optional number of top disagreements to include in summary.
Returns:
-
dict[str, Any]
–
A structured report dict. See module docstring for schema.
Source code in cltk/evaluation/compare_backends.py
def compare_backends(
    language: str,
    text: str,
    backends: list[str],
    *,
    configs: dict[str, dict[str, Any]] | None = None,
    max_sentences: int | None = None,
    max_tokens: int | None = None,
    top_n: int | None = None,
) -> dict[str, Any]:
    """Analyze one text with several NLP backends and report how they differ.

    Every requested backend gets its own ``NLP`` pipeline, configured through
    ``_build_cltk_config``. The analyzed documents and the metadata gathered
    by ``_collect_backend_meta`` are then passed to ``_compare_docs``, whose
    return value is the report.

    Args:
        language: Glottolog language id.
        text: Raw text to analyze.
        backends: Backend names (e.g., ["stanza", "openai", "ollama"]).
        configs: Optional per-backend config overrides, keyed by backend name.
        max_sentences: Optional cap on number of sentences to compare.
        max_tokens: Optional cap on tokens per sentence.
        top_n: Optional number of top disagreements to include in summary.

    Returns:
        A structured report dict. See module docstring for schema.

    Raises:
        ValueError: If ``backends`` is empty, or if ``text`` is not a string
            or contains only whitespace.

    """
    # Validate up front so no backend is started for a request that cannot
    # produce a report.
    if not backends:
        raise ValueError("At least one backend is required.")
    if not (isinstance(text, str) and text.strip()):
        raise ValueError("Text must be a non-empty string.")

    overrides_by_backend = configs or {}
    analyzed: dict[str, Doc] = {}
    metadata: dict[str, dict[str, Any]] = {}

    for name in backends:
        pipeline_config = _build_cltk_config(
            language=language,
            backend=name,
            overrides=overrides_by_backend.get(name),
        )
        logger.info("Running backend '%s' for comparison.", name)
        pipeline = NLP(cltk_config=pipeline_config, suppress_banner=True)
        result = pipeline.analyze(text)
        analyzed[name] = result
        metadata[name] = _collect_backend_meta(result)

    return _compare_docs(
        language=language,
        text=text,
        backends=backends,
        docs_by_backend=analyzed,
        backend_meta=metadata,
        max_sentences=max_sentences,
        max_tokens=max_tokens,
        top_n=top_n,
    )
|
report_to_markdown
report_to_markdown(report: dict[str, Any]) -> str
Render a compare_backends report as Markdown.
Source code in cltk/evaluation/compare_backends.py
def report_to_markdown(report: dict[str, Any]) -> str:
    """Render a compare_backends report as Markdown.

    The output has a metadata list, an agreement-rate table, a table of the
    most disagreed tokens and, when the report contains sentences, one
    section per sentence listing up to 20 rows on which the backends
    disagree. Missing keys are tolerated: every lookup falls back to an
    empty container or a placeholder.

    Args:
        report: Report dict as produced by ``compare_backends``.

    Returns:
        The Markdown document as a single string.

    """

    def _cell(value: Any) -> str:
        """Return ``value`` as text that is safe inside one table cell."""
        # An unescaped pipe starts a new column and a line break ends the
        # row, so both would corrupt the table layout.
        text = str(value).replace("\r\n", " ").replace("\n", " ")
        return text.replace("|", "\\|")

    max_rows_per_sentence = 20

    meta = report.get("meta", {})
    backends = meta.get("backends", [])
    lines: list[str] = [
        "# Compare Backends Report",
        "",
        "## Metadata",
        "",
        f"- Language: {meta.get('language')}",
        f"- Backends: {', '.join(backends)}",
        f"- Base backend: {meta.get('base_backend')}",
        f"- Timestamp: {meta.get('timestamp')}",
        f"- Text hash: {meta.get('text_hash')}",
    ]
    cltk_version = meta.get("cltk_version")
    if cltk_version:
        lines.append(f"- CLTK version: {cltk_version}")

    summary = report.get("summary", {})
    agreement_rates = summary.get("agreement_rates", {})
    lines.extend(
        [
            "",
            "## Agreement Rates",
            "",
            "| Field | Backend Pair | Agree | Total | Rate |",
            "| --- | --- | --- | --- | --- |",
        ]
    )
    for field in COMPARE_FIELDS:
        for pair, stats in agreement_rates.get(field, {}).items():
            agree = stats.get("agree", 0)
            total = stats.get("total", 0)
            rate = stats.get("rate")
            # Accept integer rates (e.g. a literal 1) as well as floats;
            # bool is excluded because it is a subclass of int.
            is_number = isinstance(rate, (int, float)) and not isinstance(rate, bool)
            rate_str = f"{rate:.3f}" if is_number else "-"
            lines.append(
                f"| {_cell(field)} | {_cell(pair)} | {agree} | {total} | {rate_str} |"
            )

    lines.extend(
        [
            "",
            "## Top Disagreements",
            "",
            "| Sentence | Row | Fields | Tokenization |",
            "| --- | --- | --- | --- |",
        ]
    )
    for item in summary.get("most_disagreed_tokens", []):
        sent_idx = item.get("sentence_index")
        row = item.get("row")
        fields = ", ".join(item.get("fields", []))
        token_pairs = item.get("tokenization", {})
        tokens_str = "; ".join(f"{k}={v}" for k, v in token_pairs.items())
        lines.append(
            f"| {sent_idx} | {row} | {_cell(fields)} | {_cell(tokens_str)} |"
        )

    sentences = report.get("sentences", [])
    if sentences:
        lines.append("")
        lines.append("## Per-Sentence Details")
    for sent in sentences:
        sent_idx = sent.get("index")
        sent_text = sent.get("text") or ""
        lines.append("")
        lines.append(f"### Sentence {sent_idx}")
        if sent_text:
            lines.append("")
            lines.append(sent_text)

        # A row counts as a disagreement when any compared field is
        # explicitly marked as not agreeing; absent fields count as agreeing.
        disagreement_rows = [
            tok
            for tok in sent.get("tokens", [])
            if any(
                not tok.get("diff", {}).get(field, {}).get("agree", True)
                for field in COMPARE_FIELDS
            )
        ]
        if not disagreement_rows:
            lines.append("")
            lines.append("No disagreements found.")
            continue

        lines.append("")
        lines.append("| Row | Tokenization |")
        lines.append("| --- | --- |")
        for tok in disagreement_rows[:max_rows_per_sentence]:
            row = tok.get("row")
            token_values = (
                tok.get("diff", {}).get("tokenization", {}).get("values", {})
            )
            tokens_str = "; ".join(f"{k}={v}" for k, v in token_values.items())
            lines.append(f"| {row} | {_cell(tokens_str)} |")
        hidden_rows = len(disagreement_rows) - max_rows_per_sentence
        if hidden_rows > 0:
            lines.append("")
            lines.append(f"Truncated {hidden_rows} additional rows.")

    # Final empty entry so the joined document ends with a newline.
    lines.append("")
    return "\n".join(lines)
|
write_report
write_report(
report: dict[str, Any],
out_dir: str,
basename: str = "compare_backends",
) -> list[str]
Write report JSON, Markdown, and CSV tables to disk.
Source code in cltk/evaluation/compare_backends.py
def write_report(
    report: dict[str, Any],
    out_dir: str,
    basename: str = "compare_backends",
) -> list[str]:
    """Write report JSON, Markdown, and CSV tables to disk.

    Args:
        report: Report dict as produced by ``compare_backends``.
        out_dir: Directory to write into; created, including missing
            parents, when it does not exist yet.
        basename: File name stem shared by the written files.

    Returns:
        Paths of the written files as strings: the JSON file, the Markdown
        file, then whatever paths ``_write_csv_tables`` returns.

    """
    out_path = Path(out_dir)
    out_path.mkdir(parents=True, exist_ok=True)

    # Write with an explicit UTF-8 encoding. The Markdown carries raw
    # sentence text and tokens, so relying on the platform's locale encoding
    # can raise UnicodeEncodeError or yield files that differ per machine.
    json_path = out_path / f"{basename}.json"
    json_path.write_text(
        json.dumps(report, indent=2, sort_keys=True),
        encoding="utf-8",
    )

    md_path = out_path / f"{basename}.md"
    md_path.write_text(report_to_markdown(report), encoding="utf-8")

    written: list[str] = [str(json_path), str(md_path)]
    written.extend(_write_csv_tables(report, out_path, basename))
    return written
|