Skip to content

analyze

CLI handler for the cltk analyze subcommand.

configure_parser

configure_parser(subparsers: _SubParsersAction) -> None

Register the analyze subcommand parser.

Source code in cltk/cli/analyze.py
def configure_parser(subparsers: argparse._SubParsersAction) -> None:
    """Attach the ``analyze`` subcommand and all of its options.

    Args:
        subparsers: The sub-parser collection the ``analyze`` parser is
            added to.

    The created parser stores ``run`` as its ``func`` default, so the parsed
    namespace carries the handler for this subcommand.
    """
    analyze_parser = subparsers.add_parser(
        "analyze",
        help="Run a CLTK pipeline on text and emit a chosen output format.",
        formatter_class=HelpFormatter,
    )

    # Each entry is (option strings, add_argument keyword arguments).
    # Order is significant: it is the order the options are registered in.
    option_specs = (
        (
            ("--lang", "--language"),
            {
                "dest": "language",
                "required": True,
                "help": "Glottolog id or CLTK language key.",
            },
        ),
        (
            ("--backend",),
            {
                "default": "stanza",
                "help": "Backend to use (stanza, openai, ollama, mistral, spacy).",
            },
        ),
        (("--pipeline",), {"help": "Optional pipeline class name to use."}),
        (("--text",), {"help": "Raw text to analyze."}),
        (("--text-file",), {"help": "Path to a text file to analyze."}),
        (("--input-dir",), {"help": "Batch mode: directory of input files."}),
        (
            ("--glob",),
            {
                "default": "*.txt",
                "help": "Glob pattern for --input-dir (default: *.txt).",
            },
        ),
        (
            ("--out",),
            {
                "required": True,
                "help": "Output type (raw, conllu, readers-guide, feature-table, json).",
            },
        ),
        (
            ("--format",),
            {
                "help": "Format for feature-table (csv, tsv, parquet) or json (pretty, min).",
            },
        ),
        (
            ("--out-file",),
            {"help": "Write output to this path; defaults to stdout."},
        ),
        (
            ("--out-dir",),
            {"help": "Output directory for batch mode (--input-dir)."},
        ),
        (
            ("--config",),
            {
                "help": "JSON string or path to JSON file for backend/pipeline settings.",
            },
        ),
        (
            ("--max-sentences",),
            {"type": int, "help": "Cap the number of sentences in output."},
        ),
        (
            ("--max-tokens",),
            {
                "type": int,
                "help": "Cap the number of tokens per sentence in output.",
            },
        ),
        (
            ("--continue-on-error",),
            {
                "action": "store_true",
                "help": "Continue processing batch inputs after errors.",
            },
        ),
        (
            ("--quiet",),
            {"action": "store_true", "help": "Suppress non-error logs."},
        ),
        (
            ("--verbose",),
            {"action": "store_true", "help": "Enable info-level logs."},
        ),
    )
    for option_strings, settings in option_specs:
        analyze_parser.add_argument(*option_strings, **settings)

    analyze_parser.set_defaults(func=run)

run

run(args: Namespace) -> int

Run the analyze command.

Source code in cltk/cli/analyze.py
def run(args: argparse.Namespace) -> int:
    """Execute the ``analyze`` subcommand for parsed arguments.

    Args:
        args: Parsed command-line namespace for the ``analyze`` subcommand.

    Returns:
        The result of ``_run_batch`` when ``--input-dir`` is given, otherwise
        ``0`` once the single input has been analyzed and emitted.

    Raises:
        SystemExit: If ``--max-sentences`` or ``--max-tokens`` is not
            positive, if ``--out-dir`` is given without ``--input-dir``, or
            if building the ``NLP`` object or analyzing the text raises.
    """
    set_log_level(quiet=args.quiet, verbose=args.verbose)

    # Reject non-positive caps before any pipeline work is done.
    sentence_cap = args.max_sentences
    if sentence_cap is not None and sentence_cap <= 0:
        raise SystemExit("--max-sentences must be a positive integer.")
    token_cap = args.max_tokens
    if token_cap is not None and token_cap <= 0:
        raise SystemExit("--max-tokens must be a positive integer.")

    # Both settings are optional; they stay None when the flag is absent/empty.
    config = None
    if args.config:
        config = parse_json_input(args.config)
    pipeline = None
    if args.pipeline:
        pipeline = resolve_pipeline(args.pipeline)

    cltk_config = build_cltk_config(
        language=args.language,
        backend=args.backend,
        config=config,
        pipeline=pipeline,
    )

    # Batch mode takes over entirely when an input directory is supplied.
    if args.input_dir:
        return _run_batch(args, cltk_config)

    # From here on we are in single-input mode.
    if args.out_dir:
        raise SystemExit("--out-dir is only valid with --input-dir.")

    text = load_text(args.text, args.text_file)
    try:
        doc = NLP(cltk_config=cltk_config, suppress_banner=True).analyze(text)
    except Exception as exc:
        # Surface the failure as the process exit message, keeping the
        # original exception chained.
        raise SystemExit(str(exc)) from exc

    out_path = _resolve_out_path(args.out_file)
    _emit_output(doc, args, out_path=out_path)
    return 0