Skip to content

utils

Utilities for GenAI-driven enrichment (glosses, IPA, idioms, pedagogy).

PromptBuilder module-attribute

# Accepted forms for a caller-supplied enrichment prompt: a callable taking
# (str, str, IPA_PRONUNCIATION_MODE) and returning a PromptInfo, a ready-made
# PromptInfo, or a plain string.
# NOTE(review): the two str arguments are presumably the language/dialect name
# and the token table -- confirm against _resolve_enrichment_prompt.
PromptBuilder = (
    Callable[[str, str, IPA_PRONUNCIATION_MODE], PromptInfo]
    | PromptInfo
    | str
)

generate_enrichment_for_sentence

generate_enrichment_for_sentence(
    *,
    doc: Doc,
    sentence_idx: int,
    words: list[Word],
    client: Any,
    ipa_mode: IPA_PRONUNCIATION_MODE,
    max_retries: int,
    prompt_builder: Optional[PromptBuilder],
    prompt_profile: Optional[str],
    prompt_digest: Optional[str],
    fields: Optional[set[str]] = None,
    provenance_process: Optional[str] = None
) -> tuple[list[Word], list[IdiomSpan], dict[str, int]]

Call the LLM for one sentence, optionally filtered by fields.

Source code in cltk/enrichment/utils.py
def generate_enrichment_for_sentence(
    *,
    doc: Doc,
    sentence_idx: int,
    words: list[Word],
    client: Any,
    ipa_mode: IPA_PRONUNCIATION_MODE,
    max_retries: int,
    prompt_builder: Optional[PromptBuilder],
    prompt_profile: Optional[str],
    prompt_digest: Optional[str],
    fields: Optional[set[str]] = None,
    provenance_process: Optional[str] = None,
) -> tuple[list[Word], list[IdiomSpan], dict[str, int]]:
    """Enrich a single sentence with one LLM request.

    The prompt is resolved from the sentence's token table, a provenance
    record describing the request is attached to ``doc``, and the parsed
    response payload is applied to ``words`` (restricted to ``fields`` when
    that argument is given).

    Returns:
        A ``(updated_words, idioms, usage)`` tuple, where ``usage`` is the
        usage mapping reported on the client's response object.
    """
    import os as _os

    # Name the prompt after the dialect when the doc has one, else the language.
    if doc.dialect:
        lang_or_dialect_name = doc.dialect.name
    else:
        lang_or_dialect_name = doc.language.name

    pinfo = _resolve_enrichment_prompt(
        lang_or_dialect_name=lang_or_dialect_name,
        token_table=_build_token_table(words),
        ipa_mode=ipa_mode,
        builder=prompt_builder,
    )
    prompt = pinfo.text
    prompt_version = str(pinfo.version)

    log = bind_from_doc(doc, sentence_idx=sentence_idx, prompt_version=prompt_version)
    log.info("[prompt] %s v%s hash=%s", pinfo.kind, pinfo.version, pinfo.digest)

    # The full prompt text is only logged when explicitly opted into.
    content_flag = _os.getenv("CLTK_LOG_CONTENT", "").strip().lower()
    if content_flag in {"1", "true", "yes", "on"}:
        log.debug(prompt)

    # Best-effort Glottolog id lookup: dialect id first, language id otherwise.
    try:
        dialect = doc.dialect
        has_dialect_id = bool(dialect and dialect.glottolog_id)
        lang_id = (
            doc.dialect.glottolog_id if has_dialect_id else doc.language.glottolog_id
        )
    except Exception:
        lang_id = None

    config_snapshot = extract_doc_config(doc)

    notes = {
        "prompt_kind": pinfo.kind,
        "sentence_idx": sentence_idx,
        "ipa_mode": ipa_mode,
    }
    if prompt_profile:
        notes["prompt_profile"] = prompt_profile

    model_name = str(doc.model) if doc.model else None
    provider_name = str(doc.backend) if doc.backend else None
    prov_record = build_provenance_record(
        language=lang_id,
        backend=doc.backend,
        process=provenance_process or "enrichment",
        model=model_name,
        provider=provider_name,
        prompt_version=prompt_version,
        prompt_text=prompt,
        prompt_digest=prompt_digest,
        config=config_snapshot,
        notes=notes,
    )
    # The record becomes the doc's default only when no default exists yet.
    make_default = doc.default_provenance_id is None
    prov_id = add_provenance_record(doc, prov_record, set_default=make_default)

    response: CLTKGenAIResponse = client.generate(
        prompt=prompt, max_retries=max_retries
    )
    payload = _parse_enrichment_payload(response.response)
    updated_words, idioms = _apply_payload_to_words(
        words, payload, sentence_idx, provenance_id=prov_id, fields=fields
    )
    return updated_words, idioms, response.usage

generate_gpt_enrichment

generate_gpt_enrichment(
    doc: Doc,
    *,
    ipa_mode: IPA_PRONUNCIATION_MODE = "attic_5c_bce",
    prompt_builder: Optional[PromptBuilder] = None,
    prompt_profile: Optional[str] = None,
    prompt_digest: Optional[str] = None,
    fields: Optional[set[str]] = None,
    max_retries: int = 2,
    provenance_process: Optional[str] = None
) -> Doc

Sequential enrichment across sentences, optionally scoped by fields.

Source code in cltk/enrichment/utils.py
def generate_gpt_enrichment(
    doc: Doc,
    *,
    ipa_mode: IPA_PRONUNCIATION_MODE = "attic_5c_bce",
    prompt_builder: Optional[PromptBuilder] = None,
    prompt_profile: Optional[str] = None,
    prompt_digest: Optional[str] = None,
    fields: Optional[set[str]] = None,
    max_retries: int = 2,
    provenance_process: Optional[str] = None,
) -> Doc:
    """Sequential enrichment across sentences, optionally scoped by fields.

    One backend client is created from ``doc.backend`` / ``doc.model`` and
    reused for every sentence. Each non-empty sentence is sent through
    ``generate_enrichment_for_sentence``; idiom spans and token usage are
    accumulated and written back to ``doc`` at the end.

    Args:
        doc: Document to enrich. Must have ``words``, ``backend`` and ``model``.
        ipa_mode: Pronunciation mode forwarded to the per-sentence call.
        prompt_builder: Optional custom prompt, forwarded unchanged.
        prompt_profile: Optional profile label, forwarded unchanged.
        prompt_digest: Optional prompt digest, forwarded unchanged.
        fields: Optional set of field names limiting what is applied. Idiom
            spans are stored on the doc only when this is ``None`` or
            contains ``"idioms"``.
        max_retries: Retry budget passed to the client. A non-``None``
            ``max_retries`` on the backend config overrides this argument.
        provenance_process: Optional process name, forwarded unchanged.

    Returns:
        The same ``doc`` object, after enrichment.

    Raises:
        CLTKException: If the doc has no words, backend or model, if the
            model is not among the supported OpenAI/Mistral models, or if
            the backend is not one of the supported backends.
    """
    log = bind_from_doc(doc)
    if not doc.words:
        msg = "Doc must contain tokens (with morph + dependency) before enrichment."
        log.error(msg)
        raise CLTKException(msg)
    if not doc.backend:
        msg_backend = "Doc must set `.backend` to use enrichment."
        log.error(msg_backend)
        raise CLTKException(msg_backend)
    if not doc.model:
        msg_model = "Doc missing `.model`. Set to a supported model for enrichment."
        log.error(msg_model)
        raise CLTKException(msg_model)

    # A retry count set on the backend config takes precedence over the argument.
    backend_config = _get_backend_config(doc)
    if backend_config and getattr(backend_config, "max_retries", None) is not None:
        max_retries = int(getattr(backend_config, "max_retries"))

    # Reuse one client across all sentences
    client: Any
    if doc.backend == "openai":
        if doc.model not in get_args(AVAILABLE_OPENAI_MODELS):
            msg_unsupported_backend_version: str = (
                f"Doc has unsupported `.model`: {doc.model}. "
                f"Supported versions are: {get_args(AVAILABLE_OPENAI_MODELS)}."
            )
            log.error(msg_unsupported_backend_version)
            raise CLTKException(msg_unsupported_backend_version)
        # Only a config of the matching type is consulted; otherwise defaults apply.
        openai_cfg = (
            backend_config if isinstance(backend_config, OpenAIBackendConfig) else None
        )
        client = OpenAIConnection(
            model=cast(AVAILABLE_OPENAI_MODELS, doc.model),
            api_key=getattr(openai_cfg, "api_key", None),
            temperature=getattr(openai_cfg, "temperature", 1.0),
        )
    elif doc.backend in ("ollama", "ollama-cloud"):
        ollama_cfg = (
            backend_config if isinstance(backend_config, OllamaBackendConfig) else None
        )
        host = None
        if ollama_cfg:
            # ``base_url`` wins over ``host`` when both are set.
            host = ollama_cfg.base_url or ollama_cfg.host
        client = OllamaConnection(
            model=str(doc.model),
            use_cloud=doc.backend == "ollama-cloud",
            host=host,
            api_key=getattr(ollama_cfg, "api_key", None),
            temperature=getattr(ollama_cfg, "temperature", None),
            top_p=getattr(ollama_cfg, "top_p", None),
            num_ctx=getattr(ollama_cfg, "num_ctx", None),
            num_predict=getattr(ollama_cfg, "num_predict", None),
            options=getattr(ollama_cfg, "options", None),
        )
    elif doc.backend == "mistral":
        if doc.model not in get_args(AVAILABLE_MISTRAL_MODELS):
            msg_unsupported_mistral_version: str = (
                f"Doc has unsupported `.model`: {doc.model}. "
                f"Supported versions are: {get_args(AVAILABLE_MISTRAL_MODELS)}."
            )
            log.error(msg_unsupported_mistral_version)
            raise CLTKException(msg_unsupported_mistral_version)
        mistral_cfg = (
            backend_config if isinstance(backend_config, MistralBackendConfig) else None
        )
        client = MistralConnection(
            model=cast(AVAILABLE_MISTRAL_MODELS, doc.model),
            api_key=getattr(mistral_cfg, "api_key", None),
            temperature=getattr(mistral_cfg, "temperature", 1.0),
        )
    else:
        # Log before raising, like every other validation failure above.
        msg_unsupported_backend = f"Unsupported backend for enrichment: {doc.backend}."
        log.error(msg_unsupported_backend)
        raise CLTKException(msg_unsupported_backend)

    genai_total_tokens = {"input": 0, "output": 0, "total": 0}
    all_idioms: list[IdiomSpan] = []

    # Prefer sentence grouping from token annotations
    sentences: list[Sentence]
    if doc.sentences:
        sentences = doc.sentences
    else:
        # Fallback: treat all words as one sentence
        sentences = [Sentence(words=doc.words, index=0)]

    for sent in sentences:
        if not sent.words:
            continue
        sent_idx = getattr(sent, "index", None)
        # Sentences without an index are processed as sentence 0.
        effective_idx = sent_idx if sent_idx is not None else 0
        updated_words, idioms, usage = generate_enrichment_for_sentence(
            doc=doc,
            sentence_idx=effective_idx,
            words=sent.words,
            client=client,
            ipa_mode=ipa_mode,
            max_retries=max_retries,
            prompt_builder=prompt_builder,
            prompt_profile=prompt_profile,
            prompt_digest=prompt_digest,
            fields=fields,
            provenance_process=provenance_process,
        )
        # updated_words are references into doc.words via doc.sentences; no reassignment needed
        all_idioms.extend(idioms)
        for k in genai_total_tokens:
            genai_total_tokens[k] += usage.get(k, 0)
        # Lazy %-style arguments, matching the other log calls in this module.
        bind_from_doc(doc, sentence_idx=sent_idx).info(
            "[enrich] Completed enrichment for sentence #%d", effective_idx + 1
        )

    # Store idiom spans at doc level only when idioms are included.
    if fields is None or "idioms" in fields:
        doc.idiom_spans = all_idioms
    _update_doc_genai_stage(doc, stage="enrich", stage_tokens=genai_total_tokens)
    log.info(
        "[enrich] Completed enrichment: %d tokens across %d sentences",
        len(doc.words),
        len(sentences),
    )
    return doc

generate_gpt_enrichment_concurrent

generate_gpt_enrichment_concurrent(
    doc: Doc,
    *,
    ipa_mode: IPA_PRONUNCIATION_MODE = "attic_5c_bce",
    prompt_builder: Optional[PromptBuilder] = None,
    prompt_profile: Optional[str] = None,
    prompt_digest: Optional[str] = None,
    fields: Optional[set[str]] = None,
    max_retries: int = 2,
    provenance_process: Optional[str] = None
) -> Doc

Safely call enrichment even when an event loop is running, honoring field filters.

Source code in cltk/enrichment/utils.py
def generate_gpt_enrichment_concurrent(
    doc: Doc,
    *,
    ipa_mode: IPA_PRONUNCIATION_MODE = "attic_5c_bce",
    prompt_builder: Optional[PromptBuilder] = None,
    prompt_profile: Optional[str] = None,
    prompt_digest: Optional[str] = None,
    fields: Optional[set[str]] = None,
    max_retries: int = 2,
    provenance_process: Optional[str] = None,
) -> Doc:
    """Safely call enrichment even when an event loop is running, honoring field filters.

    When the calling thread has no running asyncio event loop,
    ``generate_gpt_enrichment`` is called directly. Otherwise it is run in a
    single worker thread and this function blocks until that thread finishes.

    Args:
        doc: Document to enrich.
        ipa_mode: Forwarded to ``generate_gpt_enrichment``.
        prompt_builder: Forwarded to ``generate_gpt_enrichment``.
        prompt_profile: Forwarded to ``generate_gpt_enrichment``.
        prompt_digest: Forwarded to ``generate_gpt_enrichment``.
        fields: Forwarded to ``generate_gpt_enrichment``.
        max_retries: Forwarded to ``generate_gpt_enrichment``.
        provenance_process: Forwarded to ``generate_gpt_enrichment``.

    Returns:
        The ``Doc`` returned by ``generate_gpt_enrichment``.

    Raises:
        Any exception raised by ``generate_gpt_enrichment`` propagates to the
        caller on both paths (``Future.result()`` re-raises it).
    """
    log = bind_from_doc(doc)

    def _runner() -> Doc:
        """Run sequential enrichment with the arguments captured above."""
        return generate_gpt_enrichment(
            doc,
            ipa_mode=ipa_mode,
            prompt_builder=prompt_builder,
            prompt_profile=prompt_profile,
            prompt_digest=prompt_digest,
            fields=fields,
            max_retries=max_retries,
            provenance_process=provenance_process,
        )

    # Only the loop probe lives in the try block. Running enrichment inside
    # the ``except`` handler would attach the probe's RuntimeError as implicit
    # context to any enrichment failure and clutter its traceback.
    try:
        asyncio.get_running_loop()
    except RuntimeError:
        loop_is_running = False
    else:
        loop_is_running = True

    if not loop_is_running:
        log.info("[async-wrap] No running event loop; using direct enrichment.")
        return _runner()

    with concurrent.futures.ThreadPoolExecutor(max_workers=1) as ex:
        fut = ex.submit(_runner)
        result = fut.result()
        log.info("[async-wrap] Completed enrichment in worker thread")
        return result