Enrichment layer for GenAI-driven annotations (glosses, IPA, idioms).
GenAIEnrichmentProcess
Bases: Process
Language-agnostic enrichment process using a generative GPT model (legacy).
glottolog_id
class-attribute
instance-attribute
glottolog_id: Optional[str] = None
process_id
class-attribute
process_id: str = 'enrichment.genai'
enrichment_fields
class-attribute
enrichment_fields: Optional[set[str]] = None
prompt_template_id
class-attribute
prompt_template_id: Optional[str] = None
prompt_builder
class-attribute
instance-attribute
prompt_builder: Optional[EnrichmentPromptBuilder] = None
ipa_mode
class-attribute
instance-attribute
ipa_mode: IPA_PRONUNCIATION_MODE = 'attic_5c_bce'
prompt_profile
class-attribute
instance-attribute
prompt_profile: Optional[str] = None
prompt_version
class-attribute
instance-attribute
prompt_version: Optional[str] = None
algorithm
cached
property
algorithm: Callable[..., Doc]
Return the enrichment generation function for this process.
run
run(input_doc: Doc) -> Doc
Run the configured GPT enrichment workflow.
Source code in cltk/enrichment/processes.py
def run(self, input_doc: Doc) -> Doc:
    """Run the configured GPT enrichment workflow.

    The input document is copied with ``copy`` and validated, a prompt
    builder is resolved, and the copy is handed to ``self.algorithm``
    together with this process's enrichment settings.

    Prompt builder resolution:
        * ``self.prompt_builder`` is used as-is when it is not ``None``.
        * Otherwise, when ``self.prompt_profile`` is truthy, a template is
          fetched from ``PromptProfileRegistry`` (keyed by
          ``self.prompt_template_id``, falling back to ``self.process_id``)
          and wrapped in a small builder function; the template's
          ``digest`` is forwarded as ``prompt_digest``.
        * Otherwise ``None`` is passed for both builder and digest.

    Args:
        input_doc: Document to enrich. Its ``normalized_text`` must be
            truthy.

    Returns:
        Whatever ``self.algorithm`` returns for the copied document.

    Raises:
        ValueError: If the document has no ``normalized_text`` (this case
            is also logged through ``bind_from_doc``), or if
            ``self.glottolog_id`` is ``None``.
    """
    working_doc = copy(input_doc)

    # Guard clauses: refuse to run without text or without a language id.
    if not working_doc.normalized_text:
        error_text = "Doc must have `normalized_text`."
        bind_from_doc(working_doc).error(error_text)
        raise ValueError(error_text)
    if self.glottolog_id is None:
        raise ValueError("glottolog_id must be set for enrichment.")

    builder = self.prompt_builder
    digest = None

    # Only fall back to a profile-based template when no explicit builder
    # was configured on the process.
    if builder is None and self.prompt_profile:
        lookup_id = self.prompt_template_id or self.process_id
        resolved_template = PromptProfileRegistry.get_prompt(
            self.prompt_profile, lookup_id, self.prompt_version
        )
        digest = resolved_template.digest

        def _builder(
            lang: str,
            table: str,
            ipa_mode: IPA_PRONUNCIATION_MODE,
            _template: PromptTemplate = resolved_template,
        ) -> PromptInfo:
            """Build prompt info from template and parameters."""
            return build_prompt_info(
                _template,
                lang_or_dialect_name=lang,
                token_table=table,
                ipa_mode=ipa_mode,
            )

        builder = _builder

    enrich = self.algorithm
    return enrich(
        working_doc,
        ipa_mode=self.ipa_mode,
        prompt_builder=builder,
        prompt_profile=self.prompt_profile,
        prompt_digest=digest,
        fields=self.enrichment_fields,
        provenance_process=f"{self.process_id}:{self.__class__.__name__}",
    )
|