Skip to content

glottolog

Utilities for resolving CLTK language data.

This module resolves user-supplied keys (ISO codes, names, Glottolog IDs) into (Language, Optional[Dialect]) pairs using the curated language data in languages.py.

HISTORIC_CUTOFF_YEAR module-attribute

HISTORIC_CUTOFF_YEAR: int = 1700

get_dialect

get_dialect(key: str) -> tuple[Language, Dialect]

Return (Language, Dialect) by dialect glottocode or exact dialect name.

Dialect resolution requires a Glottolog dialect ID or an exact dialect name. ISO 639-3 codes are language-level and cannot identify dialects.

Parameters:

  • key (str) –

    Dialect glottocode or exact dialect name.

Returns:

  • tuple[Language, Dialect]

    A tuple of (parent Language, matching Dialect).

Raises:

  • KeyError

    If the key refers to a language, or no dialect can be found (with hints on ambiguous names).

Source code in cltk/languages/glottolog.py
def get_dialect(key: str) -> tuple[Language, Dialect]:
    """Return ``(Language, Dialect)`` by dialect glottocode or exact dialect name.

    Dialect resolution requires a Glottolog dialect ID or an exact dialect name.
    ISO 639-3 codes are language-level and cannot identify dialects.

    Args:
      key: Dialect glottocode or exact dialect name.

    Returns:
      A tuple of (parent ``Language``, matching ``Dialect``).

    Raises:
      KeyError: If the key refers to a language, or no dialect can be found
        (with hints on ambiguous names).

    """
    logger.debug(f"Looking up dialect for key='{key}'")
    langs: dict[str, Language] = LANGUAGES
    idx = _build_indices()

    k = key.lower()

    if key in langs or k in langs:
        msg = f"'{key}' is a language glottocode. Use get_language('{key}')."
        glog(k).info(msg)
        raise KeyError(msg)

    if k in idx["by_iso"]:
        lang_id = idx["by_iso"][k]
        lang = langs[lang_id]
        msg = (
            f"'{key}' resolves to the language {lang.name} (glottolog_id={lang_id}). "
            f"Use get_language('{key}') for languages."
        )
        glog(lang_id).info(msg)
        raise KeyError(msg)

    # Direct dialect glottocode
    parent = idx["by_dialect"].get(key) or idx["by_dialect"].get(k)
    if parent:
        lang = langs[parent]
        target_id = key if key in idx["by_dialect"] else k
        for d in lang.dialects:
            if d.glottolog_id == target_id:
                glog(key).debug(
                    f"Found dialect by id '{key}' under language '{lang.name}'"
                )
                return lang, d
        # Very unlikely: index points to parent but dialect not found; scan as fallback
        for L in langs.values():
            for d in L.dialects:
                if d.glottolog_id == target_id:
                    glog(key).debug(
                        f"Found dialect by id '{key}' under language '{L.name}' (fallback scan)"
                    )
                    return L, d
        msg = f"Dialect id '{key}' indexed but not found in model data."
        logger.error(msg)
        raise KeyError(msg)

    # Exact dialect name (lowercased) may map to multiple dialect ids
    hits = idx["by_name_dialect"].get(k, [])
    if hits:
        if len(hits) == 1:
            did = hits[0]
            parent = idx["by_dialect"].get(did)
            if parent and parent in langs:
                lang = langs[parent]
                dialect_match = next(
                    (d for d in lang.dialects if d.glottolog_id == did), None
                )
                if dialect_match:
                    glog(did).debug(
                        f"Found dialect by name '{key}' -> {dialect_match.name} (id={did}) under '{lang.name}'"
                    )
                    return lang, dialect_match
        # Ambiguous name → ask for a glottocode, list a few options
        options: list[str] = []
        for did in hits[:5]:
            p = idx["by_dialect"].get(did)
            if p and p in langs:
                lang = langs[p]
                dname = next(
                    (d.name for d in lang.dialects if d.glottolog_id == did), did
                )
                options.append(
                    f"{dname} (dialect id={did}, parent={lang.name}, glottolog_id={p})"
                )
        msg = (
            f"Ambiguous dialect name '{key}'. Please specify a dialect glottocode. "
            + ("Options: " + "; ".join(options) if options else "No options available.")
        )
        glog(key).info(msg)
        raise KeyError(msg)

    msg = f"No dialect found for '{key}'"
    logger.error(msg)
    raise KeyError(msg)

get_language

get_language(
    lang_id: str,
) -> tuple[Language, Optional[Dialect]]

Resolve a language or dialect key.

Parameters:

  • lang_id (str) –

    Language glottocode/ISO/name or dialect glottocode/name.

Returns:

  • Language

    (Language, None) for a language; (Language, Dialect) for a

  • Optional[Dialect]

    dialect.

Raises:

  • KeyError

    If nothing matches (or name is ambiguous without a glottocode).

Source code in cltk/languages/glottolog.py
def get_language(lang_id: str) -> tuple[Language, Optional[Dialect]]:
    """Resolve a language or dialect key.

    Args:
      lang_id: Language glottocode/ISO/name or dialect glottocode/name.

    Returns:
      (``Language``, ``None``) for a language; (``Language``, ``Dialect``) for a
      dialect.

    Raises:
      KeyError: If nothing matches (or name is ambiguous without a glottocode).

    """
    logger.debug(f"Resolving languoid for key='{lang_id}'")

    langs: dict[str, Language] = LANGUAGES
    idx = _build_indices()

    k = lang_id.strip()
    if not k:
        msg = "Language identifier cannot be empty."
        logger.error(msg)
        raise KeyError(msg)
    k_lower = k.lower()

    # 1) Exact language glottocode
    if k in langs:
        L = langs[k]
        glog(k).debug(f"Resolved language by glottocode: {k} -> {L.name}")
        return L, None
    if k_lower in langs:
        L = langs[k_lower]
        glog(k_lower).debug(
            f"Resolved language by glottocode: {k} -> {L.name} (normalized)"
        )
        return L, None

    # 2) Exact dialect glottocode
    parent = idx["by_dialect"].get(k) or idx["by_dialect"].get(k_lower)
    if parent:
        lang = langs[parent]
        target_id = k if k in idx["by_dialect"] else k_lower
        for d in lang.dialects:
            if d.glottolog_id == target_id:
                glog(target_id).debug(
                    f"Resolved dialect by glottocode: {k} -> {d.name} (parent={lang.name})"
                )
                return lang, d
        # Fallback scan (should not be needed)
        for L in langs.values():
            for d in L.dialects:
                if d.glottolog_id == target_id:
                    glog(target_id).debug(
                        f"Resolved dialect by glottocode via fallback scan: {k} -> {d.name} (parent={L.name})"
                    )
                    return L, d
        msg = f"Dialect id '{k}' indexed but not found in model data."
        logger.error(msg)
        raise KeyError(msg)

    # 3) ISO (language only)
    g = idx["by_iso"].get(k_lower)
    if g:
        L = langs[g]
        glog(g).debug(f"Resolved language by ISO: {k} -> {L.name} (glottolog_id={g})")
        return L, None

    # 4) Exact language name/alt-name (may be ambiguous) → prefer historic
    hits_lang: list[str] = idx["by_name_lang"].get(k_lower, [])
    if hits_lang:
        if len(hits_lang) == 1:
            g0 = hits_lang[0]
            L = langs[g0]
            glog(g0).debug(
                f"Resolved language by name: '{k}' -> {L.name} (glottolog_id={g0})"
            )
            return L, None
        cands = [langs[gx] for gx in hits_lang if gx in langs]
        cands.sort(key=_historic_rank, reverse=True)
        best = cands[0]
        glog(best.glottolog_id).info(
            f"Ambiguous language name '{k}' matched {len(cands)} entries; "
            f"selecting '{best.name}' (glottolog_id={best.glottolog_id}) by historic preference."
        )
        return best, None

    # 5) Exact dialect name (may be ambiguous)
    hits_dia: list[str] = idx["by_name_dialect"].get(k_lower, [])
    if hits_dia:
        if len(hits_dia) == 1:
            did = hits_dia[0]
            parent = idx["by_dialect"].get(did)
            if parent and parent in langs:
                lang = langs[parent]
                dialect_match = next(
                    (d for d in lang.dialects if d.glottolog_id == did), None
                )
                if dialect_match:
                    glog(did).debug(
                        f"Resolved dialect by name: '{k}' -> {dialect_match.name} (id={did}, parent={lang.name})"
                    )
                    return lang, dialect_match
        # Ambiguous dialect name → ask for a glottocode, list options
        options: list[str] = []
        for did in hits_dia[:5]:
            p = idx["by_dialect"].get(did)
            if p and p in langs:
                lang = langs[p]
                dname = next(
                    (d.name for d in lang.dialects if d.glottolog_id == did), did
                )
                options.append(
                    f"{dname} (dialect id={did}, parent={lang.name}, glottolog_id={p})"
                )
        msg = f"Ambiguous dialect name '{k}'. Please specify a dialect glottocode. " + (
            "Options: " + "; ".join(options) if options else "No options available."
        )
        glog(k).info(msg)
        raise KeyError(msg)

    # 6) No match
    msg = f"No language or dialect found for '{k}'"
    logger.error(msg)
    raise KeyError(msg)