Skip to content

conll

Code for ingesting, processing, and producing CoNLL-U formatted data.

words_to_conllu

words_to_conllu(words: Iterable[Word]) -> str

Serialize a flat list of Words to a single-sentence CoNLL-U string.

Source code in cltk/morphosyntax/conll.py
def words_to_conllu(words: Iterable[Word]) -> str:
    """Render ``words`` as the token rows of one CoNLL-U sentence.

    Every word becomes one tab-separated, 10-column row whose ID is the
    word's 1-based position in ``words``. FORM and LEMMA are read from the
    ``string`` and ``lemma`` attributes and fall back to ``"_"`` when the
    attribute is missing or falsy. XPOS, DEPS and MISC are always ``"_"``.
    HEAD and DEPREL are looked up under several alternative attribute names.

    Returns:
        The rows joined by newlines and followed by one trailing newline,
        or an empty string when ``words`` yields nothing.
    """

    def _probe(obj, names, accept):
        """Read ``names`` from ``obj`` in order until ``accept`` approves one.

        If no value is accepted, the value read for the last name is
        returned (``None`` when that attribute is absent).
        """
        found = None
        for name in names:
            found = getattr(obj, name, None)
            if accept(found):
                break
        return found

    head_attrs = ("head", "head_index", "governor", "head_id")
    deprel_attrs = (
        "deprel",
        "dep_rel",
        "dependency_relation",
        "relation",
        "ud_relation",
        "dependency_label",
        "dep_label",
    )

    rows: list[str] = []
    for position, token in enumerate(words, start=1):
        surface = getattr(token, "string", None) or "_"
        base_form = getattr(token, "lemma", None) or "_"
        upos_text = _upos_to_str(getattr(token, "upos", None))
        feats_text = _feats_to_str(getattr(token, "features", None))

        # HEAD: 0 is a legitimate value, so only None counts as "absent".
        governor = _probe(token, head_attrs, lambda value: value is not None)
        if governor is None:
            head_text = "_"
        else:
            try:
                head_text = str(int(governor))
            except Exception:
                # Anything int() cannot convert is written as unspecified.
                head_text = "_"

        # DEPREL: the first truthy candidate wins.
        relation = _probe(token, deprel_attrs, bool)
        deprel_text = _deprel_to_str(relation)

        rows.append(
            "\t".join(
                (
                    str(position),  # ID
                    surface,  # FORM
                    base_form,  # LEMMA
                    upos_text,  # UPOS
                    "_",  # XPOS
                    feats_text,  # FEATS
                    head_text,  # HEAD
                    deprel_text,  # DEPREL
                    "_",  # DEPS
                    "_",  # MISC
                )
            )
        )

    if not rows:
        return ""
    return "\n".join(rows) + "\n"

doc_to_conllu

doc_to_conllu(
    doc: Doc,
    *,
    include_provenance: bool = False,
    include_confidence: bool = False
) -> str

Serialize Doc.words to a single-sentence CoNLL-U string.

Source code in cltk/morphosyntax/conll.py
def doc_to_conllu(
    doc: Doc,
    *,
    include_provenance: bool = False,
    include_confidence: bool = False,
) -> str:
    """Serialize Doc.words to a single-sentence CoNLL-U string.

    With both flags left falsy the words are rendered by ``words_to_conllu``.
    If either flag is truthy the whole call is handed to
    ``cltk.utils.file_outputs.doc_to_conllu`` with the same flags.

    Args:
        doc: Document whose ``words`` attribute is serialized; a missing or
            falsy ``words`` is treated as an empty list.
        include_provenance: Forwarded to the ``file_outputs`` serializer.
        include_confidence: Forwarded to the ``file_outputs`` serializer.

    Returns:
        The CoNLL-U text produced by whichever serializer handled the call.
    """
    doc_log = bind_from_doc(doc)
    tokens = getattr(doc, "words", []) or []
    doc_log.debug("Serializing %s tokens to CoNLL-U", len(tokens))

    wants_annotations = include_provenance or include_confidence
    if not wants_annotations:
        return words_to_conllu(tokens)

    # Imported only on this path, as in the original layout.
    from cltk.utils.file_outputs import doc_to_conllu as _annotated_doc_to_conllu

    return _annotated_doc_to_conllu(
        doc,
        include_provenance=include_provenance,
        include_confidence=include_confidence,
    )

conllu_to_words

conllu_to_words(conllu: str) -> list[Word]

Parse a CoNLL-U string (single or multiple sentences) into a flat list[Word].

  • Ignores comment lines (#).
  • Skips multiword tokens (e.g., '3-4') and empty nodes (e.g., '5.1').
  • Maps FEATS using convert_pos_features_to_ud when possible.
  • Sets Word.index_token as 0-based (ID-1).
Source code in cltk/morphosyntax/conll.py
def conllu_to_words(conllu: str) -> list[Word]:
    """Parse a CoNLL-U string (single or multiple sentences) into a flat list[Word].

    - Ignores blank lines and comment lines (#).
    - Skips lines with fewer than 10 tab-separated columns.
    - Skips multiword tokens (e.g., '3-4') and empty nodes (e.g., '5.1').
    - Maps FEATS using convert_pos_features_to_ud when possible.
    - Sets Word.index_token as 0-based (ID-1).
    - Reads XPOS, DEPS and MISC but does not store them.

    Args:
        conllu: CoNLL-U formatted text.

    Returns:
        One ``Word`` per regular token line, in input order. Sentence
        boundaries are not preserved.
    """
    out: list[Word] = []
    remap_report = UDFeatureRemapReport()
    for raw_line in conllu.splitlines():
        line = raw_line.strip()
        if not line or line.startswith("#"):
            continue
        cols = line.split("\t")
        if len(cols) < 10:
            logger.debug("Skipping malformed CoNLL-U line (expected 10 cols): %r", line)
            continue

        # XPOS, DEPS and MISC are unpacked for position only.
        (
            tid,
            form,
            lemma,
            upos_raw,
            _xpos,
            feats_raw,
            head_raw,
            deprel_raw,
            _deps,
            _misc,
        ) = cols[:10]

        # Skip multiword tokens and empty nodes
        if "-" in tid or "." in tid:
            continue
        try:
            tid_i = int(tid)
        except ValueError:
            logger.debug("Skipping non-integer token ID: %r", tid)
            continue

        # "_" is the CoNLL-U placeholder for an unspecified field.
        form_value = None if form == "_" else form
        lemma_value = None if lemma == "_" else lemma

        # UPOS
        upos_obj: Optional[UDPartOfSpeechTag] = None
        if upos_raw and upos_raw != "_":
            try:
                upos_obj = UDPartOfSpeechTag(tag=upos_raw)
            except Exception as e:
                logger.debug("Could not build UDPartOfSpeechTag(%r): %s", upos_raw, e)
                upos_obj = None

        # FEATS
        feats_obj: Optional[UDFeatureTagSet] = None
        if feats_raw and feats_raw != "_":
            try:
                feats_obj = convert_pos_features_to_ud(
                    feats_raw=feats_raw,
                    remap_report=remap_report,
                    source_word=form_value,
                )
            except Exception as e:
                logger.debug("Could not parse FEATS %r: %s", feats_raw, e)

        # Build Word (index_token 0-based)
        try:
            w = Word(
                string=form_value,
                index_token=tid_i - 1,
                lemma=lemma_value,
                upos=upos_obj,
            )
        except Exception as e:
            logger.debug(
                "Full Word constructor failed for token %r; "
                "falling back to minimal constructor: %s",
                tid,
                e,
            )
            # Fallback minimal constructor
            w = Word(string=form_value, index_token=tid_i - 1)
            # Re-attach lemma and upos, which the minimal constructor does
            # not receive. Best-effort: Word may refuse assignment.
            for attr_name, attr_value in (("lemma", lemma_value), ("upos", upos_obj)):
                try:
                    setattr(w, attr_name, attr_value)
                except Exception:
                    pass

        # Assign features if possible
        try:
            setattr(w, "features", feats_obj)
        except Exception:
            pass

        # Optional dependency info
        # HEAD
        head_val: Optional[int] = None
        if head_raw and head_raw != "_":
            try:
                head_val = int(head_raw)
            except ValueError:
                head_val = None
        # Store HEAD under the first attribute name that Word accepts.
        for cand in ("head", "head_index", "governor", "head_id"):
            try:
                setattr(w, cand, head_val)
                break
            except Exception:
                continue
        # DEPREL
        if deprel_raw and deprel_raw != "_":
            try:
                setattr(w, "deprel", deprel_raw)
            except Exception:
                pass

        out.append(w)

    remap_report.log_summary(label="Unmapped UD feature pairs from CoNLL-U")
    return out