def doc_to_feature_table(
doc: Doc,
*,
include_provenance: bool = False,
include_confidence: bool = False,
) -> Table:
"""Return a ``pyarrow.Table`` of POS, morphology, and dependency features.
- Raises ValueError when ``Doc.words`` is missing or empty.
- Writes an empty row (all ``None``) when a list entry is ``None``.
- Ignores ``Word.xpos``.
- Adds tree-shape features, document metadata, and sentence-level metrics.
- Requires ``pyarrow`` to serialize downstream. Example::
>>> table = doc_to_feature_table(doc)
>>> import pyarrow.parquet as pq
>>> pq.write_table(table, "doc_features.parquet")
Feature Glossary (selected)
- head_upos: UPOS tag of the head token (empty for root)
- dir_to_head: Direction to head within sentence: ``L``, ``R``, or ``ROOT``
- dist_to_head: Absolute token distance to head (0 for root)
- dist_to_head_norm: ``dist_to_head`` normalized by sentence length
- child_count: Number of dependents of the token (out-degree)
- left_child_count / right_child_count: Dependents to the left/right of the token
- is_leaf: ``1`` if token has no dependents else ``0``
- depth_from_root / path_len_to_root: Steps from root to token (root = 0)
- height_to_leaf: Max steps from token down to any leaf (leaf = 0)
- subtree_size: Tokens in token’s subtree (including the token itself)
- subtree_span_width: Index span width covering the subtree (max − min + 1)
- subtree_gap: ``subtree_span_width - subtree_size`` (non‑zero suggests discontinuity)
- crossing_arcs: Number of arcs that cross this token’s arc
- parent_branching_factor: Number of children of the token’s parent
- sibling_index: Position among siblings by token order (0‑based)
- sibling_count: Number of siblings (``parent_branching_factor - 1``)
- is_root: ``1`` if root token else ``0``
- valid_head: ``1`` if token’s head index is valid else ``0``
Sentence-level (repeated per token)
- sent_root_index: 1‑based index of the root in sentence order
- sent_tree_depth: Max ``depth_from_root`` within the sentence
- sent_avg_branching_nonleaf: Average children among non‑leaf nodes
- sent_crossing_arcs: Number of crossing‑arc pairs in the sentence
- sent_is_projective: ``1`` if no crossings, else ``0``
- sent_len: Sentence length in tokens
"""
words: Optional[list[Optional[Word]]] = getattr(doc, "words", None)
if not words:
raise ValueError("Doc.words must be a non-empty list.")
# Collect UD feature keys across the document
feature_keys: set[str] = set()
for w in words:
if not w:
continue
feats = getattr(w, "features", None)
feature_list = getattr(feats, "features", None)
if not feature_list:
continue
for feat in feature_list:
key = getattr(feat, "key", None)
if key:
feature_keys.add(str(key))
sorted_feature_keys: list[str] = sorted(feature_keys)
metadata_map_raw = getattr(doc, "metadata", {}) or {}
metadata_map: dict[str, Any]
if not isinstance(metadata_map_raw, dict):
metadata_map = dict(metadata_map_raw)
else:
metadata_map = metadata_map_raw
metadata_keys: list[str] = sorted(str(k) for k in metadata_map.keys())
# Group by sentence index (may be None)
sent_groups: dict[Optional[int], list[tuple[int, Word]]] = {}
for doc_idx, w in enumerate(words):
if w is None:
continue
sidx = getattr(w, "index_sentence", None)
sent_groups.setdefault(sidx, []).append((doc_idx, w))
# Derived features per token, keyed by doc index
dep_feats_by_doc_idx: dict[int, dict[str, Union[str, int, float, None]]] = {}
def _cross(a1: int, a2: int, b1: int, b2: int) -> bool:
"""Return True if the arcs (a1, a2) and (b1, b2) cross."""
x1, x2 = (a1, a2) if a1 <= a2 else (a2, a1)
y1, y2 = (b1, b2) if b1 <= b2 else (b2, b1)
return (x1 < y1 < x2 < y2) or (y1 < x1 < y2 < x2)
for sidx, entries in sent_groups.items():
# Sort sentence tokens by in-sentence order: prefer index_token else doc order
def sort_key(item: tuple[int, Word]) -> int:
"""Order tokens within a sentence by index_token, then document index."""
_, w = item
tok = getattr(w, "index_token", None)
return int(tok) if tok is not None else item[0]
entries_sorted = sorted(entries, key=sort_key)
n = len(entries_sorted)
if n == 0:
continue
# Maps
local_to_doc: list[int] = [doc_i for doc_i, _ in entries_sorted]
# doc_to_local: dict[int, int] = {doc_i: i for i, doc_i in enumerate(local_to_doc)}
upos_by_local: list[str] = [""] * n
lemma_by_local: list[str] = [""] * n
form_by_local: list[str] = [""] * n
head_local: list[Optional[int]] = [None] * n
depcode_by_local: list[str] = [""] * n
index_token_to_local: dict[int, int] = {}
for i, (_, w) in enumerate(entries_sorted):
tok = getattr(w, "index_token", None)
if isinstance(tok, int):
index_token_to_local[tok] = i
for i, (_, w) in enumerate(entries_sorted):
upos = getattr(getattr(w, "upos", None), "tag", None) or ""
upos_by_local[i] = str(upos)
lemma_by_local[i] = getattr(w, "lemma", "") or ""
form_by_local[i] = getattr(w, "string", "") or ""
dep = getattr(w, "dependency_relation", None)
if dep:
code = getattr(dep, "code", None)
subtype = getattr(dep, "subtype", None)
depcode_by_local[i] = (
f"{code}:{subtype}"
if code and subtype
else (str(code) if code else "")
)
gov = getattr(w, "governor", None)
if isinstance(gov, int) and 0 <= gov < n:
head_local[i] = gov
elif isinstance(gov, int) and gov in index_token_to_local:
head_local[i] = index_token_to_local[gov]
else:
head_local[i] = None
# Children adjacency
children: list[list[int]] = [[] for _ in range(n)]
for i, h in enumerate(head_local):
if isinstance(h, int) and 0 <= h < n:
children[h].append(i)
for kids in children:
kids.sort()
# Depth from root and cycle detection
depths: list[Optional[int]] = [None] * n
for i in range(n):
if depths[i] is not None:
continue
seen: set[int] = set()
cur = i
d = 0
while True:
if cur in seen:
for node in seen:
depths[node] = None
break
seen.add(cur)
h = head_local[cur]
if h is None:
# Assign precise depths by walking from i upward
dd = 0
node = i
visited2: set[int] = set()
while node not in visited2:
visited2.add(node)
depths[node] = dd
h2 = head_local[node]
if h2 is None:
break
node = h2
dd += 1
break
cur = h
d += 1
if d > n + 5:
for node in seen:
depths[node] = None
break
# Subtree metrics (height, size, span)
from functools import lru_cache
@lru_cache(maxsize=None)
def subtree_metrics(
i: int, _stack: tuple[int, ...] = ()
) -> tuple[Optional[int], Optional[int], int, int]:
"""Compute height, size, and span bounds for a token's subtree."""
if i in _stack:
return None, None, i, i
kids = children[i]
if not kids:
return 0, 1, i, i
max_h: Optional[int] = 0
total_size = 1
mn = i
mx = i
for c in kids:
h, sz, c_mn, c_mx = subtree_metrics(c, _stack + (i,))
if h is None or sz is None:
max_h = None
else:
if max_h is not None:
max_h = max(max_h, 1 + h)
total_size += sz
mn = min(mn, c_mn)
mx = max(mx, c_mx)
return max_h, (None if max_h is None else total_size), mn, mx
height_to_leaf: list[Optional[int]] = [None] * n
subtree_size: list[Optional[int]] = [None] * n
span_min: list[int] = [0] * n
span_max: list[int] = [0] * n
for i in range(n):
h, sz, mn, mx = subtree_metrics(i)
height_to_leaf[i] = h
subtree_size[i] = sz
span_min[i] = mn
span_max[i] = mx
# Left/right child counts and leaf flags
left_child_count = [0] * n
right_child_count = [0] * n
child_count = [len(children[i]) for i in range(n)]
for i in range(n):
for c in children[i]:
if c < i:
left_child_count[i] += 1
elif c > i:
right_child_count[i] += 1
is_leaf = [1 if child_count[i] == 0 else 0 for i in range(n)]
# Distances and directions to head
dist_to_head = [0] * n
dir_to_head = ["ROOT"] * n
for i in range(n):
h = head_local[i]
if h is None:
dist_to_head[i] = 0
dir_to_head[i] = "ROOT"
else:
dist_to_head[i] = abs(i - h)
dir_to_head[i] = "L" if h < i else "R"
dist_to_head_norm = [(d / n) if n > 0 else 0.0 for d in dist_to_head]
# Sibling info
sibling_index = [0] * n
sibling_count = [0] * n
parent_branching_factor = [0] * n
for i in range(n):
h = head_local[i]
if h is None:
sibling_index[i] = 0
sibling_count[i] = 0
parent_branching_factor[i] = 0
else:
sibs = children[h]
parent_branching_factor[i] = len(sibs)
sibling_count[i] = max(0, len(sibs) - 1)
try:
sibling_index[i] = sibs.index(i)
except ValueError:
sibling_index[i] = 0
# Span metrics
subtree_span_width: list[int] = [0] * n
subtree_gap: list[Optional[int]] = [None] * n
for i in range(n):
width = span_max[i] - span_min[i] + 1
subtree_span_width[i] = width
size_val = subtree_size[i]
gap_val: Optional[int]
if size_val is None:
gap_val = None
else:
gap_val = width - size_val
subtree_gap[i] = gap_val
# Crossing arcs
arcs: list[tuple[int, int, int]] = []
for i in range(n):
h = head_local[i]
if h is None:
continue
lo, hi = (i, h) if i <= h else (h, i)
arcs.append((lo, hi, i))
crossing_per_token = [0] * n
total_crossings = 0
for a in range(len(arcs)):
a1, a2, ai = arcs[a]
for b in range(a + 1, len(arcs)):
b1, b2, bi = arcs[b]
if _cross(a1, a2, b1, b2):
total_crossings += 1
crossing_per_token[ai] += 1
crossing_per_token[bi] += 1
roots = [i for i, h in enumerate(head_local) if h is None]
sent_root_index_1b = (roots[0] + 1) if roots else 1
sent_tree_depth = max([d for d in depths if d is not None], default=0)
nonleaf = [i for i in range(n) if child_count[i] > 0]
sent_avg_branching_nonleaf = (
(sum(child_count[i] for i in nonleaf) / len(nonleaf)) if nonleaf else 0.0
)
sent_crossing_arcs = total_crossings
sent_is_projective = 1 if total_crossings == 0 else 0
sent_len = n
# Head attributes
head_upos = [""] * n
head_form = [""] * n
head_lemma = [""] * n
for i in range(n):
h = head_local[i]
if h is not None and 0 <= h < n:
head_upos[i] = upos_by_local[h]
head_form[i] = form_by_local[h]
head_lemma[i] = lemma_by_local[h]
# Child extremes
leftmost_child_index_1b = [""] * n
rightmost_child_index_1b = [""] * n
for i in range(n):
if children[i]:
leftmost_child_index_1b[i] = str(children[i][0] + 1)
rightmost_child_index_1b[i] = str(children[i][-1] + 1)
# Assign per-token features back by doc index
for local_i, doc_i in enumerate(local_to_doc):
head_idx = head_local[local_i]
depth_val = depths[local_i]
height_val = height_to_leaf[local_i]
subtree_size_val = subtree_size[local_i]
subtree_gap_val = subtree_gap[local_i]
if head_idx is None:
head_value: Optional[int] = None
dir_value = "ROOT"
else:
head_value = head_idx + 1
dir_value = "L" if head_idx < local_i else "R"
dep_feats_by_doc_idx[doc_i] = {
"token_index_sentence": local_i + 1,
"head": head_value,
"deprel": depcode_by_local[local_i],
"head_upos": head_upos[local_i],
"head_form": head_form[local_i],
"head_lemma": head_lemma[local_i],
"dir_to_head": dir_value,
"dist_to_head": dist_to_head[local_i],
"dist_to_head_norm": dist_to_head_norm[local_i],
"child_count": child_count[local_i],
"left_child_count": left_child_count[local_i],
"right_child_count": right_child_count[local_i],
"is_leaf": is_leaf[local_i],
"depth_from_root": depth_val,
"path_len_to_root": depth_val,
"height_to_leaf": height_val,
"subtree_size": subtree_size_val,
"subtree_span_width": subtree_span_width[local_i],
"subtree_gap": subtree_gap_val,
"crossing_arcs": crossing_per_token[local_i],
"parent_branching_factor": parent_branching_factor[local_i],
"sibling_index": sibling_index[local_i],
"sibling_count": sibling_count[local_i],
"is_root": 1 if head_local[local_i] is None else 0,
"valid_head": 1 if head_local[local_i] is not None else 0,
# Sentence-level metrics
"sent_root_index": sent_root_index_1b,
"sent_tree_depth": sent_tree_depth,
"sent_avg_branching_nonleaf": sent_avg_branching_nonleaf,
"sent_crossing_arcs": sent_crossing_arcs,
"sent_is_projective": sent_is_projective,
"sent_len": sent_len,
}
# Build header/order for downstream schema/rows
base_header: list[str] = [
"sentence_index",
"token_index",
"token_index_sentence",
"token",
"lemma",
"upos",
"head",
"deprel",
]
prov_header: list[str] = []
if include_provenance:
prov_header = [
"prov_lemma",
"prov_upos",
"prov_feats",
"prov_head",
"prov_deprel",
]
conf_header: list[str] = []
if include_confidence:
conf_header = [
"conf_lemma",
"conf_upos",
"conf_feats",
"conf_head",
"conf_deprel",
]
metadata_header = [f"metadata_{key}" for key in metadata_keys]
feat_header = [f"feat_{key}" for key in sorted_feature_keys]
# Dependency feature columns (token-level)
dep_extra_header: list[str] = [
"head_upos",
"head_form",
"head_lemma",
"dir_to_head",
"dist_to_head",
"dist_to_head_norm",
"child_count",
"left_child_count",
"right_child_count",
"is_leaf",
"depth_from_root",
"path_len_to_root",
"height_to_leaf",
"subtree_size",
"subtree_span_width",
"subtree_gap",
"crossing_arcs",
"parent_branching_factor",
"sibling_index",
"sibling_count",
"is_root",
"valid_head",
# Sentence-level features (repeated per token)
"sent_root_index",
"sent_tree_depth",
"sent_avg_branching_nonleaf",
"sent_crossing_arcs",
"sent_is_projective",
"sent_len",
]
header = (
base_header
+ prov_header
+ conf_header
+ metadata_header
+ feat_header
+ dep_extra_header
)
try:
import pyarrow as pa
except ImportError as exc: # pragma: no cover - optional dependency
raise ImportError(
"doc_to_feature_table() requires `pyarrow`. Install it via `pip install pyarrow`."
) from exc
def _build_schema(feature_keys: list[str], metadata_keys: list[str]) -> "pa.Schema":
"""Construct the Arrow schema for the feature table."""
base_fields = [
pa.field("sentence_index", pa.int64()),
pa.field("token_index", pa.int64()),
pa.field("token_index_sentence", pa.int64()),
pa.field("token", pa.string()),
pa.field("lemma", pa.string()),
pa.field("upos", pa.string()),
pa.field("head", pa.int64()),
pa.field("deprel", pa.string()),
]
prov_fields = (
[
pa.field("prov_lemma", pa.string()),
pa.field("prov_upos", pa.string()),
pa.field("prov_feats", pa.string()),
pa.field("prov_head", pa.string()),
pa.field("prov_deprel", pa.string()),
]
if include_provenance
else []
)
conf_fields = (
[
pa.field("conf_lemma", pa.float64()),
pa.field("conf_upos", pa.float64()),
pa.field("conf_feats", pa.float64()),
pa.field("conf_head", pa.float64()),
pa.field("conf_deprel", pa.float64()),
]
if include_confidence
else []
)
metadata_fields = [
pa.field(f"metadata_{key}", pa.string()) for key in metadata_keys
]
feat_fields = [pa.field(f"feat_{key}", pa.string()) for key in feature_keys]
dep_fields = [
pa.field("head_upos", pa.string()),
pa.field("head_form", pa.string()),
pa.field("head_lemma", pa.string()),
pa.field("dir_to_head", pa.string()),
pa.field("dist_to_head", pa.int64()),
pa.field("dist_to_head_norm", pa.float64()),
pa.field("child_count", pa.int64()),
pa.field("left_child_count", pa.int64()),
pa.field("right_child_count", pa.int64()),
pa.field("is_leaf", pa.int64()),
pa.field("depth_from_root", pa.int64()),
pa.field("path_len_to_root", pa.int64()),
pa.field("height_to_leaf", pa.int64()),
pa.field("subtree_size", pa.int64()),
pa.field("subtree_span_width", pa.int64()),
pa.field("subtree_gap", pa.int64()),
pa.field("crossing_arcs", pa.int64()),
pa.field("parent_branching_factor", pa.int64()),
pa.field("sibling_index", pa.int64()),
pa.field("sibling_count", pa.int64()),
pa.field("is_root", pa.int8()),
pa.field("valid_head", pa.int8()),
pa.field("sent_root_index", pa.int64()),
pa.field("sent_tree_depth", pa.int64()),
pa.field("sent_avg_branching_nonleaf", pa.float64()),
pa.field("sent_crossing_arcs", pa.int64()),
pa.field("sent_is_projective", pa.int8()),
pa.field("sent_len", pa.int64()),
]
return pa.schema(
base_fields
+ prov_fields
+ conf_fields
+ metadata_fields
+ feat_fields
+ dep_fields
)
schema = _build_schema(sorted_feature_keys, metadata_keys)
columns: dict[str, list[Any]] = {name: [] for name in header}
def _serialize_metadata_value(value: Any) -> Optional[str]:
"""Convert metadata values to JSON-safe strings where possible."""
if value is None:
return None
if isinstance(value, str):
return value
if isinstance(value, (int, float, bool)):
return str(value)
try:
return json.dumps(value, ensure_ascii=True, sort_keys=True)
except TypeError:
return str(value)
for doc_idx, word in enumerate(words):
if word is None:
for name in header:
columns[name].append(None)
continue
sentence_idx_raw = getattr(word, "index_sentence", None)
global_idx = getattr(word, "index_token", None)
if global_idx is None:
global_idx = doc_idx
upos_tag = getattr(getattr(word, "upos", None), "tag", "") or ""
dep_extra = dep_feats_by_doc_idx.get(doc_idx, {})
deprel_value_raw = dep_extra.get("deprel", "")
raw_head = dep_extra.get("head")
try:
head_value = int(raw_head) if raw_head is not None else None
except (TypeError, ValueError):
head_value = None
token_in_sentence = dep_extra.get("token_index_sentence")
deprel_value = str(deprel_value_raw) if deprel_value_raw is not None else ""
# Explode UD features for this word
feature_map: dict[str, str] = {}
feats_obj = getattr(word, "features", None)
feature_list = getattr(feats_obj, "features", None)
if feature_list:
for feat in feature_list:
key = getattr(feat, "key", None)
val = getattr(feat, "value", None)
if key and val:
feature_map[str(key)] = str(val)
row: list[Union[str, int, float, None]] = [
sentence_idx_raw,
global_idx,
token_in_sentence,
getattr(word, "string", "") or "",
getattr(word, "lemma", "") or "",
upos_tag,
head_value,
deprel_value,
]
if include_provenance:
sources = getattr(word, "annotation_sources", None) or {}
row.extend(
[
sources.get("lemma"),
sources.get("upos"),
sources.get("features"),
sources.get("governor"),
sources.get("dependency_relation"),
]
)
if include_confidence:
confidences = getattr(word, "confidence", None) or {}
def _conf_val(key: str) -> Optional[float]:
"""Return a float confidence for a key or None if invalid."""
try:
val = confidences.get(key)
return float(val) if val is not None else None
except (TypeError, ValueError):
return None
row.extend(
[
_conf_val("lemma"),
_conf_val("upos"),
_conf_val("features"),
_conf_val("governor"),
_conf_val("dependency_relation"),
]
)
row.extend(
_serialize_metadata_value(metadata_map.get(key)) for key in metadata_keys
)
row.extend(feature_map.get(key) for key in sorted_feature_keys)
for col in dep_extra_header:
row.append(dep_extra.get(col))
for name, value in zip(header, row):
columns[name].append(value)
arrays = [pa.array(columns[field.name], type=field.type) for field in schema]
return pa.Table.from_arrays(arrays, schema=schema)