Source code for cltk.utils.feature_extraction
"""Helper functions for extracting features from CLTK data structures,
especially for the purpose of preparing data for machine learning.
"""
from typing import Optional, Union
import numpy as np
from cltk.core.data_types import Doc, Word
from cltk.core.exceptions import CLTKException
from cltk.dependency.utils import ( # get_governor_relationship,; get_governor_word,
get_governor_word2,
)
from cltk.morphology.utils import get_features, get_pos
def cltk_doc_to_features_table(
    cltk_doc: Doc,
) -> tuple[list[str], list[list[Union[str, int, float, None]]]]:
    """Flatten a CLTK ``Doc`` into a header plus one feature row per word.

    For every word of every sentence, one row is built containing, in order:
    the token string, lemma, embedding, stopword flag, named-entity value,
    part-of-speech label, the word's morphological features, the
    part-of-speech label and morphological features of the word's
    syntactic governor, and the word's dependency relation.

    This expects the default features available for Greek and Latin
    (word embeddings, morphology, syntax, lemmata).

    TODO: Fail gracefully when fewer features are available in the
    input ``Doc``.

    Args:
        cltk_doc: The processed document whose ``sentences`` are read.

    Returns:
        A 2-tuple ``(variable_names, rows)``. ``variable_names`` holds the
        column labels (as computed for the last word processed) and
        ``rows`` holds one list of feature values per word.

    Raises:
        CLTKException: If ``cltk_doc`` has no sentences, or if no word was
            found so that no column labels could be created.
        AssertionError: If the number of column labels differs from the
            number of values in the first row.
    """
    if len(cltk_doc.sentences) < 1:
        raise CLTKException("Must contain at least one ``Doc.sentence``.")
    list_of_list_features: list[list[Union[str, int, float, None, np.ndarray]]] = list()
    variable_names: Optional[list[str]] = None
    for sentence in cltk_doc.sentences:
        for word in sentence:
            # NOTE: the labels are rebuilt for every word although they only
            # need to be computed once; the final sanity check compares them
            # with the first row only.
            variable_names = [
                "string",
                "lemma",
                "embedding",
                "is_stop",
                # Fixed: this column was previously mislabelled "lemma",
                # producing a duplicate label for a different value.
                "named_entity",
                "pos",
            ]
            word_features_list: list[Union[str, int, float, None, np.ndarray]] = [
                word.string,
                word.lemma,
                word.embedding,
                word.stop,
                word.named_entity,
                # note: incorrectly labels upper-cased words as proper_noun,
                # eg 'Βίβλος'
                get_pos(word=word),
            ]

            # Morphological features of the word itself.
            feature_names, features_present = get_features(word=word)
            word_features_list += features_present
            variable_names += feature_names

            # Dependency info: describe the governing word (declared
            # ``Optional``, so it may be ``None``) by its POS label and
            # its morphological features.
            governing_word: Optional[Word] = get_governor_word2(
                word=word, sentence_words=sentence.words
            )
            word_features_list.append(get_pos(word=governing_word))
            # The label is "governing_word" although the value stored is the
            # governor's part-of-speech label; kept for compatibility.
            variable_names.append("governing_word")
            feature_names_governor, features_present_governor = get_features(
                word=governing_word, prepend_to_label="governor_"
            )
            word_features_list += features_present_governor
            variable_names += feature_names_governor

            # Type of the dependency edge between the word and its governor.
            word_features_list.append(word.dependency_relation)
            variable_names.append("governing_relationship")

            list_of_list_features.append(word_features_list)
    if not variable_names:
        # Reached when no sentence yielded any word.
        raise CLTKException(
            "Input data problem, variable ``variable_names`` not created."
        )
    assert len(variable_names) == len(
        list_of_list_features[0]
    ), f"The names of variables ({len(variable_names)}) does not match then actual number of variables ({len(list_of_list_features[0])}). These must be equal."
    return variable_names, list_of_list_features