Source code for cltk.utils.feature_extraction

"""Helper functions for extracting features from CLTK data structures,
especially for the purpose of preparing data for machine learning.
"""

from typing import Optional, Union

import numpy as np

from cltk.core.data_types import Doc, Word
from cltk.core.exceptions import CLTKException
from cltk.dependency.utils import (  # get_governor_relationship,; get_governor_word,
    get_governor_word2,
)
from cltk.morphology.utils import get_features, get_pos


def cltk_doc_to_features_table(
    cltk_doc: Doc,
) -> tuple[list[str], list[list[Union[str, int, float, None, np.ndarray]]]]:
    """Flatten a CLTK ``Doc`` into a header row plus one feature row per word.

    For every word of every sentence, one row is built from, in order:
    the token string, lemma, embedding, stopword flag, named-entity
    value, POS label, the word's morphological features, the POS label
    and morphological features of its governing word, and its
    dependency relation.

    This expects the default features available for Greek and Latin
    (word embeddings, morphology, syntax, lemmata).

    TODO: Fail gracefully when fewer features are available in the
    input ``Doc``.

    Args:
        cltk_doc: The processed document whose ``sentences`` are read.

    Returns:
        A ``(variable_names, rows)`` tuple: ``variable_names`` holds the
        column labels and ``rows`` holds one list of values per word.

    Raises:
        CLTKException: If ``cltk_doc`` has no sentences, or if no word
            was found so that no column labels could be built.
        AssertionError: If the number of column labels differs from the
            number of values in the first row.
    """

    if not cltk_doc.sentences:
        raise CLTKException("Must contain at least one ``Doc.sentence``.")

    list_of_list_features: list[list[Union[str, int, float, None, np.ndarray]]] = []

    variable_names: Optional[list[str]] = None
    for sentence in cltk_doc.sentences:
        for word in sentence:
            # NOTE: the header is rebuilt for every word although it only
            # needs to be built once; the labels of the last word processed
            # are the ones returned.
            # Simple per-word attributes, paired with their column labels.
            word_features_list: list[Union[str, int, float, None, np.ndarray]] = [
                word.string,
                word.lemma,
                word.embedding,
                word.stop,
                word.named_entity,
            ]
            variable_names = [
                "string",
                "lemma",
                "embedding",
                "is_stop",
                # Previously mislabeled "lemma", which duplicated the
                # lemma column's name.
                "named_entity",
            ]

            # Morphological info of the word itself.
            # NOTE: upper-cased words are incorrectly labeled as
            # proper_noun, e.g. 'Βίβλος'.
            word_features_list.append(get_pos(word=word))
            variable_names.append("pos")
            feature_names, features_present = get_features(word=word)
            word_features_list += features_present
            variable_names += feature_names

            # Dependency info: POS and morphology of the governing word.
            governing_word: Optional[Word] = get_governor_word2(
                word=word, sentence_words=sentence.words
            )
            word_features_list.append(get_pos(word=governing_word))
            # This column holds the governor's POS label; the historical
            # label is kept so existing consumers keep working.
            variable_names.append("governing_word")
            feature_names_governor, features_present_governor = get_features(
                word=governing_word, prepend_to_label="governor_"
            )
            word_features_list += features_present_governor
            variable_names += feature_names_governor

            # Relation between the word and its governor.
            word_features_list.append(word.dependency_relation)
            variable_names.append("governing_relationship")

            list_of_list_features.append(word_features_list)

    if not variable_names:
        raise CLTKException(
            "Input data problem, variable ``variable_names`` not created."
        )
    assert len(variable_names) == len(
        list_of_list_features[0]
    ), f"The names of variables ({len(variable_names)}) does not match then actual number of variables ({len(list_of_list_features[0])}). These must be equal."

    return variable_names, list_of_list_features
Source code for cltk.utils.feature_extraction

The Classical Language Toolkit

Navigation

Related Topics