Source code for cltk.embeddings.sentence

"""For computing embeddings for lists of words."""

from typing import Union, ValuesView

import numpy as np
from sklearn.decomposition import TruncatedSVD

from cltk.core import Sentence


def rescale_idf(val: float, min_idf: float, max_idf: float) -> float:
    """Linearly rescale an idf value relative to the range ``[min_idf, max_idf]``.

    ``min_idf`` maps to ``0.0`` and ``max_idf`` maps to ``1.0``. No clamping is
    applied, so a ``val`` outside the range maps outside ``[0, 1]``.

    :param val: the idf value to rescale
    :param min_idf: the idf value that should map to ``0.0``
    :param max_idf: the idf value that should map to ``1.0``
    :return: the rescaled value

    Note: the result is undefined when ``max_idf == min_idf`` (the divisor is
    zero; plain Python floats raise ``ZeroDivisionError``).
    """
    return (val - min_idf) / (max_idf - min_idf)


def compute_pc(x: np.ndarray, npc: int = 1) -> np.ndarray:
    """Compute the principal components. DO NOT MAKE THE DATA ZERO MEAN!

    The components are obtained by fitting a ``TruncatedSVD`` on ``x`` as given;
    the data is intentionally not centered beforehand.

    :param x: X[i,:] is a data point
    :param npc: number of principal components to compute
    :return: component_[i,:] is the i-th pc

    This has been adapted from the SIF paper code: https://openreview.net/pdf?id=SyK00v5xx
    """
    # A fixed random_state keeps the randomized SVD solver reproducible across calls.
    svd: TruncatedSVD = TruncatedSVD(n_components=npc, n_iter=7, random_state=0)
    svd.fit(x)
    return svd.components_


def remove_pc(x: np.ndarray, npc: int = 1) -> np.ndarray:
    """Remove the projection on the principal components.

    Calling this on a collection of sentence embeddings, prior to comparison,
    may improve accuracy.

    :param x: X[i,:] is a data point
    :param npc: number of principal components to remove
    :return: XX[i, :] is the data point after removing its projection

    This has been adapted from the SIF paper code: https://openreview.net/pdf?id=SyK00v5xx
    """
    pc: np.ndarray = compute_pc(x, npc)
    # Coefficient of every data point along each principal component.
    coefficients: np.ndarray = x.dot(pc.transpose())
    # Declared once so both branches assign to the same annotated name.
    xx: np.ndarray
    if npc == 1:
        # Single component: broadcasting the coefficient column against the
        # component row rebuilds the projection without a matrix product.
        xx = x - coefficients * pc
    else:
        xx = x - coefficients.dot(pc)
    return xx


def get_sent_embeddings(
    sent: Sentence,
    idf_model: dict[str, Union[float, np.float64]],
    min_idf: Union[float, np.float64],
    max_idf: Union[float, np.float64],
    dimensions: int = 300,
) -> np.ndarray:
    """Provides the weighted average of a sentence's word vectors.

    Each usable word vector is weighted by its rescaled idf value (see
    ``rescale_idf``); the weights are normalized to sum to one before the
    vectors are combined.

    Expectations:
    Word can only appear once in a sentence, multiple occurrences are collapsed
    (words are keyed by their exact ``string``; the last occurrence wins).
    Must have 2 or more usable embeddings, otherwise an array of zeroes is returned.
    Principal component removal is not done here; see ``remove_pc``.

    :param sent: ``Sentence``
    :param idf_model: a dictionary of tokens and idf values, looked up by lowercased token string
    :param min_idf: the min idf score to use for scaling; also the idf assumed for
        tokens missing from ``idf_model``, which therefore receive a weight of zero
    :param max_idf: the max idf score to use for scaling
    :param dimensions: the number of dimensions of the embedding; only used to size
        the array of zeroes returned when no sentence embedding can be computed

    :return ndarray: values of the sentence embedding, or returns an array of zeroes if no sentence embedding could be computed.
    """
    map_word_embedding: dict[str, tuple[np.float64, np.ndarray]] = {
        token.string: (
            rescale_idf(idf_model.get(token.string.lower(), min_idf), min_idf, max_idf),
            token.embedding,
        )
        for token in sent.words
        # Skip tokens without a usable vector: a missing (None) embedding would
        # otherwise slip past the all-zero check and break the weighting below.
        if token.embedding is not None and not np.all(token.embedding == 0)
    }
    # We can't create a sentence embedding for just one word
    if len(map_word_embedding) < 2:
        return np.zeros(dimensions)
    weights, embeddings = zip(*map_word_embedding.values())
    total_weight = sum(weights)
    # All weights are zero (e.g. every token sits at min_idf): nothing to average.
    if total_weight == 0:
        return np.zeros(dimensions)
    # Normalize the weights so that they sum to one.
    scaled_values: np.ndarray = np.array(weights) * (1 / total_weight)
    # Stack the vectors explicitly (one row per word) and weight each row.
    weighted_embeds: np.ndarray = np.array(embeddings) * scaled_values[:, None]
    return np.sum(weighted_embeds, axis=0)
Source code for cltk.embeddings.sentence

The Classical Language Toolkit

Navigation

Related Topics