Source code for cltk.embeddings.sentence

"""For computing embeddings for lists of words."""

from typing import Union, ValuesView

import numpy as np
from sklearn.decomposition import TruncatedSVD

from cltk.core import Sentence


def rescale_idf(val: float, min_idf: float, max_idf: float) -> float:
    """Min-max rescale an idf value into the range [0.0, 1.0]."""
    return (val - min_idf) / (max_idf - min_idf)
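
# Illustrative check (not part of the original module): with min_idf=1.0 and
# max_idf=3.0, an idf of 2.0 sits halfway along the range, so it rescales to 0.5.
#
#     >>> rescale_idf(2.0, 1.0, 3.0)
#     0.5
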

def compute_pc(x: np.ndarray, npc: int = 1) -> np.ndarray:
    """Compute the first ``npc`` principal components of ``x``.

    DO NOT MAKE THE DATA ZERO MEAN!

    :param x: ``x[i, :]`` is a data point
    :param npc: number of principal components to compute
    :return: ``components_[i, :]`` is the i-th principal component

    This has been adapted from the SIF paper code:
    `https://openreview.net/pdf?id=SyK00v5xx`.
    """
    svd: TruncatedSVD = TruncatedSVD(n_components=npc, n_iter=7, random_state=0)
    svd.fit(x)
    return svd.components_
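
# Illustrative sketch (not part of the original module): for a matrix of
# three 2-d points, the single leading component comes back as a (1, 2) row.
#
#     >>> x = np.array([[1.0, 0.0], [2.0, 0.0], [3.0, 0.0]])
#     >>> compute_pc(x, npc=1).shape
#     (1, 2)
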

def remove_pc(x: np.ndarray, npc: int = 1) -> np.ndarray:
    """Remove the projection onto the principal components.

    Calling this on a collection of sentence embeddings, prior to
    comparison, may improve accuracy.

    :param x: ``x[i, :]`` is a data point
    :param npc: number of principal components to remove
    :return: ``xx[i, :]`` is the data point after removing its projection

    This has been adapted from the SIF paper code:
    `https://openreview.net/pdf?id=SyK00v5xx`.
    """
    pc: np.ndarray = compute_pc(x, npc)
    if npc == 1:
        xx: np.ndarray = x - x.dot(pc.transpose()) * pc
    else:
        xx = x - x.dot(pc.transpose()).dot(pc)
    return xx
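
# Illustrative sketch (not part of the original module): removing one
# component from a batch of vectors preserves the batch's shape while
# projecting out the dominant shared direction.
#
#     >>> x = np.array([[1.0, 0.1], [2.0, -0.1], [3.0, 0.0]])
#     >>> remove_pc(x, npc=1).shape
#     (3, 2)
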

def get_sent_embeddings(
    sent: Sentence,
    idf_model: dict[str, Union[float, np.float64]],
    min_idf: Union[float, np.float64],
    max_idf: Union[float, np.float64],
    dimensions: int = 300,
) -> np.ndarray:
    """Provide the weighted average of a sentence's word vectors.

    Expectations:
    - A word may appear only once per sentence; multiple occurrences
      are collapsed.
    - The sentence must have two or more embeddings, otherwise the
      principal component cannot be found and removed.

    :param sent: ``Sentence``
    :param idf_model: a dictionary of tokens and idf values
    :param min_idf: the min idf score to use for scaling
    :param max_idf: the max idf score to use for scaling
    :param dimensions: the number of dimensions of the embedding
    :return: the sentence embedding, or an array of zeros if no
        sentence embedding could be computed.
    """
    # Map each token to its rescaled idf weight and its word vector.
    map_word_embedding: dict[str, tuple[np.float64, np.ndarray]] = {
        token.string: (
            rescale_idf(
                idf_model.get(token.string.lower(), min_idf), min_idf, max_idf
            ),
            token.embedding,
        )
        for token in sent.words
        if not np.all(token.embedding == 0)  # skip processing empty embeddings
    }
    weight_embedding_tuple: ValuesView = map_word_embedding.values()
    # We can't create a sentence embedding for just one word
    if len(weight_embedding_tuple) < 2:
        return np.zeros(dimensions)
    weights, embeddings = zip(*weight_embedding_tuple)
    if sum(weights) == 0:
        return np.zeros(dimensions)
    # Normalize the weights so they sum to 1.
    scale_factor: np.float64 = 1 / sum(weights)
    scaled_weights: list[np.float64] = [weight * scale_factor for weight in weights]
    scaled_values: np.ndarray = np.array(scaled_weights)
    # Apply our weighted terms to the adjusted embeddings
    weighted_embeds: np.ndarray = embeddings * scaled_values[:, None]
    return np.sum(weighted_embeds, axis=0)
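

if __name__ == "__main__":
    # Illustrative demo (not part of the original module). SimpleNamespace
    # objects stand in for cltk ``Word``/``Sentence`` instances, since
    # ``get_sent_embeddings`` only touches ``sent.words``, ``token.string``,
    # and ``token.embedding``; real cltk objects should work the same way.
    from types import SimpleNamespace

    rng = np.random.default_rng(0)
    dims = 300
    tokens = [
        SimpleNamespace(string=word, embedding=rng.standard_normal(dims))
        for word in ["arma", "virumque", "cano"]
    ]
    sentence = SimpleNamespace(words=tokens)
    # Hypothetical idf scores, chosen so the weights span the full [0, 1] range.
    idf = {"arma": 2.0, "virumque": 5.0, "cano": 3.0}
    embedding = get_sent_embeddings(
        sentence, idf, min_idf=2.0, max_idf=5.0, dimensions=dims
    )
    print(embedding.shape)  # (300,)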