Source code for cltk.embeddings.sentence
"""For computing embeddings for lists of words."""
from typing import Union, ValuesView
import numpy as np
from sklearn.decomposition import TruncatedSVD
from cltk.core import Sentence
def rescale_idf(val: float, min_idf: float, max_idf: float) -> float:
    """Min-max rescale an idf value relative to the ``[min_idf, max_idf]`` range.

    The result is not clamped: a ``val`` outside the range maps outside ``[0, 1]``.

    :param val: the idf value to rescale
    :param min_idf: the idf value that maps to 0
    :param max_idf: the idf value that maps to 1
    :return: ``(val - min_idf) / (max_idf - min_idf)``, or ``0.0`` when
        ``min_idf == max_idf`` (the range has zero width)
    """
    idf_range = max_idf - min_idf
    if idf_range == 0:
        # A zero-width range would divide by zero (ZeroDivisionError for Python
        # floats, nan/inf for NumPy scalars); treat every value as carrying no weight.
        return 0.0
    return (val - min_idf) / idf_range
def compute_pc(x: np.ndarray, npc: int = 1) -> np.ndarray:
    """Fit a truncated SVD on ``x`` and return its components.

    The data is passed to the decomposition as-is. DO NOT MAKE THE DATA ZERO MEAN!

    Adapted from the SIF paper code: `https://openreview.net/pdf?id=SyK00v5xx`.

    :param x: X[i,:] is a data point
    :param npc: number of principal components to compute
    :return: component_[i,:] is the i-th pc
    """
    # Fixed random_state keeps the decomposition reproducible across calls.
    decomposer = TruncatedSVD(n_components=npc, n_iter=7, random_state=0)
    return decomposer.fit(x).components_
def remove_pc(x: np.ndarray, npc: int = 1) -> np.ndarray:
    """Subtract from each data point its projection onto the principal components.

    Calling this on a collection of sentence embeddings, prior to comparison,
    may improve accuracy.

    Adapted from the SIF paper code: `https://openreview.net/pdf?id=SyK00v5xx`.

    :param x: X[i,:] is a data point
    :param npc: number of principal components to remove
    :return: XX[i, :] is the data point after removing its projection
    """
    components = compute_pc(x, npc)
    # Coefficient of every data point along every component.
    coefficients = x.dot(components.T)
    if npc == 1:
        # Single component: scale it by each point's coefficient via broadcasting.
        return x - coefficients * components
    return x - coefficients.dot(components)
def get_sent_embeddings(
    sent: Sentence,
    idf_model: dict[str, Union[float, np.float64]],
    min_idf: Union[float, np.float64],
    max_idf: Union[float, np.float64],
    dimensions: int = 300,
) -> np.ndarray:
    """Provide the idf-weighted average of a sentence's word vectors.

    Each word's weight is its idf value (looked up by the lowercased token
    string) rescaled with ``rescale_idf``; a word missing from ``idf_model``
    is looked up as ``min_idf``. The weights are normalized to sum to 1
    before the embeddings are combined.

    Expectations:
        A word can only appear once in a sentence; multiple occurrences of the
        same token string are collapsed into one entry (the last one wins).
        Tokens whose embedding is missing (``None``) or all zeroes are skipped.
        At least 2 usable embeddings are required; otherwise an array of
        zeroes is returned.

    :param sent: ``Sentence`` whose ``words`` provide ``string`` and ``embedding``
    :param idf_model: a dictionary of tokens and idf values
    :param min_idf: the min idf score to use for scaling
    :param max_idf: the max idf score to use for scaling
    :param dimensions: length of the zero vector returned when no sentence
        embedding could be computed
    :return ndarray: values of the sentence embedding, or an array of zeroes
        if fewer than 2 usable embeddings exist or the weights sum to zero.
    """
    # Keyed by token string so that a repeated word contributes only once.
    weighted_by_word: dict[str, tuple[np.float64, np.ndarray]] = {
        token.string: (
            rescale_idf(idf_model.get(token.string.lower(), min_idf), min_idf, max_idf),
            token.embedding,
        )
        for token in sent.words
        # A None embedding would slip past the all-zero test and break the
        # array arithmetic below, so it is filtered out explicitly.
        if token.embedding is not None and not np.all(token.embedding == 0)
    }

    # We can't create a sentence embedding for just one word
    if len(weighted_by_word) < 2:
        return np.zeros(dimensions)

    weights, embeddings = zip(*weighted_by_word.values())
    total_weight = sum(weights)
    if total_weight == 0:
        # Nothing to normalize by; avoid dividing by zero.
        return np.zeros(dimensions)

    normalized_weights: np.ndarray = np.array(weights) * (1 / total_weight)
    # One row per word; each row is scaled by that word's normalized weight.
    embedding_matrix: np.ndarray = np.asarray(embeddings)
    return np.sum(embedding_matrix * normalized_weights[:, None], axis=0)