# Source code for topologic.embedding.distance

# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

from scipy import spatial
import numpy as np
from typing import Callable, KeysView, Union
from . import EmbeddingContainer


# Public names exported by a star-import of this module.
__all__ = [
    "cosine",
    "euclidean",
    "mahalanobis",
    "valid_distance_functions",
    "vector_distance",
    "embedding_distances_from"
]


def cosine(first_vector: np.ndarray, second_vector: np.ndarray) -> float:
    """
    Compute the cosine distance between two vectors of equal length.

    Delegates to scipy's
    `cosine distance <https://docs.scipy.org/doc/scipy-0.14.0/reference/generated/scipy.spatial.distance.cosine.html>`_.

    See also: https://en.wikipedia.org/wiki/Cosine_similarity

    :param numpy.ndarray first_vector: nonzero vector. must be same length as second_vector
    :param numpy.ndarray second_vector: nonzero vector. must be same length as first_vector
    :return: cosine distance - Resulting range is between 0 and 2. Values closer to 0 are more
        similar. Values closer to 2 are approaching total dissimilarity.
    :rtype: float

    :examples:

    >>> cosine(np.array([1,3,5]), np.array([2,3,4]))
    0.026964528109766017
    """  # noqa: E501
    distance = spatial.distance.cosine(first_vector, second_vector)
    return distance
def euclidean(first_vector: np.ndarray, second_vector: np.ndarray) -> float:
    """
    Compute the euclidean distance between two vectors of equal length.

    Delegates to scipy's
    `euclidean distance <https://docs.scipy.org/doc/scipy-0.14.0/reference/generated/scipy.spatial.distance.euclidean.html>`_.

    See also: https://en.wikipedia.org/wiki/Euclidean_distance

    :param numpy.ndarray first_vector: nonzero vector. must be same length as second_vector
    :param numpy.ndarray second_vector: nonzero vector. must be same length as first_vector
    :return: euclidean distance - Resulting range is a positive real number. Values closer to 0
        are more similar.
    :rtype: float

    :examples:

    >>> euclidean(np.array([1,3,5]), np.array([2,3,4]))
    1.4142135623730951
    """  # noqa: E501
    distance = spatial.distance.euclidean(first_vector, second_vector)
    return distance
def mahalanobis(inverse_covariance: np.ndarray) -> Callable[[np.ndarray, np.ndarray], float]:
    """
    Build a two-vector mahalanobis distance function bound to an inverse covariance matrix.

    The cosine and euclidean distances that scipy provides only need the two vectors being
    compared; mahalanobis additionally needs an inverse covariance matrix. Supply that matrix
    here once and use the returned curried function wherever a two-argument distance function
    is expected, such as the `vector_distance` and `embedding_distances_from` functions.

    See: https://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.distance.mahalanobis.html

    :param np.ndarray inverse_covariance: The inverse covariance matrix
    :return: A curried function that takes in 2 vectors and determines their distance based on
        the inverse_covariance provided.
    """  # noqa: E501

    def _mahalanobis(first_vector: np.ndarray, second_vector: np.ndarray) -> float:
        # inverse_covariance is captured from the enclosing call.
        distance = spatial.distance.mahalanobis(
            first_vector,
            second_vector,
            inverse_covariance,
        )
        return distance

    return _mahalanobis
# Registry of the distance functions that can be selected by string name.
_valid_distance_functions = dict(
    cosine=cosine,
    euclidean=euclidean,
)
def valid_distance_functions() -> KeysView[str]:
    """
    The topologic builtin list of valid distance functions.

    Any function that returns a float when given two np.ndarray 1d vectors is a valid choice,
    but the only ones we support without any other work are cosine or euclidean.

    :return: A set-like view of the string names of the functions we support
    """
    supported_names = _valid_distance_functions.keys()
    return supported_names
def _distance_function( method: Union[str, Callable[[np.ndarray, np.ndarray], float]] ) -> Callable[[np.ndarray, np.ndarray], float]: if isinstance(method, str): if method not in valid_distance_functions(): raise ValueError( f"Method {method} not in list of valid distance functions: {valid_distance_functions()}" ) return _valid_distance_functions[method] else: return method
def vector_distance(
    first_vector: np.ndarray,
    second_vector: np.ndarray,
    method: Union[str, Callable[[np.ndarray, np.ndarray], float]] = cosine
) -> float:
    """
    Apply the distance function of your choice to two vectors.

    The usual way to use this is to switch the method parameter, as a string, between "cosine"
    and "euclidean" - so the flow can be driven by configuration rather than by code changes
    that call the cosine and euclidean functions directly.

    :param np.ndarray first_vector: A 1d array-like (list, tuple, np.array) that represents the
        first vector
    :param np.ndarray second_vector: A 1d array-like (list, tuple, np.array) that represents the
        second vector
    :param method: Method can be any distance function that takes in 2 parameters. It can also
        be the string mapping to that function (as described by valid_distance_functions()).
        Note that you can also provide other functions, such as `mahalanobis`, but they require
        more information than just the comparative vectors.
    :type method: Union[str, Callable[[np.ndarray, np.ndarray], float]]
    :return: A float indicating the distance between two vectors.
    """
    # Turn a string name into its registered function; callables pass through as given.
    distance_fn = _distance_function(method)
    return distance_fn(first_vector, second_vector)
def embedding_distances_from(
    vector: np.ndarray,
    embedding: Union[EmbeddingContainer, np.ndarray],
    method: Union[str, Callable[[np.ndarray, np.ndarray], float]] = cosine
) -> np.ndarray:
    """
    Return a 1d np.ndarray of floats holding the distance from the given `vector` to each
    `vector` stored in the embedding (likely including itself).

    The distance calculation can be provided either as a function reference or a string
    representation mapped to the 2 standard distance functions we natively support.

    The functions supported are cosine and euclidean, both of which are `scipy` implementations.

    There is also a mahalanobis generator function that can be used, but first you must provide
    it with the inverse covariance matrix necessary for the distance calculations to be
    performed.

    :param np.ndarray vector: A 1d array-like (list, tuple, np.array) that represents the vector
        to compare against every other vector in the embedding
    :param Union[EmbeddingContainer, np.ndarray] embedding: The embedding is either an
        EmbeddingContainer (in which case its ``embedding`` attribute is used) or a 2d np array,
        where each row is a vector and the number of columns is identical to the length of the
        vector to compare against.
    :param method: Method can be any distance function that takes in 2 parameters. It can also
        be the string mapping to that function (as described by valid_distance_functions()).
        Note that you can also provide other functions, such as `mahalanobis`, but they require
        more information than just the comparative vectors.
    :type method: Union[str, Callable[[np.ndarray, np.ndarray], float]]
    :return: np.ndarray of dtype float the same length as the count of embedded vectors

    :examples:

    >>> vector = [0.3, 0.4, 0.5]
    >>> embedding = np.array([[0.3, 0.4, 0.5], [0.31, 0.44, 0.7]])
    >>> embedding_distances_from(vector, embedding, method="cosine") # using string version of method name
    array([0.        , 0.00861606])
    >>> embedding_distances_from(vector, embedding, method=euclidean) # using function handle
    array([0.        , 0.20420578])
    """  # noqa: E501
    distance_fn = _distance_function(method)

    # Unwrap the container so the rest of the function only deals with the raw matrix.
    matrix = embedding.embedding if isinstance(embedding, EmbeddingContainer) else embedding

    row_count = matrix.shape[0]
    scores = np.zeros(row_count, dtype=float)
    for row_index in range(row_count):
        scores[row_index] = distance_fn(vector, matrix[row_index])
    return scores