Source code for topologic.embedding.clustering.wards

# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

import numpy as np
from sklearn.cluster import AgglomerativeClustering


[docs]def wards_clustering( embedding: np.ndarray, num_clusters: int = 2, affinity: str = 'euclidean', memory: str = None, connectivity: np.ndarray = None, compute_full_tree: str = 'auto' ) -> np.ndarray: """ Uses agglomerative clustering with ward linkage Recursively merges the pair of clusters that minimally increases a given linkage distance. :param numpy.ndarray embedding: An n x d array of vectors representing n labels in a d dimensional space :param int num_clusters: int, default=2 The number of clusters to find. :param str affinity: string or callable, default: "euclidean" Metric used to compute the linkage. Can be "euclidean", "l1", "l2", "manhattan", "cosine", or 'precomputed'. If linkage is "ward", only "euclidean" is accepted. :param memory: None, str or object with the joblib.Memory interface, optional Used to cache the output of the computation of the tree. By default, no caching is done. If a string is given, it is the path to the caching directory. :type memory: Optional[Union[str, joblib.Memory]] :param numpy.ndarray connectivity: array-like or callable, optional Connectivity matrix. Defines for each sample the neighboring samples following a given structure of the data. This can be a connectivity matrix itself or a callable that transforms the data into a connectivity matrix, such as derived from kneighbors_graph. Default is None, i.e, the hierarchical clustering algorithm is unstructured. :param Optional[str] compute_full_tree: bool or 'auto' (optional) Stop early the construction of the tree at n_clusters. This is useful to decrease computation time if the number of clusters is not small compared to the number of samples. This option is useful only when specifying a connectivity matrix. Note also that when varying the number of clusters and using caching, it may be advantageous to compute the full tree. :return: The cluster labels for each vector in the given embedding. The vector at index n in the embedding will have the label at index n in this returned array :rtype: np.ndarray """ agglomerative_cluster_model = _agglomerative_clustering( embedding, num_clusters=num_clusters, affinity=affinity, memory=memory, connectivity=connectivity, compute_full_tree=compute_full_tree, linkage='ward' ) return agglomerative_cluster_model.labels_
def _agglomerative_clustering( embedding: np.ndarray, num_clusters: int = 2, affinity: str = 'euclidean', memory: str = None, connectivity: np.ndarray = None, compute_full_tree: str = 'auto', linkage: str = 'ward' ) -> np.ndarray: """ Agglomerative Clustering Recursively merges the pair of clusters that minimally increases a given linkage distance. :param numpy.ndarray embedding: A feature matrix used to generate the model :param int num_clusters: int, default=2 The number of clusters to find. :param str affinity: string or callable, default: "euclidean" Metric used to compute the linkage. Can be "euclidean", "l1", "l2", "manhattan", "cosine", or 'precomputed'. If linkage is "ward", only "euclidean" is accepted. :param memory: None, str or object with the joblib.Memory interface, optional Used to cache the output of the computation of the tree. By default, no caching is done. If a string is given, it is the path to the caching directory. :type memory: Optional[Union[str, joblib.Memory]] :param numpy.ndarray connectivity: array-like or callable, optional Connectivity matrix. Defines for each sample the neighboring samples following a given structure of the data. This can be a connectivity matrix itself or a callable that transforms the data into a connectivity matrix, such as derived from kneighbors_graph. Default is None, i.e, the hierarchical clustering algorithm is unstructured. :param Optional[str] compute_full_tree: bool or 'auto' (optional) Stop early the construction of the tree at n_clusters. This is useful to decrease computation time if the number of clusters is not small compared to the number of samples. This option is useful only when specifying a connectivity matrix. Note also that when varying the number of clusters and using caching, it may be advantageous to compute the full tree. :param str linkage: {"ward", "complete", "average", "single"}, optional (default="ward") Which linkage criterion to use. The linkage criterion determines which distance to use between sets of observation. The algorithm will merge the pairs of cluster that minimize this criterion. - ward minimizes the variance of the clusters being merged. - average uses the average of the distances of each observation of the two sets. - complete or maximum linkage uses the maximum distances between all observations of the two sets. - single uses the minimum of the distances between all observations of the two sets. :return: Model with attributes: labels_ : array [n_samples] cluster labels for each point n_leaves_ : int Number of leaves in the hierarchical tree. n_components_ : int The estimated number of connected components in the graph. children_ : array-like, shape (n_samples-1, 2) The children of each non-leaf node. Values less than `n_samples` correspond to leaves of the tree which are the original samples. A node `i` greater than or equal to `n_samples` is a non-leaf node and has children `children_[i - n_samples]`. Alternatively at the i-th iteration, children[i][0] and children[i][1] are merged to form node `n_samples + i` :rtype: Object with attributes labels, n_leaves, n_components, children """ model = AgglomerativeClustering( n_clusters=num_clusters, affinity=affinity, memory=memory, connectivity=connectivity, compute_full_tree=compute_full_tree, linkage=linkage ) model.fit(embedding) return model