# Source code for topologic.embedding.clustering.dbscan

# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

from array import array
from typing import Callable, Optional, Union

import numpy as np
from sklearn.cluster import dbscan as sk_dbscan


def dbscan(
    embedding: np.ndarray,
    eps: float = 0.5,
    min_samples: int = 5,
    metric: Union[str, Callable[[np.ndarray, np.ndarray], float]] = 'minkowski',
    metric_params: Optional[dict] = None,
    algorithm: str = 'auto',
    leaf_size: int = 30,
    p: float = 2,
    sample_weight: Optional[array] = None,
    n_jobs: Optional[int] = None
) -> np.ndarray:
    """
    Perform DBSCAN clustering from vector array or distance matrix.

    This is a thin wrapper around :func:`sklearn.cluster.dbscan` that forwards
    every argument unchanged and returns only the cluster labels.

    :param numpy.ndarray embedding: An n x d array of vectors representing n labels in a d
        dimensional space
    :param Optional[float] eps: The maximum distance between two samples for them to be
        considered as in the same neighborhood. Default 0.5
    :param Optional[int] min_samples: The number of samples (or total weight) in a
        neighborhood for a point to be considered as a core point. This includes the point
        itself. Default 5
    :param metric: The metric to use when calculating distance between instances in a
        feature array. If metric is a string or callable, it must be one of the options
        allowed by :func:`sklearn.metrics.pairwise_distances` for its metric parameter. If
        metric is "precomputed", X is assumed to be a distance matrix and must be square.
        X may be a sparse matrix, in which case only "nonzero" elements may be considered
        neighbors for DBSCAN. If metric is a callable function, it is called on each pair
        of instances (rows) and the resulting value recorded. The callable should take two
        arrays from X as input and return a value indicating the distance between them.
        Default 'minkowski'
    :type metric: Union[str, Callable[[numpy.ndarray, numpy.ndarray], float]]
    :param Optional[dict] metric_params: Additional keyword arguments for the metric
        function.
    :param Optional[str] algorithm: The algorithm to be used by the NearestNeighbors module
        to compute pointwise distances and find nearest neighbors. Potential values:
        {'auto', 'ball_tree', 'kd_tree', 'brute'}. Default 'auto'
    :param Optional[int] leaf_size: Leaf size passed to BallTree or cKDTree. This can
        affect the speed of the construction and query, as well as the memory required to
        store the tree. The optimal value depends on the nature of the problem. Default 30
    :param Optional[float] p: The power of the Minkowski metric to be used to calculate
        distance between points. Default 2.0
    :param Optional[Array[int]] sample_weight: Weight of each sample, such that a sample
        with a weight of at least ``min_samples`` is by itself a core sample; a sample with
        negative weight may inhibit its eps-neighbor from being core. Note that weights are
        absolute, and default to 1.
    :param Optional[int] n_jobs: The number of parallel jobs to run for neighbors search.
        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. ``-1`` means
        using all processors.
    :return: The cluster labels for each vector in the given embedding. The vector at
        index n in the embedding will have the label at index n in this returned array.
        Noisy samples are given the value -1
    :rtype: np.ndarray
    """
    # Every argument is passed by keyword so the call stays correct even if the
    # positional order of sklearn's dbscan parameters changes between releases.
    return sk_dbscan(
        X=embedding,
        eps=eps,
        min_samples=min_samples,
        metric=metric,
        metric_params=metric_params,
        algorithm=algorithm,
        leaf_size=leaf_size,
        p=p,
        sample_weight=sample_weight,
        n_jobs=n_jobs
    )[1]  # element at index 1 contains the labels