Source code for topologic.embedding.elbow_finder

# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

# Adapted from Youngser Park.
# @references, Zhu, Mu and Ghodsi, Ali (2006), Automatic dimensionality selection
# from the scree plot via the use of profile likelihood,
# Computational Statistics & Data Analysis, Volume 51 Issue 2, pp 918-930, November, 2006

from typing import Union

import numpy as np
from scipy.stats import norm


[docs]def find_elbows( iterable_to_search: Union[list, np.array], num_elbows: int = 1, threshold: int = 0 ) -> np.array: """ An implementation of profile likelihood as outlined in Zhu and Ghodsi References, Zhu, Mu and Ghodsi, Ali (2006), Automatic dimensionality selection from the scree plot via the use of profile likelihood, Computational Statistics & Data Analysis, Volume 51 Issue 2, pp 918-930, November, 2006 :examples: >>> input_data = [2, 3, 4, 5, 6, 7, 8, 9] >>> result: np.array = find_elbows(input_data, num_elbows=1, threshold=0) >>> result.size 1 :param iterable_to_search: An ordered or unordered list of values that will be used to find the elbows. :param num_elbows: The number of elbows to return :param threshold: Smallest value to consider. Nonzero thresholds will affect elbow selection. :return: A numpy array containing elbows """ iterable_copy = iterable_to_search.copy() if isinstance(iterable_copy, list): # cast to array for functionality later iterable_copy = np.array(iterable_copy) if num_elbows == 0: # nothing to do.. return np.array([]) if iterable_copy.ndim == 2: iterable_copy = np.std(iterable_copy, axis=0) # select values greater than the threshold iterable_copy = iterable_copy[iterable_copy > threshold] if len(iterable_copy) == 0: return np.array([]) elbows = [] if len(iterable_copy) == 1: elbows.append(iterable_copy[0]) return np.array(elbows) iterable_copy.sort() iterable_copy = iterable_copy[::-1] # reverse array so that it is sorted in descending order n = len(iterable_copy) while len(elbows) < num_elbows and len(iterable_copy) > 1: d = 1 sample_var = np.var(iterable_copy, ddof=1) sample_scale = sample_var ** (1 / 2) elbow = 0 likelihood_elbow = 0 while d < len(iterable_copy): mean_sig = np.mean(iterable_copy[:d]) mean_noise = np.mean(iterable_copy[d:]) sig_likelihood = 0 noise_likelihood = 0 for i in range(d): sig_likelihood += norm.pdf(iterable_copy[i], mean_sig, sample_scale) for i in range(d, len(iterable_copy)): noise_likelihood += norm.pdf(iterable_copy[i], mean_noise, sample_scale) likelihood = noise_likelihood + sig_likelihood if likelihood > likelihood_elbow: likelihood_elbow = likelihood elbow = d d += 1 if len(elbows) == 0: elbows.append(elbow) else: elbows.append(elbow + elbows[-1]) iterable_copy = iterable_copy[elbow:] if len(elbows) == num_elbows: return np.array(elbows) if len(iterable_copy) == 0: return np.array(elbows) else: elbows.append(n) return np.array(elbows)