Source code for topologic.embedding.adjacency_spectral_embedding

# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
from typing import Optional

import networkx as nx

from .embedding_container import EmbeddingContainer
from .spectral_embedding import _create_augmented_adjacency_matrix, \
    _generate_embedding
from .. import assertions


[docs]def adjacency_embedding(
        graph: nx.Graph,
        maximum_dimensions: int = 100,
        elbow_cut: Optional[int] = 1,
        weight_column: str = 'weight',
        svd_seed: Optional[int] = None,
        num_iterations: int = 5,
        power_iteration_normalizer: str = 'QR',
        num_oversamples: int = 10
) -> EmbeddingContainer:
    """
    Generates a spectral embedding based upon the adjacency matrix of the graph.

    See also: https://csustan.csustan.edu/~tom/Clustering/GraphLaplacian-tutorial.pdf

    :param networkx.Graph graph: graph_augmented_sparse_matrix networkx Graph object containing no more than one
        connected component. Note that if the graph is a directed graph, the resulting dimensionality of the embedding
        will be twice that of an undirected graph
    :param int maximum_dimensions: Maximum dimensions of embeddings that will be returned - defaults to 100.  Actual
        dimensions of resulting embeddings should be significantly smaller, but will never be over this value.
    :param Optional[int] elbow_cut: scree plot elbow detection will detect (usually) many elbows.  This value specifies
        which elbow to use prior to filtering out extraneous dimensions. If None, then an embedding of size
        `maximum_dimensions` will be returned.
    :param str weight_column: The weight column to use in the Graph.
    :param Optional[int] svd_seed: If not provided, uses a random number every time, making consistent results difficult
        Set this to a random int if you want consistency between executions over the same graph.
    :param int num_iterations: The number of iterations to be used in the svd solver.
    :param int num_oversamples: Additional number of random vectors to sample the range of M so as
        to ensure proper conditioning. The total number of random vectors
        used to find the range of M is n_components + n_oversamples. Smaller
        number can improve speed but can negatively impact the quality of
        approximation of singular vectors and singular values.
    :param Optional[str] power_iteration_normalizer:
        Whether the power iterations are normalized with step-by-step
        QR factorization (the slowest but most accurate), 'none'
        (the fastest but numerically unstable when `n_iter` is large, e.g.
        typically 5 or larger), or 'LU' factorization (numerically stable
        but can lose slightly in accuracy). The 'auto' mode applies no
        normalization if `num_iterations` <= 2 and switches to LU otherwise.

        Options: 'auto' (default), 'QR', 'LU', 'none'
    :return: EmbeddingContainer containing a matrix, which itself contains the embedding for each node.  the tuple also
        contains a vector containing the corresponding vertex labels for each row in the matrix.  the matrix and vector
        are positionally correlated.
    :rtype: EmbeddingContainer
    """
    assertions.assert_single_connected_components(
        graph,
        "Run this algorithm over the largest connected component (see: topologic.largest_connected_component()) or " +
        "run it over every connected component separately."
    )

    working_graph = graph.copy()

    augmented_graph, graph_matrix, sorted_vertices = _create_augmented_adjacency_matrix(weight_column, working_graph)

    minimum_matrix_dimension = min(graph_matrix.shape)

    embedding = _generate_embedding(
        elbow_cut,
        nx.is_directed(graph),
        graph_matrix,
        maximum_dimensions,
        minimum_matrix_dimension,
        num_oversamples,
        num_iterations,
        power_iteration_normalizer,
        svd_seed
    )

    result = EmbeddingContainer(embedding=embedding, vertex_labels=sorted_vertices)

    return result