Source code for topologic.embedding.omnibus_embedding

# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

import copy
import logging
from typing import List, Union, Optional, Tuple

import networkx as nx
import numpy as np
import scipy as sp

from .embedding_container import EmbeddingContainer
from .spectral_embedding import _generate_embedding
from ..connected_components import largest_connected_component
from ..embedding import EmbeddingMethod
from ..graph_augmentation import rank_edges, \
    diagonal_augmentation


def omnibus_embedding(
        graphs: List[nx.Graph],
        maximum_dimensions: int = 100,
        elbow_cut: Optional[int] = 1,
        embedding_method: EmbeddingMethod = EmbeddingMethod.LAPLACIAN_SPECTRAL_EMBEDDING,
        svd_seed: Optional[int] = None,
        num_iterations: int = 5,
        power_iteration_normalizer: str = 'QR',
        num_oversamples: int = 10
) -> List[Tuple[EmbeddingContainer, EmbeddingContainer]]:
    """
    Generates a pairwise omnibus embedding for each pair of graphs in a list of graphs. If given graphs A, B, and C,
    the embeddings will be computed for A, B and B, C.

    There should be exactly the same number of nodes in each graph with exactly the same labels. The list of graphs
    should represent a time series and should be in an order such that time is continuous through the list of graphs.

    If the labels differ between each pair of graphs, then those nodes will only be found in the resulting embedding
    if they exist in the largest connected component of the union of all edges across all graphs in the time series.

    :param List[networkx.Graph] graphs: A list of graphs that will be used to generate the omnibus embedding. Each
        graph should have exactly the same vertices as each of the other graphs. The order of the graphs in the list
        matters. The first graph will be at time 0 and each following graph will increment time by 1.
    :param int maximum_dimensions: Maximum dimensions of embeddings that will be returned - defaults to 100. Actual
        dimensions of resulting embeddings should be significantly smaller, but will never be over this value.
    :param int elbow_cut: Scree plot elbow detection will detect (usually) many elbows. This value specifies which
        elbow to use prior to filtering out extraneous dimensions.
    :param topologic.embedding.EmbeddingMethod embedding_method: The embedding technique used to generate the
        omnibus embedding.
    :param Optional[int] svd_seed: If not provided, uses a random number every time, making consistent results
        difficult. Set this to a random int if you want consistency between executions over the same graph.
    :param int num_iterations: The number of iterations to be used in the svd solver.
    :param int num_oversamples: Additional number of random vectors to sample the range of M so as to ensure proper
        conditioning. The total number of random vectors used to find the range of M is n_components + n_oversamples.
        A smaller number can improve speed but can negatively impact the quality of the approximation of the
        singular vectors and singular values.
    :param Optional[str] power_iteration_normalizer: Whether the power iterations are normalized with step-by-step
        QR factorization (the slowest but most accurate), 'none' (the fastest but numerically unstable when
        `num_iterations` is large, e.g. typically 5 or larger), or 'LU' factorization (numerically stable but can
        lose slightly in accuracy). The 'auto' mode applies no normalization if `num_iterations` <= 2 and switches
        to LU otherwise. Options: 'auto' (default), 'QR', 'LU', 'none'
    :return: A list of tuples of EmbeddingContainers, one tuple per consecutive pair of graphs. Each container holds
        a matrix whose rows are the node embeddings and a vector of the corresponding vertex labels; the matrix and
        vector are positionally correlated.
    :rtype: List[(EmbeddingContainer, EmbeddingContainer)]
    """
    logger = logging.getLogger(__name__)

    if not graphs:
        raise ValueError('Graphs must be provided but was None')
    if len(graphs) <= 1:
        raise ValueError('You must provide at least two graphs to compute the Omnibus embedding but there were only '
                         f'{len(graphs)} graphs.')

    is_directed = nx.is_directed(graphs[0])
    for graph in graphs[1:]:
        if nx.is_directed(graph) != is_directed:
            raise ValueError('All graphs must be either directed or all must be undirected.')

    logger.info('Generating adjacency matrices from list of graphs')

    if embedding_method == EmbeddingMethod.ADJACENCY_SPECTRAL_EMBEDDING:
        get_matrices_function = _get_adjacency_matrices
    elif embedding_method == EmbeddingMethod.LAPLACIAN_SPECTRAL_EMBEDDING:
        get_matrices_function = _get_laplacian_matrices
    else:
        raise TypeError(f'Unexpected EmbeddingMethod for argument embedding_method: {embedding_method}.')

    logger.info('Generating the omnibus embedding')

    embedding_containers = []

    # union graph lcc strategy: sum edge weights across the whole time series, then restrict every
    # graph to the nodes in the largest connected component of that union graph
    union_graph = graphs[0].copy()

    for graph in graphs[1:]:
        for source, target, weight in graph.edges(data='weight'):
            if union_graph.has_edge(source, target):
                union_graph[source][target]['weight'] += weight
            else:
                union_graph.add_edge(source, target, weight=weight)

    union_graph_lcc = largest_connected_component(union_graph)
    union_graph_lcc_nodes = union_graph_lcc.nodes()

    previous_graph = graphs[0].copy()

    count = 1
    for graph in graphs[1:]:
        logger.debug(f'Calculating omni for graph {count} of {len(graphs) - 1}')
        count = count + 1

        current_graph = graph.copy()

        # reduce each graph so that the node set contains exactly the same nodes as union_graph_lcc_nodes
        _sync_nodes(previous_graph, union_graph_lcc_nodes)
        _sync_nodes(current_graph, union_graph_lcc_nodes)

        pairwise_graphs_reduced = [previous_graph, current_graph]

        for i in range(len(pairwise_graphs_reduced)):
            graph_to_augment = pairwise_graphs_reduced[i]
            augmented_graph = _augment_graph(graph_to_augment)
            pairwise_graphs_reduced[i] = augmented_graph

        labels, pairwise_matrices = get_matrices_function(pairwise_graphs_reduced)
        omnibus_matrix = generate_omnibus_matrix(pairwise_matrices)

        embedding = _generate_embedding(
            elbow_cut,
            is_directed,
            omnibus_matrix,
            maximum_dimensions,
            min(omnibus_matrix.shape),
            num_oversamples,
            num_iterations,
            power_iteration_normalizer,
            svd_seed
        )

        # the omnibus embedding stacks both graphs' embeddings vertically; unstack them per graph
        number_of_nodes = len(pairwise_graphs_reduced[0].nodes())
        embeddings = [embedding[x: x + number_of_nodes] for x in range(0, embedding.shape[0], number_of_nodes)]

        # copy so that the next iteration's _sync_nodes call does not mutate the caller's graph
        previous_graph = graph.copy()

        embedding_containers.append(
            (EmbeddingContainer(embeddings[0], labels), EmbeddingContainer(embeddings[1], labels))
        )

    return embedding_containers
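

# A minimal usage sketch (illustrative only, not part of the library's public surface): two tiny
# weighted graphs over the same node set form a two-step time series, yielding exactly one pair
# of containers. The graphs, weights, and seed are arbitrary assumptions chosen for the demo, and
# the attribute names `embedding` and `vertex_labels` are assumed from the keyword usage of
# EmbeddingContainer elsewhere in this module.
def _example_omnibus_embedding_usage():
    graph_one = nx.Graph()
    graph_one.add_edge('a', 'b', weight=1.0)
    graph_one.add_edge('b', 'c', weight=2.0)

    graph_two = nx.Graph()
    graph_two.add_edge('a', 'b', weight=2.0)
    graph_two.add_edge('b', 'c', weight=1.0)

    pairs = omnibus_embedding(
        [graph_one, graph_two],
        svd_seed=1234  # fix the seed so repeated runs produce the same embedding
    )
    for previous_container, current_container in pairs:
        print(previous_container.vertex_labels, previous_container.embedding.shape)
        print(current_container.vertex_labels, current_container.embedding.shape)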
def _augment_graph(graph_to_augment):
    ranked_graph = rank_edges(graph_to_augment)
    augmented_graph = diagonal_augmentation(ranked_graph)

    return augmented_graph


def _sync_nodes(graph_to_reduce, set_of_valid_nodes):
    # drop nodes outside the valid set, then add any valid nodes that are missing
    to_remove = []
    for n in graph_to_reduce.nodes():
        if n not in set_of_valid_nodes:
            to_remove.append(n)

    graph_to_reduce.remove_nodes_from(to_remove)

    for node in set_of_valid_nodes:
        if not graph_to_reduce.has_node(node):
            graph_to_reduce.add_node(node)


def _reduce_to_common_nodes(graphs: List[nx.Graph]):
    """
    Reduces each graph in the provided list to only the nodes contained in each and every other graph. In other
    words, reduces each graph to the intersection of the nodes of all the graphs.

    :param graphs: The list of graphs to reduce
    :return: A list of pruned graphs
    """
    sets = []
    graphs_copy = copy.deepcopy(graphs)

    for graph in graphs_copy:
        sets.append(set(graph.nodes()))

    set_of_all_common_nodes = set.intersection(*sets)

    # reduce graphs to only have nodes contained in all other graphs
    for graph in graphs_copy:
        to_remove = []
        for node in graph.nodes():
            if node not in set_of_all_common_nodes:
                to_remove.append(node)
        for node in to_remove:
            graph.remove_node(node)

    return graphs_copy


def _get_unstacked_embeddings(embedding, graphs, labels):
    containers = []

    # unstack embeddings and labels
    number_of_nodes = len(graphs[0].nodes())
    embeddings = [embedding[x: x + number_of_nodes] for x in range(0, embedding.shape[0], number_of_nodes)]
    vertex_labels = [labels[x: x + number_of_nodes] for x in range(0, embedding.shape[0], number_of_nodes)]

    for i, embedding in enumerate(embeddings):
        containers.append(
            EmbeddingContainer(
                embedding=embedding,
                vertex_labels=vertex_labels[i]
            )
        )

    return containers


def _get_adjacency_matrices(graphs):
    matrices = []
    labels = set()
    for graph in graphs:
        sorted_nodes = sorted(graph.nodes())
        matrices.append(nx.to_scipy_sparse_matrix(graph, nodelist=sorted_nodes))

        for node in sorted_nodes:
            labels.add(node)
    return sorted(list(labels)), matrices


def _get_laplacian_matrices(graphs):
    labels, adjacency_matrices = _get_adjacency_matrices(graphs)

    laplacian_matrices = []
    for matrix in adjacency_matrices:
        laplacian_matrices.append(_get_lse_matrix(matrix))

    return labels, laplacian_matrices


def _get_lse_matrix(adjacency_matrix: sp.sparse.csr_matrix):
    # scale the adjacency matrix by the inverse square roots of the out- and in-degrees,
    # i.e. D_out^(-1/2) * A * D_in^(-1/2); zero-degree nodes get a scaling factor of 0
    in_degree = adjacency_matrix.sum(axis=0).astype(float)
    out_degree = adjacency_matrix.sum(axis=1).T.astype(float)

    in_degree_array = np.squeeze(np.asarray(in_degree))
    out_degree_array = np.squeeze(np.asarray(out_degree))

    in_root = in_degree_array ** (-0.5)
    out_root = out_degree_array ** (-0.5)

    in_root[np.isinf(in_root)] = 0
    out_root[np.isinf(out_root)] = 0

    in_diagonal = sp.sparse.diags(in_root)
    out_diagonal = sp.sparse.diags(out_root)

    lse_matrix = out_diagonal.dot(adjacency_matrix).dot(in_diagonal)

    return lse_matrix
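

# A small illustrative check (an assumption, not part of the library) of the normalization
# performed by _get_lse_matrix: it computes D_out^(-1/2) * A * D_in^(-1/2), so a matrix whose
# in- and out-degrees are all 1 passes through unchanged.
def _example_lse_normalization():
    adjacency = sp.sparse.csr_matrix(np.array([[0.0, 1.0], [1.0, 0.0]]))
    lse = _get_lse_matrix(adjacency)
    print(lse.toarray())  # identical to the input, since every degree is 1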
def generate_omnibus_matrix(
        matrices: List[Union[np.ndarray, sp.sparse.csr_matrix]]
) -> np.ndarray:
    """
    Generate the omnibus matrix from a list of adjacency or laplacian matrices as described by 'A central limit
    theorem for an omnibus embedding of random dot product graphs.'

    Given an iterable of matrices a, b, ..., n, the omnibus matrix is defined as::

        [[ a,            .5 * (a + b), ..., .5 * (a + n)],
         [ .5 * (b + a), b,            ..., .5 * (b + n)],
         [ ...,          ...,          ..., ...         ],
         [ .5 * (n + a), .5 * (n + b), ..., n           ]]

    The current iteration of this function computes all of the off-diagonal block averages; because the block
    structure is symmetric, a further optimization could roughly halve the work by reusing .5 * (a + b) for
    .5 * (b + a).

    See also: The original paper - https://arxiv.org/abs/1705.09355

    :param matrices: The list of matrices to generate the omnibus matrix
    :type matrices: List[Union[numpy.ndarray, scipy.sparse.csr_matrix]]
    :return: An omnibus matrix
    """
    horizontal_stacker, vertical_stacker = _get_stacker_functions(matrices[0])

    rows = []

    # iterate over each row of blocks
    for column_index in range(0, len(matrices)):
        current_row = []
        current_matrix = matrices[column_index]

        for row_index in range(0, len(matrices)):
            if row_index == column_index:
                # we are on the diagonal; no calculation is needed, so add the current matrix
                # to the current_row
                current_row.append(current_matrix)
            else:
                # otherwise we are off the diagonal, so we average the current_matrix with the matrix
                # at row_index and add that to our current_row
                matrices_averaged = (current_matrix + matrices[row_index]) * .5
                current_row.append(matrices_averaged)

        # an entire row of blocks has been generated; horizontally stack each matrix in the row to
        # complete it
        rows.append(horizontal_stacker(current_row))

    return vertical_stacker(rows)
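

# A minimal worked example (with assumed values) of the block structure above: for two 2x2
# matrices a and b, the omnibus matrix is [[a, .5 * (a + b)], [.5 * (b + a), b]].
def _example_generate_omnibus_matrix():
    a = np.array([[0.0, 1.0], [1.0, 0.0]])
    b = np.array([[0.0, 2.0], [2.0, 0.0]])
    omnibus = generate_omnibus_matrix([a, b])
    print(omnibus)
    # [[0.  1.  0.  1.5]
    #  [1.  0.  1.5 0. ]
    #  [0.  1.5 0.  2. ]
    #  [1.5 0.  2.  0. ]]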
def _get_stacker_functions(matrix):
    if sp.sparse.issparse(matrix):
        horizontal_stacker = sp.sparse.hstack
        vertical_stacker = sp.sparse.vstack
    else:
        horizontal_stacker = np.hstack
        vertical_stacker = np.vstack

    return horizontal_stacker, vertical_stacker
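

# A tiny illustration (not part of the library) that the stacker choice follows the matrix type:
# sparse inputs get scipy's sparse stackers, dense inputs get numpy's.
def _example_stacker_dispatch():
    dense_h, dense_v = _get_stacker_functions(np.eye(2))
    sparse_h, sparse_v = _get_stacker_functions(sp.sparse.eye(2))
    assert dense_h is np.hstack and dense_v is np.vstack
    assert sparse_h is sp.sparse.hstack and sparse_v is sp.sparse.vstack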