# Source code for topologic.similarity

# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

from sklearn.metrics.cluster import adjusted_rand_score
from typing import Any, Dict
import numpy as np

# Names exported by ``from <this module> import *``; ``ari`` is the only public entry point listed.
__all__ = ["ari"]


def ari(
    primary_partition: Dict[Any, int],
    test_partition: Dict[Any, int],
) -> float:
    """
    Calculate the Adjusted Rand Index between two partition mappings.

    The primary partition mapping is treated as the most accurate representation of truth and the test partition
    mapping is scored against it. Both mappings must cover exactly the same set of vertex labels; the label vectors
    handed to the scorer are aligned by iterating over the vertices of the primary partition.

    See https://en.wikipedia.org/wiki/Rand_index

    :param Dict[Any, int] primary_partition: The most accurate representation of truth for cluster or community
        membership of nodes. The keys are vertex labels and the values are the cluster/community/partition labels.
    :param Dict[Any, int] test_partition: The partition mapping to compare against the primary partition. The keys are
        vertex labels and the values are the cluster/community/partition labels.
    :return: The adjusted rand index for the two mappings
    :rtype: float
    :raises ValueError: If the primary partition and test partition do not have an identical vertex label set.
    """
    # Key views compare like sets, so this rejects any missing or extra vertex on either side.
    if primary_partition.keys() != test_partition.keys():
        raise ValueError("The reference partition provided does not contain the exact same keys as the predicted "
                         "clusters; an ari score cannot be generated automatically.")

    size = len(primary_partition)
    # Integer arrays: partition labels are stored as ints, matching the declared value type of both mappings.
    primary = np.empty(size, dtype=int)
    test = np.empty(size, dtype=int)
    # Walk the primary mapping once so that position i in both arrays refers to the same vertex.
    for i, (vertex, primary_label) in enumerate(primary_partition.items()):
        primary[i] = primary_label
        test[i] = test_partition[vertex]

    return adjusted_rand_score(labels_true=primary, labels_pred=test)