# Source code for topologic.io.edge_detector

# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

from collections import Counter
from itertools import combinations

from topologic.io.potential_edge_column_pair import PotentialEdgeColumnPair
from topologic.io.graph_properties import GraphProperties
from topologic.io.csv_loader import CsvDataset

from typing import Dict


def find_edges(
    csv_dataset: CsvDataset,
    common_values_count: int = 20,
    rare_values_count: int = 20,
) -> GraphProperties:
    """
    Profile the columns of a CSV dataset and rank every pair of columns as a potential edge.

    Each column's values are tallied into a ``Counter``. For every pair of headers a
    ``PotentialEdgeColumnPair`` is created from the two header names (ordered alphabetically)
    and the number of distinct values that appear in both columns. The pairs are then sorted
    by ``int(pair.score())``, highest first.

    :param csv_dataset: Dataset providing ``headers()`` and ``reader()``; each row yielded by
        ``reader()`` is matched positionally against the headers.
    :param int common_values_count: Maximum number of most frequent ``(value, count)`` entries
        to report per column.
    :param int rare_values_count: Maximum number of least frequent ``(value, count)`` entries
        to report per column. The entries keep the descending-frequency order produced by
        ``Counter.most_common()``. A value of zero or less yields an empty list.
    :return: A ``GraphProperties`` built from the headers, the ranked column pairs, and the
        per-column common and rare value lists.
    """
    headers = csv_dataset.headers()
    values: Dict[str, Counter] = {header: Counter() for header in headers}

    for row in csv_dataset.reader():
        # zip stops at the shorter sequence: cells beyond the last header are ignored and
        # short rows simply contribute nothing to the trailing columns.
        for header, value in zip(headers, row):
            values[header][value] += 1

    result = []
    for source, destination in combinations(headers, 2):
        # Make sure that we store the columns in alphabetical order when we return our result.
        # This will make testing more deterministic.
        if destination < source:
            source, destination = destination, source
        # Counter intersection keeps only the keys present in both columns.
        shared_values = values[source] & values[destination]
        result.append(PotentialEdgeColumnPair(source, destination, len(shared_values)))
    result.sort(key=lambda pair: int(pair.score()), reverse=True)

    common_values = {}
    rare_values = {}
    for header in headers:
        column_counts = values[header]
        common_values[header] = column_counts.most_common(common_values_count)
        # Guard the slice: ``[-0:]`` is ``[0:]`` and would return *every* value instead of
        # none, and a negative count would drop the most common values instead.
        if rare_values_count > 0:
            rare_values[header] = column_counts.most_common()[-rare_values_count:]
        else:
            rare_values[header] = []

    return GraphProperties(
        headers,
        result,
        common_column_values=common_values,
        rare_column_values=rare_values,
    )