"""Compute cluster statistics for all (network, method) pairs + ground truth.""" import argparse import sys import os import json sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) import numpy as np import pandas as pd import igraph as ig from config import NETWORKS, METHODS, RESULTS_DIR from load_data import load_edge_list, load_communities def build_neighbors(edge_df): """Build adjacency dict from edge DataFrame.""" neighbors = {} for _, row in edge_df.iterrows(): s, t = row["src"], row["tgt"] neighbors.setdefault(s, set()).add(t) neighbors.setdefault(t, set()).add(s) return neighbors def compute_cluster_stats(network_name, method_name, com_path): """Compute statistics for a given community assignment.""" net = NETWORKS[network_name] edge_df = load_edge_list(net["edge_tsv"]) neighbors = build_neighbors(edge_df) all_nodes = set(neighbors.keys()) node2com = load_communities(com_path) # Build com2nodes com2nodes = {} for node, com in node2com.items(): com2nodes.setdefault(com, set()).add(node) # Separate singletons vs non-singletons non_singleton_coms = {c: nodes for c, nodes in com2nodes.items() if len(nodes) > 1} singleton_coms = {c: nodes for c, nodes in com2nodes.items() if len(nodes) == 1} # Nodes in non-singleton clusters nodes_in_clusters = set() for nodes in non_singleton_coms.values(): nodes_in_clusters.update(nodes) node_coverage = len(nodes_in_clusters) / len(all_nodes) if all_nodes else 0.0 # Per-cluster stats cluster_details = [] total_clusters = len(non_singleton_coms) for ci, (com_id, nodes) in enumerate(sorted(non_singleton_coms.items())): n = len(nodes) # Internal edges m_internal = 0 c_boundary = 0 for node in nodes: for nbr in neighbors.get(node, set()): if nbr in nodes: m_internal += 1 else: c_boundary += 1 m_internal //= 2 # each edge counted twice edge_density = 2 * m_internal / (n * (n - 1)) if n > 1 else 0.0 degree_density = m_internal / n if n > 0 else 0.0 conductance = c_boundary / (2 * m_internal + c_boundary) if (2 * m_internal + c_boundary) > 0 else 0.0 # Minimum edge cut via igraph mincut = 0 if n >= 2 and m_internal >= 1: node_list = sorted(nodes) local_map = {nd: i for i, nd in enumerate(node_list)} edges = [] for nd in node_list: for nbr in neighbors.get(nd, set()): if nbr in nodes and local_map[nd] < local_map[nbr]: edges.append((local_map[nd], local_map[nbr])) sg = ig.Graph(n=n, edges=edges, directed=False) mincut = sg.mincut().value mincut_over_log10n = mincut / np.log10(n) if n > 1 else 0.0 cluster_details.append({ "com_id": com_id, "n": n, "m_internal": m_internal, "c_boundary": c_boundary, "edge_density": edge_density, "degree_density": degree_density, "conductance": conductance, "mincut": int(mincut), "mincut_over_log10n": mincut_over_log10n, }) if (ci + 1) % 500 == 0: print(f" ... {ci+1}/{total_clusters} clusters processed") # Per-node mixing parameter mixing_params = [] for node in all_nodes: if node not in node2com: mixing_params.append(1.0) # outlier continue my_com = node2com[node] nbrs = neighbors.get(node, set()) if len(nbrs) == 0: mixing_params.append(0.0) continue n_in = sum(1 for nbr in nbrs if node2com.get(nbr) == my_com) mixing_params.append(1.0 - n_in / len(nbrs)) summary = { "network": network_name, "method": method_name, "n_nodes": len(all_nodes), "n_clusters_total": len(com2nodes), "n_clusters_non_singleton": len(non_singleton_coms), "n_singleton_clusters": len(singleton_coms), "node_coverage": node_coverage, "mean_mixing_param": np.mean(mixing_params), "median_mixing_param": np.median(mixing_params), "mean_cluster_size": np.mean([d["n"] for d in cluster_details]) if cluster_details else 0, "median_cluster_size": np.median([d["n"] for d in cluster_details]) if cluster_details else 0, "mean_edge_density": np.mean([d["edge_density"] for d in cluster_details]) if cluster_details else 0, "median_edge_density": np.median([d["edge_density"] for d in cluster_details]) if cluster_details else 0, "mean_conductance": np.mean([d["conductance"] for d in cluster_details]) if cluster_details else 0, "mean_degree_density": np.mean([d["degree_density"] for d in cluster_details]) if cluster_details else 0, "mean_mincut": np.mean([d["mincut"] for d in cluster_details]) if cluster_details else 0, "median_mincut": np.median([d["mincut"] for d in cluster_details]) if cluster_details else 0, "mean_mincut_over_log10n": np.mean([d["mincut_over_log10n"] for d in cluster_details]) if cluster_details else 0, "n_connected": sum(1 for d in cluster_details if d["mincut"] > 0), "n_disconnected": sum(1 for d in cluster_details if d["mincut"] == 0), "n_wellconnected": sum(1 for d in cluster_details if d["mincut"] > np.log10(d["n"])), } return summary, cluster_details, mixing_params def compute_all_stats(): """Compute stats for all methods on all networks plus ground truth.""" all_summaries = [] stats_dir = os.path.join(RESULTS_DIR, "stats") os.makedirs(stats_dir, exist_ok=True) for net_name in NETWORKS: net = NETWORKS[net_name] # Ground truth print(f"Computing stats: {net_name} / ground_truth") summary, details, mixing = compute_cluster_stats( net_name, "ground_truth", net["com_gt_tsv"] ) all_summaries.append(summary) _save_details(stats_dir, net_name, "ground_truth", details, mixing) # Each method for method in METHODS: m_name = method["name"] est_path = os.path.join(RESULTS_DIR, net_name, m_name, "com.tsv") if not os.path.exists(est_path): print(f" WARNING: {est_path} not found, skipping") continue print(f"Computing stats: {net_name} / {m_name}") summary, details, mixing = compute_cluster_stats( net_name, m_name, est_path ) all_summaries.append(summary) _save_details(stats_dir, net_name, m_name, details, mixing) df = pd.DataFrame(all_summaries) out_path = os.path.join(stats_dir, "cluster_stats_summary.csv") df.to_csv(out_path, index=False) print(f"\nCluster stats saved to {out_path}") print(df.to_string(index=False)) return df def _save_details(stats_dir, net_name, method_name, details, mixing): """Save per-cluster details and mixing params as JSON.""" out_dir = os.path.join(stats_dir, net_name, method_name) os.makedirs(out_dir, exist_ok=True) with open(os.path.join(out_dir, "cluster_details.json"), "w") as f: json.dump(details, f, indent=2) with open(os.path.join(out_dir, "mixing_params.json"), "w") as f: json.dump(mixing, f) if __name__ == "__main__": compute_all_stats()