diff options
| author | YurenHao0426 <blackhao0426@gmail.com> | 2026-02-24 08:40:49 +0000 |
|---|---|---|
| committer | YurenHao0426 <blackhao0426@gmail.com> | 2026-02-24 08:40:49 +0000 |
| commit | 8f63cf9f41bbdb8d55cd4679872d2b4ae2129324 (patch) | |
| tree | ab5c95888849e854f2346db856c7edece7c8b8a7 /scripts/compute_stats.py | |
EC-SBM community detection analysis: full pipeline and writeup
Implement community detection on 3 EC-SBM networks (polblogs, topology,
internet_as) using 5 methods (Leiden-Mod, Leiden-CPM at 0.1 and 0.01,
Infomap, graph-tool SBM). Compute AMI/ARI/NMI accuracy, cluster statistics,
and generate figures and LaTeX report.
Diffstat (limited to 'scripts/compute_stats.py')
| -rw-r--r-- | scripts/compute_stats.py | 165 |
1 files changed, 165 insertions, 0 deletions
# diff --git a/scripts/compute_stats.py b/scripts/compute_stats.py
# new file mode 100644, index 0000000..2e88252 (--- /dev/null +++ b/scripts/compute_stats.py)
"""Compute cluster statistics for all (network, method) pairs + ground truth."""

import argparse
import sys
import os
import json
# Make the sibling modules (config, load_data) importable when this file is
# executed directly as a script; must run before the project imports below.
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))

import numpy as np
import pandas as pd

from config import NETWORKS, METHODS, RESULTS_DIR
from load_data import load_edge_list, load_communities


def build_neighbors(edge_df):
    """Build an undirected adjacency dict from an edge DataFrame.

    Args:
        edge_df: DataFrame with ``src`` and ``tgt`` columns, one row per edge.

    Returns:
        Dict mapping every edge endpoint to the set of its neighbors. Each
        edge is inserted in both directions; repeated edges collapse because
        neighbors are stored in sets.
    """
    neighbors = {}
    # Iterate the two columns directly rather than ``iterrows()``: iterrows
    # builds a Series per row (slow on large edge lists) and upcasts rows with
    # mixed dtypes, which can silently turn integer node ids into floats.
    for s, t in zip(edge_df["src"].tolist(), edge_df["tgt"].tolist()):
        neighbors.setdefault(s, set()).add(t)
        neighbors.setdefault(t, set()).add(s)
    return neighbors


def _cluster_edge_counts(nodes, neighbors):
    """Return ``(m_internal, c_boundary)`` for the node set *nodes*."""
    internal_endpoints = 0
    c_boundary = 0
    for node in nodes:
        for nbr in neighbors.get(node, ()):
            if nbr in nodes:
                internal_endpoints += 1
            else:
                c_boundary += 1
    # Each internal edge is seen once from each of its two endpoints.
    return internal_endpoints // 2, c_boundary


def _mixing_parameters(all_nodes, node2com, neighbors):
    """Return the per-node mixing parameter for every node in *all_nodes*.

    The value is the fraction of a node's neighbors assigned to a different
    community (or to none). Nodes missing from *node2com* are treated as
    outliers and get 1.0; nodes without neighbors get 0.0.
    """
    mixing_params = []
    for node in all_nodes:
        if node not in node2com:
            mixing_params.append(1.0)  # outlier
            continue
        my_com = node2com[node]
        nbrs = neighbors.get(node, set())
        if not nbrs:
            mixing_params.append(0.0)
            continue
        n_in = sum(1 for nbr in nbrs if node2com.get(nbr) == my_com)
        mixing_params.append(1.0 - n_in / len(nbrs))
    return mixing_params


def _aggregate(func, values):
    """Apply *func* (e.g. ``np.mean``) to *values*, or return 0 when empty."""
    return func(values) if values else 0


def compute_cluster_stats(network_name, method_name, com_path, neighbors=None):
    """Compute statistics for a given community assignment.

    Args:
        network_name: Key into ``NETWORKS``; used to locate the edge list when
            *neighbors* is not supplied, and recorded in the summary.
        method_name: Label recorded in the summary (e.g. ``"ground_truth"``).
        com_path: Path passed to ``load_communities``; the result is used as a
            node -> community mapping.
        neighbors: Optional precomputed adjacency dict as returned by
            ``build_neighbors``. When omitted, the edge list of
            *network_name* is loaded and the adjacency is built here. Passing
            it lets callers reuse one adjacency across several assignments of
            the same network.

    Returns:
        Tuple ``(summary, cluster_details, mixing_params)``:
        ``summary`` is a flat dict of aggregate statistics,
        ``cluster_details`` is a list with one dict per non-singleton cluster
        (ordered by community id), and ``mixing_params`` is a list with one
        float per graph node.
    """
    if neighbors is None:
        net = NETWORKS[network_name]
        neighbors = build_neighbors(load_edge_list(net["edge_tsv"]))

    # Only nodes that appear as an edge endpoint are considered graph nodes.
    all_nodes = set(neighbors.keys())
    node2com = load_communities(com_path)

    # Invert the assignment: community id -> set of member nodes.
    com2nodes = {}
    for node, com in node2com.items():
        com2nodes.setdefault(com, set()).add(node)

    # Separate singletons vs non-singletons.
    non_singleton_coms = {c: nodes for c, nodes in com2nodes.items() if len(nodes) > 1}
    singleton_coms = {c: nodes for c, nodes in com2nodes.items() if len(nodes) == 1}

    # Nodes in non-singleton clusters.
    nodes_in_clusters = set()
    for nodes in non_singleton_coms.values():
        nodes_in_clusters.update(nodes)

    # NOTE(review): the numerator counts every clustered node, including any
    # that never appear in the edge list, while the denominator only counts
    # edge endpoints -- confirm the inputs contain no isolated nodes.
    node_coverage = len(nodes_in_clusters) / len(all_nodes) if all_nodes else 0.0

    # Per-cluster stats, in community-id order.
    cluster_details = []
    for com_id, nodes in sorted(non_singleton_coms.items(), key=lambda item: item[0]):
        n = len(nodes)
        m_internal, c_boundary = _cluster_edge_counts(nodes, neighbors)

        # Fraction of the n*(n-1)/2 possible internal edges that exist.
        edge_density = 2 * m_internal / (n * (n - 1)) if n > 1 else 0.0
        degree_density = m_internal / n if n > 0 else 0.0
        # Boundary edges over the cluster's total degree (2*internal + boundary).
        volume = 2 * m_internal + c_boundary
        conductance = c_boundary / volume if volume > 0 else 0.0

        cluster_details.append({
            "com_id": com_id,
            "n": n,
            "m_internal": m_internal,
            "c_boundary": c_boundary,
            "edge_density": edge_density,
            "degree_density": degree_density,
            "conductance": conductance,
        })

    # Per-node mixing parameter.
    mixing_params = _mixing_parameters(all_nodes, node2com, neighbors)

    sizes = [d["n"] for d in cluster_details]
    edge_densities = [d["edge_density"] for d in cluster_details]
    conductances = [d["conductance"] for d in cluster_details]
    degree_densities = [d["degree_density"] for d in cluster_details]

    summary = {
        "network": network_name,
        "method": method_name,
        "n_nodes": len(all_nodes),
        "n_clusters_total": len(com2nodes),
        "n_clusters_non_singleton": len(non_singleton_coms),
        "n_singleton_clusters": len(singleton_coms),
        "node_coverage": node_coverage,
        "mean_mixing_param": np.mean(mixing_params),
        "median_mixing_param": np.median(mixing_params),
        "mean_cluster_size": _aggregate(np.mean, sizes),
        "median_cluster_size": _aggregate(np.median, sizes),
        "mean_edge_density": _aggregate(np.mean, edge_densities),
        "median_edge_density": _aggregate(np.median, edge_densities),
        "mean_conductance": _aggregate(np.mean, conductances),
        "mean_degree_density": _aggregate(np.mean, degree_densities),
    }

    return summary, cluster_details, mixing_params


def compute_all_stats():
    """Compute stats for all methods on all networks plus ground truth.

    For every network in ``NETWORKS`` the ground-truth assignment and each
    method's ``com.tsv`` under ``RESULTS_DIR`` are summarised; methods whose
    output file is missing are skipped with a warning. Per-run details are
    written as JSON and the combined summary as
    ``<RESULTS_DIR>/stats/cluster_stats_summary.csv``.

    Returns:
        DataFrame with one summary row per (network, method) pair processed.
    """
    all_summaries = []
    stats_dir = os.path.join(RESULTS_DIR, "stats")
    os.makedirs(stats_dir, exist_ok=True)

    for net_name in NETWORKS:
        net = NETWORKS[net_name]

        # Ground truth
        print(f"Computing stats: {net_name} / ground_truth")
        # The adjacency depends only on the network, so build it once and
        # reuse it for the ground truth and every method below.
        neighbors = build_neighbors(load_edge_list(net["edge_tsv"]))
        summary, details, mixing = compute_cluster_stats(
            net_name, "ground_truth", net["com_gt_tsv"], neighbors=neighbors
        )
        all_summaries.append(summary)
        _save_details(stats_dir, net_name, "ground_truth", details, mixing)

        # Each method
        for method in METHODS:
            m_name = method["name"]
            est_path = os.path.join(RESULTS_DIR, net_name, m_name, "com.tsv")
            if not os.path.exists(est_path):
                print(f"  WARNING: {est_path} not found, skipping")
                continue
            print(f"Computing stats: {net_name} / {m_name}")
            summary, details, mixing = compute_cluster_stats(
                net_name, m_name, est_path, neighbors=neighbors
            )
            all_summaries.append(summary)
            _save_details(stats_dir, net_name, m_name, details, mixing)

    df = pd.DataFrame(all_summaries)
    out_path = os.path.join(stats_dir, "cluster_stats_summary.csv")
    df.to_csv(out_path, index=False)
    print(f"\nCluster stats saved to {out_path}")
    print(df.to_string(index=False))
    return df


def _json_default(obj):
    """Convert NumPy scalars to native Python values for ``json.dump``.

    Only called by the encoder for objects it cannot serialise itself, so
    output for already-serialisable data is unchanged.
    """
    if isinstance(obj, np.generic):
        return obj.item()
    raise TypeError(f"Object of type {type(obj).__name__} is not JSON serializable")


def _save_details(stats_dir, net_name, method_name, details, mixing):
    """Save per-cluster details and mixing params as JSON.

    Writes ``cluster_details.json`` and ``mixing_params.json`` into
    ``<stats_dir>/<net_name>/<method_name>/``, creating the directory if
    needed.
    """
    out_dir = os.path.join(stats_dir, net_name, method_name)
    os.makedirs(out_dir, exist_ok=True)
    with open(os.path.join(out_dir, "cluster_details.json"), "w", encoding="utf-8") as f:
        json.dump(details, f, indent=2, default=_json_default)
    with open(os.path.join(out_dir, "mixing_params.json"), "w", encoding="utf-8") as f:
        json.dump(mixing, f, default=_json_default)


if __name__ == "__main__":
    compute_all_stats()
