EC-SBM community detection analysis: full pipeline and writeup

Implement community detection on 3 EC-SBM networks (polblogs, topology, internet_as) using 5 methods (Leiden-Mod, Leiden-CPM at 0.1 and 0.01, Infomap, graph-tool SBM). Compute AMI/ARI/NMI accuracy, cluster statistics, and generate figures and LaTeX report.
author: YurenHao0426 <blackhao0426@gmail.com> 2026-02-24 08:40:49 +0000
committer: YurenHao0426 <blackhao0426@gmail.com> 2026-02-24 08:40:49 +0000
commit: 8f63cf9f41bbdb8d55cd4679872d2b4ae2129324 (patch)
tree: ab5c95888849e854f2346db856c7edece7c8b8a7 /scripts/compute_stats.py
1 files changed, 165 insertions, 0 deletions
diff --git a/scripts/compute_stats.py b/scripts/compute_stats.py
new file mode 100644
index 0000000..2e88252
--- /dev/null
+++ b/scripts/compute_stats.py
@@ -0,0 +1,165 @@
+"""Compute cluster statistics for all (network, method) pairs + ground truth."""
+
+import argparse
+import sys
+import os
+import json
+sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
+
+import numpy as np
+import pandas as pd
+
+from config import NETWORKS, METHODS, RESULTS_DIR
+from load_data import load_edge_list, load_communities
+
+
+def build_neighbors(edge_df):
+    """Return {node: set(neighbours)} for the undirected edges in edge_df's "src"/"tgt" columns."""
+    neighbors = {}
+    # Zip the two columns rather than iterrows(): no per-row Series, no dtype upcasting of node ids.
+    for s, t in zip(edge_df["src"], edge_df["tgt"]):
+        neighbors.setdefault(s, set()).add(t)
+        neighbors.setdefault(t, set()).add(s)
+    return neighbors
+
+
+def compute_cluster_stats(network_name, method_name, com_path):
+    """Summarise one clustering of one network.
+
+    Loads the edge list registered for `network_name` in NETWORKS and the
+    node-to-community mapping stored at `com_path`, then derives per-cluster
+    and per-node statistics. Single-member clusters are counted, but they are
+    left out of the per-cluster statistics and of the node coverage.
+
+    Returns `(summary, cluster_details, mixing_params)`: a flat dict of
+    aggregates, one dict per non-singleton cluster (sorted by community id),
+    and one mixing parameter per node that appears in the edge list.
+    """
+    adjacency = build_neighbors(load_edge_list(NETWORKS[network_name]["edge_tsv"]))
+    all_nodes = set(adjacency.keys())
+    node2com = load_communities(com_path)
+
+    # Invert the assignment: community id -> set of member nodes.
+    members_of = {}
+    for node, com in node2com.items():
+        members_of.setdefault(com, set()).add(node)
+
+    multi = {c: m for c, m in members_of.items() if len(m) > 1}
+    n_singletons = sum(1 for m in members_of.values() if len(m) == 1)
+
+    # Coverage: clustered (non-singleton) nodes relative to the graph's nodes.
+    clustered = set().union(*multi.values())
+    node_coverage = len(clustered) / len(all_nodes) if all_nodes else 0.0
+
+    cluster_details = []
+    for com_id in sorted(multi):
+        nodes = multi[com_id]
+        n = len(nodes)
+        inner_ends = 0  # endpoints of internal edges; each edge contributes two
+        c_boundary = 0  # edges leaving the cluster
+        for node in nodes:
+            nbrs = adjacency.get(node, set())
+            inside = len(nbrs & nodes)
+            inner_ends += inside
+            c_boundary += len(nbrs) - inside
+        m_internal = inner_ends // 2
+        volume = 2 * m_internal + c_boundary
+        cluster_details.append({
+            "com_id": com_id,
+            "n": n,
+            "m_internal": m_internal,
+            "c_boundary": c_boundary,
+            # n >= 2 here, so neither density can divide by zero.
+            "edge_density": 2 * m_internal / (n * (n - 1)),
+            "degree_density": m_internal / n,
+            "conductance": c_boundary / volume if volume > 0 else 0.0,
+        })
+
+    def mixing_of(node):
+        # Share of the node's neighbours that lie outside its own community.
+        if node not in node2com:
+            return 1.0  # unassigned node is treated as an outlier
+        nbrs = adjacency.get(node, set())
+        if not nbrs:
+            return 0.0
+        own = node2com[node]
+        return 1.0 - sum(1 for nbr in nbrs if node2com.get(nbr) == own) / len(nbrs)
+
+    mixing_params = [mixing_of(node) for node in all_nodes]
+
+    def over_clusters(stat, field):
+        # Aggregate one per-cluster field; 0 when no non-singleton cluster exists.
+        return stat([d[field] for d in cluster_details]) if cluster_details else 0
+
+    summary = {
+        "network": network_name,
+        "method": method_name,
+        "n_nodes": len(all_nodes),
+        "n_clusters_total": len(members_of),
+        "n_clusters_non_singleton": len(multi),
+        "n_singleton_clusters": n_singletons,
+        "node_coverage": node_coverage,
+        "mean_mixing_param": np.mean(mixing_params),
+        "median_mixing_param": np.median(mixing_params),
+        "mean_cluster_size": over_clusters(np.mean, "n"),
+        "median_cluster_size": over_clusters(np.median, "n"),
+        "mean_edge_density": over_clusters(np.mean, "edge_density"),
+        "median_edge_density": over_clusters(np.median, "edge_density"),
+        "mean_conductance": over_clusters(np.mean, "conductance"),
+        "mean_degree_density": over_clusters(np.mean, "degree_density"),
+    }
+
+    return summary, cluster_details, mixing_params
+
+
+def compute_all_stats():
+    """Run compute_cluster_stats for every network's ground truth and methods.
+
+    Per-clustering details are written below RESULTS_DIR/stats/<network>/<method>/
+    and the combined table to RESULTS_DIR/stats/cluster_stats_summary.csv, which
+    is also printed. Method outputs missing on disk are skipped with a warning.
+    Returns the summary DataFrame, one row per clustering that was processed.
+    """
+    stats_dir = os.path.join(RESULTS_DIR, "stats")
+    os.makedirs(stats_dir, exist_ok=True)
+
+    def clusterings(net_name, net):
+        # Yields (label, path, may_be_missing): ground truth first, then METHODS.
+        yield "ground_truth", net["com_gt_tsv"], False
+        for method in METHODS:
+            m_name = method["name"]
+            yield m_name, os.path.join(RESULTS_DIR, net_name, m_name, "com.tsv"), True
+
+    rows = []
+    for net_name, net in NETWORKS.items():
+        for label, com_path, may_be_missing in clusterings(net_name, net):
+            if may_be_missing and not os.path.exists(com_path):
+                print(f"  WARNING: {com_path} not found, skipping")
+                continue
+            print(f"Computing stats: {net_name} / {label}")
+            summary, details, mixing = compute_cluster_stats(
+                net_name, label, com_path
+            )
+            rows.append(summary)
+            _save_details(stats_dir, net_name, label, details, mixing)
+
+    df = pd.DataFrame(rows)
+    out_path = os.path.join(stats_dir, "cluster_stats_summary.csv")
+    df.to_csv(out_path, index=False)
+    print(f"\nCluster stats saved to {out_path}")
+    print(df.to_string(index=False))
+    return df
+
+
+def _save_details(stats_dir, net_name, method_name, details, mixing):
+    """Write details (indented) and mixing (compact) as JSON under stats_dir/net_name/method_name/."""
+    out_dir = os.path.join(stats_dir, net_name, method_name)
+    os.makedirs(out_dir, exist_ok=True)
+    outputs = (("cluster_details.json", details, 2), ("mixing_params.json", mixing, None))
+    for filename, payload, indent in outputs:
+        with open(os.path.join(out_dir, filename), "w") as f:
+            json.dump(payload, f, indent=indent)
+
+
+if __name__ == "__main__":
+    compute_all_stats()  # script entry point: compute and save stats for every network/method pair
author	YurenHao0426 <blackhao0426@gmail.com>	2026-02-24 08:40:49 +0000
committer	YurenHao0426 <blackhao0426@gmail.com>	2026-02-24 08:40:49 +0000
commit	8f63cf9f41bbdb8d55cd4679872d2b4ae2129324 (patch)
tree	ab5c95888849e854f2346db856c7edece7c8b8a7 /scripts/compute_stats.py