diff options
| author | YurenHao0426 <blackhao0426@gmail.com> | 2026-02-24 08:40:49 +0000 |
|---|---|---|
| committer | YurenHao0426 <blackhao0426@gmail.com> | 2026-02-24 08:40:49 +0000 |
| commit | 8f63cf9f41bbdb8d55cd4679872d2b4ae2129324 (patch) | |
| tree | ab5c95888849e854f2346db856c7edece7c8b8a7 /scripts | |
EC-SBM community detection analysis: full pipeline and writeup
Implement community detection on 3 EC-SBM networks (polblogs, topology,
internet_as) using 5 methods (Leiden-Mod, Leiden-CPM at 0.1 and 0.01,
Infomap, graph-tool SBM). Compute AMI/ARI/NMI accuracy, cluster statistics,
and generate figures and LaTeX report.
Diffstat (limited to 'scripts')
| -rw-r--r-- | scripts/compute_accuracy.py | 92 | ||||
| -rw-r--r-- | scripts/compute_stats.py | 165 | ||||
| -rw-r--r-- | scripts/config.py | 32 | ||||
| -rw-r--r-- | scripts/generate_plots.py | 324 | ||||
| -rw-r--r-- | scripts/load_data.py | 74 | ||||
| -rw-r--r-- | scripts/run_all.py | 67 | ||||
| -rw-r--r-- | scripts/run_graphtool_sbm.py | 47 | ||||
| -rw-r--r-- | scripts/run_infomap.py | 61 | ||||
| -rw-r--r-- | scripts/run_leiden.py | 62 |
9 files changed, 924 insertions, 0 deletions
def align_labels(gt_com, est_com, edge_path):
    """Align ground truth and estimated labels over the full node set from edges.
    Nodes missing from a clustering get unique singleton community IDs."""
    edge_df = load_edge_list(edge_path)
    universe = sorted(set(
        pd.unique(edge_df[["src", "tgt"]].values.ravel("K"))
    ))

    def labels_for(assignment, prefix):
        # Start singleton IDs above the largest numeric community ID already
        # present, then hand one fresh ID to every uncovered node.
        next_id = max(
            (int(v) for v in assignment.values() if v.lstrip('-').isdigit()),
            default=0,
        ) + 1
        labels = []
        for node in universe:
            if node in assignment:
                labels.append(assignment[node])
            else:
                labels.append(f"{prefix}_singleton_{next_id}")
                next_id += 1
        return labels

    return labels_for(gt_com, "gt"), labels_for(est_com, "est")
def build_neighbors(edge_df):
    """Build an undirected adjacency dict ``{node: set(neighbors)}`` from an
    edge DataFrame with ``src``/``tgt`` columns.

    Both endpoints of every edge get an entry, so every endpoint appears as a
    key; a self-loop puts the node into its own neighbor set.
    """
    neighbors = {}
    # Iterate the raw columns instead of DataFrame.iterrows(): iterrows builds
    # a Series object per row and is orders of magnitude slower on large edge
    # lists, while producing the same (s, t) pairs.
    for s, t in zip(edge_df["src"], edge_df["tgt"]):
        neighbors.setdefault(s, set()).add(t)
        neighbors.setdefault(t, set()).add(s)
    return neighbors
def compute_cluster_stats(network_name, method_name, com_path):
    """Compute cluster-level and node-level statistics for one clustering.

    Parameters
    ----------
    network_name : key into NETWORKS (used to load the edge list).
    method_name : label recorded in the summary row.
    com_path : path to the node<TAB>community TSV to evaluate.

    Returns ``(summary, cluster_details, mixing_params)``: a flat dict of
    aggregate statistics, one dict per non-singleton cluster, and one mixing
    parameter per graph node (in sorted node order).
    """
    net = NETWORKS[network_name]
    edge_df = load_edge_list(net["edge_tsv"])
    neighbors = build_neighbors(edge_df)

    all_nodes = set(neighbors.keys())
    node2com = load_communities(com_path)

    # Invert node -> community into community -> member set.
    com2nodes = {}
    for node, com in node2com.items():
        com2nodes.setdefault(com, set()).add(node)

    non_singleton_coms = {c: m for c, m in com2nodes.items() if len(m) > 1}
    singleton_coms = {c: m for c, m in com2nodes.items() if len(m) == 1}

    # Coverage: fraction of graph nodes placed in a non-singleton cluster.
    nodes_in_clusters = set()
    for members in non_singleton_coms.values():
        nodes_in_clusters.update(members)
    node_coverage = len(nodes_in_clusters) / len(all_nodes) if all_nodes else 0.0

    cluster_details = []
    for com_id, members in sorted(non_singleton_coms.items()):
        n = len(members)
        m_internal = 0  # internal edge endpoints (halved below)
        c_boundary = 0  # edges with exactly one endpoint inside the cluster
        for node in members:
            for nbr in neighbors.get(node, set()):
                if nbr in members:
                    m_internal += 1
                else:
                    c_boundary += 1
        m_internal //= 2  # each internal edge was counted from both endpoints

        volume = 2 * m_internal + c_boundary
        cluster_details.append({
            "com_id": com_id,
            "n": n,
            "m_internal": m_internal,
            "c_boundary": c_boundary,
            "edge_density": 2 * m_internal / (n * (n - 1)) if n > 1 else 0.0,
            "degree_density": m_internal / n if n > 0 else 0.0,
            "conductance": c_boundary / volume if volume > 0 else 0.0,
        })

    # Per-node mixing parameter: fraction of a node's edges leaving its own
    # community; nodes absent from the clustering count as outliers (1.0).
    # Iterate in sorted order so the JSON written by callers is deterministic
    # across runs (plain set order varies with string-hash randomization).
    mixing_params = []
    for node in sorted(all_nodes):
        if node not in node2com:
            mixing_params.append(1.0)
            continue
        my_com = node2com[node]
        nbrs = neighbors.get(node, set())
        if not nbrs:
            mixing_params.append(0.0)
            continue
        n_in = sum(1 for nbr in nbrs if node2com.get(nbr) == my_com)
        mixing_params.append(1.0 - n_in / len(nbrs))

    sizes = [d["n"] for d in cluster_details]
    densities = [d["edge_density"] for d in cluster_details]

    summary = {
        "network": network_name,
        "method": method_name,
        "n_nodes": len(all_nodes),
        "n_clusters_total": len(com2nodes),
        "n_clusters_non_singleton": len(non_singleton_coms),
        "n_singleton_clusters": len(singleton_coms),
        "node_coverage": node_coverage,
        # Guard the empty cases: np.mean([]) emits a RuntimeWarning and NaN.
        "mean_mixing_param": np.mean(mixing_params) if mixing_params else 0,
        "median_mixing_param": np.median(mixing_params) if mixing_params else 0,
        "mean_cluster_size": np.mean(sizes) if sizes else 0,
        "median_cluster_size": np.median(sizes) if sizes else 0,
        "mean_edge_density": np.mean(densities) if densities else 0,
        "median_edge_density": np.median(densities) if densities else 0,
        "mean_conductance": np.mean([d["conductance"] for d in cluster_details]) if cluster_details else 0,
        "mean_degree_density": np.mean([d["degree_density"] for d in cluster_details]) if cluster_details else 0,
    }

    return summary, cluster_details, mixing_params
def compute_all_stats():
    """Compute stats for all methods on all networks plus ground truth."""
    summaries = []
    stats_dir = os.path.join(RESULTS_DIR, "stats")
    os.makedirs(stats_dir, exist_ok=True)

    for net_name, net in NETWORKS.items():
        # Ground truth is always evaluated; it is expected to exist.
        print(f"Computing stats: {net_name} / ground_truth")
        summary, details, mixing = compute_cluster_stats(
            net_name, "ground_truth", net["com_gt_tsv"]
        )
        summaries.append(summary)
        _save_details(stats_dir, net_name, "ground_truth", details, mixing)

        # Estimated clusterings are optional: skip whatever was not produced.
        for method in METHODS:
            m_name = method["name"]
            est_path = os.path.join(RESULTS_DIR, net_name, m_name, "com.tsv")
            if not os.path.exists(est_path):
                print(f" WARNING: {est_path} not found, skipping")
                continue
            print(f"Computing stats: {net_name} / {m_name}")
            summary, details, mixing = compute_cluster_stats(
                net_name, m_name, est_path
            )
            summaries.append(summary)
            _save_details(stats_dir, net_name, m_name, details, mixing)

    stats_df = pd.DataFrame(summaries)
    out_path = os.path.join(stats_dir, "cluster_stats_summary.csv")
    stats_df.to_csv(out_path, index=False)
    print(f"\nCluster stats saved to {out_path}")
    print(stats_df.to_string(index=False))
    return stats_df


def _save_details(stats_dir, net_name, method_name, details, mixing):
    """Save per-cluster details and mixing params as JSON."""
    out_dir = os.path.join(stats_dir, net_name, method_name)
    os.makedirs(out_dir, exist_ok=True)
    with open(os.path.join(out_dir, "cluster_details.json"), "w") as fh:
        json.dump(details, fh, indent=2)
    with open(os.path.join(out_dir, "mixing_params.json"), "w") as fh:
        json.dump(mixing, fh)
def plot_accuracy_heatmap():
    """Create a heatmap of accuracy metrics (network x method)."""
    acc_path = os.path.join(RESULTS_DIR, "accuracy", "accuracy_table.csv")
    if not os.path.exists(acc_path):
        print("No accuracy table found, skipping heatmap")
        return
    acc_df = pd.read_csv(acc_path)
    net_order = list(NETWORKS.keys())

    for metric in ("ami", "ari", "nmi"):
        # Pivot to a network x method grid in canonical order.
        grid = (
            acc_df.pivot(index="network", columns="method", values=metric)
            .reindex(index=net_order, columns=METHOD_NAMES)
        )

        fig, ax = plt.subplots(figsize=(8, 3.5))
        image = ax.imshow(grid.values, cmap="YlOrRd", aspect="auto",
                          vmin=0, vmax=1)

        ax.set_xticks(range(len(METHOD_NAMES)))
        ax.set_xticklabels([METHOD_LABELS.get(m, m) for m in METHOD_NAMES],
                           rotation=30, ha="right", fontsize=9)
        ax.set_yticks(range(len(net_order)))
        ax.set_yticklabels(net_order, fontsize=10)

        # Annotate every populated cell; flip the text color on dark cells.
        for (row, col), cell in np.ndenumerate(grid.values):
            if not np.isnan(cell):
                ax.text(col, row, f"{cell:.3f}", ha="center", va="center",
                        fontsize=9, color="black" if cell < 0.6 else "white")

        plt.colorbar(image, ax=ax, fraction=0.046, pad=0.04)
        ax.set_title(f"{metric.upper()} Accuracy", fontsize=12)
        plt.tight_layout()
        plt.savefig(os.path.join(FIGURES_DIR, f"heatmap_{metric}.pdf"),
                    bbox_inches="tight")
        plt.close()
        print(f" Saved heatmap_{metric}.pdf")
def plot_cluster_size_distributions():
    """Histogram of cluster sizes per network/method."""
    stats_dir = os.path.join(RESULTS_DIR, "stats")

    for net_name in NETWORKS:
        panels = ["ground_truth"] + METHOD_NAMES
        fig, axes = plt.subplots(2, 3, figsize=(14, 8))
        flat_axes = axes.flatten()

        for pos, label in enumerate(panels):
            ax = flat_axes[pos]
            det_path = os.path.join(stats_dir, net_name, label,
                                    "cluster_details.json")
            if not os.path.exists(det_path):
                ax.set_title(METHOD_LABELS.get(label, label))
                ax.text(0.5, 0.5, "No data", ha="center", va="center",
                        transform=ax.transAxes)
                continue

            with open(det_path) as fh:
                sizes = [d["n"] for d in json.load(fh)]

            if sizes:
                ax.hist(sizes, bins=min(50, max(10, len(set(sizes)))),
                        edgecolor="black", alpha=0.7, color="steelblue")
            ax.set_title(METHOD_LABELS.get(label, label), fontsize=10)
            ax.set_xlabel("Cluster size")
            ax.set_ylabel("Count")
            # Heavy-tailed size distributions read better on a log axis.
            if sizes and max(sizes) > 100:
                ax.set_xscale("log")

        # Remove extra subplot if any (no-op for the current 6-panel layout).
        for pos in range(len(panels), len(flat_axes)):
            fig.delaxes(flat_axes[pos])

        fig.suptitle(f"Cluster Size Distribution — {net_name}", fontsize=13)
        plt.tight_layout()
        plt.savefig(os.path.join(FIGURES_DIR, f"cluster_sizes_{net_name}.pdf"),
                    bbox_inches="tight")
        plt.close()
        print(f" Saved cluster_sizes_{net_name}.pdf")
def plot_edge_density_boxplots():
    """Boxplots of edge density across methods for each network."""
    stats_dir = os.path.join(RESULTS_DIR, "stats")

    for net_name in NETWORKS:
        series, series_labels = [], []
        for label in ["ground_truth"] + METHOD_NAMES:
            det_path = os.path.join(stats_dir, net_name, label,
                                    "cluster_details.json")
            if not os.path.exists(det_path):
                continue
            with open(det_path) as fh:
                densities = [d["edge_density"] for d in json.load(fh)]
            if densities:
                series.append(densities)
                series_labels.append(METHOD_LABELS.get(label, label))

        if not series:
            continue

        fig, ax = plt.subplots(figsize=(9, 4))
        boxes = ax.boxplot(series, tick_labels=series_labels,
                           patch_artist=True, showfliers=False)
        for box in boxes["boxes"]:
            box.set_facecolor("lightblue")
        ax.set_ylabel("Edge Density")
        ax.set_title(f"Edge Density Distribution — {net_name}")
        plt.xticks(rotation=20, ha="right")
        plt.tight_layout()
        plt.savefig(os.path.join(FIGURES_DIR, f"edge_density_{net_name}.pdf"),
                    bbox_inches="tight")
        plt.close()
        print(f" Saved edge_density_{net_name}.pdf")
def plot_node_coverage_comparison():
    """Bar chart of node coverage per method/network."""
    stats_path = os.path.join(RESULTS_DIR, "stats", "cluster_stats_summary.csv")
    if not os.path.exists(stats_path):
        return

    summary = pd.read_csv(stats_path)
    net_names = list(NETWORKS.keys())
    bar_groups = ["ground_truth"] + METHOD_NAMES

    fig, ax = plt.subplots(figsize=(10, 4.5))
    x = np.arange(len(net_names))
    width = 0.13
    # Center the group of bars around each network's tick.
    offsets = np.arange(len(bar_groups)) - len(bar_groups) / 2 + 0.5
    palette = plt.cm.Set2(np.linspace(0, 1, len(bar_groups)))

    for pos, method in enumerate(bar_groups):
        heights = []
        for net in net_names:
            match = summary[(summary["network"] == net)
                            & (summary["method"] == method)]
            heights.append(match["node_coverage"].values[0] if len(match) > 0 else 0)
        ax.bar(x + offsets[pos] * width, heights, width,
               label=METHOD_LABELS.get(method, method), color=palette[pos])

    ax.set_xticks(x)
    ax.set_xticklabels(net_names)
    ax.set_ylabel("Node Coverage")
    ax.set_title("Node Coverage by Network and Method")
    ax.legend(fontsize=7, ncol=2)
    ax.set_ylim(0, 1.05)
    plt.tight_layout()
    plt.savefig(os.path.join(FIGURES_DIR, "node_coverage.pdf"),
                bbox_inches="tight")
    plt.close()
    print(" Saved node_coverage.pdf")
def generate_latex_accuracy_table():
    """Generate a LaTeX accuracy table."""
    acc_path = os.path.join(RESULTS_DIR, "accuracy", "accuracy_table.csv")
    if not os.path.exists(acc_path):
        return

    acc_df = pd.read_csv(acc_path)
    lines = [
        r"\begin{table}[htbp]",
        r"\centering",
        r"\caption{Community detection accuracy (AMI, ARI, NMI) on EC-SBM networks.}",
        r"\label{tab:accuracy}",
        r"\begin{tabular}{llrrr}",
        r"\toprule",
        r"Network & Method & AMI & ARI & NMI \\",
        r"\midrule",
    ]

    for net_name in NETWORKS:
        shown = net_name  # only print the network name on its first row
        for _, row in acc_df[acc_df["network"] == net_name].iterrows():
            m_label = METHOD_LABELS.get(row["method"], row["method"])
            lines.append(
                f"{shown} & {m_label} & {row['ami']:.4f} & {row['ari']:.4f} & {row['nmi']:.4f} \\\\"
            )
            shown = ""
        lines.append(r"\midrule")

    # The trailing rule after the last group becomes \bottomrule.
    lines[-1] = r"\bottomrule"
    lines += [r"\end{tabular}", r"\end{table}"]

    out_path = os.path.join(FIGURES_DIR, "accuracy_table.tex")
    with open(out_path, "w") as fh:
        fh.write("\n".join(lines))
    print(" Saved accuracy_table.tex")
def load_edge_list(path):
    """Read a headerless two-column TSV edge list into a DataFrame.

    Columns are named ``src`` and ``tgt`` and kept as strings so node
    identifiers are never reinterpreted as numbers; ``#`` starts a comment.
    """
    return pd.read_csv(
        path,
        sep="\t",
        header=None,
        names=["src", "tgt"],
        dtype=str,
        comment="#",
    )
def load_communities(path):
    """Load a tab-separated community file (no header: node, community).

    Blank lines and ``#`` comment lines are ignored, as are lines with fewer
    than two tab-separated fields; extra fields beyond the second are
    dropped.  Returns ``{node_str: community_str}``.
    """
    node2com = {}
    with open(path, "r") as fh:
        for raw in fh:
            stripped = raw.strip()
            if not stripped or stripped.startswith("#"):
                continue
            fields = stripped.split("\t")
            if len(fields) < 2:
                continue
            node2com[fields[0]] = fields[1]
    return node2com
def save_communities(node2com, path):
    """Save community assignments as TSV lines of ``node<TAB>community``.

    Rows are written in lexicographic node order so output is deterministic.
    Missing parent directories are created on demand.
    """
    import os
    parent = os.path.dirname(path)
    # os.makedirs("") raises FileNotFoundError, so only create the parent
    # when the path actually has a directory component (the original crashed
    # on bare filenames like "com.tsv").
    if parent:
        os.makedirs(parent, exist_ok=True)
    with open(path, "w") as fh:
        # Dict keys are unique, so sorting items() sorts by node name.
        for node, com in sorted(node2com.items()):
            fh.write(f"{node}\t{com}\n")
def run_graphtool_sbm(network_name):
    """Fit a flat (non-nested) SBM with graph-tool and save the blocks as
    the estimated communities for ``network_name``.

    Returns the ``{node_name: block_id_str}`` mapping that was written.
    """
    net = NETWORKS[network_name]
    edge_df = load_edge_list(net["edge_tsv"])
    g, name_to_idx, idx_to_name = build_graphtool_graph(edge_df)

    print(f" Graph: {g.num_vertices()} nodes, {g.num_edges()} edges")

    # Seed both numpy and graph-tool RNGs from the shared config constant so
    # runs are reproducible AND consistent with the other methods.  The
    # original hard-coded 42 here; config.SEED (= 42) is the single source
    # of truth, so behavior is unchanged.
    from config import SEED
    np.random.seed(SEED)
    gt.seed_rng(SEED)

    # Use minimize_blockmodel_dl for flat (non-nested) SBM
    state = gt.minimize_blockmodel_dl(g)

    # Extract block assignments
    blocks = state.get_blocks()
    n_blocks = len(set(blocks.a))
    print(f" Found {n_blocks} blocks")

    node2com = {idx_to_name[int(v)]: str(blocks[v]) for v in g.vertices()}

    out_path = os.path.join(RESULTS_DIR, network_name, "graphtool_sbm", "com.tsv")
    save_communities(node2com, out_path)
    print(f" Saved to {out_path}")
    return node2com
def run_infomap(network_name):
    """Run two-level undirected Infomap on one network and save the clustering.

    Nodes absent from Infomap's output are assigned fresh singleton
    community IDs so every graph node ends up labelled.  Returns the
    ``{node_name: community_str}`` mapping that was written.
    """
    net = NETWORKS[network_name]
    edge_df = load_edge_list(net["edge_tsv"])

    # Build a contiguous integer id for every node name.
    import pandas as pd
    all_nodes = pd.unique(edge_df[["src", "tgt"]].values.ravel("K"))
    name_to_idx = {name: i for i, name in enumerate(all_nodes)}
    idx_to_name = {i: name for name, i in name_to_idx.items()}

    n = len(all_nodes)
    print(f" Network: {n} nodes, {len(edge_df)} edges")

    im = infomap.Infomap(f"--two-level --flow-model undirected --seed {SEED}")

    # Feed edges straight from the columns; iterrows() builds a Series per
    # row and is far slower on large edge lists.
    for src, tgt in zip(edge_df["src"], edge_df["tgt"]):
        im.add_link(name_to_idx[src], name_to_idx[tgt])

    im.run()

    print(f" Found {im.num_top_modules} top modules, codelength={im.codelength:.4f}")

    # Leaf nodes of the Infomap tree carry the module assignment.
    node2com = {}
    for node in im.tree:
        if node.is_leaf:
            node2com[idx_to_name[node.node_id]] = str(node.module_id)

    # Assign singleton communities to any nodes not in infomap output.
    # default=0 guards the degenerate empty-clustering case, where the
    # original bare max() would raise ValueError.
    max_com = max((int(c) for c in node2com.values()), default=0) + 1
    for idx in range(n):
        name = idx_to_name[idx]
        if name not in node2com:
            node2com[name] = str(max_com)
            max_com += 1

    out_path = os.path.join(RESULTS_DIR, network_name, "infomap", "com.tsv")
    save_communities(node2com, out_path)
    print(f" Saved to {out_path}")
    return node2com
def run_leiden(network_name, method_name, quality, resolution=None):
    """Run Leiden community detection on one network and save the clustering.

    ``quality`` selects the objective: ``"modularity"`` or ``"cpm"`` (the
    latter uses ``resolution``).  Output goes to
    RESULTS_DIR/<network>/<method>/com.tsv; returns the written mapping.
    """
    net = NETWORKS[network_name]
    graph, _, idx_to_name = build_igraph(load_edge_list(net["edge_tsv"]))

    print(f" Graph: {graph.vcount()} nodes, {graph.ecount()} edges")

    if quality == "modularity":
        partition = leidenalg.find_partition(
            graph, leidenalg.ModularityVertexPartition, seed=SEED
        )
    elif quality == "cpm":
        partition = leidenalg.find_partition(
            graph, leidenalg.CPMVertexPartition,
            resolution_parameter=resolution, seed=SEED,
        )
    else:
        raise ValueError(f"Unknown quality function: {quality}")

    print(f" Found {len(partition)} communities, modularity={partition.modularity:.4f}")

    # Map igraph vertex indices back to original node names.
    node2com = {
        idx_to_name[vertex]: str(comm_id)
        for comm_id, members in enumerate(partition)
        for vertex in members
    }

    out_path = os.path.join(RESULTS_DIR, network_name, method_name, "com.tsv")
    save_communities(node2com, out_path)
    print(f" Saved to {out_path}")
    return node2com
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--network", required=True)
    parser.add_argument("--method", required=True,
                        choices=["leiden_mod", "leiden_cpm_01", "leiden_cpm_001"])
    cli = parser.parse_args()

    # Translate the CLI method name into run_leiden keyword arguments.
    configs = {
        "leiden_mod": {"quality": "modularity"},
        "leiden_cpm_01": {"quality": "cpm", "resolution": 0.1},
        "leiden_cpm_001": {"quality": "cpm", "resolution": 0.01},
    }
    print(f"Running {cli.method} on {cli.network}...")
    run_leiden(cli.network, cli.method, **configs[cli.method])
