summaryrefslogtreecommitdiff
path: root/scripts
diff options
context:
space:
mode:
Diffstat (limited to 'scripts')
-rw-r--r--scripts/compute_accuracy.py92
-rw-r--r--scripts/compute_stats.py165
-rw-r--r--scripts/config.py32
-rw-r--r--scripts/generate_plots.py324
-rw-r--r--scripts/load_data.py74
-rw-r--r--scripts/run_all.py67
-rw-r--r--scripts/run_graphtool_sbm.py47
-rw-r--r--scripts/run_infomap.py61
-rw-r--r--scripts/run_leiden.py62
9 files changed, 924 insertions, 0 deletions
diff --git a/scripts/compute_accuracy.py b/scripts/compute_accuracy.py
new file mode 100644
index 0000000..4aeb6a2
--- /dev/null
+++ b/scripts/compute_accuracy.py
@@ -0,0 +1,92 @@
+"""Compute AMI, ARI, NMI for all (network, method) pairs."""
+
+import argparse
+import sys
+import os
+sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
+
+import numpy as np
+import pandas as pd
+from sklearn.metrics import adjusted_mutual_info_score, adjusted_rand_score, normalized_mutual_info_score
+
+from config import NETWORKS, METHODS, RESULTS_DIR
+from load_data import load_edge_list, load_communities
+
+
def align_labels(gt_com, est_com, edge_path):
    """Build parallel ground-truth/estimated label lists over the full node set.

    The node universe is every endpoint in the edge list at *edge_path*.
    A node missing from either clustering receives a fresh, unique singleton
    community label so sklearn's comparison metrics treat it as its own
    cluster.  Returns (gt_labels, est_labels).
    """
    edges = load_edge_list(edge_path)
    # Sorted for a deterministic node ordering across runs.
    universe = sorted(set(
        pd.unique(edges[["src", "tgt"]].values.ravel("K"))
    ))

    def next_free_id(com):
        # Smallest integer strictly above every numeric community ID, so the
        # synthetic singleton labels cannot collide with real numeric ones.
        numeric = (int(v) for v in com.values() if v.lstrip('-').isdigit())
        return max(numeric, default=0) + 1

    gt_next = next_free_id(gt_com)
    est_next = next_free_id(est_com)

    gt_labels, est_labels = [], []
    for node in universe:
        gt_label = gt_com.get(node)
        if gt_label is None:
            gt_label = f"gt_singleton_{gt_next}"
            gt_next += 1
        gt_labels.append(gt_label)

        est_label = est_com.get(node)
        if est_label is None:
            est_label = f"est_singleton_{est_next}"
            est_next += 1
        est_labels.append(est_label)

    return gt_labels, est_labels
+
+
def compute_accuracy(network_name, method_name):
    """Compute AMI, ARI, NMI for one (network, method) pair.

    Returns a dict with keys "ami", "ari", "nmi", or None when the method's
    estimated clustering file does not exist for this network.
    """
    net_cfg = NETWORKS[network_name]
    est_path = os.path.join(RESULTS_DIR, network_name, method_name, "com.tsv")
    if not os.path.exists(est_path):
        print(f" WARNING: {est_path} not found, skipping")
        return None

    truth = load_communities(net_cfg["com_gt_tsv"])
    estimate = load_communities(est_path)
    # Put both clusterings over the same node universe before scoring.
    y_true, y_pred = align_labels(truth, estimate, net_cfg["edge_tsv"])

    return {
        "ami": adjusted_mutual_info_score(y_true, y_pred, average_method="arithmetic"),
        "ari": adjusted_rand_score(y_true, y_pred),
        "nmi": normalized_mutual_info_score(y_true, y_pred, average_method="arithmetic"),
    }
+
+
def compute_all_accuracy():
    """Compute accuracy for every (network, method) pair and save a CSV table.

    Skips pairs whose results are missing; returns the assembled DataFrame.
    """
    rows = []
    for net_name in NETWORKS:
        for method in METHODS:
            method_name = method["name"]
            print(f"Computing accuracy: {net_name} / {method_name}")
            metrics = compute_accuracy(net_name, method_name)
            if metrics is None:
                continue
            rows.append(dict(network=net_name, method=method_name, **metrics))

    df = pd.DataFrame(rows)
    out_dir = os.path.join(RESULTS_DIR, "accuracy")
    os.makedirs(out_dir, exist_ok=True)
    out_path = os.path.join(out_dir, "accuracy_table.csv")
    df.to_csv(out_path, index=False)
    print(f"\nAccuracy table saved to {out_path}")
    print(df.to_string(index=False))
    return df


if __name__ == "__main__":
    compute_all_accuracy()
diff --git a/scripts/compute_stats.py b/scripts/compute_stats.py
new file mode 100644
index 0000000..2e88252
--- /dev/null
+++ b/scripts/compute_stats.py
@@ -0,0 +1,165 @@
+"""Compute cluster statistics for all (network, method) pairs + ground truth."""
+
+import argparse
+import sys
+import os
+import json
+sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
+
+import numpy as np
+import pandas as pd
+
+from config import NETWORKS, METHODS, RESULTS_DIR
+from load_data import load_edge_list, load_communities
+
+
def build_neighbors(edge_df):
    """Build an undirected adjacency dict {node: set(neighbors)} from edges."""
    adjacency = {}
    # Iterate the two columns in lockstep; each edge is recorded in both
    # directions so lookups are symmetric.
    for u, v in zip(edge_df["src"], edge_df["tgt"]):
        adjacency.setdefault(u, set()).add(v)
        adjacency.setdefault(v, set()).add(u)
    return adjacency
+
+
def compute_cluster_stats(network_name, method_name, com_path):
    """Compute statistics for a given community assignment.

    Args:
        network_name: key into NETWORKS (provides the edge list).
        method_name: label recorded in the summary row (e.g. "ground_truth").
        com_path: path to the node->community TSV being evaluated.

    Returns:
        (summary, cluster_details, mixing_params): summary is a flat dict of
        per-clustering aggregates; cluster_details is a list of per-cluster
        dicts (non-singleton clusters only); mixing_params is a per-node list
        of 1 - (intra-community neighbor fraction).
    """
    net = NETWORKS[network_name]
    edge_df = load_edge_list(net["edge_tsv"])
    neighbors = build_neighbors(edge_df)

    # Node universe = every endpoint in the edge list.  Nodes that appear
    # only in the community file (no edges) are NOT counted here.
    all_nodes = set(neighbors.keys())
    node2com = load_communities(com_path)

    # Invert node->community into community->node-set.
    com2nodes = {}
    for node, com in node2com.items():
        com2nodes.setdefault(com, set()).add(node)

    # Separate singletons vs non-singletons: only clusters of size >= 2 enter
    # the per-cluster statistics and node coverage.
    non_singleton_coms = {c: nodes for c, nodes in com2nodes.items() if len(nodes) > 1}
    singleton_coms = {c: nodes for c, nodes in com2nodes.items() if len(nodes) == 1}

    # Nodes in non-singleton clusters
    nodes_in_clusters = set()
    for nodes in non_singleton_coms.values():
        nodes_in_clusters.update(nodes)

    node_coverage = len(nodes_in_clusters) / len(all_nodes) if all_nodes else 0.0

    # Per-cluster stats
    cluster_details = []
    for com_id, nodes in sorted(non_singleton_coms.items()):
        n = len(nodes)
        # Internal edges (each seen from both endpoints, halved below) and
        # boundary edge-endpoints leaving the cluster.
        m_internal = 0
        c_boundary = 0
        for node in nodes:
            for nbr in neighbors.get(node, set()):
                if nbr in nodes:
                    m_internal += 1
                else:
                    c_boundary += 1
        m_internal //= 2  # each edge counted twice
        # NOTE(review): assumes the edge list has no self-loops — a self-loop
        # would be counted once and then halved; confirm upstream cleaning.

        edge_density = 2 * m_internal / (n * (n - 1)) if n > 1 else 0.0
        degree_density = m_internal / n if n > 0 else 0.0
        # Conductance: boundary cut over the cluster's total degree volume.
        conductance = c_boundary / (2 * m_internal + c_boundary) if (2 * m_internal + c_boundary) > 0 else 0.0

        cluster_details.append({
            "com_id": com_id,
            "n": n,
            "m_internal": m_internal,
            "c_boundary": c_boundary,
            "edge_density": edge_density,
            "degree_density": degree_density,
            "conductance": conductance,
        })

    # Per-node mixing parameter: fraction of a node's neighbors that lie
    # outside its own community.  Uncovered nodes count as fully mixed (1.0).
    mixing_params = []
    for node in all_nodes:
        if node not in node2com:
            mixing_params.append(1.0)  # outlier
            continue
        my_com = node2com[node]
        nbrs = neighbors.get(node, set())
        if len(nbrs) == 0:
            # Defensive: all_nodes comes from the adjacency, so degree >= 1
            # here in practice.
            mixing_params.append(0.0)
            continue
        n_in = sum(1 for nbr in nbrs if node2com.get(nbr) == my_com)
        mixing_params.append(1.0 - n_in / len(nbrs))

    summary = {
        "network": network_name,
        "method": method_name,
        "n_nodes": len(all_nodes),
        "n_clusters_total": len(com2nodes),
        "n_clusters_non_singleton": len(non_singleton_coms),
        "n_singleton_clusters": len(singleton_coms),
        "node_coverage": node_coverage,
        "mean_mixing_param": np.mean(mixing_params),
        "median_mixing_param": np.median(mixing_params),
        "mean_cluster_size": np.mean([d["n"] for d in cluster_details]) if cluster_details else 0,
        "median_cluster_size": np.median([d["n"] for d in cluster_details]) if cluster_details else 0,
        "mean_edge_density": np.mean([d["edge_density"] for d in cluster_details]) if cluster_details else 0,
        "median_edge_density": np.median([d["edge_density"] for d in cluster_details]) if cluster_details else 0,
        "mean_conductance": np.mean([d["conductance"] for d in cluster_details]) if cluster_details else 0,
        "mean_degree_density": np.mean([d["degree_density"] for d in cluster_details]) if cluster_details else 0,
    }

    return summary, cluster_details, mixing_params
+
+
def compute_all_stats():
    """Compute stats for all methods on all networks plus ground truth.

    For each network, the ground-truth clustering is evaluated first, then
    each method's output (skipped with a warning when missing).  Per-cluster
    details and mixing parameters are written as JSON via _save_details; the
    per-clustering summaries are saved as one CSV and returned as a DataFrame.
    """
    all_summaries = []
    stats_dir = os.path.join(RESULTS_DIR, "stats")
    os.makedirs(stats_dir, exist_ok=True)

    for net_name in NETWORKS:
        net = NETWORKS[net_name]

        # Ground truth
        print(f"Computing stats: {net_name} / ground_truth")
        summary, details, mixing = compute_cluster_stats(
            net_name, "ground_truth", net["com_gt_tsv"]
        )
        all_summaries.append(summary)
        _save_details(stats_dir, net_name, "ground_truth", details, mixing)

        # Each method
        for method in METHODS:
            m_name = method["name"]
            est_path = os.path.join(RESULTS_DIR, net_name, m_name, "com.tsv")
            if not os.path.exists(est_path):
                print(f" WARNING: {est_path} not found, skipping")
                continue
            print(f"Computing stats: {net_name} / {m_name}")
            summary, details, mixing = compute_cluster_stats(
                net_name, m_name, est_path
            )
            all_summaries.append(summary)
            _save_details(stats_dir, net_name, m_name, details, mixing)

    df = pd.DataFrame(all_summaries)
    out_path = os.path.join(stats_dir, "cluster_stats_summary.csv")
    df.to_csv(out_path, index=False)
    print(f"\nCluster stats saved to {out_path}")
    print(df.to_string(index=False))
    return df
+
+
+def _save_details(stats_dir, net_name, method_name, details, mixing):
+ """Save per-cluster details and mixing params as JSON."""
+ out_dir = os.path.join(stats_dir, net_name, method_name)
+ os.makedirs(out_dir, exist_ok=True)
+ with open(os.path.join(out_dir, "cluster_details.json"), "w") as f:
+ json.dump(details, f, indent=2)
+ with open(os.path.join(out_dir, "mixing_params.json"), "w") as f:
+ json.dump(mixing, f)
+
+
+if __name__ == "__main__":
+ compute_all_stats()
diff --git a/scripts/config.py b/scripts/config.py
new file mode 100644
index 0000000..b23b090
--- /dev/null
+++ b/scripts/config.py
@@ -0,0 +1,32 @@
"""Central configuration for EC-SBM community detection analysis."""

import os

# Repository layout: this file lives in <repo>/scripts/, so two dirname()
# hops up from here give the repository root.
BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
DATA_DIR = os.path.join(BASE_DIR, "data")
RESULTS_DIR = os.path.join(BASE_DIR, "results")

# Input networks: each entry points at a tab-separated edge list and a
# ground-truth community file (formats documented in load_data.py).
NETWORKS = {
    "polblogs": {
        "edge_tsv": os.path.join(DATA_DIR, "polblogs", "edge.tsv"),
        "com_gt_tsv": os.path.join(DATA_DIR, "polblogs", "com_gt.tsv"),
    },
    "topology": {
        "edge_tsv": os.path.join(DATA_DIR, "topology", "edge.tsv"),
        "com_gt_tsv": os.path.join(DATA_DIR, "topology", "com_gt.tsv"),
    },
    "internet_as": {
        "edge_tsv": os.path.join(DATA_DIR, "internet_as", "edge.tsv"),
        "com_gt_tsv": os.path.join(DATA_DIR, "internet_as", "com_gt.tsv"),
    },
}

# Community detection methods to run.  "name" doubles as the results
# subdirectory; "type" selects the runner script; the remaining keys are
# method-specific parameters (e.g. CPM resolution) consumed by run_all.py.
METHODS = [
    {"name": "leiden_mod", "type": "leiden", "quality": "modularity"},
    {"name": "leiden_cpm_01", "type": "leiden", "quality": "cpm", "resolution": 0.1},
    {"name": "leiden_cpm_001", "type": "leiden", "quality": "cpm", "resolution": 0.01},
    {"name": "infomap", "type": "infomap"},
    {"name": "graphtool_sbm", "type": "graphtool_sbm"},
]

# Random seed shared by the stochastic methods for reproducibility.
SEED = 42
diff --git a/scripts/generate_plots.py b/scripts/generate_plots.py
new file mode 100644
index 0000000..be5db9f
--- /dev/null
+++ b/scripts/generate_plots.py
@@ -0,0 +1,324 @@
+"""Generate all figures and LaTeX tables for the EC-SBM analysis."""
+
+import sys
+import os
+import json
+sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
+
+import numpy as np
+import pandas as pd
+import matplotlib
+matplotlib.use("Agg")
+import matplotlib.pyplot as plt
+from matplotlib.colors import Normalize
+from matplotlib.cm import ScalarMappable
+
+from config import NETWORKS, METHODS, RESULTS_DIR
+
# All figures and LaTeX tables are written under results/figures/.
FIGURES_DIR = os.path.join(RESULTS_DIR, "figures")
os.makedirs(FIGURES_DIR, exist_ok=True)

# Canonical method ordering (from config) plus display labels for plots
# and tables.
METHOD_NAMES = [m["name"] for m in METHODS]
METHOD_LABELS = {
    "leiden_mod": "Leiden-Mod",
    "leiden_cpm_01": "Leiden-CPM(0.1)",
    "leiden_cpm_001": "Leiden-CPM(0.01)",
    "infomap": "Infomap",
    "graphtool_sbm": "graph-tool SBM",
}
# Network display labels (underscore escaped for LaTeX output).
NET_LABELS = {
    "polblogs": "polblogs",
    "topology": "topology",
    "internet_as": "internet\\_as",
}
+
+
def plot_accuracy_heatmap():
    """Create a heatmap of accuracy metrics (network x method).

    Reads results/accuracy/accuracy_table.csv and writes one PDF per metric
    (heatmap_ami.pdf, heatmap_ari.pdf, heatmap_nmi.pdf) to FIGURES_DIR.
    """
    acc_path = os.path.join(RESULTS_DIR, "accuracy", "accuracy_table.csv")
    if not os.path.exists(acc_path):
        print("No accuracy table found, skipping heatmap")
        return
    df = pd.read_csv(acc_path)

    for metric in ["ami", "ari", "nmi"]:
        fig, ax = plt.subplots(figsize=(8, 3.5))
        # Rows = networks, columns = methods, reindexed into canonical order;
        # missing (network, method) cells become NaN.
        pivot = df.pivot(index="network", columns="method", values=metric)
        pivot = pivot.reindex(index=list(NETWORKS.keys()), columns=METHOD_NAMES)

        # Fixed [0, 1] color scale keeps the three metric heatmaps comparable.
        im = ax.imshow(pivot.values, cmap="YlOrRd", aspect="auto",
                       vmin=0, vmax=1)
        ax.set_xticks(range(len(METHOD_NAMES)))
        ax.set_xticklabels([METHOD_LABELS.get(m, m) for m in METHOD_NAMES],
                           rotation=30, ha="right", fontsize=9)
        ax.set_yticks(range(len(NETWORKS)))
        ax.set_yticklabels(list(NETWORKS.keys()), fontsize=10)

        # Annotate each cell; white text on darker (higher-value) cells.
        for i in range(pivot.shape[0]):
            for j in range(pivot.shape[1]):
                val = pivot.values[i, j]
                if not np.isnan(val):
                    ax.text(j, i, f"{val:.3f}", ha="center", va="center",
                            fontsize=9, color="black" if val < 0.6 else "white")

        plt.colorbar(im, ax=ax, fraction=0.046, pad=0.04)
        ax.set_title(f"{metric.upper()} Accuracy", fontsize=12)
        plt.tight_layout()
        plt.savefig(os.path.join(FIGURES_DIR, f"heatmap_{metric}.pdf"),
                    bbox_inches="tight")
        plt.close()
        print(f" Saved heatmap_{metric}.pdf")
+
+
def plot_cluster_size_distributions():
    """Histogram of cluster sizes per network/method.

    One PDF per network, with one subplot per clustering (ground truth plus
    each method) on a 2x3 grid.  Missing results get an empty "No data" panel
    so the grid layout stays aligned.
    """
    stats_dir = os.path.join(RESULTS_DIR, "stats")

    for net_name in NETWORKS:
        all_labels = ["ground_truth"] + METHOD_NAMES
        fig, axes = plt.subplots(2, 3, figsize=(14, 8))
        axes = axes.flatten()

        for idx, label in enumerate(all_labels):
            ax = axes[idx]
            det_path = os.path.join(stats_dir, net_name, label, "cluster_details.json")
            if not os.path.exists(det_path):
                ax.set_title(METHOD_LABELS.get(label, label))
                ax.text(0.5, 0.5, "No data", ha="center", va="center",
                        transform=ax.transAxes)
                continue

            with open(det_path) as f:
                details = json.load(f)
            sizes = [d["n"] for d in details]

            if sizes:
                # Bin count adapts to the number of distinct sizes (10..50).
                ax.hist(sizes, bins=min(50, max(10, len(set(sizes)))),
                        edgecolor="black", alpha=0.7, color="steelblue")
            ax.set_title(METHOD_LABELS.get(label, label), fontsize=10)
            ax.set_xlabel("Cluster size")
            ax.set_ylabel("Count")
            if sizes and max(sizes) > 100:
                # Heavy-tailed size distributions read better on a log axis.
                ax.set_xscale("log")

        # Remove extra subplot if any
        for idx in range(len(all_labels), len(axes)):
            fig.delaxes(axes[idx])

        fig.suptitle(f"Cluster Size Distribution — {net_name}", fontsize=13)
        plt.tight_layout()
        plt.savefig(os.path.join(FIGURES_DIR, f"cluster_sizes_{net_name}.pdf"),
                    bbox_inches="tight")
        plt.close()
        print(f" Saved cluster_sizes_{net_name}.pdf")
+
+
def plot_edge_density_boxplots():
    """Boxplots of edge density across methods for each network.

    One PDF per network; clusterings with missing or empty details are simply
    omitted from that network's plot.  Outliers are hidden (showfliers=False).
    """
    stats_dir = os.path.join(RESULTS_DIR, "stats")

    for net_name in NETWORKS:
        all_labels = ["ground_truth"] + METHOD_NAMES
        data = []
        labels = []

        # Collect one density series per clustering that has saved details.
        for label in all_labels:
            det_path = os.path.join(stats_dir, net_name, label, "cluster_details.json")
            if not os.path.exists(det_path):
                continue
            with open(det_path) as f:
                details = json.load(f)
            densities = [d["edge_density"] for d in details]
            if densities:
                data.append(densities)
                labels.append(METHOD_LABELS.get(label, label))

        if not data:
            continue

        fig, ax = plt.subplots(figsize=(9, 4))
        # NOTE(review): tick_labels= requires matplotlib >= 3.9 (older
        # releases use labels=) — confirm the pinned matplotlib version.
        bp = ax.boxplot(data, tick_labels=labels, patch_artist=True, showfliers=False)
        for patch in bp["boxes"]:
            patch.set_facecolor("lightblue")
        ax.set_ylabel("Edge Density")
        ax.set_title(f"Edge Density Distribution — {net_name}")
        plt.xticks(rotation=20, ha="right")
        plt.tight_layout()
        plt.savefig(os.path.join(FIGURES_DIR, f"edge_density_{net_name}.pdf"),
                    bbox_inches="tight")
        plt.close()
        print(f" Saved edge_density_{net_name}.pdf")
+
+
def plot_mixing_parameter_comparison():
    """Grouped bar chart of mean mixing parameter per method/network.

    Reads the cluster-stats summary CSV; a missing (network, method) row is
    drawn as a zero-height bar.
    """
    stats_path = os.path.join(RESULTS_DIR, "stats", "cluster_stats_summary.csv")
    if not os.path.exists(stats_path):
        print("No stats summary found, skipping mixing param plot")
        return

    df = pd.read_csv(stats_path)

    fig, ax = plt.subplots(figsize=(10, 4.5))
    net_names = list(NETWORKS.keys())
    all_methods = ["ground_truth"] + METHOD_NAMES
    # One bar group per network; offsets center the group's bars around the
    # network's x position.
    x = np.arange(len(net_names))
    width = 0.13
    offsets = np.arange(len(all_methods)) - len(all_methods) / 2 + 0.5

    colors = plt.cm.Set2(np.linspace(0, 1, len(all_methods)))

    for i, method in enumerate(all_methods):
        vals = []
        for net in net_names:
            row = df[(df["network"] == net) & (df["method"] == method)]
            vals.append(row["mean_mixing_param"].values[0] if len(row) > 0 else 0)
        ax.bar(x + offsets[i] * width, vals, width, label=METHOD_LABELS.get(method, method),
               color=colors[i])

    ax.set_xticks(x)
    ax.set_xticklabels(net_names)
    ax.set_ylabel("Mean Mixing Parameter")
    ax.set_title("Mean Mixing Parameter by Network and Method")
    ax.legend(fontsize=7, ncol=2)
    plt.tight_layout()
    plt.savefig(os.path.join(FIGURES_DIR, "mixing_parameter.pdf"), bbox_inches="tight")
    plt.close()
    print(" Saved mixing_parameter.pdf")
+
+
def plot_node_coverage_comparison():
    """Grouped bar chart of node coverage per method/network.

    Same layout as plot_mixing_parameter_comparison; coverage is a fraction,
    so the y axis is clamped to [0, 1.05].  Silently returns when the stats
    summary CSV is missing.
    """
    stats_path = os.path.join(RESULTS_DIR, "stats", "cluster_stats_summary.csv")
    if not os.path.exists(stats_path):
        return

    df = pd.read_csv(stats_path)

    fig, ax = plt.subplots(figsize=(10, 4.5))
    net_names = list(NETWORKS.keys())
    all_methods = ["ground_truth"] + METHOD_NAMES
    # One bar group per network, bars centered around each x position.
    x = np.arange(len(net_names))
    width = 0.13
    offsets = np.arange(len(all_methods)) - len(all_methods) / 2 + 0.5
    colors = plt.cm.Set2(np.linspace(0, 1, len(all_methods)))

    for i, method in enumerate(all_methods):
        vals = []
        for net in net_names:
            row = df[(df["network"] == net) & (df["method"] == method)]
            vals.append(row["node_coverage"].values[0] if len(row) > 0 else 0)
        ax.bar(x + offsets[i] * width, vals, width, label=METHOD_LABELS.get(method, method),
               color=colors[i])

    ax.set_xticks(x)
    ax.set_xticklabels(net_names)
    ax.set_ylabel("Node Coverage")
    ax.set_title("Node Coverage by Network and Method")
    ax.legend(fontsize=7, ncol=2)
    ax.set_ylim(0, 1.05)
    plt.tight_layout()
    plt.savefig(os.path.join(FIGURES_DIR, "node_coverage.pdf"), bbox_inches="tight")
    plt.close()
    print(" Saved node_coverage.pdf")
+
+
def generate_latex_accuracy_table():
    """Write a booktabs-style LaTeX accuracy table to FIGURES_DIR.

    Reads results/accuracy/accuracy_table.csv; silently returns when missing.
    """
    acc_path = os.path.join(RESULTS_DIR, "accuracy", "accuracy_table.csv")
    if not os.path.exists(acc_path):
        return

    df = pd.read_csv(acc_path)
    lines = [
        r"\begin{table}[htbp]",
        r"\centering",
        r"\caption{Community detection accuracy (AMI, ARI, NMI) on EC-SBM networks.}",
        r"\label{tab:accuracy}",
        r"\begin{tabular}{llrrr}",
        r"\toprule",
        r"Network & Method & AMI & ARI & NMI \\",
        r"\midrule",
    ]

    for net_name in NETWORKS:
        net_cell = net_name  # printed only on the group's first row
        for _, row in df[df["network"] == net_name].iterrows():
            m_label = METHOD_LABELS.get(row["method"], row["method"])
            lines.append(
                f"{net_cell} & {m_label} & {row['ami']:.4f} & {row['ari']:.4f} & {row['nmi']:.4f} \\\\"
            )
            net_cell = ""
        lines.append(r"\midrule")

    # The rule after the final group becomes the bottom rule.
    lines[-1] = r"\bottomrule"
    lines.append(r"\end{tabular}")
    lines.append(r"\end{table}")

    out_path = os.path.join(FIGURES_DIR, "accuracy_table.tex")
    with open(out_path, "w") as f:
        f.write("\n".join(lines))
    print(" Saved accuracy_table.tex")
+
+
def generate_latex_stats_table():
    """Generate a LaTeX cluster stats table.

    Reads results/stats/cluster_stats_summary.csv and writes
    cluster_stats_table.tex to FIGURES_DIR; silently returns when the CSV is
    missing.  The network name is printed only on the first row of each group.
    """
    stats_path = os.path.join(RESULTS_DIR, "stats", "cluster_stats_summary.csv")
    if not os.path.exists(stats_path):
        return

    df = pd.read_csv(stats_path)
    lines = []
    lines.append(r"\begin{table}[htbp]")
    lines.append(r"\centering")
    lines.append(r"\caption{Cluster statistics summary for each network and method.}")
    lines.append(r"\label{tab:cluster_stats}")
    lines.append(r"\footnotesize")
    lines.append(r"\begin{tabular}{llrrrrrr}")
    lines.append(r"\toprule")
    lines.append(r"Network & Method & \#Clusters & Node Cov. & Mean Size & Mean Density & Mean Cond. & Mean Mix. \\")
    lines.append(r"\midrule")

    for net_name in NETWORKS:
        first = True
        for _, row in df[df["network"] == net_name].iterrows():
            net_disp = net_name if first else ""
            m_label = METHOD_LABELS.get(row["method"], row["method"])
            lines.append(
                f"{net_disp} & {m_label} & {int(row['n_clusters_non_singleton'])} & "
                f"{row['node_coverage']:.3f} & {row['mean_cluster_size']:.1f} & "
                f"{row['mean_edge_density']:.3f} & {row['mean_conductance']:.3f} & "
                f"{row['mean_mixing_param']:.3f} \\\\"
            )
            first = False
        lines.append(r"\midrule")

    # The rule after the final group becomes the bottom rule.
    lines[-1] = r"\bottomrule"
    lines.append(r"\end{tabular}")
    lines.append(r"\end{table}")

    out_path = os.path.join(FIGURES_DIR, "cluster_stats_table.tex")
    with open(out_path, "w") as f:
        f.write("\n".join(lines))
    print(f" Saved cluster_stats_table.tex")
+
+
def generate_all():
    """Run every figure-generation step, then the LaTeX table writers."""
    plot_steps = [
        ("Generating accuracy heatmaps...", plot_accuracy_heatmap),
        ("Generating cluster size distributions...", plot_cluster_size_distributions),
        ("Generating edge density boxplots...", plot_edge_density_boxplots),
        ("Generating mixing parameter comparison...", plot_mixing_parameter_comparison),
        ("Generating node coverage comparison...", plot_node_coverage_comparison),
    ]
    for message, step in plot_steps:
        print(message)
        step()
    print("Generating LaTeX tables...")
    generate_latex_accuracy_table()
    generate_latex_stats_table()
    print("All plots and tables generated.")


if __name__ == "__main__":
    generate_all()
diff --git a/scripts/load_data.py b/scripts/load_data.py
new file mode 100644
index 0000000..5a0362b
--- /dev/null
+++ b/scripts/load_data.py
@@ -0,0 +1,74 @@
+"""Shared data loading utilities for EC-SBM analysis."""
+
+import pandas as pd
+import numpy as np
+
+
def load_edge_list(path):
    """Read a two-column (src, tgt) tab-separated edge list with no header.

    Node IDs are kept as strings; '#' starts a comment.
    """
    return pd.read_csv(
        path,
        sep="\t",
        header=None,
        names=["src", "tgt"],
        dtype=str,
        comment="#",
    )
+
+
def load_communities(path):
    """Read a node->community TSV (no header) into a {node: community} dict.

    Blank lines and lines starting with '#' are skipped; columns beyond the
    second are ignored; both keys and values stay strings.
    """
    node2com = {}
    with open(path, "r") as fh:
        for raw in fh:
            record = raw.strip()
            if not record or record.startswith("#"):
                continue
            fields = record.split("\t")
            if len(fields) < 2:
                continue  # malformed line: no community column
            node2com[fields[0]] = fields[1]
    return node2com
+
+
def build_igraph(edge_df):
    """Build an undirected igraph Graph from an edge DataFrame.

    Returns (graph, name_to_idx, idx_to_name).  igraph vertices are integer
    indices, so both direction mappings between the original string node
    names and vertex IDs are returned alongside the graph.
    """
    import igraph as ig

    # Collect node names; pd.unique keeps first-appearance order.
    all_nodes = pd.unique(edge_df[["src", "tgt"]].values.ravel("K"))
    name_to_idx = {name: i for i, name in enumerate(all_nodes)}
    idx_to_name = {i: name for name, i in name_to_idx.items()}

    # Translate both endpoint columns to integer vertex IDs.
    src_ids = edge_df["src"].map(name_to_idx).values
    tgt_ids = edge_df["tgt"].map(name_to_idx).values

    n = len(all_nodes)
    g = ig.Graph(n=n, edges=list(zip(src_ids, tgt_ids)), directed=False)
    g.simplify()  # remove multi-edges and self-loops
    return g, name_to_idx, idx_to_name
+
+
def build_graphtool_graph(edge_df):
    """Build a graph-tool Graph from an edge DataFrame.

    Returns (graph, name_to_idx, idx_to_name), mirroring build_igraph: the
    graph is undirected, simplified (no parallel edges or self-loops), and
    indexed by integer vertex IDs mapped to/from the string node names.
    """
    import graph_tool.all as gt

    all_nodes = pd.unique(edge_df[["src", "tgt"]].values.ravel("K"))
    name_to_idx = {name: i for i, name in enumerate(all_nodes)}
    idx_to_name = {i: name for name, i in name_to_idx.items()}

    # graph-tool's add_edge_list wants integer arrays.
    src_ids = edge_df["src"].map(name_to_idx).values.astype(np.int64)
    tgt_ids = edge_df["tgt"].map(name_to_idx).values.astype(np.int64)

    n = len(all_nodes)
    g = gt.Graph(directed=False)
    g.add_vertex(n)
    g.add_edge_list(np.column_stack([src_ids, tgt_ids]))
    gt.remove_parallel_edges(g)
    gt.remove_self_loops(g)
    return g, name_to_idx, idx_to_name
+
+
def save_communities(node2com, path):
    """Write {node: community} as TSV lines "node<TAB>community", sorted by node.

    Parent directories of *path* are created as needed.
    """
    import os
    os.makedirs(os.path.dirname(path), exist_ok=True)
    with open(path, "w") as fh:
        fh.writelines(f"{node}\t{node2com[node]}\n" for node in sorted(node2com))
diff --git a/scripts/run_all.py b/scripts/run_all.py
new file mode 100644
index 0000000..c98b9a2
--- /dev/null
+++ b/scripts/run_all.py
@@ -0,0 +1,67 @@
+"""Master orchestration script: run all methods, compute accuracy and stats, generate plots."""
+
+import sys
+import os
+import time
+sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
+
+from config import NETWORKS, METHODS
+
+
def main():
    """Run the whole pipeline: detection methods, accuracy, stats, plots.

    The run_* / compute_* / plotting modules are imported lazily inside each
    step so that heavy optional dependencies (leidenalg, infomap, graph_tool,
    matplotlib) are only loaded when that step actually runs.
    """
    start = time.time()

    print("=" * 60)
    print("EC-SBM Community Detection Analysis Pipeline")
    print("=" * 60)

    # Step 1: Run community detection methods on every network.
    for net_name in NETWORKS:
        for method in METHODS:
            m_name = method["name"]
            m_type = method["type"]

            print(f"\n{'='*60}")
            print(f"Running {m_name} on {net_name}")
            print(f"{'='*60}")

            # Dispatch on the method type declared in config.METHODS.
            if m_type == "leiden":
                from run_leiden import run_leiden
                run_leiden(net_name, m_name, method["quality"],
                           method.get("resolution"))
            elif m_type == "infomap":
                from run_infomap import run_infomap
                run_infomap(net_name)
            elif m_type == "graphtool_sbm":
                from run_graphtool_sbm import run_graphtool_sbm
                run_graphtool_sbm(net_name)

    # Step 2: Compute accuracy
    print(f"\n{'='*60}")
    print("Computing accuracy metrics")
    print(f"{'='*60}")
    from compute_accuracy import compute_all_accuracy
    compute_all_accuracy()

    # Step 3: Compute cluster stats
    print(f"\n{'='*60}")
    print("Computing cluster statistics")
    print(f"{'='*60}")
    from compute_stats import compute_all_stats
    compute_all_stats()

    # Step 4: Generate plots and tables
    print(f"\n{'='*60}")
    print("Generating plots and LaTeX tables")
    print(f"{'='*60}")
    from generate_plots import generate_all
    generate_all()

    elapsed = time.time() - start
    print(f"\n{'='*60}")
    print(f"Pipeline complete in {elapsed:.1f}s")
    print(f"{'='*60}")


if __name__ == "__main__":
    main()
diff --git a/scripts/run_graphtool_sbm.py b/scripts/run_graphtool_sbm.py
new file mode 100644
index 0000000..f860e89
--- /dev/null
+++ b/scripts/run_graphtool_sbm.py
@@ -0,0 +1,47 @@
+"""Run graph-tool SBM inference for community detection."""
+
+import argparse
+import sys
+import os
+sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
+
+import graph_tool.all as gt
+import numpy as np
+
+from config import NETWORKS, RESULTS_DIR
+from load_data import load_edge_list, build_graphtool_graph, save_communities
+
+
def run_graphtool_sbm(network_name):
    """Fit a flat SBM with graph-tool and save the block partition.

    Writes RESULTS_DIR/<network>/graphtool_sbm/com.tsv and returns the
    {node: block_id} dict (block IDs as strings).
    """
    net = NETWORKS[network_name]
    edge_df = load_edge_list(net["edge_tsv"])
    g, name_to_idx, idx_to_name = build_graphtool_graph(edge_df)

    print(f" Graph: {g.num_vertices()} nodes, {g.num_edges()} edges")

    # Use minimize_blockmodel_dl for flat (non-nested) SBM.
    # Seed both RNGs for reproducible inference.
    # NOTE(review): seeds are hard-coded to 42 rather than importing
    # config.SEED like the other runners — confirm they should match.
    np.random.seed(42)
    gt.seed_rng(42)
    state = gt.minimize_blockmodel_dl(g)

    # Extract block assignments (one integer block label per vertex).
    blocks = state.get_blocks()
    n_blocks = len(set(blocks.a))
    print(f" Found {n_blocks} blocks")

    node2com = {}
    for v in g.vertices():
        node2com[idx_to_name[int(v)]] = str(blocks[v])

    out_path = os.path.join(RESULTS_DIR, network_name, "graphtool_sbm", "com.tsv")
    save_communities(node2com, out_path)
    print(f" Saved to {out_path}")
    return node2com


if __name__ == "__main__":
    # CLI entry point: python run_graphtool_sbm.py --network <name>
    parser = argparse.ArgumentParser()
    parser.add_argument("--network", required=True)
    args = parser.parse_args()
    print(f"Running graph-tool SBM on {args.network}...")
    run_graphtool_sbm(args.network)
diff --git a/scripts/run_infomap.py b/scripts/run_infomap.py
new file mode 100644
index 0000000..e53c77d
--- /dev/null
+++ b/scripts/run_infomap.py
@@ -0,0 +1,61 @@
+"""Run Infomap community detection."""
+
+import argparse
+import sys
+import os
+sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
+
+import infomap
+
+from config import NETWORKS, RESULTS_DIR, SEED
+from load_data import load_edge_list, save_communities
+
+
def run_infomap(network_name):
    """Run two-level Infomap on one network and save node->module assignments.

    Writes RESULTS_DIR/<network>/infomap/com.tsv and returns the
    {node: community_id} dict (IDs as strings).  Nodes the Infomap output
    does not cover receive fresh singleton community IDs.
    """
    net = NETWORKS[network_name]
    edge_df = load_edge_list(net["edge_tsv"])

    # Infomap wants integer node IDs; build a bidirectional mapping.
    import pandas as pd
    all_nodes = pd.unique(edge_df[["src", "tgt"]].values.ravel("K"))
    name_to_idx = {name: i for i, name in enumerate(all_nodes)}
    idx_to_name = {i: name for name, i in name_to_idx.items()}

    n = len(all_nodes)
    print(f" Network: {n} nodes, {len(edge_df)} edges")

    # --two-level: flat partition (no sub-modules); seeded for reproducibility.
    im = infomap.Infomap("--two-level --flow-model undirected --seed {}".format(SEED))

    for _, row in edge_df.iterrows():
        im.add_link(name_to_idx[row["src"]], name_to_idx[row["tgt"]])

    im.run()

    print(f" Found {im.num_top_modules} top modules, codelength={im.codelength:.4f}")

    # Extract communities: leaf nodes of the two-level tree carry the
    # module assignment.
    node2com = {}
    for node_id in im.tree:
        if node_id.is_leaf:
            node2com[idx_to_name[node_id.node_id]] = str(node_id.module_id)

    # Assign singleton communities to any nodes not in infomap output.
    # default=0 avoids a ValueError from max() on an empty result (e.g. an
    # empty/fully-commented edge file), matching the default=0 convention
    # used in compute_accuracy.align_labels.
    max_com = max((int(c) for c in node2com.values()), default=0) + 1
    for idx in range(n):
        name = idx_to_name[idx]
        if name not in node2com:
            node2com[name] = str(max_com)
            max_com += 1

    out_path = os.path.join(RESULTS_DIR, network_name, "infomap", "com.tsv")
    save_communities(node2com, out_path)
    print(f" Saved to {out_path}")
    return node2com
+
+
if __name__ == "__main__":
    # CLI entry point: python run_infomap.py --network <name>
    parser = argparse.ArgumentParser()
    parser.add_argument("--network", required=True)
    args = parser.parse_args()
    print(f"Running Infomap on {args.network}...")
    run_infomap(args.network)
diff --git a/scripts/run_leiden.py b/scripts/run_leiden.py
new file mode 100644
index 0000000..375b6bb
--- /dev/null
+++ b/scripts/run_leiden.py
@@ -0,0 +1,62 @@
+"""Run Leiden community detection (modularity or CPM)."""
+
+import argparse
+import sys
+import os
+sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
+
+import igraph as ig
+import leidenalg
+
+from config import NETWORKS, RESULTS_DIR, SEED
+from load_data import load_edge_list, build_igraph, save_communities
+
+
def run_leiden(network_name, method_name, quality, resolution=None):
    """Run Leiden community detection (modularity or CPM) on one network.

    Args:
        network_name: key into NETWORKS.
        method_name: results subdirectory name (e.g. "leiden_cpm_01").
        quality: "modularity" or "cpm".
        resolution: CPM resolution parameter; required when quality == "cpm".

    Returns the {node: community_id} dict (IDs as strings); writes it to
    RESULTS_DIR/<network>/<method>/com.tsv.

    Raises:
        ValueError: for an unknown quality function.
    """
    net = NETWORKS[network_name]
    edge_df = load_edge_list(net["edge_tsv"])
    g, name_to_idx, idx_to_name = build_igraph(edge_df)

    print(f" Graph: {g.vcount()} nodes, {g.ecount()} edges")

    if quality == "modularity":
        partition = leidenalg.find_partition(
            g, leidenalg.ModularityVertexPartition, seed=SEED
        )
    elif quality == "cpm":
        partition = leidenalg.find_partition(
            g, leidenalg.CPMVertexPartition,
            resolution_parameter=resolution, seed=SEED
        )
    else:
        raise ValueError(f"Unknown quality function: {quality}")

    # Modularity is printed for both quality functions even though CPM does
    # not optimize it — informational only.
    print(f" Found {len(partition)} communities, modularity={partition.modularity:.4f}")

    # Convert partition (a sequence of vertex-index lists) to node2com dict.
    node2com = {}
    for comm_id, members in enumerate(partition):
        for idx in members:
            node2com[idx_to_name[idx]] = str(comm_id)

    out_path = os.path.join(RESULTS_DIR, network_name, method_name, "com.tsv")
    save_communities(node2com, out_path)
    print(f" Saved to {out_path}")
    return node2com


if __name__ == "__main__":
    # CLI entry point: the method name selects its quality/resolution config.
    parser = argparse.ArgumentParser()
    parser.add_argument("--network", required=True)
    parser.add_argument("--method", required=True,
                        choices=["leiden_mod", "leiden_cpm_01", "leiden_cpm_001"])
    args = parser.parse_args()

    method_configs = {
        "leiden_mod": {"quality": "modularity"},
        "leiden_cpm_01": {"quality": "cpm", "resolution": 0.1},
        "leiden_cpm_001": {"quality": "cpm", "resolution": 0.01},
    }
    cfg = method_configs[args.method]
    print(f"Running {args.method} on {args.network}...")
    run_leiden(args.network, args.method, **cfg)