diff options
| author | YurenHao0426 <blackhao0426@gmail.com> | 2026-02-24 08:40:49 +0000 |
|---|---|---|
| committer | YurenHao0426 <blackhao0426@gmail.com> | 2026-02-24 08:40:49 +0000 |
| commit | 8f63cf9f41bbdb8d55cd4679872d2b4ae2129324 (patch) | |
| tree | ab5c95888849e854f2346db856c7edece7c8b8a7 /scripts | |
EC-SBM community detection analysis: full pipeline and writeup
Implement community detection on 3 EC-SBM networks (polblogs, topology,
internet_as) using 5 methods (Leiden-Mod, Leiden-CPM at 0.1 and 0.01,
Infomap, graph-tool SBM). Compute AMI/ARI/NMI accuracy, cluster statistics,
and generate figures and LaTeX report.
Diffstat (limited to 'scripts')
| -rw-r--r-- | scripts/compute_accuracy.py | 92 | ||||
| -rw-r--r-- | scripts/compute_stats.py | 165 | ||||
| -rw-r--r-- | scripts/config.py | 32 | ||||
| -rw-r--r-- | scripts/generate_plots.py | 324 | ||||
| -rw-r--r-- | scripts/load_data.py | 74 | ||||
| -rw-r--r-- | scripts/run_all.py | 67 | ||||
| -rw-r--r-- | scripts/run_graphtool_sbm.py | 47 | ||||
| -rw-r--r-- | scripts/run_infomap.py | 61 | ||||
| -rw-r--r-- | scripts/run_leiden.py | 62 |
9 files changed, 924 insertions, 0 deletions
def align_labels(gt_com, est_com, edge_path):
    """Align ground truth and estimated labels over the full node set from edges.
    Nodes missing from a clustering get unique singleton community IDs."""
    edge_df = load_edge_list(edge_path)
    universe = sorted(set(
        pd.unique(edge_df[["src", "tgt"]].values.ravel("K"))
    ))

    def labels_for(assignment, prefix):
        # Start singleton IDs above the largest numeric community ID already
        # present, then hand one fresh ID to every uncovered node.
        next_id = max(
            (int(v) for v in assignment.values() if v.lstrip('-').isdigit()),
            default=0,
        ) + 1
        labels = []
        for node in universe:
            if node in assignment:
                labels.append(assignment[node])
            else:
                labels.append(f"{prefix}_singleton_{next_id}")
                next_id += 1
        return labels

    return labels_for(gt_com, "gt"), labels_for(est_com, "est")
def build_neighbors(edge_df):
    """Build an undirected adjacency dict ``{node: set(neighbors)}`` from an
    edge DataFrame with ``src``/``tgt`` columns.

    Both endpoints of every edge get an entry, so every endpoint appears as a
    key; a self-loop puts the node into its own neighbor set.
    """
    neighbors = {}
    # Iterate the raw columns instead of DataFrame.iterrows(): iterrows builds
    # a Series object per row and is orders of magnitude slower on large edge
    # lists, while producing the same (s, t) pairs.
    for s, t in zip(edge_df["src"], edge_df["tgt"]):
        neighbors.setdefault(s, set()).add(t)
        neighbors.setdefault(t, set()).add(s)
    return neighbors
def compute_cluster_stats(network_name, method_name, com_path):
    """Compute cluster-level and node-level statistics for one clustering.

    Parameters
    ----------
    network_name : key into NETWORKS (used to load the edge list).
    method_name : label recorded in the summary row.
    com_path : path to the node<TAB>community TSV to evaluate.

    Returns ``(summary, cluster_details, mixing_params)``: a flat dict of
    aggregate statistics, one dict per non-singleton cluster, and one mixing
    parameter per graph node (in sorted node order).
    """
    net = NETWORKS[network_name]
    edge_df = load_edge_list(net["edge_tsv"])
    neighbors = build_neighbors(edge_df)

    all_nodes = set(neighbors.keys())
    node2com = load_communities(com_path)

    # Invert node -> community into community -> member set.
    com2nodes = {}
    for node, com in node2com.items():
        com2nodes.setdefault(com, set()).add(node)

    non_singleton_coms = {c: m for c, m in com2nodes.items() if len(m) > 1}
    singleton_coms = {c: m for c, m in com2nodes.items() if len(m) == 1}

    # Coverage: fraction of graph nodes placed in a non-singleton cluster.
    nodes_in_clusters = set()
    for members in non_singleton_coms.values():
        nodes_in_clusters.update(members)
    node_coverage = len(nodes_in_clusters) / len(all_nodes) if all_nodes else 0.0

    cluster_details = []
    for com_id, members in sorted(non_singleton_coms.items()):
        n = len(members)
        m_internal = 0  # internal edge endpoints (halved below)
        c_boundary = 0  # edges with exactly one endpoint inside the cluster
        for node in members:
            for nbr in neighbors.get(node, set()):
                if nbr in members:
                    m_internal += 1
                else:
                    c_boundary += 1
        m_internal //= 2  # each internal edge was counted from both endpoints

        volume = 2 * m_internal + c_boundary
        cluster_details.append({
            "com_id": com_id,
            "n": n,
            "m_internal": m_internal,
            "c_boundary": c_boundary,
            "edge_density": 2 * m_internal / (n * (n - 1)) if n > 1 else 0.0,
            "degree_density": m_internal / n if n > 0 else 0.0,
            "conductance": c_boundary / volume if volume > 0 else 0.0,
        })

    # Per-node mixing parameter: fraction of a node's edges leaving its own
    # community; nodes absent from the clustering count as outliers (1.0).
    # Iterate in sorted order so the JSON written by callers is deterministic
    # across runs (plain set order varies with string-hash randomization).
    mixing_params = []
    for node in sorted(all_nodes):
        if node not in node2com:
            mixing_params.append(1.0)
            continue
        my_com = node2com[node]
        nbrs = neighbors.get(node, set())
        if not nbrs:
            mixing_params.append(0.0)
            continue
        n_in = sum(1 for nbr in nbrs if node2com.get(nbr) == my_com)
        mixing_params.append(1.0 - n_in / len(nbrs))

    sizes = [d["n"] for d in cluster_details]
    densities = [d["edge_density"] for d in cluster_details]

    summary = {
        "network": network_name,
        "method": method_name,
        "n_nodes": len(all_nodes),
        "n_clusters_total": len(com2nodes),
        "n_clusters_non_singleton": len(non_singleton_coms),
        "n_singleton_clusters": len(singleton_coms),
        "node_coverage": node_coverage,
        # Guard the empty cases: np.mean([]) emits a RuntimeWarning and NaN.
        "mean_mixing_param": np.mean(mixing_params) if mixing_params else 0,
        "median_mixing_param": np.median(mixing_params) if mixing_params else 0,
        "mean_cluster_size": np.mean(sizes) if sizes else 0,
        "median_cluster_size": np.median(sizes) if sizes else 0,
        "mean_edge_density": np.mean(densities) if densities else 0,
        "median_edge_density": np.median(densities) if densities else 0,
        "mean_conductance": np.mean([d["conductance"] for d in cluster_details]) if cluster_details else 0,
        "mean_degree_density": np.mean([d["degree_density"] for d in cluster_details]) if cluster_details else 0,
    }

    return summary, cluster_details, mixing_params
def compute_all_stats():
    """Compute stats for all methods on all networks plus ground truth."""
    summaries = []
    stats_dir = os.path.join(RESULTS_DIR, "stats")
    os.makedirs(stats_dir, exist_ok=True)

    for net_name, net in NETWORKS.items():
        # Ground truth is always evaluated; it is expected to exist.
        print(f"Computing stats: {net_name} / ground_truth")
        summary, details, mixing = compute_cluster_stats(
            net_name, "ground_truth", net["com_gt_tsv"]
        )
        summaries.append(summary)
        _save_details(stats_dir, net_name, "ground_truth", details, mixing)

        # Estimated clusterings are optional: skip whatever was not produced.
        for method in METHODS:
            m_name = method["name"]
            est_path = os.path.join(RESULTS_DIR, net_name, m_name, "com.tsv")
            if not os.path.exists(est_path):
                print(f" WARNING: {est_path} not found, skipping")
                continue
            print(f"Computing stats: {net_name} / {m_name}")
            summary, details, mixing = compute_cluster_stats(
                net_name, m_name, est_path
            )
            summaries.append(summary)
            _save_details(stats_dir, net_name, m_name, details, mixing)

    stats_df = pd.DataFrame(summaries)
    out_path = os.path.join(stats_dir, "cluster_stats_summary.csv")
    stats_df.to_csv(out_path, index=False)
    print(f"\nCluster stats saved to {out_path}")
    print(stats_df.to_string(index=False))
    return stats_df


def _save_details(stats_dir, net_name, method_name, details, mixing):
    """Save per-cluster details and mixing params as JSON."""
    out_dir = os.path.join(stats_dir, net_name, method_name)
    os.makedirs(out_dir, exist_ok=True)
    with open(os.path.join(out_dir, "cluster_details.json"), "w") as fh:
        json.dump(details, fh, indent=2)
    with open(os.path.join(out_dir, "mixing_params.json"), "w") as fh:
        json.dump(mixing, fh)
def plot_accuracy_heatmap():
    """Create a heatmap of accuracy metrics (network x method)."""
    acc_path = os.path.join(RESULTS_DIR, "accuracy", "accuracy_table.csv")
    if not os.path.exists(acc_path):
        print("No accuracy table found, skipping heatmap")
        return
    acc_df = pd.read_csv(acc_path)
    net_order = list(NETWORKS.keys())

    for metric in ("ami", "ari", "nmi"):
        # Pivot to a network x method grid in canonical order.
        grid = (
            acc_df.pivot(index="network", columns="method", values=metric)
            .reindex(index=net_order, columns=METHOD_NAMES)
        )

        fig, ax = plt.subplots(figsize=(8, 3.5))
        image = ax.imshow(grid.values, cmap="YlOrRd", aspect="auto",
                          vmin=0, vmax=1)

        ax.set_xticks(range(len(METHOD_NAMES)))
        ax.set_xticklabels([METHOD_LABELS.get(m, m) for m in METHOD_NAMES],
                           rotation=30, ha="right", fontsize=9)
        ax.set_yticks(range(len(net_order)))
        ax.set_yticklabels(net_order, fontsize=10)

        # Annotate every populated cell; flip the text color on dark cells.
        for (row, col), cell in np.ndenumerate(grid.values):
            if not np.isnan(cell):
                ax.text(col, row, f"{cell:.3f}", ha="center", va="center",
                        fontsize=9, color="black" if cell < 0.6 else "white")

        plt.colorbar(image, ax=ax, fraction=0.046, pad=0.04)
        ax.set_title(f"{metric.upper()} Accuracy", fontsize=12)
        plt.tight_layout()
        plt.savefig(os.path.join(FIGURES_DIR, f"heatmap_{metric}.pdf"),
                    bbox_inches="tight")
        plt.close()
        print(f" Saved heatmap_{metric}.pdf")
def plot_cluster_size_distributions():
    """Histogram of cluster sizes per network/method."""
    stats_dir = os.path.join(RESULTS_DIR, "stats")

    for net_name in NETWORKS:
        panels = ["ground_truth"] + METHOD_NAMES
        fig, axes = plt.subplots(2, 3, figsize=(14, 8))
        flat_axes = axes.flatten()

        for pos, label in enumerate(panels):
            ax = flat_axes[pos]
            det_path = os.path.join(stats_dir, net_name, label,
                                    "cluster_details.json")
            if not os.path.exists(det_path):
                ax.set_title(METHOD_LABELS.get(label, label))
                ax.text(0.5, 0.5, "No data", ha="center", va="center",
                        transform=ax.transAxes)
                continue

            with open(det_path) as fh:
                sizes = [d["n"] for d in json.load(fh)]

            if sizes:
                ax.hist(sizes, bins=min(50, max(10, len(set(sizes)))),
                        edgecolor="black", alpha=0.7, color="steelblue")
            ax.set_title(METHOD_LABELS.get(label, label), fontsize=10)
            ax.set_xlabel("Cluster size")
            ax.set_ylabel("Count")
            # Heavy-tailed size distributions read better on a log axis.
            if sizes and max(sizes) > 100:
                ax.set_xscale("log")

        # Remove extra subplot if any (no-op for the current 6-panel layout).
        for pos in range(len(panels), len(flat_axes)):
            fig.delaxes(flat_axes[pos])

        fig.suptitle(f"Cluster Size Distribution — {net_name}", fontsize=13)
        plt.tight_layout()
        plt.savefig(os.path.join(FIGURES_DIR, f"cluster_sizes_{net_name}.pdf"),
                    bbox_inches="tight")
        plt.close()
        print(f" Saved cluster_sizes_{net_name}.pdf")
def plot_edge_density_boxplots():
    """Boxplots of edge density across methods for each network."""
    stats_dir = os.path.join(RESULTS_DIR, "stats")

    for net_name in NETWORKS:
        series, series_labels = [], []
        for label in ["ground_truth"] + METHOD_NAMES:
            det_path = os.path.join(stats_dir, net_name, label,
                                    "cluster_details.json")
            if not os.path.exists(det_path):
                continue
            with open(det_path) as fh:
                densities = [d["edge_density"] for d in json.load(fh)]
            if densities:
                series.append(densities)
                series_labels.append(METHOD_LABELS.get(label, label))

        if not series:
            continue

        fig, ax = plt.subplots(figsize=(9, 4))
        boxes = ax.boxplot(series, tick_labels=series_labels,
                           patch_artist=True, showfliers=False)
        for box in boxes["boxes"]:
            box.set_facecolor("lightblue")
        ax.set_ylabel("Edge Density")
        ax.set_title(f"Edge Density Distribution — {net_name}")
        plt.xticks(rotation=20, ha="right")
        plt.tight_layout()
        plt.savefig(os.path.join(FIGURES_DIR, f"edge_density_{net_name}.pdf"),
                    bbox_inches="tight")
        plt.close()
        print(f" Saved edge_density_{net_name}.pdf")
def plot_node_coverage_comparison():
    """Bar chart of node coverage per method/network."""
    stats_path = os.path.join(RESULTS_DIR, "stats", "cluster_stats_summary.csv")
    if not os.path.exists(stats_path):
        return

    summary = pd.read_csv(stats_path)
    net_names = list(NETWORKS.keys())
    bar_groups = ["ground_truth"] + METHOD_NAMES

    fig, ax = plt.subplots(figsize=(10, 4.5))
    x = np.arange(len(net_names))
    width = 0.13
    # Center the group of bars around each network's tick.
    offsets = np.arange(len(bar_groups)) - len(bar_groups) / 2 + 0.5
    palette = plt.cm.Set2(np.linspace(0, 1, len(bar_groups)))

    for pos, method in enumerate(bar_groups):
        heights = []
        for net in net_names:
            match = summary[(summary["network"] == net)
                            & (summary["method"] == method)]
            heights.append(match["node_coverage"].values[0] if len(match) > 0 else 0)
        ax.bar(x + offsets[pos] * width, heights, width,
               label=METHOD_LABELS.get(method, method), color=palette[pos])

    ax.set_xticks(x)
    ax.set_xticklabels(net_names)
    ax.set_ylabel("Node Coverage")
    ax.set_title("Node Coverage by Network and Method")
    ax.legend(fontsize=7, ncol=2)
    ax.set_ylim(0, 1.05)
    plt.tight_layout()
    plt.savefig(os.path.join(FIGURES_DIR, "node_coverage.pdf"),
                bbox_inches="tight")
    plt.close()
    print(" Saved node_coverage.pdf")
def generate_latex_accuracy_table():
    """Generate a LaTeX accuracy table."""
    acc_path = os.path.join(RESULTS_DIR, "accuracy", "accuracy_table.csv")
    if not os.path.exists(acc_path):
        return

    acc_df = pd.read_csv(acc_path)
    lines = [
        r"\begin{table}[htbp]",
        r"\centering",
        r"\caption{Community detection accuracy (AMI, ARI, NMI) on EC-SBM networks.}",
        r"\label{tab:accuracy}",
        r"\begin{tabular}{llrrr}",
        r"\toprule",
        r"Network & Method & AMI & ARI & NMI \\",
        r"\midrule",
    ]

    for net_name in NETWORKS:
        shown = net_name  # only print the network name on its first row
        for _, row in acc_df[acc_df["network"] == net_name].iterrows():
            m_label = METHOD_LABELS.get(row["method"], row["method"])
            lines.append(
                f"{shown} & {m_label} & {row['ami']:.4f} & {row['ari']:.4f} & {row['nmi']:.4f} \\\\"
            )
            shown = ""
        lines.append(r"\midrule")

    # The trailing rule after the last group becomes \bottomrule.
    lines[-1] = r"\bottomrule"
    lines += [r"\end{tabular}", r"\end{table}"]

    out_path = os.path.join(FIGURES_DIR, "accuracy_table.tex")
    with open(out_path, "w") as fh:
        fh.write("\n".join(lines))
    print(" Saved accuracy_table.tex")
def load_edge_list(path):
    """Read a headerless two-column TSV edge list into a DataFrame.

    Columns are named ``src`` and ``tgt`` and kept as strings so node
    identifiers are never reinterpreted as numbers; ``#`` starts a comment.
    """
    return pd.read_csv(
        path,
        sep="\t",
        header=None,
        names=["src", "tgt"],
        dtype=str,
        comment="#",
    )
def load_communities(path):
    """Load a tab-separated community file (no header: node, community).

    Blank lines and ``#`` comment lines are ignored, as are lines with fewer
    than two tab-separated fields; extra fields beyond the second are
    dropped.  Returns ``{node_str: community_str}``.
    """
    node2com = {}
    with open(path, "r") as fh:
        for raw in fh:
            stripped = raw.strip()
            if not stripped or stripped.startswith("#"):
                continue
            fields = stripped.split("\t")
            if len(fields) < 2:
                continue
            node2com[fields[0]] = fields[1]
    return node2com
def save_communities(node2com, path):
    """Save community assignments as TSV lines of ``node<TAB>community``.

    Rows are written in lexicographic node order so output is deterministic.
    Missing parent directories are created on demand.
    """
    import os
    parent = os.path.dirname(path)
    # os.makedirs("") raises FileNotFoundError, so only create the parent
    # when the path actually has a directory component (the original crashed
    # on bare filenames like "com.tsv").
    if parent:
        os.makedirs(parent, exist_ok=True)
    with open(path, "w") as fh:
        # Dict keys are unique, so sorting items() sorts by node name.
        for node, com in sorted(node2com.items()):
            fh.write(f"{node}\t{com}\n")
def run_graphtool_sbm(network_name):
    """Fit a flat (non-nested) SBM with graph-tool and save the blocks as
    the estimated communities for ``network_name``.

    Returns the ``{node_name: block_id_str}`` mapping that was written.
    """
    net = NETWORKS[network_name]
    edge_df = load_edge_list(net["edge_tsv"])
    g, name_to_idx, idx_to_name = build_graphtool_graph(edge_df)

    print(f" Graph: {g.num_vertices()} nodes, {g.num_edges()} edges")

    # Seed both numpy and graph-tool RNGs from the shared config constant so
    # runs are reproducible AND consistent with the other methods.  The
    # original hard-coded 42 here; config.SEED (= 42) is the single source
    # of truth, so behavior is unchanged.
    from config import SEED
    np.random.seed(SEED)
    gt.seed_rng(SEED)

    # Use minimize_blockmodel_dl for flat (non-nested) SBM
    state = gt.minimize_blockmodel_dl(g)

    # Extract block assignments
    blocks = state.get_blocks()
    n_blocks = len(set(blocks.a))
    print(f" Found {n_blocks} blocks")

    node2com = {idx_to_name[int(v)]: str(blocks[v]) for v in g.vertices()}

    out_path = os.path.join(RESULTS_DIR, network_name, "graphtool_sbm", "com.tsv")
    save_communities(node2com, out_path)
    print(f" Saved to {out_path}")
    return node2com
def run_infomap(network_name):
    """Run two-level undirected Infomap on one network and save the clustering.

    Nodes absent from Infomap's output are assigned fresh singleton
    community IDs so every graph node ends up labelled.  Returns the
    ``{node_name: community_str}`` mapping that was written.
    """
    net = NETWORKS[network_name]
    edge_df = load_edge_list(net["edge_tsv"])

    # Build a contiguous integer id for every node name.
    import pandas as pd
    all_nodes = pd.unique(edge_df[["src", "tgt"]].values.ravel("K"))
    name_to_idx = {name: i for i, name in enumerate(all_nodes)}
    idx_to_name = {i: name for name, i in name_to_idx.items()}

    n = len(all_nodes)
    print(f" Network: {n} nodes, {len(edge_df)} edges")

    im = infomap.Infomap(f"--two-level --flow-model undirected --seed {SEED}")

    # Feed edges straight from the columns; iterrows() builds a Series per
    # row and is far slower on large edge lists.
    for src, tgt in zip(edge_df["src"], edge_df["tgt"]):
        im.add_link(name_to_idx[src], name_to_idx[tgt])

    im.run()

    print(f" Found {im.num_top_modules} top modules, codelength={im.codelength:.4f}")

    # Leaf nodes of the Infomap tree carry the module assignment.
    node2com = {}
    for node in im.tree:
        if node.is_leaf:
            node2com[idx_to_name[node.node_id]] = str(node.module_id)

    # Assign singleton communities to any nodes not in infomap output.
    # default=0 guards the degenerate empty-clustering case, where the
    # original bare max() would raise ValueError.
    max_com = max((int(c) for c in node2com.values()), default=0) + 1
    for idx in range(n):
        name = idx_to_name[idx]
        if name not in node2com:
            node2com[name] = str(max_com)
            max_com += 1

    out_path = os.path.join(RESULTS_DIR, network_name, "infomap", "com.tsv")
    save_communities(node2com, out_path)
    print(f" Saved to {out_path}")
    return node2com
def run_leiden(network_name, method_name, quality, resolution=None):
    """Run Leiden community detection on one network and save the clustering.

    ``quality`` selects the objective: ``"modularity"`` or ``"cpm"`` (the
    latter uses ``resolution``).  Output goes to
    RESULTS_DIR/<network>/<method>/com.tsv; returns the written mapping.
    """
    net = NETWORKS[network_name]
    graph, _, idx_to_name = build_igraph(load_edge_list(net["edge_tsv"]))

    print(f" Graph: {graph.vcount()} nodes, {graph.ecount()} edges")

    if quality == "modularity":
        partition = leidenalg.find_partition(
            graph, leidenalg.ModularityVertexPartition, seed=SEED
        )
    elif quality == "cpm":
        partition = leidenalg.find_partition(
            graph, leidenalg.CPMVertexPartition,
            resolution_parameter=resolution, seed=SEED,
        )
    else:
        raise ValueError(f"Unknown quality function: {quality}")

    print(f" Found {len(partition)} communities, modularity={partition.modularity:.4f}")

    # Map igraph vertex indices back to original node names.
    node2com = {
        idx_to_name[vertex]: str(comm_id)
        for comm_id, members in enumerate(partition)
        for vertex in members
    }

    out_path = os.path.join(RESULTS_DIR, network_name, method_name, "com.tsv")
    save_communities(node2com, out_path)
    print(f" Saved to {out_path}")
    return node2com
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--network", required=True)
    parser.add_argument("--method", required=True,
                        choices=["leiden_mod", "leiden_cpm_01", "leiden_cpm_001"])
    cli = parser.parse_args()

    # Translate the CLI method name into run_leiden keyword arguments.
    configs = {
        "leiden_mod": {"quality": "modularity"},
        "leiden_cpm_01": {"quality": "cpm", "resolution": 0.1},
        "leiden_cpm_001": {"quality": "cpm", "resolution": 0.01},
    }
    print(f"Running {cli.method} on {cli.network}...")
    run_leiden(cli.network, cli.method, **configs[cli.method])
