summaryrefslogtreecommitdiff
path: root/scripts/compute_stats.py
diff options
context:
space:
mode:
Diffstat (limited to 'scripts/compute_stats.py')
-rw-r--r--scripts/compute_stats.py165
1 file changed, 165 insertions, 0 deletions
diff --git a/scripts/compute_stats.py b/scripts/compute_stats.py
new file mode 100644
index 0000000..2e88252
--- /dev/null
+++ b/scripts/compute_stats.py
@@ -0,0 +1,165 @@
+"""Compute cluster statistics for all (network, method) pairs + ground truth."""
+
+import argparse
+import sys
+import os
+import json
+sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
+
+import numpy as np
+import pandas as pd
+
+from config import NETWORKS, METHODS, RESULTS_DIR
+from load_data import load_edge_list, load_communities
+
+
def build_neighbors(edge_df):
    """Build an undirected adjacency dict from an edge DataFrame.

    Parameters
    ----------
    edge_df : pandas.DataFrame
        Must contain "src" and "tgt" columns; each row is one edge.

    Returns
    -------
    dict
        Maps node -> set of neighbor nodes. Both directions are added,
        so the adjacency is symmetric; duplicate edges collapse in the set.
    """
    neighbors = {}
    # Iterating zipped columns is far faster than DataFrame.iterrows(),
    # which constructs a Series object for every row.
    for s, t in zip(edge_df["src"], edge_df["tgt"]):
        neighbors.setdefault(s, set()).add(t)
        neighbors.setdefault(t, set()).add(s)
    return neighbors
+
+
def compute_cluster_stats(network_name, method_name, com_path):
    """Compute statistics for a given community assignment.

    Parameters
    ----------
    network_name : key into NETWORKS giving the network's edge-list path.
    method_name : label recorded in the summary row (e.g. "ground_truth").
    com_path : path to the node->community file read by load_communities.

    Returns
    -------
    (summary, cluster_details, mixing_params)
        summary : flat dict of aggregate metrics for this (network, method).
        cluster_details : list of per-cluster dicts, non-singleton clusters only.
        mixing_params : list of per-node mixing parameters, one entry per
        node appearing in the edge list.
    """
    net = NETWORKS[network_name]
    edge_df = load_edge_list(net["edge_tsv"])
    neighbors = build_neighbors(edge_df)

    # NOTE(review): the node universe is taken from the edge list, so
    # degree-0 nodes (if the community file mentions any) are excluded
    # from n_nodes, coverage, and mixing params — confirm this is intended.
    all_nodes = set(neighbors.keys())
    node2com = load_communities(com_path)

    # Invert node->community into community->set-of-members.
    com2nodes = {}
    for node, com in node2com.items():
        com2nodes.setdefault(com, set()).add(node)

    # Separate singletons vs non-singletons; the per-cluster metrics
    # below only consider clusters with at least two members.
    non_singleton_coms = {c: nodes for c, nodes in com2nodes.items() if len(nodes) > 1}
    singleton_coms = {c: nodes for c, nodes in com2nodes.items() if len(nodes) == 1}

    # Nodes in non-singleton clusters
    nodes_in_clusters = set()
    for nodes in non_singleton_coms.values():
        nodes_in_clusters.update(nodes)

    # Fraction of graph nodes that belong to some non-singleton cluster.
    node_coverage = len(nodes_in_clusters) / len(all_nodes) if all_nodes else 0.0

    # Per-cluster stats
    cluster_details = []
    for com_id, nodes in sorted(non_singleton_coms.items()):
        n = len(nodes)
        # One pass over the cluster's adjacency lists counts internal
        # edge endpoints (m_internal) and boundary endpoints (c_boundary).
        m_internal = 0
        c_boundary = 0
        for node in nodes:
            for nbr in neighbors.get(node, set()):
                if nbr in nodes:
                    m_internal += 1
                else:
                    c_boundary += 1
        m_internal //= 2  # each internal edge was counted from both endpoints

        # edge_density: realized fraction of the n*(n-1)/2 possible internal edges.
        edge_density = 2 * m_internal / (n * (n - 1)) if n > 1 else 0.0
        # degree_density: internal edges per member node.
        degree_density = m_internal / n if n > 0 else 0.0
        # conductance: boundary edges over cluster volume (sum of member
        # degrees = 2*m_internal + c_boundary); defined as 0.0 when the
        # cluster touches no edges at all.
        conductance = c_boundary / (2 * m_internal + c_boundary) if (2 * m_internal + c_boundary) > 0 else 0.0

        cluster_details.append({
            "com_id": com_id,
            "n": n,
            "m_internal": m_internal,
            "c_boundary": c_boundary,
            "edge_density": edge_density,
            "degree_density": degree_density,
            "conductance": conductance,
        })

    # Per-node mixing parameter: fraction of a node's neighbors that lie
    # OUTSIDE its own community; 1.0 for nodes with no community at all.
    # NOTE(review): set iteration order is arbitrary, so the ordering of
    # mixing_params is not meaningful — only its distribution is.
    mixing_params = []
    for node in all_nodes:
        if node not in node2com:
            mixing_params.append(1.0)  # outlier: node has no community assignment
            continue
        my_com = node2com[node]
        nbrs = neighbors.get(node, set())
        if len(nbrs) == 0:
            mixing_params.append(0.0)
            continue
        n_in = sum(1 for nbr in nbrs if node2com.get(nbr) == my_com)
        mixing_params.append(1.0 - n_in / len(nbrs))

    # Aggregate summary row; cluster-level means/medians fall back to 0
    # when there are no non-singleton clusters.
    summary = {
        "network": network_name,
        "method": method_name,
        "n_nodes": len(all_nodes),
        "n_clusters_total": len(com2nodes),
        "n_clusters_non_singleton": len(non_singleton_coms),
        "n_singleton_clusters": len(singleton_coms),
        "node_coverage": node_coverage,
        "mean_mixing_param": np.mean(mixing_params),
        "median_mixing_param": np.median(mixing_params),
        "mean_cluster_size": np.mean([d["n"] for d in cluster_details]) if cluster_details else 0,
        "median_cluster_size": np.median([d["n"] for d in cluster_details]) if cluster_details else 0,
        "mean_edge_density": np.mean([d["edge_density"] for d in cluster_details]) if cluster_details else 0,
        "median_edge_density": np.median([d["edge_density"] for d in cluster_details]) if cluster_details else 0,
        "mean_conductance": np.mean([d["conductance"] for d in cluster_details]) if cluster_details else 0,
        "mean_degree_density": np.mean([d["degree_density"] for d in cluster_details]) if cluster_details else 0,
    }

    return summary, cluster_details, mixing_params
+
+
def compute_all_stats():
    """Run cluster-statistics computation for every configured network.

    Processes the ground-truth partition plus every method in METHODS
    (skipping methods whose output file is missing), saves per-run
    detail files, writes a combined summary CSV under RESULTS_DIR/stats,
    and returns the summary as a DataFrame.
    """
    stats_dir = os.path.join(RESULTS_DIR, "stats")
    os.makedirs(stats_dir, exist_ok=True)

    summaries = []
    for net_name, net in NETWORKS.items():
        # Ground-truth communities first.
        print(f"Computing stats: {net_name} / ground_truth")
        summary, details, mixing = compute_cluster_stats(
            net_name, "ground_truth", net["com_gt_tsv"]
        )
        summaries.append(summary)
        _save_details(stats_dir, net_name, "ground_truth", details, mixing)

        # Then each estimation method that actually produced output.
        for method in METHODS:
            name = method["name"]
            est_path = os.path.join(RESULTS_DIR, net_name, name, "com.tsv")
            if not os.path.exists(est_path):
                print(f"  WARNING: {est_path} not found, skipping")
                continue
            print(f"Computing stats: {net_name} / {name}")
            summary, details, mixing = compute_cluster_stats(net_name, name, est_path)
            summaries.append(summary)
            _save_details(stats_dir, net_name, name, details, mixing)

    df = pd.DataFrame(summaries)
    out_path = os.path.join(stats_dir, "cluster_stats_summary.csv")
    df.to_csv(out_path, index=False)
    print(f"\nCluster stats saved to {out_path}")
    print(df.to_string(index=False))
    return df
+
+
+def _save_details(stats_dir, net_name, method_name, details, mixing):
+ """Save per-cluster details and mixing params as JSON."""
+ out_dir = os.path.join(stats_dir, net_name, method_name)
+ os.makedirs(out_dir, exist_ok=True)
+ with open(os.path.join(out_dir, "cluster_details.json"), "w") as f:
+ json.dump(details, f, indent=2)
+ with open(os.path.join(out_dir, "mixing_params.json"), "w") as f:
+ json.dump(mixing, f)
+
+
+if __name__ == "__main__":
+ compute_all_stats()