"""Compute cluster statistics for all (network, method) pairs + ground truth."""

import argparse
import sys
import os
import json
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))

import numpy as np
import pandas as pd
import igraph as ig

from config import NETWORKS, METHODS, RESULTS_DIR
from load_data import load_edge_list, load_communities


def build_neighbors(edge_df):
    """Build an undirected adjacency dict from an edge DataFrame.

    Args:
        edge_df: DataFrame with one row per edge and at least the columns
            ``"src"`` and ``"tgt"`` holding the two endpoint ids.

    Returns:
        dict mapping every node that appears as an endpoint to the set of its
        neighbours. Each edge is recorded in both directions, and duplicate
        edges collapse because neighbours are stored in sets.
    """
    neighbors = {}
    # Iterate the two columns directly instead of ``iterrows()``: building a
    # Series per row is slow and coerces the whole row to one dtype, so
    # integer ids turn into floats as soon as the frame has a float column.
    # ``tolist()`` yields builtin Python scalars.
    for s, t in zip(edge_df["src"].tolist(), edge_df["tgt"].tolist()):
        neighbors.setdefault(s, set()).add(t)
        neighbors.setdefault(t, set()).add(s)
    return neighbors


def _cluster_record(com_id, nodes, neighbors):
    """Compute the per-cluster statistics record for a single cluster.

    Args:
        com_id: Identifier of the cluster (stored verbatim in the record).
        nodes: Set of node ids belonging to the cluster.
        neighbors: Adjacency dict mapping node id -> set of neighbour ids.

    Returns:
        dict with keys ``com_id``, ``n``, ``m_internal``, ``c_boundary``,
        ``edge_density``, ``degree_density``, ``conductance``, ``mincut`` and
        ``mincut_over_log10n``.
    """
    n = len(nodes)

    # Walk every adjacency entry of every member once: an entry pointing back
    # into the cluster is one endpoint of an internal edge, anything else is
    # an edge leaving the cluster.
    internal_endpoints = 0
    c_boundary = 0
    for node in nodes:
        for nbr in neighbors.get(node, ()):
            if nbr in nodes:
                internal_endpoints += 1
            else:
                c_boundary += 1
    m_internal = internal_endpoints // 2  # each internal edge is seen from both ends

    edge_density = 2 * m_internal / (n * (n - 1)) if n > 1 else 0.0
    degree_density = m_internal / n if n > 0 else 0.0
    volume = 2 * m_internal + c_boundary
    conductance = c_boundary / volume if volume > 0 else 0.0

    # Minimum edge cut of the induced subgraph via igraph. Clusters without
    # any internal edge keep a cut of 0 without building a graph.
    mincut = 0
    if n >= 2 and m_internal >= 1:
        node_list = sorted(nodes)
        local_map = {nd: i for i, nd in enumerate(node_list)}
        # Keep only the (low, high) orientation so each edge is added once.
        edges = [
            (local_map[nd], local_map[nbr])
            for nd in node_list
            for nbr in neighbors.get(nd, ())
            if nbr in nodes and local_map[nd] < local_map[nbr]
        ]
        sg = ig.Graph(n=n, edges=edges, directed=False)
        mincut = sg.mincut().value

    mincut_over_log10n = mincut / np.log10(n) if n > 1 else 0.0

    return {
        "com_id": com_id,
        "n": n,
        "m_internal": m_internal,
        "c_boundary": c_boundary,
        "edge_density": edge_density,
        "degree_density": degree_density,
        "conductance": conductance,
        "mincut": int(mincut),
        "mincut_over_log10n": mincut_over_log10n,
    }


def _mixing_parameters(all_nodes, neighbors, node2com):
    """Return one mixing parameter per node in ``all_nodes``.

    The value is the fraction of a node's neighbours assigned to a different
    community. Nodes without a community assignment get 1.0; nodes without
    neighbours get 0.0.
    """
    mixing = []
    for node in all_nodes:
        if node not in node2com:
            mixing.append(1.0)  # outlier: not assigned to any community
            continue
        nbrs = neighbors.get(node, set())
        if not nbrs:
            mixing.append(0.0)
            continue
        my_com = node2com[node]
        n_in = sum(1 for nbr in nbrs if node2com.get(nbr) == my_com)
        mixing.append(1.0 - n_in / len(nbrs))
    return mixing


def _aggregate(func, values):
    """Apply ``func`` to ``values``; return 0 for empty input instead of NaN."""
    return func(values) if values else 0


def compute_cluster_stats(network_name, method_name, com_path):
    """Compute statistics for a given community assignment.

    Args:
        network_name: Key into ``NETWORKS``; the entry's ``"edge_tsv"`` value
            is passed to ``load_edge_list`` to obtain the graph.
        method_name: Label stored in the summary's ``"method"`` field.
        com_path: Path handed to ``load_communities``, whose result is used
            as a node -> community mapping.

    Returns:
        Tuple ``(summary, cluster_details, mixing_params)``:

        * ``summary`` - dict of network-level aggregates (one CSV row).
        * ``cluster_details`` - list of per-cluster dicts for every cluster
          with more than one node, ordered by community id.
        * ``mixing_params`` - list with one mixing parameter per graph node.

        Every mean/median aggregate is 0 when it has nothing to aggregate.
    """
    net = NETWORKS[network_name]
    edge_df = load_edge_list(net["edge_tsv"])
    neighbors = build_neighbors(edge_df)

    all_nodes = set(neighbors)
    node2com = load_communities(com_path)

    # Invert the assignment: community id -> set of member nodes.
    com2nodes = {}
    for node, com in node2com.items():
        com2nodes.setdefault(com, set()).add(node)

    # Only clusters with at least two members get per-cluster statistics.
    non_singleton_coms = {c: members for c, members in com2nodes.items() if len(members) > 1}
    n_singletons = sum(1 for members in com2nodes.values() if len(members) == 1)

    # Coverage is measured over graph nodes only. The community file may list
    # nodes that never occur in the edge list; counting them in the numerator
    # could push the ratio above 1.
    clustered_nodes = set().union(*non_singleton_coms.values())
    if all_nodes:
        node_coverage = len(clustered_nodes & all_nodes) / len(all_nodes)
    else:
        node_coverage = 0.0

    # Per-cluster stats, in community-id order.
    total_clusters = len(non_singleton_coms)
    cluster_details = []
    ordered = sorted(non_singleton_coms.items(), key=lambda item: item[0])
    for done, (com_id, nodes) in enumerate(ordered, start=1):
        cluster_details.append(_cluster_record(com_id, nodes, neighbors))
        if done % 500 == 0:
            print(f"    ... {done}/{total_clusters} clusters processed")

    mixing_params = _mixing_parameters(all_nodes, neighbors, node2com)

    # Pull each per-cluster column out once instead of once per aggregate.
    sizes = [d["n"] for d in cluster_details]
    edge_densities = [d["edge_density"] for d in cluster_details]
    conductances = [d["conductance"] for d in cluster_details]
    degree_densities = [d["degree_density"] for d in cluster_details]
    mincuts = [d["mincut"] for d in cluster_details]
    mincut_ratios = [d["mincut_over_log10n"] for d in cluster_details]

    summary = {
        "network": network_name,
        "method": method_name,
        "n_nodes": len(all_nodes),
        "n_clusters_total": len(com2nodes),
        "n_clusters_non_singleton": len(non_singleton_coms),
        "n_singleton_clusters": n_singletons,
        "node_coverage": node_coverage,
        # Guarded like every other aggregate so an empty graph yields 0
        # rather than NaN plus a NumPy RuntimeWarning.
        "mean_mixing_param": _aggregate(np.mean, mixing_params),
        "median_mixing_param": _aggregate(np.median, mixing_params),
        "mean_cluster_size": _aggregate(np.mean, sizes),
        "median_cluster_size": _aggregate(np.median, sizes),
        "mean_edge_density": _aggregate(np.mean, edge_densities),
        "median_edge_density": _aggregate(np.median, edge_densities),
        "mean_conductance": _aggregate(np.mean, conductances),
        "mean_degree_density": _aggregate(np.mean, degree_densities),
        "mean_mincut": _aggregate(np.mean, mincuts),
        "median_mincut": _aggregate(np.median, mincuts),
        "mean_mincut_over_log10n": _aggregate(np.mean, mincut_ratios),
        "n_connected": sum(1 for cut in mincuts if cut > 0),
        "n_disconnected": sum(1 for cut in mincuts if cut == 0),
        "n_wellconnected": sum(1 for d in cluster_details if d["mincut"] > np.log10(d["n"])),
    }

    return summary, cluster_details, mixing_params


def compute_all_stats():
    """Run the cluster statistics for every network: ground truth plus each method.

    Per-cluster details and mixing parameters are written below
    ``RESULTS_DIR/stats/<network>/<label>/``; the summaries are combined into
    ``RESULTS_DIR/stats/cluster_stats_summary.csv``. A method whose
    ``com.tsv`` file does not exist is skipped with a warning.

    Returns:
        pandas DataFrame with one summary row per processed
        (network, label) pair.
    """
    stats_dir = os.path.join(RESULTS_DIR, "stats")
    os.makedirs(stats_dir, exist_ok=True)
    collected = []

    def _process(net_name, label, com_path):
        # Compute, remember the summary row, and persist the detail files.
        print(f"Computing stats: {net_name} / {label}")
        summary, details, mixing = compute_cluster_stats(net_name, label, com_path)
        collected.append(summary)
        _save_details(stats_dir, net_name, label, details, mixing)

    for net_name in NETWORKS:
        # The ground-truth assignment always comes first for a network.
        _process(net_name, "ground_truth", NETWORKS[net_name]["com_gt_tsv"])

        # Then every method that has produced a result file.
        for method in METHODS:
            label = method["name"]
            com_path = os.path.join(RESULTS_DIR, net_name, label, "com.tsv")
            if os.path.exists(com_path):
                _process(net_name, label, com_path)
            else:
                print(f"  WARNING: {com_path} not found, skipping")

    out_path = os.path.join(stats_dir, "cluster_stats_summary.csv")
    df = pd.DataFrame(collected)
    df.to_csv(out_path, index=False)
    print(f"\nCluster stats saved to {out_path}")
    print(df.to_string(index=False))
    return df


def _json_default(obj):
    """``json.dump`` fallback: convert NumPy scalars/arrays to builtin types.

    Raises:
        TypeError: for any other object, matching the ``json`` module's own
            behaviour so unexpected values are not silently stringified.
    """
    if isinstance(obj, np.generic):
        return obj.item()
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    raise TypeError(f"Object of type {type(obj).__name__} is not JSON serializable")


def _save_details(stats_dir, net_name, method_name, details, mixing):
    """Save per-cluster details and mixing params as JSON.

    Writes ``cluster_details.json`` (indented) and ``mixing_params.json``
    into ``<stats_dir>/<net_name>/<method_name>/``, creating the directory if
    it does not exist.

    Args:
        stats_dir: Root directory for the statistics output.
        net_name: Network name, used as the first sub-directory.
        method_name: Method label, used as the second sub-directory.
        details: List of per-cluster dicts.
        mixing: List of per-node mixing parameters.
    """
    out_dir = os.path.join(stats_dir, net_name, method_name)
    os.makedirs(out_dir, exist_ok=True)
    # NumPy integer/bool scalars (e.g. a community id that came out of a
    # DataFrame) are not builtin subclasses, so json.dump rejects them unless
    # the default hook converts them.
    with open(os.path.join(out_dir, "cluster_details.json"), "w", encoding="utf-8") as f:
        json.dump(details, f, indent=2, default=_json_default)
    with open(os.path.join(out_dir, "mixing_params.json"), "w", encoding="utf-8") as f:
        json.dump(mixing, f, default=_json_default)


# Script entry point: when executed directly (not imported), compute the
# statistics for every configured network and write them to disk.
if __name__ == "__main__":
    compute_all_stats()