scripts/generate_plots.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411

"""Generate all figures and LaTeX tables for the EC-SBM analysis."""

import sys
import os
import json
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))

import numpy as np
import pandas as pd
import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt
from matplotlib.colors import Normalize
from matplotlib.cm import ScalarMappable

from config import NETWORKS, METHODS, RESULTS_DIR

FIGURES_DIR = os.path.join(RESULTS_DIR, "figures")
os.makedirs(FIGURES_DIR, exist_ok=True)

METHOD_NAMES = [m["name"] for m in METHODS]
METHOD_LABELS = {
    "leiden_mod": "Leiden-Mod",
    "leiden_cpm_01": "Leiden-CPM(0.1)",
    "leiden_cpm_001": "Leiden-CPM(0.01)",
    "infomap": "Infomap",
    "graphtool_sbm": "graph-tool SBM",
}
NET_LABELS = {
    "polblogs": "polblogs",
    "topology": "topology",
    "internet_as": "internet\\_as",
}


def plot_accuracy_heatmap():
    """Create a heatmap of accuracy metrics (network x method)."""
    acc_path = os.path.join(RESULTS_DIR, "accuracy", "accuracy_table.csv")
    if not os.path.exists(acc_path):
        print("No accuracy table found, skipping heatmap")
        return
    df = pd.read_csv(acc_path)

    for metric in ["ami", "ari", "nmi"]:
        fig, ax = plt.subplots(figsize=(8, 3.5))
        pivot = df.pivot(index="network", columns="method", values=metric)
        pivot = pivot.reindex(index=list(NETWORKS.keys()), columns=METHOD_NAMES)

        im = ax.imshow(pivot.values, cmap="YlOrRd", aspect="auto",
                        vmin=0, vmax=1)
        ax.set_xticks(range(len(METHOD_NAMES)))
        ax.set_xticklabels([METHOD_LABELS.get(m, m) for m in METHOD_NAMES],
                            rotation=30, ha="right", fontsize=9)
        ax.set_yticks(range(len(NETWORKS)))
        ax.set_yticklabels(list(NETWORKS.keys()), fontsize=10)

        for i in range(pivot.shape[0]):
            for j in range(pivot.shape[1]):
                val = pivot.values[i, j]
                if not np.isnan(val):
                    ax.text(j, i, f"{val:.3f}", ha="center", va="center",
                            fontsize=9, color="black" if val < 0.6 else "white")

        plt.colorbar(im, ax=ax, fraction=0.046, pad=0.04)
        ax.set_title(f"{metric.upper()} Accuracy", fontsize=12)
        plt.tight_layout()
        plt.savefig(os.path.join(FIGURES_DIR, f"heatmap_{metric}.pdf"),
                     bbox_inches="tight")
        plt.close()
        print(f"  Saved heatmap_{metric}.pdf")


def plot_cluster_size_distributions():
    """Histogram of cluster sizes per network/method."""
    stats_dir = os.path.join(RESULTS_DIR, "stats")

    for net_name in NETWORKS:
        all_labels = ["ground_truth"] + METHOD_NAMES
        fig, axes = plt.subplots(2, 3, figsize=(14, 8))
        axes = axes.flatten()

        for idx, label in enumerate(all_labels):
            ax = axes[idx]
            det_path = os.path.join(stats_dir, net_name, label, "cluster_details.json")
            if not os.path.exists(det_path):
                ax.set_title(METHOD_LABELS.get(label, label))
                ax.text(0.5, 0.5, "No data", ha="center", va="center",
                         transform=ax.transAxes)
                continue

            with open(det_path) as f:
                details = json.load(f)
            sizes = [d["n"] for d in details]

            if sizes:
                ax.hist(sizes, bins=min(50, max(10, len(set(sizes)))),
                         edgecolor="black", alpha=0.7, color="steelblue")
            ax.set_title(METHOD_LABELS.get(label, label), fontsize=10)
            ax.set_xlabel("Cluster size")
            ax.set_ylabel("Count")
            if sizes and max(sizes) > 100:
                ax.set_xscale("log")

        # Remove extra subplot if any
        for idx in range(len(all_labels), len(axes)):
            fig.delaxes(axes[idx])

        fig.suptitle(f"Cluster Size Distribution — {net_name}", fontsize=13)
        plt.tight_layout()
        plt.savefig(os.path.join(FIGURES_DIR, f"cluster_sizes_{net_name}.pdf"),
                     bbox_inches="tight")
        plt.close()
        print(f"  Saved cluster_sizes_{net_name}.pdf")


def plot_edge_density_boxplots():
    """Boxplots of edge density across methods for each network."""
    stats_dir = os.path.join(RESULTS_DIR, "stats")

    for net_name in NETWORKS:
        all_labels = ["ground_truth"] + METHOD_NAMES
        data = []
        labels = []

        for label in all_labels:
            det_path = os.path.join(stats_dir, net_name, label, "cluster_details.json")
            if not os.path.exists(det_path):
                continue
            with open(det_path) as f:
                details = json.load(f)
            densities = [d["edge_density"] for d in details]
            if densities:
                data.append(densities)
                labels.append(METHOD_LABELS.get(label, label))

        if not data:
            continue

        fig, ax = plt.subplots(figsize=(9, 4))
        bp = ax.boxplot(data, tick_labels=labels, patch_artist=True, showfliers=False)
        for patch in bp["boxes"]:
            patch.set_facecolor("lightblue")
        ax.set_ylabel("Edge Density")
        ax.set_title(f"Edge Density Distribution — {net_name}")
        plt.xticks(rotation=20, ha="right")
        plt.tight_layout()
        plt.savefig(os.path.join(FIGURES_DIR, f"edge_density_{net_name}.pdf"),
                     bbox_inches="tight")
        plt.close()
        print(f"  Saved edge_density_{net_name}.pdf")


def plot_mixing_parameter_comparison():
    """Bar chart of mean mixing parameter per method/network."""
    stats_path = os.path.join(RESULTS_DIR, "stats", "cluster_stats_summary.csv")
    if not os.path.exists(stats_path):
        print("No stats summary found, skipping mixing param plot")
        return

    df = pd.read_csv(stats_path)

    fig, ax = plt.subplots(figsize=(10, 4.5))
    net_names = list(NETWORKS.keys())
    all_methods = ["ground_truth"] + METHOD_NAMES
    x = np.arange(len(net_names))
    width = 0.13
    offsets = np.arange(len(all_methods)) - len(all_methods) / 2 + 0.5

    colors = plt.cm.Set2(np.linspace(0, 1, len(all_methods)))

    for i, method in enumerate(all_methods):
        vals = []
        for net in net_names:
            row = df[(df["network"] == net) & (df["method"] == method)]
            vals.append(row["mean_mixing_param"].values[0] if len(row) > 0 else 0)
        ax.bar(x + offsets[i] * width, vals, width, label=METHOD_LABELS.get(method, method),
               color=colors[i])

    ax.set_xticks(x)
    ax.set_xticklabels(net_names)
    ax.set_ylabel("Mean Mixing Parameter")
    ax.set_title("Mean Mixing Parameter by Network and Method")
    ax.legend(fontsize=7, ncol=2)
    plt.tight_layout()
    plt.savefig(os.path.join(FIGURES_DIR, "mixing_parameter.pdf"), bbox_inches="tight")
    plt.close()
    print("  Saved mixing_parameter.pdf")


def plot_node_coverage_comparison():
    """Bar chart of node coverage per method/network."""
    stats_path = os.path.join(RESULTS_DIR, "stats", "cluster_stats_summary.csv")
    if not os.path.exists(stats_path):
        return

    df = pd.read_csv(stats_path)

    fig, ax = plt.subplots(figsize=(10, 4.5))
    net_names = list(NETWORKS.keys())
    all_methods = ["ground_truth"] + METHOD_NAMES
    x = np.arange(len(net_names))
    width = 0.13
    offsets = np.arange(len(all_methods)) - len(all_methods) / 2 + 0.5
    colors = plt.cm.Set2(np.linspace(0, 1, len(all_methods)))

    for i, method in enumerate(all_methods):
        vals = []
        for net in net_names:
            row = df[(df["network"] == net) & (df["method"] == method)]
            vals.append(row["node_coverage"].values[0] if len(row) > 0 else 0)
        ax.bar(x + offsets[i] * width, vals, width, label=METHOD_LABELS.get(method, method),
               color=colors[i])

    ax.set_xticks(x)
    ax.set_xticklabels(net_names)
    ax.set_ylabel("Node Coverage")
    ax.set_title("Node Coverage by Network and Method")
    ax.legend(fontsize=7, ncol=2)
    ax.set_ylim(0, 1.05)
    plt.tight_layout()
    plt.savefig(os.path.join(FIGURES_DIR, "node_coverage.pdf"), bbox_inches="tight")
    plt.close()
    print("  Saved node_coverage.pdf")


def plot_edge_connectivity_boxplots():
    """Boxplots of mincut/log10(n) across methods for each network."""
    stats_dir = os.path.join(RESULTS_DIR, "stats")

    for net_name in NETWORKS:
        all_labels = ["ground_truth"] + METHOD_NAMES
        data = []
        labels = []

        for label in all_labels:
            det_path = os.path.join(stats_dir, net_name, label, "cluster_details.json")
            if not os.path.exists(det_path):
                continue
            with open(det_path) as f:
                details = json.load(f)
            vals = [d["mincut_over_log10n"] for d in details if "mincut_over_log10n" in d]
            if vals:
                data.append(vals)
                labels.append(METHOD_LABELS.get(label, label))

        if not data:
            continue

        fig, ax = plt.subplots(figsize=(9, 4))
        bp = ax.boxplot(data, tick_labels=labels, patch_artist=True, showfliers=False)
        for patch in bp["boxes"]:
            patch.set_facecolor("lightyellow")
        ax.axhline(y=1.0, color="red", linestyle="--", linewidth=0.8, label="well-connected threshold")
        ax.set_ylabel("Min Edge Cut / log$_{10}$(n)")
        ax.set_title(f"Edge Connectivity — {net_name}")
        ax.legend(fontsize=8)
        plt.xticks(rotation=20, ha="right")
        plt.tight_layout()
        plt.savefig(os.path.join(FIGURES_DIR, f"edge_connectivity_{net_name}.pdf"),
                     bbox_inches="tight")
        plt.close()
        print(f"  Saved edge_connectivity_{net_name}.pdf")


def plot_wellconnected_bar():
    """Bar chart of fraction well-connected clusters per method/network."""
    stats_path = os.path.join(RESULTS_DIR, "stats", "cluster_stats_summary.csv")
    if not os.path.exists(stats_path):
        return

    df = pd.read_csv(stats_path)

    fig, ax = plt.subplots(figsize=(10, 4.5))
    net_names = list(NETWORKS.keys())
    all_methods = ["ground_truth"] + METHOD_NAMES
    x = np.arange(len(net_names))
    width = 0.13
    offsets = np.arange(len(all_methods)) - len(all_methods) / 2 + 0.5
    colors = plt.cm.Set2(np.linspace(0, 1, len(all_methods)))

    for i, method in enumerate(all_methods):
        vals = []
        for net in net_names:
            row = df[(df["network"] == net) & (df["method"] == method)]
            if len(row) > 0:
                nc = row["n_clusters_non_singleton"].values[0]
                nwc = row["n_wellconnected"].values[0]
                vals.append(nwc / nc if nc > 0 else 0)
            else:
                vals.append(0)
        ax.bar(x + offsets[i] * width, vals, width, label=METHOD_LABELS.get(method, method),
               color=colors[i])

    ax.set_xticks(x)
    ax.set_xticklabels(net_names)
    ax.set_ylabel("Fraction Well-Connected")
    ax.set_title("Fraction of Well-Connected Clusters (mincut > log$_{10}$(n))")
    ax.legend(fontsize=7, ncol=2)
    ax.set_ylim(0, 1.05)
    plt.tight_layout()
    plt.savefig(os.path.join(FIGURES_DIR, "wellconnected.pdf"), bbox_inches="tight")
    plt.close()
    print("  Saved wellconnected.pdf")


def generate_latex_accuracy_table():
    """Generate a LaTeX accuracy table."""
    acc_path = os.path.join(RESULTS_DIR, "accuracy", "accuracy_table.csv")
    if not os.path.exists(acc_path):
        return

    df = pd.read_csv(acc_path)
    lines = []
    lines.append(r"\begin{table}[htbp]")
    lines.append(r"\centering")
    lines.append(r"\caption{Community detection accuracy (AMI, ARI, NMI) on EC-SBM networks.}")
    lines.append(r"\label{tab:accuracy}")
    lines.append(r"\begin{tabular}{llrrr}")
    lines.append(r"\toprule")
    lines.append(r"Network & Method & AMI & ARI & NMI \\")
    lines.append(r"\midrule")

    for net_name in NETWORKS:
        first = True
        for _, row in df[df["network"] == net_name].iterrows():
            net_disp = net_name if first else ""
            m_label = METHOD_LABELS.get(row["method"], row["method"])
            lines.append(
                f"{net_disp} & {m_label} & {row['ami']:.4f} & {row['ari']:.4f} & {row['nmi']:.4f} \\\\"
            )
            first = False
        lines.append(r"\midrule")

    lines[-1] = r"\bottomrule"
    lines.append(r"\end{tabular}")
    lines.append(r"\end{table}")

    out_path = os.path.join(FIGURES_DIR, "accuracy_table.tex")
    with open(out_path, "w") as f:
        f.write("\n".join(lines))
    print(f"  Saved accuracy_table.tex")


def generate_latex_stats_table():
    """Generate a LaTeX cluster stats table."""
    stats_path = os.path.join(RESULTS_DIR, "stats", "cluster_stats_summary.csv")
    if not os.path.exists(stats_path):
        return

    df = pd.read_csv(stats_path)
    lines = []
    lines.append(r"\begin{table}[htbp]")
    lines.append(r"\centering")
    lines.append(r"\caption{Cluster statistics summary for each network and method.}")
    lines.append(r"\label{tab:cluster_stats}")
    lines.append(r"\footnotesize")
    lines.append(r"\begin{tabular}{llrrrrrrr}")
    lines.append(r"\toprule")
    lines.append(r"Network & Method & \#Clust. & Node Cov. & Mean Size & Mean Dens. & Mean Cond. & Mean Mix. & \%WC \\")
    lines.append(r"\midrule")

    for net_name in NETWORKS:
        first = True
        for _, row in df[df["network"] == net_name].iterrows():
            net_disp = net_name if first else ""
            m_label = METHOD_LABELS.get(row["method"], row["method"])
            nc = int(row['n_clusters_non_singleton'])
            nwc = int(row['n_wellconnected']) if 'n_wellconnected' in row and not pd.isna(row.get('n_wellconnected', np.nan)) else 0
            pct_wc = 100 * nwc / nc if nc > 0 else 0.0
            lines.append(
                f"{net_disp} & {m_label} & {nc} & "
                f"{row['node_coverage']:.3f} & {row['mean_cluster_size']:.1f} & "
                f"{row['mean_edge_density']:.3f} & {row['mean_conductance']:.3f} & "
                f"{row['mean_mixing_param']:.3f} & {pct_wc:.0f}\\% \\\\"
            )
            first = False
        lines.append(r"\midrule")

    lines[-1] = r"\bottomrule"
    lines.append(r"\end{tabular}")
    lines.append(r"\end{table}")

    out_path = os.path.join(FIGURES_DIR, "cluster_stats_table.tex")
    with open(out_path, "w") as f:
        f.write("\n".join(lines))
    print(f"  Saved cluster_stats_table.tex")


def generate_all():
    print("Generating accuracy heatmaps...")
    plot_accuracy_heatmap()
    print("Generating cluster size distributions...")
    plot_cluster_size_distributions()
    print("Generating edge density boxplots...")
    plot_edge_density_boxplots()
    print("Generating edge connectivity boxplots...")
    plot_edge_connectivity_boxplots()
    print("Generating well-connected fraction bar chart...")
    plot_wellconnected_bar()
    print("Generating mixing parameter comparison...")
    plot_mixing_parameter_comparison()
    print("Generating node coverage comparison...")
    plot_node_coverage_comparison()
    print("Generating LaTeX tables...")
    generate_latex_accuracy_table()
    generate_latex_stats_table()
    print("All plots and tables generated.")


if __name__ == "__main__":
    generate_all()