summaryrefslogtreecommitdiff
path: root/writeup/main.tex
blob: 44aac28ec89064bbda3241b597b9b4904916f365 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
\documentclass[11pt]{article}
\usepackage[margin=1in]{geometry}
\usepackage{booktabs}
\usepackage{graphicx}
\usepackage{amsmath}
\usepackage{hyperref}
\usepackage{natbib}
\usepackage{subcaption}
\usepackage{xcolor}
\usepackage{float}

\title{Community Detection on EC-SBM Synthetic Networks:\\A Comparative Evaluation}
\author{Yuren Hao}
\date{February 2026}

\begin{document}
\maketitle

% ============================================================
\section{Introduction}

Community detection is a fundamental task in network analysis, aiming to partition a network's nodes into groups with dense internal connections and sparse external connections.
Evaluating community detection methods is challenging because real-world networks rarely come with verified ground-truth communities.
The EC-SBM (Edge-Connected Stochastic Block Model) benchmark~\citep{vule2025sbm} addresses this by generating synthetic networks whose structure reflects properties of real networks, while providing known ground-truth communities for accuracy evaluation.

In this study, we apply five community detection methods to three EC-SBM networks of varying size.
We evaluate accuracy using Adjusted Mutual Information (AMI), Adjusted Rand Index (ARI), and Normalized Mutual Information (NMI), and also report empirical statistics about the detected communities, including node coverage, cluster size and edge density distributions, conductance, and mixing parameters.

% ============================================================
\section{Methods}

\subsection{Networks}

We selected three EC-SBM benchmark networks based on the \texttt{sbm+wcc} variant, as specified by the assignment.
Table~\ref{tab:networks} summarizes their properties.

\begin{table}[htbp]
\centering
\caption{EC-SBM networks used in this study.}
\label{tab:networks}
\begin{tabular}{lrrl}
\toprule
Network & Nodes & Edges & Description \\
\midrule
\texttt{polblogs} & 1,224 & 16,734 & Small ($<$10K nodes) \\
\texttt{topology} & 34,761 & 108,052 & Large (35K nodes) \\
\texttt{internet\_as} & 22,963 & 48,643 & Large (23K nodes) \\
\bottomrule
\end{tabular}
\end{table}

The \texttt{polblogs} network serves as our small test case, while \texttt{topology} and \texttt{internet\_as} are both large networks exceeding 20,000 nodes.
All networks were downloaded from the Illinois Data Bank~\citep{vule2025sbm} and use the \texttt{sbm+wcc} generation parameters (instance 0).

\subsection{Community Detection Algorithms}

We applied five community detection methods:

\begin{enumerate}
\item \textbf{Leiden optimizing modularity} (Leiden-Mod): The Leiden algorithm~\citep{traag2019leiden} optimizing the modularity quality function~\citep{newman2004modularity}. We used \texttt{leidenalg} v0.10.2 with \texttt{ModularityVertexPartition}.

\item \textbf{Leiden optimizing CPM at resolution 0.1} (Leiden-CPM(0.1)): The Leiden algorithm~\citep{traag2019leiden} with the Constant Potts Model (CPM)~\citep{traag2019cpm} at resolution parameter $\gamma = 0.1$.

\item \textbf{Leiden optimizing CPM at resolution 0.01} (Leiden-CPM(0.01)): Same as above with $\gamma = 0.01$.

\item \textbf{Infomap}: The Infomap algorithm~\citep{rosvall2008maps} (v2.8.1), which minimizes the map equation using a two-level partition with undirected flow model.

\item \textbf{graph-tool SBM}: Stochastic Block Model inference via \texttt{graph-tool}~\citep{peixoto2014efficient}, using \texttt{minimize\_blockmodel\_dl()} for a flat (non-nested) partition that minimizes description length.
\end{enumerate}

All methods used seed 42 for reproducibility.

\subsection{Evaluation Metrics}

\subsubsection{Accuracy Metrics}
We report three standard clustering comparison metrics:
\begin{itemize}
\item \textbf{Adjusted Mutual Information (AMI)}~\citep{vinh2010ami}: Mutual information adjusted for chance, normalized by the arithmetic mean of the marginal entropies. Ranges from 0 (random) to 1 (perfect).
\item \textbf{Adjusted Rand Index (ARI)}~\citep{hubert1985ari}: The Rand index corrected for expected value under random permutation. Ranges from $-0.5$ to 1.
\item \textbf{Normalized Mutual Information (NMI)}~\citep{vinh2010ami}: Mutual information normalized by the arithmetic mean of the marginal entropies, without chance correction. Ranges from 0 to 1.
\end{itemize}

All accuracy metrics were computed using the official \texttt{network\_evaluation} scripts (\url{https://github.com/illinois-or-research-analytics/network_evaluation}), specifically the \texttt{clustering\_accuracy()} function in \texttt{commdet\_acc/compute\_cd\_accuracy.py}.
AMI and NMI are computed via \texttt{graph-tool}'s \texttt{mutual\_information()} function; ARI is computed via \texttt{scikit-learn}'s \texttt{adjusted\_rand\_score()}.
Labels are aligned over the full node set from the edge list; nodes missing from a clustering are assigned unique singleton community IDs.

\subsubsection{Cluster Statistics}
For each clustering (including ground truth), we report:
\begin{itemize}
\item \textbf{Node coverage}: Fraction of nodes in non-singleton clusters (clusters with $\geq 2$ nodes).
\item \textbf{Number of non-singleton clusters}: The count of communities with at least 2 members.
\item \textbf{Mean cluster size}: Average number of nodes per non-singleton cluster.
\item \textbf{Edge density}: For a cluster of size $n$ with $m$ internal edges, $\rho = 2m / (n(n-1))$.
\item \textbf{Conductance}: $\phi = c_S / (2m_S + c_S)$, where $c_S$ is the number of boundary edges.
\item \textbf{Mixing parameter}: Per-node fraction of neighbors outside the node's own community. A value near 0 indicates strong community structure; near 1 indicates weak structure.
\item \textbf{Edge connectivity (minimum edge cut)}: The minimum number of edges whose removal disconnects the induced subgraph of a cluster. We report mincut$/\log_{10}(n)$, where $n$ is the cluster size. A cluster is \emph{well-connected} if mincut $> \log_{10}(n)$~\citep{park2024wellconnectedness}.
\end{itemize}

% ============================================================
\section{Results}

\subsection{Accuracy}

Table~\ref{tab:accuracy} and Figure~\ref{fig:heatmaps} present the accuracy metrics for all 15 (network, method) combinations.

\input{../results/figures/accuracy_table.tex}

\begin{figure}[htbp]
\centering
\begin{subfigure}[b]{0.32\textwidth}
\includegraphics[width=\textwidth]{../results/figures/heatmap_ami.pdf}
\caption{AMI}
\end{subfigure}
\begin{subfigure}[b]{0.32\textwidth}
\includegraphics[width=\textwidth]{../results/figures/heatmap_ari.pdf}
\caption{ARI}
\end{subfigure}
\begin{subfigure}[b]{0.32\textwidth}
\includegraphics[width=\textwidth]{../results/figures/heatmap_nmi.pdf}
\caption{NMI}
\end{subfigure}
\caption{Accuracy heatmaps for AMI, ARI, and NMI across networks and methods.}
\label{fig:heatmaps}
\end{figure}

Key observations:
\begin{itemize}
\item \textbf{Leiden-CPM(0.1) achieves the highest NMI} across all three networks (0.815 on polblogs, 0.937 on topology, 0.935 on internet\_as), and also the highest AMI.
\item \textbf{ARI values are universally low} (all below 0.17), even for the best-performing methods. This contrasts with the relatively higher NMI values, suggesting that the methods find many of the correct community boundaries (high NMI) but may differ in how they handle rare pairs (low ARI).
\item \textbf{Leiden-Mod and graph-tool SBM have the lowest accuracy}, primarily because they produce very few large clusters compared to the many small ground-truth communities.
\item \textbf{Infomap} sits between the extremes, with moderate NMI (0.35--0.75) and AMI (0.16--0.18).
\end{itemize}

\subsection{Cluster Statistics}

Table~\ref{tab:cluster_stats} summarizes the cluster statistics, including the percentage of well-connected clusters (\%WC, i.e., clusters with mincut $> \log_{10}(n)$).

\input{../results/figures/cluster_stats_table.tex}

\subsubsection{Node Coverage}
Figure~\ref{fig:node_coverage} shows node coverage by method.
Leiden-Mod, Infomap, and graph-tool SBM achieve 100\% coverage because they assign every node to some community.
The CPM-based Leiden methods and the ground truth have lower coverage because many nodes are left as singletons.
The ground truth itself has very low node coverage (21--48\%), indicating that the EC-SBM ground-truth communities cover only a minority of nodes, with the remainder being in singleton clusters.

\begin{figure}[htbp]
\centering
\includegraphics[width=0.85\textwidth]{../results/figures/node_coverage.pdf}
\caption{Node coverage (fraction of nodes in non-singleton clusters) by network and method.}
\label{fig:node_coverage}
\end{figure}

\subsubsection{Cluster Size Distribution}
Figure~\ref{fig:cluster_sizes_topology} shows cluster size distributions for the \texttt{topology} network (see Appendix for other networks).
The ground truth is dominated by small clusters (mean size 5.0), with a log-scale $x$-axis revealing a heavy-tailed distribution.
Leiden-Mod produces far fewer, much larger clusters (mean 695 nodes), while Leiden-CPM(0.1) closely matches the ground-truth distribution shape.

\begin{figure}[htbp]
\centering
\includegraphics[width=\textwidth]{../results/figures/cluster_sizes_topology.pdf}
\caption{Cluster size distributions for \texttt{topology}. Note logarithmic $x$-axis for methods with large clusters.}
\label{fig:cluster_sizes_topology}
\end{figure}

\subsubsection{Edge Density}
Figure~\ref{fig:edge_density} shows edge density distributions.
The ground truth has very high edge density (mean 0.83--0.84), which is expected for small, tightly-connected clusters.
Leiden-Mod and graph-tool SBM, which produce large clusters, have much lower edge density.
Leiden-CPM(0.1) produces clusters with moderate density (0.57--0.59), closer to ground truth than other methods.

\begin{figure}[htbp]
\centering
\includegraphics[width=0.85\textwidth]{../results/figures/edge_density_topology.pdf}
\caption{Edge density distribution across methods for \texttt{topology}.}
\label{fig:edge_density}
\end{figure}

\subsubsection{Mixing Parameter}
Figure~\ref{fig:mixing} presents the mean mixing parameter.
The ground truth has a high mean mixing parameter (0.88--0.91), meaning most nodes have most of their neighbors outside their assigned community.
This is because the ground-truth communities are small and sparse relative to the full network.
Leiden-Mod has the lowest mixing parameter (0.10--0.13) because it creates large clusters that encompass most of each node's neighborhood.
graph-tool SBM has a high mixing parameter (0.68--0.89), similar to ground truth, indicating it also produces blocks where many edges cross community boundaries.

\begin{figure}[htbp]
\centering
\includegraphics[width=0.85\textwidth]{../results/figures/mixing_parameter.pdf}
\caption{Mean mixing parameter by network and method.}
\label{fig:mixing}
\end{figure}

\subsubsection{Edge Connectivity}
Figure~\ref{fig:edge_conn} shows the distribution of the edge connectivity ratio (mincut$/\log_{10}(n)$) for each method, and Figure~\ref{fig:wellconnected} shows the fraction of well-connected clusters (those with mincut $> \log_{10}(n)$).

\begin{figure}[htbp]
\centering
\includegraphics[width=0.85\textwidth]{../results/figures/edge_connectivity_topology.pdf}
\caption{Edge connectivity (mincut$/\log_{10}(n)$) distribution for \texttt{topology}. The dashed red line marks the well-connected threshold (ratio = 1).}
\label{fig:edge_conn}
\end{figure}

\begin{figure}[htbp]
\centering
\includegraphics[width=0.85\textwidth]{../results/figures/wellconnected.pdf}
\caption{Fraction of non-singleton clusters that are well-connected (mincut $> \log_{10}(n)$).}
\label{fig:wellconnected}
\end{figure}

The ground-truth communities are 100\% well-connected on all three networks, confirming that the EC-SBM benchmark produces well-connected clusters by construction.
Leiden-CPM(0.1) achieves the highest well-connectedness among the methods (87--97\%), followed by Leiden-CPM(0.01) (40--96\%).
Leiden-Mod produces well-connected clusters on the small polblogs network (67\%) but drops to 3--40\% on the larger networks because its large clusters are poorly connected relative to their size.
graph-tool SBM has the lowest fraction of well-connected clusters (9--23\%), consistent with its tendency to produce large, internally sparse blocks.
Infomap achieves moderate well-connectedness (50--74\%).

\subsection{Effect of CPM Resolution Parameter}

Comparing Leiden-CPM at resolutions 0.1 and 0.01 reveals the expected trade-off:
\begin{itemize}
\item \textbf{Resolution 0.1} produces more clusters (4,047 on topology vs.\ 1,802 at 0.01), with smaller mean size, higher edge density, but lower node coverage.
\item \textbf{Resolution 0.01} produces fewer, larger clusters with higher node coverage and lower mixing parameter, but slightly lower NMI.
\item On polblogs, resolution 0.1 achieves AMI=0.43 vs.\ 0.32 for resolution 0.01. On topology, they are similar (AMI $\approx$ 0.24 and 0.23).
\end{itemize}

This aligns with the findings of~\citet{park2024wellconnectedness}, who observed that higher CPM resolutions tend to find smaller, denser communities that may better capture local structure.

% ============================================================
\section{Discussion}

\subsection{Method Comparison}

Our results reveal a fundamental tension in community detection: methods that optimize for global objectives (modularity, description length) tend to create few large clusters, while methods that allow for many small clusters (CPM, ground truth) capture finer-grained structure.

\textbf{Leiden-CPM(0.1)} was the best performer overall, achieving the highest AMI and NMI on all three networks. Its cluster size distribution and edge density most closely match the ground truth. This is consistent with the findings of~\citet{vule2025sbm}, who noted that CPM-based methods with appropriate resolution tend to recover EC-SBM communities well.

\textbf{Leiden-Mod} and \textbf{graph-tool SBM} perform poorly because they suffer from a resolution limit~\citep{newman2004modularity}: modularity optimization cannot detect communities smaller than $\sqrt{2m}$ (where $m$ is the number of edges), and SBM inference prefers parsimonious models with fewer blocks.
On topology (108K edges), both methods find fewer than 100 communities vs.\ 1,517 in ground truth.

\textbf{Infomap} provides a middle ground with moderate accuracy. Its flow-based approach creates more communities than modularity (23--1,274 clusters) and achieves 100\% coverage, but its NMI (0.35--0.73) falls well below Leiden-CPM(0.1).

\textbf{graph-tool SBM} is interesting because it identifies structure that is fundamentally different from the other methods: it infers blocks based on connectivity patterns (stochastic equivalence) rather than dense subgraphs. Its high mixing parameter and conductance confirm that its blocks are not internally dense communities but groups of nodes with similar connection profiles, which explains its low accuracy against the dense ground-truth communities.

\subsection{Discrepancy Between NMI and ARI}

A notable observation is the large gap between NMI and ARI values. For example, Leiden-CPM(0.1) on topology has NMI=0.937 but ARI=0.059. This discrepancy arises because NMI measures agreement in information-theoretic terms (sensitive to correct assignment of information), while ARI measures pairwise agreement (sensitive to whether pairs of nodes are correctly co-clustered or separated). When ground-truth clusters are very small (mean size 5), most pairs of nodes are in different clusters, and even small errors in cluster assignment can drastically reduce the pairwise agreement measured by ARI while barely affecting the information-theoretic NMI.

\subsection{Ground Truth Properties}

The EC-SBM ground-truth communities have distinctive properties: very small clusters (mean 4--8 nodes), high edge density (0.77--0.84), and low node coverage (21--48\%). This means the majority of nodes are singletons in the ground truth. Methods that achieve 100\% coverage (Leiden-Mod, Infomap, graph-tool SBM) necessarily create many false-positive community memberships for these singleton nodes, which may artificially inflate or deflate accuracy depending on the metric.

\subsection{Connection to Literature}

Our findings are consistent with:
\begin{itemize}
\item \citet{park2024wellconnectedness}: They demonstrated that well-connectedness is an important property of meaningful communities, and that CPM-based methods tend to produce well-connected clusters. Our edge density and conductance results support this.
\item \citet{vule2025sbm}: Their study of EC-SBM benchmarks showed that CPM-based Leiden methods outperform modularity-based methods on these networks, which matches our accuracy results across all three networks.
\end{itemize}

\subsection{What I Learned and Open Questions}

From this experiment I learned several things.
First, the choice of quality function matters far more than the choice of algorithm: Leiden-CPM(0.1) vastly outperforms Leiden-Mod on the same Leiden framework, solely because CPM can resolve the small communities that modularity merges together.
Second, well-connectedness is a useful diagnostic beyond accuracy---methods with high NMI can still produce clusters that are poorly connected internally (e.g., Infomap on topology has NMI=0.73 but only 53\% well-connected clusters).
Third, the low ARI across all methods highlights that pairwise metrics can be misleading when ground-truth communities are small and numerous: even a ``good'' method scores near zero on ARI.

Several questions remain open for me.
I am still trying to understand how to choose the CPM resolution parameter in practice: resolution 0.1 gives the best NMI on all three networks, but is this a general finding or specific to EC-SBM benchmarks?
I also wonder how much the results would change with nested SBM inference in graph-tool, since the flat SBM performed poorly---the hierarchical model might recover finer-grained blocks that better match the ground truth.
Finally, I find it interesting that the ground truth itself has very low node coverage (21--48\%); this raises the question of whether the ``unclustered'' nodes genuinely lack community structure or whether the EC-SBM model simply does not assign them.

% ============================================================
\section{Reproducibility}

\subsection{Software Versions}
\begin{itemize}
\item Python 3.12.12
\item \texttt{python-igraph} (via conda-forge)
\item \texttt{leidenalg} (via conda-forge)
\item \texttt{infomap} 2.8.1
\item \texttt{graph-tool} (via conda-forge)
\item \texttt{scikit-learn}, \texttt{scipy}, \texttt{matplotlib}, \texttt{pandas} (via conda-forge)
\end{itemize}

\subsection{Data Source}
EC-SBM networks downloaded from the Illinois Data Bank: \url{https://databank.illinois.edu/datasets/IDB-3284069} (file \texttt{networks.zip}, 3.51 GB). We used the \texttt{sbm+wcc} variant, instance 0, for each network.

\subsection{How to Reproduce}
\begin{enumerate}
\item Install dependencies:
\begin{verbatim}
conda install -c conda-forge python-igraph leidenalg \
    infomap graph-tool matplotlib scipy scikit-learn pandas -y
\end{verbatim}
\item Download and extract \texttt{networks.zip} to \texttt{data/networks/}.
\item Create symlinks (see \texttt{scripts/config.py} for paths).
\item Run the full pipeline:
\begin{verbatim}
python scripts/run_all.py
\end{verbatim}
\item Compile this report:
\begin{verbatim}
cd writeup && pdflatex main && bibtex main && pdflatex main && pdflatex main
\end{verbatim}
\end{enumerate}

All code, results, and this writeup are in the project directory. Random seeds are fixed at 42 for all stochastic methods.

% ============================================================
\appendix
\section{Additional Cluster Size Distributions}

\begin{figure}[H]
\centering
\includegraphics[width=\textwidth]{../results/figures/cluster_sizes_polblogs.pdf}
\caption{Cluster size distributions for \texttt{polblogs}.}
\end{figure}

\begin{figure}[H]
\centering
\includegraphics[width=\textwidth]{../results/figures/cluster_sizes_internet_as.pdf}
\caption{Cluster size distributions for \texttt{internet\_as}.}
\end{figure}

\clearpage
\section{Additional Edge Density Distributions}

\begin{figure}[H]
\centering
\includegraphics[width=0.85\textwidth]{../results/figures/edge_density_polblogs.pdf}
\caption{Edge density distribution for \texttt{polblogs}.}
\end{figure}

\begin{figure}[H]
\centering
\includegraphics[width=0.85\textwidth]{../results/figures/edge_density_internet_as.pdf}
\caption{Edge density distribution for \texttt{internet\_as}.}
\end{figure}

\clearpage
\section{Additional Edge Connectivity Distributions}

\begin{figure}[H]
\centering
\includegraphics[width=0.85\textwidth]{../results/figures/edge_connectivity_polblogs.pdf}
\caption{Edge connectivity distribution for \texttt{polblogs}.}
\end{figure}

\begin{figure}[H]
\centering
\includegraphics[width=0.85\textwidth]{../results/figures/edge_connectivity_internet_as.pdf}
\caption{Edge connectivity distribution for \texttt{internet\_as}.}
\end{figure}

% ============================================================
\bibliographystyle{plainnat}
\bibliography{references}

\end{document}