1 files changed, 63 insertions, 9 deletions
diff --git a/writeup/main.tex b/writeup/main.tex
index ad462bd..44aac28 100644
--- a/writeup/main.tex
+++ b/writeup/main.tex
@@ -7,9 +7,10 @@
 \usepackage{natbib}
 \usepackage{subcaption}
 \usepackage{xcolor}
+\usepackage{float}
 
 \title{Community Detection on EC-SBM Synthetic Networks:\\A Comparative Evaluation}
-\author{CS 598 Data Analysis Assignment}
+\author{Yuren Hao}
 \date{February 2026}
 
 \begin{document}
@@ -72,15 +73,16 @@ All methods used seed 42 for reproducibility.
 \subsection{Evaluation Metrics}
 
 \subsubsection{Accuracy Metrics}
-We report three standard clustering comparison metrics, computed using \texttt{scikit-learn}:
+We report three standard clustering comparison metrics:
 \begin{itemize}
 \item \textbf{Adjusted Mutual Information (AMI)}~\citep{vinh2010ami}: Mutual information adjusted for chance, normalized by the arithmetic mean of the marginal entropies. Ranges from 0 (random) to 1 (perfect).
 \item \textbf{Adjusted Rand Index (ARI)}~\citep{hubert1985ari}: The Rand index corrected for expected value under random permutation. Ranges from $-0.5$ to 1.
 \item \textbf{Normalized Mutual Information (NMI)}~\citep{vinh2010ami}: Mutual information normalized by the arithmetic mean of the marginal entropies, without chance correction. Ranges from 0 to 1.
 \end{itemize}
 
-For all metrics, we aligned labels over the full node set derived from the edge list. Nodes missing from a clustering were each assigned unique singleton community IDs, ensuring consistent evaluation across methods.
-We cross-validated our AMI, ARI, and NMI values against the official \texttt{network\_evaluation} scripts (\url{https://github.com/illinois-or-research-analytics/network_evaluation}) and confirmed identical results.
+All accuracy metrics were computed using the official \texttt{network\_evaluation} scripts (\url{https://github.com/illinois-or-research-analytics/network_evaluation}), specifically the \texttt{clustering\_accuracy()} function in \texttt{commdet\_acc/compute\_cd\_accuracy.py}.
+AMI and NMI are computed via \texttt{graph-tool}'s \texttt{mutual\_information()} function; ARI is computed via \texttt{scikit-learn}'s \texttt{adjusted\_rand\_score()}.
+Labels are aligned over the full node set from the edge list; nodes missing from a clustering are assigned unique singleton community IDs.
 
 \subsubsection{Cluster Statistics}
 For each clustering (including ground truth), we report:
@@ -91,6 +93,7 @@ For each clustering (including ground truth), we report:
 \item \textbf{Edge density}: For a cluster of size $n$ with $m$ internal edges, $\rho = 2m / (n(n-1))$.
 \item \textbf{Conductance}: $\phi = c_S / (2m_S + c_S)$, where $c_S$ is the number of boundary edges.
 \item \textbf{Mixing parameter}: Per-node fraction of neighbors outside the node's own community. A value near 0 indicates strong community structure; near 1 indicates weak structure.
+\item \textbf{Edge connectivity (minimum edge cut)}: The minimum number of edges whose removal disconnects the induced subgraph of a cluster. We report mincut$/\log_{10}(n)$, where $n$ is the cluster size. A cluster is \emph{well-connected} if mincut $> \log_{10}(n)$~\citep{park2024wellconnectedness}.
 \end{itemize}
 
 % ============================================================
@@ -130,7 +133,7 @@ Key observations:
 
 \subsection{Cluster Statistics}
 
-Table~\ref{tab:cluster_stats} summarizes the cluster statistics.
+Table~\ref{tab:cluster_stats} summarizes the cluster statistics, including the percentage of well-connected clusters (\%WC, i.e., clusters with mincut $> \log_{10}(n)$).
 
 \input{../results/figures/cluster_stats_table.tex}
 
@@ -186,6 +189,29 @@ graph-tool SBM has a high mixing parameter (0.68--0.89), similar to ground truth
 \label{fig:mixing}
 \end{figure}
 
+\subsubsection{Edge Connectivity}
+Figure~\ref{fig:edge_conn} shows the distribution of the edge connectivity ratio (mincut$/\log_{10}(n)$) for each method, and Figure~\ref{fig:wellconnected} shows the fraction of well-connected clusters (those with mincut $> \log_{10}(n)$).
+
+\begin{figure}[htbp]
+\centering
+\includegraphics[width=0.85\textwidth]{../results/figures/edge_connectivity_topology.pdf}
+\caption{Edge connectivity (mincut$/\log_{10}(n)$) distribution for \texttt{topology}. The dashed red line marks the well-connected threshold (ratio = 1).}
+\label{fig:edge_conn}
+\end{figure}
+
+\begin{figure}[htbp]
+\centering
+\includegraphics[width=0.85\textwidth]{../results/figures/wellconnected.pdf}
+\caption{Fraction of non-singleton clusters that are well-connected (mincut $> \log_{10}(n)$).}
+\label{fig:wellconnected}
+\end{figure}
+
+The ground-truth communities are 100\% well-connected on all three networks, confirming that the EC-SBM benchmark produces well-connected clusters by construction.
+Leiden-CPM(0.1) achieves the highest well-connectedness among the methods (87--97\%), followed by Leiden-CPM(0.01) (40--96\%).
+Leiden-Mod produces well-connected clusters on the small polblogs network (67\%) but drops to 3--40\% on the larger networks because its large clusters are poorly connected relative to their size.
+graph-tool SBM has the lowest fraction of well-connected clusters (9--23\%), consistent with its tendency to produce large, internally sparse blocks.
+Infomap achieves moderate well-connectedness (50--74\%).
+
 \subsection{Effect of CPM Resolution Parameter}
 
 Comparing Leiden-CPM at resolutions 0.1 and 0.01 reveals the expected trade-off:
@@ -229,6 +255,18 @@ Our findings are consistent with:
 \item \citet{vule2025sbm}: Their study of EC-SBM benchmarks showed that CPM-based Leiden methods outperform modularity-based methods on these networks, which matches our accuracy results across all three networks.
 \end{itemize}
 
+\subsection{What I Learned and Open Questions}
+
+From this experiment I learned several things.
+First, the choice of quality function matters far more than the choice of algorithm: Leiden-CPM(0.1) vastly outperforms Leiden-Mod on the same Leiden framework, solely because CPM can resolve the small communities that modularity merges together.
+Second, well-connectedness is a useful diagnostic beyond accuracy---methods with high NMI can still produce clusters that are poorly connected internally (e.g., Infomap on topology has NMI=0.73 but only 53\% well-connected clusters).
+Third, the low ARI across all methods highlights that pairwise metrics can be misleading when ground-truth communities are small and numerous: even a ``good'' method scores near zero on ARI.
+
+Several questions remain open for me.
+I am still trying to understand how to choose the CPM resolution parameter in practice: resolution 0.1 gives the best NMI on all three networks, but is this a general finding or specific to EC-SBM benchmarks?
+I also wonder how much the results would change with nested SBM inference in graph-tool, since the flat SBM performed poorly---the hierarchical model might recover finer-grained blocks that better match the ground truth.
+Finally, I find it interesting that the ground truth itself has very low node coverage (21--48\%); this raises the question of whether the ``unclustered'' nodes genuinely lack community structure or whether the EC-SBM model simply does not assign them.
+
 % ============================================================
 \section{Reproducibility}
 
@@ -270,32 +308,48 @@ All code, results, and this writeup are in the project directory. Random seeds a
 \appendix
 \section{Additional Cluster Size Distributions}
 
-\begin{figure}[htbp]
+\begin{figure}[H]
 \centering
 \includegraphics[width=\textwidth]{../results/figures/cluster_sizes_polblogs.pdf}
 \caption{Cluster size distributions for \texttt{polblogs}.}
 \end{figure}
 
-\begin{figure}[htbp]
+\begin{figure}[H]
 \centering
 \includegraphics[width=\textwidth]{../results/figures/cluster_sizes_internet_as.pdf}
 \caption{Cluster size distributions for \texttt{internet\_as}.}
 \end{figure}
 
+\clearpage
 \section{Additional Edge Density Distributions}
 
-\begin{figure}[htbp]
+\begin{figure}[H]
 \centering
 \includegraphics[width=0.85\textwidth]{../results/figures/edge_density_polblogs.pdf}
 \caption{Edge density distribution for \texttt{polblogs}.}
 \end{figure}
 
-\begin{figure}[htbp]
+\begin{figure}[H]
 \centering
 \includegraphics[width=0.85\textwidth]{../results/figures/edge_density_internet_as.pdf}
 \caption{Edge density distribution for \texttt{internet\_as}.}
 \end{figure}
 
+\clearpage
+\section{Additional Edge Connectivity Distributions}
+
+\begin{figure}[H]
+\centering
+\includegraphics[width=0.85\textwidth]{../results/figures/edge_connectivity_polblogs.pdf}
+\caption{Edge connectivity distribution for \texttt{polblogs}.}
+\end{figure}
+
+\begin{figure}[H]
+\centering
+\includegraphics[width=0.85\textwidth]{../results/figures/edge_connectivity_internet_as.pdf}
+\caption{Edge connectivity distribution for \texttt{internet\_as}.}
+\end{figure}
+
 % ============================================================
 \bibliographystyle{plainnat}
 \bibliography{references}