summaryrefslogtreecommitdiff
path: root/writeup/main.tex
diff options
context:
space:
mode:
Diffstat (limited to 'writeup/main.tex')
-rw-r--r--writeup/main.tex72
1 files changed, 63 insertions, 9 deletions
diff --git a/writeup/main.tex b/writeup/main.tex
index ad462bd..44aac28 100644
--- a/writeup/main.tex
+++ b/writeup/main.tex
@@ -7,9 +7,10 @@
\usepackage{natbib}
\usepackage{subcaption}
\usepackage{xcolor}
+\usepackage{float}
\title{Community Detection on EC-SBM Synthetic Networks:\\A Comparative Evaluation}
-\author{CS 598 Data Analysis Assignment}
+\author{Yuren Hao}
\date{February 2026}
\begin{document}
@@ -72,15 +73,16 @@ All methods used seed 42 for reproducibility.
\subsection{Evaluation Metrics}
\subsubsection{Accuracy Metrics}
-We report three standard clustering comparison metrics, computed using \texttt{scikit-learn}:
+We report three standard clustering comparison metrics:
\begin{itemize}
\item \textbf{Adjusted Mutual Information (AMI)}~\citep{vinh2010ami}: Mutual information adjusted for chance, normalized by the arithmetic mean of the marginal entropies. Ranges from 0 (random) to 1 (perfect).
\item \textbf{Adjusted Rand Index (ARI)}~\citep{hubert1985ari}: The Rand index corrected for expected value under random permutation. Ranges from $-0.5$ to 1.
\item \textbf{Normalized Mutual Information (NMI)}~\citep{vinh2010ami}: Mutual information normalized by the arithmetic mean of the marginal entropies, without chance correction. Ranges from 0 to 1.
\end{itemize}
-For all metrics, we aligned labels over the full node set derived from the edge list. Nodes missing from a clustering were each assigned unique singleton community IDs, ensuring consistent evaluation across methods.
-We cross-validated our AMI, ARI, and NMI values against the official \texttt{network\_evaluation} scripts (\url{https://github.com/illinois-or-research-analytics/network_evaluation}) and confirmed identical results.
+All accuracy metrics were computed using the official \texttt{network\_evaluation} scripts (\url{https://github.com/illinois-or-research-analytics/network_evaluation}), specifically the \texttt{clustering\_accuracy()} function in \texttt{commdet\_acc/compute\_cd\_accuracy.py}.
+AMI and NMI are computed via \texttt{graph-tool}'s \texttt{mutual\_information()} function; ARI is computed via \texttt{scikit-learn}'s \texttt{adjusted\_rand\_score()}.
+Labels are aligned over the full node set from the edge list; nodes missing from a clustering are assigned unique singleton community IDs.
\subsubsection{Cluster Statistics}
For each clustering (including ground truth), we report:
@@ -91,6 +93,7 @@ For each clustering (including ground truth), we report:
\item \textbf{Edge density}: For a cluster of size $n$ with $m$ internal edges, $\rho = 2m / (n(n-1))$.
\item \textbf{Conductance}: $\phi = c_S / (2m_S + c_S)$, where $c_S$ is the number of boundary edges.
\item \textbf{Mixing parameter}: Per-node fraction of neighbors outside the node's own community. A value near 0 indicates strong community structure; near 1 indicates weak structure.
+\item \textbf{Edge connectivity (minimum edge cut)}: The minimum number of edges whose removal disconnects the induced subgraph of a cluster. We report mincut$/\log_{10}(n)$, where $n$ is the cluster size. A cluster is \emph{well-connected} if mincut $> \log_{10}(n)$~\citep{park2024wellconnectedness}.
\end{itemize}
% ============================================================
@@ -130,7 +133,7 @@ Key observations:
\subsection{Cluster Statistics}
-Table~\ref{tab:cluster_stats} summarizes the cluster statistics.
+Table~\ref{tab:cluster_stats} summarizes the cluster statistics, including the percentage of well-connected clusters (\%WC, i.e., clusters with mincut $> \log_{10}(n)$).
\input{../results/figures/cluster_stats_table.tex}
@@ -186,6 +189,29 @@ graph-tool SBM has a high mixing parameter (0.68--0.89), similar to ground truth
\label{fig:mixing}
\end{figure}
+\subsubsection{Edge Connectivity}
+Figure~\ref{fig:edge_conn} shows the distribution of the edge connectivity ratio (mincut$/\log_{10}(n)$) for each method, and Figure~\ref{fig:wellconnected} shows the fraction of well-connected clusters (those with mincut $> \log_{10}(n)$).
+
+\begin{figure}[htbp]
+\centering
+\includegraphics[width=0.85\textwidth]{../results/figures/edge_connectivity_topology.pdf}
+\caption{Edge connectivity (mincut$/\log_{10}(n)$) distribution for \texttt{topology}. The dashed red line marks the well-connected threshold (ratio = 1).}
+\label{fig:edge_conn}
+\end{figure}
+
+\begin{figure}[htbp]
+\centering
+\includegraphics[width=0.85\textwidth]{../results/figures/wellconnected.pdf}
+\caption{Fraction of non-singleton clusters that are well-connected (mincut $> \log_{10}(n)$).}
+\label{fig:wellconnected}
+\end{figure}
+
+The ground-truth communities are 100\% well-connected on all three networks, confirming that the EC-SBM benchmark produces well-connected clusters by construction.
+Leiden-CPM(0.1) achieves the highest well-connectedness among the methods (87--97\%), followed by Leiden-CPM(0.01) (40--96\%).
+Leiden-Mod produces well-connected clusters on the small polblogs network (67\%) but drops to 3--40\% on the larger networks because its large clusters are poorly connected relative to their size.
+graph-tool SBM has the lowest fraction of well-connected clusters (9--23\%), consistent with its tendency to produce large, internally sparse blocks.
+Infomap achieves moderate well-connectedness (50--74\%).
+
\subsection{Effect of CPM Resolution Parameter}
Comparing Leiden-CPM at resolutions 0.1 and 0.01 reveals the expected trade-off:
@@ -229,6 +255,18 @@ Our findings are consistent with:
\item \citet{vule2025sbm}: Their study of EC-SBM benchmarks showed that CPM-based Leiden methods outperform modularity-based methods on these networks, which matches our accuracy results across all three networks.
\end{itemize}
+\subsection{What I Learned and Open Questions}
+
+From this experiment I learned several things.
+First, the choice of quality function matters far more than the choice of algorithm: Leiden-CPM(0.1) vastly outperforms Leiden-Mod on the same Leiden framework, solely because CPM can resolve the small communities that modularity merges together.
+Second, well-connectedness is a useful diagnostic beyond accuracy---methods with high NMI can still produce clusters that are poorly connected internally (e.g., Infomap on topology has NMI=0.73 but only 53\% well-connected clusters).
+Third, the low ARI across all methods highlights that pairwise metrics can be misleading when ground-truth communities are small and numerous: even a ``good'' method scores near zero on ARI.
+
+Several questions remain open for me.
+I am still trying to understand how to choose the CPM resolution parameter in practice: resolution 0.1 gives the best NMI on all three networks, but is this a general finding or specific to EC-SBM benchmarks?
+I also wonder how much the results would change with nested SBM inference in graph-tool, since the flat SBM performed poorly---the hierarchical model might recover finer-grained blocks that better match the ground truth.
+Finally, I find it interesting that the ground truth itself has very low node coverage (21--48\%); this raises the question of whether the ``unclustered'' nodes genuinely lack community structure or whether the EC-SBM model simply does not assign them.
+
% ============================================================
\section{Reproducibility}
@@ -270,32 +308,48 @@ All code, results, and this writeup are in the project directory. Random seeds a
\appendix
\section{Additional Cluster Size Distributions}
-\begin{figure}[htbp]
+\begin{figure}[H]
\centering
\includegraphics[width=\textwidth]{../results/figures/cluster_sizes_polblogs.pdf}
\caption{Cluster size distributions for \texttt{polblogs}.}
\end{figure}
-\begin{figure}[htbp]
+\begin{figure}[H]
\centering
\includegraphics[width=\textwidth]{../results/figures/cluster_sizes_internet_as.pdf}
\caption{Cluster size distributions for \texttt{internet\_as}.}
\end{figure}
+\clearpage
\section{Additional Edge Density Distributions}
-\begin{figure}[htbp]
+\begin{figure}[H]
\centering
\includegraphics[width=0.85\textwidth]{../results/figures/edge_density_polblogs.pdf}
\caption{Edge density distribution for \texttt{polblogs}.}
\end{figure}
-\begin{figure}[htbp]
+\begin{figure}[H]
\centering
\includegraphics[width=0.85\textwidth]{../results/figures/edge_density_internet_as.pdf}
\caption{Edge density distribution for \texttt{internet\_as}.}
\end{figure}
+\clearpage
+\section{Additional Edge Connectivity Distributions}
+
+\begin{figure}[H]
+\centering
+\includegraphics[width=0.85\textwidth]{../results/figures/edge_connectivity_polblogs.pdf}
+\caption{Edge connectivity distribution for \texttt{polblogs}.}
+\end{figure}
+
+\begin{figure}[H]
+\centering
+\includegraphics[width=0.85\textwidth]{../results/figures/edge_connectivity_internet_as.pdf}
+\caption{Edge connectivity distribution for \texttt{internet\_as}.}
+\end{figure}
+
% ============================================================
\bibliographystyle{plainnat}
\bibliography{references}