From 8f63cf9f41bbdb8d55cd4679872d2b4ae2129324 Mon Sep 17 00:00:00 2001 From: YurenHao0426 Date: Tue, 24 Feb 2026 08:40:49 +0000 Subject: EC-SBM community detection analysis: full pipeline and writeup Implement community detection on 3 EC-SBM networks (polblogs, topology, internet_as) using 5 methods (Leiden-Mod, Leiden-CPM at 0.1 and 0.01, Infomap, graph-tool SBM). Compute AMI/ARI/NMI accuracy, cluster statistics, and generate figures and LaTeX report. --- writeup/main.tex | 303 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 303 insertions(+) create mode 100644 writeup/main.tex (limited to 'writeup/main.tex') diff --git a/writeup/main.tex b/writeup/main.tex new file mode 100644 index 0000000..ad462bd --- /dev/null +++ b/writeup/main.tex @@ -0,0 +1,303 @@ +\documentclass[11pt]{article} +\usepackage[margin=1in]{geometry} +\usepackage{booktabs} +\usepackage{graphicx} +\usepackage{amsmath} +\usepackage{hyperref} +\usepackage{natbib} +\usepackage{subcaption} +\usepackage{xcolor} + +\title{Community Detection on EC-SBM Synthetic Networks:\\A Comparative Evaluation} +\author{CS 598 Data Analysis Assignment} +\date{February 2026} + +\begin{document} +\maketitle + +% ============================================================ +\section{Introduction} + +Community detection is a fundamental task in network analysis, aiming to partition a network's nodes into groups with dense internal connections and sparse external connections. +Evaluating community detection methods is challenging because real-world networks rarely come with verified ground-truth communities. +The EC-SBM (Edge-Connected Stochastic Block Model) benchmark~\citep{vule2025sbm} addresses this by generating synthetic networks whose structure reflects properties of real networks, while providing known ground-truth communities for accuracy evaluation. + +In this study, we apply five community detection methods to three EC-SBM networks of varying size. +We evaluate accuracy using Adjusted Mutual Information (AMI), Adjusted Rand Index (ARI), and Normalized Mutual Information (NMI), and also report empirical statistics about the detected communities, including node coverage, cluster size and edge density distributions, conductance, and mixing parameters. + +% ============================================================ +\section{Methods} + +\subsection{Networks} + +We selected three EC-SBM benchmark networks based on the \texttt{sbm+wcc} variant, as specified by the assignment. +Table~\ref{tab:networks} summarizes their properties. + +\begin{table}[htbp] +\centering +\caption{EC-SBM networks used in this study.} +\label{tab:networks} +\begin{tabular}{lrrl} +\toprule +Network & Nodes & Edges & Description \\ +\midrule +\texttt{polblogs} & 1,224 & 16,734 & Small ($<$10K nodes) \\ +\texttt{topology} & 34,761 & 108,052 & Large (35K nodes) \\ +\texttt{internet\_as} & 22,963 & 48,643 & Large (23K nodes) \\ +\bottomrule +\end{tabular} +\end{table} + +The \texttt{polblogs} network serves as our small test case, while \texttt{topology} and \texttt{internet\_as} are both large networks exceeding 20,000 nodes. +All networks were downloaded from the Illinois Data Bank~\citep{vule2025sbm} and use the \texttt{sbm+wcc} generation parameters (instance 0). + +\subsection{Community Detection Algorithms} + +We applied five community detection methods: + +\begin{enumerate} +\item \textbf{Leiden optimizing modularity} (Leiden-Mod): The Leiden algorithm~\citep{traag2019leiden} optimizing the modularity quality function~\citep{newman2004modularity}. We used \texttt{leidenalg} v0.10.2 with \texttt{ModularityVertexPartition}. + +\item \textbf{Leiden optimizing CPM at resolution 0.1} (Leiden-CPM(0.1)): The Leiden algorithm~\citep{traag2019leiden} with the Constant Potts Model (CPM)~\citep{traag2019cpm} at resolution parameter $\gamma = 0.1$. + +\item \textbf{Leiden optimizing CPM at resolution 0.01} (Leiden-CPM(0.01)): Same as above with $\gamma = 0.01$. + +\item \textbf{Infomap}: The Infomap algorithm~\citep{rosvall2008maps} (v2.8.1), which minimizes the map equation using a two-level partition with undirected flow model. + +\item \textbf{graph-tool SBM}: Stochastic Block Model inference via \texttt{graph-tool}~\citep{peixoto2014efficient}, using \texttt{minimize\_blockmodel\_dl()} for a flat (non-nested) partition that minimizes description length. +\end{enumerate} + +All methods used seed 42 for reproducibility. + +\subsection{Evaluation Metrics} + +\subsubsection{Accuracy Metrics} +We report three standard clustering comparison metrics, computed using \texttt{scikit-learn}: +\begin{itemize} +\item \textbf{Adjusted Mutual Information (AMI)}~\citep{vinh2010ami}: Mutual information adjusted for chance, normalized by the arithmetic mean of the marginal entropies. Ranges from 0 (random) to 1 (perfect). +\item \textbf{Adjusted Rand Index (ARI)}~\citep{hubert1985ari}: The Rand index corrected for expected value under random permutation. Ranges from $-0.5$ to 1. +\item \textbf{Normalized Mutual Information (NMI)}~\citep{vinh2010ami}: Mutual information normalized by the arithmetic mean of the marginal entropies, without chance correction. Ranges from 0 to 1. +\end{itemize} + +For all metrics, we aligned labels over the full node set derived from the edge list. Nodes missing from a clustering were each assigned unique singleton community IDs, ensuring consistent evaluation across methods. +We cross-validated our AMI, ARI, and NMI values against the official \texttt{network\_evaluation} scripts (\url{https://github.com/illinois-or-research-analytics/network_evaluation}) and confirmed identical results. + +\subsubsection{Cluster Statistics} +For each clustering (including ground truth), we report: +\begin{itemize} +\item \textbf{Node coverage}: Fraction of nodes in non-singleton clusters (clusters with $\geq 2$ nodes). +\item \textbf{Number of non-singleton clusters}: The count of communities with at least 2 members. +\item \textbf{Mean cluster size}: Average number of nodes per non-singleton cluster. +\item \textbf{Edge density}: For a cluster of size $n$ with $m$ internal edges, $\rho = 2m / (n(n-1))$. +\item \textbf{Conductance}: $\phi = c_S / (2m_S + c_S)$, where $c_S$ is the number of boundary edges. +\item \textbf{Mixing parameter}: Per-node fraction of neighbors outside the node's own community. A value near 0 indicates strong community structure; near 1 indicates weak structure. +\end{itemize} + +% ============================================================ +\section{Results} + +\subsection{Accuracy} + +Table~\ref{tab:accuracy} and Figure~\ref{fig:heatmaps} present the accuracy metrics for all 15 (network, method) combinations. + +\input{../results/figures/accuracy_table.tex} + +\begin{figure}[htbp] +\centering +\begin{subfigure}[b]{0.32\textwidth} +\includegraphics[width=\textwidth]{../results/figures/heatmap_ami.pdf} +\caption{AMI} +\end{subfigure} +\begin{subfigure}[b]{0.32\textwidth} +\includegraphics[width=\textwidth]{../results/figures/heatmap_ari.pdf} +\caption{ARI} +\end{subfigure} +\begin{subfigure}[b]{0.32\textwidth} +\includegraphics[width=\textwidth]{../results/figures/heatmap_nmi.pdf} +\caption{NMI} +\end{subfigure} +\caption{Accuracy heatmaps for AMI, ARI, and NMI across networks and methods.} +\label{fig:heatmaps} +\end{figure} + +Key observations: +\begin{itemize} +\item \textbf{Leiden-CPM(0.1) achieves the highest NMI} across all three networks (0.815 on polblogs, 0.937 on topology, 0.935 on internet\_as), and also the highest AMI. +\item \textbf{ARI values are universally low} (all below 0.17), even for the best-performing methods. This contrasts with the relatively higher NMI values, suggesting that the methods find many of the correct community boundaries (high NMI) but may differ in how they handle rare pairs (low ARI). +\item \textbf{Leiden-Mod and graph-tool SBM have the lowest accuracy}, primarily because they produce very few large clusters compared to the many small ground-truth communities. +\item \textbf{Infomap} sits between the extremes, with moderate NMI (0.35--0.75) and AMI (0.16--0.18). +\end{itemize} + +\subsection{Cluster Statistics} + +Table~\ref{tab:cluster_stats} summarizes the cluster statistics. + +\input{../results/figures/cluster_stats_table.tex} + +\subsubsection{Node Coverage} +Figure~\ref{fig:node_coverage} shows node coverage by method. +Leiden-Mod, Infomap, and graph-tool SBM achieve 100\% coverage because they assign every node to some community. +The CPM-based Leiden methods and the ground truth have lower coverage because many nodes are left as singletons. +The ground truth itself has very low node coverage (21--48\%), indicating that the EC-SBM ground-truth communities cover only a minority of nodes, with the remainder being in singleton clusters. + +\begin{figure}[htbp] +\centering +\includegraphics[width=0.85\textwidth]{../results/figures/node_coverage.pdf} +\caption{Node coverage (fraction of nodes in non-singleton clusters) by network and method.} +\label{fig:node_coverage} +\end{figure} + +\subsubsection{Cluster Size Distribution} +Figure~\ref{fig:cluster_sizes_topology} shows cluster size distributions for the \texttt{topology} network (see Appendix for other networks). +The ground truth is dominated by small clusters (mean size 5.0), with a log-scale $x$-axis revealing a heavy-tailed distribution. +Leiden-Mod produces far fewer, much larger clusters (mean 695 nodes), while Leiden-CPM(0.1) closely matches the ground-truth distribution shape. + +\begin{figure}[htbp] +\centering +\includegraphics[width=\textwidth]{../results/figures/cluster_sizes_topology.pdf} +\caption{Cluster size distributions for \texttt{topology}. Note logarithmic $x$-axis for methods with large clusters.} +\label{fig:cluster_sizes_topology} +\end{figure} + +\subsubsection{Edge Density} +Figure~\ref{fig:edge_density} shows edge density distributions. +The ground truth has very high edge density (mean 0.83--0.84), which is expected for small, tightly-connected clusters. +Leiden-Mod and graph-tool SBM, which produce large clusters, have much lower edge density. +Leiden-CPM(0.1) produces clusters with moderate density (0.57--0.59), closer to ground truth than other methods. + +\begin{figure}[htbp] +\centering +\includegraphics[width=0.85\textwidth]{../results/figures/edge_density_topology.pdf} +\caption{Edge density distribution across methods for \texttt{topology}.} +\label{fig:edge_density} +\end{figure} + +\subsubsection{Mixing Parameter} +Figure~\ref{fig:mixing} presents the mean mixing parameter. +The ground truth has a high mean mixing parameter (0.88--0.91), meaning most nodes have most of their neighbors outside their assigned community. +This is because the ground-truth communities are small and sparse relative to the full network. +Leiden-Mod has the lowest mixing parameter (0.10--0.13) because it creates large clusters that encompass most of each node's neighborhood. +graph-tool SBM has a high mixing parameter (0.68--0.89), similar to ground truth, indicating it also produces blocks where many edges cross community boundaries. + +\begin{figure}[htbp] +\centering +\includegraphics[width=0.85\textwidth]{../results/figures/mixing_parameter.pdf} +\caption{Mean mixing parameter by network and method.} +\label{fig:mixing} +\end{figure} + +\subsection{Effect of CPM Resolution Parameter} + +Comparing Leiden-CPM at resolutions 0.1 and 0.01 reveals the expected trade-off: +\begin{itemize} +\item \textbf{Resolution 0.1} produces more clusters (4,047 on topology vs.\ 1,802 at 0.01), with smaller mean size, higher edge density, but lower node coverage. +\item \textbf{Resolution 0.01} produces fewer, larger clusters with higher node coverage and lower mixing parameter, but slightly lower NMI. +\item On polblogs, resolution 0.1 achieves AMI=0.43 vs.\ 0.32 for resolution 0.01. On topology, they are similar (AMI $\approx$ 0.24 and 0.23). +\end{itemize} + +This aligns with the findings of~\citet{park2024wellconnectedness}, who observed that higher CPM resolutions tend to find smaller, denser communities that may better capture local structure. + +% ============================================================ +\section{Discussion} + +\subsection{Method Comparison} + +Our results reveal a fundamental tension in community detection: methods that optimize for global objectives (modularity, description length) tend to create few large clusters, while methods that allow for many small clusters (CPM, ground truth) capture finer-grained structure. + +\textbf{Leiden-CPM(0.1)} was the best performer overall, achieving the highest AMI and NMI on all three networks. Its cluster size distribution and edge density most closely match the ground truth. This is consistent with the findings of~\citet{vule2025sbm}, who noted that CPM-based methods with appropriate resolution tend to recover EC-SBM communities well. + +\textbf{Leiden-Mod} and \textbf{graph-tool SBM} perform poorly because they suffer from a resolution limit~\citep{newman2004modularity}: modularity optimization cannot detect communities smaller than $\sqrt{2m}$ (where $m$ is the number of edges), and SBM inference prefers parsimonious models with fewer blocks. +On topology (108K edges), both methods find fewer than 100 communities vs.\ 1,517 in ground truth. + +\textbf{Infomap} provides a middle ground with moderate accuracy. Its flow-based approach creates more communities than modularity (23--1,274 clusters) and achieves 100\% coverage, but its NMI (0.35--0.73) falls well below Leiden-CPM(0.1). + +\textbf{graph-tool SBM} is interesting because it identifies structure that is fundamentally different from the other methods: it infers blocks based on connectivity patterns (stochastic equivalence) rather than dense subgraphs. Its high mixing parameter and conductance confirm that its blocks are not internally dense communities but groups of nodes with similar connection profiles, which explains its low accuracy against the dense ground-truth communities. + +\subsection{Discrepancy Between NMI and ARI} + +A notable observation is the large gap between NMI and ARI values. For example, Leiden-CPM(0.1) on topology has NMI=0.937 but ARI=0.059. This discrepancy arises because NMI measures agreement in information-theoretic terms (sensitive to correct assignment of information), while ARI measures pairwise agreement (sensitive to whether pairs of nodes are correctly co-clustered or separated). When ground-truth clusters are very small (mean size 5), most pairs of nodes are in different clusters, and even small errors in cluster assignment can drastically reduce the pairwise agreement measured by ARI while barely affecting the information-theoretic NMI. + +\subsection{Ground Truth Properties} + +The EC-SBM ground-truth communities have distinctive properties: very small clusters (mean 4--8 nodes), high edge density (0.77--0.84), and low node coverage (21--48\%). This means the majority of nodes are singletons in the ground truth. Methods that achieve 100\% coverage (Leiden-Mod, Infomap, graph-tool SBM) necessarily create many false-positive community memberships for these singleton nodes, which may artificially inflate or deflate accuracy depending on the metric. + +\subsection{Connection to Literature} + +Our findings are consistent with: +\begin{itemize} +\item \citet{park2024wellconnectedness}: They demonstrated that well-connectedness is an important property of meaningful communities, and that CPM-based methods tend to produce well-connected clusters. Our edge density and conductance results support this. +\item \citet{vule2025sbm}: Their study of EC-SBM benchmarks showed that CPM-based Leiden methods outperform modularity-based methods on these networks, which matches our accuracy results across all three networks. +\end{itemize} + +% ============================================================ +\section{Reproducibility} + +\subsection{Software Versions} +\begin{itemize} +\item Python 3.12.12 +\item \texttt{python-igraph} (via conda-forge) +\item \texttt{leidenalg} (via conda-forge) +\item \texttt{infomap} 2.8.1 +\item \texttt{graph-tool} (via conda-forge) +\item \texttt{scikit-learn}, \texttt{scipy}, \texttt{matplotlib}, \texttt{pandas} (via conda-forge) +\end{itemize} + +\subsection{Data Source} +EC-SBM networks downloaded from the Illinois Data Bank: \url{https://databank.illinois.edu/datasets/IDB-3284069} (file \texttt{networks.zip}, 3.51 GB). We used the \texttt{sbm+wcc} variant, instance 0, for each network. + +\subsection{How to Reproduce} +\begin{enumerate} +\item Install dependencies: +\begin{verbatim} +conda install -c conda-forge python-igraph leidenalg \ + infomap graph-tool matplotlib scipy scikit-learn pandas -y +\end{verbatim} +\item Download and extract \texttt{networks.zip} to \texttt{data/networks/}. +\item Create symlinks (see \texttt{scripts/config.py} for paths). +\item Run the full pipeline: +\begin{verbatim} +python scripts/run_all.py +\end{verbatim} +\item Compile this report: +\begin{verbatim} +cd writeup && pdflatex main && bibtex main && pdflatex main && pdflatex main +\end{verbatim} +\end{enumerate} + +All code, results, and this writeup are in the project directory. Random seeds are fixed at 42 for all stochastic methods. + +% ============================================================ +\appendix +\section{Additional Cluster Size Distributions} + +\begin{figure}[htbp] +\centering +\includegraphics[width=\textwidth]{../results/figures/cluster_sizes_polblogs.pdf} +\caption{Cluster size distributions for \texttt{polblogs}.} +\end{figure} + +\begin{figure}[htbp] +\centering +\includegraphics[width=\textwidth]{../results/figures/cluster_sizes_internet_as.pdf} +\caption{Cluster size distributions for \texttt{internet\_as}.} +\end{figure} + +\section{Additional Edge Density Distributions} + +\begin{figure}[htbp] +\centering +\includegraphics[width=0.85\textwidth]{../results/figures/edge_density_polblogs.pdf} +\caption{Edge density distribution for \texttt{polblogs}.} +\end{figure} + +\begin{figure}[htbp] +\centering +\includegraphics[width=0.85\textwidth]{../results/figures/edge_density_internet_as.pdf} +\caption{Edge density distribution for \texttt{internet\_as}.} +\end{figure} + +% ============================================================ +\bibliographystyle{plainnat} +\bibliography{references} + +\end{document} -- cgit v1.2.3