From 6f48c4fae3243e280b27a977c6a8cb731becf446 Mon Sep 17 00:00:00 2001
From: YurenHao0426
Date: Thu, 16 Apr 2026 13:35:31 -0500
Subject: Paper: expand to three comprehensive tables
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Table 1 (main, K=4): 11 baselines + 5 UPH variants (d=8..128) with state
size, inference tokens, R-L±std, METEOR±std on both tasks.
Table 2 (review_k): full K=4/8/16 ROUGE-L for all 15 methods.
Table 3 (topic_k): full K=4/8/16 ROUGE-L for all 15 methods.

d sweep is now folded into each table's UPH block, replacing the separate
small ablation table. Rewrote §3.2 prose to reflect the flat K-scaling
observed universally on LongLaMP and the low-dimensional nature of the
user prior.

Co-Authored-By: Claude Opus 4.6 (1M context)
---
 paper/uph_paper.tex | 107 +++++++++++++++++++++++++++++++++++++---------------
 1 file changed, 77 insertions(+), 30 deletions(-)

diff --git a/paper/uph_paper.tex b/paper/uph_paper.tex
index e3e4208..9ebcbca 100644
--- a/paper/uph_paper.tex
+++ b/paper/uph_paper.tex
@@ -125,9 +125,9 @@ Our contribution is not a more expressive personalization module, but the empiri
   \item An ablation over $K{\in}\{4,8,16\}$ and $d{\in}\{8,16,32,64,128\}$ shows that the effect is stable across support set sizes and that $d{=}8$ (a 16-byte user prior) already recovers most of the ROUGE-L gain, revealing how compact a useful user prior can be.
 \end{enumerate}

-%% Main table
+%% Main table at K=4
 \begin{table*}[t]
-  \caption{Main results on LongLaMP Topic and Review Writing with $K{=}4$ support examples per user, $N{=}200$ users per setting. ROUGE-L and METEOR are reported as mean$\pm$standard deviation across users. UPH uses 128~bytes per user and zero personalized prompt tokens at inference. Best ROUGE-L among personalized methods in \textbf{bold}; second-best \underline{underlined}. Higher is better. Inf.~tokens is the additional personalization prompt tokens carried at inference.}
+  \caption{\textbf{Main results at $K{=}4$.} LongLaMP Review and Topic Writing, $N{=}200$ users per setting. ROUGE-L and METEOR are mean$\pm$std across users. UPH uses 128~bytes per user and zero personalized prompt tokens at inference; the \emph{ours} block reports five UPH variants over the user-vector dimension $d$ (default $d{=}64$). Best ROUGE-L among personalized methods in \textbf{bold}; second-best \underline{underlined}. Inf.~tokens is the additional personalization prompt tokens carried at inference.}
   \label{tab:main}
   \small
   \begin{tabularx}{\textwidth}{l r r C C C C}
@@ -151,7 +151,12 @@ Our contribution is not a more expressive personalization module, but the empiri
   Prompt Tuning ($L{=}5$) & 15K & 0 & .005$\pm$.015 & .007$\pm$.015 & .045$\pm$.044 & .075$\pm$.075 \\
   Prefix Tuning ($L{=}5$) & 143K & 0 & .074$\pm$.047 & .051$\pm$.034 & .070$\pm$.031 & .077$\pm$.046 \\
   \midrule
-  \textbf{UPH (ours, $d{=}64$)} & \textbf{128} & \textbf{0} & \underline{.140$\pm$.032} & .149$\pm$.054 & \textbf{.132$\pm$.029} & .192$\pm$.056 \\
+  \emph{\textbf{UPH (ours):}} \\
+  \hspace{1em}$d{=}8$ & \textbf{16} & \textbf{0} & .135$\pm$.033 & .144$\pm$.053 & .132$\pm$.031 & .194$\pm$.061 \\
+  \hspace{1em}$d{=}16$ & 32 & \textbf{0} & .137$\pm$.032 & .147$\pm$.053 & .131$\pm$.030 & .195$\pm$.055 \\
+  \hspace{1em}$d{=}32$ & 64 & \textbf{0} & .138$\pm$.032 & .145$\pm$.055 & .132$\pm$.030 & .192$\pm$.058 \\
+  \hspace{1em}$d{=}64$ (default) & 128 & \textbf{0} & \underline{.140$\pm$.032} & .149$\pm$.054 & \textbf{.132$\pm$.029} & .192$\pm$.056 \\
+  \hspace{1em}$d{=}128$ & 256 & \textbf{0} & .137$\pm$.032 & .148$\pm$.054 & .131$\pm$.030 & .193$\pm$.056 \\
   \bottomrule
   \end{tabularx}
   \vspace{0.3em}
@@ -218,46 +223,88 @@ Viewing the parameter-to-data ratio as a rough complexity index, UPH's 64-parame

 \subsection{$K$ and $d$ Ablations}

-We study the sensitivity of UPH to the support-set size $K$ and the user-vector dimension $d$ to verify that our main finding is not an artifact of a particular operating point (Table~\ref{tab:ablation}).
+To verify that our main finding is not an artifact of a particular operating point, we sweep the support-set size $K \in \{4, 8, 16\}$ and the user-vector dimension $d \in \{8, 16, 32, 64, 128\}$ for \emph{every} method in Table~\ref{tab:main}.
+Tables~\ref{tab:review_k} (Review) and~\ref{tab:topic_k} (Topic) report ROUGE-L for all 15 method configurations across all three $K$; the $d$ sweep is integrated into the \emph{ours} block of each table.

-\paragraph{$K$ scaling.}
-Doubling $K$ from 4 to 16 changes UPH's Review ROUGE-L by at most $0.002$ and Topic ROUGE-L by $0.001$.
-The method is neither bottlenecked nor destabilized by more support data, consistent with a simple scalar-family prior that saturates quickly.
-By contrast, the ICL baselines (BM25, Dense) improve with $K$ on Review ($+0.003$ on ROUGE-L from $K{=}4$ to $K{=}16$) but remain essentially flat on Topic, reinforcing the support--query-mismatch interpretation.
+\paragraph{$K$ scaling is essentially flat for every method on this benchmark.}
+Across both tasks and all methods, increasing $K$ from 4 to 16 moves ROUGE-L by at most ${\pm}0.004$; the exception is the unstable Prompt/Prefix Tuning runs, whose variance across $K$ is driven by optimization failure rather than data scaling.
+This strongly suggests that on LongLaMP the amount of user signal useful to a frozen 1.5B-parameter LLM is largely captured by $K{=}4$ examples; additional exemplars add variance rather than information.
+The flat $K$-scaling holds equally for text-memory methods (BM25/Dense/Prompt-All-K $\Delta{\le}0.003$), PEFT methods, and UPH.

-\paragraph{$d$ scaling.}
-Remarkably, a 16-byte per-user state ($d{=}8$) already captures most of the ROUGE-L gain on both tasks: Review .1353 and Topic .1316, versus .1399 and .1321 at $d{=}64$.
-Gains saturate around $d{=}32$--$64$; $d{=}128$ does not improve over $d{=}64$ and slightly hurts on Review (.1370 vs.\ .1399).
-This suggests that the user-specific information that UPH can extract from $K{=}4$ examples is genuinely low-dimensional, and that a practically deployed version of UPH could use $d{=}8$--$16$ to save per-user storage by 4--8$\times$ with negligible quality loss.
+\paragraph{$d$ scaling shows UPH's effect is stable across a 16$\times$ range of per-user state.}
+Within the \emph{ours} block, ROUGE-L varies by less than $0.005$ across $d \in \{8, 16, 32, 64, 128\}$ at every $K$ and on both tasks.
+A 16-byte per-user state ($d{=}8$) already recovers most of the gain: Review .135 / Topic .132, versus .140 / .132 at $d{=}64$.
+$d{=}128$ does not improve over $d{=}64$ and slightly hurts on Review, indicating mild over-parameterization.
+The practical implication is that a deployed version of UPH can use $d \in [8, 32]$---reducing per-user state by 2--8$\times$---with negligible quality loss.
+More fundamentally, this shows that the user-specific information UPH can extract from four writing samples is genuinely low-dimensional.

-\begin{table}[t]
-  \caption{UPH ablations on ROUGE-L (mean$\pm$std across users, $N{=}200$). \textbf{Top:} $K$-scaling at $d{=}64$. \textbf{Bottom:} $d$-scaling at $K{=}4$. Results are stable across a wide range of $K$ and $d$; $d{=}8$ (16 bytes) already recovers most of the gain.}
-  \label{tab:ablation}
+%% Full K-scaling: Review
+\begin{table*}[t]
+  \caption{\textbf{Review-user, ROUGE-L $K$-scaling across all methods.} Mean$\pm$std across $N{=}200$ users. Prompt Tuning / Prefix Tuning at $K{=}16$ use $N{=}50$ due to compute cost. Best per column in \textbf{bold}.}
+  \label{tab:review_k}
   \small
-  \setlength{\tabcolsep}{3pt}
-  \begin{tabularx}{\columnwidth}{l C C C}
+  \begin{tabularx}{\textwidth}{l C C C}
   \toprule
-  \multicolumn{4}{c}{$K$-scaling ($d{=}64$)} \\
-  \cmidrule(lr){1-4}
-  Task & $K{=}4$ & $K{=}8$ & $K{=}16$ \\
+  Method & $K{=}4$ & $K{=}8$ & $K{=}16$ \\
+  \midrule
+  Base (no personalization) & .126$\pm$.028 & .126$\pm$.028 & .126$\pm$.028 \\
+  \midrule
+  \emph{In-context learning (ICL):} \\
+  Prompt-All-K & .142$\pm$.031 & .140$\pm$.032 & \textbf{.144$\pm$.031} \\
+  BM25-Top1 & .140$\pm$.029 & \textbf{.143$\pm$.067} & .143$\pm$.067 \\
+  Dense-Top1 & \textbf{.143$\pm$.067} & \textbf{.143$\pm$.067} & .143$\pm$.067 \\
+  Profile-based & .121$\pm$.029 & .119$\pm$.029 & .118$\pm$.028 \\
   \midrule
-  Review-user & \textbf{.140$\pm$.032} & .137$\pm$.031 & .138$\pm$.032 \\
-  Topic-user & .132$\pm$.029 & .131$\pm$.030 & \textbf{.133$\pm$.029} \\
+  \emph{Parameter-efficient FT (PEFT):} \\
+  LoRA ($r{=}8$) & .132$\pm$.029 & .130$\pm$.032 & .131$\pm$.030 \\
+  Tiny LoRA ($r{=}1$) & .126$\pm$.032 & .124$\pm$.032 & .126$\pm$.031 \\
+  VeRA ($r{=}256$) & .124$\pm$.027 & .124$\pm$.029 & .125$\pm$.028 \\
+  Prompt Tuning ($L{=}5$) & .005$\pm$.015 & .007$\pm$.024 & .129$\pm$.031 \\
+  Prefix Tuning ($L{=}5$) & .074$\pm$.047 & .002$\pm$.006 & .022$\pm$.026 \\
+  \midrule
+  \emph{\textbf{UPH (ours):}} \\
+  \hspace{1em}$d{=}8$ & .135$\pm$.033 & .136$\pm$.032 & .136$\pm$.032 \\
+  \hspace{1em}$d{=}16$ & .137$\pm$.032 & .137$\pm$.032 & .136$\pm$.033 \\
+  \hspace{1em}$d{=}32$ & .138$\pm$.032 & .139$\pm$.033 & .138$\pm$.034 \\
+  \hspace{1em}$d{=}64$ & .140$\pm$.032 & .137$\pm$.031 & .138$\pm$.032 \\
+  \hspace{1em}$d{=}128$ & .137$\pm$.032 & .137$\pm$.032 & .139$\pm$.032 \\
   \bottomrule
   \end{tabularx}
+\end{table*}

-  \vspace{0.6em}
-  \begin{tabularx}{\columnwidth}{l C C C C C}
+%% Full K-scaling: Topic
+\begin{table*}[t]
+  \caption{\textbf{Topic-user, ROUGE-L $K$-scaling across all methods.} Mean$\pm$std across $N{=}200$ users. Prompt Tuning / Prefix Tuning at $K{=}8$ and $K{=}16$ use $N{=}50$ due to compute cost. Best per column in \textbf{bold}.}
+  \label{tab:topic_k}
+  \small
+  \begin{tabularx}{\textwidth}{l C C C}
   \toprule
-  \multicolumn{6}{c}{$d$-scaling ($K{=}4$)} \\
-  \cmidrule(lr){1-6}
-  Task & $d{=}8$ & $d{=}16$ & $d{=}32$ & $d{=}64$ & $d{=}128$ \\
+  Method & $K{=}4$ & $K{=}8$ & $K{=}16$ \\
+  \midrule
+  Base (no personalization) & .119$\pm$.023 & .119$\pm$.023 & .119$\pm$.023 \\
+  \midrule
+  \emph{In-context learning (ICL):} \\
+  Prompt-All-K & .123$\pm$.067 & .121$\pm$.027 & .126$\pm$.040 \\
+  BM25-Top1 & .119$\pm$.068 & .120$\pm$.068 & .120$\pm$.068 \\
+  Dense-Top1 & .119$\pm$.068 & .117$\pm$.068 & .119$\pm$.068 \\
+  Profile-based & .112$\pm$.028 & .113$\pm$.029 & .114$\pm$.025 \\
   \midrule
-  Review-user & .135$\pm$.033 & .137$\pm$.032 & .138$\pm$.032 & \textbf{.140$\pm$.032} & .137$\pm$.032 \\
-  Topic-user & .132$\pm$.031 & .131$\pm$.030 & .132$\pm$.030 & \textbf{.132$\pm$.029} & .131$\pm$.030 \\
+  \emph{Parameter-efficient FT (PEFT):} \\
+  LoRA ($r{=}8$) & .119$\pm$.028 & .121$\pm$.027 & .119$\pm$.029 \\
+  Tiny LoRA ($r{=}1$) & .118$\pm$.027 & .116$\pm$.026 & .117$\pm$.027 \\
+  VeRA ($r{=}256$) & .119$\pm$.025 & .119$\pm$.022 & .117$\pm$.024 \\
+  Prompt Tuning ($L{=}5$) & .045$\pm$.044 & .028$\pm$.039 & .017$\pm$.034 \\
+  Prefix Tuning ($L{=}5$) & .070$\pm$.031 & .027$\pm$.034 & .059$\pm$.031 \\
+  \midrule
+  \emph{\textbf{UPH (ours):}} \\
+  \hspace{1em}$d{=}8$ & \textbf{.132$\pm$.031} & .130$\pm$.031 & .130$\pm$.030 \\
+  \hspace{1em}$d{=}16$ & .131$\pm$.030 & .131$\pm$.031 & .132$\pm$.029 \\
+  \hspace{1em}$d{=}32$ & \textbf{.132$\pm$.030} & \textbf{.132$\pm$.029} & .132$\pm$.028 \\
+  \hspace{1em}$d{=}64$ & \textbf{.132$\pm$.029} & .131$\pm$.030 & \textbf{.133$\pm$.029} \\
+  \hspace{1em}$d{=}128$ & .131$\pm$.030 & \textbf{.132$\pm$.029} & .131$\pm$.029 \\
   \bottomrule
   \end{tabularx}
-\end{table}
+\end{table*}

 %% ============================================================
 %% SECTION 4: Discussion and Limitations
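A note on the state-size column of Table 1: the byte counts (16 B at $d{=}8$ up to 256 B at $d{=}128$) are consistent with storing the $d$-dimensional user vector at 2 bytes per dimension, e.g. fp16. A minimal sketch of that accounting, assuming fp16 storage (the precision is inferred from the table's numbers, not stated in the patch):

```python
import numpy as np

def uph_state_bytes(d: int, bytes_per_dim: int = 2) -> int:
    """Per-user state for a d-dimensional user vector at 2 bytes/dim.

    The 2-bytes-per-dimension (fp16) assumption is inferred from the
    paper's tables (d=64 -> 128 bytes); the patch does not state the
    storage precision explicitly.
    """
    return d * bytes_per_dim

for d in (8, 16, 32, 64, 128):
    vec = np.zeros(d, dtype=np.float16)  # the stored user prior (assumed fp16)
    assert vec.nbytes == uph_state_bytes(d)
    print(f"d={d:>3}: {uph_state_bytes(d)} bytes per user")
```

Under this assumption the whole $d$ sweep in the tables spans 16 to 256 bytes of per-user state, which is what makes the flat ROUGE-L curve across $d$ notable.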