1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
|
\documentclass{article}
\usepackage[utf8]{inputenc}
\usepackage[margin=1.5cm, top=2cm, bottom=2cm]{geometry}
\usepackage{amsmath}
\usepackage{amsfonts}
\usepackage{amssymb}
\usepackage{graphicx}
\usepackage{setspace}
\setstretch{0.9}
\title{CS229 Notes - June 26, 2025}
\author{Yuren Hao}
\date{\today}
\begin{document}
\maketitle
\section{Maximum Likelihood Estimation (MLE)}
Given i.i.d. observations $x_1, x_2, \ldots, x_n$ from $f(x; \theta)$.
\textbf{Likelihood:} $L(\theta) = \prod_{i=1}^{n} f(x_i; \theta)$
\textbf{Log-likelihood:} $\ell(\theta) = \sum_{i=1}^{n} \log f(x_i; \theta)$
\textbf{MLE:} $\hat{\theta}_{MLE} = \arg\max_{\theta} \ell(\theta)$
Solve: $\frac{\partial \ell(\theta)}{\partial \theta} = 0$
\section{Linear Regression - Normal Equation}
Given training data $\{(x_i, y_i)\}_{i=1}^n$ with $x_i \in \mathbb{R}^d$, $y_i \in \mathbb{R}$.
\textbf{RSS:} $RSS(\theta) = \sum_{i=1}^{n} (y_i - \theta^T x_i)^2 = \|y - X\theta\|_2^2$
\textbf{Derivative:} $\frac{\partial RSS(\theta)}{\partial \theta} = -2X^T(y - X\theta)$
\textbf{Normal Equation:} $X^TX\theta = X^Ty$
\textbf{Closed-form solution:} $\hat{\theta} = (X^TX)^{-1}X^Ty$
\section{Gradient Descent for Linear Regression}
\textbf{Cost function:} $J(\theta) = \frac{1}{2m}\sum_{i=1}^{m} (h_\theta(x^{(i)}) - y^{(i)})^2$
\textbf{Gradient:} $\frac{\partial J}{\partial \theta_j} = \frac{1}{m}\sum_{i=1}^{m} (h_\theta(x^{(i)}) - y^{(i)})x_j^{(i)}$
\textbf{Algorithm:}
\begin{enumerate}
\item Initialize $\theta$ randomly
\item While not converged:
\begin{itemize}
\item $\theta_j := \theta_j - \alpha \frac{\partial J}{\partial \theta_j}$ for all $j$
\item Check convergence: $|J(\theta^{(t+1)}) - J(\theta^{(t)})| < \epsilon$
\end{itemize}
\end{enumerate}
\textbf{Vectorized update:} $\theta := \theta - \alpha \frac{1}{m} X^T(X\theta - y)$
\subsection{Example: 3 Data Points}
\textbf{Data:} $(1,1)$, $(2,2)$, $(3,4)$. Model: $h_\theta(x) = \theta_0 + \theta_1 x$
\textbf{Initial:} $\theta_0 = 0$, $\theta_1 = 0$, $\alpha = 0.1$
\textbf{Iteration 1:}
\begin{itemize}
\item $J(\theta) = \frac{1}{6}[(0-1)^2 + (0-2)^2 + (0-4)^2] = 3.5$
\item $\frac{\partial J}{\partial \theta_0} = \frac{1}{3}[(-1) + (-2) + (-4)] = -2.33$
\item $\frac{\partial J}{\partial \theta_1} = \frac{1}{3}[(-1)(1) + (-2)(2) + (-4)(3)] = -5.67$
\item $\theta_0 := 0 - 0.1(-2.33) = 0.233$
\item $\theta_1 := 0 - 0.1(-5.67) = 0.567$
\end{itemize}
\textbf{Iteration 2:} $J(\theta) = 1.26$ (continues until convergence...)
\section{ML Framework \& Loss Functions}
\subsection{General ML Pipeline}
\textbf{Model} $\rightarrow$ \textbf{Algorithm} $\rightarrow$ \textbf{Estimated Parameters}
\textbf{Predictions} $\rightarrow$ \textbf{Decisions} $\rightarrow$ \textbf{Outcomes}
\textbf{Example:} Linear model $\rightarrow$ Gradient descent $\rightarrow$ $\hat{\theta}$
House price prediction $\rightarrow$ Buy/sell decision $\rightarrow$ Profit/loss
\subsection{Loss Functions}
\textbf{Regression:}
\begin{itemize}
\item \textbf{MSE:} $L(y, \hat{y}) = (y - \hat{y})^2$
\item \textbf{MAE:} $L(y, \hat{y}) = |y - \hat{y}|$
\item \textbf{Huber:} $L(y, \hat{y}) = \begin{cases} \frac{1}{2}(y-\hat{y})^2 & \text{if } |y-\hat{y}| \leq \delta \\ \delta|y-\hat{y}| - \frac{1}{2}\delta^2 & \text{otherwise} \end{cases}$
\end{itemize}
\textbf{Classification:}
\begin{itemize}
\item \textbf{0-1 Loss:} $L(y, \hat{y}) = \mathbf{1}[y \neq \hat{y}]$
\item \textbf{Logistic:} $L(y, \hat{y}) = -y\log(\hat{y}) - (1-y)\log(1-\hat{y})$
\item \textbf{Hinge:} $L(y, \hat{y}) = \max(0, 1 - y\hat{y})$ (SVM)
\end{itemize}
\section{Training Loss vs Model Complexity}
\subsection{Bias-Variance Tradeoff}
\textbf{Low Complexity:} High bias, low variance $\rightarrow$ \textbf{Underfitting}
\textbf{High Complexity:} Low bias, high variance $\rightarrow$ \textbf{Overfitting}
\subsection{Typical Curves}
\begin{itemize}
\item \textbf{Training Error:} Decreases as complexity increases
\item \textbf{Validation Error:} U-shaped curve
\item \textbf{Optimal Complexity:} Minimum validation error
\end{itemize}
\begin{figure}[h]
\centering
\includegraphics[width=0.8\textwidth]{model-complexity-diagram.png}
\caption{Model Complexity vs Training/Validation Error}
\end{figure}
\textbf{Total Error} = Bias$^2$ + Variance + Irreducible Error
\textbf{Regularization:} Controls complexity via penalty terms
\begin{itemize}
\item \textbf{L1 (Lasso):} $\lambda \sum_j |\theta_j|$ (sparse solutions)
\item \textbf{L2 (Ridge):} $\lambda \sum_j \theta_j^2$ (smooth solutions)
\end{itemize}
\section{Generalization Error}
\subsection{Definition}
\textbf{Generalization Error:} Expected error on unseen data from same distribution
\textbf{Continuous Case:} For regression with squared loss
\[
\text{Gen Error} = \mathbb{E}_{(x,y) \sim D}[(h(x) - y)^2] = \int_{x,y} (h(x) - y)^2 p(x,y) \, dx \, dy
\]
If $p(x,y) = p(y|x)p(x)$, then:
\[
= \int_x \left[ \int_y (h(x) - y)^2 p(y|x) \, dy \right] p(x) \, dx
\]
\textbf{Discrete Case:} For classification with 0-1 loss
\[
\text{Gen Error} = \mathbb{E}_{(x,y) \sim D}[\mathbf{1}[h(x) \neq y]] = \sum_{x,y} \mathbf{1}[h(x) \neq y] \cdot p(x,y)
\]
Expanding the sum:
\[
= \sum_x \sum_{y: y \neq h(x)} p(x,y) = \sum_x p(x) \sum_{y: y \neq h(x)} p(y|x)
\]
Where $D$ is the data distribution, $h$ is the hypothesis
\textbf{Empirical Risk:} Approximation using training data
\textbf{Continuous Case:}
\[
\hat{R}(h) = \frac{1}{m}\sum_{i=1}^{m} (h(x_i) - y_i)^2
\]
\textbf{Discrete Case:}
\[
\hat{R}(h) = \frac{1}{m}\sum_{i=1}^{m} \mathbf{1}[h(x_i) \neq y_i]
\]
\textbf{General Form:}
\[
\hat{R}(h) = \frac{1}{m}\sum_{i=1}^{m} L(h(x_i), y_i) \approx \mathbb{E}_{(x,y) \sim D}[L(h(x), y)]
\]
\textbf{Generalization Gap:} $R(h) - \hat{R}(h)$ where $R(h)$ is true risk
\subsection{Sources of Error}
The expected prediction error can be decomposed into three fundamental components:
\textbf{Expected Prediction Error:}
\[
\mathbb{E}[\text{Error}(x)] = \text{Noise} + \text{Bias}^2 + \text{Variance}
\]
\textbf{Detailed Breakdown:}
\begin{itemize}
\item \textbf{Noise:} $\sigma^2 = \mathbb{E}[(y - f(x))^2]$ - Irreducible error from data
\item \textbf{Bias:} $\text{Bias}[\hat{f}(x)] = \mathbb{E}[\hat{f}(x)] - f(x)$ - Model's systematic error
\item \textbf{Variance:} $\text{Var}[\hat{f}(x)] = \mathbb{E}[(\hat{f}(x) - \mathbb{E}[\hat{f}(x)])^2]$ - Model's sensitivity to training data
\end{itemize}
\textbf{MSE Decomposition:}
\[
\text{MSE}(x) = \mathbb{E}[(\hat{f}(x) - y)^2] = \text{Bias}^2[\hat{f}(x)] + \text{Var}[\hat{f}(x)] + \sigma^2
\]
Expanding the MSE:
\[
= (\mathbb{E}[\hat{f}(x)] - f(x))^2 + \mathbb{E}[(\hat{f}(x) - \mathbb{E}[\hat{f}(x)])^2] + \mathbb{E}[(y - f(x))^2]
\]
\textbf{Experimental Setup for Bias-Variance Analysis:}
We create $N$ different training sets by sampling from the data distribution. Each training set produces estimated model parameters, giving us $N$ different models $\hat{f}_1, \hat{f}_2, \ldots, \hat{f}_N$.
Use the average predictions of all the $N$ estimated models, denoted by $\bar{f}_w(x)$:
\[
\bar{f}_w(x) = \frac{1}{N}\sum_{i=1}^{N} \hat{f}_i(x)
\]
The average fit is akin to the expected prediction $\mathbb{E}[\hat{f}(x)]$ over all possible training sets.
\textbf{Bias Definition:}
\[
\text{Bias}(x) = f_{\text{true}}(x) - \bar{f}_w(x)
\]
\textbf{Key Question:} Is our approach flexible enough to capture $f_{\text{true}}(x)$?
Bias is high when the hypothesis class is unable to capture $f_{\text{true}}(x)$. This happens when the model class is too simple or restrictive to represent the true underlying function.
\subsection{Approximating Generalization Error}
Since true distribution $D$ is unknown, we approximate generalization error using:
\textbf{Validation Set:} Hold-out data to estimate $R(h) \approx \hat{R}_{val}(h)$
\textbf{Cross-Validation:} Average over multiple train/validation splits to reduce variance.
The key insight: empirical risk on unseen validation data provides unbiased estimate of generalization error.
\subsection{Test Error Definition}
\textbf{Test Error:} Performance on final held-out test set, used only once for final evaluation.
\textbf{Continuous Case (Regression):}
\[
\text{Test Error} = \frac{1}{n_{test}}\sum_{i=1}^{n_{test}} (h(x_i^{test}) - y_i^{test})^2
\]
Expanding the sum:
\[
= \frac{1}{n_{test}}\left[ (h(x_1^{test}) - y_1^{test})^2 + (h(x_2^{test}) - y_2^{test})^2 + \ldots + (h(x_{n_{test}}^{test}) - y_{n_{test}}^{test})^2 \right]
\]
\textbf{Discrete Case (Classification):}
\[
\text{Test Error} = \frac{1}{n_{test}}\sum_{i=1}^{n_{test}} \mathbf{1}[h(x_i^{test}) \neq y_i^{test}]
\]
Expanding the indicator sum:
\[
= \frac{1}{n_{test}}\left[ \mathbf{1}[h(x_1^{test}) \neq y_1^{test}] + \mathbf{1}[h(x_2^{test}) \neq y_2^{test}] + \ldots + \mathbf{1}[h(x_{n_{test}}^{test}) \neq y_{n_{test}}^{test}] \right]
\]
\textbf{Key Point:} Test set should only be used once to avoid overfitting to test data.
\subsection{Key Factors}
\begin{itemize}
\item \textbf{Model Complexity:} More parameters $\rightarrow$ higher capacity to overfit
\item \textbf{Training Set Size:} More data $\rightarrow$ better generalization
\item \textbf{Regularization:} Penalty terms reduce overfitting
\item \textbf{Early Stopping:} Stop training before overfitting occurs
\end{itemize}
\end{document}
|