1 files changed, 261 insertions, 0 deletions
diff --git a/2025-06-26/2025-06-26-notes.tex b/2025-06-26/2025-06-26-notes.tex
new file mode 100644
index 0000000..bbf4ed3
--- /dev/null
+++ b/2025-06-26/2025-06-26-notes.tex
@@ -0,0 +1,261 @@
+\documentclass{article}
+\usepackage[utf8]{inputenc}
+\usepackage[margin=1.5cm, top=2cm, bottom=2cm]{geometry}
+\usepackage{amsmath}
+\usepackage{amsfonts}
+\usepackage{amssymb}
+\usepackage{graphicx}
+\usepackage{setspace}
+\setstretch{0.9}
+
+\title{CS229 Notes - June 26, 2025}
+\author{Yuren Hao}
+\date{\today}
+
+\begin{document}
+
+\maketitle
+
+\section{Maximum Likelihood Estimation (MLE)}
+
+Given i.i.d. observations $x_1, x_2, \ldots, x_n$ from $f(x; \theta)$.
+
+\textbf{Likelihood:} $L(\theta) = \prod_{i=1}^{n} f(x_i; \theta)$
+
+\textbf{Log-likelihood:} $\ell(\theta) = \sum_{i=1}^{n} \log f(x_i; \theta)$
+
+\textbf{MLE:} $\hat{\theta}_{MLE} = \arg\max_{\theta} \ell(\theta)$
+
+Solve: $\frac{\partial \ell(\theta)}{\partial \theta} = 0$
+
+\section{Linear Regression - Normal Equation}
+
+Given training data $\{(x_i, y_i)\}_{i=1}^n$ with $x_i \in \mathbb{R}^d$, $y_i \in \mathbb{R}$.
+
+\textbf{RSS:} $RSS(\theta) = \sum_{i=1}^{n} (y_i - \theta^T x_i)^2 = \|y - X\theta\|_2^2$
+
+\textbf{Derivative:} $\frac{\partial RSS(\theta)}{\partial \theta} = -2X^T(y - X\theta)$
+
+\textbf{Normal Equation:} $X^TX\theta = X^Ty$
+
+\textbf{Closed-form solution:} $\hat{\theta} = (X^TX)^{-1}X^Ty$
+
+\section{Gradient Descent for Linear Regression}
+
+\textbf{Cost function:} $J(\theta) = \frac{1}{2m}\sum_{i=1}^{m} (h_\theta(x^{(i)}) - y^{(i)})^2$
+
+\textbf{Gradient:} $\frac{\partial J}{\partial \theta_j} = \frac{1}{m}\sum_{i=1}^{m} (h_\theta(x^{(i)}) - y^{(i)})x_j^{(i)}$
+
+\textbf{Algorithm:}
+\begin{enumerate}
+    \item Initialize $\theta$ randomly
+    \item While not converged:
+    \begin{itemize}
+        \item $\theta_j := \theta_j - \alpha \frac{\partial J}{\partial \theta_j}$ for all $j$
+        \item Check convergence: $|J(\theta^{(t+1)}) - J(\theta^{(t)})| < \epsilon$
+    \end{itemize}
+\end{enumerate}
+
+\textbf{Vectorized update:} $\theta := \theta - \alpha \frac{1}{m} X^T(X\theta - y)$
+
+\subsection{Example: 3 Data Points}
+\textbf{Data:} $(1,1)$, $(2,2)$, $(3,4)$. Model: $h_\theta(x) = \theta_0 + \theta_1 x$
+
+\textbf{Initial:} $\theta_0 = 0$, $\theta_1 = 0$, $\alpha = 0.1$
+
+\textbf{Iteration 1:}
+\begin{itemize}
+    \item $J(\theta) = \frac{1}{6}[(0-1)^2 + (0-2)^2 + (0-4)^2] = 3.5$
+    \item $\frac{\partial J}{\partial \theta_0} = \frac{1}{3}[(-1) + (-2) + (-4)] = -2.33$
+    \item $\frac{\partial J}{\partial \theta_1} = \frac{1}{3}[(-1)(1) + (-2)(2) + (-4)(3)] = -5.67$
+    \item $\theta_0 := 0 - 0.1(-2.33) = 0.233$
+    \item $\theta_1 := 0 - 0.1(-5.67) = 0.567$
+\end{itemize}
+
+\textbf{Iteration 2:} $J(\theta) = 1.26$ (continues until convergence...)
+
+\section{ML Framework \& Loss Functions}
+
+\subsection{General ML Pipeline}
+\textbf{Model} $\rightarrow$ \textbf{Algorithm} $\rightarrow$ \textbf{Estimated Parameters}
+
+\textbf{Predictions} $\rightarrow$ \textbf{Decisions} $\rightarrow$ \textbf{Outcomes}
+
+\textbf{Example:} Linear model $\rightarrow$ Gradient descent $\rightarrow$ $\hat{\theta}$
+
+House price prediction $\rightarrow$ Buy/sell decision $\rightarrow$ Profit/loss
+
+\subsection{Loss Functions}
+\textbf{Regression:}
+\begin{itemize}
+    \item \textbf{MSE:} $L(y, \hat{y}) = (y - \hat{y})^2$
+    \item \textbf{MAE:} $L(y, \hat{y}) = |y - \hat{y}|$
+    \item \textbf{Huber:} $L(y, \hat{y}) = \begin{cases} \frac{1}{2}(y-\hat{y})^2 & \text{if } |y-\hat{y}| \leq \delta \\ \delta|y-\hat{y}| - \frac{1}{2}\delta^2 & \text{otherwise} \end{cases}$
+\end{itemize}
+
+\textbf{Classification:}
+\begin{itemize}
+    \item \textbf{0-1 Loss:} $L(y, \hat{y}) = \mathbf{1}[y \neq \hat{y}]$
+    \item \textbf{Logistic:} $L(y, \hat{y}) = -y\log(\hat{y}) - (1-y)\log(1-\hat{y})$
+    \item \textbf{Hinge:} $L(y, \hat{y}) = \max(0, 1 - y\hat{y})$ (SVM)
+\end{itemize}
+
+\section{Training Loss vs Model Complexity}
+
+\subsection{Bias-Variance Tradeoff}
+\textbf{Low Complexity:} High bias, low variance $\rightarrow$ \textbf{Underfitting}
+
+\textbf{High Complexity:} Low bias, high variance $\rightarrow$ \textbf{Overfitting}
+
+\subsection{Typical Curves}
+\begin{itemize}
+    \item \textbf{Training Error:} Decreases as complexity increases
+    \item \textbf{Validation Error:} U-shaped curve
+    \item \textbf{Optimal Complexity:} Minimum validation error
+\end{itemize}
+
+\begin{figure}[h]
+\centering
+\includegraphics[width=0.8\textwidth]{model-complexity-diagram.png}
+\caption{Model Complexity vs Training/Validation Error}
+\end{figure}
+
+\textbf{Total Error} = Bias$^2$ + Variance + Irreducible Error
+
+\textbf{Regularization:} Controls complexity via penalty terms
+\begin{itemize}
+    \item \textbf{L1 (Lasso):} $\lambda \sum_j |\theta_j|$ (sparse solutions)
+    \item \textbf{L2 (Ridge):} $\lambda \sum_j \theta_j^2$ (smooth solutions)
+\end{itemize}
+
+\section{Generalization Error}
+
+\subsection{Definition}
+\textbf{Generalization Error:} Expected error on unseen data from same distribution
+
+\textbf{Continuous Case:} For regression with squared loss
+\[
+\text{Gen Error} = \mathbb{E}_{(x,y) \sim D}[(h(x) - y)^2] = \int_{x,y} (h(x) - y)^2 p(x,y) \, dx \, dy
+\]
+
+If $p(x,y) = p(y|x)p(x)$, then:
+\[
+= \int_x \left[ \int_y (h(x) - y)^2 p(y|x) \, dy \right] p(x) \, dx
+\]
+
+\textbf{Discrete Case:} For classification with 0-1 loss
+\[
+\text{Gen Error} = \mathbb{E}_{(x,y) \sim D}[\mathbf{1}[h(x) \neq y]] = \sum_{x,y} \mathbf{1}[h(x) \neq y] \cdot p(x,y)
+\]
+
+Expanding the sum:
+\[
+= \sum_x \sum_{y: y \neq h(x)} p(x,y) = \sum_x p(x) \sum_{y: y \neq h(x)} p(y|x)
+\]
+
+Where $D$ is the data distribution, $h$ is the hypothesis
+
+\textbf{Empirical Risk:} Approximation using training data
+
+\textbf{Continuous Case:}
+\[
+\hat{R}(h) = \frac{1}{m}\sum_{i=1}^{m} (h(x_i) - y_i)^2
+\]
+
+\textbf{Discrete Case:}
+\[
+\hat{R}(h) = \frac{1}{m}\sum_{i=1}^{m} \mathbf{1}[h(x_i) \neq y_i]
+\]
+
+\textbf{General Form:}
+\[
+\hat{R}(h) = \frac{1}{m}\sum_{i=1}^{m} L(h(x_i), y_i) \approx \mathbb{E}_{(x,y) \sim D}[L(h(x), y)]
+\]
+
+\textbf{Generalization Gap:} $R(h) - \hat{R}(h)$ where $R(h)$ is true risk
+
+\subsection{Sources of Error}
+The expected prediction error can be decomposed into three fundamental components:
+
+\textbf{Expected Prediction Error:}
+\[
+\mathbb{E}[\text{Error}(x)] = \text{Noise} + \text{Bias}^2 + \text{Variance}
+\]
+
+\textbf{Detailed Breakdown:}
+\begin{itemize}
+    \item \textbf{Noise:} $\sigma^2 = \mathbb{E}[(y - f(x))^2]$ - Irreducible error from data
+    \item \textbf{Bias:} $\text{Bias}[\hat{f}(x)] = \mathbb{E}[\hat{f}(x)] - f(x)$ - Model's systematic error
+    \item \textbf{Variance:} $\text{Var}[\hat{f}(x)] = \mathbb{E}[(\hat{f}(x) - \mathbb{E}[\hat{f}(x)])^2]$ - Model's sensitivity to training data
+\end{itemize}
+
+\textbf{MSE Decomposition:}
+\[
+\text{MSE}(x) = \mathbb{E}[(\hat{f}(x) - y)^2] = \text{Bias}^2[\hat{f}(x)] + \text{Var}[\hat{f}(x)] + \sigma^2
+\]
+
+Expanding the MSE:
+\[
+= (\mathbb{E}[\hat{f}(x)] - f(x))^2 + \mathbb{E}[(\hat{f}(x) - \mathbb{E}[\hat{f}(x)])^2] + \mathbb{E}[(y - f(x))^2]
+\]
+
+\textbf{Experimental Setup for Bias-Variance Analysis:}
+
+We create $N$ different training sets by sampling from the data distribution. Each training set produces estimated model parameters, giving us $N$ different models $\hat{f}_1, \hat{f}_2, \ldots, \hat{f}_N$.
+
+Use the average predictions of all the $N$ estimated models, denoted by $\bar{f}_w(x)$:
+\[
+\bar{f}_w(x) = \frac{1}{N}\sum_{i=1}^{N} \hat{f}_i(x)
+\]
+
+The average fit is akin to the expected prediction $\mathbb{E}[\hat{f}(x)]$ over all possible training sets.
+
+\textbf{Bias Definition:}
+\[
+\text{Bias}(x) = f_{\text{true}}(x) - \bar{f}_w(x)
+\]
+
+\textbf{Key Question:} Is our approach flexible enough to capture $f_{\text{true}}(x)$?
+
+Bias is high when the hypothesis class is unable to capture $f_{\text{true}}(x)$. This happens when the model class is too simple or restrictive to represent the true underlying function.
+
+\subsection{Approximating Generalization Error}
+Since true distribution $D$ is unknown, we approximate generalization error using:
+\textbf{Validation Set:} Hold-out data to estimate $R(h) \approx \hat{R}_{val}(h)$
+\textbf{Cross-Validation:} Average over multiple train/validation splits to reduce variance.
+The key insight: empirical risk on unseen validation data provides unbiased estimate of generalization error.
+
+\subsection{Test Error Definition}
+\textbf{Test Error:} Performance on final held-out test set, used only once for final evaluation.
+
+\textbf{Continuous Case (Regression):}
+\[
+\text{Test Error} = \frac{1}{n_{test}}\sum_{i=1}^{n_{test}} (h(x_i^{test}) - y_i^{test})^2
+\]
+
+Expanding the sum:
+\[
+= \frac{1}{n_{test}}\left[ (h(x_1^{test}) - y_1^{test})^2 + (h(x_2^{test}) - y_2^{test})^2 + \ldots + (h(x_{n_{test}}^{test}) - y_{n_{test}}^{test})^2 \right]
+\]
+
+\textbf{Discrete Case (Classification):}
+\[
+\text{Test Error} = \frac{1}{n_{test}}\sum_{i=1}^{n_{test}} \mathbf{1}[h(x_i^{test}) \neq y_i^{test}]
+\]
+
+Expanding the indicator sum:
+\[
+= \frac{1}{n_{test}}\left[ \mathbf{1}[h(x_1^{test}) \neq y_1^{test}] + \mathbf{1}[h(x_2^{test}) \neq y_2^{test}] + \ldots + \mathbf{1}[h(x_{n_{test}}^{test}) \neq y_{n_{test}}^{test}] \right]
+\]
+
+\textbf{Key Point:} Test set should only be used once to avoid overfitting to test data.
+
+\subsection{Key Factors}
+\begin{itemize}
+    \item \textbf{Model Complexity:} More parameters $\rightarrow$ higher capacity to overfit
+    \item \textbf{Training Set Size:} More data $\rightarrow$ better generalization
+    \item \textbf{Regularization:} Penalty terms reduce overfitting
+    \item \textbf{Early Stopping:} Stop training before overfitting occurs
+\end{itemize}
+
+\end{document} 
+\ No newline at end of file