\documentclass{article}
\usepackage[utf8]{inputenc}
\usepackage[margin=1.5cm, top=2cm, bottom=2cm]{geometry}
\usepackage{amsmath}
\usepackage{amsfonts}
\usepackage{amssymb}
\usepackage{graphicx}
\usepackage{setspace}
\setstretch{0.9}

\title{CS229 Notes - June 26, 2025}
\author{Yuren Hao}
\date{\today}

\begin{document}
\maketitle

\section{Maximum Likelihood Estimation (MLE)}

Given i.i.d. observations $x_1, x_2, \ldots, x_n$ drawn from $f(x; \theta)$.

\textbf{Likelihood:} $L(\theta) = \prod_{i=1}^{n} f(x_i; \theta)$

\textbf{Log-likelihood:} $\ell(\theta) = \sum_{i=1}^{n} \log f(x_i; \theta)$

\textbf{MLE:} $\hat{\theta}_{MLE} = \arg\max_{\theta} \ell(\theta)$

Solve $\frac{\partial \ell(\theta)}{\partial \theta} = 0$ (and check that the stationary point is a maximum).

\section{Linear Regression - Normal Equation}

Given training data $\{(x_i, y_i)\}_{i=1}^n$ with $x_i \in \mathbb{R}^d$, $y_i \in \mathbb{R}$.

\textbf{RSS:} $RSS(\theta) = \sum_{i=1}^{n} (y_i - \theta^T x_i)^2 = \|y - X\theta\|_2^2$

\textbf{Derivative:} $\frac{\partial RSS(\theta)}{\partial \theta} = -2X^T(y - X\theta)$

\textbf{Normal Equation:} $X^TX\theta = X^Ty$

\textbf{Closed-form solution:} $\hat{\theta} = (X^TX)^{-1}X^Ty$, provided $X^TX$ is invertible (i.e., $X$ has full column rank).

\section{Gradient Descent for Linear Regression}

\textbf{Cost function:} $J(\theta) = \frac{1}{2m}\sum_{i=1}^{m} (h_\theta(x^{(i)}) - y^{(i)})^2$

\textbf{Gradient:} $\frac{\partial J}{\partial \theta_j} = \frac{1}{m}\sum_{i=1}^{m} (h_\theta(x^{(i)}) - y^{(i)})x_j^{(i)}$

\textbf{Algorithm:}
\begin{enumerate}
  \item Initialize $\theta$ randomly
  \item While not converged:
  \begin{itemize}
    \item $\theta_j := \theta_j - \alpha \frac{\partial J}{\partial \theta_j}$ simultaneously for all $j$
    \item Check convergence: $|J(\theta^{(t+1)}) - J(\theta^{(t)})| < \epsilon$
  \end{itemize}
\end{enumerate}

\textbf{Vectorized update:} $\theta := \theta - \alpha \frac{1}{m} X^T(X\theta - y)$

\subsection{Example: 3 Data Points}

\textbf{Data:} $(1,1)$, $(2,2)$, $(3,4)$. Model: $h_\theta(x) = \theta_0 + \theta_1 x$

\textbf{Initial:} $\theta_0 = 0$, $\theta_1 = 0$, $\alpha = 0.1$

\textbf{Iteration 1:}
\begin{itemize}
  \item $J(\theta) = \frac{1}{6}[(0-1)^2 + (0-2)^2 + (0-4)^2] = \frac{21}{6} = 3.5$
  \item $\frac{\partial J}{\partial \theta_0} = \frac{1}{3}[(-1) + (-2) + (-4)] \approx -2.33$
  \item $\frac{\partial J}{\partial \theta_1} = \frac{1}{3}[(-1)(1) + (-2)(2) + (-4)(3)] \approx -5.67$
  \item $\theta_0 := 0 - 0.1(-2.33) = 0.233$
  \item $\theta_1 := 0 - 0.1(-5.67) = 0.567$
\end{itemize}

\textbf{Iteration 2:} with $h_\theta(x) = 0.233 + 0.567x$ the predictions are $0.80$, $1.37$, $1.93$, so $J(\theta) = \frac{1}{6}[(-0.20)^2 + (-0.63)^2 + (-2.07)^2] \approx 0.79$ (continues until convergence\ldots)
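The arithmetic above is easy to check numerically. Below is a minimal NumPy sketch (our addition, not part of the original notes; variable names are illustrative) that solves the normal equation and runs the vectorized gradient-descent update on the same three points:

\begin{verbatim}
import numpy as np

# Worked example: x = 1, 2, 3 with targets y = 1, 2, 4.
X = np.array([[1.0, 1.0],
              [1.0, 2.0],
              [1.0, 3.0]])        # first column of ones carries theta_0
y = np.array([1.0, 2.0, 4.0])
m = len(y)

# Normal equation: solve X^T X theta = X^T y directly.
theta_ne = np.linalg.solve(X.T @ X, X.T @ y)
print("normal equation:", theta_ne)            # [-0.6667, 1.5]

# Vectorized gradient descent: theta := theta - alpha/m * X^T (X theta - y).
theta = np.zeros(2)
alpha = 0.1
for t in range(5000):
    grad = X.T @ (X @ theta - y) / m
    theta = theta - alpha * grad
    if t == 0:
        J = np.sum((X @ theta - y) ** 2) / (2 * m)
        print("after iteration 1:", theta, J)  # [0.2333, 0.5667], J ~ 0.785
    if np.linalg.norm(grad) < 1e-10:           # convergence check
        break
print("gradient descent:", theta)              # matches the normal equation
\end{verbatim}

After the first update this reproduces $\theta_0 \approx 0.233$ and $\theta_1 \approx 0.567$, and both methods converge to the least-squares solution $\hat{\theta} = (-2/3,\ 3/2)$.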
\section{ML Framework \& Loss Functions}

\subsection{General ML Pipeline}

\textbf{Model} $\rightarrow$ \textbf{Algorithm} $\rightarrow$ \textbf{Estimated Parameters} $\rightarrow$ \textbf{Predictions} $\rightarrow$ \textbf{Decisions} $\rightarrow$ \textbf{Outcomes}

\textbf{Example:} Linear model $\rightarrow$ Gradient descent $\rightarrow$ $\hat{\theta}$ $\rightarrow$ House price prediction $\rightarrow$ Buy/sell decision $\rightarrow$ Profit/loss

\subsection{Loss Functions}

\textbf{Regression:}
\begin{itemize}
  \item \textbf{MSE:} $L(y, \hat{y}) = (y - \hat{y})^2$
  \item \textbf{MAE:} $L(y, \hat{y}) = |y - \hat{y}|$
  \item \textbf{Huber:} $L(y, \hat{y}) = \begin{cases} \frac{1}{2}(y-\hat{y})^2 & \text{if } |y-\hat{y}| \leq \delta \\ \delta|y-\hat{y}| - \frac{1}{2}\delta^2 & \text{otherwise} \end{cases}$
\end{itemize}

\textbf{Classification:}
\begin{itemize}
  \item \textbf{0-1 Loss:} $L(y, \hat{y}) = \mathbf{1}[y \neq \hat{y}]$
  \item \textbf{Logistic:} $L(y, \hat{y}) = -y\log(\hat{y}) - (1-y)\log(1-\hat{y})$ for $y \in \{0,1\}$ and predicted probability $\hat{y}$
  \item \textbf{Hinge:} $L(y, \hat{y}) = \max(0, 1 - y\hat{y})$ for $y \in \{-1,+1\}$ and score $\hat{y}$ (SVM)
\end{itemize}

\section{Training Loss vs Model Complexity}

\subsection{Bias-Variance Tradeoff}

\textbf{Low Complexity:} High bias, low variance $\rightarrow$ \textbf{Underfitting}

\textbf{High Complexity:} Low bias, high variance $\rightarrow$ \textbf{Overfitting}

\subsection{Typical Curves}
\begin{itemize}
  \item \textbf{Training Error:} Decreases as complexity increases
  \item \textbf{Validation Error:} U-shaped curve
  \item \textbf{Optimal Complexity:} Minimum of the validation error
\end{itemize}

\begin{figure}[h]
  \centering
  \includegraphics[width=0.8\textwidth]{model-complexity-diagram.png}
  \caption{Model Complexity vs Training/Validation Error}
\end{figure}

\textbf{Total Error} = Bias$^2$ + Variance + Irreducible Error

\textbf{Regularization:} Controls complexity via penalty terms
\begin{itemize}
  \item \textbf{L1 (Lasso):} $\lambda \sum_j |\theta_j|$ (sparse solutions)
  \item \textbf{L2 (Ridge):} $\lambda \sum_j \theta_j^2$ (smooth solutions)
\end{itemize}

\section{Generalization Error}

\subsection{Definition}

\textbf{Generalization Error:} Expected error on unseen data drawn from the same distribution, where $D$ denotes the data distribution and $h$ the hypothesis.

\textbf{Continuous Case:} For regression with squared loss,
\[
\text{Gen Error} = \mathbb{E}_{(x,y) \sim D}[(h(x) - y)^2] = \int_{x,y} (h(x) - y)^2 p(x,y) \, dx \, dy
\]
Factoring $p(x,y) = p(y|x)p(x)$:
\[
= \int_x \left[ \int_y (h(x) - y)^2 p(y|x) \, dy \right] p(x) \, dx
\]

\textbf{Discrete Case:} For classification with 0-1 loss,
\[
\text{Gen Error} = \mathbb{E}_{(x,y) \sim D}[\mathbf{1}[h(x) \neq y]] = \sum_{x,y} \mathbf{1}[h(x) \neq y] \cdot p(x,y)
\]
Expanding the sum:
\[
= \sum_x \sum_{y: y \neq h(x)} p(x,y) = \sum_x p(x) \sum_{y: y \neq h(x)} p(y|x)
\]

\textbf{Empirical Risk:} Approximation using training data.

\textbf{Continuous Case:}
\[
\hat{R}(h) = \frac{1}{m}\sum_{i=1}^{m} (h(x_i) - y_i)^2
\]

\textbf{Discrete Case:}
\[
\hat{R}(h) = \frac{1}{m}\sum_{i=1}^{m} \mathbf{1}[h(x_i) \neq y_i]
\]

\textbf{General Form:}
\[
\hat{R}(h) = \frac{1}{m}\sum_{i=1}^{m} L(h(x_i), y_i) \approx \mathbb{E}_{(x,y) \sim D}[L(h(x), y)]
\]

\textbf{Generalization Gap:} $R(h) - \hat{R}(h)$, where $R(h)$ is the true risk.
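The gap is easy to see in a small simulation. The sketch below (our addition; the toy distribution $y = \sin(3x) + \varepsilon$ and the sample sizes are assumed purely for illustration) fits polynomials of increasing degree to a small training set and uses a large fresh sample from the same distribution as a stand-in for the true risk $R(h)$:

\begin{verbatim}
import numpy as np

rng = np.random.default_rng(0)

def sample(n):
    """Draw n pairs (x, y) from the same distribution D."""
    x = rng.uniform(-1.0, 1.0, n)
    y = np.sin(3.0 * x) + rng.normal(0.0, 0.3, n)  # assumed f_true(x) = sin(3x)
    return x, y

def risk(coeffs, x, y):
    # Empirical risk under squared loss.
    return np.mean((np.polyval(coeffs, x) - y) ** 2)

x_tr, y_tr = sample(20)          # small training set
x_new, y_new = sample(100000)    # large fresh sample: proxy for R(h)

for degree in (1, 3, 9):
    h = np.polyfit(x_tr, y_tr, degree)      # least-squares polynomial fit
    print(degree,
          risk(h, x_tr, y_tr),              # R_hat(h): training risk
          risk(h, x_new, y_new))            # ~R(h): generalization error
\end{verbatim}

As the degree grows, the training risk keeps shrinking while the held-out risk typically rises again, so the gap $R(h) - \hat{R}(h)$ widens; this is the U-shaped validation curve from the previous section.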
\subsection{Sources of Error}

The expected prediction error can be decomposed into three fundamental components:

\textbf{Expected Prediction Error:}
\[
\mathbb{E}[\text{Error}(x)] = \text{Noise} + \text{Bias}^2 + \text{Variance}
\]

\textbf{Detailed Breakdown:}
\begin{itemize}
  \item \textbf{Noise:} $\sigma^2 = \mathbb{E}[(y - f(x))^2]$, the irreducible error in the data
  \item \textbf{Bias:} $\text{Bias}[\hat{f}(x)] = \mathbb{E}[\hat{f}(x)] - f(x)$, the model's systematic error
  \item \textbf{Variance:} $\text{Var}[\hat{f}(x)] = \mathbb{E}[(\hat{f}(x) - \mathbb{E}[\hat{f}(x)])^2]$, the model's sensitivity to the training data
\end{itemize}

\textbf{MSE Decomposition:}
\[
\text{MSE}(x) = \mathbb{E}[(\hat{f}(x) - y)^2] = \text{Bias}^2[\hat{f}(x)] + \text{Var}[\hat{f}(x)] + \sigma^2
\]
Expanding the MSE:
\[
= (\mathbb{E}[\hat{f}(x)] - f(x))^2 + \mathbb{E}[(\hat{f}(x) - \mathbb{E}[\hat{f}(x)])^2] + \mathbb{E}[(y - f(x))^2]
\]

\textbf{Experimental Setup for Bias-Variance Analysis:}

We create $N$ different training sets by sampling from the data distribution. Each training set produces estimated model parameters, giving $N$ different models $\hat{f}_1, \hat{f}_2, \ldots, \hat{f}_N$. The average prediction of the $N$ estimated models, denoted $\bar{f}_w(x)$, is
\[
\bar{f}_w(x) = \frac{1}{N}\sum_{i=1}^{N} \hat{f}_i(x)
\]
This average fit approximates the expected prediction $\mathbb{E}[\hat{f}(x)]$ over all possible training sets.

\textbf{Bias Definition:}
\[
\text{Bias}(x) = f_{\text{true}}(x) - \bar{f}_w(x)
\]
(The sign here is opposite to the convention above, but only $\text{Bias}^2$ enters the decomposition, so the two agree.)

\textbf{Key Question:} Is our approach flexible enough to capture $f_{\text{true}}(x)$? Bias is high when the hypothesis class is too simple or restrictive to represent the true underlying function.

\subsection{Approximating Generalization Error}

Since the true distribution $D$ is unknown, we approximate the generalization error using:

\textbf{Validation Set:} Hold-out data to estimate $R(h) \approx \hat{R}_{val}(h)$

\textbf{Cross-Validation:} Average over multiple train/validation splits to reduce the variance of the estimate.

The key insight: for a fixed hypothesis $h$, the empirical risk on unseen validation data is an unbiased estimate of the generalization error.

\subsection{Test Error Definition}

\textbf{Test Error:} Performance on the final held-out test set, used only once for the final evaluation.

\textbf{Continuous Case (Regression):}
\[
\text{Test Error} = \frac{1}{n_{test}}\sum_{i=1}^{n_{test}} (h(x_i^{test}) - y_i^{test})^2
\]
Expanding the sum:
\[
= \frac{1}{n_{test}}\left[ (h(x_1^{test}) - y_1^{test})^2 + (h(x_2^{test}) - y_2^{test})^2 + \ldots + (h(x_{n_{test}}^{test}) - y_{n_{test}}^{test})^2 \right]
\]

\textbf{Discrete Case (Classification):}
\[
\text{Test Error} = \frac{1}{n_{test}}\sum_{i=1}^{n_{test}} \mathbf{1}[h(x_i^{test}) \neq y_i^{test}]
\]
Expanding the indicator sum:
\[
= \frac{1}{n_{test}}\left[ \mathbf{1}[h(x_1^{test}) \neq y_1^{test}] + \mathbf{1}[h(x_2^{test}) \neq y_2^{test}] + \ldots + \mathbf{1}[h(x_{n_{test}}^{test}) \neq y_{n_{test}}^{test}] \right]
\]

\textbf{Key Point:} The test set should be used only once, to avoid overfitting to the test data.

\subsection{Key Factors}
\begin{itemize}
  \item \textbf{Model Complexity:} More parameters $\rightarrow$ higher capacity to overfit
  \item \textbf{Training Set Size:} More data $\rightarrow$ better generalization
  \item \textbf{Regularization:} Penalty terms reduce overfitting
  \item \textbf{Early Stopping:} Stop training before overfitting occurs (see the sketch below)
\end{itemize}
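Early stopping is mechanical enough to sketch in code. The loop below (our addition; the toy data, degree-9 polynomial features, step size, and patience value are all assumed for illustration) runs gradient descent on the training loss while monitoring the validation error, and keeps the best parameters seen so far:

\begin{verbatim}
import numpy as np

rng = np.random.default_rng(1)

# Toy data from an assumed distribution, split into train/validation.
x = rng.uniform(-1.0, 1.0, 60)
y = np.sin(3.0 * x) + rng.normal(0.0, 0.3, 60)
Phi = np.vander(x, 10)                     # degree-9 polynomial features
Phi_tr, Phi_val = Phi[:40], Phi[40:]
y_tr, y_val = y[:40], y[40:]

theta = np.zeros(Phi.shape[1])
alpha, patience = 0.05, 200
best_val, best_theta, since_best = np.inf, theta.copy(), 0

for step in range(50000):
    # One batch gradient step on the training loss.
    grad = Phi_tr.T @ (Phi_tr @ theta - y_tr) / len(y_tr)
    theta -= alpha * grad
    # Monitor validation error and remember the best parameters.
    val = np.mean((Phi_val @ theta - y_val) ** 2)
    if val < best_val:
        best_val, best_theta, since_best = val, theta.copy(), 0
    else:
        since_best += 1
    if since_best >= patience:             # no recent improvement: stop early
        break

theta = best_theta                         # early-stopped parameters
print("stopped after", step + 1, "steps; validation MSE", best_val)
\end{verbatim}

The early-stopped $\theta$ behaves like an implicitly regularized solution: continuing to train would keep reducing the training error, but not the validation error.

\end{document}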