\documentclass[10pt]{article}
\usepackage{amsfonts}
\usepackage{amsmath}
\usepackage{amsthm}
\usepackage{graphicx}
\usepackage{ulem}
\usepackage{framed}
\usepackage{microtype}
\usepackage{booktabs}
\usepackage{subcaption} % subfigure is deprecated; subcaption is its modern replacement
\usepackage{tabularx}
\usepackage[usenames,dvipsnames]{xcolor}
\usepackage{hyperref} % hyperref loads last, except for cleveref below
\usepackage[capitalise,noabbrev]{cleveref}
\newcommand{\theHalgorithm}{\arabic{algorithm}}
\usepackage{algorithm}
\usepackage[noend]{algpseudocode}
\usepackage[letterpaper, margin=1in]{geometry}
\sloppy
\usepackage[T1]{fontenc}
\usepackage{mathpazo} % Palatino text with matching math, instead of the raw \familydefault=ppl hack
\newcommand{\bx}{{\boldsymbol x}}
\newcommand{\bh}{{\boldsymbol h}}
\newcommand{\bw}{{\boldsymbol w}}
\newcommand{\bK}{\mathbf{K}}
\newcommand{\bmu}{{\boldsymbol \mu}}
\newcommand{\balpha}{{\boldsymbol \alpha}}
\newcommand{\bbeta}{{\boldsymbol \beta}}
\newcommand{\btheta}{{\boldsymbol \theta}}
\DeclareMathOperator*{\argmax}{arg\,max}
\DeclareMathOperator*{\argmin}{arg\,min}
\DeclareMathOperator*{\AND}{AND}
\DeclareMathOperator*{\OR}{OR}
\DeclareMathOperator*{\XOR}{XOR}
\DeclareMathOperator*{\sign}{sign}
\newcommand{\solution}[1]{{\color{PineGreen} \textbf{Solution:} #1}}
\newcommand{\todo}[1]{\textbf{\textcolor{MidnightBlue}{to do: #1}} }
\begin{document}
\section*{Machine Learning Fall 2015 Homework 3}
Make sure to explain your reasoning or show your derivations. Except for answers that are especially straightforward, you will lose points for unjustified answers, even if they are correct.
\section*{General Instructions}
Submit your homework electronically on Canvas. We recommend using LaTeX, especially for the written problems. But you are welcome to use anything as long as it is neat and readable.
Include a README file that describes all other included files. Indicate which files you modified. You are welcome to create additional functions, files, or scripts, and you are also welcome to modify the included interfaces for existing functions if you prefer a different organization.
Since we will work on some of the homework in class, clearly indicate which parts of your submitted homework are work done in class and which are your own work.
Relatedly, cite all outside sources of information and ideas. \textbf{List any students you discussed the homework with.}
\section*{Written Problems}
\begin{enumerate}
\item Machine learning methods can be viewed as function estimators. Consider the logical functions AND, OR, and XOR. Using a signed representation for Boolean variables, where input and output variables are in $\{+1, -1\}$, these functions are defined as
\begin{align}
\AND(x_1, x_2) &= \begin{cases}
+1 & \textrm{if}~ x_1 = +1 \wedge x_2 = +1\\
-1 & \textrm{otherwise}
\end{cases}\\
\OR(x_1, x_2) &= \begin{cases}
+1 & \textrm{if}~ x_1 = +1\\
+1 & \textrm{if}~ x_2 = +1\\
-1 & \textrm{otherwise}
\end{cases}\\
\XOR(x_1, x_2) &= \begin{cases}
+1 & \textrm{if}~ x_1 = +1 \wedge x_2 = -1\\
+1 & \textrm{if}~ x_2 = +1 \wedge x_1 = -1\\
-1 & \textrm{otherwise}
\end{cases}
\end{align}
\begin{enumerate}
\item (4 points) Which of these three logical functions can be expressed as a linear classifier of the form
\begin{align}
f(\bx; \bw) = \sign(w_1 x_1 + w_2 x_2 + b),
\end{align}
where applicable? For each such function, give weights $w_1$, $w_2$, and a bias value $b$ that mimic the logical function. Which of these functions cannot be expressed as a linear classifier?
\item (5 points) For any logical function that cannot be expressed as a linear classifier, show how it can be expressed as a two-layered perceptron of the form
\begin{align}
f(\bx; \bw) &= \sign(\bw^{\mathrm{out}\top} \bh + b^{\mathrm{out}})\\
\bh &= [h_1, h_2]^\top\\
h_1 &= \sign(\bw^{(1)\top} \bx + b^{(1)})\\
h_2 &= \sign(\bw^{(2)\top} \bx + b^{(2)})
\end{align}
\end{enumerate}
\item Derive the dual support vector machine (SVM) from first principles. To simplify the work, we will do the derivation without a bias and assuming the data is separable. The classification function is
\begin{align}
f(\bx; \bw) = \sign(\bw^\top \bx)
\label{eq:svmPredictor}
\end{align}
For each data point $\bx_i$, a constraint that ensures that the point is correctly classified with a margin is
\begin{align}
y_i \bw^\top \bx_i - 1 \ge 0.
\label{eq:marginConstraint}
\end{align}
A vector $\bw$ that satisfies all these constraints for $i \in \{1, \ldots, n\}$ perfectly classifies all the training data.
\begin{enumerate}
\item (5 points) For a vector $\bw$ and for two data examples $\bx_+$ and $\bx_-$ whose labels are respectively $y_+ = +1$ and $y_- = -1$, what is the smallest possible distance between the points $\sqrt{(\bx_+ - \bx_-)^\top (\bx_+ - \bx_-)}$ if the two points satisfy the margin constraint, \cref{eq:marginConstraint}? Hint: the closest possible points will be aligned along the normal vector of the decision boundary, so $\bx_+ = \bx_- + \gamma \bw$ for some $\gamma$.
\item (3 points) The answer you derive in part (a) should reveal why the primal max-margin SVM objective is
\begin{align}
\argmin_{\bw} &~~ \frac{1}{2} \bw^\top \bw \\
\mathrm{s.t.} &~~ y_i \bw^\top \bx_i - 1 \ge 0, ~\forall i \in \{1, \ldots, n\}\nonumber
\end{align}
Convert this constrained optimization into a $\min \max$ of a Lagrangian function $L(\bw, \balpha)$ by using Lagrange multipliers to penalize violations of the inequality constraints. For consistency, use $\balpha = [\alpha_1, \ldots, \alpha_n]^\top$ as the symbol for your Lagrange multipliers. Write the saddle-point optimization over $\bw$ and $\balpha$, and indicate which variables are being minimized over and which are being maximized over. Your optimizations should be over $\bw \in {\mathbb R}^d$ and $\balpha \in {\mathbb R}^n_+$, where $\mathbb{R}_+$ is the set of nonnegative real numbers. Hint: be careful about the sign of the Lagrange multipliers and the constraint functions. Plug in some values to make sure the ``adversary'' can penalize constraint violations with nonnegative $\alpha$s.
\item (5 points) Reversing the original outer minimization and the inner maximization, we obtain the \textit{dual} objective function. We also obtain an inner minimization that has a closed form solution. For a fixed vector of Lagrange multipliers $\balpha$, what is the value of $\bw$ that minimizes the Lagrangian function?
\item (6 points) Plug in your solution to $\bw$ and simplify the new objective function. You should obtain a quadratic program over the $\balpha$ variables where all instances of the data vectors $\bx$ occur in inner products of the form $\bx_i^\top \bx_j$. Hint: an easier way to manage these equations is to substitute your solution for one of the $\bw$ vectors in the $\bw^\top \bw$ term first, then cancel out as many terms as you can before plugging your solution into any remaining $\bw$ terms. For example, if you derived $\bw = \textrm{blah}$, start by simplifying the expression $\bw^\top (\textrm{blah})$ and canceling with any similar terms from your Lagrangian.
\item (2 points) Finally, replace all inner products of the form $\bx_i^\top \bx_j$ with a kernel function $K(\bx_i, \bx_j)$. Write the kernelized dual SVM optimization.
\end{enumerate}
\end{enumerate}
\section*{Programming Assignment}
For this programming assignment, we have provided a lot of starter code. Your tasks will be to complete the code in a few specific places, which will require you to read and understand most of the provided code, but will only require you to write a small amount of code yourself.
In \texttt{syntheticData.mat}, there are 10 synthetic, two-dimensional data sets. Only the first data set is linearly classifiable. We provided the main experiment script \texttt{runSyntheticExperiments.m} for you to use. For each model, the script uses cross-validation on the training data to choose learning parameters, trains on the full training set using those parameters, and evaluates the accuracy on the test set.
\begin{enumerate}
\item (8 points) Complete the backpropagation steps in \texttt{mlpObjective.m} for the multi-layered perceptron. Specifically, you will need to compute the gradient for the output-layer weights, then the error and gradients for the middle-layer weights, and finally the error and gradient for the input layer. Once you complete this step, the script \texttt{checkMLPDerivative.m} should run and report that your derivative is \textit{close} to the numerical approximation. (It won't pass the hard-coded 1e-6 threshold, so it will crash. But if the error is around 1e-6 in magnitude, it is correct.) Once the gradient computation is correct, the experiment using multi-layered perceptron on the synthetic data sets should run.
\item (8 points) Complete the kernel SVM \texttt{kernelSvmTrain.m} and \texttt{kernelSvmPredict.m}. In each function, there is a block of code for you to fill in.
In \texttt{kernelSvmTrain}, set up the quadratic program to compute the dual SVM objective. We have provided the call to the quadprog function, the processing of the output alpha values, and the storage of the support vectors into the model struct.
In \texttt{kernelSvmPredict}, compute the kernelized prediction score directly from the Gram matrix, the alpha dual variables, the support vector labels, and the bias.
Once you complete this step, the linear kernel dual SVM should work.
\item (5 points) Complete the polynomial kernel function \texttt{polynomialKernel.m}. To see how a kernel function should work, look at \texttt{linearKernel.m} as a template. Calling \texttt{polynomialKernel(X, Y, 1)} should produce the exact same output as \texttt{linearKernel(X, Y)}. Once you complete this step and the previous step, the polynomial kernel experiment should run.
\item (5 points) Complete the Gaussian radial basis function kernel function \texttt{rbfKernel.m}. Make sure to use the trick for computing all pairwise distances between two sets of points that we used in the last homework.
\item (4 points) Once all four methods are working, run the experiment script in publish mode, which will iterate through each data set and plot the learned decision boundaries for each data set and each model type. Write a short report (one paragraph is sufficient, but feel free to write more) about the results. Which methods worked best on these synthetic data sets? Is there a pattern to when certain methods work well or not?
\end{enumerate}
\begin{table}[htbp]
\caption{Included files and brief descriptions. Files whose descriptions indicate, in bold, that \textbf{you will complete} them are placeholders for code you should write. You should not need to implement or modify any of the other files.}
\label{tab:files}
\begin{center}
\renewcommand{\arraystretch}{1.5}
\begin{tabularx}{\textwidth}{lX}
\toprule
\textbf{File Path} & \textbf{Description}\\
\midrule
\textbf{syntheticData.mat} & Synthetic data\\
\textbf{src/checkMLPDerivative.m} & Script to check the gradient computation of multi-layer perceptron\\
\textbf{src/crossValidate.m} & Wrapper to run cross validation \\
\textbf{src/kernelSvmPredict.m} & Function \textbf{you will complete} that predicts labels using kernel SVM\\
\textbf{src/kernelSvmTrain.m} & Function \textbf{you will complete} that trains a kernel SVM\\
\textbf{src/linearKernel.m} & Function that computes the linear kernel between two sets of data\\
\textbf{src/logistic.m} & Function that computes the logistic of each dimension of its input\\
\textbf{src/mlpFlatObjective.m} & Wrapper function to help gradient checking of multi-layered perceptron\\
\textbf{src/mlpObjective.m} & Function \textbf{you will complete} that computes the regularized loss and gradient for a multi-layered perceptron\\
\textbf{src/mlpPredict.m} & Function that performs forward-propagation to predict labels using a multi-layered perceptron\\
\textbf{src/mlpTrain.m} & Function that runs vanilla gradient descent to train the weights of a multi-layered perceptron\\
\textbf{src/nll.m} & Function that computes the negative log likelihood of Bernoulli probabilities given the true labels and its gradient\\
\textbf{src/plotData.m} & Function that plots 2D, binary-class data\\
\textbf{src/plotSurface.m} & Function that plots the decision boundary for 2D inputs\\
\textbf{src/polynomialKernel.m} & Function \textbf{you will complete} that computes the polynomial kernel matrix between two sets of data points\\
\textbf{src/rbfKernel.m} & Function \textbf{you will complete} that computes the Gaussian radial-basis function kernel between two sets of data points\\
\textbf{src/runSyntheticExperiments.m} & Main experiment script that runs all four models on all ten data sets\\
\bottomrule
\end{tabularx}
\end{center}
\end{table}
\end{document}