\documentclass[11pt]{article}

\evensidemargin 0cm
\oddsidemargin 0cm

\topmargin -.94cm
\textwidth 16.0cm
\textheight 24.6cm

\usepackage[dvips]{epsfig}
%\usepackage{nips}
\usepackage{wrapfig}
\usepackage{subfigure}
\usepackage{graphics}
\usepackage{url}
\usepackage{natbib}
\usepackage{algorithm}
\usepackage{algorithmic}
\usepackage{rotate}
\usepackage{latexsym}

\usepackage{bbold}
\newcommand{\real}{\mathbb R}
\newcommand{\iden}{\mathbb 1}

\input xy
\xyoption{all}
\xyoption{dvips}
\xyoption{color}
%\input{../xyps-col.tex}

\newcommand{\scalefac}{1}
\newcommand{\comment}[1]{}

%\newcommand{\proof}[1]{\vspace*{5mm} \noindent {{\small \bf Proof.} \small #1}%
%\vspace*{3mm}}
%\newcommand{\example}[2]{\vspace*{3mm} \noindent {\bf Example #1.} #2%
%\vspace*{1mm}}

\newcommand{\onder}[2]{#1_{\mbox{\rm \scriptsize #2}}}
\newcommand{\boven}[2]{#1^{\mbox{\rm \scriptsize #2}}}
\newcommand{\Xmap}{\boven{X}{map}}
\newcommand{\mycite}[1]{(\cite{#1})}
\newcommand{\myciteyear}[1]{(\citeyear{#1})}
\newcommand{\qold}{\boven{q}{old}}
\newcommand{\Pold}{\boven{P}{old}}
\newcommand{\qnew}{\boven{q}{new}}
\newcommand{\Pnew}{\boven{P}{new}}
\newcommand{\mnew}{\boven{\mu}{new}}
\newcommand{\bgamma}{\bar{\gamma}}
\newcommand{\blambda}{\bar{\lambda}}
\newcommand{\hlambda}{\hat{\lambda}}
\newcommand{\hbeta}{\hat{\beta}}
\newcommand{\hy}{\hat{y}}
\newcommand{\he}{\hat{e}}
\newcommand{\hsigma}{\hat{\sigma}}
\newcommand{\halpha}{\hat{\alpha}}
\newcommand{\blnew}{\boven{\bar{\lambda}}{new}}
\newcommand{\tlambda}{\tilde{\lambda}}
\newcommand{\bmu}{\bar{\mu}}
\newcommand{\tmu}{\tilde{\mu}}
\newcommand{\tm}{\tilde{m}}
\newcommand{\ttheta}{\tilde{\theta}}
\newcommand{\hm}{\hat{m}}
\newcommand{\htheta}{\hat{\theta}}
\newcommand{\KL}{\mbox{KL}}
\newcommand{\ev}[1]{\left<#1\right>}
\newcommand{\collapse}{\mathop{\rm Collapse}\limits}
\newcommand{\argmax}{\mathop{\rm argmax}\limits}
\newcommand{\argmin}{\mathop{\rm argmin}\limits}
\newcommand{\cmin}{\mathop{\rm min'}\limits}
\newcommand{\cmax}{\mathop{\rm max'}\limits}
\newcommand{\ninp}{\onder{n}{inp}}
\newcommand{\Tr}{\mbox{Tr~}}
\newcommand{\vc}[1]{{\bf #1}}
\newcommand{\vb}[1]{\mbox{\boldmath $#1$}}
\newcommand{\vs}[1]{\mbox{\boldmath \scriptsize $#1$}}
\newcommand{\sss}[1]{\underline{#1}}
\newcommand{\stheta}{\sss{\theta}}
\newcommand{\vx}{\vc{x}}
\newcommand{\sx}{\sss{x}}
\newcommand{\se}{\sss{e}}
\newcommand{\sy}{\sss{y}}
\newcommand{\sz}{\sss{z}}
\newcommand{\spi}{\sss{\pi}}
\newcommand{\svx}{\sss{\vx}}
\newcommand{\vX}{\vc{X}}
\newcommand{\sX}{\sss{X}}
\newcommand{\bX}{\bar{X}}
\newcommand{\by}{\bar{y}}
\newcommand{\bx}{\bar{x}}
\newcommand{\vy}{\vc{y}}
\newcommand{\vt}{\vc{t}}
\newcommand{\vY}{\vc{Y}}
\newcommand{\vz}{\vc{z}}
\newcommand{\vf}{\vc{f}}
\newcommand{\vg}{\vc{g}}
\newcommand{\vm}{\vc{m}}
\newcommand{\vepsilon}{\vc{\epsilon}}
\newcommand{\vmmin}{\vm^{-}}
\newcommand{\vmnul}{\vm^{0}}
\newcommand{\vmplus}{\vm^{+}}
\newcommand{\vM}{\vc{M}}
\newcommand{\hvm}{\hat{\vc{m}}}
\newcommand{\vtheta}{\mbox{\boldmath $\theta$}}
\newcommand{\vT}{\mbox{\boldmath $\Theta$}}
\newcommand{\vxi}{\mbox{\boldmath $\xi$}}
\newcommand{\eye}{{\mathbb 1}}
\newcommand{\zero}{{\mathbb 0}}
\newcommand{\tP}{\tilde{P}}
\newcommand{\hP}{\hat{P}}
\newcommand{\hp}{p}
%\newcommand{\hp}{\hat{p}}
\newcommand{\pold}{\boven{\hp}{old}}
\newcommand{\hV}{\hat{V}}
\newcommand{\ee}{\mbox{e}}
\newcommand{\evnul}[1]{\ev{#1}_{q_t}}
\newcommand{\evbac}[1]{\ev{#1}_{\hp_t}}
\newcommand{\evfor}[1]{\ev{#1}_{\hp_{t+1}}}
\newcommand{\valpha}{\vb{\alpha}}
\newcommand{\vmu}{\vb{\mu}}
\newcommand{\hmu}{\hat{\mu}}
\newcommand{\vbeta}{\vb{\beta}}
\newcommand{\hvbeta}{\hat{\vbeta}}
\newcommand{\hvtheta}{\hat{\vtheta}}
\newcommand{\vgamma}{\vb{\gamma}}
\newcommand{\vGamma}{\vb{\Gamma}}
\newcommand{\vdelta}{\vb{\delta}}
\newcommand{\vDelta}{\vb{\Delta}}
\newcommand{\vsalpha}{\vs{\alpha}}
\newcommand{\vsbeta}{\vs{\beta}}
\newcommand{\vsgamma}{\vs{\gamma}}
\newcommand{\vsdelta}{\vs{\delta}}
\newcommand{\talpha}{\tilde{\valpha}}
\newcommand{\tbeta}{\tilde{\vbeta}}
\newcommand{\tgamma}{\tilde{\vgamma}}
\newcommand{\tdelta}{\tilde{\vdelta}}
\newcommand{\dalpha}{\partial \alpha}
\newcommand{\dbeta}{\partial \beta}
\newcommand{\dgamma}{\partial \gamma}
\newcommand{\ddelta}{\partial \delta}

\newcommand{\evv}[1]{\ev{\ev{#1}}}
\newcommand{\evvnul}[1]{\evv{#1}_{q_t}}
\newcommand{\evvbac}[1]{\evv{#1}_{\hp_t}}
\newcommand{\evvfor}[1]{\evv{#1}_{\hp_{t+1}}}

\newcommand{\gnew}{\boven{\gamma}{new}}
\newcommand{\lnew}{\boven{\lambda}{new}}
\newcommand{\gold}{\boven{\gamma}{old}}
\newcommand{\lold}{\boven{\lambda}{old}}
\newcommand{\dnew}{\boven{\vdelta}{new}}
\newcommand{\dold}{\boven{\vdelta}{old}}
\newcommand{\anew}{\boven{\valpha}{new}}
\newcommand{\aold}{\boven{\valpha}{old}}
\newcommand{\bnew}{\boven{\vbeta}{new}}
\newcommand{\bold}{\boven{\vbeta}{old}}

\newcommand{\Var}{\mbox{Var}}
\newcommand{\RSS}{\mbox{RSS}}
\newcommand{\lik}{\mbox{lik}}
\newcommand{\TrainError}{\mbox{TE}}
\newcommand{\PredError}{\mbox{PE}}
\newcommand{\ValError}{\mbox{VE}}

\newenvironment{proof}{\begin{quotation} {\it Proof.\ }}
			{$\Box$ \end{quotation}}
\newtheorem{proposition}{Proposition}


\begin{document}

\section*{Exercise 6: Classification Problems}

In this exercise you will be asked to develop several tools relevant
for classification problems. The data set that will guide us concerns
the classification of radar returns from the ionosphere.
It can be downloaded from
{\verb#http://www.mbfys.kun.nl/~tom/files/ionosphere.dat#}.

This radar data was collected by a system in Goose Bay, Labrador.  This
system consists of a phased array of 16 high-frequency antennas with a
total transmitted power on the order of 6.4 kilowatts.
The targets were free electrons in the ionosphere.
``Good'' radar returns are those showing evidence of some type of structure
in the ionosphere.  ``Bad'' returns are those that do not; their signals pass
through the ionosphere.

Received signals were processed using an autocorrelation function whose
arguments are the time of a pulse and the pulse number.  There were 17
pulse numbers for the Goose Bay system.  Instances in this database are
described by 2 attributes per pulse number, corresponding to the complex
values returned by the function resulting from the complex electromagnetic
signal.

The database consists of 351 instances (rows in the file), each made up
of 34 inputs (the first 34 columns) and 1 output (the last column)
specifying good (1) or bad (0) returns. We will use the first 200
instances for fitting different models. The remaining 151 instances
will be used for independent testing of the performance of these models.

\subsection*{Principal Component Analysis}

It is difficult to visualize the data in 34 dimensions. A
popular method for dimension reduction is ``principal component analysis''.
The basic idea is to make a linear projection $\vx'$ of the original
$\vx$, i.e.,
\[
\vx' = Q \vx \: ,
\]
such that we keep as much ``information'' as we can. For visualization
purposes, $\vx'$ is usually taken to be two-dimensional, that is, we will
make a projection from 34 to 2 dimensions and the projection matrix $Q$
is 2 by 34. Now it can be argued that the best projection matrix consists
of (any linear combination of) the two eigenvectors of the 34 by 34
input covariance matrix $C$ with components
\[
C_{kl} = {1 \over n-1} \sum_{i=1}^n (x_{ki} - \bar{x}_k)
(x_{li} - \bar{x}_l) \: ,
\]
that correspond to the largest eigenvalues. (Note that if the different
inputs correspond to different entities, it would have been better to
first rescale all inputs to have mean zero and, for example, standard
deviation 1.)

\begin{description}
\item[a.] Build a program called {\tt pca.m} that takes as its first
argument an $\onder{n}{inp} \times n$ matrix of input values {\tt x} and as
its second argument the number {\tt k} of principal components (2 for the
above example), and returns the projection of {\tt x} (called $\vx'$ above)
onto the first {\tt k} principal components.
\end{description}

\begin{description}
\item[b.] Plot the projections onto the first two principal directions against
each other. Use different symbols for the ``good'' and ``bad'' radar
returns. You can use all 351 instances. The result should look like
Figure~1.
\end{description}

\begin{figure}[h]
\begin{center}
\epsfig{file=pcaplot.eps,width=0.5\textwidth}
\end{center}
\label{Principal component plot.}
\end{figure}

\subsection*{Logistic Regression}

We will first apply logistic regression based on the two-dimensional
projections of the original inputs. This enables us to visualize the
results. Below we will apply logistic regression on the unprojected
original inputs. Here and in the following, build your models on the
first 200 instances and use the remaining 151 instances for testing.

You can use the function {\tt solve\_log.m}, that calls the function
{\tt error\_log}, for applying logistic regression. Both can be downloaded
from the course website:\\
{\verb#http://www.mbfys.kun.nl/~tom/files/solve_log.m#} and\\
{\verb#http://www.mbfys.kun.nl/~tom/files/error_log.m#}.


\begin{description}
\item[c.] Use the function {\tt solve\_log.m}
(with {\tt lambda} set to zero) to solve the logistic regression problem
on the first 200 instances. What is the percentage of incorrectly
classified training/test instances?
\end{description}

\begin{description}
\item[d.] Compute the decision boundary and plot it in the figure made
under~(b).
\end{description}

\subsection*{Nearest Neighbor Classification}

Logistic regression is just one tool for solving classification problems.
Another popular and relatively straightforward one is nearest neighbor
classification. The idea is as follows. Given a data set of $n$ instances
and a new input vector $\vx$, find the input $\vx_i$ closest to the
$\vx$ and classify $\vx$ as $y_i$. The straightforward generalization is
$k$-nearest neighbor classification: find the $k$ input vectors closest
to $\vx$ and use majority voting. For example, if $k=3$, the new vector
is classified as ``good'' if two or more of the three closest inputs have
class ``good'' and classified as ``bad'' otherwise.

The function to be used is called {\tt knnclass.m}, which can be downloaded
from\\ {\verb#http://www.mbfys.kun.nl/~tom/files/knnclass.m#}.
It takes as inputs {\tt xtest} (an $\onder{n}{inp} \times
\onder{n}{test}$ matrix of test inputs),
{\tt xtrain} (an $\onder{n}{inp} \times \onder{n}{train}$ matrix of
training inputs), {\tt ytrain} (an $1 \times \onder{n}{train}$ vector
of corresponding outputs), and {\tt k} (the number of
nearest neighbors) and returns {\tt ypred} (an $1 \times \onder{n}{test}$
vector of predicted outputs).

\begin{description}
\item[e.] Compute and plot the classification error (percentage misclassified
instances) for $k$-nearest neighbor classification for different $k$, e.g.,
ranging from 1 to 29 in steps of 2 (why?). Present the performance on
both the training and the test set in one plot. What would you say is the
the optimal number of nearest neighbors in this case? 
\end{description}

\begin{description}
\item[f.] Visualize the best solution obtained with the nearest neighbor
algorithm. For example, take a grid of points and classify each of them
using the nearest neighbor classifier and give different colors to the
points that are classified as ``good'' and ``bad''. You should get something
like Figure~2 (here only the grid points classified as ``good'' are colored).
Do the same for the 1-nearest neighbor classifier. Can you explain the
difference?
\end{description}

\begin{figure}[h]
\begin{center}
\epsfig{file=knnplot.eps,width=0.5\textwidth}
\end{center}
\label{Classification according to a $k$-nearest neighbor classifier.}
\end{figure}

\subsection*{Classification Based on the Raw Inputs}

Now we go back to the original data set and apply both logistic regression
and the nearest neighbor classifier based on all raw inputs, i.e., based
on the 34 by 200 dimensional matrix of training inputs.

\begin{description}
\item[g.] Apply logistic regression based on all raw inputs and evaluate
the training and test errors. Try different values of the regularization
parameter {\tt lambda} to see
if you can improve the test performance and eventually, plot the training
and test performance over a range of ``interesting'' values.
\end{description}

\begin{description}
\item[h.] As e., but now based on all raw inputs. How do you interpret the
results?
\end{description}

\begin{description}
\item[i.] If you {\em had} to choose a single model out of the ones that
you have tried until now to classify new data that the model has never
seen before (that is, not the test data that you used to validate your
models on, but another ``random'' test set), which model would you prefer?
Do you think you have enough information to reliably make such a choice?
If not, what other experiments would you suggest, given the available data?
\end{description}

\subsection*{ROC Curves}

\begin{description}
\item[j.] Build an ROC curve for the best logistic regression model that
you have found. Do this both for the training and for the test instances.
What specificity can be obtained if we require a sensitivity of at least
0.9? Explain in your own words what this statement means.
\end{description}


\newpage
\section*{Hints for Exercise 6}

\begin{description}
\item[a.] You can make use of the {\sc Matlab} routines {\tt cov} for
computing the covariance matrix (make sure that you get a matrix of size
$\onder{n}{inp} \times \onder{n}{ninp}$ out of it), {\tt eig} for computing
both the eigenvectors and eigenvalues of the covariance matrix, {\tt diag}
for turning the matrix with eigenvalues into a vector of eigenvalues, and
{\tt sort} for sorting them.
\end{description}

\begin{description}
\item[b.] {\tt hold on} holds the current plot so that subsequent graphing
commands are added instead of overwriting the previous ones.
\end{description}

\begin{description}
\item[c.] Also for later purposes it may help to build a function
{\tt simu\_log.m} with a similar semantics as {\tt simulin.m} that you
used in Exercise 5. That is, {\tt simu\_log} takes as inputs the matrix
of inputs {\tt x}, the vector of weights {\tt w}, and the threshold
{\tt t}, and returns the vector of class labels {\tt ypred}.
You can then compare {\tt ypred} with the labels {\tt y} according to the
data set, e.g., using something like {\tt length(find(ypred == y))}
(the number of correctly classified instances) or just
{\tt sumsqr(ypred-y)} (the number of misclassified instances).
\end{description}

\begin{description}
\item[d.] To emphasize the decision boundary graphically, you can use
{\tt h = plot(\ldots)} and then {\tt set(h,'LineWidth',2)}. {\tt h}
is called a handle to a graphics object. With {\tt get(h)} you can view
its properties, that can be changed with {\tt set(h,'Property','Value')}
where {\tt 'Property'} stands for the particular property you want to
change. Just {\tt set(h,'Property')} gives you the possible values.
\end{description}

\begin{description}
\item[e.] Just write a for-loop running over different numbers of
nearest neighbors.
\end{description}

\begin{description}
\item[f.] For building a grid that covers the ``interested area'' (for
example, given by the axes that result from the plot you made under~(b))
you can use {\tt meshgrid}. It gives you a {\em matrix} of $x$ and $y$
coordinates. Something like {\tt xx = x(:)';} and {\tt yy = yy(:)';}
concatenates the matrix into a (long) vector. Then put them together
using {\tt xxx = [xx;yy];} to make an $2 \times \onder{n}{grid}$ matrix of
inputs. This is the proper format for {\tt knnclass} that then yields
an $1 \times \onder{n}{grid}$ vector of predicted classes to be visualized.
\end{description}

\begin{description}
\item[g.] As you will notice, you will have to choose rather low values
of {\tt lambda}. Furthermore, you probably want to change {\tt lambda} over
different orders of magnitudes. You can use {\tt semilogx} instead of
{\tt plot} for a logarithmic scaling on the x-axis.
\end{description}

\begin{description}
\item[j.] ROC curves are discussed in the syllabus, section~6.3.3. Based
on the information given there you should be able to work this out.
\end{description}


\end{document}
