\documentclass[11pt]{article}
\usepackage{latexsym}
\usepackage{amsmath,amssymb,amsthm}
\usepackage{graphicx}
\usepackage{algorithmic}
\usepackage{algorithm}
\usepackage[right=0.8in, top=1in, bottom=1.2in, left=0.8in]{geometry}
\usepackage{setspace}
\spacing{1.06}
\newcommand{\handout}[5]{
\noindent
\begin{center}
\framebox{
\vbox{\vspace{0.25cm}
\hbox to 5.78in { {COMS E6998-9:\hspace{0.12cm}Algorithmic
Techniques for Massive Data} \hfill #2 }
\vspace{0.48cm}
\hbox to 5.78in { {\Large \hfill #5 \hfill} }
\vspace{0.42cm}
\hbox to 5.78in { {#3 \hfill #4} }\vspace{0.25cm}
}
}
\end{center}
\vspace*{4mm}
}
\newcommand{\lecture}[4]{\handout{#1}{#2}{#3}{Scribes:\hspace{0.08cm}#4}{Lecture #1}}
\newtheorem{theorem}{Theorem}
\newtheorem{corollary}[theorem]{Corollary}
\newtheorem{lemma}[theorem]{Lemma}
\newtheorem{observation}[theorem]{Observation}
\newtheorem{example}[theorem]{Example}
\newtheorem{definition}[theorem]{Definition}
\newtheorem{claim}[theorem]{Claim}
\newtheorem{fact}[theorem]{Fact}
\newtheorem{assumption}[theorem]{Assumption}
\begin{document}
\lecture{17 --- Sublinear Time}{Nov 5,
2015}{Instructor:\hspace{0.08cm}\emph{Alex Andoni}}{\emph{Michael J Curry}}
\section{Introduction}
At the beginning of the lecture, we briefly went over some upper and
lower bounds on distortion for embeddings of various distances into
$l_1$.
Then, we discussed the setting for sublinear time algorithms, one
particular problem (distribution testing), and specific algorithms for
distinguishing between uniform and sufficiently non-uniform
distributions.
\section{Sublinear Time Algorithms}
We've been concerned with situations where the size of the input is
much too large to deal with using conventional algorithms. We've
considered streaming algorithms, where each part of the input is seen
once and the goal is to approximate an answer given space
constraints. Now we'll turn to truly sublinear algorithms --- only
looking at a subset of the input. This might be necessary due to
resource constraints (like on a router where even simple operations
take unacceptably long). The data itself might also come as a sample,
as in natural experiments or in the setting for machine learning where
the training data is assumed to be drawn from some unknown
distribution.
Broadly, there are two types of sublinear algorithms: the ``classic''
type, which examine a subset of the data and return an approximate
output; and ``property testing'', in which the goal is to verify
whether some object has a certain property. We'll consider the problem
of distribution testing.
\section{Testing for Uniformity}
It's hard to precisely tell whether a distribution is uniform, but we
can accept some approximation and try to distinguish between uniform
and ``sufficiently non-uniform'' distributions.
\subsection{Total Variation}
Suppose we treat (discrete) distributions over $[n]$
as vectors of probabilities. Then we consider a distribution $D$
``sufficiently non-uniform'' if $\|D - U_n\|_1 \ge \epsilon$. The
$l_1$ distance here is a proxy for the Total Variation
distance. Suppose we define our test as a subset $T \subset [n]$; if
sampling from one distribution yields $x \in T$ noticeably more often
than sampling from the other, then the test lets us distinguish the
two distributions. The total variation distance is
defined as
\begin{equation}
TV(A, B) = \max\limits_{T\subset [n]} \left|
\Pr_{A}\left[ x \in T \right] -
\Pr_{B}\left[ x \in T \right] \right|
\end{equation}
\begin{claim}
$TV(A, B) = \frac{1}{2}\|A - B\|_1$
\end{claim}
\subsection{First Attempt at an Algorithm}
We could estimate the distribution $D$ empirically, and then compute
the distance $\|\hat{D} - U\|_1$. This is not a very good
algorithm. We need at least $n/2$ samples; otherwise at least half the
coordinates are guaranteed to be zero, resulting in an estimate that
is far from uniform. The $\chi^2$ test in classical statistics also
requires $\Omega(n)$ samples.
\subsection{Second Attempt}
\begin{algorithm}
\caption{UNIFORM}
\begin{algorithmic}
\REQUIRE $n, m, x_1\dots x_m$
\STATE $C \leftarrow 0$
\FOR{$i = 1$ to $m$}
\FOR{$j = i + 1$ to $m$}
\IF{$x_i = x_j$}
\STATE $C \leftarrow C + 1$
\ENDIF \ENDFOR \ENDFOR
\IF{$C < \frac{am^2}{n}$}
\RETURN uniform
\ELSE
\RETURN nonuniform
\ENDIF
\end{algorithmic}
\end{algorithm}
We can actually estimate uniformity using only $O_\epsilon(\sqrt{n})$
samples. The idea is to sample and count the number of collisions: a
nonuniform distribution will have more collisions. The amount of
sampling required is connected to the famous ``birthday paradox'' ---
in a uniform distribution, we would expect collisions to start
appearing at around $\sqrt{n}$ samples.
\subsubsection{Analysis}
First, think about the $l_2$ distance between distributions.
\begin{claim}
If $D = U_n$
then $\|D - U_n\|_2 = 0$.
But if $\|D - U_n\|_1 \ge \epsilon$,
then $\|D - U_n\|_2^2 \ge \frac{\epsilon^2}{n}$.
\end{claim}
\begin{proof}
The first part is obvious. For the second part, note that
\begin{equation*}
\|x\|_1 = \sum\limits_{i=1}^{n}|x_i| \le \left( \sum\limits_{i=1}^{n}1 \right)^{1/2}\left( \sum\limits_{i=1}^{n}x_i^2 \right)^{1/2}
\end{equation*}
So $\forall x: \|x\|_2 \ge \frac{\|x\|_1}{\sqrt{n}}$; squaring this
inequality and using $\|D - U_n\|_1 \ge \epsilon$ gives the claim.
\end{proof}
\begin{claim}
$\|D - U_n\|^2_2 = \|D\|_2^2 - \frac{1}{n}$
\end{claim}
\begin{proof}
(The terms of $D$ are denoted $P_i$, because they represent probabilities.)
\begin{align*}
\|D - U_n\|_2^2 &= \|D\|_2^2 + \|U_n\|_2^2 - 2 D \cdot U_n \\
&= \|D\|_2^2 + \sum\limits_{k=1}^{n}\left( \frac{1}{n} \right)^2 -
2\sum\limits_{k=1}^{n}\frac{P_k}{n} \\
&= \|D\|_2^2 + \frac{1}{n} - \frac{2}{n}\sum\limits_{k=1}^{n}P_k \\
&= \|D\|_2^2 - \frac{1}{n}
\end{align*}
\end{proof}
The upshot is that $\|D\|_2^2 = \frac{1}{n}$ when uniform and $\|D\|_2^2
\ge \frac{1}{n} + \frac{\epsilon^2}{n}$ when non-uniform, so we just
need to be able to distinguish these cases.
\begin{lemma}
$\frac{1}{M} \times C$ allows us to distinguish between the two cases
above, as long as $m = \Omega(\frac{\sqrt{n}}{\epsilon^4})$. $M$ is
the normalization constant $M=\binom{m}{2}$.
\end{lemma}
\begin{proof}
As before, we first show that the estimator is unbiased and then bound
its variance. Define $\sigma_{ij}$ to be the indicator variable of the
event $x_i=x_j$. Also $Z=\tfrac{1}{M}\sum_{i=1}^m\sum_{j=i+1}^m \sigma_{ij}$.
\begin{align*}
M\cdot E [Z] &= E
\left[\sum\limits_{i=1}^{m}\sum\limits_{j=i+1}^{m}\sigma_{ij}\right] \\
&= \sum\limits_{i=1}^{m}\sum\limits_{j=i+1}^{m}E\left[ \sigma_{ij} \right] \\
&= \sum\limits_{i=1}^{m}\sum\limits_{j=i+1}^{m}\sum\limits_{k=1}^{n}P_kP_k \\
&= \sum\limits_{i=1}^{m}\sum\limits_{j=i+1}^{m}\|D\|_2^2 \\
&= \binom{m}{2} \|D\|_2^2
\end{align*}
The variance is a little trickier to bound than in the
past. Essentially, we'll break up a sum into 3 terms and bound each of
them.
\begin{align*}
E\left[ Z^2 \right] &= \frac{1}{M^2} E\left[ \sum\limits_{i_1 < j_1}\sum\limits_{i_2 < j_2} \sigma_{i_1j_1}\sigma_{i_2j_2} \right]\\
&= \frac{1}{M^2} E\left[ \sum\limits_{i_1=i_2 <
j_1=j_2}\sigma_{i_1j_1}^2 + \sum\limits_{|\{i_1, i_2, j_1, j_2\}| =
3}\sigma_{i_1j_1}\sigma_{i_2j_2} +
\sum\limits_{\{i_1, j_1\}\cap\{i_2, j_2\} = \emptyset}\sigma_{i_1j_1}\sigma_{i_2j_2} \right]
\end{align*}
Bringing the expectation operator inside, we can bound the first term:
\begin{equation*}
E\left[ \sum\limits_{i