\documentclass[11pt]{article}
\usepackage{latexsym}
\usepackage{amsmath,amssymb,amsthm}
\usepackage{epsfig}
\usepackage[right=0.8in, top=1in, bottom=1.2in, left=0.8in]{geometry}
\usepackage{setspace}
\spacing{1.06}
\newcommand{\handout}[5]{
\noindent
\begin{center}
\framebox{
\vbox{\vspace{0.25cm}
\hbox to 5.78in { {COMS E6998-9:\hspace{0.12cm}Algorithmic
Techniques for Massive Data} \hfill #2 }
\vspace{0.48cm}
\hbox to 5.78in { {\Large \hfill #5 \hfill} }
\vspace{0.42cm}
\hbox to 5.78in { {#3 \hfill #4} }\vspace{0.25cm}
}
}
\end{center}
\vspace*{4mm}
}
\newcommand{\lecture}[4]{\handout{#1}{#2}{#3}{Scribes:\hspace{0.08cm}#4}{Lecture #1}}
\newtheorem{theorem}{Theorem}
\newtheorem{corollary}[theorem]{Corollary}
\newtheorem{lemma}[theorem]{Lemma}
\newtheorem{observation}[theorem]{Observation}
\newtheorem{example}[theorem]{Example}
\newtheorem{definition}[theorem]{Definition}
\newtheorem{claim}[theorem]{Claim}
\newtheorem{fact}[theorem]{Fact}
\newtheorem{assumption}[theorem]{Assumption}
\def\eps{\ensuremath\epsilon}
\begin{document}
\lecture{2 -- Median trick, Distinct Count, Impossibility Results}{Sep 10, 2015}{Instructor:\hspace{0.08cm}\emph{Alex Andoni}}{\emph{Shubham Bansal, Chen-Yu Yang}}
\section{Introduction}
Today's lecture covers three main topics: the median trick (continuing from the previous lecture), counting distinct elements, and impossibility results.
\section{Median Trick}
So far, we have an algorithm $A$ whose output lies in the correct $(1\pm\eps)$ range with probability $\ge 0.9$. Our new algorithm $A^{\ast}$ will output an answer in the same range with probability $1-\delta$.
Algorithm:
\begin{itemize}
\item Repeat $A$ independently $m = O(\log(1/\delta))$ times.
\item Take the median of the $m$ answers.
\end{itemize}
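Below is a minimal Python sketch of the median trick. Here \texttt{A} is a hypothetical zero-argument estimator (not specified in the lecture) that returns an answer in the correct range with probability at least $0.9$, and the constant $24$ in $m$ is illustrative rather than tight.
\begin{verbatim}
import math

def median_boost(A, delta):
    # Run the base estimator A (correct with probability >= 0.9)
    # m = O(log(1/delta)) times and return the median answer.
    m = math.ceil(24 * math.log(1 / delta))  # illustrative constant
    answers = sorted(A() for _ in range(m))
    return answers[len(answers) // 2]
\end{verbatim}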
To prove the correctness, we'll use Chernoff/Hoeffding bounds.
\begin{definition}
[Chernoff/Hoeffding Bound]
Let $X_{1}$, $X_{2}$, $\ldots$, $X_{m}$ be independent random variables $\in \{0,1\}$,
$\mu = E[\Sigma_{i} X_{i}], \eps \in [0,1]$.
Then $Pr[|\Sigma_{i} X_{i}-\mu| > \eps\mu] \leq 2e^{-\eps^{2}\mu/3}$.
\end{definition}
Define $X_{i} = 1$ iff the $i^{th}$ answer of $A$ is correct (i.e.\ the estimate lies in the correct range), and $X_{i} = 0$ otherwise.
\begin{claim}
$E[X_{i}] = 0.9$, and $\mu = E[\Sigma_{i} X_{i}] = 0.9m$
\end{claim}
\begin{proof}
Since $A$ is correct with probability $0.9$, $E[X_{i}] = 0.9$; by linearity of expectation, $\mu = E[\Sigma_{i} X_{i}] = 0.9m$.
\end{proof}
\begin{claim}
The new algorithm $A^{\ast}$ is correct whenever $\Sigma_{i} X_{i} > 0.5m$.
\end{claim}
\begin{proof}
Since the output of $A^{\ast}$ is the median of the $m$ answers, if more than half of the runs of $A$ are correct, then fewer than half of the answers lie below the correct range and fewer than half lie above it, so the median lies in the correct range and $A^{\ast}$ is correct.
\end{proof}
\begin{claim}
It remains to prove that $Pr[\Sigma_{i} X_{i} > 0.5m] \ge 1-\delta$, or equivalently, that $Pr[\Sigma_{i} X_{i} \le 0.5m] \le \delta$.
\end{claim}
\begin{proof}
\begin{equation}
\begin{split}
Pr[\Sigma_{i} X_{i} \le 0.5m] & = Pr[\Sigma_{i} X_{i} - 0.9m \le -0.4m]\\
& \le Pr[|\Sigma_{i} X_{i} - \mu| \ge 0.4m]\\
& = Pr\left[|\Sigma_{i} X_{i} - \mu| \ge \tfrac{0.4}{0.9}\,\mu\right]\\
& \le 2e^{-(0.4/0.9)^{2}\mu/3} \qquad \text{(Chernoff/Hoeffding bound)}\\
& = 2e^{-cm} \qquad \text{(for a constant $c > 0$, using $\mu = 0.9m$)}\\
& < \delta
\end{split}
\end{equation}
The last inequality holds for $m = O(\log(1/\delta))$ with a suitable constant.
\end{proof}
\section{Distinct Elements}
Given a stream of $m$ numbers from $[n]$, we want to approximate the number of distinct elements, i.e.\ the number of values in $[n]$ with non-zero frequency. Computing the exact value requires one of:
\begin{itemize}
\item $O(n)$ bits (maintain a bit vector of length $n$), or
\item $O(m \log n)$ bits (store all $m$ numbers, each taking $\log n$ bits).
\end{itemize}
Since $m$ and $n$ can be very large, neither is feasible, so we will look at an algorithm that approximates the distinct count.
\subsection{Hash Function}
For now, assume an idealized hash function:
\begin{itemize}
\item $h : [n] \rightarrow [0,1]$
\item each $h(i)$ is independently and uniformly distributed in $[0,1]$ (a fully random function; we revisit this assumption in the subsection ``Hash functions in stream'' below).
\end{itemize}
\subsection{Algorithm [Flajolet-Martin 1985]}
We maintain a variable $z$.
\begin{enumerate}
\item Initialize $z = 1$.
\item Whenever element $i$ arrives, update $z \leftarrow \min(z, h(i))$.
\item When the stream ends, output $1/z - 1$.
\end{enumerate}
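As a concrete illustration, here is a Python sketch of the basic algorithm. The hash \texttt{h} is an idealized fully random hash, simulated by caching fresh uniform values; the cache is for illustration only, since storing it of course defeats the space bound (the subsection on hash functions in stream shows how to avoid this).
\begin{verbatim}
import random

_cache = {}
def h(i):
    # Idealized fully random hash into [0, 1], simulated by caching
    # (illustration only: the cache defeats the space bound).
    if i not in _cache:
        _cache[i] = random.random()
    return _cache[i]

def fm_estimate(stream):
    # Flajolet-Martin: track the minimum hash value z, output 1/z - 1.
    z = 1.0
    for i in stream:
        z = min(z, h(i))
    return 1.0 / z - 1.0

# Example: a stream with 4 distinct elements.
print(fm_estimate([1, 2, 3, 2, 4, 1]))  # a (high-variance) estimate of 4
\end{verbatim}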
We now prove that the algorithm works, following the approach of the previous lecture.
Let $d$ be number of distinct elements.
\begin{claim}
$E[z] = \frac{1}{d+1}$
\end{claim}
\begin{proof}
$z$ is the minimum of $d$ independent uniform random numbers in $[0,1]$. Pick another uniform random number $a \in [0,1]$. The probability that $a < z$ equals the probability that $a$ is the smallest among these $d+1$ random numbers, which is $\frac{1}{d+1}$. Hence
\begin{equation}
E[z] = \int_{0}^{1} Pr[z > t]\, dt = Pr[a < z] = \frac{1}{d+1}.
\end{equation}
\end{proof}
A single minimum has high variance, so we refine the algorithm: maintain the $k$ smallest hash values seen so far, where $k = c/\eps^{2}$ for a constant $c$. Let $z_{k}$ be the $k^{th}$ smallest hash value at the end of the stream, and output $\hat{d} = k/z_{k}$.
\begin{claim}
For $k = c/\eps^{2}$ with a sufficiently large constant $c$:
\begin{itemize}
\item $Pr[\hat{d} > (1 + \eps)d] \leq 0.05$
\item $Pr[\hat{d} < (1 - \eps)d] \leq 0.05$
\item Hence the overall probability that $\hat{d}$ lies outside $[(1-\eps)d, (1+\eps)d]$ is at most $0.1$.
\end{itemize}
\end{claim}
\begin{proof}
To compute $Pr[\hat{d} > (1+\eps)d]$:
\begin{itemize}
\item For each of the $d$ distinct elements $i$ in the stream, define $X_{i} = 1$ iff $h(i) < \dfrac{k}{(1+\eps)d}$.
\item Then $\hat{d} > (1+\eps)d$ iff $\Sigma_{i} X_{i} \ge k$:
\begin{equation}
\begin{split}
\Sigma_{i} X_{i} \ge k & \iff \text{at least $k$ elements satisfy } h(i) < \tfrac{k}{(1+\eps)d}\\
& \iff z_{k} < \tfrac{k}{(1+\eps)d}\\
& \iff \tfrac{k}{z_{k}} > (1+\eps)d\\
& \iff \hat{d} > (1+\eps)d
\end{split}
\end{equation}
\item
$E[X_{i}] = \dfrac{k}{(1+\eps)d}$\\
$E[\Sigma_{i} X_{i}] = d\, E[X_{i}] = \dfrac{k}{1 + \eps}$\\
$\text{var}[\Sigma_{i} X_{i}] = d\, \text{var}[X_{i}] \leq d\, E[X_{1}^{2}] = d\, E[X_{1}] = \dfrac{k}{1+\eps} \leq k$\\
(since $X_{1} \in \{0,1\}$, $E[X_{1}^{2}] = E[X_{1}]$)
\item By Chebyshev's inequality, using $\text{var}[\Sigma_{i} X_{i}] \leq k$:
$Pr[|\Sigma_{i} X_{i} - \dfrac{k}{1+\eps}| > \sqrt{20k}] \leq \dfrac{k}{20k} = 0.05 \implies Pr[\Sigma_{i} X_{i} > \dfrac{k}{1+\eps} + \sqrt{20k}] \leq 0.05$
\begin{itemize}
\item
For $\eps < 1/2$ and $k=c/\eps^{2}$:
\begin{equation}
\begin{split}
\dfrac{k}{1+\eps} + \sqrt{20k} & \leq k(1-\eps+\eps^{2}) + \sqrt{20k} \qquad \text{(since $\tfrac{1}{1+\eps} \leq 1-\eps+\eps^{2}$, by Taylor expansion)}\\
& \leq k - \dfrac{k\eps}{2} + \dfrac{5\sqrt{c}}{\eps} \qquad \text{(since $\eps^{2} \leq \eps/2$ and $\sqrt{20k} = \sqrt{20c}/\eps \leq 5\sqrt{c}/\eps$)}\\
& = k - \dfrac{c}{2\eps} + \dfrac{5\sqrt{c}}{\eps}\\
& < k \qquad \text{for $c > 100$}
\end{split}
\end{equation}
\item
Since $k > \dfrac{k}{1+\eps} + \sqrt{20k}$ (by the previous item), the event $\Sigma_{i} X_{i} \ge k$ implies $\Sigma_{i} X_{i} > \dfrac{k}{1+\eps} + \sqrt{20k}$, so $Pr[\Sigma_{i} X_{i} \ge k] \leq Pr[\Sigma_{i} X_{i} > \dfrac{k}{1+\eps} + \sqrt{20k}] \leq 0.05$.
\end{itemize}
\end{itemize}
The bound $Pr[\hat{d} < (1-\eps)d] \leq 0.05$ follows from a symmetric argument, and a union bound gives total failure probability at most $0.1$.
\end{proof}
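A hedged Python sketch of the refined $k^{th}$-minimum estimator analyzed above, reusing the idealized hash \texttt{h} from the earlier sketch; scanning for the maximum makes each update $O(k)$, which is fine for illustration.
\begin{verbatim}
import math

def kmin_estimate(stream, eps):
    # Keep the k smallest distinct hash values (k = c/eps^2 with
    # c = 100, matching the analysis above); output k / z_k.
    k = math.ceil(100 / eps ** 2)
    smallest = set()
    for i in stream:
        v = h(i)
        if v in smallest:
            continue              # duplicate element, already counted
        if len(smallest) < k:
            smallest.add(v)
        elif v < max(smallest):
            smallest.remove(max(smallest))
            smallest.add(v)
    if len(smallest) < k:
        return len(smallest)      # fewer than k distinct: exact count
    return k / max(smallest)      # max of the k smallest = z_k
\end{verbatim}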
\subsection{Hash functions in stream}
The hash function we used has two practical issues: (1) its values are real numbers, which cannot be stored exactly; (2) a fully random function on $[n]$ itself takes too much space to store.
Discretization solves the first issue: instead of all the real numbers in $[0, 1]$, we use a hash function with range $\{0, \frac{1}{M}, \frac{2}{M}, \frac{3}{M}, \ldots, 1\}$. For $M \gg n^{3}$, the probability that any two of the $d \le n$ hash values collide is at most $\frac{1}{n}$.
For the second issue, we use a pairwise independent hash function instead of a fully random one.
\begin{definition}
$h: [n] \rightarrow \{1, 2, \ldots, M\}$ is pairwise independent if for all $i \ne j$ and all $a, b \in [M]$, $\text{Pr}[h(i)=a \land h(j)=b]=\frac{1}{M^2}$
\end{definition}
This suffices because the previous calculation only involves pairs. We defined $X_i=1$ iff $h(i)$ is smaller than a threshold, and then computed $\text{var}[\Sigma_{i} X_i] = E[(\Sigma_{i} X_i)^2] - (E[\Sigma_{i} X_i])^2 = E[\Sigma_{i,j} X_iX_j] - (E[\Sigma_{i} X_i])^2$. Notice that $E[X_iX_j]$ is the same for a fully random $h$ and a pairwise independent $h$.
\begin{example}
[Construct a pairwise independent hash]
Assume $M$ is a prime number (if not, we can always pick a larger $M$ that is prime). Pick $p, q \in \{0, 1, 2, \ldots, M-1\}$ uniformly at random and set $h(i) = pi+q \mod M$. This construction needs only $O(\log M) = O(\log n)$ bits of space (to store $p, q, M$).
\end{example}
\begin{proof}
$h(i)=a, h(j)=b$ is equivalent to $pi+q \equiv a$ and $pj+q \equiv b \pmod{M}$. Subtracting, $p(i-j) \equiv a-b$, so $p \equiv (a-b)(i-j)^{-1}$ and $q \equiv a - pi$. Since $M$ is prime, the inverse $(i-j)^{-1}$ exists and is unique, so exactly one pair $(p, q)$ satisfies the system, and the probability that this pair is chosen is exactly $\frac{1}{M^2}$.
\end{proof}
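This construction is easy to implement; below is a hedged Python sketch, which maps the output into $(0,1]$ by dividing by $M$ so it can stand in for the idealized hash in the earlier sketches. The particular prime $M = 2^{61}-1$ is our choice for illustration (any prime $M \gg n^{3}$ works).
\begin{verbatim}
import random

M = 2**61 - 1              # a prime; any prime M >> n^3 works
p = random.randrange(M)    # drawing p, q once fixes the function
q = random.randrange(M)

def pairwise_hash(i):
    # h(i) = (p*i + q) mod M, scaled into (0, 1]; the +1 avoids
    # returning 0, which would break the 1/z estimator.
    return ((p * i + q) % M + 1) / M
\end{verbatim}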
\section{Impossibility Results}
We have used both approximation and randomization to solve the distinct counting problem with space much less than $\min{(m, n)}$. Now we are wondering: can we omit either approximation or randomization to achieve the same space efficiency? The answer is no.
\subsection{Deterministic Exact Won't Work}
First, we will show that there is no deterministic (no randomization) and exact (no approximation) way to solve it.
Suppose there exist a deterministic and exact algorithm $A$ and an estimator function $R$ that use space $s \ll n, m$. That is, for a given integer stream, we run $A$ on the stream; as the stream is processed, $A$ passes through intermediate memory states, and we obtain the final memory state $\sigma$ after the stream ends. We then apply $R$ to $\sigma$ to obtain the estimate $\hat{d}$. Since both $A$ and $R$ are deterministic and exact, $\hat{d}$ must equal the distinct count of the stream.
We now define a binary representation $x$ of the stream with the following rules: (1) $x \in \{0, 1\}^{n}$; (2) $x_i = 1$ iff $i$ appears in the stream. For example, if $1, 3, 5, 6, 7$ appear in the stream and $2, 4$ do not, then $x$ starts with $1, 0, 1, 0, 1, 1, 1$. Each stream has a corresponding representation, and streams containing different sets of numbers have different representations.
\begin{claim}
We can recover $x$ from the memory state $\sigma$.
\end{claim}
\begin{proof}
Let $d=R(\sigma)$ be the original estimate. Treat $\sigma$ as an intermediate snapshot of the memory and feed integer $i$ as the next element of the stream. $A$ then reaches another memory state $\sigma'$, giving a new estimate $d'=R(\sigma')$. If $d'=d$, then $i$ must have appeared in the stream before, since $A$ and $R$ are deterministic and exact. Similarly, if $d'>d$, then $i$ cannot have appeared before. Applying this test for $i=1, 2, \ldots, n$ recovers $x$.
\end{proof}
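A sketch of this decoding loop in Python, where \texttt{A\_step(state, i)} and \texttt{R(state)} are hypothetical oracles standing in for one streaming step of $A$ and for the estimator $R$ (neither is given concretely in the lecture):
\begin{verbatim}
def recover(sigma, n, A_step, R):
    # Decode x in {0,1}^n from the final memory state sigma.
    # A_step(state, i): run the deterministic exact A on one more
    # element i.  R(state): read off the exact distinct count.
    d = R(sigma)
    x = []
    for i in range(1, n + 1):
        if R(A_step(sigma, i)) == d:
            x.append(1)   # count unchanged: i was already in the stream
        else:
            x.append(0)   # count grew: i is new
    return x
\end{verbatim}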
Since we can recover $x$ from $\sigma$, we can treat $\sigma$ as an encoding of a string $x$ of length $n$, even though $\sigma$ has only $s \ll n$ bits! Formally, view $A$ as a function with domain $\{0, 1\}^{n}$ and range $\{0, 1\}^{s}$. This function must be injective, because if $A(x)=A(x')=\sigma$, then recoverability implies $x=x'$.
Injectivity forces $2^{s} \ge 2^{n}$, hence $s \ge n$, contradicting $s \ll n$. Therefore no deterministic and exact algorithm $A$ with estimator $R$ can use space $s \ll n, m$.
\subsection{Deterministic Approx. Won't Either}
We can use a similar strategy to prove that deterministic approximation won't work either. Pick $T \subset \{0, 1\}^{n}$ satisfying the following conditions: (1) for all distinct $x, y \in T$, the number of positions $i$ with $y_i=1$ and $x_i=0$ is at least $\frac{n}{6}$; (2) $|T| \ge 2^{\Omega(n)}$. (Such a $T$ exists, for instance by a probabilistic argument.) As before, algorithm $A$ encodes an input $x \in T$ into $\sigma=A(x)$, and our estimate is $\hat{d}=R(\sigma)$.
Now we want to recover $x$ from $\sigma$, as we did in the last section. For a given $\sigma$ and any $y \in T$, we append the elements indicated by $y$ (all $i$ with $y_{i}=1$) to the stream and continue running $A$, which returns a memory state $\sigma'$. From $\sigma'$ we get a new estimate $\hat{d}'=R(\sigma')$.
\begin{claim}
If $\hat{d}' > 1.01\, \hat{d}$, then $x \ne y$; otherwise $x=y$.
\end{claim}
\begin{proof}
The idea is that when $x=y$, appending $y$ adds no new distinct elements, so $\hat{d}'$ is within a factor $(1+\eps)^{2}$ of $\hat{d}$ (both are $\eps$-approximations of the same count). When $x \ne y$, the construction of $T$ guarantees that the true distinct count increases by at least $\frac{n}{6}$, so $\hat{d}'$ is substantially larger than $\hat{d}$. Hence we can pick an $\eps$ for which the claim holds.
\end{proof}
We can use this method to check every $y \in T$ and determine which one equals $x$, so we can recover $x$ from $\sigma$. As in the last section, this shows that $A$ is injective on $T$, which implies $2^{s} \ge |T| \ge 2^{\Omega(n)}$, i.e.\ $s = \Omega(n)$.
\section{Concluding Remarks}
\begin{itemize}
\item We can use the median trick and the Chernoff bound to boost the success probability of an existing algorithm.
\item For the distinct elements problem, we can also store the hash values $h(i)$ approximately. One option is to store only the number of leading zeros, which costs $O(\log \log n)$ bits per hash value; this is the idea behind another algorithm called HyperLogLog (see the sketch after this list).
\item For the impossibility results, one can also prove that randomized exact algorithms won't work.
\end{itemize}
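As a rough illustration of the remark on storing hashes approximately, here is a hedged Python sketch of keeping only the leading-zero count of a discretized hash value (the bit width of $32$ is our choice for illustration):
\begin{verbatim}
def leading_zeros(v, bits=32):
    # Discretize v in [0, 1) to a `bits`-bit integer and return its
    # number of leading zeros: an integer in [0, bits], storable in
    # O(log log n) bits.  HyperLogLog keeps only such counts.
    w = int(v * 2**bits)
    return bits - w.bit_length()
\end{verbatim}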
\end{document}