\documentclass[11pt]{article}
% NOTE(review): latexsym is obsolete; amssymb (loaded below) supersedes it.
% Kept in case a symbol only latexsym provides is used later in the file.
\usepackage{latexsym}
\usepackage{amsmath,amssymb,amsthm}
% NOTE(review): epsfig is deprecated in favor of graphicx (which it wraps).
% Kept because unseen parts of this file may still call \epsfig; migrate to
% \includegraphics when convenient.
\usepackage{epsfig}
\usepackage[right=0.8in, top=1in, bottom=1.2in, left=0.8in]{geometry}
\usepackage{setspace}
% Slightly enlarged line spacing for the whole document (setspace).
\spacing{1.06}
% \handout{#1 lecture number}{#2 date}{#3 instructor}{#4 scribe line}{#5 title}:
% typesets the framed lecture-header box at the top of the notes.
% NOTE(review): argument #1 is never used in the body below; it is accepted so
% that \lecture can forward the lecture number uniformly.
\newcommand{\handout}[5]{
\noindent
\begin{center}
\framebox{
\vbox{\vspace{0.25cm}
% Top row: course name (left) and date #2 (right).
\hbox to 5.78in { {COMS E6998-9:\hspace{0.12cm}Algorithmic
Techniques for Massive Data} \hfill #2 }
\vspace{0.48cm}
% Middle row: centered lecture title #5.
\hbox to 5.78in { {\Large \hfill #5 \hfill} }
\vspace{0.42cm}
% Bottom row: instructor #3 (left) and scribes #4 (right).
\hbox to 5.78in { {#3 \hfill #4} }\vspace{0.25cm}
}
}
\end{center}
\vspace*{4mm}
}
\newcommand{\lecture}[4]{\handout{#1}{#2}{#3}{Scribes:\hspace{0.08cm}#4}{Lecture #1}}
% Theorem-like environments; all share the "theorem" counter so results are
% numbered in one sequence (Theorem 1, Lemma 2, Claim 3, ...).
\newtheorem{theorem}{Theorem}
\newtheorem{corollary}[theorem]{Corollary}
\newtheorem{lemma}[theorem]{Lemma}
\newtheorem{observation}[theorem]{Observation}
\newtheorem{example}[theorem]{Example}
\newtheorem{definition}[theorem]{Definition}
\newtheorem{claim}[theorem]{Claim}
\newtheorem{fact}[theorem]{Fact}
\newtheorem{assumption}[theorem]{Assumption}
% Expectation symbol. \mathbf is the correct bold for math-mode symbols;
% the previous \textbf switched to the text font inside math.
\newcommand{\E}{\mathbf{E}}
% Variance as a proper math operator (amsmath): upright "var" with correct
% operator spacing, e.g. $\var(X)$ or $\var X$; the previous \text{var} had
% no operator spacing.
\DeclareMathOperator{\var}{var}
\begin{document}
\lecture{24 --- MapReduce Algorithms}{Dec. 3, 2015}{Instructor:\hspace{0.08cm}\emph{Alex Andoni}}{\emph{Kui Tang}}
\section{Introduction}
We review the computational model of MapReduce algorithms. Our model
is more general than the original Google MapReduce framework: we do
not constrain the functional forms of computation (e.g.\ a decomposition
into local map and associative reduce functions); we constrain only the
space and the communication.
We have $M$ machines with $S$ space per machine, and $MS\approx O(\text{input size})$.
This means we cannot replicate data too much. We have $n$ inputs
and $O(n)$ outputs, but $S\ll n$, i.e.\ neither the input nor the output
fits on a single machine.
Local computation is cheap but communication (shuffling) is very expensive,
so we bound the number of rounds. In particular, we want rounds $R=O(1)$
for $S\leq n^{\delta}$, for instance $S>\sqrt{n}$ where $S>M$.
We want $O(S)$ in-communication (how much input each node requires)
per round, and ideally linear. (You cannot have more communication
than you have space.)
This model is the culmination of the bulk-synchronous parallel, the
original MapReduce, and massively parallel computing (MPC) frameworks.
MR can simulate PRAM algorithms in $R=O(\text{parallel time})$ rounds. But
this often takes a logarithmic slowdown, because to collect $n$ items
of data in a tree of constant degree $s$, we need a depth of $O(\log_{s}n)$,
which is also how many rounds we must take. In particular, we need
$\widetilde{\Omega}(\log n)$ on CRCW PRAMs.
\subsection{Sorting}
Also called the Terasort problem---sorting a terabyte
of numbers.
Suppose $S=O(n^{2/3})$ and $M=O(n^{1/3})$. The entire input of $n$
numbers can be sorted in just three rounds of communication. How?
\begin{itemize}
\item Each machine picks some distinguished elements with probability $n^{1/2}/n$.
This means $\Theta(n^{1/2})$ elements will be selected.
\item \textbf{All machines send the distinguished elements to machine \#1.}
(This fits, since $n^{1/2}