\documentclass[11pt]{article}
\usepackage{amsmath,amssymb,amsthm}
\usepackage{algorithm}
\usepackage{algpseudocode}
\usepackage{framed}
\DeclareMathOperator*{\E}{\mathbb{E}}
\let\Pr\relax
\DeclareMathOperator*{\Pr}{\mathbb{P}}
\DeclareMathOperator*{\var}{\mathbf{Var}}
\newcommand{\eps}{\epsilon}
\newcommand{\inprod}[1]{\left\langle #1 \right\rangle}
\newcommand{\R}{\mathbb{R}}
%complexity theory variables.
\newcommand{\Pt}{\ensuremath{\mathbf{P}}}
\newcommand{\NP}{\ensuremath{\mathbf{NP}}}
\newcommand{\PP}{\ensuremath{\mathbf{PP}}}
\newcommand{\BPP}{\ensuremath{\mathbf{BPP}}}
\newcommand{\ZPP}{\ensuremath{\mathbf{ZPP}}}
\newcommand{\RP}{\ensuremath{\mathbf{RP}}}
\newcommand{\coRP}{\ensuremath{\mathbf{coRP}}}
\newcommand{\Ppoly}{\ensuremath{\mathbf{P/poly}}}
\newcommand{\Alg}{\ensuremath{\mathcal{A}}}
\newcommand{\handout}[5]{
\noindent
\begin{center}
\framebox{
\vbox{
\hbox to 5.78in { {\bf CS 388R: Randomized Algorithms } \hfill #2 }
\vspace{4mm}
\hbox to 5.78in { {\Large \hfill #5 \hfill} }
\vspace{2mm}
\hbox to 5.78in { {\em #3 \hfill #4} }
}
}
\end{center}
\vspace*{4mm}
}
\newcommand{\lecture}[4]{\handout{#1}{#2}{#3}{Scribe: #4}{Lecture #1}}
\newtheorem{theorem}{Theorem}
\newtheorem{corollary}[theorem]{Corollary}
\newtheorem{lemma}[theorem]{Lemma}
\newtheorem{observation}[theorem]{Observation}
\newtheorem{proposition}[theorem]{Proposition}
\newtheorem{definition}[theorem]{Definition}
\newtheorem{claim}[theorem]{Claim}
\newtheorem{fact}[theorem]{Fact}
\newtheorem{assumption}[theorem]{Assumption}
% 1-inch margins, from fullpage.sty by H.Partl, Version 2, Dec. 15, 1988.
\topmargin 0pt
\advance \topmargin by -\headheight
\advance \topmargin by -\headsep
\textheight 8.9in
\oddsidemargin 0pt
\evensidemargin \oddsidemargin
\marginparwidth 0.5in
\textwidth 6.5in
\parindent 0in
\parskip 1.5ex
%for wrapping inline math mode.
\let\ab\allowbreak
\begin{document}
\lecture{2 --- 31 August 2015}{Fall 2015}{Prof.\ Eric Price}{Sid Kapur, Neil Vyas}
\section{Overview}
In this lecture we will first introduce some probability machinery: linearity of expectation, the central limit theorem, and the discrete-setting Chernoff bound.
We will then apply these tools to some simple problems: repeated Bernoulli trials and the coupon collecting problem.
Finally, we will discuss how randomized algorithms fit into the landscape of complexity theory.
\section{Food for Thought}
\begin{enumerate}
\item Suppose you flip an unbiased coin 1000 times. How surprised would you be if you observed
\begin{itemize}
\item 500 heads?
\item 510 heads?
\item 600 heads?
\item 1000 heads?
\end{itemize}
\item Suppose you have a biased coin that lands heads with an unknown probability $p$. How many flips will it take to learn $p$ to an error of $\pm \eps$ with probability $1 - \delta$?
\end{enumerate}
\section{Probability Background}
\begin{theorem}
\textbf{Linearity of expectation.}
If $X_1,\dots,X_n$ are random variables, then
$$ \E\left[\sum_{i=1}^n X_i\right] = \sum_{i=1}^n \E[X_i] $$
\end{theorem}
Example: Suppose $X_1,\dots,X_n$ are Bernoulli trials with $p=0.5$. Let $X = \sum_{i=1}^n X_i$. Then
$$ \E[X] = \sum_{i=1}^n \E[X_i] = \frac{n}{2} $$
\begin{definition}
The variance of a random variable $X$, denoted $\var[X]$, is defined as
$$ \var[X] = \E\left[(X - \E[X])^2\right] $$
\end{definition}
\begin{observation}
Note that if $Y_1$ and $Y_2$ are independent random variables with $\E[Y_1] = \E[Y_2] = 0$, then
\begin{align*}
\var[Y_1 + Y_2] &= \E\left[(Y_1 + Y_2)^2\right] \\
&= \E\left[Y_1^2 + Y_2^2 + 2Y_1Y_2\right] \\
&= \E\left[Y_1^2\right] + \E\left[Y_2^2\right] + 2\E[Y_1]\E[Y_2] &\small\text{(since $Y_1$ and $Y_2$ are independent)}\\
&= \E\left[Y_1^2\right] + \E\left[Y_2^2\right] \\
&= \var[Y_1] + \var[Y_2]
\end{align*}
\end{observation}
\begin{claim}\label{claima}
We claim (without proof) that if $Y_1$ and $Y_2$ are independent, then $\var[Y_1 + Y_2] = \var[Y_1] + \var[Y_2]$ (i.e. $\E[Y_1]$ and $\E[Y_2]$ need not be zero).
\end{claim}
Example: Suppose $X_1,\dots,X_n$ are independent Bernoulli trials with $p=0.5$. Let $X = \sum_{i=1}^n X_i$. Then
\begin{align*}
\var[X] &= \sum_{i=1}^n \var[X_i] &\small\text{(by Claim \ref{claima})}\\
&= n\left[ \frac12 \left(\frac12\right)^2 + \frac12 \left(\frac12\right)^2 \right] \\
&= \frac{n}{4}
\end{align*}
If $n=1000$, then $\var[X] = 250$ and the standard deviation of $X = \sqrt{\var[X]} \approx 16$.
\begin{theorem}
\textbf{The central limit theorem}
Suppose $X_1,\dots,X_n$ are independent identically distributed random variables with expected value $\mu$ and variance $\sigma^2$.
Then as $n \to \infty$, the sample average $S_n = \frac{X_1 + \dots + X_n}{n}$ converges to the Gaussian
$$ N\left(\mu, \frac{\sigma^2}{n}\right) $$
Some statements of the central limit theorem give bounds on convergence, but unfortunately these bounds are not tight enough to be useful in this example.
\end{theorem}
By the central limit theorem, as the number of coin tosses goes to infinity, our binomial distribution of coin tosses converges to a Gaussian $N(\mu, \sigma^2)$ where $\mu = \frac{n}{2}$ and $\sigma^2 = \frac{n}{4}$, as we calculated above.
\section{Bounding the Sum of Bernoulli Trials}
Let's return to our coin-tossing problem.
Can we find an upper bound on the probability that the number of heads exceeds a certain number?
\begin{claim}\label{gaussianbound}
If $Z$ is drawn from a Gaussian with mean $\mu$ and variance $\sigma^2$, then
\begin{align*}
\Pr[Z \ge \mu + t] &\le e^{-\frac{t^2}{2\sigma^2}} \quad \forall t \ge 0 \\
\Pr[Z \le \mu - t] &\le e^{-\frac{t^2}{2\sigma^2}} \quad \forall t \ge 0
\end{align*}
(We omit the proof.)
\end{claim}
If our distribution were Gaussian (or if we were willing to invoke the central limit theorem), we could use the bound from Claim \ref{gaussianbound}:
\begin{align*}
&&\Pr[X \ge 500 + \sigma t] &\le e^{-\frac{t^2}{2}} \\
&\Rightarrow &\Pr[X \ge 500 + 16t] &\le e^{-\frac{t^2}{2}} \\
&\Rightarrow &\Pr[X \ge 600] &\le e^{-18} \\
\end{align*}
But unfortunately, we have a binomial distribution, not a Gaussian, and we don't want to use the central limit theorem. What should we do?
\begin{definition}
\textbf{Chernoff bound (for the discrete setting)}
Let $X = \sum_{i=1}^n X_i$, where the $X_i$ take on values in $[0,1]$ and are independent. (The $X_i$ can take on any value between 0 and 1. Also, the $X_i$ need not be identical.)
Then
\begin{align*}
\Pr[X \ge \E[X] + t] &\le e^{-\frac{2t^2}{n}} \\
\Pr[X \le \E[X] - t] &\le e^{-\frac{2t^2}{n}}
\end{align*}
\end{definition}
If we use the Chernoff bound, we get:
\begin{align*}
\Pr[X \ge 500 + \sigma t]
&\le e^{-\frac{2t^2\sigma^2}{n}} \\
&= e^{-\frac{2t^2(n/4)}{n}} &\small\text{(since $\sigma^2 = \frac{n}{4}$)}\\
&= e^{-\frac{t^2}{2}}
\end{align*}
% TODO Did he say that the Chernoff bound is basically tight,
% or that the Gaussian bound is basically tight?
Note that this is the same bound that we got from the Gaussian! So the Chernoff bound is basically tight in this case.
\subsection{Evaluating the Chernoff Bound}
There are a few limitations of the Chernoff bound in this situation.
The main limitation is that it does not depend on the variance of the $X_i$. Suppose each $X_i$ is a Bernoulli trial that succeeds with a very low probability, say,
$$\Pr[X_i = 1] = \frac{1}{10\sqrt{n}}$$
Then
$$ \E[X] = \frac{1}{10\sqrt{n}} \cdot n = \frac{\sqrt{n}}{10} $$
Then the Chernoff bound would give us a $1 - \delta$ confidence interval of
\begin{align*}
&& \Pr[X \ge \E[X] + t] \le e^{-\frac{2t^2}{n}} = \frac{\delta}{2} \\
&\Rightarrow &t = \sqrt{\frac{n\log(2/\delta)}{2}}
\end{align*}
So the Chernoff bound gives us $X \in \E[X] \pm \sqrt{\frac{n\log(2/\delta)}{2}}$ with probability $1 - \delta$.
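If we pick $\delta = 0.10$, then $\log(2/\delta) = \log 20 \approx 3$, so the bound is $X \in \E[X] \pm \sqrt{1.5n}$. Note that the width of our confidence interval, $2\sqrt{1.5n} \approx 2.4\sqrt{n}$, is about 24 times the expected value $\frac{\sqrt{n}}{10}$!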
We know that the bound can be made much tighter here because the variance of the $X_i$ is small. The Chernoff bound doesn't take variance into account. Later in this class, we will encounter Bernstein-type inequalities, which do account for the variance.
Also, note that the endpoints of the confidence interval given by the Chernoff bound can be less than 0 or greater than $n$, which does not make sense in this problem, since $X$ always lies in $[0, n]$.
\subsection{Second Food for Thought}
\textbf{Suppose you have a biased coin that lands heads with an unknown probability $p$. How many flips will it take to learn $p$ to an error of $\pm \eps$ with probability $1 - \delta$?}
We need to find $n$ such that
$$ \Pr[X \ge (p + \eps)n] \le \frac{\delta}{2} $$
By the Chernoff bound,
\begin{align*}
\Pr[X \ge (p + \eps)n] &\le e^{-\frac{2(\eps n)^2}{n}} \\
&= e^{-2\eps^2 n}
\end{align*}
So we must pick $n$ such that
$$ e^{-2\eps^2 n} \le \frac \delta 2 $$
i.e.
$$ n \ge \frac{1}{2\eps^2}\log\left(\frac{2}{\delta} \right) $$
\section{The Coupon Collecting Problem}
\textbf{Suppose there exist $n$ types of coupons, with an infinite supply of each. If you collect coupons uniformly at random, what is the expected number of coupons you must collect to hold at least one of each type?}
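Let $T_i$ denote the expected number of coupons we must collect to obtain a new type, given that we already hold $i$ distinct types.
$T_0$ must be 1, since the next coupon we receive is the first coupon, so it must be a new type.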
$T_{n-1}$ must be $n$, since for each coupon we receive, there is a $\frac{1}{n}$ probability that it is the coupon that we haven't received yet.
In general, $T_{n-k} = \frac n k$ because there are $k$ coupon types remaining, so at each step the probability that we receive a new coupon is $\frac{k}{n}$.
So, by linearity of expectation, the expected total number of coupons $T$ is
$$ T = \sum_{i=0}^{n-1} T_i = \sum_{k=1}^n \frac{n}{k} = nH_n \leq n(\log n + 1) $$
where $H_n = \sum_{i=1}^n \frac{1}{i}$ is the $n$th harmonic number.
\section{Background on Complexity Theory}
Let's begin our whirlwind tour of complexity theory. We'll explore, to varying degrees, \Pt,\ \NP,\ab\ \ZPP,\ \RP,\ \coRP, and \BPP.
We use as supplemental material section 1.5 from \textit{Randomized Algorithms} by Motwani and Raghavan \cite{MR}.
Note that formally, complexity classes are sets of languages, but we'll be looser and also speak in terms of algorithms inhabiting these classes.
\subsection{\Pt: Polynomial-time deterministic algorithms}
These can be described by Turing machines and \textit{languages}. A \textit{language} is a set $\ell \subseteq \{0,1\}^*$ of binary strings. Given an input string $x \in \{0,1\}^n$, a Turing machine either accepts the string, rejects the string, or loops. The language of a Turing machine is the set of strings that the Turing machine accepts.
It is possible to phrase many non-decision problems as decision problems. For example, suppose the original problem is to find an integer $n$ that has some property. The corresponding decision problem would be: Given an input $x$, is $x$ less than or equal to $n$? In this setting, we can use binary search to find $n$.
\begin{definition}
The class \Pt\ consists of all languages $\ell$ that have a polynomial-time algorithm \Alg\ such that for any input $x$,
\begin{align*}
&\text{If $x \in \ell$, $\Alg$ accepts} \\
&\text{If $x \not\in \ell$, $\Alg$ rejects}
\end{align*}
\end{definition}
\subsection{\NP: \Pt\ with a good ``advice'' string}
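Here the ``advice'' given to a Turing machine is an additional input string $y$ (often called a \textit{witness} or \textit{certificate}). Unlike the advice used to define \Ppoly, which may depend only on the length $n$ of the input, this string is allowed to depend on the input $x$ itself.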
\begin{definition}
A language $\ell$ is in $\NP$ if there exists a polynomial-time algorithm $\Alg$ such that for any input $x$,
\begin{align*}
&\text{If $x \in \ell$, there exists an advice string $y$ such that $\Alg$ accepts the input $(x,y)$} \\
&\text{If $x \not\in \ell$, then for all strings $y$, $\Alg$ rejects the input $(x,y)$}
\end{align*}
\end{definition}
For our purposes, the advice string may be arbitrarily long, but the algorithm can only inspect a polynomial-in-the-length-of-the-input amount of it.
Note the resemblance between advice strings and random bits. We will come back to this later.
\subsection{\RP\ and \coRP: One-sided error }
\begin{definition}
An algorithm $\Alg$ is in $\RP$ if it has polynomial worst-case runtime and, for all inputs $x$,
\begin{align*}
&\text{If $x \in \ell$, then $\Pr[\Alg \text{ accepts } x] \ge \frac12 $ } \\
&\text{If $x \not\in \ell$, then $\Pr[\Alg \text{ accepts } x] = 0 $ }
\end{align*}
\end{definition}
\begin{definition}
An algorithm $\Alg$ is in $\coRP$ if it has polynomial worst-case runtime and, for all inputs $x$,
\begin{align*}
&\text{If $x \in \ell$, then $\Pr[\Alg \text{ accepts } x] = 1 $ } \\
&\text{If $x \not\in \ell$, then $\Pr[\Alg \text{ accepts } x] \le \frac12 $ }
\end{align*}
\end{definition}
Note that an \RP\ algorithm corresponds to a Monte Carlo algorithm that can err only when $x \in \ell$, and, similarly, an algorithm in \coRP\ corresponds to a Monte Carlo algorithm that can err only when $x \notin \ell$ \cite{MR}.
\subsection{\ZPP: Zero-error with expected polynomial time}
This complexity class corresponds to Las Vegas-type randomized algorithms. Thus, $\ZPP = \RP \cap \coRP$.
\subsection{\PP\ and \BPP: Two-sided error }
\begin{definition}
An algorithm $\Alg$ is in $\PP$ (probabilistic polynomial time) if it has polynomial worst-case runtime and, for all inputs $x$,
\begin{align*}
&\text{If $x \in \ell$, $\Pr[\Alg \text{ accepts } x] > \frac12$} \\
&\text{If $x \not\in \ell$, $\Pr[\Alg \text{ accepts } x] \le \frac12$}
\end{align*}
\end{definition}
Unfortunately, $\PP$ is not a very useful class. For example, $\NP\subseteq\PP$. To show this we consider SAT, the unrestricted satisfiability problem: Given $f(x_1, \dots, x_n)$, is it satisfiable?
We propose the following algorithm:
% H option ensures that the algorithm is placed "Here",
% i.e. does not float
\begin{algorithm}[H]
\begin{algorithmic}
\Function{RandomizedSAT}{$f: \{\mathrm{True},\mathrm{False}\}^n \to \{\mathrm{True},\mathrm{False}\}$}$: \{\mathrm{True},\mathrm{False}\}$
\ForAll{$i \in \{1 \dots n\}$}
\State $x_i \gets $ Uniform(True, False)
\EndFor
\If {$f(x_1,\dots,x_n) = \mathrm{True}$}
\State \Return True
\Else
\State $b \gets $ Uniform(True, False)
\State \Return $b$
\EndIf
\EndFunction
\end{algorithmic}
\end{algorithm}
Suppose that the input is satisfiable; then $\Pr[\text{RandomizedSAT accepts}] \geq \frac{1}{2} + \frac{1}{2^{n+1}}$.
Similarly, if the input is not satisfiable, $\Pr[\text{RandomizedSAT rejects}] = \frac12$. Thus SAT is in \PP.
Note that since we can reduce every problem in \NP\ to SAT, every problem in \NP\ is in \PP. Thus, \PP\ isn't a very useful class to reason about.
We introduce $\BPP$ in the hopes of remedying this situation.
\begin{definition}
An algorithm $\Alg$ is in $\BPP$ (bounded-error probabilistic polynomial time) if it has polynomial worst-case runtime and, for all inputs $x$,
\begin{align*}
&\text{If $x \in \ell$, $\Pr[\Alg \text{ accepts } x] \ge \frac23$} \\
&\text{If $x \not\in \ell$, $\Pr[\Alg \text{ rejects } x] \ge \frac23$}
\end{align*}
\end{definition}
In the next section, we will show that we can amplify any $\BPP$ algorithm to arbitrary accuracy.
\subsection{Amplification of $\BPP$ algorithms}
Now, let's get back to that last remark; let's amplify the probability of success of a \BPP\ algorithm.
Suppose that we want a $\BPP$ algorithm with $\Pr[\text{accept $x$ if $x\in\ell$}] \ge 1-\delta$ and $\Pr[\text{reject $x$ if $x\not\in\ell$}] \ge 1 - \delta$. We claim that for a \BPP\ algorithm, this can be accomplished with only $O(\log{\frac{1}{\delta}})$ time overhead.
\begin{claim}\label{amplification}
\textbf{Amplification.} Given a $\BPP$ algorithm for some problem that accepts correct answers with probability at least $\frac23$, and rejects incorrect answers with probability at least $\frac23$, there exists a $\BPP$ algorithm that accepts correct answers with probability at least $1 - \delta$ and rejects incorrect answers with probability at least $1 - \delta$, for any $0 < \delta < 1$.
\end{claim}
\begin{leftbar}
\textbf{Proof:} Consider the algorithm $\Alg'$ that runs $k$ independent iterations of $\Alg$, and accepts if the majority of the runs accepted, and rejects if the majority of the runs rejected.
We then employ the Chernoff bound to bound the probability that the majority vote across all iterations is incorrect.
Suppose $x\in\ell$. Then $\mathbb{E}[\text{num. runs that accept } x] \geq \frac{2}{3}k$.
So
\begin{align*}
&&\Pr\left[\text{num accepts }\leq \frac{2}{3}k - t\right] &\leq e^{-{\frac{2t^2}{k}}} &\text{(by Chernoff Bound)} \\
&\Rightarrow&
\Pr\left[\text{num accepts }\leq \frac12k\right] &\leq e^{-{\frac{k}{18}}} &\text{(letting $t = k/6$)} \\
&\Rightarrow &
k &\ge -18\log \delta &\text{(setting $e^{-\frac{k}{18}} \le \delta$)}
\end{align*}
So, the probability that the majority of the $k$ runs falsely reject (and thus $\Alg'$ rejects) when $x \in \ell$ is bounded by $\delta$ if we set $k \ge -18 \log \delta$.
The proof that $\Alg'$ rejects incorrect answers with probability at least $1 - \delta$ is similar.
So, $\Alg'$ is in $\BPP$ with the $1 - \delta$ bounds that we desired, if we set $k$ appropriately.
\end{leftbar}
\subsection{\Ppoly}
\Ppoly\ is the set of all problems that are decidable by deterministic polynomial-time algorithms that can take in a polynomial-length advice string that depends on the size of the input, but not the input itself.
We'll show one relevant theoretical use of \Ppoly\ in a later section.
\subsection{Containments}
It is known that $\Pt\subseteq\ZPP$, but it is not known whether $\ZPP\subseteq\Pt$.
It should also be apparent that $\ZPP = \RP\cap\coRP$, as stated above.
We claim that $\Pt\subseteq\RP\subseteq\NP$, and that $\RP\subseteq\BPP$; here we will only prove that $\RP\subseteq\NP$.
\begin{claim}
$\RP \subseteq \NP$.
\end{claim}
\begin{leftbar}
\textbf{Proof:}
Suppose $\Alg$ is a $\RP$ algorithm for the language $\ell$.
To prove that $\ell \in \NP$, it is sufficient to show that $\Alg$ has polynomial runtime and that:
\begin{enumerate}
\item For every $x \in \ell$, there exists an advice string $r$ such that $\Alg(x,r)$ accepts
\item For every $x \not\in \ell$, for all advice strings $r$, $\Alg(x,r)$ rejects.
\end{enumerate}
(1) is true since, by the definition of $\RP$, there is a random string $r$ such that $\Alg(x,r)$ accepts.
(2) is true since, by the definition of $\RP$, there is no random string $r$ such that $\Alg(x,r)$ accepts.
So $\ell \in \NP$.
\end{leftbar}
It is not known whether the following statements hold:
\begin{itemize}
\item $\BPP\subseteq\NP$ ?
\item $\RP=\ZPP=\coRP$ ?
\end{itemize}
\section{Adleman's Theorem}
\begin{theorem}
\textbf{Adleman's Theorem:} $\BPP\subseteq\Ppoly$.
\end{theorem}
\begin{leftbar}
\textbf{Proof:}
Let $\Alg\in\BPP$.
By Claim \ref{amplification}, there exists an algorithm $\Alg' \in \BPP$ with $\delta = \frac{1}{2^{n+1}}$.
Let $\mathrm{Bad}(x)$ be the set of advice strings $r$ such that $\Alg'(x,r)$ is incorrect.
So, over the advice string $R$,
\begin{align*}
\Pr[\exists x \text{ s.t. $R \in \mathrm{Bad}(x)$}]
&= \Pr\left[\bigcup_{x \in \{0,1\}^n} \{R \in \mathrm{Bad}(x)\} \right] \\
&\le \sum_{x \in \{0,1\}^n} \Pr[R \in \mathrm{Bad}(x)] &\text{(union bound)}\\
&\le 2^n \cdot \frac{1}{2^{n+1}} \\
&= \frac12 < 1
\end{align*}
Thus, there is a positive probability that $\Alg'$ succeeds on every $x$ of length $n$ simultaneously. Hence there exists a fixed string (call it $r_n$), which we use as the advice string, such that for all inputs $x$ of length $n$, $\Alg'(x, r_n)$ gives the correct answer.
So, the algorithm $\Alg'(x, r_n)$ is a polynomial-time deterministic algorithm that, given the appropriate advice string $r_n$, gives the correct answer for all inputs $x$ of length $n$. So $\Alg' \in \Ppoly$.
\end{leftbar}
So, in a sense, randomness doesn't help \textit{that} much. Every problem in $\BPP$ can be solved in polynomial time without randomness (assuming we have an advice string for the $\Ppoly$ algorithm). However, randomized algorithms often give us a smaller polynomial runtime.
\bibliographystyle{alpha}
\begin{thebibliography}{42}
% \bibitem[AMS99]{AMS99}
% Noga~Alon, Yossi~Matias, Mario~Szegedy.
% \newblock The Space Complexity of Approximating the Frequency Moments.
% \newblock {\em J. Comput. Syst. Sci.}, 58(1):137--147, 1999.
\bibitem[MR]{MR}
Rajeev~Motwani and Prabhakar~Raghavan.
\newblock Randomized Algorithms.
\newblock {\em Cambridge University Press}, 0-521-47465-5, 1995. %pub - isbn - year
%@book{Motwani:1995:RA:211390,
% author = {Motwani, Rajeev and Raghavan, Prabhakar},
% title = {Randomized Algorithms},
% year = {1995},
% isbn = {0-521-47465-5, 9780521474658},
% publisher = {Cambridge University Press},
% address = {New York, NY, USA},
%}
%notes on BPP, amplification, Ppoly, adlemans theorem:
%http://people.csail.mit.edu/madhu/ST07/scribe/lect11.pdf
\end{thebibliography}
\end{document}