\documentclass[11pt]{article}
\usepackage{amsmath,amssymb,amsthm}
\usepackage[linesnumbered,ruled]{algorithm2e}
\usepackage{graphicx}
\DeclareMathOperator*{\E}{\mathbb{E}}
\let\Pr\relax
\DeclareMathOperator*{\Pr}{\mathbb{P}}
\newcommand{\eps}{\epsilon}
\newcommand{\inprod}[1]{\left\langle #1 \right\rangle}
\newcommand{\R}{\mathbb{R}}
\newcommand{\handout}[5]{
\noindent
\begin{center}
\framebox{
\vbox{
\hbox to 5.78in { {\bf CS 388R: Randomized Algorithms } \hfill #2 }
\vspace{4mm}
\hbox to 5.78in { {\Large \hfill #5 \hfill} }
\vspace{2mm}
\hbox to 5.78in { {\em #3 \hfill #4} }
}
}
\end{center}
\vspace*{4mm}
}
\newcommand{\lecture}[4]{\handout{#1}{#2}{#3}{Scribe: #4}{Lecture #1}}
\newtheorem{theorem}{Theorem}
\newtheorem{corollary}[theorem]{Corollary}
\newtheorem{lemma}[theorem]{Lemma}
\newtheorem{observation}[theorem]{Observation}
\newtheorem{proposition}[theorem]{Proposition}
\newtheorem{definition}[theorem]{Definition}
\newtheorem{claim}[theorem]{Claim}
\newtheorem{fact}[theorem]{Fact}
\newtheorem{assumption}[theorem]{Assumption}
\newtheorem{remark}[theorem]{Remark}
% 1-inch margins, from fullpage.sty by H.Partl, Version 2, Dec. 15, 1988.
\topmargin 0pt
\advance \topmargin by -\headheight
\advance \topmargin by -\headsep
\textheight 8.9in
\oddsidemargin 0pt
\evensidemargin \oddsidemargin
\marginparwidth 0.5in
\textwidth 6.5in
\parindent 0in
\parskip 1.5ex
\renewcommand{\.}{,\ldots,}
\begin{document}
\sloppy
\lecture{17 --- Nov 4, 2015}{Fall 2015}{Prof.\ Eric Price}{Enxu Yan, Fu Li}
\section{Sampling}
\textbf{Example:} estimating $\pi$:
\begin{itemize}
\item Choose $x,y\in[-1,1]$ at random. Check if $x^2+y^2\leq 1$.
\item The fraction of samples satisfying $x^2+y^2\leq1$ is an estimate of the true probability $\frac{\pi}{4}$.
\end{itemize}
\textbf{Question:} how many samples do we need? In general, suppose we are getting samples from an unknown set with a $p$ fraction of elements having some property. How many samples are needed to estimate $p$ with an estimator satisfying $\tilde{p}=(1\pm\epsilon)p$ with probability $1-\delta$, that is, an $(\epsilon,\delta)$ approximation?
Let's say we draw
\begin{itemize}
\item $n$ samples and let $Z$ be the number of samples with the property.
\item We have $E[Z]=pn$.
\item By the Chernoff bound,
\begin{align*}
P[Z\geq pn+t] &\leq e^{-2t^2/n} &\\
&\leq e^{-2\epsilon^2p^2n} &\text{(with } t=\epsilon p n\text{)}\\
& \Rightarrow \text{ needs } n \geq \frac{1}{\epsilon^2p^2}\log\left(\frac{1}{\delta}\right) \text{ to obtain an } (\epsilon,\delta) \text{ approx.} &
\end{align*}
\item When $p$ is small, the Chernoff bound above is loose. Let's try a Bernstein-style bound. By Theorem 6 of Lecture 6's notes, since $Z=\sum_{i} Z_i$ with $Z_i\in[0,1]$ and variance $p(1-p)\leq p$, each $Z_i$ is subgamma with $(\sigma^2=2p,B=1/2)$ and $Z$ is subgamma with $(\sigma^2=2np,B=1/2)$. Therefore, we have
\begin{align*}
P[Z\geq pn+t] \leq \max\left\{ e^{-\frac{t^2}{4pn}} ,\; e^{-t/4}\right\}.
\end{align*}
Setting $t= \sqrt{pn\log\frac{1}{\delta}}+\log\frac{1}{\delta}$, we have with probability $1-\delta$ that
$$
Z \leq pn\left(1+\sqrt{\frac{\log\frac{1}{\delta}}{pn}}+\frac{\log\frac{1}{\delta}}{pn}\right)
\;\Rightarrow\;
Z\leq pn\left(1+\mathcal{O}(\epsilon)\right) \;\text{ if }\; pn \geq \frac{1}{\epsilon^2}\log\frac{1}{\delta},
$$
which means we only need
$$
n\geq\frac{1}{p\epsilon^2}\log\frac{1}{\delta}
$$
to get an $(\epsilon,\delta)$ approximation, a tighter result than the one from Chernoff.
\end{itemize}
\textbf{Question:} What if we don't know $p$?
\begin{itemize}
\item After $n\geq\frac{1}{p\epsilon^2}\log\frac{1}{\delta}$ steps, we have $Z=pn(1\pm \epsilon)$ true with prob. $1-\delta$.
\item Now suppose we run the experiment until getting $\mu\approx \frac{1}{\epsilon^2}\log\frac{1}{\delta}$ number of hits, and we output
$$
\tilde{p}=\frac{\mu}{\tilde{n}},
$$
where random variable $\tilde{n}$ is the number of samples we draw. We have $\tilde{n}=\frac{\mu}{\tilde{p}}\in\frac{\mu}{p}[\frac{1}{1+\epsilon},\frac{1}{1-\epsilon}]$ with high probability based on previous analysis.
\item On the other hand, from previous results (from Bernstein), for any $n'$, we have
$$
Z=pn'\left(1 \pm O(\sqrt{\frac{\log\frac{1}{\delta}}{n'p}}) \right)
$$
with probability $1-\delta$. Now if $n'\in \frac{\mu}{p}[\frac{1}{1+\epsilon},\frac{1}{1-\epsilon}]$, we have
$$
Z=n'p\left(1\pm \epsilon \frac{1}{\sqrt{1-\epsilon}}\right)=n'p(1\pm \mathcal{O}(\epsilon))
$$
as desired.
\end{itemize}
\begin{figure}
\centering
\includegraphics[width=0.6\linewidth]{RA_fig1.png}
\caption{Dynamics of $Z$ as $n$ keeps increasing.}
\label{fig:idea}
\end{figure}
\section{Median finding}
Given the item set $\{x_1\. x_n\}$, we want to find the median value as soon as possible. First of all, we can come up with the following methods:
\begin{enumerate}
\item Quicksort: $O(n\log n)$.
That is, choose a random $x_i$ and then recursively sort $\{x_j|x_j\leq x_i\}$ and $\{x_k|x_k>x_i\}$.
\item Randomized select: $O(n)$.
Modify Quicksort to find the $i^{th}$ biggest item. That is, select $x_i$ randomly and recurse on one side. This runs for $O(\log n)$ rounds, and the expected time is $E[\text{time}]=n+\frac{3}{4}n+\left(\frac{3}{4}\right)^2 n+\ldots=O(n)$. Note that, to get a concentration result, we need a bit more work.
\item Deterministic select: $O(n)$
If we first partition the items into groups of $5$ and then apply the same divide and conquer trick, we can get a deterministic algorithm with running time $O(n)$.
\end{enumerate}
In the following, we will show
\begin{theorem}
There is a randomized algorithm that finds the median in $\frac{3}{2}n+o(n)$ time with high probability.
\end{theorem}
\begin{proof}
We design the claimed algorithm in the following three steps:
\begin{enumerate}
\item
Sample $s$ items from the $n$ items where $s=\sqrt n$.
Let $S$ denote the set of these $s$ sampled items. Then, we sort $S$ and can find its median $S_m=\mathrm{median}(S)$. However, we cannot guarantee that $S_m$ is the median of the original set with high probability.
\item Find $L,H\in S$ such that $\text{median}\in [L,H]$ with probability $1-\delta$ and the number of elements in $[L,H]$ is $o(n)$.
Instead of directly using the median $S_m$ of $S$, we consider the number of elements in $S$ that rank $\leq (1/2-\alpha)n$ in the original set. Note that, by the discussion in the first section, we know
$$\Pr[Z\geq (1/2-\alpha)s+t]\leq \exp(-2t^2/s).$$
Let $t=\alpha s$, then
$$\Pr[Z\geq s/2]\leq \exp(-2\alpha^2 s).$$
Namely, the point in the sample with rank $\beta s$ has rank within $(\beta\pm \sqrt{\frac{\log 1/\delta}{s}})n$ in the original set with probability $1-\delta$. Thus, let $L$ be the sample in $S$ of rank $(1/2-\sqrt{\frac{\log 1/\delta}{s}})s$ and let $H$ be the sample in $S$ of rank $(1/2+\sqrt{\frac{\log 1/\delta}{s}})s$, which satisfies our requirement for Step 2.
\item With the $L,H$, we can find the median by scanning the original set, which is stated as the Algorithm \ref{alg:1}.
\begin{algorithm}
\caption{Scan the item set with $L,H$}\label{alg:1}
\KwIn{$L, H$ and $n$ items $x_1\.x_n$}
\KwOut{The median of $\{x_1\.x_n\}$}
Initially, let $count_L=count_H=0$ and $mid=[]$\;
\For{each element $x_i$}{
  \uIf{$x_i < L$}{
    $count_L$++\;
  }
  \uElseIf{$x_i > H$}{
    $count_H$++\;
  }
  \Else{
    insert $x_i$ into $mid$ \tcp*[r]{$x_i\in[L,H]$}
  }
}
Sort $mid$\;
\Return the $(n/2-count_L)^{th}$ smallest item in the $mid$.
\end{algorithm}
\end{enumerate}
Finally, we consider the running time. We first spend $s\log s=\sqrt{n}\log \sqrt{n}=o(n)$ on sorting the items in $S$. Then notice that with high probability $1-\delta$, $|\{ x_i \mid x_i<L\}|\leq n/2$, $|\{ x_i \mid x_i>H\}|\leq n/2$, and $|mid|\leq 4\sqrt{s\log 1/\delta }$. For simplicity, we can let $\delta=2^{-n^{2/3}}$, so that $|mid|=o(n)$ with high probability. So now the running time of Algorithm \ref{alg:1} is at most $3n/2+|mid|\log |mid|=3n/2+o(n)$. Thus, in total, the running time of the whole randomized algorithm is $3n/2+o(n)$ with high probability $1-2^{-n^{2/3}}$.
\end{proof}
\section{Streaming Algorithm}
\begin{itemize}
\item See $v_1$, $v_2$, ...., $v_m$.
\item Let $x_u=$ number of times $i$ that $v_i=u$.
\item $n=$ number of different $u$.
\item Both $m$, $n$ are very large, and the goal is to get some statistics from $v_1$,...,$v_m$ with $o(m)$, $o(n)$ space.
\end{itemize}
\textbf{Example:} find heavy hitters
For some $\alpha$ with $1/\alpha \ll m,n$, find $S$ s.t.
$$
\{u|x_u \geq \alpha m\} \subseteq S \subseteq \{u|x_u\geq \frac{\alpha}{2}m\}
$$
with output space $|S| \leq \frac{1}{\alpha}$.
Suppose we know $m$, we can
\begin{itemize}
\item[1] Sample randomly, keep counters w/in samples.
\item[2] Hash and discard when counters small. There is a well-known deterministic algorithm for this, called \emph{Misra-Gries}, which is a generalization of the linear-time \emph{Majority Algorithm} to find all items of frequency larger than $m/k$.
\end{itemize}
\textbf{\emph{Misra-Gries} Algorithm} works as follows:
\begin{itemize}
\item At each stage, we maintain a map of at most $k-1$ pairs of (item,counter) as the $k-1$ candidates of frequent items. (Note we cannot have more than $k-1$ frequent items of frequency $>m/k$.).
\item In the beginning, the map is initialized as empty.
\item For each new incoming $v_i$,
\begin{itemize}
\item[i] if item $v_i$ is in the map, increment its counter.
\item[ii] Otherwise, if the map has fewer than $k-1$ pairs, insert $(v_i,1)$ into the map.
\item[iii] Otherwise (the map is full), decrement every counter by one and remove the pairs whose counter reaches $0$. Each such decrement step discards $k$ distinct elements of the stream, so the counter of an item with frequency $>m/k$ cannot be decremented to $0$.
\item We can remove false positive if having a second pass on $v_1$,...,$v_m$.
\end{itemize}
\bibliographystyle{alpha}
\begin{thebibliography}{42}
\end{thebibliography}
\end{document}