\documentclass[11pt]{article}
\usepackage{amsmath,amssymb,amsthm,url,dsfont}
\DeclareMathOperator*{\E}{\mathbb{E}}
\let\Pr\relax
\DeclareMathOperator*{\Pr}{\mathbb{P}}
\newcommand{\eps}{\epsilon}
\newcommand{\inprod}[1]{\left\langle #1 \right\rangle}
\newcommand{\R}{\mathbb{R}}
\newcommand{\handout}[5]{
\noindent
\begin{center}
\framebox{
\vbox{
\hbox to 5.78in { {\bf CS 388R: Randomized Algorithms } \hfill #2 }
\vspace{4mm}
\hbox to 5.78in { {\Large \hfill #5 \hfill} }
\vspace{2mm}
\hbox to 5.78in { {\em #3 \hfill #4} }
}
}
\end{center}
\vspace*{4mm}
}
\newcommand{\lecture}[4]{\handout{#1}{#2}{#3}{Scribe: #4}{Lecture #1}}
\newtheorem{theorem}{Theorem}
\newtheorem{corollary}[theorem]{Corollary}
\newtheorem{lemma}[theorem]{Lemma}
\newtheorem{observation}[theorem]{Observation}
\newtheorem{proposition}[theorem]{Proposition}
\newtheorem{definition}[theorem]{Definition}
\newtheorem{claim}[theorem]{Claim}
\newtheorem{fact}[theorem]{Fact}
\newtheorem{assumption}[theorem]{Assumption}
% 1-inch margins, from fullpage.sty by H.Partl, Version 2, Dec. 15, 1988.
\topmargin 0pt
\advance \topmargin by -\headheight
\advance \topmargin by -\headsep
\textheight 8.9in
\oddsidemargin 0pt
\evensidemargin \oddsidemargin
\marginparwidth 0.5in
\textwidth 6.5in
\parindent 0in
\parskip 1.5ex
\urldef{\Reservoir}\url{https://en.wikipedia.org/wiki/Reservoir_sampling#Reservoir_with_Random_Sort}
\begin{document}
\lecture{19 --- Nov 11, 2015}{Fall 2015}{Prof.\ Eric Price}{Xiangru Huang,
Qi Lei}
\section{Overview}
In previous lectures, we introduced Count-Min Sketch for finding $l_1$ heavy hitters. %\footnote{Is there anything else?}.
In this lecture, we will introduce Count Sketch and $l_0$ sampling.
\section{Review Count-Min sketch}
First let's review Count-Min sketch.
We have a stream of insertions and deletions $(u, \Delta x_u)$ within the ``strict'' turnstile model (i.e.\ $x_u \geq 0$) and want to recover $\hat{x}_u$.
Count-Min sketch works by maintaining $r = \log \frac{|U|}{\delta}$ bloom filters, each of size $O(k)$, and guarantees
$$
|\hat{x}_u - x_u| \leq \frac{\|x_{-k}\|_1}{k}
$$
where $x_{-k}$ is the vector after deleting the top $k$ elements from $x$.
\section{Count sketch}
Before introducing Count sketch, let's consider a medium case algorithm.
\subsection{Count-Median sketch}
Recall that in Count-Min sketch, we have hash functions $h_i$, and the counters in the bloom filters look like
$$
y_{i,v} = \sum_{u, h_i(u) = v} x_u
$$
we get one estimate from each bloom filter
$$
\tilde{x}_u^{(i)} = y_{i, h_i(u)}
$$
and each estimate is good, in the sense that with probability at least $\frac{1}{2}$
\begin{equation} \label{eq1}
|\tilde{x}_u^{(i)} - x_u| \leq \frac{\|x_{-k}\|_1}{k}
\end{equation}
So then we take the min of all estimates
$$
\tilde{x}_u = \min_i \tilde{x}_u^{(i)}
$$
we have with high probability
$$
|\tilde{x}_u - x_u| \leq \frac{\|x_{-k}\|_1}{k}
$$
Now for the turnstile model (not necessarily strict), we cannot use $\min$ any more, and a natural idea would be to change $\min$ to median.
However, if we change \textbf{min} to \textbf{median}, we will not get the same bound, since the expected number of estimates not satisfying Eqn.~\eqref{eq1} is $\frac{r}{2}$, which effectively makes the median not good. But since the size of each bloom filter is $O(k) = C k$, we can choose the parameter $C$ larger so that the probability for each estimate to satisfy Eqn.~\eqref{eq1} is $\frac{3}{4}$.
Then this modified algorithm (Count-Median) will work similarly as Count-Min.
\subsection{Count sketch}
The idea of Count sketch is to symmetrize the noise (in Count-Min we have all positive noise).
As in Count sketch, we define
\begin{itemize}
\item pair-wise independent hash functions $h_i : U \rightarrow [c]$, and random signs $s_i(u) \in \{\pm 1 \}$
\item counters in bloom filter
$$
y_{i,v} = \sum_{u, h_i(u) = v} s_i(u) x_u
$$
\item estimate from one bloom filter $\tilde{x}_u^{(i)} = s_i(u) y_{i, h_i(u)}$
\item estimate of $x_u$ is $\tilde{x}_u = \mbox{ median}_i ~ \tilde{x}_u^{(i)}$
\end{itemize}
Now we give the bounds of Count sketch.
For each $u$, first we condition on $u$ not colliding with any of the top $|H|$ elements under $h_i$, where $H$ denotes the set of the top $k$ elements; this happens with probability at least $1 - \frac{|H|}{c}$. Then
\begin{align*}
(\tilde{x}_u^{(i)} - x_u )^2 & = \sum_{u' \not \in H \cup \{u\}} x_{u'}^2 \mathds{1}_{h_i(u) = h_i(u')} + \sum_{\substack{u_1 \neq u_2 \\ u_1, u_2 \not \in H \cup \{u\}}} s_i(u_1) s_i(u_2) \mathds{1}_{h_i(u_1) = h_i(u) = h_i(u_2)}x_{u_1} x_{u_2}
\end{align*}
The expectation of the second term is zero since, for $u_1 \neq u_2$,
$$
\E[s_i(u_1)s_i(u_2)] = \E[s_i(u_1)]\E[s_i(u_2)] = 0
$$
Hence we have
\begin{align*}
\E (\tilde{x}_u^{(i)} - x_u )^2 & = \E [\sum_{u' \not \in H \cup \{u\}} x_{u'}^2] \frac{1}{c} \\
& \leq \frac{\| x_{-k}\|^2}{c}
\end{align*}
Therefore $(\tilde{x}_u^{(i)}-x_u)^2\leq \frac{8\| x_{-k}\|^2}{c}$ with probability $\geq \frac{7}{8}$.
\subsection{Comparison with Count-Min sketch}
In general, the $l_2$ norm guarantee is better than the $l_1$ norm guarantee.
For example, consider power law distribution with parameter $\alpha$, the $i$-th largest element has frequency proportional to $i^{-\alpha}$. With this kind of distribution, Count-Min is good for $\alpha > 1$, Count sketch is good for $\alpha > \frac{1}{2}$.
\section{$l_0$ sampling}
Given a stream of items $v_1, v_2, \ldots$, we would like to sample one element from $U$, where $U$ contains all the distinct elements that appear in the stream.
To be specific, we want to design an online algorithm using sublinear space. Therefore storing $U$ is not realistic.
Here we consider three versions of this problem.
\subsection{First version}
In the first version, each element appears exactly once. We can use Reservoir Sampling\footnote{\Reservoir}, which basically works as follows:
\begin{itemize}
\item Set the first element as candidate.
\item For $i \geq 2$, switch the $i$-th element with the candidate with probability $\frac{1}{i}$.
\item Output candidate after the stream.
\end{itemize}
\subsection{Second version}
In the second version, we have duplicates of the same elements in the stream.
For this version, we can pick a uniformly random hash function $h$ such that
$$
h: U \rightarrow [0, 1]
$$
Then we store the element $v$ that minimizes $h(v)$, and output $v$.
If $h$ is fully independent, we get perfect performance
\begin{itemize}
\item $O(1)$ words
\item $O(1)$ time for each element in the stream and storing and evaluating $h$
\item the output $v$ is uniformly random from $U$
\end{itemize}
But full independence is not practical. So we have an alternative reasoning:
\begin{itemize}
\item We can use min-wise independence from \cite{BCF00}, which basically means
$$
\forall S, x \in S, ~\Pr[h(x) = \min_{x' \in S} h(x')] = \frac{1 \pm \eps}{|S|}
$$
\item Theorem 1.1 in \cite{I01} shows that $O(\log \frac{1}{\eps})$-wise independence implies approximate min-wise independence for $|S| < \eps |U|$
\end{itemize}
\subsection{Third version}
In this version, we have deletions in the stream. To solve this problem, we first introduce a few building blocks.
The high level idea is to subsample the stream so that each substream contains one unique element with good probability. And recover the element back using certain simple techniques.
\subsubsection{subsampling}
Similar to the way we recover shortest paths in lecture 15, we first get a 2-approximation of $\|x\|_0$ by choosing $\log |U|$ different values of $r$, namely $1, 2, 4, \ldots, |U|$. For each $r$, we choose $\log |U|$ different hash functions $h_i$ where $h_i : U \rightarrow [r]$ and keep the samples with $h_i(v) = 0$. Each hash function corresponds to one subsample set.
As we did in lecture 15, if $r$ is a 2-approximation to the true $\|x\|_0$, i.e., the number of distinct elements of the stream, then with probability at least $\frac{1}{2e}$ we have exactly one unique element in a given subsample set. Since we choose $O(\log |U|)$ subsample sets, with high probability $1-|U|^{-c}$ we will get exactly one unique element in some subsample set.
\subsubsection{Check if only one element remains}
Although each time with probability $\geq \frac{1}{2e}$ only one element remains from the sketching process, we need to check whether this is actually the case so that we can recover and output the element. And unlike in lecture 15, this process is much trickier.
Now let $x$ be a subsample vector (its coordinates correspond to the number of occurrences of each element in this subsample set).
We want to be able to check if $\|x\|_0 = 1$ w.h.p to verify if we succeed in subsampling.
This can be further divided into two tasks.
\begin{itemize}
\item We first check if $\|x\|_0 \geq 1$. To do so, we randomly pick $v \in \{\pm 1\}^{|U|}$ and check if $v^T x = 0$. For some $x_i \neq 0$
$$
v^T x = 0 \Leftrightarrow -v_{-i}^T x_{-i} = v_ix_i \mbox{ ~~~($v_{-i}$ is the vector $v$ without coordinate $i$) }
$$
since $v_i$ is randomly picked from $\{\pm1 \}$, the probability of a false positive (i.e.\ $v^T x = 0$ for some $x \neq 0$) is at most $\frac{1}{2}$. Therefore by repeating $\log |U|$ times, we succeed with high probability.
\item Suppose we know $\|x\|_0 \geq 1$; we want to know if there is more than one element in this subsample set. To do this, we randomly split $x$ into $x_1$ and $x_2$ and use the method described in the first part above. If both $v_1^T x_1 \neq 0$ and $v_2^T x_2 \neq 0$ happen, we are certain that $\|x\|_0 \geq 2$. And we fail to detect this with probability at most $\frac{7}{8}$.\footnote{To be specific, with probability $\frac{1}{2}$, the two unique elements are not both in $x_1$ or both in $x_2$. Then with probability $\frac{1}{4}$ there is no false positive on either $x_1$ or $x_2$. This means we succeed with probability $\frac{1}{8}$.} So again by repeating this $\log |U|$ times, we succeed in this task with high probability.
\end{itemize}
\subsubsection{recover the index of the unique element}
If for some subsample vector $x$, $\|x\|_0 = 1$, we can use a simple technique to recover the non-zero coordinate(i.e. the unique element) $i^*$
\begin{itemize}
\item Pick $v' = (1, \ldots, 1)$ and $v = (1,2,3,\ldots, |U|)$
\item Maintain $v^Tx$ and $(v')^T x$ during the stream.
\item We have
$$
v^Tx = x_{i^*} i^*, ~~ (v')^T x = x_{i^*}
$$
So we output $i^* = \frac{(v^Tx)}{(v')^T x}$
\end{itemize}
\bibliographystyle{alpha}
\begin{thebibliography}{42}
% \bibitem[AMS99]{AMS99}
% Noga~Alon, Yossi~Matias, Mario~Szegedy.
% \newblock The Space Complexity of Approximating the Frequency Moments.
% \newblock {\em J. Comput. Syst. Sci.}, 58(1):137--147, 1999.
\bibitem[I01]{I01}
Piotr Indyk.
\newblock A small approximately min-wise independent family of hash functions.
\newblock {\em Journal of Algorithms}, 38.1 (2001): 84-90.
\bibitem[BCF00]{BCF00}
Broder, Andrei Z., et al.
\newblock Min-wise independent permutations.
\newblock {\em Journal of Computer and System Sciences}, 60.3 (2000): 630-659.
\end{thebibliography}
\end{document}