├── .gitignore ├── README.md ├── discrete-distribution.cc └── doc ├── algorithm.tex └── biblio.bib /.gitignore: -------------------------------------------------------------------------------- 1 | *.out 2 | 3 | # LaTeX and BibTex temporary files 4 | doc/*.aux 5 | doc/*.bbl 6 | doc/*.blg 7 | doc/*.log 8 | doc/*.nav 9 | doc/*.out 10 | doc/*.pdf 11 | doc/*.toc 12 | doc/*.spl 13 | 14 | # Vim temporary files 15 | *.swp 16 | *.swo 17 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # discrete-distribution 2 | 3 | Fast algorithm for sampling from discrete distributions. 4 | 5 | Generating a sample takes O(1) time. This is in contrast with the naive 6 | algorithm that takes O(log N) time to generate a sample, where N is the size of 7 | the support of the distribution. The naive algorithm is commonly used in many 8 | implementations of the C++ standard library (clang, GCC). 9 | 10 | The details of the algorithm are described in the `doc/` directory. 11 | 12 | The ultimate goal of the project is to make the implementation conform to the C++ 13 | ISO standard and have it accepted into major open source implementations (clang, 14 | GCC). 15 | -------------------------------------------------------------------------------- /discrete-distribution.cc: -------------------------------------------------------------------------------- 1 | // C++ implementation of a fast algorithm for generating samples from a 2 | // discrete distribution. 3 | // 4 | // David Pal, December 2015 5 | // 6 | // To compile the program run: 7 | // 8 | // g++ -Wall -Wextra -Werror -std=c++11 discrete-distribution.cc 9 | 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | 20 | using std::cout; 21 | using std::endl; 22 | 23 | namespace { 24 | // Stack that does not own the underlying storage. 
25 | template 26 | class stack_view { 27 | public: 28 | stack_view(const BidirectionalIterator base) 29 | : base_(base), top_(base) { }; 30 | 31 | void push(const T& element) { 32 | *top_ = element; 33 | ++top_; 34 | } 35 | 36 | T pop() { 37 | --top_; 38 | return *top_; 39 | } 40 | 41 | bool empty() { 42 | return top_ == base_; 43 | } 44 | 45 | private: 46 | const BidirectionalIterator base_; 47 | BidirectionalIterator top_; 48 | }; 49 | } 50 | 51 | template 52 | class fast_discrete_distribution { 53 | public: 54 | typedef IntType result_type; 55 | 56 | fast_discrete_distribution(const std::vector& weights) 57 | : uniform_distribution_(0.0, 1.0) { 58 | normalize_weights(weights); 59 | create_buckets(); 60 | } 61 | 62 | result_type operator()(std::default_random_engine& generator) { 63 | const double number = uniform_distribution_(generator); 64 | size_t index = floor(buckets_.size() * number); 65 | 66 | // Clamp the index so it never runs past the last bucket. TODO: This is probably not necessary? 67 | if (index >= buckets_.size()) index = buckets_.size() - 1; 68 | 69 | const Bucket& bucket = buckets_[index]; 70 | if (number < std::get<2>(bucket)) 71 | return std::get<0>(bucket); 72 | else 73 | return std::get<1>(bucket); 74 | } 75 | 76 | result_type min() const { 77 | return static_cast(0); 78 | } 79 | 80 | result_type max() const { 81 | return probabilities_.empty() 82 | ? static_cast(0) 83 | : static_cast(probabilities_.size() - 1); 84 | } 85 | 86 | std::vector probabilities() const { 87 | return probabilities_; 88 | } 89 | 90 | void reset() { 91 | // Empty 92 | } 93 | 94 | void PrintBuckets() { 95 | cout << "buckets.size() = " << buckets_.size() << endl; 96 | for (auto bucket : buckets_) { 97 | cout << std::get<0>(bucket) << " " 98 | << std::get<1>(bucket) << " " 99 | << std::get<2>(bucket) << " " 100 | << endl; 101 | } 102 | } 103 | 104 | private: 105 | // TODO: Figure out how to replace size_t in Segment with result_type. 106 | // GCC 4.8.4 refuses to compile it. 
107 | typedef std::pair Segment; 108 | typedef std::tuple Bucket; 109 | 110 | void normalize_weights(const std::vector& weights) { 111 | const double sum = std::accumulate(weights.begin(), weights.end(), 0.0); 112 | probabilities_.reserve(weights.size()); 113 | for (auto weight : weights) { 114 | probabilities_.push_back(weight / sum); 115 | } 116 | } 117 | 118 | void create_buckets() { 119 | const size_t N = probabilities_.size(); 120 | if (N <= 0) { 121 | buckets_.emplace_back(0, 0, 0.0); 122 | return; 123 | } 124 | 125 | // Two stacks in one vector. The first stack grows from the beginning of the 126 | // vector. The second stack grows from the end of the vector. 127 | std::vector segments(N); 128 | stack_view::iterator> 129 | small(segments.begin()); 130 | stack_view::reverse_iterator> 131 | large(segments.rbegin()); 132 | 133 | // Split probabilities into small (below 1/N) and large (at least 1/N) 134 | result_type i = 0; 135 | for (auto probability : probabilities_) { 136 | if (probability < (1.0 / N)) { 137 | small.push(Segment(probability, i)); 138 | } else { 139 | large.push(Segment(probability, i)); 140 | } 141 | ++i; 142 | } 143 | 144 | buckets_.reserve(N); 145 | 146 | i = 0; 147 | while (!small.empty() && !large.empty()) { 148 | const Segment s = small.pop(); 149 | const Segment l = large.pop(); 150 | 151 | // Create a mixed bucket: the small outcome below the threshold, the large outcome at or above it 152 | buckets_.emplace_back(s.second, l.second, 153 | s.first + static_cast(i) / N); 154 | 155 | // Calculate the length of the left-over segment 156 | const double left_over = s.first + l.first - static_cast(1) / N; 157 | 158 | // Re-insert the left-over segment 159 | if (left_over < (1.0 / N)) 160 | small.push(Segment(left_over, l.second)); 161 | else 162 | large.push(Segment(left_over, l.second)); 163 | 164 | ++i; 165 | } 166 | 167 | // Create pure buckets 168 | while (!large.empty()) { 169 | const Segment l = large.pop(); 170 | // The last argument is irrelevant as long as it's not a NaN. 
171 | buckets_.emplace_back(l.second, l.second, 0.0); 172 | } 173 | 174 | // This loop can be executed only due to numerical inaccuracies. 175 | // TODO: Find an example where it actually happens. 176 | while (!small.empty()) { 177 | const Segment s = small.pop(); 178 | cout << "Here" << endl; 179 | // The last argument is irrelevant as long as it's not a NaN. 180 | buckets_.emplace_back(s.second, s.second, 0.0); 181 | } 182 | } 183 | 184 | // Uniform distribution over the interval [0,1). 185 | std::uniform_real_distribution uniform_distribution_; 186 | 187 | // Normalized probabilities (each weight divided by the sum of the weights) 188 | std::vector probabilities_; 189 | std::vector buckets_; 190 | }; 191 | 192 | void Test(const std::vector& weights, const size_t num_samples) { 193 | std::default_random_engine generator; 194 | fast_discrete_distribution distribution(weights); 195 | distribution.PrintBuckets(); 196 | 197 | std::vector counts(weights.size(), 0); 198 | for (size_t i = 0; i < num_samples; ++i) { 199 | const int number = distribution(generator); 200 | assert(number >= 0); 201 | assert(number < static_cast(weights.size())); 202 | ++counts[number]; 203 | } 204 | 205 | std::cout << "counts:" << std::endl; 206 | for (size_t i = 0; i < weights.size(); ++i) 207 | cout << i << " (" << weights[i] << ") : " 208 | << std::string(counts[i], '*') << endl; 209 | 210 | cout << endl; 211 | } 212 | 213 | void TestEmpty(const size_t num_samples) { 214 | std::default_random_engine generator; 215 | fast_discrete_distribution distribution({}); 216 | distribution.PrintBuckets(); 217 | 218 | for (size_t i = 0; i < num_samples; ++i) { 219 | const int number = distribution(generator); 220 | assert(number == 0); 221 | } 222 | } 223 | 224 | int main() { 225 | TestEmpty(100); 226 | Test({0}, 100); 227 | Test({1}, 100); 228 | Test({1, 1}, 200); 229 | Test({1, 1, 1}, 300); 230 | Test({1, 1, 2}, 300); 231 | Test({1, 0, 2}, 300); 232 | Test({20, 10, 30}, 300); 233 | Test({0, 1e-20, 0}, 100); 234 | Test({1 - 1e-10, 1 - 1e-10, 1 - 1e-10}, 
100); 235 | Test({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25}, 236 | 100000000); 237 | 238 | std::discrete_distribution distribution({10.0, 20.0, 30.0}); 239 | cout << distribution << endl; 240 | return 0; 241 | } 242 | -------------------------------------------------------------------------------- /doc/algorithm.tex: -------------------------------------------------------------------------------- 1 | % LaTeX document describing the sampling algorithm. 2 | % 3 | % You will need a recent LaTeX distribution installed to generate a PDF file. 4 | % Run the following commands: 5 | % 6 | % pdflatex algorithm.tex 7 | % bibtex algorithm.aux 8 | % pdflatex algorithm.tex 9 | % pdflatex algorithm.tex 10 | % 11 | 12 | \documentclass{article} 13 | 14 | \usepackage{amsmath} 15 | 16 | \usepackage{hyperref} 17 | \hypersetup{colorlinks=true} 18 | 19 | \title{Algorithm for Sampling from Discrete Distributions} 20 | \author{D\'avid P\'al} 21 | 22 | \begin{document} 23 | 24 | \maketitle 25 | 26 | \begin{abstract} 27 | We describe an algorithm for generating random samples from any discrete 28 | (categorical) distribution, given as an input, in $O(1)$ time per sample. If 29 | the distribution has $N$ categories, the algorithm needs $O(N)$ memory and 30 | $O(N)$ pre-processing time. 31 | \end{abstract} 32 | 33 | \section{Introduction} 34 | \label{section:introduction} 35 | 36 | Generating random numbers from various probability distributions is an 37 | important task in many scientific and industrial applications. In this short 38 | note, we describe an algorithm for generating random numbers from a discrete 39 | distribution, also called \emph{categorical 40 | distribution}~\cite{wikipedia:categorical_distribution}. 41 | 42 | A discrete distribution is specified by $N$ non-negative real numbers 43 | $p_1, p_2, \dots, p_N$ that satisfy 44 | $$ 45 | p_1 + p_2 + \dots + p_N = 1 \; . 
46 | $$ 47 | A sample from the distribution is a number from the set $\{1,2,\dots,N\}$. 48 | Number $i \in \{1,2,\dots,N\}$ is generated with probability $p_i$. The number 49 | $N$ is called the number of categories. 50 | 51 | In practical implementations, the algorithm is given as an input 52 | a list of arbitrary non-negative numbers $w_1, w_2, \dots, w_N$ 53 | with a positive sum and the goal is to generate samples from the discrete 54 | distribution specified by $p_1, p_2, \dots, p_N$ where 55 | $$ 56 | p_i = \frac{w_i}{\sum_{j=1}^N w_j} \; . 57 | $$ 58 | Transforming $w_1, w_2, \dots, w_N$ to $p_1, p_2, \dots, p_N$ is easily 59 | achieved in $O(N)$ time as part of the pre-processing phase. In the rest of 60 | the paper, we assume this has been done. 61 | 62 | The problem of generating random numbers from a distribution is to design an 63 | algorithm that receives numbers $p_1, p_2, \dots, p_N$ as an input, 64 | pre-processes them, and is able to generate any number of independent samples 65 | from the distribution. We make the standard assumption that the algorithm has 66 | access to a random number generator that generates independent random real 67 | numbers uniformly from the unit interval $[0,1]$ in $O(1)$ time per sample. 68 | 69 | The naive, but fairly common, algorithm requires $O(N)$ pre-processing time, $O(N)$ 70 | memory, and can generate a sample in $O(\log N)$ time.\footnote{All time and 71 | memory complexities are in the worst-case sense.} As of the year 2015, the naive 72 | algorithm is used in popular implementations of the C++ standard 73 | library~\cite{gcc-libstdc++,clang-libc++}. We recap the algorithm in 74 | Section~\ref{section:naive-algorithm}. 75 | 76 | In Section~\ref{section:fast-algorithm}, we describe a faster algorithm that 77 | uses $O(N)$ memory, has $O(N)$ pre-processing time, but requires only $O(1)$ 78 | time to generate a sample. The space and time complexities are clearly 79 | optimal. 
80 | 81 | \section{Naive Algorithm} 82 | \label{section:naive-algorithm} 83 | 84 | In the pre-processing phase, the naive algorithm computes prefix sums $s_0, 85 | s_1, s_2, \dots, s_N$ where $s_0 = 0$ and for $i=1,2,\dots,N$ 86 | $$ 87 | s_i = p_1 + p_2 + \dots + p_i \; . 88 | $$ 89 | The prefix sums can be computed in $O(N)$ time and use $O(N)$ memory. 90 | 91 | To generate a sample, the algorithm makes a single call to the random number 92 | generator. Let $X$ be the number it obtains. Using 93 | binary search, the algorithm finds an index $i \in \{1,2,\dots,N\}$ such that 94 | $$ 95 | s_{i-1} \le X \le s_i \; . 96 | $$ 97 | It is not hard to see that index $i$ is chosen with probability $s_i - s_{i-1} 98 | = p_i$. 99 | 100 | \section{Fast Algorithm} 101 | \label{section:fast-algorithm} 102 | 103 | The difference between the naive and the fast algorithm starts with the 104 | pre-processing phase. Given $p_1, p_2, \dots, p_N$, think of each $p_i$ as a 105 | line segment of length $p_i$ and color $i$. The algorithm cuts the line 106 | segments into $2N$ smaller line segments. During cutting, each point of each 107 | line segment keeps its original color. Let $q_1, q_2, \dots, q_{2N}$ be the 108 | lengths of the resulting line segments. Their lengths will satisfy 109 | \begin{equation} 110 | \label{equation:two-segments} 111 | q_{2i - 1} + q_{2i} = \frac{1}{N} 112 | \end{equation} 113 | for $i=1,2,\dots,N$. In other words, if we connect segments $q_{2i}$ and 114 | $q_{2i-1}$, we get a segment of length exactly $1/N$. 115 | 116 | It is a non-trivial fact that any $p_1, p_2, \dots, p_N$ can be cut into $q_1, 117 | q_2, \dots, q_{2N}$ satisfying equation \eqref{equation:two-segments}. 118 | Furthermore, the cutting can be done in $O(N)$ time and $O(N)$ memory. We show 119 | how to do the cutting in Section~\ref{subsection:cutting} below. 
120 | 121 | Once the cutting is done, the algorithm can generate a sample in $O(1)$ time: 122 | First, it makes a single call to the random number generator. Let $X$ 123 | be the number it obtains. The algorithm computes an index $i \in \{1,2,\dots,N\}$ 124 | such that 125 | $$ 126 | \frac{i-1}{N} \le X \le \frac{i}{N} \; . 127 | $$ 128 | This can be done in constant time using the formula $i = \lceil N \cdot X \rceil$ 129 | if $X$ is positive, and if $X = 0$, we set $i=1$. Note that index $i$ is 130 | uniformly distributed over $\{1,2,\dots,N\}$. Number $X$ lies in the interval 131 | $[\frac{i-1}{N}, \frac{i}{N}]$ of length $1/N$. We divide this interval into 132 | two disjoint sub-intervals: 133 | $$ 134 | \left[\frac{i-1}{N}, \frac{i}{N} \right] 135 | = 136 | \left[\frac{i-1}{N}, \frac{i-1}{N} + q_{2i-1} \right) 137 | \cup 138 | \left[\frac{i-1}{N} + q_{2i-1}, \frac{i}{N} \right] \; . 139 | $$ 140 | The length of the first subinterval is $q_{2i-1}$ and the length of the second 141 | subinterval is $1/N - q_{2i - 1} = q_{2i}$. If $X$ lies in the first 142 | subinterval, the algorithm outputs the color of line segment $q_{2i-1}$. Otherwise, 143 | $X$ lies in the second subinterval and the algorithm outputs the color of 144 | line segment $q_{2i}$. 145 | 146 | \subsection{How to Cut the Line Segments?} 147 | \label{subsection:cutting} 148 | 149 | We split the line segments $p_1, p_2, \dots, p_N$ into two types: short and 150 | long. Short ones are those that have length smaller than $1/N$. Long ones have 151 | length $1/N$ or bigger. Notice that there must be at least one long line 152 | segment, since if all $N$ line segments were short, their total length 153 | would be less than $1$. 154 | 155 | 156 | The algorithm then consists of $N$ rounds. In each round $i=1,2,\dots,N$, it 157 | constructs $q_{2i-1}$ and $q_{2i}$ satisfying \eqref{equation:two-segments}. 158 | The construction of the pair $q_{2i-1}, q_{2i}$ will be done in $O(1)$ time. 
159 | 160 | In any round $i$, the algorithm takes an arbitrary short line segment and an 161 | arbitrary long line segment. Let $s$ be the length of the short line segment 162 | and let $\ell$ be the length of the long line segment. We remove $s$ and $\ell$ 163 | from their respective piles. The line segment $q_{2i-1}$ will be the short 164 | segment, i.e., $q_{2i-1} = s$ and the color of $q_{2i-1}$ will be the color of 165 | $s$. The line segment $q_{2i}$ will be cut from the long line segment $\ell$, 166 | i.e., $q_{2i} = \frac{1}{N} - s$ and $q_{2i}$ will have the color of $\ell$. 167 | Note that since $\ell \ge \frac{1}{N}$ there will be a left-over line segment 168 | (possibly of size zero) from $\ell$. The length of the left-over line segment is 169 | $\ell' = \ell - q_{2i}$. The left-over length $\ell'$ could be long (i.e. $1/N$ 170 | or longer) or short (less than $1/N$). Based on its length, we insert the 171 | left-over line segment into the corresponding pile. 172 | 173 | First, notice that by construction $q_{2i-1}$ and $q_{2i}$ satisfy 174 | \eqref{equation:two-segments}. Second, notice that in each round we decrease 175 | the number of line segments in the two piles by one: We remove $2$ line 176 | segments and we insert one left-over line segment back. Thus, the algorithm in 177 | exactly $N$ rounds removes all line segments from the two piles. Finally, 178 | notice that at the beginning of round $i$ there are $N-i+1$ line segments in 179 | the two piles and their total length is $1 - \frac{i-1}{N}$, since in each of the 180 | previous $i-1$ rounds we have decreased the total length by $1/N$. Therefore, 181 | at the beginning of round $i$, at least one line segment in the two piles 182 | will have length at least 183 | $$ 184 | \frac{1 - \frac{i-1}{N}}{N - i + 1} 185 | = \frac{\frac{N - i + 1}{N}}{N - i + 1} 186 | = \frac{1}{N} \; . 187 | $$ 188 | In other words, there is at least one long line segment at the beginning of 189 | every round. 
190 | 191 | It could happen that at the beginning of round $i$ there is no short line 192 | segment. That means all (long) line segments have length $1/N$. This follows 193 | from the fact that there are $N-i+1$ long line segments and their total length is 194 | $\frac{N-i+1}{N}$. In that case, we can simply create $N-i+1$ pairs of line 195 | segments $(q_{2i-1}, q_{2i}), (q_{2i+1}, q_{2i+2}), \dots, (q_{2N-1}, q_{2N})$. 196 | The first line segment in each pair is constructed from a long line segment and 197 | the second line segment in each pair has zero length and an arbitrary color. 198 | 199 | The two piles can be implemented using two stacks stored in the same array of 200 | size $N$. The stacks have their bottoms at the opposite ends of the array and 201 | grow inward. 202 | 203 | \bibliographystyle{plain} 204 | \bibliography{biblio} 205 | 206 | \end{document} 207 | -------------------------------------------------------------------------------- /doc/biblio.bib: -------------------------------------------------------------------------------- 1 | @Misc{clang-libc++, 2 | Note = {URL: \url{http://libcxx.llvm.org/}}, 3 | Organization = {University of Illinois at Urbana-Champaign}, 4 | Title = {"libc++" {C++} Standard Library}, 5 | Year = {2015}, 6 | } 7 | 8 | @Misc{gcc-libstdc++, 9 | Note = {URL: \url{https://gcc.gnu.org/libstdc++/}}, 10 | Organization = {Free Software Foundation}, 11 | Title = {The {GNU} Standard {C++} Library v3}, 12 | Year = {2015}, 13 | } 14 | 15 | @Misc{wikipedia:categorical_distribution, 16 | Author = {Wikipedia}, 17 | Month = {December}, 18 | Note = {URL: \url{https://en.wikipedia.org/wiki/Categorical_distribution}; accessed December 9, 2015}, 19 | Title = {Categorical distribution}, 20 | Year = {2015}, 21 | } 22 | 23 | --------------------------------------------------------------------------------