├── 01-skiplist.ipynb ├── 02-coinflips.ipynb ├── 03-hyper-log-log-counter.ipynb ├── 04-bloom-filters.ipynb └── README.rst /01-skiplist.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "metadata": { 3 | "name": "01-skiplist" 4 | }, 5 | "nbformat": 3, 6 | "nbformat_minor": 0, 7 | "worksheets": [ 8 | { 9 | "cells": [ 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "See http://ivory.idyll.org/blog/2013-pycon-awesome-big-data-algorithms-talk.html" 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "metadata": {}, 20 | "source": [ 21 | "# SkipList\n", 22 | "\n", 23 | "An implementation and exploration of skiplists.\n", 24 | "\n", 25 | "Code taken/refactored from John Shipman's excellent SkipList implementation:\n", 26 | "http://infohost.nmt.edu/tcc/help/lang/python/examples/pyskip/pyskip.pdf" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "collapsed": false, 32 | "input": [ 33 | "import random\n", 34 | "def random_level(max_level):\n", 35 | " num = random.randint(1, 2**max_level - 1)\n", 36 | " lognum = math.log(num, 2)\n", 37 | " level = int(floor(lognum))\n", 38 | " return max_level - level\n", 39 | "\n", 40 | "print random_level(8)\n", 41 | "print random_level(8)\n", 42 | "print random_level(8)\n", 43 | "print random_level(8)\n", 44 | "print random_level(8)\n", 45 | "print random_level(8)" 46 | ], 47 | "language": "python", 48 | "metadata": {}, 49 | "outputs": [ 50 | { 51 | "output_type": "stream", 52 | "stream": "stdout", 53 | "text": [ 54 | "3\n", 55 | "1\n", 56 | "1\n", 57 | "4\n", 58 | "3\n", 59 | "3\n" 60 | ] 61 | } 62 | ], 63 | "prompt_number": 36 64 | }, 65 | { 66 | "cell_type": "code", 67 | "collapsed": false, 68 | "input": [ 69 | "class Node(object):\n", 70 | " def __init__(self, value, level=0):\n", 71 | " self.value = value\n", 72 | " self.next = [None] * level\n", 73 | " \n", 74 | " def __str__(self):\n", 75 | " return \"Node(%s,%s)\" % (self.value, len(self.next))\n", 76 | " __repr__ = __str__\n", 77 | " \n", 78 | "class SkipList(object):\n", 79 | " def __init__(self, max_level=8):\n", 80 | " self.max_level = max_level\n", 81 | " n = Node(None, max_level)\n", 82 | " self.head = n\n", 83 | " self.verbose = False\n", 84 | " \n", 85 | " def update_list(self, value):\n", 86 | " update = [None] * (self.max_level)\n", 87 | " n = self.head\n", 88 | " \n", 89 | " self._n_traverse = 0\n", 90 | " \n", 91 | " level = self.max_level - 1\n", 92 | " while level >= 0:\n", 93 | " if self.verbose and \\\n", 94 | " n.next[level] != None and n.next[level].value >= value:\n", 95 | " print 'DROP down from level', level + 1\n", 96 | " while n.next[level] != None and n.next[level].value < value:\n", 97 | " self._n_traverse += 1\n", 98 | " if self.verbose:\n", 99 | " print 'AT level', level, 'value', n.next[level].value\n", 100 | " n = n.next[level]\n", 101 | " update[level] = n\n", 102 | " level -= 1\n", 103 | "\n", 104 | " return update\n", 105 | " \n", 106 | " def find(self, value, update=None):\n", 107 | " if update is None:\n", 108 | " update = self.update_list(value)\n", 109 | " \n", 110 | " if len(update) > 0:\n", 111 | " candidate = update[0].next[0]\n", 112 | " if candidate != None and candidate.value == value:\n", 113 | " return candidate\n", 114 | " return None\n", 115 | " \n", 116 | " def insert_node(self, value, level=None):\n", 117 | " if level is None:\n", 118 | " level = random_level(self.max_level)\n", 119 | " \n", 120 | " node = Node(value, level)\n", 121 | " \n", 122 | " update = self.update_list(value)\n", 123 | " if self.find(value, update) == None:\n", 124 | " for i in range(level):\n", 125 | " node.next[i] = update[i].next[i]\n", 126 | " update[i].next[i] = node\n", 127 | " \n", 128 | "def print_level(sl, level):\n", 129 | " print 'level %d:' % level,\n", 130 | " node = sl.head.next[level]\n", 131 | " while node:\n", 132 | " print node.value, '=>',\n", 133 | " node = node.next[level]\n", 134 | " print 'END'\n", 135 | " " 136 | ], 137 | "language": "python", 138 | "metadata": {}, 139 | "outputs": [], 140 | "prompt_number": 37 141 | }, 142 | { 143 | "cell_type": "code", 144 | "collapsed": false, 145 | "input": [ 146 | "# create and load a skiplist with max level of 4\n", 147 | "x = SkipList(4)\n", 148 | "for i in range(0, 20, 2):\n", 149 | " x.insert_node(i)" 150 | ], 151 | "language": "python", 152 | "metadata": {}, 153 | "outputs": [], 154 | "prompt_number": 46 155 | }, 156 | { 157 | "cell_type": "code", 158 | "collapsed": false, 159 | "input": [ 160 | "# print out the data structure\n", 161 | "print_level(x, 0)\n", 162 | "print_level(x, 1)\n", 163 | "print_level(x, 2)" 164 | ], 165 | "language": "python", 166 | "metadata": {}, 167 | "outputs": [ 168 | { 169 | "output_type": "stream", 170 | "stream": "stdout", 171 | "text": [ 172 | "level 0: 0 => 2 => 4 => 6 => 8 => 10 => 12 => 14 => 16 => 18 => END\n", 173 | "level 1: 2 => 6 => 10 => 12 => 18 => END\n", 174 | "level 2: 10 => 12 => END\n" 175 | ] 176 | } 177 | ], 178 | "prompt_number": 47 179 | }, 180 | { 181 | "cell_type": "code", 182 | "collapsed": false, 183 | "input": [ 184 | "# verbalize the insertion process for '11'\n", 185 | "x.verbose = True\n", 186 | "print 'INSERT', 11\n", 187 | "x.insert_node(11)\n", 188 | "print 'DONE;', x._n_traverse, 'traversals'" 189 | ], 190 | "language": "python", 191 | "metadata": {}, 192 | "outputs": [ 193 | { 194 | "output_type": "stream", 195 | "stream": "stdout", 196 | "text": [ 197 | "INSERT 11\n", 198 | "AT level 2 value 10\n", 199 | "DROP down from level 2\n", 200 | "DROP down from level 1\n", 201 | "DONE; 1 traversals\n" 202 | ] 203 | } 204 | ], 205 | "prompt_number": 48 206 | }, 207 | { 208 | "cell_type": "code", 209 | "collapsed": false, 210 | "input": [ 211 | "# print out the updated data structure\n", 212 | "print_level(x, 0)\n", 213 | "print_level(x, 1)\n", 214 | "print_level(x, 2)" 215 | ], 216 | "language": "python", 217 | "metadata": {}, 218 | "outputs": [ 219 | { 220 | "output_type": "stream", 221 | "stream": "stdout", 222 | "text": [ 223 | "level 0: 0 => 2 => 4 => 6 => 8 => 10 => 11 => 12 => 14 => 16 => 18 => END\n", 224 | "level 1: 2 => 6 => 10 => 12 => 18 => END\n", 225 | "level 2: 10 => 12 => END\n" 226 | ] 227 | } 228 | ], 229 | "prompt_number": 49 230 | }, 231 | { 232 | "cell_type": "code", 233 | "collapsed": false, 234 | "input": [ 235 | "# do a random simulation to evaluate how many traversals need to be done to reach\n", 236 | "# the last element in the list\n", 237 | "def skiplist_traverse_mc(max_level, max_count, n=100):\n", 238 | " z = []\n", 239 | " for _ in range(n):\n", 240 | " x = SkipList(max_level)\n", 241 | " for i in reversed(range(max_count)):\n", 242 | " x.insert_node(i)\n", 243 | " \n", 244 | " x.find(254)\n", 245 | " z.append(x._n_traverse)\n", 246 | "\n", 247 | " return z\n" 248 | ], 249 | "language": "python", 250 | "metadata": {}, 251 | "outputs": [], 252 | "prompt_number": 53 253 | }, 254 | { 255 | "cell_type": "code", 256 | "collapsed": false, 257 | "input": [ 258 | "avgs = []\n", 259 | "for i in range(1, 10):\n", 260 | " z = skiplist_traverse_mc(i, 200)\n", 261 | " avgs.append((i, average(z)))\n", 262 | "\n", 263 | "avgs = numpy.array(avgs)" 264 | ], 265 | "language": "python", 266 | "metadata": {}, 267 | "outputs": [], 268 | "prompt_number": 51 269 | }, 270 | { 271 | "cell_type": "code", 272 | "collapsed": false, 273 | "input": [ 274 | "# graph average to traverse to last of 200 elements\n", 275 | "plot(avgs[:,0], avgs[:,1])\n", 276 | "axis(ymin=0, xmin=0)\n", 277 | "xlabel('skiplist max level')\n", 278 | "ylabel('time to traverse to last of 200 elements')" 279 | ], 280 | "language": "python", 281 | "metadata": {}, 282 | "outputs": [ 283 | { 284 | "output_type": "pyout", 285 | "prompt_number": 52, 286 | "text": [ 287 | "" 288 | ] 289 | }, 290 | { 291 | "output_type": "display_data", 292 | "png": "iVBORw0KGgoAAAANSUhEUgAAAYIAAAEMCAYAAADJQLEhAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAIABJREFUeJzt3Xlc1PW+x/HXoIGmuOSCtiByVHB3MMAVkczUm0umD9Ms\nTW8lmktZN+taaZ1j2Y7e41Jpthy1POXJNvVQEq6gSdohlwxcc0FNBRUU+N0/5jBHFBiQYX4zzPv5\neMxD5zc/hjen43z47hbDMAxERMRr+ZgdQEREzKVCICLi5VQIRES8nAqBiIiXUyEQEfFyKgQiIl7O\n6YXg0KFD9OzZk9atWxMdHc3SpUsByMzMZODAgQQGBjJo0CCysrLsXzNnzhyaN29Oq1at2LBhg7Mj\niYhICSzOXkdw7Ngxjh07RocOHTh58iQRERHs2LGD+fPnc+jQIV5//XWmTp1KUFAQTz75JCdOnCAq\nKoq1a9eSnp7O448/zvbt250ZSURESuD0FkGjRo3o0KEDAPXr16d169Zs3bqV5ORkxo4di5+fH2PG\njCEpKQmApKQk+vTpQ2BgID169MAwDDIzM50dS0REilGhYwT79u0jNTWViIgItm7dSmhoKAChoaEk\nJycDtkLQsmVL+9eEhITYXxMRkYpXtaLeODMzk2HDhvHWW29Rs2ZNytIDZbFYSnVNREQcc/T5WyEt\ngsuXL3PvvffywAMPMHDgQADCw8PZtWsXALt27SI8PByAyMhIfvnlF/vX7t692/7a1QzDcJvHhQsG\nN9zwAufOmZ/lyscLL7xgegZPyOSuuZRJmZz9KA2nFwLDMBg7dixt2rRhypQp9uuRkZEsXryYixcv\nsnjxYjp16gRAREQEa9as4eDBgyQkJODj44O/v7+zYzld9epwyy2gSU4i4umcXgg2btzIxx9/zPff\nf4/VasVqtbJ69WpiY2M5ePAgISEhHDlyhHHjxgEQEBBAbGwsMTExjB8/nri4OGdHqjBBQfD992an\nEBEpH6ePEXTr1o38/PwiX/viiy+KvD558mQmT57s7CgVbvDgaD74wOwUhUVHR5sd4RrumAncM5cy\nlY4yOZfT1xFUFIvFUur+Lle5dAnq14cDB6BuXbPTiIhcqzSfndpiohx8faFzZ/jhB7OTiIhcPxWC\ncoqJgXXrzE4hInL9VAjKKSZGA8Yi4tk0RlBOubnQoAHs3g0BAWanEREpTGMELlC1KnTvDgkJZicR\nEbk+KgROoHECEfFkKgROoHECEfFkDgvBvn37yM7OBuCnn35i6dKl5ObmVngwT9KmDfzxBxw+bHYS\nEZGyc1gI7r33XqpWrcqJEycYOnQoiYmJjBkzxhXZPIaPD0RHq3tIRDyTw0JgsVioWrUq77//Po8+\n+igLFiyw7yIq/6HuIRHxVA4LQePGjVm0aBEff/wxI0eOBODixYsVHszTFBQCN5zhKiJSIoeF4J13\n3uHQoUO88sorNGrUiPT0dB544AFXZPMoLVrA5cuQlmZ2EhGRsnG4++jnn3/OjBkz7M+bNm2Kn59f\nRWbySBbLf6aR/ulPZqcRESk9hy2CJUuWXHPtA3fbe9lNaJxARDxRsS2CZcuWsXTpUtLT0+nfv7/9\nekZGBq1bt3ZJOE/Tsyc8+6xtnEBHLIuIpyi2EHTp0oXGjRuTkZHBk08+ad+rokmTJjRt2tRlAT1J\n06a2Iyx374aWLc1OIyJSOtp0zsnGjoWwMJgwwewkIiJO2nQuPj6emJgY6tSpg7+/P/7+/tSqVctp\nISsbjROIiKdx2CK4/fbbiYuLo3Pnzvj4mLc1kae0CH7/Hdq2hYwM24pjEREzOaVF4OvrS8eOHU0t\nAp7k5ptt5xPs3Gl2EhGR0nG4jqB79+4MGjSIoUOHUqdOHcBWYQYPHlzh4TxVQfdQhw5mJxERccxh\n19Do0aNtN141H/L999+vsFBF8ZSuIYAVK+CDD+Crr8xOIiLerjSfnZo1VAEyMqB5czh50naCmYiI\nWZwyRpCenk5sbCxWqxWAnTt38uc//9k5CSupBg2gSRP48Uezk4iIOOawEMyYMaPQyuK2bduybNmy\nCg1VGWgaqYh4CoeFYO/evfTr18/+PD8/H19f3woNVRn07KlCICKewWEh6NatGz/+u48jJyeHuXPn\nctddd1V4ME8XFQVbtkBOjtlJRERK5rAQTJkyhXnz5nHs2DGCg4NJTU1l0qRJrsjm0erUse03lJRk\ndhIRkZKVetZQbm6uqd1CnjRrqMDTT9s2obviOAcREZdyyvTRzMxM1q5dy+bNm8n5dz+HxWJhzpw5\nzktaCp5YCNasgVmz4IcfzE4iIt6qNJ+dDme5P/LII1SvXp3OnTvj6+uLYRjXLC6TonXrZptCeuEC\n3Hij2WlERIrmsBCkpqayUxvnXJcaNcBqhY0b4c47zU4jIlI0h4PFsbGx/OUvfyEtLY3Tp0/bH1I6\nPXvazjEWEXFXDlsEN954I1OnTmXevHn2gWKLxUJaWlqFh6sMYmJg2jSzU4iIFM/hYHFwcDDr1q2j\nSZMmrspUJE8cLAbIzrZtOXH4MNSubXYaEfE2TtlrqFmzZlSvXt1pobxNtWoQEQHr15udRESkaA67\nhurWrUv79u3p1atXofMIXD191JPFxNjGCe6+2+wkIiLXclgI+vbtS9++fYH/NDE0fbRsYmJg/Hiz\nU4iIFK3UK4vT0tIIDg6u6DzF8tQxAoDLl6FePUhPt/0pIuIqThkjSEhIIDIykpiYGABSUlIYMGCA\ncxJ6iRtusC0u0wpjEXFHDgvBa6+9xqpVq6hbty4AVqtVU0evg84nEBF35bAQZGVlERAQYH+emZlJ\nrVq1KjRUZaRCICLuymEhGDhwIHPmzCE3N5fExETGjRvHsGHDXJGtUmnfHo4dg6NHzU4iIlJYqbaY\nqFWrFkFBQcyePZt+/foxbtw4V2SrVKpUgR49ICHB7CQiIoWVetaQ2Tx51lCBuXNh5054912zk4iI\ntyjXeQRXHlhf1BuvWrWqfOnKqDIUgn/9CwYOhN9+MzuJiHiLcp1HMHXq1BLfuDhjxozh66+/pmHD\nhvz8888AzJgxg/fee48GDRoAMGvWLPsitTlz5jB37lxuuOEG3nnnHbp161ZiYE/WujVkZcGBA2Dy\n1k0iInZOX1C2fv16atasyYMPPmgvBDNnzsTf358nnnii0L0nTpwgKiqKtWvXkp6ezuOPP8727duL\nDloJWgQA990HffrA6NFmJxERb2DKgrLu3bvb1xxcqaggSUlJ9OnTh8DAQHr06IFhGGRmZjqK5NE0\njVRE3I3DvYYKFpT16dMHuP4FZXPnzmXFihXcc889jB8/Hn9/f5KTk2nZsqX9npCQEJKTk7njjjuK\nfI8ZV5wCHx0dTXR0dJlzmK1nT3jxRTAM0JZNIuJsCQkJJJRxeqLDQuCMBWWxsbE8//zznDt3jqee\neoqFCxfy5JNPFtlKKGn84cpC4KmaNbMVgH37oHlzs9OISGVz9S/JM2fOdPg1LllQ1rBhQywWC7Vr\n12bChAmsXLkSgMjISH755Rf7fbt37yY8PLxM7+1pLBZ1D4mIe3FYCMaPH1/uBWVH/72cNjc3l6VL\nl9KvXz8AIiIiWLNmDQcPHiQhIQEfHx/8/f2v48fwLD17qhCIiPtw+oKy4cOH88MPP3Dy5EkCAgKY\nOXMmCQkJ/PTTT/j6+hIVFcX06dO56aabAIiLi2Pu3Ln4+vqycOFCunfvXnTQSjJrCGzTR8PD4fhx\njROISMUq14Iyd1OZCgHYxgr+8Q9o08bsJCJSmTll+qhUDI0TiIi7UCEwicYJRMRdlNg1tG3bNpKS\nktiyZQsAnTp1IjIykttvv91lAQtUtq6hY8egVSvIyLDtTCoiUhHK1TX0v//7v0yZMoWcnBzuv/9+\nRowYQXZ2NlOmTOGZZ55xelhv06gRNG4MP/1kdhIR8XbFtgiCg4PZunUr9a46bf3kyZOEh4eTnp7u\nkoAFKluLAOCxx2ybzz31lNlJRKSyKleLoE6dOsTHx19zPT4+vsi9hKTsYmJg3TqzU4iItyu2RbBr\n1y6mTp3Knj17aNiwIQDHjx8nNDSU119/nVatWrk2aCVsEZw6BU2b2v684Qaz04hIZeS0dQT79u0D\noFmzZs5Jdh0qYyEACAuD//s/6NLF7CQiUhk5ZR1BVlYWBw4c4ODBg5V+i2gzaBqpiJit2BbBhg0b\nmDRpEoZhEBISAtg2hfPx8SEuLq7YrSAqLGglbRF8/TW8+SZ8953ZSUSkMipX11CrVq2YP38+PXr0\nKHQ9ISGB8ePHF9o11BUqayE4dw5uucW2nqBaNbPTiEhlU66uocuXL9O0adNrrgcHB3Pp0qXypxMA\natWynWW8ebPZSUTEWxV7MM3EiRPp3bs3ffr0sZ8i9ssvv7BmzRomTpzosoDeoGAaac+eZicREW9U\n4qyhEydOkJSURHJyMoZhEBkZSURERKETy1ylsnYNAcTHw4wZsGGD2UlEpLLRNtQe4sIFaNjQtv9Q\nzZpmpxGRyqRcYwQXL15k0aJFPPjgg6xYsaLQa+PHj3dOQgHgxhuhY0fYuNHsJCLijYotBM888wxb\ntmxhwIABvPfeewwZMoTs7GwANmtk0+l0PoGImKXYQpCYmMjChQsZMmQIa9asoV27dtxxxx2cOnXK\nlfm8hgqBiJil2FlDFy9eLPT8+eefJzAwkKioKLKysio8mLeJiIDdu+HMGahTx+w0IuJNim0R3H33\n3Xx31XLX0aNH88Ybb+Dr61vhwbyNnx907gyJiWYnERFvo1lDbuSVV2wzh95+2+wkIlJZ6PB6D6Nx\nAhExg1oEbiQ3F+rXh19/hQYNzE4jIpVBuVoEBWsH0tLSnJtKilW1KnTvDgkJZicREW9SbCF45ZVX\nALj33ntdFkbUPSQirlfs9NHQ0FCio6NJT0+nf//+hV6zWCysWrWqwsN5o549YcECs1OIiDcpcYxg\n586dDB48mEWLFhXqY7JYLNecU1DRvGGMACA/37bv0I4dtnMKRETKwymbzmVkZNCgQQNyc3MBqFq1\n2EZEhfKWQgAwZAgMGgQjR5qdREQ8nVOmj547d45hw4YRHBxMcHAw9913nwaQK5jOMRYRV3JYCGbN\nmsWAAQNIS0sjLS2NgQMH8pe//MUV2bxWwUE1IiKu4LBrqEOHDmzfvh0fH1vNyMvLo2PHjvz0008u\nCVjAm7qGDANuvhk2bYIiTgsVESk1p3QN9e/fnylTprB9+3Z+/PFHpk6des0sInEui0XTSEXEdRy2\nCM6dO8eSJUv4+uuvAdtmdKNGjaJWrVouCVjAm1oEAO+9Z+se+tvfzE4iIp5MR1V6sLQ06NYNjhyx\ntRBERK6HNp3zYE2bgq8v7NljdhIRqexUCNyUxaJppCLiGg4LwYYNG665tlGnrLuEppGKiCs4HCOw\nWq2kpKQ4vFbRvG2MAODwYejQAU6cAB+13UTkOpTms7PY/SI2b97Mpk2byMjI4M0337S/UUZGBvXq\n1XNuUinSrbdCvXrw88/Qvr3ZaUSksir298xLly6RmZlJXl4emZmZZGVlkZWVRWhoKB9++KErM3o1\njROISEVz2DV04MABmjRpYn+enZ1NtWrVKjzY1byxawjg00/h449Bu36LyPVwyvTRZ555hnPnzpGX\nl0dkZCQtWrRg8eLFTgspJYuOhsRE2zGWIiIVwWEhSE1NpVatWqxcuZKOHTuyd+9eFi1a5Ipsgu1s\ngttug+3bzU4iIpWVw0Jw4403cuHCBT766CNGjhxJtWrVyMzMdEU2+TdNIxWRiuSwEEycOJGwsDD8\n/f3p0qUL+/fvp3bt2q7IJv+mDehEpCKVea8hwzDIy8tz+Ull3jpYDHDmDAQGwsmTtm0nRERKy2l7\nDe3Zs4c5c+Ywc+ZMXnrpJWbNmlXsvWPGjCEgIIC2bdvar2VmZjJw4EACAwMZNGgQWVlZ9tfmzJlD\n8+bNadWqVZGrmAXq1IEWLSA52ewkIlIZleqEsqeeeopXX32VM2fOsGTJEk6cOFHs/Q899BCrV68u\ndG3+/PkEBgby66+/cuutt7JgwQIATpw4wbx58/juu++YP38+kyZNKuePU3mpe0hEKorDQrBy5UpW\nrlxJ7dq1eeutt1i/fn2Jp5N1796dunXrFrqWnJzM2LFj8fPzY8yYMSQlJQGQlJREnz59CAwMpEeP\nHhiGoYHoYqgQiEhFcVgILBYLVapUITQ0lH/961/Url2b06dPl+mbbN26ldDQUABCQ0NJ/ncfR1JS\nEi1btrTfFxISYn9NCuvWDbZtg4sXzU4iIpWNwxHfu+++mz/++INx48YxZMgQMjMzmTZtWpm+SVkG\neS0lnMIyY8YM+9+jo6OJjo4uUw5PVrOmbb+hTZvgjjvMTiMi7iohIYGEhIQyfY3DQvD8888DcOed\nd7Jr1y5ycnLKvMVEeHg4u3btwmq1smvXLsLDwwGIjIwkPj7eft/u3bvtrxXlykLgjQq6h1QIRKQ4\nV/+SPHPmTIdfU2wh+Oyzz+y/nRuGcc1v6oMHDy51sMjISBYvXsyrr77K4sWL6dSpEwARERE89dRT\nHDx4kLS0NHx8fPD39y/1+3qbmBh49lmzU4hIZVNsIfjyyy9L7KYprhAMHz6cH374gVOnTnHbbbfx\n4osvEhsby8iRIwkJCSEsLIzZs2cDEBAQQGxsLDExMfj6+rJw4cJy/jiVW+fO8K9/QWYmqF6KiLPo\n8HoPExMDTz4J/fqZnUREPIEOr6+ENI1URJxNhcDD6KAaEXE2dQ15mEuXoH592L8fbrrJ7DQi4u7K\ndWZxgdzcXP75z3+y6t9HZA0cOJBevXq5fNM5sfH1ha5d4Ycf4J57zE4jIpWBw66huLg4Fi5cSExM\nDD179uSdd94hLi7OFdmkGBonEBFnctg1FB4eTmJiItWrVwfg4sWLREVFsXXrVpcELKCuof/Ytg1G\nj7ZNJRURKYlTZg0FBQWxc+dO+/Off/6ZoKCgcoeT62e1wpEjcPy42UlEpDJw2NE/bdo0HnnkES5f\nvgyAn5+ffRtpMUeVKtCjh+34yvvuMzuNiHg6h11D2dnZVKtWjd9//x3DMLjlllvs11xJXUOFxcVB\naiq8847ZSUTEnTmla6hLly4A3Hzzzdxyyy2Frol5dKC9iDhLsV1DR48e5ffff+fChQts377dvvHc\niRMn8PPzc2VGKULr1nD2LBw8aDvPWETkehVbCNauXcuSJUs4cuQIU6dOtV9v0qQJL730kkvCSfF8\nfGyrjNetg1GjzE4jIp7M4RjB3//+d4YMGeKqPMXSGMG1FiyALVtgyRKzk4iIuyrNZ6e2mPBge/dC\nr15w4ACUsGO4iHgx7T5ayTVvDvn58NtvZicREU+mQuDBLBbtRioi5eewEOTk5PDJJ58wYcIEAH79\n9Ve++uqrCg8mpaNppCJSXg7HCKZNm4ZhGHz11VekpqZy/vx5unTpwo4dO1yVEdAYQXEOHICICDh2\nTOMEInItp4wRrFu3jtmzZ+Pr6wtAjRo19IHsRpo0gRo14JdfzE4iIp7KYSEICQnh7Nmz9udbtmzB\narVWaCgpG3UPiUh5ONx0buLEidxzzz0cPnyYnj17cvz4cT766CNXZJNSiomBv/8dHnvM7CQi4olK\nvY7gxx9/JD8/n/Dw8IrOVCSNERTv6FHblhMZGbadSUVECjhljGDDhg1kZWXRsWNHjh8/zqxZszh9\n+rTTQkr5NW4MAQHg4vF7EakkHBaC2NhYatSoQXp6Os888ww+Pj48/PDDrsgmZaBxAhG5Xg4LQdWq\nVbFYLLz//vuMHz+eadOmsX//fhdEk7LQOcYicr0cDhYHBQXx3HPPsWLFCpKSksjLy+PSpUuuyCZl\n0KMHjBkDly/DDTeYnUZEPInDFsHf/vY3goODWbZsGbVr1+bIkSM89dRTrsgmZVC/PjRtCj/+aHYS\nEfE0Jc4ays3N5a677uK7775zZaYiadaQY088YSsIzz5rdhIRcRflnjVUMD6gMQHPcNdd8NFHcO6c\n2UlExJM4HCOoW7cuYWFhxMTE0LhxY8BWYebMmVPh4aRseveG6GgYOhS++kpjBSJSOg4XlC254vir\ngiaGxWJhlIvPR1TXUOnk5sKgQdCoEbz7rjaiE/F2Tj2hLC0tjeDgYKcEux4qBKWXlWWbRXTPPTB9\nutlpRMRMTllZnJCQQGRkJDExMQCkpKQwYMAA5ySUClGzpq1r6L334OOPzU4jIu7OYSF47bXXWLVq\nFXXr1gXAarWSlpZW4cGkfBo3hq+/hqlTteJYRErmsBBkZWUREBBgf56ZmUmtWrUqNJQ4R+vWsHw5\n3HcfpKaanUZE3JXDQjBw4EDmzJlDbm4uiYmJjBs3jmHDhrkimzhBz57w+uvwX/9l26VURORqDgeL\ns7OzWb58OZ999hn5+fmMGDGCIUOG4Ofn56qMgAaLy+vPf4aVK+GHH2xjCCLiHZwya2j79u2EhYU5\nNdj1UCEoH8OAhx+2nW38j39AVYcrSESkMnBKIYiOjubYsWMMHTqUYcOG0aZNG6eGLC0VgvK7fBnu\nvhuCg2HePK0xEPEGTps+um7dOurXr8+jjz5K27Zteemll5wWUlznhhtgxQrYtAlee83sNCLiLkq9\noAzg559/Zvbs2XzyySdcvny5InNdQy0C5zl8GLp0sRUDjfuLVG5OaRH88ssvzJgxgzZt2vDYY4/R\npUsXjhw54rSQ4nq33mpbcDZxIqxfb3YaETGbwxZB586dGTZsGEOHDuWWW25xVa5rqEXgfGvXwgMP\nQGIihISYnUZEKoJT9xoymwpBxVi8GP7yF9i8GRo2NDuNiDibUwrB/v37WbhwIWvWrOGPP/6wv7Gr\nt5lQIag4zz8Pa9bYtqK48Uaz04iIMzlljOCFF17AarWSm5vLypUr6devH4888ojTQor5Zs60dQ2N\nGAF5eWanERFXc9gisFqtpKSk0L59e7Zt2wbA7bffzo4dO1wSsIBaBBXr0iXo0wfatoW4OLPTiIiz\nOKVFUL16dfLy8ujRowezZs1i2bJl1LzOPQqCgoJo164dVquViIgIwLaJ3cCBAwkMDGTQoEFkZWVd\n13tL+fj6wuefQ3w8vP222WlExJUcFoK4uDguXLjA9OnTMQyD9evXM3/+/Ov6ZhaLhYSEBFJSUkhO\nTgZg/vz5BAYG8uuvv3LrrbeyYMGC63pvKb86deCbb2yb1H3+udlpRMRVSiwEeXl5fPrpp/j7+9Ow\nYUNmzJjBu+++S7t27a77G17dRElOTmbs2LH4+fkxZswYkpKSrvu9pfyaNIFVq+DRR20ziUSk8iux\nEFSpUoXExEQyMzOd8s0sFgsxMTEMGjSIVatWAbB161ZCQ0MBCA0NtbcUxDxhYfDBBzB4MOzbZ3Ya\nEaloDveg7Nq1K/3792fIkCE0btwYsH2gDx48uMzfbOPGjTRu3Jhdu3bRv39/IiIiyjQAPGPGDPvf\no6OjiY6OLnMGKZ1+/eCFF2x/btoE9eubnUhESiMhIYGEhIQyfY3DWUOjR4+23XjVVpXvv/9+mb7R\n1Z544glatmzJ6tWrmT59OlarlR9//JGXX36Zv//979cG1awhU0ybZtuGIj4eqlc3O42IlJVTFpRt\n2LCBbt26ObzmyIULF8jLy8Pf35+MjAyio6NZvXo1y5Yt49ChQ7z66qs8+eSTNG3alCeffPK6fhhx\nvvx8uP9+2/qC5cvBx+H0AhFxJ04pBGFhYWzfvt3hNUfS09O55557AKhXrx73338/Y8aMITMzk5Ej\nR5KSkkJYWBgff/xxkdNTVQjMk50Nd94JnTpp+2oRT1Oaz85ixwg2b97Mpk2bOHHiBG+++ab9jTIy\nMrjpppvKHKZp06b89NNP11z39/fniy++KPP7ietUq2Y71axLFwgKggkTzE4kIs5UbCG4dOkSmZmZ\n5OXlFZo1FBoayqRJk1wSTtxHvXrw7bfQrRsEBkL//mYnEhFnKdWmc0FBQS6KUzx1DbmH5GT4r/+y\nFYXbbzc7jYg4om2opUJ88QXExtqmlbrB7wgiUoJyjRGIFGfgQDhwAPr2tRWDunXNTiQi5aEWgVy3\nJ56A7dttZxn4+ZmdRkSK4pTdR48fP87TTz9Nq1ataNWqFdOmTePEiRNOCyme6/XXbYPIY8bY1huI\niGdyWAheeeUV6tSpY1+2XKdOHV5++WVXZBM35+MDH38MaWnw3HNmpxGR6+Wwa6h9+/aFDqHJz8/H\narXqYBqxy8iAzp3h6afh4YfNTiMiV3JK11B0dDSvvfYap06d4uTJk7z11lva7E0KadDAdo7Bc8/Z\nppWKiGdxWAiefvppjh49Srdu3ejevTu///4706ZNc0U28SAtWtgOs3nwQShiAbmIuDGHXUMbN26k\na9euDq9VNHUNeYYVK+Dxx23TSgMDzU4jIk5ZUFZweL2jaxVNhcBzvP667WCbDRugdm2z04h4N6ds\nOpeRkXHNpnP16tVzblKpVKZOhf374d57bWMHvr5mJxKRkhQ7RnD1pnNZWVlkZWURGhrKhx9+6MqM\n4mEsFoiLgxtvhEceATXkRNybNp2TCnP+PERH284ymDkTbrjB7EQi3scp00fdoQiIZ6pRA778ErZs\nsW1O9+KLcPSo2alE5Go6eFAqVKNG8P33tv2Ijh6FVq1g+HDbQLIaeCLuQZvOiUudOWObUfTXv9rG\nEB57DEaMsP1dRJzPKV1D6enpxMbGYrVaAdi5cyd//vOfnZNQvE6dOjB5MuzeDa++CqtW2dYbTJ0K\nv/1mdjoR7+SwEMyYMYP+V5xL2LZtW5YtW1ahoaTy8/GB3r1thWDrVqhaFTp1sp1+9s032s1UxJUc\nFoK9e/fSr18/+/P8/Hx8NTFcnKhpU5g9Gw4ehKFDbXsWtWgBb74Jf/xhdjqRys9hIejWrRs//vgj\nADk5OcydO5e77rqrwoOJ96leHUaPhm3bbNtbb98OwcG2HU21f5FIxXFYCKZMmcK8efM4duwYwcHB\npKamMmnSJFdkEy9lsdi6iT7+2DaWEBQE/ftDt26wfDlcumR2QpHKpdSzhnJzc03tFtKsIe+Wmwtf\nfGGbbbTuiC/2AAAPUklEQVR7t23F8iOPwM03m51MxL05ZdO5zMxM1q5dy+bNm8nJybG/8Zw5c5yX\ntBRUCKRAairMmwfLltlWLU+YAN2721oSIlKYUwrB8OHDqV69Op07d8bX1xfDMLBYLIwaNcqpYR1R\nIZCrnT0LH35oayX4+trWJNx/v21Fs4jYOKUQtGvXjp07dzo12PVQIZDiGAZ89x383//ZViw/8ACM\nHw/Nm5udTMR8TikE8+fP5/Tp0wwfPpw6derYr990003OSVlKKgRSGgcOwIIFsGgRdOxo6zbq2xeq\nVDE7mYg5nFIIPvjgA2JjY6lbt659oNhisZCWlua8pKWgQiBlkZ0Nn3xiayWcOmVrIYwZAy7+/UXE\ndE4pBMHBwaxbt44mTZo4NVxZqRDI9UpOthWEL7+EwYNtrYSwMLNTibhGuU4oK9CsWTOqV6/utFAi\nrhYRYRtUPnHC1mU0aJBt2mm3bhAaanu0bAk6eE+8lcMWwbBhw0hMTKRXr172MQJNHxVPlpsL8fGw\nY4dtTcLu3bBrl+3gnILCcGWBaNJEYwziuZzSNbRkyZIi31jTR6UyMQw4fvw/heHKApGRAc2aXVsg\nWrTQVFVxf04pBO5ChUDMcv487N17bYH49Vdo2PDaAhEaCgEBWuAm7qFchWDo0KGsWLGCtm3bFvnG\nrl5boEIg7iYvzzZd9eoCsXs3XL5cdIEIDtbZzeJa5SoEv//+OzfffDMHDhy45k0sFovLZxGpEIgn\nOXkS9uy5tkAcPmzbdvvqIhESArVrm51aKiOndA09/fTTzJ492+G1iqZCIJVBdjbs23dtgdizB2rV\nsp3W1qhR4Ufjxv/5e0AAVKtm9k8hnsQphcBqtZKSklLoWlhYGNu3by9/wjJQIZDKLD8fjhyxtRiO\nHbv2cfSo7c/jx23nO19dLIp61K+v2U5SznUE8+fPZ968efz222+FxgnOnTvHsGHDnJdSRPDxgdtu\nsz1KYhi2U9uKKhapqYWf//GHrRgU17q48uHvr8Ftb1Zsi+Ds2bP88ccfTJs2jdmzZ9srSkBAgCkL\nzNQiECmby5dtU1+LKhpXP3Jzi29Z3HSTbZpsjRpQs+Z//l7w0Mm17k3TR0WkVM6fL75L6o8/bK9f\n+cjK+s/fLZZri0NRBaO46yVdq+pw74OKYRi2WWFleeTn21p2Vapc+2dprlVUi0yFQEQq3KVL1xaH\nogrG9VyrWrXoguHnV/YP6pI+wK++ZhiFP7BL8/Dxsb1Xwftd+b4lXSv4fhZL2YtHaa5t2aJCICIe\nyjAgJ6fogpGTU7YP6bJ8mFf0b+jF/axXFhFHxaMsRaZrVxUCERGvVprPTh8XZRERETelQiAi4uVU\nCEREvJxbFILExERatmxJ8+bNmTt3rtlxSi0hIcHsCNdQptJzx1zKVDrK5FxuUQgmT57MwoULiY+P\n569//SsnT540O1KpuON/eGUqPXfMpUylo0zOZXohOHv2LABRUVE0adKE3r17k5SUZHIqERHvYXoh\n2Lp1K6GhofbnrVq1YsuWLSYmEhHxLqavI4iPj2fRokUsW7YMgAULFnDkyBFeeumlQvdZtCOWiMh1\nue7dR10lPDycp556yv48NTWVPn36XHOfFpOJiFQM07uGav/7WKbExET279/PP//5TyIjI01OJSLi\nPUxvEQC8/fbbPProo1y+fJlJkyZRv359syOJiHgN01sEAD169GDXrl3s27ePSZMmFXrN3dYYjBkz\nhoCAgEKH9Zjt0KFD9OzZk9atWxMdHc3SpUvNjgRAdnY2kZGRdOjQgU6dOvHWW2+ZHQmAvLw8rFYr\n/fv3NzuKXVBQEO3atcNqtRIREWF2HADOnz/PqFGjaNGihVtM4tizZw9Wq9X+qF27NnPmzDE1E8C7\n775Lly5d6NixI1OmTDE7jt3SpUvp0aMHrVu35r333iv5ZsPNdejQwfjhhx+M/fv3GyEhIUZGRoap\neRITE43t27cbbdq0MTXHlY4ePWqkpKQYhmEYGRkZRtOmTY1z586ZnMrm/PnzhmEYRnZ2ttG6dWvj\n119/NTmRYbzxxhvGiBEjjP79+5sdxS4oKMg4deqU2TEKmTp1qjF9+nTj4sWLxuXLl40zZ86YHcku\nLy/PaNSokXHw4EFTc5w6dcoICgoysrKyjLy8PKNv377G6tWrTc1kGIZx5swZo0WLFsbp06eNzMxM\nIzw8vMT/fm7RIiiOO64x6N69O3Xr1jU1w9UaNWpEhw4dAKhfvz6tW7dm27ZtJqeyufHGGwHIysoi\nNzcXPz8/U/McPnyYb775hv/+7/92uwkI7pYnPj6eZ599lmrVqlG1alX7eJ47iI+P509/+hO3OTrb\ns4JVr14dwzA4e/YsFy9e5MKFC27x+bBp0ybCwsKoW7cuNWvWpGfPnmzevLnY+926EGiNQdnt27eP\n1NRUt+leyM/Pp3379gQEBPDYY4+Z/g/38ccf57XXXsPHx73+r2+xWIiJiWHQoEGsWrXK7DgcPnyY\n7OxsYmNjiYyMZPbs2WRnZ5sdy2758uWMGDHC7BhUr16d+fPnExQURKNGjejatatb/NuLiooiOTmZ\n9PR0jh49yjfffMOmTZuKvd+9/jVIuWRmZjJs2DDeeustatSoYXYcAHx8fNixYwf79u1j3rx5pKSk\nmJblq6++omHDhlitVrf77Xvjxo3s2LGDl19+mSeeeIJjx46Zmic7O5u9e/dy7733kpCQQGpqKp9+\n+qmpmQpcunSJL7/8kqFDh5odhYyMDGJjY/nll1/Yv38/mzdv5uuvvzY7FjVq1ODtt99mwoQJDBky\nhLZt21KtWrVi73frQhAeHs7u3bvtz1NTU+nUqZOJidzX5cuXuffee3nggQcYOHCg2XGuERQURL9+\n/Uzt2tu0aROrVq2iadOmDB8+nO+//54HH3zQtDxXaty4MQAtW7ZkwIABfPnll6bmadasGSEhIfTv\n35/q1aszfPhwvv32W1MzFfj222/p2LEjDRo0MDsKycnJdOrUiWbNmlGvXj2GDh1KYmKi2bEA6N+/\nP9988w0bN24kPz+/yPVZBdy6EGiNQekYhsHYsWNp06aNW81aOHnyJGfOnAHg1KlTrF271tQiNWvW\nLA4dOkR6ejrLly8nJiaGDz/80LQ8BS5cuEBmZiZg+w1zzZo1Jf6jdZXmzZuTlJREfn4+X3/9Nb16\n9TI7EgDLli1j+PDhZscAbGOG27Zt4/Tp0+Tk5PDtt9/Su3dvs2MBcOLECcA2nvLzzz8TFhZW/M2u\nGcO+fgkJCUZoaKjxpz/9yYiLizM7jnHfffcZjRs3Nnx9fY1bb73VWLx4sdmRjPXr1xsWi8Vo3769\n0aFDB6NDhw7Gt99+a3YsY+fOnYbVajXatWtn9O7d2/jggw/MjmSXkJDgNrOG0tLSjPbt2xvt27c3\nYmJijEWLFpkdyTAMw9izZ48RGRlptG/f3pg6daqRlZVldiQjKyvLqFevntvMijMMw3j//feNqKgo\n4/bbbzemT59u5OXlmR3JMAzD6N69uxESEmLcfvvtRlJSUon3mr7XkIiImMutu4ZERKTiqRCIiHg5\nFQIRES+nQiAi4uVUCMSjBAUFcfr06Wuud+3atUxf6+j+WbNmXV9AJ9i/f7/TNzWsiPeUykOFQDxK\ncSfVbdy4sUxf6+j+l19+uWzBRDyYCoG4JcMweOihhwgLC6Nt27asWLGi0OsXL16kb9++LFq0CICa\nNWsCkJCQYN+zp02bNsTFxRX5/gX3nz9/nnvuuQer1Urbtm3ZsGED06ZN4+LFi1itVh544IEiv/a5\n554jJCSEIUOGsHv3bnr27ElYWJh9Ven+/fuJiooiLCyMIUOGsGPHDgBWrlxpX5h19OhRQkJC7At/\nivvf4d133+XOO++kV69efP755wAMHz6cb775xn7f6NGj+fzzz4u9X6REFb+sQaTsvv/+e2PkyJH2\n52fPnjUMw7Zd8/79+41evXoZH330kf31mjVrGoZhGOvWrTN8fHyMbdu2GWfPnjU6d+5sbNu2zf61\nBVs9F9y/ePFiY/r06YZhGEZ+fr6RmZlZ6PWiWCwWY8mSJUZ+fr5xxx13GF27djXOnTtnJCQkGHff\nfbdhGIZx4cIFIzs72zAMw9iyZYsxfPhw+9ePHDnSmDt3rnH33Xcby5cvv+b909PT7ducr1u3znji\niSeM/Px8Iysry7BarUZOTo6xcuVKY9SoUYZhGEZOTo5x2223GdnZ2cXef+V7ilzNLU4oE7lay5Yt\nSU5OZurUqYwePdrev20YBgMHDuTpp58udpuB1q1b07FjRwAGDx7M6tWr7c+v1qFDB2bPno3FYuGh\nhx6iadOmDrNVrVqV++67D4vFQmRkJFWqVMHf35/OnTsX2ur3+eef57vvviMvL49Dhw7Zr8+dO5fW\nrVvTpUsXhg0bVuL3+uyzz1i7di3ff/89AOfOnWPLli307duXyZMnc+nSJb799lt69OiBn59fsfcH\nBgY6/LnEe6lrSNxSo0aN2LFjB+3bt+fhhx9m3rx5gK2fv1u3bmXaAK24cQUAq9VKUlISjRs3ZsCA\nAXz11VcO38/Pz89+roKvr699TyxfX19ycnIA+OSTTzh58iQbNmwgPj7evucS2E6Uq1KlCsePH3e4\nC2p+fj7PPvssKSkppKSk8NtvvxEVFYWfnx/R0dGsWbOGTz/91F5QirtfpCQqBOKWjh49CsCDDz7I\n5MmT+emnn+yvvfjii9StW5cJEyYU+bWpqamkpKRw7tw5/vGPf5S4gdvBgwepWbMmsbGx3H///ezc\nuROABg0acOHChevOf+TIEZo0aYKfnx/vvvsu+fn5AOTm5jJ27FiWL19OaGgob775ZonvM2LECD78\n8EMyMjIA2Lt3rz3XsGHDWLx4MevXr7f/jCXdL1IcFQJxSz///DORkZGEhYXxt7/9jf/5n/8p9Hpc\nXBwXL15k2rRpQOHf+qOjo5k5cyZdunRh6NChRe66WHD/unXr6NChAx07dmTr1q2MGzcOgIkTJ9K9\ne/ciB4uvbmFc+bzg76NGjWLDhg20bduWS5cu2QenZ82aRVRUFF26dOHNN9/kvffeY8+ePcV+j65d\nuzJixAiGDh1K27ZtiY2NJTc3F4DevXuTmJjInXfeSdWqVYu9Py8vr8jcIgW06ZxUKgkJCbzxxhum\n7+cv4knUIpBKxWKx6DdfkTJSi0BExMupRSAi4uVUCEREvJwKgYiIl1MhEBHxcioEIiJeToVARMTL\n/T/CZjq1Ab+myQAAAABJRU5ErkJggg==\n", 293 | "text": [ 294 | "" 295 | ] 296 | } 297 | ], 298 | "prompt_number": 52 299 | }, 300 | { 301 | "cell_type": "code", 302 | "collapsed": false, 303 | "input": [], 304 | "language": "python", 305 | "metadata": {}, 306 | "outputs": [] 307 | } 308 | ], 309 | "metadata": {} 310 | } 311 | ] 312 | } -------------------------------------------------------------------------------- /02-coinflips.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "metadata": { 3 | "name": "02-coinflips" 4 | }, 5 | "nbformat": 3, 6 | "nbformat_minor": 0, 7 | "worksheets": [ 8 | { 9 | "cells": [ 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "See http://ivory.idyll.org/blog/2013-pycon-awesome-big-data-algorithms-talk.html" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "collapsed": false, 20 | "input": [ 21 | "# Flip a lot of coins;\n", 22 | "# Use the distribution of longest run of zeros to infer how many\n", 23 | "# coinflips were done, ex post facto.\n", 24 | "import random\n", 25 | "def generate_coinflips(num):\n", 26 | " return [ random.choice([0,1]) for i in range(num) ]\n", 27 | "\n", 28 | "def longest_run_zero(x):\n", 29 | " i = 0\n", 30 | " count = 0\n", 31 | " max_count = 0\n", 32 | " while i < len(x):\n", 33 | " if x[i] == 0:\n", 34 | " count += 1\n", 35 | " else:\n", 36 | " if count > max_count:\n", 37 | " max_count = count\n", 38 | " count = 0\n", 39 | " i += 1\n", 40 | " \n", 41 | " return max_count" 42 | ], 43 | "language": "python", 44 | "metadata": {}, 45 | "outputs": [], 46 | "prompt_number": 7 47 | }, 48 | { 49 | "cell_type": "code", 50 | "collapsed": false, 51 | "input": [ 52 | "def longest_run_mc(runsize, num):\n", 53 | " z = []\n", 54 | " for i in range(num):\n", 55 | " x = generate_coinflips(runsize)\n", 56 | " count = longest_run_zero(x)\n", 57 | " z.append(count)\n", 58 | " \n", 59 | " return z" 60 | ], 61 | "language": "python", 62 | "metadata": {}, 63 | "outputs": [], 64 | "prompt_number": 8 65 | }, 66 | { 67 | "cell_type": "code", 68 | "collapsed": false, 69 | "input": [ 70 | "# do 100 runs of 1000 coinflips, and plot the distribution\n", 71 | "d = longest_run_mc(100, 1000)\n", 72 | "hist(d, bins=max(d), range=(0, max(d)))\n", 73 | "\n", 74 | "# we expect a peak right around...\n", 75 | "print math.log(100, 2)" 76 | ], 77 | "language": "python", 78 | "metadata": {}, 79 | "outputs": [ 80 | { 81 | "output_type": "stream", 82 | "stream": "stdout", 83 | "text": [ 84 | "6.64385618977\n" 85 | ] 86 | }, 87 | { 88 | "output_type": "display_data", 89 | "png": "iVBORw0KGgoAAAANSUhEUgAAAXgAAAD9CAYAAAC2l2x5AAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAFY1JREFUeJzt3X9MVff9x/HXJRZq63WxNQUTvNVMxgX8wdVxL6kRr2Rz\nhMXBYhpnZtsUlmzXLM6f/yymxf1h49rNodmQNLttFuOWpUkzu05RtpxW3HYvTksXilMXCLjoqHUr\nh4qN2vP9w29vdOCFi/dy4OPzkZDAPfd8zvtezn1xeN/POdfjOI4jAIBxstwuAACQGQQ8ABiKgAcA\nQxHwAGAoAh4ADEXAA4Chkgb89evXFQqFVFpaqvLycu3du1eSZNu2ampq5PP5VFtbq8HBwcQ6+/bt\nU0FBgYqLi9XW1pbZ6gEA9+QZbR78tWvX9Mgjj+jTTz/VsmXL9Oabb+rNN99UX1+fXnnlFW3btk3z\n5s3T9u3b1d/fr4qKCh07dkzd3d3asmWLTp8+PVGPBQBwh1FbNI888ogkaXBwUDdv3lROTo7i8bjq\n6+uVk5Ojuro6xWIxSVIsFlNVVZV8Pp9Wrlwpx3Fk23ZmHwEAYESjBvxnn32mJUuWKDc3V9///vfl\n8/nU3t4uv98vSfL7/YrH45JuB3xRUVFi3cLCwsQyAMDEmjbaHbKystTR0aGenh5VV1dr+fLlSuXq\nBh6PZ0y3AQBGl0r+jnkWzbx581RdXa1YLKaysjJ1dXVJkrq6ulRWViZJCoVC+uCDDxLrnD17NrFs\npCKn6teLL77oeg0PYu3U7/4X9bv7laqkAX/lyhX997//lSR99NFHOnbsmGpqahQKhRSNRjU0NKRo\nNKry8nJJUjAYVEtLi3p7e2VZlrKysuT1elMuCgBw/5K2aC5duqTnnntOt27dUl5enrZv3645c+Yo\nEolow4YNKiws1NKlS7Vnzx5JUm5uriKRiCorK5Wdna3m5uYJeRAAgOFGnSaZkY16POP6d2OysCxL\n4XDY7TLGZSrXLlG/26jfXalmJwEPAFNEqtnJpQoAwFAEPAAYioAHAEMR8ABgKAIeaTVz5mPyeDwZ\n+5o58zG3HyIwZTCLBml1+zIUmfzdsu/gwcUsGgCAJAIeAIxFwAOAoQh4ADAUAQ8AhiLgAcBQBDwA\nGIqABwBDEfAAYCgCHgAMRcADgKEIeAAwFAEPAIYi4AHAUAQ8ABiKgAcAQxHwAGAoAh4ADEXAA4Ch\nCHgAMBQBDwCGShrwfX19WrVqlUpKShQOh3Xo0CFJUkNDg/Lz8xUIBBQIBHTkyJHEOvv27VNBQYGK\ni4vV1taW2eoBAPfkcRzHudfCy5cv6/LlyyotLdWVK1cUDAbV0dGhn/70p/J6vdq6detd9+/v71dF\nRYWOHTum7u5ubdmyRadPnx6+UY9HSTaLKczj8UjK5O+WfQcPrlSzc1qyhXl5ecrLy5MkzZ49WyUl\nJWpvb5ekETcSi8VUVVUln88nn88nx3Fk27a8Xm8qjwEAkAZj7sFfuHBBnZ2dCoVCkqT9+/ervLxc\ne/bskW3bkqR4PK6ioqLEOoWFhYrH42kuGQAwFkmP4D9n27bWrVunvXv36tFHH1UkEtELL7yggYEB\n7dixQ83Nzdq+ffuIR/W3/2UfrqGhIfF9OBxWOBwe1wMAAFNZliXLssa9ftIevCTduHFDX//611Vd\nXa3NmzcPW97R0aGNGzfq5MmTeuutt9Ta2qrGxkZJUmlpqU6cODGsRUMP3lz04IHMSTU7k7ZoHMdR\nfX29Fi5ceFe4X7p0SZJ08+ZNHTp0SNXV1ZKkYDColpYW9fb2yrIsZWVl0X8HAJckbdGcPHlSBw8e\n1OLFixUIBCRJu3fv1q9//Wu99957ys7OVkVFhSKRiCQpNzdXkUhElZWVys7OVnNzc+YfAQBgRKO2\naDKyUVo0xqJFA2ROWls0AICpi4AHAEMR8ABgKAIeAAxFwAOAoQh4ADAUAQ8AhiLgAcBQBDwAGIqA\nBwBDEfAAYCgCHgAMRcADgKEIeAAwFAEPAIYi4AHAUAQ8ABiKgAcAQxHwAGAoAh4ADEXAA4ChCHgA\nMNQ0twsAUjNNHo8nY6N7vbM0MHA1Y+MDE8njOI4z4Rv1eOTCZjEBbodvJn+3mR+ffROTVarZSYsG\nAAxFwAOAoQh4ADAUAQ8Ahkoa8H19fVq1apVKSkoUDod16NAhSZJt26qpqZHP51Ntba0GBwcT6+zb\nt08FBQUqLi5WW1tbZqsHANxT0lk0ly9f1uXLl1VaWqorV64oGAyqo6NDTU1N6uvr0yuvvKJt27Zp\n3rx52r59u/r7+1VRUaFjx46pu7tbW7Zs0enTp4dvlFk0xmIWDZA5aZ1Fk5eXp9LSUknS7NmzVVJS\novb2dsXjcdXX1ysnJ0d1dXWKxWKSpFgspqqqKvl8Pq1cuVKO48i27ft4OACA8RpzD/7ChQvq7OxU\nMBhUe3u7/H6/JMnv9ysej0u6HfBFRUWJdQoLCxPLAAATa0xnstq2rXXr1mnv3r2aMWNGSv8i3Ous\nw4aGhsT34XBY4XB4zGMCwIPAsixZljXu9UcN+Bs3bmjt2rV65plnVFNTI0kqKytTV1eXAoGAurq6\nVFZWJkkKhUJqbW1NrHv27NnEsv91Z8ADAIb734PfXbt2pbR+0haN4ziqr6/XwoULtXnz5sTtoVBI\n0WhUQ0NDikajKi8vlyQFg0G1tLSot7dXlmUpKytLXq83pYIAAOmRdBZNW1ubKioqtHjx4kSr5aWX\nXtLy5cu1YcMGnTlzRkuXLtXBgwc1Y8YMSVJjY6P279+v7OxsNTc3a8WKFcM3yiwaYzGLBsicVLOT\ni40hrQh4IHO42BgAQBIBDwDGIuABwFAEPAAYioAHAEMR8ABgKAIeAAxFwAOAoQh4ADAUAQ8AhiLg\nAcBQBDwAGIqABwBDEfAAYCgCHgAMRcADgKEIeAAwFAEPAIYi4AHAUAQ8ABhqmtsFYGLNnPmYbPs/\nbpcBYAJ4HBc+Qj7VTwZH+ng8HkmZfO6n/vjsm5isUs1OWjQAYCgCHgAMRcADgKEIeAAwFAEPAIYi\n4AHAUEkDvq6uTrm5uVq0aFHitoaGBuXn5ysQCCgQCOjIkSOJZfv27VNBQYGKi4vV1taWuaoBAKNK\nOg/+xIkTmjFjhp599ln9/e9/lyTt2rVLXq9XW7duveu+/f39qqio0LFjx9Td3a0tW7bo9OnTI2+U\nefCuYR786OOzb2KySjU7k57JumLFCvX09Ay7faQNxGIxVVVVyefzyefzyXEc2bYtr9c75mIAAOkz\nrh78/v37VV5erj179si2bUlSPB5XUVFR4j6FhYWKx+PpqRIAkLKUr0UTiUT0wgsvaGBgQDt27FBz\nc7O2b98+4lH97XbAyBoaGhLfh8NhhcPhVEsBAKNZliXLssa9/qjXounp6dGaNWsSPfg7dXR0aOPG\njTp58qTeeusttba2qrGxUZJUWlqqEydOjNiioQfvHnrwo4/PvonJKuPXorl06ZIk6ebNmzp06JCq\nq6slScFgUC0tLert7ZVlWcrKyqL/DgAuStqiWb9+vd555x1duXJFc+fO1a5du2RZlt577z1lZ2er\noqJCkUhEkpSbm6tIJKLKykplZ2erubl5Qh4AAGBkXC74AUOLZvTx2TcxWXG5YACAJAIeAIxFwAOA\noQh4ADAUAQ8AhiLgAcBQBDwAGIqABwBDEfAAYCgCHgAMRcADgKFSvh48YLZpST/H4H55vbM0MHA1\nY+MDd+JiYw8YLjbm/vjs+xgvLjYGAJBEwAOAsQh4ADAUAQ8AhiLgAcBQBDwAGIqABwBDEfAAYCgC\nHgAMRcADgKEIeAAwFAEPAIYi4AHAUAQ8ABiKgAcAQyUN+Lq6OuXm5mrRokWJ22zbVk1NjXw+n2pr\nazU4OJhYtm/fPhUUFKi4uFhtbW2ZqxoAMKqkAf/888/r6NGjd93W1NQkn8+n8+fPKz8/XwcOHJAk\n9ff36xe/+IX++Mc/qqmpSZs2bcpc1QCAUSUN+BUrVmjWrFl33RaPx1VfX6+cnBzV1dUpFotJkmKx\nmKqqquTz+bRy5Uo5jiPbtjNXOQAgqZR78O3t7fL7/ZIkv9+veDwu6XbAFxUVJe5XWFiYWAYAmHgp\nf+h2Kp8HmOzDixsaGhLfh8NhhcPhVEsBAKNZliXLssa9fsoBX1ZWpq6uLgUCAXV1damsrEySFAqF\n1Nramrjf2bNnE8tGcmfAAwCG+9+D3127dqW0fsotmlAopGg0qqGhIUWjUZWXl0uSgsGgWlpa1Nvb\nK8uylJWVJa/Xm+rwAIA0SRrw69ev11NPPaVz585p7ty5eu211xSJRNTb26vCwkL961//0ve+9z1J\nUm5uriKRiCorK7Vx40Y1NjZOyAMAAIzM46TSVE/XRj2elHr5SJ/b74tk8rln/NHGZ9/HeKWanZzJ\nCgCGIuABwFAEPAAYioAHAEOlPA8ewP2YlvQEwPvl9c7SwMDVjI2PqYVZNA8YZtGYPz6vLXMxiwYA\nIImABwBjEfAAYCgCHgAMRcADgKEIeAAwFAEPAIYi4AHAUAQ8ABiKgAcAQxHwAGAoAh4ADEXAA4Ch\nCHgAMBQBDwCGIuABwFAEPAAYioAHAEMR8ABgKAIeAAxFwAOAoQh4ADAUAQ8Ahhp3wM+bN0+LFy9W\nIBBQMBiUJNm2rZqaGvl8PtXW1mpwcDBthQIAUjPugPd4PLIsS2fOnFE8HpckNTU1yefz6fz588rP\nz9eBAwfSVigAIDX31aJxHOeun+PxuOrr65WTk6O6ujrFYrH7Kg4AMH73dQRfWVmp2tpaHT58WJLU\n3t4uv98vSfL7/YkjewDAxJs23hVPnjypOXPmqKurS2vWrFEwGBx2RJ9MQ0ND4vtwOKxwODzeUgDA\nSJZlybKsca/vcVJJ5XvYunWrioqKdPToUe3cuVOBQEB/+9vf9NJLL+mNN94YvlGPJ6U/Bkgfj8cj\nKZPPPeO7PT6vLXOlmp3jatFcu3ZNtm1Lkj788EO1tLSoqqpKoVBI0WhUQ0NDikajKi8vH8/wAIA0\nGNcRfHd3t775zW9Kkh5//HF9+9vfVl1dnWzb1oYNG3TmzBktXbpUBw8e1IwZM4ZvlCN413AEb/74\nvLbMlWp2pqVFkyoC3j0EvPnj89oy14S0aAAAkx8BDwCGIuABwFAEPAAYatwnOgGYjKb9/xvpmeH1\nztLAwNWMjY/0IuABo9xUJmfp2Hbm/ngg/WjRAIChCHgAMBQtmklo5szHZNv/cbsMAFMcZ7JOQpk9\n23Tqn6nJ+O6Oz2vXPZzJCgCQRMADgLEIeAAwFG+yAkgBJ1JNJQQ8gBRwItVUQosGAAxFwAOAoQh4\nADAUAQ8AhiLgAcBQBDwAGIqABwBDEfAAYCgCHgAMRcADgKG4VAGASYRr3aQTAQ9gEuFaN+lEwAN4\ngGT2PwTpIUk3Mjh+atLeg3/33XdVVFSkgoIC7d+/P93DTwqWZbldAoBx+fw/hEx93cjw+KlJe8D/\n4Ac/UHNzs1pbW/Xzn/9cV65cSfcmXEfAA5gK0hrwH3/8sSSpoqJCTz75pFavXq1YLJbOTQAAxiit\nAd/e3i6/35/4ubi4WH/961/TuQkAwBi59iZrZt/oyLxdu3ZleAuZfH4y/dwzPuMz/mSQ1oAvKyvT\njh07Ej93dnaqqqpq2P0cJ3PToAAAt6W1RfOFL3xB0u2ZND09PTp+/LhCoVA6NwEAGKO0t2h+9rOf\n6bvf/a5u3LihTZs2afbs2eneBABgDNI+TXLlypXq6urShQsXtGnTpruWTeU58n19fVq1apVKSkoU\nDod16NAht0tK2a1btxQIBLRmzRq3S0nZJ598oueee05f+tKXpuSb96+++qqeeuopLVu2TJs3b3a7\nnFHV1dUpNzdXixYtStxm27Zqamrk8/lUW1urwcFBFytMbqT6d+zYoaKiIi1dulSbN2/W0NCQixUm\nN1L9n/vJT36irKwsXb06+iUXJvRiY1N5jvxDDz2kvXv3qrOzU2+88YZ27twp27bdLisljY2NKi4u\nnpJvcL/44ovy+Xx6//339f7776uoqMjtksbs6tWr2r17t44fP6729nadO3dOLS0tbpeV1PPPP6+j\nR4/edVtTU5N8Pp/Onz+v/Px8HThwwKXqRjdS/atXr1ZnZ6dOnTqlTz75ZFIfpI1Uv3T7QPP48eN6\n8sknxzTOhAX8VJ8jn5eXp9LSUknS7NmzVVJSolOnTrlc1dhdvHhRf/jDH/Sd73xnSr7J3draqh/+\n8Id6+OGHNW3atMT7PVPB9OnT5TiOPv74Yw0NDenatWuaNWuW22UltWLFimE1xuNx1dfXKycnR3V1\ndZP69TtS/V/96leVlZWlrKwsfe1rX9M777zjUnWjG6l+Sdq6dat+/OMfj3mcCQt4k+bIX7hwQZ2d\nnQoGg26XMmZbtmzRyy+/rKysqXeF6IsXL+r69euKRCIKhULas2ePrl+/7nZZYzZ9+nQ1NTVp3rx5\nysvL0/Lly6fUvvO5O1/Dfr9f8Xjc5YrG79VXX51yrcrf/e53ys/P1+LFi8e8ztR7tbvMtm2tW7dO\ne/fu1aOPPup2OWPy+9//Xk888YQCgcCUPHq/fv26zp07p7Vr18qyLHV2duq3v/2t22WN2YcffqhI\nJKIPPvhAPT09+stf/qK3337b7bJSNhX3nZH86Ec/ktfr1dNPP+12KWN27do17d69+67zb8by+5iw\ngC8rK9PZs2cTP3d2dqq8vHyiNp8WN27c0Nq1a/XMM8+opqbG7XLG7M9//rMOHz6s+fPna/369frT\nn/6kZ5991u2yxmzBggUqLCzUmjVrNH36dK1fv15Hjhxxu6wxi8fjKi8v14IFC/T444/r6aef1rvv\nvut2WSkrKytTV1eXJKmrq0tlZWUuV5S6119/XS0tLTp48KDbpaTkn//8p3p6erRkyRLNnz9fFy9e\n1LJly9Tf3590vQkL+Kk+R95xHNXX12vhwoVTYhbEnXbv3q2+vj51d3frN7/5jSorK/WrX/3K7bJS\nUlBQoFgsps8++0xvv/22vvKVr7hd0pitWLFCp06d0tWrV/Xpp5/qyJEjWr16tdtlpSwUCikajWpo\naEjRaHTKHaAdPXpUL7/8sg4fPqyHH37Y7XJSsmjRIv373/9Wd3e3uru7lZ+fr9OnT+uJJ55IvqIz\ngSzLcvx+v/PFL37RaWxsnMhN37cTJ044Ho/HWbJkiVNaWuqUlpY6R44ccbuslFmW5axZs8btMlL2\nj3/8wwmFQs6SJUucbdu2OYODg26XlJLXXnvNqaiocL785S87O3fudG7duuV2SUl961vfcubMmeNk\nZ2c7+fn5TjQadQYGBpxvfOMbzty5c52amhrHtm23y7ynz+t/6KGHnPz8fOeXv/yls2DBAsfn8yVe\nv5FIxO0y72mk5/9O8+fPdz766KNRx/E4jiGNNQDAXXiTFQAMRcADgKEIeAAwFAEPAIYi4AHAUAQ8\nABjq/wDPZlPBG+ezbQAAAABJRU5ErkJggg==\n", 90 | "text": [ 91 | "" 92 | ] 93 | } 94 | ], 95 | "prompt_number": 9 96 | }, 97 | { 98 | "cell_type": "code", 99 | "collapsed": false, 100 | "input": [ 101 | "# if we do 500 coinflips, the peak shifts right, to... \n", 102 | "d = longest_run_mc(500, 1000)\n", 103 | "hist(d, bins=max(d), range=(0, max(d)))\n", 104 | "print math.log(500, 2)" 105 | ], 106 | "language": "python", 107 | "metadata": {}, 108 | "outputs": [ 109 | { 110 | "output_type": "stream", 111 | "stream": "stdout", 112 | "text": [ 113 | "8.96578428466\n" 114 | ] 115 | }, 116 | { 117 | "output_type": "display_data", 118 | "png": "iVBORw0KGgoAAAANSUhEUgAAAXgAAAD9CAYAAAC2l2x5AAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAFYxJREFUeJzt3X9MVff9x/HXJRRr6tXYOqEJ3mkqA8QfXFu4rJ14NWtL\nmzFYTIN2tcvApL1t52+zZDEd7g8b5zqnprOkGy6LYT+yZJlbhzC3XBW3cem0pkWYdYGAiw5/bOUy\ncVP8fP9g3m8tiFw83HP78flISPBezjlv4Prk8LnHq8cYYwQAsE6K2wMAAMYHgQcASxF4ALAUgQcA\nSxF4ALAUgQcAS40Y+O7ubi1ZskR5eXkKBoOqq6uTJFVXVyszM1N+v19+v1/19fWxbXbt2qWsrCzN\nmTNHTU1N4zs9AOCWPCNdB3/u3DmdO3dO+fn5unDhggoLC3XixAl997vfldfr1fr162/6+J6eHhUX\nF6uxsVEdHR1at26djh07Nu6fBABgqNSR7szIyFBGRoYkadq0acrLy1NLS4skabifC83NzSopKZHP\n55PP55MxRtFoVF6vdxxGBwCMZNRr8KdPn1Zra6sCgYAkaffu3SoqKtK2bdsUjUYlSZFIRLm5ubFt\nsrOzFYlEHB4ZADAaI57B3xCNRlVRUaEdO3bovvvuUygU0quvvqre3l5t2rRJNTU12rhx47Bn9R6P\nZ1S3AQBuL55Xl7ntGfzVq1e1bNkyrVy5UmVlZZKk6dOny+PxaMqUKXr55Zf1y1/+UpIUCAR08uTJ\n2Lbt7e0qKCi45ZDJ9PbNb37T9RmYya65mImZnH6L14iBN8aoqqpKc+fO1dq1a2O3nz17VpJ07do1\n1dXV6emnn5YkFRYWqqGhQV1dXQqHw0pJSWH9HQBcMuISzdGjR7Vv3z7Nnz9ffr9fkrR161b95Cc/\n0bvvvqu0tDQVFxcrFApJktLT0xUKhbR06VKlpaWppqZm/D8DfGJNnny/otF/OrKvtLR7VV1d7ci+\nAFuMGPjPfe5zun79+pDbn3rqqVtus2bNGq1Zs+bOJ0uwYDDo9ghD2D7TYNydebXq//43+Z7Xsf37\n5xRmGj8jXgc/bgf1eMa0ngS7DD7Z7tTjgMcU7BdvO0d1FQ2Q/FIduTrL652q3t5LDswDuI8zeLjG\n6TN4Z/bFYxPJK9528mJjAGApAg8AliLwAGApAg8AliLwAGApAg8AliLwAGApAg8AliLwAGApAg8A\nliLwAGApAg8AliLwAGApAg8AliLwAGApAg8AliLwAGApAg8AliLwAGApAg8AliLwAGApAg8AliLw\nAGApAg8AliLwAGApAg8AliLwAGApAg8AliLwAGApAg8AliLwAGApAg8Alhox8N3d3VqyZIny8vIU\nDAZVV1cnSYpGoyorK5PP51N5ebn6+vpi2+zatUtZWVmaM2eOmpqaxnd6AMAteYwx5lZ3njt3TufO\nnVN+fr4uXLigwsJCnThxQnv27FF3d7e+853vaMOGDZo5c6Y2btyonp4eFRcXq7GxUR0dHVq3bp2O\nHTs29KAej0Y4LO4SHo9HklOPA6f2xWMTySvedo54Bp+RkaH8/HxJ0rRp05SXl6eWlhZFIhFVVVVp\nwoQJqqysVHNzsySpublZJSUl8vl8Wrx4sYwxikajd/DpAADGatRr8KdPn1Zra6sKCwvV0tKinJwc\nSVJOTo4ikYikwcDn5ubGtsnOzo7dBwBIrNTRfFA0GlVFRYV27NihSZMmxfUrwuCv4UNVV1fH3g8G\ngwoGg6PeJwDcDcLhsMLh8Ji3v23gr169qmXLlmnlypUqKyuTJBUUFKitrU1+v19tbW0qKCiQJAUC\nAR08eDC2bXt7e+y+j/to4AEAQ3385HfLli1xbT/iEo0xRlVVVZo7d67Wrl0buz0QCKi2tlb9/f2q\nra1VUVGRJKmwsFANDQ3q6upSOBxWSkqKvF5vXAMBAJwx4lU0TU1NKi4u1vz582NLLa+99poee+wx\nPffcczp+/LgWLlyoffv2adKkSZKknTt3avfu3UpLS1NNTY0WLVo09KBcRQNxFQ0Qr3jbOWLgxwuB\nh0TggXg5epkkAOCTi8ADgKUIPABYisADgKUIPABYalT/khW4e6Te8l9fx8vrnare3kuO7AsYCy6T\nhGuS9TJJJ2ficQ4ncZkkAEASgQcAaxF4ALAUgQcASxF4ALAUgQcASxF4ALAUgQcASxF4ALAUgQcA\nSxF4ALAUgQcASxF4ALAUgQcASxF4ALAUgQcASxF4ALAUgQcASxF4ALAUgQcASxF4ALAUgQcASxF4\nALAUgQcASxF4ALAUgQcASxF4ALAUgQcASxF4ALDUiIGvrKxUenq65s2bF7uturpamZmZ8vv98vv9\nqq+vj923a9cuZWVlac6cOWpqahq/qQEAt+Uxxphb3XnkyBFNmjRJzz//vN577z1J0pYtW+T1erV+\n/fqbPranp0fFxcVqbGxUR0eH1q1bp2PHjg1/UI9HIxwWdwmPxyPJqceBU/tydiYe53BSvO1MHenO\nRYsWqbOzc8jtwx2gublZJSUl8vl88vl8MsYoGo3K6/WOehgAgHPGtAa/e/duFRUVadu2bYpGo5Kk\nSCSi3Nzc2MdkZ2crEok4MyUAIG4jnsEPJxQK6dVXX1Vvb682bdqkmpoabdy4cdiz+sFfwYdXXV0d\nez8YDCoYDMY7CgBYLRwOKxwOj3n7EdfgJamzs1OlpaWxNfiPOnHihF566SUdPXpUv/71r3Xw4EHt\n3LlTkpSfn68jR44Mu0TDGjwk1uCBeMXbzriXaM6ePStJunbtmurq6vT0009LkgoLC9XQ0KCuri6F\nw2GlpKSw/g4ALhpxiWbFihU6dOiQLly4oBkzZmjLli0Kh8N69913lZaWpuLiYoVCIUlSenq6QqGQ\nli5dqrS0NNXU1CTkEwAADO+2SzTjclCWaCCWaIB4jfsSDQDgk4HAA4ClCDwAWIrAA4ClCDwAWIrA\nA4ClCDwAWIrAA4ClCDwAWIrAA4ClCDwAWIrAA4ClCDwAWIrAA4Cl4v4v+3B3mzz5fkWj/3R7DACj\nwOvBIy7J+RruTu6L14NH8uL14AEAkgg8AFiLwAOApQg8AFiKwAOApQg8AFiKwAOApQg8AFiKwAOA\npQg8AFiKwAOApQg8AFiKwAOApQg8AFiKwAOApQg8AFiKwAOApQg8AFiKwAOApQg8AFhqxMBXVlYq\nPT1d8+bNi90WjUZVVlYmn8+n8vJy9fX1xe7btWuXsrKyNGfOHDU1NY3f1ACA2xox8F/96ld14MCB\nm27bs2ePfD6fPvjgA2VmZurNN9+UJPX09Oj73/++fv/732vPnj1avXr1+E0NALitEQO/aNEiTZ06\n9abbIpGIqqqqNGHCBFVWVqq5uVmS1NzcrJKSEvl8Pi1evFjGGEWj0fGbHAAworjX4FtaWpSTkyNJ\nysnJUSQSkTQY+Nzc3NjHZWdnx+4DACRearwbGGNG/bEej+eW91VXV8feDwaDCgaD8Y4CAFYLh8MK\nh8Nj3j7uwBcUFKitrU1+v19tbW0qKCiQJAUCAR08eDD2ce3t7bH7hvPRwAMAhvr4ye+WLVvi2j7u\nJZpAIKDa2lr19/ertrZWRUVFkqTCwkI1NDSoq6tL4XBYKSkp8nq98e4eAOCQEQO/YsUKPfroozp1\n6pRmzJihvXv3KhQKqaurS9nZ2fr73/+uF198UZKUnp6uUCikpUuX6qWXXtLOnTsT8gkAAIbnMfEs\nqjt1UI8nrrV8JI/B51Wc+t4l476cnYnHOZwUbzvjXoMHMFqpI15oMFpe71T19l5yYB7cbTiDR1w4\ng3djX/x9waB428lr0QCApQg8AFiKwAOApQg8AFiKwAOApQg8AFiKwAOApQg8AFiKwAOApQg8AFiK\nwAOApQg8AFiKwAOApQg8AFiKwAOApQg8AFiKwAOApQg8AFiKwAOApQg8AFiKwAOApQg8AFiKwAOA\npQg8AFiKwAOApQg8AFiKwAOApQg8AFiKwAOApQg8AFiKwAOApQg8AFiKwAOApcYc+JkzZ2r+/Pny\n+/0qLCyUJEWjUZWVlcnn86m8vFx9fX2ODQoAiM+YA+/xeBQOh3X8+HFFIhFJ0p49e+Tz+fTBBx8o\nMzNTb775pmODAgDic0dLNMaYm/4ciURUVVWlCRMmqLKyUs3NzXc0HABg7O7oDH7p0qUqLy/X/v37\nJUktLS3KycmRJOXk5MTO7AEAiZc61g2PHj2qBx98UG1tbSotLVVhYeGQM/qRVFdXx94PBoMKBoNj\nHQUArBQOhxUOh8e8vcfEU+VbWL9+vXJzc3XgwAFt3rxZfr9ff/nLX/Taa6/pF7/4xdCDejxx/TBA\n8vB4PJKc+t4l476Scyb+vkCKv51jWqK5fPmyotGoJOn8+fNqaGhQSUmJAoGAamtr1d/fr9raWhUV\nFY1l9wAAB4zpDL6jo0Nf+tKXJEkPPPCAvvzlL6uyslLRaFTPPfecjh8/roULF2rfvn2aNGnS0INy\nBv+JxRm8G/vi7wsGxdtOR5Zo4kXgP7kIvBv74u8LBiVkiQYAkPwIPABYasyXSQJIlNT/LY3dOa93\nqnp7LzmyLyQ/Ag8kvWty6nmBaNSZHxT4ZGCJBgAsReABwFIEHgAsReABwFIEHgAsReABwFIEHgAs\nReABwFIEHgAsReABwFIEHgAsReABwFIEHgAsReABwFIEHgAsReABwFIEHgAsReABwFIEHgAsReAB\nwFIEHgAsReABwFKpbg+AxJg8+X5Fo/90ewwACeQxxpiEH9TjkQuHvat5PB5JTnzNndpPsu7L9pnu\nkXTNkT15vVPV23vJkX1hdOJtJ2fwwF3lmpz6YRGN3vO/E4c7ww+K8UPgAYyRMz8sotE7/yGB4fEk\nKwBYisADgKUIPABYisADgKUIPABrTJ58vzwezx2/TZ58v9ufiiO4igaAy1Idudzy/3Flzw2On8Ef\nPnxYubm5ysrK0u7du53e/bgJh8NujzBEMs4EOO/G5ZZOvOGjHA/8mjVrVFNTo4MHD+qNN97QhQsX\nnD7EuEjGmCbjTMDdIcWRpR63l3scDfyHH34oSSouLtanP/1pPfHEE2pubnbyEACQANfl1G8Vbr4G\nlKOBb2lpUU5OTuzPc+bM0Z///GcnDwEAGCXXnmR19kkVZ2zZssXtEYZwdianvuZOfu+ScV/MlPh9\n2T2TW71zNPAFBQXatGlT7M+tra0qKSkZ8nG8kiQAjD9Hl2imTJkiafBKms7OTv3ud79TIBBw8hAA\ngFFyfInme9/7nl544QVdvXpVq1ev1rRp05w+BABgFBy/THLx4sVqa2vT6dOntXr16pvuS7Zr5Lu7\nu7VkyRLl5eUpGAyqrq7O7ZFiBgYG5Pf7VVpa6vYokqR///vf+spXvqLPfOYzSfXk+VtvvaVHH31U\nDz/8sNauXevKDJWVlUpPT9e8efNit0WjUZWVlcnn86m8vFx9fX2uz7Rp0ybl5uZq4cKFWrt2rfr7\n+12f6YbXX39dKSkpunQpsa8Lf6uZ9u7dq9zcXOXl5enrX/96Qme61VwnT57UF77wBeXn56u0tFRt\nbW2335FJoPz8fHPo0CHT2dlpsrOzzfnz5xN5+CHOnj1rjh8/bowx5vz582bWrFmmt7fX1ZlueP31\n182zzz5rSktL3R7FGGPMhg0bzObNm01/f7+5evWq+de//uX2SObixYtm5syZpq+vzwwMDJinnnrK\nHDhwIOFzHD582Bw7dszMnTs3dtu2bdvMK6+8Yq5cuWJefvlls337dtdnamxsNAMDA2ZgYMCsWrXK\n/OAHP3B9JmOM6erqMk8++aSZOXOmuXjxouszvffee6aoqMicOnXKGGNMT09PQme61VwVFRXmZz/7\nmTHGmLq6OrN8+fLb7idhr0WTjNfIZ2RkKD8/X5I0bdo05eXl6Z133nF1Jkk6c+aMfvvb32rVqlVJ\n84T0wYMH9Y1vfEP33nuvUlNTY8+3uGnixIkyxujDDz9Uf3+/Ll++rKlTpyZ8jkWLFg05biQSUVVV\nlSZMmKDKysqEP9aHm+nxxx9XSkqKUlJS9OSTT+rQoUOuzyRJ69ev17e//e2EznLDcDPV19erqqpK\nWVlZkqRPfepTSTHXlClTdPHiRV2/fl0XL14c1WM9YYFP9mvkT58+rdbWVhUWFro9itatW6ft27cr\nJSU5XgvuzJkzunLlikKhkAKBgLZt26YrV664PZYmTpyoPXv2aObMmcrIyNBjjz2WFN8/6ebHe05O\njiKRiMsT3eytt95KiuW/X/3qV8rMzNT8+fPdHiWmsbFR77//vh555BGtWrVKJ0+edHskSdL27du1\nc+dOTZ06VW+88Ya2bdt2222SoyAui0ajqqio0I4dO3Tfffe5OstvfvMbTZ8+XX6/P2nO3q9cuaJT\np05p2bJlCofDam1t1c9//nO3x9L58+cVCoV08uRJdXZ26k9/+pPefvttt8eSlNyXAn/rW9+S1+vV\nM8884+ocly9f1tatW2/6tx7J8HW7cuWKLl26pCNHjqisrEyvvPKK2yNJGlyX/9rXvqaLFy/qxRdf\nVFVV1W23SVjgCwoK1N7eHvtza2urioqKEnX4W7p69aqWLVumlStXqqyszO1x9Mc//lH79+/XrFmz\ntGLFCv3hD3/Q888/7+pMs2fPVnZ2tkpLSzVx4kStWLFC9fX1rs4kDS6DFBUVafbs2XrggQf0zDPP\n6PDhw26PJWnw8X7jSbC2tjYVFBS4PNGgH/3oR2poaNC+ffvcHkV/+9vf1NnZqQULFmjWrFk6c+aM\nHn74YfX09Lg6V1FRkSoqKjRx4kSVlpaqvb09KX5jbWpqUmVlpVJTU1VVVTWqx3rCAp+M18gbY1RV\nVaW5c+e6dgXGx23dulXd3d3q6OjQT3/6Uy1dulQ//vGP3R5LWVlZam5u1vXr1/X222/r85//vNsj\nadGiRXrnnXd06dIl/ec//1F9fb2eeOIJt8eSJAUCAdXW1qq/v1+1tbVJcTJz4MABbd++Xfv379e9\n997r9jiaN2+e/vGPf6ijo0MdHR3KzMzUsWPHNH36dFfn+uxnP6v6+noZY9Tc3KyHHnooKb5eS5Ys\n0f79+yUNLm09/vjjt9/I+ed/by0cDpucnBzz0EMPmZ07dyby0MM6cuSI8Xg8ZsGCBSY/P9/k5+eb\n+vp6t8eKCYfDSXMVzV//+lcTCATMggULzIYNG0xfX5/bIxljjNm7d68pLi42jzzyiNm8ebMZGBhI\n+AzLly83Dz74oElLSzOZmZmmtrbW9Pb2mi9+8YtmxowZpqyszESjUVdmuueee0xmZqb54Q9/aGbP\nnm18Pl/ssR4KhVyZ6aNfp4+aNWtWwq+iGW6ma9eumRdeeMHk5OSY8vJyE4lEEjrTR+e68f2rra01\n77//vlm+fLmZP3++efbZZ01bW9tt9+MxJgkWvQAAjuNJVgCwFIEHAEsReACwFIEHAEsReACwFIEH\nAEv9H7N67AcpkRvoAAAAAElFTkSuQmCC\n", 119 | "text": [ 120 | "" 121 | ] 122 | } 123 | ], 124 | "prompt_number": 10 125 | } 126 | ], 127 | "metadata": {} 128 | } 129 | ] 130 | } -------------------------------------------------------------------------------- /03-hyper-log-log-counter.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "metadata": { 3 | "name": "03-hyper-log-log-counter" 4 | }, 5 | "nbformat": 3, 6 | "nbformat_minor": 0, 7 | "worksheets": [ 8 | { 9 | "cells": [ 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "See http://ivory.idyll.org/blog/2013-pycon-awesome-big-data-algorithms-talk.html" 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "metadata": {}, 20 | "source": [ 21 | "# HyperLogLog counter\n", 22 | "\n", 23 | "The HyperLogLog counter uses bit patterns in hash functions to estimate \n", 24 | "\n", 25 | "Read:\n", 26 | " \n", 27 | "http://blog.aggregateknowledge.com/2012/10/25/sketch-of-the-day-hyperloglog-cornerstone-of-a-big-data-infrastructure/\n", 28 | "\n", 29 | "The code was mostly taken from\n", 30 | "\n", 31 | "https://github.com/svpcom/hyperloglog\n", 32 | "\n", 33 | "which is Vasily Evseenko's rewrite of Nelson Gon\u00e7alves's Log-Log-Sketch repo (https://github.com/goncalvesnelson/). I'm much indebted to them!" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "collapsed": false, 39 | "input": [ 40 | "# derived constants etc. These are just utility functions that you can ignore.\n", 41 | "\n", 42 | "def _get_alpha(b):\n", 43 | " if not (4 <= b <= 16):\n", 44 | " raise ValueError(\"b=%d should be in range [4 : 16]\" % b)\n", 45 | "\n", 46 | " if b == 4:\n", 47 | " return 0.673\n", 48 | "\n", 49 | " if b == 5:\n", 50 | " return 0.697\n", 51 | "\n", 52 | " if b == 6:\n", 53 | " return 0.709\n", 54 | "\n", 55 | " return 0.7213 / (1.0 + 1.079 / (1 << b))\n", 56 | "\n", 57 | "def estimate_cardinality(alpha, bits, bins):\n", 58 | " # harmonic mean\n", 59 | " E = alpha * float(len(bins) ** 2) / sum(math.pow(2.0, -x) for x in bins)\n", 60 | " \n", 61 | " if E <= 2.5 * bits: # Small range correction \n", 62 | " V = bins.count(0) #count number or registers equal to 0\n", 63 | " return bits * math.log(bins/ float(V)) if V > 0 else E\n", 64 | " elif E <= float(1L << 160) / 30.0:\n", 65 | " return E\n", 66 | " else:\n", 67 | " return -(1L << 160) * math.log(1.0 - E / (1L << 160))\n" 68 | ], 69 | "language": "python", 70 | "metadata": {}, 71 | "outputs": [], 72 | "prompt_number": 148 73 | }, 74 | { 75 | "cell_type": "code", 76 | "collapsed": false, 77 | "input": [ 78 | "# choose the precision by choosing how many estimators to track.\n", 79 | "bits = 8\n", 80 | "alpha = _get_alpha(bits)\n", 81 | "num_bins = 1 << bits\n", 82 | "bit_bins = [ 1L << i for i in range(160 - bits + 1) ]\n", 83 | "\n", 84 | "print 'num bins:', num_bins" 85 | ], 86 | "language": "python", 87 | "metadata": {}, 88 | "outputs": [ 89 | { 90 | "output_type": "stream", 91 | "stream": "stdout", 92 | "text": [ 93 | "num bins: 256\n" 94 | ] 95 | } 96 | ], 97 | "prompt_number": 149 98 | }, 99 | { 100 | "cell_type": "code", 101 | "collapsed": false, 102 | "input": [ 103 | "# 'rho' function to calculate the bit pattern to watch (string of 0s)\n", 104 | "import bisect\n", 105 | "\n", 106 | "# here, 'rho' is the number of 0s to the left of the first 'accuracy' bits.\n", 107 | "def rho(w):\n", 108 | " r = len(bit_bins) - bisect.bisect_right(bit_bins, w)\n", 109 | " return r" 110 | ], 111 | "language": "python", 112 | "metadata": {}, 113 | "outputs": [], 114 | "prompt_number": 150 115 | }, 116 | { 117 | "cell_type": "code", 118 | "collapsed": false, 119 | "input": [ 120 | "print 1, len(bit_bins) - rho(1)\n", 121 | "print 2, len(bit_bins) - rho(2)\n", 122 | "print 3, len(bit_bins) - rho(3)\n", 123 | "print 4, len(bit_bins) - rho(4)\n", 124 | "print 8, len(bit_bins) - rho(8)\n", 125 | "print 2**152, len(bit_bins) - rho(2**152)" 126 | ], 127 | "language": "python", 128 | "metadata": {}, 129 | "outputs": [ 130 | { 131 | "output_type": "stream", 132 | "stream": "stdout", 133 | "text": [ 134 | "1 1\n", 135 | "2 2\n", 136 | "3 2\n", 137 | "4 3\n", 138 | "8 4\n", 139 | "5708990770823839524233143877797980545530986496 153\n" 140 | ] 141 | } 142 | ], 143 | "prompt_number": 151 144 | }, 145 | { 146 | "cell_type": "code", 147 | "collapsed": false, 148 | "input": [ 149 | "print 'initializing', num_bins, 'estimators'\n", 150 | "estimators = [0]*num_bins" 151 | ], 152 | "language": "python", 153 | "metadata": {}, 154 | "outputs": [ 155 | { 156 | "output_type": "stream", 157 | "stream": "stdout", 158 | "text": [ 159 | "initializing 256 estimators\n" 160 | ] 161 | } 162 | ], 163 | "prompt_number": 152 164 | }, 165 | { 166 | "cell_type": "code", 167 | "collapsed": false, 168 | "input": [ 169 | "from hashlib import sha1\n", 170 | "\n", 171 | "# to add a number into the counter:\n", 172 | "def add(num):\n", 173 | " # take the hash of 'num'\n", 174 | " num = str(num)\n", 175 | " hash = long(sha1(num).hexdigest(), 16)\n", 176 | " \n", 177 | " # here, 'bin' is determined by the first 'bits' bits of hash\n", 178 | " bin = hash & ((1 << bits) - 1)\n", 179 | " \n", 180 | " # now count the number of 0s in the remaining bits\n", 181 | " remaining_bits = hash >> bits\n", 182 | " count = rho(remaining_bits)\n", 183 | " \n", 184 | " # take max of currently stored estimation & this one\n", 185 | " estimators[bin] = max(estimators[bin], count)\n", 186 | " \n", 187 | "import random\n", 188 | "for i in range(100000):\n", 189 | " num = random.randint(0, int(1e9))\n", 190 | " add(num)" 191 | ], 192 | "language": "python", 193 | "metadata": {}, 194 | "outputs": [], 195 | "prompt_number": 153 196 | }, 197 | { 198 | "cell_type": "code", 199 | "collapsed": false, 200 | "input": [ 201 | "print 'estimate cardinality as', estimate_cardinality(alpha, bits, estimators)" 202 | ], 203 | "language": "python", 204 | "metadata": {}, 205 | "outputs": [ 206 | { 207 | "output_type": "stream", 208 | "stream": "stdout", 209 | "text": [ 210 | "estimate cardinality as 98837.2382097\n" 211 | ] 212 | } 213 | ], 214 | "prompt_number": 154 215 | }, 216 | { 217 | "cell_type": "code", 218 | "collapsed": false, 219 | "input": [ 220 | "import random\n", 221 | "for i in range(100000):\n", 222 | " num = random.randint(0, int(1e9))\n", 223 | " add(num)" 224 | ], 225 | "language": "python", 226 | "metadata": {}, 227 | "outputs": [], 228 | "prompt_number": 155 229 | }, 230 | { 231 | "cell_type": "code", 232 | "collapsed": false, 233 | "input": [ 234 | "print 'estimate cardinality as', estimate_cardinality(alpha, bits, estimators)" 235 | ], 236 | "language": "python", 237 | "metadata": {}, 238 | "outputs": [ 239 | { 240 | "output_type": "stream", 241 | "stream": "stdout", 242 | "text": [ 243 | "estimate cardinality as 187751.540573\n" 244 | ] 245 | } 246 | ], 247 | "prompt_number": 156 248 | }, 249 | { 250 | "cell_type": "code", 251 | "collapsed": false, 252 | "input": [], 253 | "language": "python", 254 | "metadata": {}, 255 | "outputs": [] 256 | } 257 | ], 258 | "metadata": {} 259 | } 260 | ] 261 | } -------------------------------------------------------------------------------- /04-bloom-filters.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "metadata": { 3 | "name": "04-bloom-filters" 4 | }, 5 | "nbformat": 3, 6 | "nbformat_minor": 0, 7 | "worksheets": [ 8 | { 9 | "cells": [ 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "See http://ivory.idyll.org/blog/2013-pycon-awesome-big-data-algorithms-talk.html" 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "metadata": {}, 20 | "source": [ 21 | "## IPython Blocks\n", 22 | "\n", 23 | "Below, I will use IPython Blocks (https://github.com/jiffyclub/ipythonblocks) to demonstrate Bloom filters.\n", 24 | "\n", 25 | "IPython Blocks is a nifty little visualization tool built by Matt Davis @jiffyclub for use in teaching Python and programming basics. Here's a quick little demo --" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "collapsed": false, 31 | "input": [ 32 | "from ipythonblocks import BlockGrid" 33 | ], 34 | "language": "python", 35 | "metadata": {}, 36 | "outputs": [], 37 | "prompt_number": 1 38 | }, 39 | { 40 | "cell_type": "code", 41 | "collapsed": false, 42 | "input": [ 43 | "grid = BlockGrid(10,10, fill=(0,0,128))\n", 44 | "\n", 45 | "x = grid.shape[0]\n", 46 | "y = grid.shape[1]\n", 47 | "\n", 48 | "for block in grid:\n", 49 | " r = block.row * 255 / float(x)\n", 50 | " g = block.col * 255 / float(y)\n", 51 | " block.red = r\n", 52 | " block.green = g\n", 53 | " \n", 54 | "grid" 55 | ], 56 | "language": "python", 57 | "metadata": {}, 58 | "outputs": [ 59 | { 60 | "html": [ 61 | "
" 62 | ], 63 | "output_type": "pyout", 64 | "prompt_number": 2, 65 | "text": [ 66 | "" 67 | ] 68 | } 69 | ], 70 | "prompt_number": 2 71 | }, 72 | { 73 | "cell_type": "markdown", 74 | "metadata": {}, 75 | "source": [ 76 | "## Bloom filters\n", 77 | "\n", 78 | "I'll implement Bloom filters using multiple individual hash tables. (The canonical way to implement them is to use multiple hash functions with one big hash table, but I find that a bit harder to understand.)\n", 79 | "\n", 80 | "First, let's build a simple hash table object that doesn't track collisions. Note, to get 'num' from a string you'd just use a hash function." 81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "collapsed": false, 86 | "input": [ 87 | "import math\n", 88 | "\n", 89 | "class Hash(object):\n", 90 | " def __init__(self, size):\n", 91 | " self.size = size\n", 92 | " self.bits = [0]*size\n", 93 | " \n", 94 | " def add(self, num):\n", 95 | " num = num % self.size\n", 96 | " self.bits[num] = 1\n", 97 | " \n", 98 | " def get(self, num):\n", 99 | " num = num % self.size\n", 100 | " return self.bits[num]" 101 | ], 102 | "language": "python", 103 | "metadata": {}, 104 | "outputs": [], 105 | "prompt_number": 3 106 | }, 107 | { 108 | "cell_type": "markdown", 109 | "metadata": {}, 110 | "source": [ 111 | "Next, build a full Bloom filter that creates multiple hash tables; here, 'add' inserts the element into each hash table, and 'get' checks to see if the element is in all the hash tables.\n", 112 | "\n", 113 | "I've also included three utility methods: fp(), empirical_fp(), and show(). The first calculates the predicted false positive rate based on hash table occupancy; the second calculates the actual false positive rate for a range of numbers; and the third shows the hash table occupancy using IPython Blocks. " 114 | ] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "collapsed": false, 119 | "input": [ 120 | "class BloomFilter(object):\n", 121 | " def __init__(self, *sizes):\n", 122 | " self.hashes = [ Hash(size) for size in sizes ]\n", 123 | " \n", 124 | " def add(self, num):\n", 125 | " for h in self.hashes:\n", 126 | " h.add(num)\n", 127 | " \n", 128 | " def get(self, num):\n", 129 | " for h in self.hashes:\n", 130 | " if not h.get(num):\n", 131 | " return 0\n", 132 | " return 1\n", 133 | " \n", 134 | " def fp(self):\n", 135 | " total = 0.\n", 136 | " for h in self.hashes:\n", 137 | " occupancy = sum(h.bits)\n", 138 | " f = occupancy / float(h.size)\n", 139 | " total += math.log(f, 2)\n", 140 | " \n", 141 | " return 2**total\n", 142 | " \n", 143 | " def empirical_fp(self, actual, max):\n", 144 | " found_true = 0\n", 145 | " found_false = 0\n", 146 | " for i in range(max):\n", 147 | " if self.get(i):\n", 148 | " if i in actual:\n", 149 | " found_true += 1\n", 150 | " else:\n", 151 | " found_false += 1\n", 152 | " \n", 153 | " return found_false / float(max)\n", 154 | " \n", 155 | " \n", 156 | " def show(self):\n", 157 | " rows = len(self.hashes)\n", 158 | " cols = max([ h.size for h in self.hashes ])\n", 159 | " grid = BlockGrid(cols, rows, fill=(0,0,0))\n", 160 | " for i, h in enumerate(self.hashes):\n", 161 | " for pos in range(h.size, cols):\n", 162 | " grid[i, pos] = (255, 255, 255)\n", 163 | " for j, bit in enumerate(h.bits):\n", 164 | " if bit:\n", 165 | " grid[i, j] = (255, 0, 0)\n", 166 | " return grid.show()" 167 | ], 168 | "language": "python", 169 | "metadata": {}, 170 | "outputs": [], 171 | "prompt_number": 4 172 | }, 173 | { 174 | "cell_type": "markdown", 175 | "metadata": {}, 176 | "source": [ 177 | "Let's create a Bloom filter with three hash tables, size 5, 7, and 11, and then show the occupied cells in the three hash tables after adding '253' and '8132' (no special significance to the numbers)." 178 | ] 179 | }, 180 | { 181 | "cell_type": "code", 182 | "collapsed": false, 183 | "input": [ 184 | "x = BloomFilter(5, 7, 11)\n", 185 | "x.show()\n", 186 | "x.add(253)\n", 187 | "x.show()\n", 188 | "print x.get(253)\n", 189 | "\n", 190 | "###\n", 191 | "\n", 192 | "x.add(8132)\n", 193 | "x.show()\n", 194 | "print x.get(8132)" 195 | ], 196 | "language": "python", 197 | "metadata": {}, 198 | "outputs": [ 199 | { 200 | "html": [ 201 | "
" 202 | ], 203 | "output_type": "display_data", 204 | "text": [ 205 | "" 206 | ] 207 | }, 208 | { 209 | "html": [ 210 | "
" 211 | ], 212 | "output_type": "display_data", 213 | "text": [ 214 | "" 215 | ] 216 | }, 217 | { 218 | "output_type": "stream", 219 | "stream": "stdout", 220 | "text": [ 221 | "1\n" 222 | ] 223 | }, 224 | { 225 | "html": [ 226 | "
" 227 | ], 228 | "output_type": "display_data", 229 | "text": [ 230 | "" 231 | ] 232 | }, 233 | { 234 | "output_type": "stream", 235 | "stream": "stdout", 236 | "text": [ 237 | "1\n" 238 | ] 239 | } 240 | ], 241 | "prompt_number": 5 242 | }, 243 | { 244 | "cell_type": "markdown", 245 | "metadata": {}, 246 | "source": [ 247 | "Next, let's check out what happens when you start filling up the hash tables with lots and lots of entries." 248 | ] 249 | }, 250 | { 251 | "cell_type": "code", 252 | "collapsed": false, 253 | "input": [ 254 | "import random, time\n", 255 | "\n", 256 | "x = BloomFilter(5, 7, 11)\n", 257 | "actual = set()\n", 258 | "for _ in range(10):\n", 259 | " num = random.randint(0, 255)\n", 260 | " actual.add(num)\n", 261 | " x.add(num)\n", 262 | " x.show()\n", 263 | " theory, empirical = x.fp(), x.empirical_fp(actual, 255)\n", 264 | " print 'inserting', num\n", 265 | " print 'predicted FP:', theory, 'diff from actual:', abs(theory-empirical)\n", 266 | " time.sleep(1)" 267 | ], 268 | "language": "python", 269 | "metadata": {}, 270 | "outputs": [ 271 | { 272 | "html": [ 273 | "
" 274 | ], 275 | "output_type": "display_data", 276 | "text": [ 277 | "" 278 | ] 279 | }, 280 | { 281 | "output_type": "stream", 282 | "stream": "stdout", 283 | "text": [ 284 | "inserting 2\n", 285 | "predicted FP: 0.0025974025974 diff from actual: 0.0025974025974\n" 286 | ] 287 | }, 288 | { 289 | "html": [ 290 | "
" 291 | ], 292 | "output_type": "display_data", 293 | "text": [ 294 | "" 295 | ] 296 | }, 297 | { 298 | "output_type": "stream", 299 | "stream": "stdout", 300 | "text": [ 301 | "inserting 49\n", 302 | "predicted FP: 0.0207792207792 diff from actual: 0.00509294626942\n" 303 | ] 304 | }, 305 | { 306 | "html": [ 307 | "
" 308 | ], 309 | "output_type": "display_data", 310 | "text": [ 311 | "" 312 | ] 313 | }, 314 | { 315 | "output_type": "stream", 316 | "stream": "stdout", 317 | "text": [ 318 | "inserting 141\n", 319 | "predicted FP: 0.0701298701299 diff from actual: 0.0034632034632\n" 320 | ] 321 | }, 322 | { 323 | "html": [ 324 | "
" 325 | ], 326 | "output_type": "display_data", 327 | "text": [ 328 | "" 329 | ] 330 | }, 331 | { 332 | "output_type": "stream", 333 | "stream": "stdout", 334 | "text": [ 335 | "inserting 151\n", 336 | "predicted FP: 0.124675324675 diff from actual: 0.0187929717341\n" 337 | ] 338 | }, 339 | { 340 | "html": [ 341 | "
" 342 | ], 343 | "output_type": "display_data", 344 | "text": [ 345 | "" 346 | ] 347 | }, 348 | { 349 | "output_type": "stream", 350 | "stream": "stdout", 351 | "text": [ 352 | "inserting 26\n", 353 | "predicted FP: 0.194805194805 diff from actual: 0.0183346065699\n" 354 | ] 355 | }, 356 | { 357 | "html": [ 358 | "
" 359 | ], 360 | "output_type": "display_data", 361 | "text": [ 362 | "" 363 | ] 364 | }, 365 | { 366 | "output_type": "stream", 367 | "stream": "stdout", 368 | "text": [ 369 | "inserting 54\n", 370 | "predicted FP: 0.233766233766 diff from actual: 0.0220015278839\n" 371 | ] 372 | }, 373 | { 374 | "html": [ 375 | "
" 376 | ], 377 | "output_type": "display_data", 378 | "text": [ 379 | "" 380 | ] 381 | }, 382 | { 383 | "output_type": "stream", 384 | "stream": "stdout", 385 | "text": [ 386 | "inserting 0\n", 387 | "predicted FP: 0.363636363636 diff from actual: 0.0185383244207\n" 388 | ] 389 | }, 390 | { 391 | "html": [ 392 | "
" 393 | ], 394 | "output_type": "display_data", 395 | "text": [ 396 | "" 397 | ] 398 | }, 399 | { 400 | "output_type": "stream", 401 | "stream": "stdout", 402 | "text": [ 403 | "inserting 96\n", 404 | "predicted FP: 0.363636363636 diff from actual: 0.0224598930481\n" 405 | ] 406 | }, 407 | { 408 | "html": [ 409 | "
" 410 | ], 411 | "output_type": "display_data", 412 | "text": [ 413 | "" 414 | ] 415 | }, 416 | { 417 | "output_type": "stream", 418 | "stream": "stdout", 419 | "text": [ 420 | "inserting 248\n", 421 | "predicted FP: 0.623376623377 diff from actual: 0.0390628978864\n" 422 | ] 423 | }, 424 | { 425 | "html": [ 426 | "
" 427 | ], 428 | "output_type": "display_data", 429 | "text": [ 430 | "" 431 | ] 432 | }, 433 | { 434 | "output_type": "stream", 435 | "stream": "stdout", 436 | "text": [ 437 | "inserting 54\n", 438 | "predicted FP: 0.623376623377 diff from actual: 0.0390628978864\n" 439 | ] 440 | } 441 | ], 442 | "prompt_number": 6 443 | }, 444 | { 445 | "cell_type": "markdown", 446 | "metadata": {}, 447 | "source": [ 448 | "## One-sided error\n", 449 | "\n", 450 | "Bloom filters have what's called one-sided error: they may report elements that were never inserted as being members of the set -- false positives -- but they will never miss reporting elements that WERE inserted as being members -- false negatives. This is a straightforward consequence of the \"no collision tracking\" aspect of the hash tables: collisions lead to false reporting, but you never miss something you've already inserted.\n", 451 | "\n", 452 | "If you know the hash table sizes, it's easy to predict the collisions. One simple way is to test something that's 5*7*11 times added to something you've already inserted, i.e. to force a collision based on the modulus of the hash table sizes." 453 | ] 454 | }, 455 | { 456 | "cell_type": "code", 457 | "collapsed": false, 458 | "input": [ 459 | "# here, we add '3' and test for '3 + 5*7*11', or 388.\n", 460 | "x = BloomFilter(5, 7, 11)\n", 461 | "x.add(3)\n", 462 | "x.show()\n", 463 | "\n", 464 | "print x.get(3), x.get(3 + (5*7*11))\n", 465 | "print \"oh noes! 388 is falsely said to be present in the data set!\"" 466 | ], 467 | "language": "python", 468 | "metadata": {}, 469 | "outputs": [ 470 | { 471 | "html": [ 472 | "
" 473 | ], 474 | "output_type": "display_data", 475 | "text": [ 476 | "" 477 | ] 478 | }, 479 | { 480 | "output_type": "stream", 481 | "stream": "stdout", 482 | "text": [ 483 | "1 1\n", 484 | "oh noes! 388 is falsely said to be present in the data set!\n" 485 | ] 486 | } 487 | ], 488 | "prompt_number": 7 489 | }, 490 | { 491 | "cell_type": "code", 492 | "collapsed": false, 493 | "input": [], 494 | "language": "python", 495 | "metadata": {}, 496 | "outputs": [], 497 | "prompt_number": 7 498 | } 499 | ], 500 | "metadata": {} 501 | } 502 | ] 503 | } -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | This repo contains notebooks from my 2013 PyCon talk, "Awesome Big Data 2 | Algorithms". 3 | 4 | Blog post here: http://ivory.idyll.org/blog/2013-pycon-awesome-big-data-algorithms-talk.html 5 | Slides here: http://www.slideshare.net/c.titus.brown/2013-py-con-awesome-big-data-algorithms 6 | 7 | Titus Brown 8 | titus@idyll.org 9 | --------------------------------------------------------------------------------