├── .gitignore ├── CONTRIBUTING.md ├── LICENSE-examples ├── LICENSE.md ├── PATENTS ├── README.md ├── examples ├── dense_matrix.ipynb ├── enron.ipynb ├── pysparnn_utils.py └── sparse_search_comparison.ipynb ├── pysparnn ├── __init__.py ├── cluster_index.py └── matrix_distance.py ├── requirements.txt ├── run_tests.sh ├── setup.py └── tests ├── __init__.py └── test_pysparnn.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | 27 | # PyInstaller 28 | # Usually these files are written by a python script from a template 29 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 30 | *.manifest 31 | *.spec 32 | 33 | # Installer logs 34 | pip-log.txt 35 | pip-delete-this-directory.txt 36 | 37 | # Unit test / coverage reports 38 | htmlcov/ 39 | .tox/ 40 | .coverage 41 | .coverage.* 42 | .cache 43 | nosetests.xml 44 | coverage.xml 45 | *,cover 46 | .hypothesis/ 47 | 48 | # Translations 49 | *.mo 50 | *.pot 51 | 52 | # Django stuff: 53 | *.log 54 | 55 | # Sphinx documentation 56 | docs/_build/ 57 | 58 | # PyBuilder 59 | target/ 60 | 61 | #Ipython Notebook 62 | .ipynb_checkpoints 63 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to PySparNN 2 | We want to make contributing to this project as easy and transparent as 3 | possible. 4 | 5 | ## Pull Requests 6 | We actively welcome your pull requests. 7 | 8 | 1. Fork the repo and create your branch from `master`. 9 | 2. If you've added code that should be tested, add tests. 10 | 3. If you've changed APIs, update the documentation. 11 | 4. Ensure the test suite passes. 12 | 5. Make sure your code lints. 13 | 6. If you haven't already, complete the Contributor License Agreement ("CLA"). 14 | 15 | ## Contributor License Agreement ("CLA") 16 | In order to accept your pull request, we need you to submit a CLA. You only need 17 | to do this once to work on any of Facebook's open source projects. 18 | 19 | Complete your CLA here: 20 | 21 | ## Issues 22 | We use GitHub issues to track public bugs. Please ensure your description is 23 | clear and has sufficient instructions to be able to reproduce the issue. 24 | 25 | Facebook has a [bounty program](https://www.facebook.com/whitehat/) for the safe 26 | disclosure of security bugs. In those cases, please go through the process 27 | outlined on that page and do not file a public issue. 28 | 29 | ## Coding Style 30 | Please use pylint with the default settings. 31 | 32 | ## License 33 | By contributing to PySparNN, you agree that your contributions will be licensed 34 | under its BSD license. 35 | -------------------------------------------------------------------------------- /LICENSE-examples: -------------------------------------------------------------------------------- 1 | Copyright (c) 20__-present, Facebook, Inc. All rights reserved. 2 | 3 | The examples provided by Facebook are for non-commercial testing and evaluation 4 | purposes only. Facebook reserves all rights not expressly granted. 
5 | 6 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 7 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 8 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 9 | FACEBOOK BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 10 | ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 11 | WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 12 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | BSD License 2 | 3 | For PySparNN software 4 | 5 | Copyright (c) 2016-present, Facebook, Inc. All rights reserved. 6 | 7 | Redistribution and use in source and binary forms, with or without modification, 8 | are permitted provided that the following conditions are met: 9 | 10 | * Redistributions of source code must retain the above copyright notice, this 11 | list of conditions and the following disclaimer. 12 | 13 | * Redistributions in binary form must reproduce the above copyright notice, 14 | this list of conditions and the following disclaimer in the documentation 15 | and/or other materials provided with the distribution. 16 | 17 | * Neither the name Facebook nor the names of its contributors may be used to 18 | endorse or promote products derived from this software without specific 19 | prior written permission. 20 | 21 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 22 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 23 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 24 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR 25 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 26 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 27 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 28 | ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 29 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 30 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 31 | -------------------------------------------------------------------------------- /PATENTS: -------------------------------------------------------------------------------- 1 | Additional Grant of Patent Rights Version 2 2 | 3 | "Software" means the PySparNN software distributed by Facebook, Inc. 4 | 5 | Facebook, Inc. ("Facebook") hereby grants to each recipient of the Software 6 | ("you") a perpetual, worldwide, royalty-free, non-exclusive, irrevocable 7 | (subject to the termination provision below) license under any Necessary 8 | Claims, to make, have made, use, sell, offer to sell, import, and otherwise 9 | transfer the Software. For avoidance of doubt, no license is granted under 10 | Facebook’s rights in any patent claims that are infringed by (i) modifications 11 | to the Software made by you or any third party or (ii) the Software in 12 | combination with any software or other technology. 
13 | 14 | The license granted hereunder will terminate, automatically and without notice, 15 | if you (or any of your subsidiaries, corporate affiliates or agents) initiate 16 | directly or indirectly, or take a direct financial interest in, any Patent 17 | Assertion: (i) against Facebook or any of its subsidiaries or corporate 18 | affiliates, (ii) against any party if such Patent Assertion arises in whole or 19 | in part from any software, technology, product or service of Facebook or any of 20 | its subsidiaries or corporate affiliates, or (iii) against any party relating 21 | to the Software. Notwithstanding the foregoing, if Facebook or any of its 22 | subsidiaries or corporate affiliates files a lawsuit alleging patent 23 | infringement against you in the first instance, and you respond by filing a 24 | patent infringement counterclaim in that lawsuit against that party that is 25 | unrelated to the Software, the license granted hereunder will not terminate 26 | under section (i) of this paragraph due to such counterclaim. 27 | 28 | A "Necessary Claim" is a claim of a patent owned by Facebook that is 29 | necessarily infringed by the Software standing alone. 30 | 31 | A "Patent Assertion" is any lawsuit or other action alleging direct, indirect, 32 | or contributory infringement or inducement to infringe any patent, including a 33 | cross-claim or counterclaim. 34 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # PySparNN 2 | Approximate Nearest Neighbor Search for Sparse Data in Python! This library is well suited to finding nearest neighbors in sparse, high dimensional spaces (like text documents). 3 | 4 | Out of the box, PySparNN supports Cosine Distance (i.e. 1 - cosine_similarity). 5 | 6 | PySparNN benefits: 7 | * Designed to be efficient on sparse data (memory & cpu). 8 | * Implemented leveraging existing python libraries (scipy & numpy). 9 | * Easily extended with other metrics: Manhattan, Euclidian, Jaccard, etc. 10 | * Supports incremental insertion of elements. 11 | 12 | If your data is NOT SPARSE - please consider [faiss](https://github.com/facebookresearch/faiss) or [annoy](https://github.com/spotify/annoy). They use similar methods and I am a big fan of both. You should expect better performance on dense vectors from both of those projects. 13 | 14 | The most comparable library to PySparNN is scikit-learn's LSHForest module. As of this writing, PySparNN is ~4x faster on the 20newsgroups dataset (as a sparse vector). A more robust benchmarking on sparse data is desired. [Here is the comparison.](https://github.com/facebookresearch/pysparnn/blob/master/examples/sparse_search_comparison.ipynb) [Here is another comparison](https://github.com/facebookresearch/pysparnn/blob/master/examples/enron.ipynb) on the larger Enron email dataset. 15 | 16 | 17 | ## Example Usage 18 | ### Simple Example 19 | ```python 20 | import pysparnn.cluster_index as ci 21 | 22 | import numpy as np 23 | from scipy.sparse import csr_matrix 24 | 25 | features = np.random.binomial(1, 0.01, size=(1000, 20000)) 26 | features = csr_matrix(features) 27 | 28 | # build the search index! 
29 | data_to_return = range(1000) 30 | cp = ci.MultiClusterIndex(features, data_to_return) 31 | 32 | cp.search(features[:5], k=1, return_distance=False) 33 | >> [[0], [1], [2], [3], [4]] 34 | ``` 35 | ### Text Example 36 | ```python 37 | import pysparnn.cluster_index as ci 38 | 39 | from sklearn.feature_extraction.text import TfidfVectorizer 40 | 41 | data = [ 42 | 'hello world', 43 | 'oh hello there', 44 | 'Play it', 45 | 'Play it again Sam', 46 | ] 47 | 48 | tv = TfidfVectorizer() 49 | tv.fit(data) 50 | 51 | features_vec = tv.transform(data) 52 | 53 | # build the search index! 54 | cp = ci.MultiClusterIndex(features_vec, data) 55 | 56 | # search the index with a sparse matrix 57 | search_data = [ 58 | 'oh there', 59 | 'Play it again Frank' 60 | ] 61 | 62 | search_features_vec = tv.transform(search_data) 63 | 64 | cp.search(search_features_vec, k=1, k_clusters=2, return_distance=False) 65 | >> [['oh hello there'], ['Play it again Sam']] 66 | 67 | ``` 68 | 69 | ## Requirements 70 | PySparNN requires numpy and scipy. Tested with numpy 1.11.2 and scipy 0.18.1. 71 | 72 | ## Installation 73 | ```bash 74 | # clone pysparnn 75 | cd pysparnn 76 | pip install -r requirements.txt 77 | python setup.py install 78 | ``` 79 | 80 | ## How PySparNN works 81 | Searching for a document in an collection of D documents is naively O(D) (assuming documents are constant sized). 82 | 83 | However! we can create a tree structure where the first level is O(sqrt(D)) and each of the leaves are also O(sqrt(D)) - on average. 84 | 85 | We randomly pick sqrt(D) candidate items to be in the top level. Then -- each document in the full list of D documents is assigned to the closest candidate in the top level. 86 | 87 | This breaks up one O(D) search into two O(sqrt(D)) searches which is much much faster when D is big! 88 | 89 | This generalizes to h levels. The runtime becomes: 90 | O(h * h_root(D)) 91 | 92 | ## Further Information 93 | http://nlp.stanford.edu/IR-book/html/htmledition/cluster-pruning-1.html 94 | 95 | See the CONTRIBUTING file for how to help out. 96 | 97 | ## License 98 | PySparNN is BSD-licensed. We also provide an additional patent grant. 99 | -------------------------------------------------------------------------------- /examples/dense_matrix.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "# Copyright 2016-present, Facebook, Inc.\n", 12 | "# All rights reserved.\n", 13 | "\n", 14 | "# This source code is licensed under the license found in the\n", 15 | "# LICENSE-examples file in the root directory of this source tree." 
16 | ] 17 | }, 18 | { 19 | "cell_type": "markdown", 20 | "metadata": {}, 21 | "source": [ 22 | "## Study Sparse vs Dense Matrix Implementations\n", 23 | "Pysparnn defaults to sparse matricies but you may also use a dense matrix to improve performance\n", 24 | "\n", 25 | "This is typically when the number of dimensions is small" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": 2, 31 | "metadata": { 32 | "collapsed": true 33 | }, 34 | "outputs": [], 35 | "source": [ 36 | "import numpy as np\n", 37 | "import time" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": 3, 43 | "metadata": { 44 | "collapsed": false 45 | }, 46 | "outputs": [], 47 | "source": [ 48 | "# make sure you run 'python setup.py install' first!\n", 49 | "import pysparnn.cluster_index as ci\n", 50 | "import pysparnn.matrix_distance" 51 | ] 52 | }, 53 | { 54 | "cell_type": "markdown", 55 | "metadata": { 56 | "collapsed": false 57 | }, 58 | "source": [ 59 | "# Get data" 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": 4, 65 | "metadata": { 66 | "collapsed": false 67 | }, 68 | "outputs": [], 69 | "source": [ 70 | "# feature vectors are ~10% full and there are only 100 dimensions\n", 71 | "features = np.random.binomial(1, 0.1, size=(100000, 100))" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": 5, 77 | "metadata": { 78 | "collapsed": true 79 | }, 80 | "outputs": [], 81 | "source": [ 82 | "test_features = features[:5000]\n", 83 | "train_features = features[5000:]\n", 84 | "\n", 85 | "data_to_return = range(train_features.shape[0])" 86 | ] 87 | }, 88 | { 89 | "cell_type": "markdown", 90 | "metadata": {}, 91 | "source": [ 92 | "## Build models to compare" 93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": 6, 98 | "metadata": { 99 | "collapsed": false 100 | }, 101 | "outputs": [ 102 | { 103 | "name": "stderr", 104 | "output_type": "stream", 105 | "text": [ 106 | "/Users/spencebeecher/anaconda2/lib/python2.7/site-packages/pysparnn/matrix_distance.py:189: RuntimeWarning: divide by zero encountered in true_divide\n", 107 | " magnitude = 1.0 / (a_root_sum_square * self.matrix_root_sum_square)\n" 108 | ] 109 | } 110 | ], 111 | "source": [ 112 | "cp = ci.MultiClusterIndex(train_features, data_to_return)" 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": 7, 118 | "metadata": { 119 | "collapsed": false 120 | }, 121 | "outputs": [ 122 | { 123 | "name": "stderr", 124 | "output_type": "stream", 125 | "text": [ 126 | "/Users/spencebeecher/anaconda2/lib/python2.7/site-packages/pysparnn/matrix_distance.py:334: RuntimeWarning: divide by zero encountered in true_divide\n", 127 | " magnitude = 1.0 / (a_root_sum_square * self.matrix_root_sum_square)\n", 128 | "/Users/spencebeecher/anaconda2/lib/python2.7/site-packages/pysparnn/matrix_distance.py:336: RuntimeWarning: invalid value encountered in multiply\n", 129 | " return 1 - (dprod * magnitude)\n", 130 | "/Users/spencebeecher/anaconda2/lib/python2.7/site-packages/pysparnn/matrix_distance.py:108: RuntimeWarning: invalid value encountered in less_equal\n", 131 | " dist_filter = (dist_matrix <= max_distance)\n" 132 | ] 133 | } 134 | ], 135 | "source": [ 136 | "dense_cp = ci.MultiClusterIndex(train_features, data_to_return, \n", 137 | " distance_type=pysparnn.matrix_distance.DenseCosineDistance)" 138 | ] 139 | }, 140 | { 141 | "cell_type": "markdown", 142 | "metadata": {}, 143 | "source": [ 144 | "## Answer Key" 145 | ] 146 | }, 147 | { 148 | "cell_type": "code", 149 | 
"execution_count": 8, 150 | "metadata": { 151 | "collapsed": false 152 | }, 153 | "outputs": [], 154 | "source": [ 155 | "import pysparnn_utils" 156 | ] 157 | }, 158 | { 159 | "cell_type": "code", 160 | "execution_count": 9, 161 | "metadata": { 162 | "collapsed": true 163 | }, 164 | "outputs": [], 165 | "source": [ 166 | "from sklearn.neighbors import NearestNeighbors \n", 167 | "knn = NearestNeighbors()\n", 168 | " \n", 169 | "knn.fit(train_features)\n", 170 | "\n", 171 | "# get top 3 nearest neighbors for each document\n", 172 | "answers = knn.kneighbors(test_features, 3, return_distance=False)" 173 | ] 174 | }, 175 | { 176 | "cell_type": "markdown", 177 | "metadata": {}, 178 | "source": [ 179 | "## Compare Performance\n", 180 | "Don't worry so much about the recall performance. There are many items in this space (congested). These methods should return close matches even if they arent the closest absolute matches." 181 | ] 182 | }, 183 | { 184 | "cell_type": "code", 185 | "execution_count": 13, 186 | "metadata": { 187 | "collapsed": false 188 | }, 189 | "outputs": [ 190 | { 191 | "name": "stdout", 192 | "output_type": "stream", 193 | "text": [ 194 | "Percent of time sparse returns a top 3 result: 0.2498\n" 195 | ] 196 | } 197 | ], 198 | "source": [ 199 | "t0 = time.time()\n", 200 | "\n", 201 | "results = cp.search(test_features, return_distance=False)\n", 202 | "\n", 203 | "print('Percent of time sparse returns a top 3 result: {}'.format(pysparnn_utils.recall(answers, results).mean()))\n", 204 | "\n", 205 | "cp_time = time.time() - t0" 206 | ] 207 | }, 208 | { 209 | "cell_type": "code", 210 | "execution_count": 14, 211 | "metadata": { 212 | "collapsed": false 213 | }, 214 | "outputs": [ 215 | { 216 | "name": "stdout", 217 | "output_type": "stream", 218 | "text": [ 219 | "Percent of time dense returns a top 3 result: 0.2458\n" 220 | ] 221 | } 222 | ], 223 | "source": [ 224 | "t0 = time.time()\n", 225 | "\n", 226 | "results = dense_cp.search(test_features, return_distance=False)\n", 227 | "\n", 228 | "print('Percent of time dense returns a top 3 result: {}'.format(pysparnn_utils.recall(answers, results).mean()))\n", 229 | "\n", 230 | "dense_cp_time = time.time() - t0" 231 | ] 232 | }, 233 | { 234 | "cell_type": "code", 235 | "execution_count": 15, 236 | "metadata": { 237 | "collapsed": false 238 | }, 239 | "outputs": [ 240 | { 241 | "data": { 242 | "text/plain": [ 243 | "4.979948311566905" 244 | ] 245 | }, 246 | "execution_count": 15, 247 | "metadata": {}, 248 | "output_type": "execute_result" 249 | } 250 | ], 251 | "source": [ 252 | "# sparse is x times slower than dense\n", 253 | "cp_time / dense_cp_time" 254 | ] 255 | }, 256 | { 257 | "cell_type": "markdown", 258 | "metadata": {}, 259 | "source": [ 260 | "**Analysis:** Equivalent performance (the indexes use random seeds for construction) and the dense version is ~4x faster in this case." 
261 | ] 262 | } 263 | ], 264 | "metadata": { 265 | "anaconda-cloud": {}, 266 | "kernelspec": { 267 | "display_name": "Python 2", 268 | "language": "python", 269 | "name": "python2" 270 | }, 271 | "language_info": { 272 | "codemirror_mode": { 273 | "name": "ipython", 274 | "version": 2 275 | }, 276 | "file_extension": ".py", 277 | "mimetype": "text/x-python", 278 | "name": "python", 279 | "nbconvert_exporter": "python", 280 | "pygments_lexer": "ipython2", 281 | "version": "2.7.12" 282 | } 283 | }, 284 | "nbformat": 4, 285 | "nbformat_minor": 0 286 | } 287 | -------------------------------------------------------------------------------- /examples/enron.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "# Copyright 2016-present, Facebook, Inc.\n", 12 | "# All rights reserved.\n", 13 | "\n", 14 | "# This source code is licensed under the license found in the\n", 15 | "# LICENSE-examples file in the root directory of this source tree." 16 | ] 17 | }, 18 | { 19 | "cell_type": "markdown", 20 | "metadata": {}, 21 | "source": [ 22 | "# Evaluate pysparnn on Enron data" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": 2, 28 | "metadata": { 29 | "collapsed": true 30 | }, 31 | "outputs": [], 32 | "source": [ 33 | "import numpy as np\n", 34 | "import time\n", 35 | "import inspect\n", 36 | "\n", 37 | "from scipy.sparse import csr_matrix\n", 38 | "from sklearn.datasets import fetch_20newsgroups\n", 39 | "from sklearn.neighbors import LSHForest\n", 40 | "from sklearn.feature_extraction import DictVectorizer" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": 3, 46 | "metadata": { 47 | "collapsed": true 48 | }, 49 | "outputs": [], 50 | "source": [ 51 | "# make sure you run 'python setup.py install' first!\n", 52 | "import pysparnn.cluster_index as ci" 53 | ] 54 | }, 55 | { 56 | "cell_type": "markdown", 57 | "metadata": {}, 58 | "source": [ 59 | "## Get data" 60 | ] 61 | }, 62 | { 63 | "cell_type": "raw", 64 | "metadata": { 65 | "collapsed": false 66 | }, 67 | "source": [ 68 | "# fetch data\n", 69 | "\n", 70 | "\n", 71 | "!wget https://www.cs.cmu.edu/~./enron/enron_mail_20150507.tgz\n", 72 | "\n", 73 | "\n", 74 | "_ = !tar -xzvf enron_mail_20150507.tgz" 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": 4, 80 | "metadata": { 81 | "collapsed": false 82 | }, 83 | "outputs": [], 84 | "source": [ 85 | "# load enron data \n", 86 | "import os\n", 87 | "import sys\n", 88 | "\n", 89 | "docs = []\n", 90 | "max_docs = 100000\n", 91 | "for folder, subs, files in os.walk('maildir'):\n", 92 | " for filename in files:\n", 93 | " with open(os.path.join(folder, filename), 'r') as src:\n", 94 | " try:\n", 95 | " txt = ' '.join(src.readlines())\n", 96 | " if len(txt) > 0:\n", 97 | " docs.append(txt)\n", 98 | " except:\n", 99 | " pass\n", 100 | " if len(docs) > max_docs:\n", 101 | " break \n", 102 | " if len(docs) > max_docs:\n", 103 | " break" 104 | ] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "execution_count": 5, 109 | "metadata": { 110 | "collapsed": false 111 | }, 112 | "outputs": [ 113 | { 114 | "name": "stdout", 115 | "output_type": "stream", 116 | "text": [ 117 | "Num docs: 100001\n", 118 | "Avg doc length: 413.757442426\n", 119 | "Num unique words: 942676\n" 120 | ] 121 | } 122 | ], 123 | "source": [ 124 | "print('Num docs: 
{}'.format(len(docs)))\n", 125 | "print('Avg doc length: {}'.format(np.mean([len(x.split()) for x in docs])))\n", 126 | "words = set()\n", 127 | "for doc in docs:\n", 128 | " words.update(doc.split())\n", 129 | "print('Num unique words: {}'.format(len(words)))\n", 130 | "del words" 131 | ] 132 | }, 133 | { 134 | "cell_type": "markdown", 135 | "metadata": {}, 136 | "source": [ 137 | "## Turn documents into vectors" 138 | ] 139 | }, 140 | { 141 | "cell_type": "code", 142 | "execution_count": 6, 143 | "metadata": { 144 | "collapsed": false 145 | }, 146 | "outputs": [], 147 | "source": [ 148 | "from sklearn.neighbors import LSHForest, NearestNeighbors \n", 149 | "from sklearn.feature_extraction.text import TfidfVectorizer\n", 150 | "\n", 151 | "tv = TfidfVectorizer(decode_error='ignore')\n", 152 | "\n", 153 | "features = csr_matrix(tv.fit_transform(docs))\n", 154 | "\n", 155 | "doc_index = np.array(range(len(docs)))" 156 | ] 157 | }, 158 | { 159 | "cell_type": "code", 160 | "execution_count": 7, 161 | "metadata": { 162 | "collapsed": true 163 | }, 164 | "outputs": [], 165 | "source": [ 166 | "test_features = features[:2000]\n", 167 | "train_features = features[2000:]" 168 | ] 169 | }, 170 | { 171 | "cell_type": "markdown", 172 | "metadata": {}, 173 | "source": [ 174 | "## Create an answer key" 175 | ] 176 | }, 177 | { 178 | "cell_type": "code", 179 | "execution_count": 8, 180 | "metadata": { 181 | "collapsed": false 182 | }, 183 | "outputs": [ 184 | { 185 | "data": { 186 | "text/plain": [ 187 | "NearestNeighbors(algorithm='brute', leaf_size=30, metric='cosine',\n", 188 | " metric_params=None, n_jobs=1, n_neighbors=5, p=2, radius=1.0)" 189 | ] 190 | }, 191 | "execution_count": 8, 192 | "metadata": {}, 193 | "output_type": "execute_result" 194 | } 195 | ], 196 | "source": [ 197 | "knn = NearestNeighbors(algorithm='brute', metric='cosine')\n", 198 | " \n", 199 | "knn.fit(train_features)" 200 | ] 201 | }, 202 | { 203 | "cell_type": "code", 204 | "execution_count": 9, 205 | "metadata": { 206 | "collapsed": false 207 | }, 208 | "outputs": [ 209 | { 210 | "data": { 211 | "text/plain": [ 212 | "67.97951602935791" 213 | ] 214 | }, 215 | "execution_count": 9, 216 | "metadata": {}, 217 | "output_type": "execute_result" 218 | } 219 | ], 220 | "source": [ 221 | "t0 = time.time()\n", 222 | "# get 1 NN for each document\n", 223 | "answers = knn.kneighbors(test_features, n_neighbors=1, return_distance=False)\n", 224 | "time.time() - t0" 225 | ] 226 | }, 227 | { 228 | "cell_type": "code", 229 | "execution_count": 10, 230 | "metadata": { 231 | "collapsed": false 232 | }, 233 | "outputs": [ 234 | { 235 | "name": "stderr", 236 | "output_type": "stream", 237 | "text": [ 238 | "/Users/spencebeecher/anaconda2/lib/python2.7/site-packages/sklearn/neighbors/base.py:211: UserWarning: cannot use tree with sparse input: using brute force\n", 239 | " warnings.warn(\"cannot use tree with sparse input: \"\n" 240 | ] 241 | }, 242 | { 243 | "data": { 244 | "text/plain": [ 245 | "52.95571780204773" 246 | ] 247 | }, 248 | "execution_count": 10, 249 | "metadata": {}, 250 | "output_type": "execute_result" 251 | } 252 | ], 253 | "source": [ 254 | "bknn = NearestNeighbors(algorithm='ball_tree')\n", 255 | " \n", 256 | "bknn.fit(train_features)\n", 257 | "\n", 258 | "t0 = time.time()\n", 259 | "# get 1 NN for each document\n", 260 | "_ = bknn.kneighbors(test_features, n_neighbors=1, return_distance=False)\n", 261 | "time.time() - t0" 262 | ] 263 | }, 264 | { 265 | "cell_type": "markdown", 266 | "metadata": {}, 267 | "source": [ 268 | 
"## Build models to compare" 269 | ] 270 | }, 271 | { 272 | "cell_type": "code", 273 | "execution_count": 11, 274 | "metadata": { 275 | "collapsed": false 276 | }, 277 | "outputs": [ 278 | { 279 | "data": { 280 | "text/plain": [ 281 | "114.92523193359375" 282 | ] 283 | }, 284 | "execution_count": 11, 285 | "metadata": {}, 286 | "output_type": "execute_result" 287 | } 288 | ], 289 | "source": [ 290 | "t0 = time.time()\n", 291 | "snn = ci.MultiClusterIndex(train_features, doc_index, num_indexes=2)\n", 292 | "time.time() - t0" 293 | ] 294 | }, 295 | { 296 | "cell_type": "code", 297 | "execution_count": 12, 298 | "metadata": { 299 | "collapsed": false 300 | }, 301 | "outputs": [ 302 | { 303 | "data": { 304 | "text/plain": [ 305 | "18.300570964813232" 306 | ] 307 | }, 308 | "execution_count": 12, 309 | "metadata": {}, 310 | "output_type": "execute_result" 311 | } 312 | ], 313 | "source": [ 314 | "t0 = time.time()\n", 315 | "lshf = LSHForest(n_neighbors=1)\n", 316 | " \n", 317 | "lshf.fit(train_features)\n", 318 | "time.time() - t0" 319 | ] 320 | }, 321 | { 322 | "cell_type": "markdown", 323 | "metadata": {}, 324 | "source": [ 325 | "## Compare results" 326 | ] 327 | }, 328 | { 329 | "cell_type": "code", 330 | "execution_count": 13, 331 | "metadata": { 332 | "collapsed": false 333 | }, 334 | "outputs": [], 335 | "source": [ 336 | "import pysparnn_utils\n", 337 | "import time " 338 | ] 339 | }, 340 | { 341 | "cell_type": "code", 342 | "execution_count": 14, 343 | "metadata": { 344 | "collapsed": false 345 | }, 346 | "outputs": [ 347 | { 348 | "name": "stdout", 349 | "output_type": "stream", 350 | "text": [ 351 | "Recall: 0.965\n" 352 | ] 353 | }, 354 | { 355 | "data": { 356 | "text/plain": [ 357 | "23.034273862838745" 358 | ] 359 | }, 360 | "execution_count": 14, 361 | "metadata": {}, 362 | "output_type": "execute_result" 363 | } 364 | ], 365 | "source": [ 366 | "t0 = time.time()\n", 367 | "\n", 368 | "results = snn.search(test_features, return_distance=False)\n", 369 | "\n", 370 | "print('Recall: {}'.format(pysparnn_utils.recall(answers, results).mean()))\n", 371 | "\n", 372 | "snn_time = time.time() - t0\n", 373 | "snn_time" 374 | ] 375 | }, 376 | { 377 | "cell_type": "code", 378 | "execution_count": 15, 379 | "metadata": { 380 | "collapsed": false 381 | }, 382 | "outputs": [ 383 | { 384 | "name": "stdout", 385 | "output_type": "stream", 386 | "text": [ 387 | "Recall: 0.9245\n" 388 | ] 389 | }, 390 | { 391 | "data": { 392 | "text/plain": [ 393 | "11.743115901947021" 394 | ] 395 | }, 396 | "execution_count": 15, 397 | "metadata": {}, 398 | "output_type": "execute_result" 399 | } 400 | ], 401 | "source": [ 402 | "# only search one index instead of 2\n", 403 | "t0 = time.time()\n", 404 | "\n", 405 | "results = snn.search(test_features, return_distance=False, num_indexes=1)\n", 406 | "\n", 407 | "print('Recall: {}'.format(pysparnn_utils.recall(answers, results).mean()))\n", 408 | "\n", 409 | "time.time() - t0" 410 | ] 411 | }, 412 | { 413 | "cell_type": "code", 414 | "execution_count": 16, 415 | "metadata": { 416 | "collapsed": false 417 | }, 418 | "outputs": [ 419 | { 420 | "name": "stdout", 421 | "output_type": "stream", 422 | "text": [ 423 | "Recall: 0.7185\n" 424 | ] 425 | }, 426 | { 427 | "data": { 428 | "text/plain": [ 429 | "77.58664608001709" 430 | ] 431 | }, 432 | "execution_count": 16, 433 | "metadata": {}, 434 | "output_type": "execute_result" 435 | } 436 | ], 437 | "source": [ 438 | "t0 = time.time()\n", 439 | "\n", 440 | "results = lshf.kneighbors(test_features, 
return_distance=False)\n", 441 | "\n", 442 | "print('Recall: {}'.format(pysparnn_utils.recall(answers, results).mean()))\n", 443 | "lsh_time = time.time() - t0\n", 444 | "lsh_time" 445 | ] 446 | }, 447 | { 448 | "cell_type": "code", 449 | "execution_count": 17, 450 | "metadata": { 451 | "collapsed": false 452 | }, 453 | "outputs": [ 454 | { 455 | "data": { 456 | "text/plain": [ 457 | "3.3683130860568533" 458 | ] 459 | }, 460 | "execution_count": 17, 461 | "metadata": {}, 462 | "output_type": "execute_result" 463 | } 464 | ], 465 | "source": [ 466 | "# LSH is x times slower than snn\n", 467 | "lsh_time / snn_time" 468 | ] 469 | }, 470 | { 471 | "cell_type": "markdown", 472 | "metadata": {}, 473 | "source": [ 474 | "# Track Pysparnn vs Bruteforce as a function of index size" 475 | ] 476 | }, 477 | { 478 | "cell_type": "code", 479 | "execution_count": 18, 480 | "metadata": { 481 | "collapsed": false 482 | }, 483 | "outputs": [ 484 | { 485 | "name": "stderr", 486 | "output_type": "stream", 487 | "text": [ 488 | "/Users/spencebeecher/anaconda2/lib/python2.7/site-packages/pysparnn/matrix_distance.py:117: VisibleDeprecationWarning: boolean index did not match indexed array along dimension 0; dimension is 100001 but corresponding boolean dimension is 1000\n", 489 | " records = self.records_data[index]\n" 490 | ] 491 | } 492 | ], 493 | "source": [ 494 | "snn_results = []\n", 495 | "\n", 496 | "for n in np.linspace(1000, 80000, 5):\n", 497 | " feats = train_features[:n]\n", 498 | " \n", 499 | " ########## brute force ############\n", 500 | " bf = NearestNeighbors(algorithm='brute', metric='cosine')\n", 501 | "\n", 502 | " bf.fit(feats)\n", 503 | "\n", 504 | " # get 1 NN for each document\n", 505 | " t0 = time.time()\n", 506 | " \n", 507 | " answers = bf.kneighbors(test_features, n_neighbors=1, return_distance=False)\n", 508 | " \n", 509 | " bf_time = time.time() - t0\n", 510 | " \n", 511 | " \n", 512 | " ########## snn ############\n", 513 | " snn = ci.MultiClusterIndex(feats, doc_index, num_indexes=2)\n", 514 | " # only search one index instead of 2\n", 515 | " t0 = time.time()\n", 516 | "\n", 517 | " results = snn.search(test_features, return_distance=False, num_indexes=1)\n", 518 | " \n", 519 | " snn_time = time.time() - t0\n", 520 | " snn_recall = pysparnn_utils.recall(answers, results).mean()\n", 521 | " \n", 522 | " # results\n", 523 | " snn_results.append({\n", 524 | " 'n': n,\n", 525 | " 'snn_recall': snn_recall , \n", 526 | " 'snn_time': snn_time,\n", 527 | " 'bf_recall': 1.0,\n", 528 | " 'bf_time': bf_time\n", 529 | " })" 530 | ] 531 | }, 532 | { 533 | "cell_type": "code", 534 | "execution_count": 19, 535 | "metadata": { 536 | "collapsed": false 537 | }, 538 | "outputs": [], 539 | "source": [ 540 | "import pandas as pd" 541 | ] 542 | }, 543 | { 544 | "cell_type": "code", 545 | "execution_count": 20, 546 | "metadata": { 547 | "collapsed": true 548 | }, 549 | "outputs": [], 550 | "source": [ 551 | "%matplotlib inline" 552 | ] 553 | }, 554 | { 555 | "cell_type": "code", 556 | "execution_count": 21, 557 | "metadata": { 558 | "collapsed": false 559 | }, 560 | "outputs": [ 561 | { 562 | "data": { 563 | "text/plain": [ 564 | "" 565 | ] 566 | }, 567 | "execution_count": 21, 568 | "metadata": {}, 569 | "output_type": "execute_result" 570 | }, 571 | { 572 | "data": { 573 | "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAYEAAAEZCAYAAABxbJkKAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAH2xJREFUeJzt3XuYFPWd7/H3d7gMEEGGOwPMMAF5IEhUBMIaN5mIi3gl\nz+EkAlGUuFmy0ehjyFkQzgqcPSdGE5+w2Xh54GHV1RgMulnBowu6MB5NRO4XuYs63EEychHDMMx8\nzx9VM/QMc+mZ6Z7umfq8nqefrq7+VdW3e6A+Xb9fVbe5OyIiEk0ZqS5ARERSRyEgIhJhCgERkQhT\nCIiIRJhCQEQkwhQCIiIRphCQtGFm3zSz/TGPPzaz62po+5SZzW666urPzOaY2fO1PF/j64tj3c+Y\nWZGZra76vonUh0JAqmVmn5jZF2Z2yswOhTudDk2w6bguXHH3v3f3/5PsYhIg4RfimNm1wBgg291H\n12c7ZnaXmb2T6Jqk+VIISE0cuNndOwFXAlcBD6W2pMQys1aprqGB+gOfuPvZBixrJCGYpPlSCEht\nDMDdjwHLCcIgeMKsrZn90swKzeywmT1pZpkxz483s41mdtLM9pjZ2HD+3Wa2PTzC+NDM/q5BhQVH\nJv8rnO5qZsvM7DMz+7OZvV3LcmVm9iMz2w3sDuddY2ZrwuXfN7O/imlfqcumahePmU0Jj5o+NbP/\nWU0XT6aZPRe+3q1mNryamnqa2Rkzy4qZN9zMjlUNKjP7PrAQ+KtwnXOqWd+M8L09ZWYfmNm3w/mD\ngafCZU+bWVHN77BEhUJA6mRmfYEbgT0xsx8FBgJfDe/7AA+H7UcBzwHT3f1S4BvAJ+FyR4GbwiOM\nqcCvzOxKGmc6sB/oCvQAZtXRfjwwEvhKuON9DZgfLv8r4P/G7pCr4QBm9hXgCWAS0Bu4FMiu0vZW\n4MXwuWVh+8orcz8KrAK+GzP7DuB37l5ape2/Aj8E3nP3Tu4+r5r6PgS+Hr7H84AXzKynu++MWbaj\nu3ep5TVKRCgEpDb/YWangH0EO++5Mc/9AHjQ3U+6+xng5wQ7Q4DvA4vcfSWAux92993h9Bvu/kk4\n/Q6wAvjrRtZZQrATznP3Unf/Yx3tfxbWXQzcDOx29xfdvczdFwM7CXbedZkALHX399z9PGEIVvGu\nuy/34Eu6nicIzer8G3AngJllELyXNQ4q18bdXwmDBXdfQhDeoxqyLmn5FAJSm/Hhp8lvAoOBbgBm\n1h3oAKwPz1ApAt4g+CQN0A/YW90KzexGM3sv7Lb5jOAIo1sj6/xFuL0VYTfIjDraH4iZzgYKqzxf\nSHBkU5dsgiMQANz9L8Cfq7Q5EjP9BdAu3MlX9SowxMxygbHACXdfF0cNFwm7qDaG3VufAUNp/Hss\nLZRCQGpTPibwDkH3zuPh/OMEO7Sh7t4lvHUOu34g2DEOuGhlZm2Bl4HHgO7unkUQHtaYIt39c3f/\nqbsPAG4DfmJm36ptkZjpQwQDrbFygIPh9BmCwCvXK2b6MNC3/IGZtedCENZLeFTye4KjgTto4FGA\nmeUAC4AfuXtW+B5v48J7rEFhqUQhIPGaD/yNmQ0LuzYWAvPDowLMrE/54C+wCJhqZt+yQLaZDQLa\nhrfj7l5mZjcSfOptFDO72czKQ+c0cB4oi3Px14HLzGyimbUys9uBIQTjBACbgIlm1trMRgD/PWbZ\nl4FbzWy0mbWhcndZjeXW8tzzwN0EXVENCgHgSwSv/biZZZjZVODymOePAn3DekUUAlKjSp8Y3f04\nwdFAeb/3TIIByNVmdoKgb39Q2HYtwaDvfOAkUADkuvvnwP3AkrALaSJBN0hcNdTiMuAtMzsN/BF4\nwt1rOkOo6usqAm4BfkpwhPNTglNjy8+c+UeCge8iYA7w25hltwM/Bl4iOKI4BRwDiuN8TVVr+RPB\nDnyDuzfo4i9330FwxLaaoCtqKPBuTJOVBEcGR8zsWEO2IS2L1fWjMma2iOA/yVF3r3ZQy8x+TdC3\newa42903JbpQkXRnZl8CTgAD3b3qOEO86/gv4LfhWUAiSRfPkcAzwA01PRke0g9w98uAacDTCapN\nJO2Z2S1m1j4MgMeBLY0IgJEEF+W9lMgaRWpTZwi4+7vAZ7U0GU9wehvu/j5wqZn1TEx5ImlvPEFX\n0AGCwfCJDVmJmT1L0KX2QHjKrUiTaJ2AdfQh5jQ5grMq+hAMQIm0aO7+A4JrJhq7nrsbX41I/Wlg\nWEQkwhJxJHCQ4OKgcn25cI51JWamc5RFRBrA3Rt1PU1N4g0Bo+bzm5cC9wIvmdlogisda+4Kmluf\n8prIKqC2S4vSSXOpVXUmXnOptbnUCc2n1rlJXLe713oj+PKrQwTnPu8jOP97GvB3MW1+Q3DO+GZg\neC3r8rIy9/373f/wB/dZs9zHjnXv0sUdGn7Lzna/7Tb3f/on9zfecD92zOtlzpw59VsghZpLraoz\n8ZpLrc2lTvfmU2uwq659X93QW51HAu4+OY4298UbOmbQt29w+/a3y5eHTz6Bdetg7drgfv16OHUq\nvnUeOgRLlwa3crm5MGIEjBwZ3F99NXTuHG+VIiLRkIgxgUYzg7y84Pad7wTzysrgww8vhMK6dbBh\nA3zxRXzrLCwMbq+8cmHewIEXQmHECLjqKujYEfLz8xP+mpKludSqOhOvudTaXOqE5lVrstR5xXBC\nN2bmjdleaSns3Fk5GDZtguLaLtKvtR4YPLhyMFx5JbRv3+ASRUQSzsySNjDcrEKgOiUl8MEHF0Jh\n3TrYsgXOn2/Y+lq1gssvvxAKI0bAV78KbdsmtGwRkbgpBOrp7NkgCGKDYdu2oIupIdq2DYIgNhiG\nDoXWadGZJiItnUIgAc6cCbqOYoNh165gULoh2rULxhTKQ2HkSBg0KDiSEBFJJIVAkpw6FQw2l4fC\n2rXw0UcNX98ll8Dw4ZWDYcCAYOxBRKShFAJNqKgoOD019ohh376Gr69z5+D01NjTVXNyFAwiEj+F\nQIodPVo5GNauhSNH6l6uJt26VT5aGDECsrMTV69Udv48/OUvwenFf/lL5Vv5vOLiYIynXTvIzKz7\nvk0bBbk0HYVAmnEPLlCLPVpYuxb+XPUnxuuhd+/KoTBiBHTvnria00lZ2cU74ZoexzuvtjYNPVOs\nNmbxhUUy7mOnFUbRoBBoBtyDi9Nig2HdOjh5suHrzMmpHApXXw1ZWYmruZx7cEZVsnbCVR+fO5f4\n1xBlqQiiqvdt2yqMkkkh0EyVlcHevZWPFjZsCM5UaqiBAy+EQu/eidlRnz2buNcs0ZWZWTkcWrcO\njlSqu2/oc41dviHrbtUq9QGnEGhBSkuDU1Njg2HTJu2Ik8ksuAq8Q4fgvvwW+7hdu6Db6OzZYHwg\n9r66eaWlqX5V0pTKwyFV4TV9ukKgRSspge3bK3+B3pYtwfyWKnZnXN0OOpHzktFVcf58EAhVw6Gp\n75Mx3iHpSCEQOcXFsHVr5WDYti15n0AzM5tmh1z+qTvV
h9ctRWlp04RRXW1a8geW9KAQEIL++82b\ng0DYuDHo00/ETrp9e8jQD41KI5SVVQ6F4uLgKKWkpPJ9dfPS/bn06PpTCIiIpERZWRAEqQqkkhKY\nP18hICISWck8O0idACIiEaYQEBGJMIWAiEiEKQRERCJMISAiEmEKARGRCFMIiIhEmEJARCTCFAIi\nIhGmEBARiTCFgIhIhCkEREQiTCEgIhJhCgERkQhTCIiIRJhCQEQkwhQCIiIRphAQEYkwhYCISITF\nFQJmNs7MdprZbjObUc3zncxsqZltMrOtZnZ3wisVEZGEq/OH5s0sA9gNjAEOAWuBie6+M6bNQ0An\nd3/IzLoBu4Ce7n6+yrr0Q/MiIvWU6h+aHwXscfdCdy8BFgPjq7RxoGM43RH4c9UAEBGR9BNPCPQB\n9sc8PhDOi/Ub4CtmdgjYDDyQmPJERCSZWidoPTcAG939OjMbALxpZl9198+rNpw7d27FdH5+Pvn5\n+QkqQUSkZSgoKKCgoKBJthXPmMBoYK67jwsfzwTc3R+NafMa8Ii7/zF8/F/ADHdfV2VdGhMQEamn\nVI8JrAUGmlmumbUFJgJLq7QpBK4HMLOewCDgo0QWKiIiiVdnd5C7l5rZfcAKgtBY5O47zGxa8LQv\nAP438KyZbQkX+wd3L0pa1SIikhB1dgcldGPqDhIRqbdUdweJiEgLpRAQEYkwhYCISIQpBEREIkwh\nICISYQoBEZEIUwiIiESYQkBEJMIUAiIiEaYQEBGJMIWAiEiEKQRERCJMISAiEmEKARGRCFMIiIhE\nmEJARCTCFAIiIhGmEBARiTCFgIhIhCkEREQiTCEgIhJhCgERkQhTCIiIRJhCQEQkwhQCIiIRphAQ\nEYkwhYCISIQpBEREIkwhICISYQoBEZEIUwiIiESYQkBEJMIUAiIiEaYQEBGJMIWAiEiExRUCZjbO\nzHaa2W4zm1FDm3wz22hmH5jZqsSWKSIiyWDuXnsDswxgNzAGOASsBSa6+86YNpcCfwLGuvtBM+vm\n7serWZfXtT0REanMzHB3S8a64zkSGAXscfdCdy8BFgPjq7SZDLzi7gcBqgsAERFJP/GEQB9gf8zj\nA+G8WIOALma2yszWmtmdiSpQRESSp3UC1zMcuA74EvCemb3n7h8maP0iIpIE8YTAQSAn5nHfcF6s\nA8Bxdz8LnDWz/wdcAVwUAnPnzq2Yzs/PJz8/v34Vi4i0cAUFBRQUFDTJtuIZGG4F7CIYGD4MrAEm\nufuOmDaDgX8BxgGZwPvA7e6+vcq6NDAsIlJPyRwYrvNIwN1Lzew+YAXBGMIid99hZtOCp32Bu+80\ns+XAFqAUWFA1AEREJP3UeSSQ0I3pSEBEpN5SfYqoiIi0UAoBEZEIUwiIiESYQkBEJMIUAiIiEaYQ\nEBGJMIWAiEiEKQRERCJMISAiEmEKARGRCFMIiIhEmEJARCTCFAIiIhGmEBARiTCFgIhIhCkEREQi\nTCEgIhJhCgERkQhTCIiIRJhCQEQkwhQCIiIRphAQEYkwhYCISIQpBEREIkwhICISYQoBEZEIUwiI\niESYQkBEJMIUAiIiEaYQEBGJMIWAiEiEKQRERCJMISAiEmEKARGRCFMIiIhEmEJARCTC4goBMxtn\nZjvNbLeZzail3UgzKzGz/5a4EkVEJFnqDAEzywB+A9wADAUmmdngGtr9HFie6CJFRCQ54jkSGAXs\ncfdCdy8BFgPjq2n3Y+Bl4FgC6xMRkSSKJwT6APtjHh8I51Uws2zg2+7+FGCJK09ERJIpUQPD84HY\nsQIFgYhIM9A6jjYHgZyYx33DebFGAIvNzIBuwI1mVuLuS6uubO7cuRXT+fn55Ofn17NkEZGWraCg\ngIKCgibZlrl77Q3MWgG7gDHAYWANMMndd9TQ/hlgmbv/ezXPeV3bExGRyswMd09KD0udRwLuXmpm\n9wErCLqPFrn7DjObFjztC6oukoQ6RUQkCeo8EkjoxnQkICJSb8k8EtAVwyIiEaYQEBGJMIWAiEiE\nKQRERCJMISAiEmEKARGRCFMIiIhEmEJARCTCFAIiIhGmEBARiTCFgIhIhCkEREQiTCEgIhJhCgER\nkQhTCIiIRJhCQEQkwhQCIiIRphAQEYkwhYCISIQpBEREIkwhICISYQoBEZEIUwiIiESYQkBEJMIU\nAiIiEaYQEBGJMIWAiEiEKQRERCJMISAiEmEKARGRCFMIiIhEmEJARCTCFAIiIhGmEBARiTCFgIhI\nhCkEREQiLK4QMLNxZrbTzHab2Yxqnp9sZpvD27tmNizxpYqISKKZu9fewCwD2A2MAQ4Ba4GJ7r4z\nps1oYIe7nzSzccBcdx9dzbq8uu3179+fwsLCRr0QSZ7c3Fw++eSTVJchEllmhrtbMtbdOo42o4A9\n7l4YFrMYGA9UhIC7r45pvxroU58iCgsLqSuMJHXMkvJvT0TSQDzdQX2A/TGPD1D7Tv5vgTcaU5SI\niDSNeI4E4mZm3wKmAtfW1Gbu3LkV0/n5+eTn5yeyBBGRZq+goICCgoIm2VY8YwKjCfr4x4WPZwLu\n7o9WafdV4BVgnLvvrWFd1Y4JhP1dDXsFknT6+4ikVjLHBOLpDloLDDSzXDNrC0wEllYpMIcgAO6s\nKQBERCT91Nkd5O6lZnYfsIIgNBa5+w4zmxY87QuAfwS6AE9aMIpY4u6jklm4iIg0Xp3dQQndmLqD\n0kZeXh6LFi3iuuuuY968eXz44Yc8//zz1bbV30cktVLdHSQRoNNARaIpoWcHJUOi903N6QNtaWkp\nrVq1SnUZItKC6UggDo8++ih9+/alU6dODBkyhFWrVjFv3jxuv/127rrrLjp16sSwYcPYsGFDxTJ5\neXk8/vjjXHHFFWRlZTFp0iTOnTtX63befvtt+vXrx2OPPUbv3r35/ve/D8Brr73GVVddRVZWFtde\ney1bt26tWObAgQNMmDCBHj160L17d+6//34APvroI8aMGUO3bt3o0aMHd9xxB6dOnUrCuyMizZlC\noA67d+/miSeeYP369Zw6dYrly5fTv39/AJYtW8bkyZM5efIkt956K/fee2+lZZcsWcKKFSv4+OOP\n2bx5M88++2yd2zty5AgnTpxg3759LFiwgI0bN3LPPfewcOFCioqKmDZtGrfddhslJSWUlZVxyy23\nkJeXx759+zh48CATJ04EwN2ZNWsWR44cYceOHRw4cKDSNRoiIqAQqFOrVq04d+4cH3zwAefPnycn\nJ4e8vDwArr32Wm644QbMjDvvvJMtW7ZUWvaBBx6gZ8+edO7cmVtvvZVNmzbFtb158+bRpk0bMjMz\nWbhwIT/84Q8ZMWJExXYyMzNZvXo1a9as4fDhwzz22GO0a9eOtm3bcs011wAwYMAAxowZQ+vWrena\ntSsPPvggb7/9duLfIBFp1tI+BNwTe6uvAQMGMH/+fObOnUuPHj2YPHkyhw8fBqBXr14V7Tp06MDZ\ns2cpKyurmNe
zZ89Kz3/++ed1bq979+60adOm4nFhYSGPP/44Xbp0oUuXLmRlZXHgwAEOHTrE/v37\nyc3NJSPj4j/jsWPHmDRpEn379qVz587ccccdHD9+vP5vgIi0aGkfAulg4sSJvPPOO+zbtw+AGTMu\n+jbthKl6lk6/fv2YPXs2RUVFFBUV8dlnn/H5559z++23069fP/bt21cpeMrNmjWLjIwMtm3bxokT\nJ3jhhRd0mqeIXEQhUIfdu3ezatUqzp07R9u2bWnfvn2NZ+wkYyf7gx/8gKeffpo1a9YAcObMGV5/\n/XXOnDnDqFGj6N27NzNnzuSLL76guLiYP/3pTwCcPn2aSy65hI4dO3Lw4EF+8YtfJLw2EWn+FAJ1\nKC4uZubMmXTv3p3s7Gw+/fRTHnnkkWrbxn6KT9R591dffTULFy7kvvvuo0uXLgwaNIjnnnsOgIyM\nDJYtW8aePXvIycmhX79+/P73vwdgzpw5rF+/vmI8YsKECTXWKiLRpSuGpU76+4iklq4YFhGRpFAI\nNLFHHnmEjh070qlTp0q3m2++OdWliUgEqTtI6qS/j0hqqTtIRESSQiEgIhJhCgERkQhTCIiIRJhC\nQEQkwhQCccjLy2PlypXVPvfUU0/Rq1cvOnXqxGeffdbElV1s6tSpPPzww8CF3ycQEalJ2v+yWDo7\nf/4806dPZ82aNVx++eWpLqda+noIEalN2oeAzUvsTsznJO589yNHjlBcXMyQIUPqtZx+NlJE0oW6\ng+K0Zs0ahg4dSteuXbnnnnvYunUrgwcPBiArK4vrr7++1uUzMjJ48sknGTRoEIMGDQJg586djB07\nlq5duzJkyBCWLFlS0f7s2bNMnz6d/v37k5WVxTe+8Q2Ki4sB+O53v0vv3r3JysoiPz+f7du3J+lV\ni0hLpxCI04svvsibb77J3r172bVrF0uWLKnY+Z48eZK33nqrznW8+uqrrFmzhu3bt/PFF18wduzY\nih97Wbx4MT/60Y/YuXMnANOnT2fjxo2sXr2aoqIiHnvssYofj7npppvYu3cvx44dY/jw4Xzve99L\n3gsXkRZNIRCnH//4x2RnZ9O5c2dmz57N7373u4qvUoj3KxVmzZpF586dyczM5LXXXiMvL48pU6Zg\nZlxxxRVMmDCBJUuW4O4888wz/PrXv6ZXr16YGaNHj674xbG7776bDh060KZNGx5++GE2b97M6dOn\nk/baRaTlSvsxgUT24TdG3759K6Zzc3M5dOgQUL8fkoldR2FhIatXr6ZLly4V6yktLWXKlCkcP36c\ns2fP8uUvf/midZSVlTFr1ixefvlljh8/jplhZhw/fpyOHTs29OWJSESlfQiki/3791dMFxYWkp2d\nDdTv7JvYtv369SM/P5/ly5df1M7dad++PXv37mXYsGGVnnvxxRdZtmwZK1euJCcnh5MnT5KVlaUv\neBORBlF3UJyeeOIJDh48SFFRET/72c+YOHEi0PCflLzlllvYvXs3L7zwAufPn6ekpIR169axa9cu\nzIypU6fyk5/8hMOHD1NWVsbq1as5d+4cp0+fJjMzk6ysLM6cOcNDDz2k00BFpMEUAnEwMyZPnszY\nsWMZOHAgl112GbNnz654Lt51xLrkkktYsWIFixcvJjs7m+zsbGbOnFlxBtAvf/lLhg0bxsiRI+na\ntSszZ87E3ZkyZQo5OTn06dOHyy+/nGuuuSaxL1ZEIkW/JyB10t9HJLX0ewIiIpIUCoEEeffddy/6\n2cjyxyIi6UrdQVIn/X1EUkvdQSIikhQKARGRCEuLi8Vyc3N1rnsay83NTXUJIpIkcY0JmNk4YD7B\nkcMid3+0mja/Bm4EzgB3u/umatpUOyYgIiI1S+mYgJllAL8BbgCGApPMbHCVNjcCA9z9MmAa8HQS\nak2agoKCVJcQt+ZSq+pMvOZSa3OpE5pXrckSz5jAKGCPuxe6ewmwGBhfpc144N8A3P194FIz65nQ\nSpOoOf1DaC61qs7Eay61Npc6oXnVmizxhEAfYH/M4wPhvNraHKymjYiIpBmdHSQiEmF1Dgyb2Whg\nrruPCx/PBDx2cNjMngZWuftL4eOdwDfd/WiVdWlUWESkAZI1MBzPKaJrgYFmlgscBiYCk6q0WQrc\nC7wUhsaJqgEAyXsRIiLSMHWGgLuXmtl9wAounCK6w8ymBU/7And/3cxuMrMPCU4RnZrcskVEJBGa\n9LuDREQkvbSogWEzW2RmR81sS8y8LDNbYWa7zGy5mV0a89xDZrbHzHaY2diY+cPNbIuZ7Taz+THz\n25rZ4nCZ98wsp4F19jWzlWa2zcy2mtn96VirmWWa2ftmtjGsc0461hmzrgwz22BmS9O8zk/MbHP4\nvq5J11rN7FIzWxJud5uZfS3d6jSzQeH7uCG8P2lm96dbnTHretDMPgi389tw3amt1d1bzA24FrgS\n2BIz71HgH8LpGcDPw+mvABsJusT6Ax9y4cjofWBkOP06cEM4/ffAk+H07cDiBtbZC7gynL4E2AUM\nTtNaO4T3rYDVBNeNpF2d4fIPAi8AS9P1bx8u/xGQVWVe2tUKPAtMDadbA5emY50x9WYAh4B+6Vgn\nkB3+7duGj18C7kp1rSnbYSfrBuRSOQR2Aj3D6V7AznB6JjAjpt0bwNfCNttj5k8Engqn/xP4Wjjd\nCvg0QTX/B3B9OtcKdADWASPTsU6gL/AmkM+FEEi7OsPlPwa6VpmXVrUCnYC91cxPqzqr1DYWeCdd\n6yQIgUIgi2DHvpQ0+H/forqDatDDwzOV3P0I0COcX9MFbn0ILogrF3txXMUy7l4KnDCzLo0pzsz6\nExy9rCb4h5BWtYZdLBuBI8Cb7r42HesEfgX8D8Bj5qVjnYQ1vmlma83sb9O01jzguJk9E3a1LDCz\nDmlYZ6zbgRfD6bSr090PAY8D+8LtnnT3t1JdaxRCoCqvu0ncGnXKq5ldArwMPODun3NxbSmv1d3L\n3P0qgk/ao8xsaDV1pbROM7sZOOrBlxbWtnzK38/Q1919OHATcK+Z/TVp9p4SfFIdDjwR1nqG4JNp\nutUZLGjWBrgNWBLOSrs6zawzwVfs5BIcFXzJzL5XTW1NWmsUQuCohd9jZGa9gGPh/IMEfYfl+obz\nappfaRkzawV0cveihhRlZq0JAuB5d381nWsFcPdTQAEwLg3r/Dpwm5l9BPwOuM7MngeOpFmdALj7\n4fD+U4KuwFGk33t6ANjv7uvCx68QhEK61VnuRmC9ux8PH6djndcDH7l7Ufgp/Q/ANamutSWGgFE5\n/ZYCd4fTdwGvxsyfGI6m5wEDgTXh4dhJMxtlZgZMqbLMXeH0d4CVjajzXwn69f45XWs1s27lZyqY\nWXvgb4Ad6Vanu89y9xx3/zJB/+hKd78TWJZOdQKYWYfwCBAz+xJBP/ZW0u89PQrsN7NB4awxwLZ0\nqzPGJIIPAOXSsc59wGgzaxduYwywPeW1NmYgJt1uBP2Bh4Di8A2fSjAI
8xbBGTgrgM4x7R8iGHHf\nAYyNmX81wX/MPcA/x8zPBH4fzl8N9G9gnV8HSoFNBKP/Gwg+YXdJp1qBYWFtm4AtwOxwflrVWaXm\nb3JhYDjt6iToay//u28FZqZxrVcQfGPAJuDfCc4OSsc6OwCfAh1j5qVdneG65oTb3QI8B7RJda26\nWExEJMJaYneQiIjESSEgIhJhCgERkQhTCIiIRJhCQEQkwhQCIiIRphAQEYkwhYCISIQpBCTSzCzX\nzLaH35L5gZn9p5llproukaaiEBAJvpPlX9z9cuAkMCHF9Yg0GYWACHzs7lvD6fUEv+IkEgkKAZHg\nCwfLlRJ8l75IJCgERBr540AizZlCQCSxv+Qk0qzoq6RFRCJMRwIiIhGmEBARiTCFgIhIhCkEREQi\nTCEgIhJhCgERkQhTCIiIRJhCQEQkwv4/WW1egacM/bEAAAAASUVORK5CYII=\n", 574 | "text/plain": [ 575 | "" 576 | ] 577 | }, 578 | "metadata": {}, 579 | "output_type": "display_data" 580 | } 581 | ], 582 | "source": [ 583 | "pd.DataFrame(snn_results)[['n', 'snn_recall', 'bf_recall']].plot(x='n', ylim=(0, 1), linewidth=4,\n", 584 | " title='Recall is roughly flat')" 585 | ] 586 | }, 587 | { 588 | "cell_type": "code", 589 | "execution_count": 22, 590 | "metadata": { 591 | "collapsed": false 592 | }, 593 | "outputs": [ 594 | { 595 | "data": { 596 | "text/plain": [ 597 | "" 598 | ] 599 | }, 600 | "execution_count": 22, 601 | "metadata": {}, 602 | "output_type": "execute_result" 603 | }, 604 | { 605 | "data": { 606 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAX0AAAEoCAYAAAC0OiEVAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAIABJREFUeJzsnXmcTuX7x9/XDGMfy8gWhqRIRElKMkokhMraQrK1SNu3\nVPoaUepX+baSJVtRopBSUsxEKQlFVEqW7OuMnTH374/7DI/xnGee2Z5l5nq/XvOac859nXN/znad\n+7nOfa5bjDEoiqIo+YOIYAtQFEVRAoc6fUVRlHyEOn1FUZR8hDp9RVGUfIQ6fUVRlHyEOn1FUZR8\nRJ52+iLylIiMDbaOvIyIFBaRuSJyQESm58D2monIlpzQlg0NF4nIShFJEpEHg6hjtIg8E6z6g0H6\n8y8ia0TkuiBpSRWRC4JRd25SINgCsoOIHATSPjQoBhwHTjnL+hljRgRLWz7iduA8oLTJuY8+cvTj\nERFJBS40xmzwc5UngIXGmAY5qSOzGGPuC2b9QeT0+TfGXBoKOvISYd3SN8aUMMZEG2OigU1AG49l\nHwRbX1YQkcgg15/ZayIW+DMrDj8n9tVPvZnVFgv8lgU5QT9/GRHq+kIBj2MkQRWSWxhj8sQf8A9w\nfbplQ4D3nOlYIBXoCWwG9gL9gIbAL8A+4M106/cC1jq2XwBVXepO23YfYKvz95hHuQCDgL+A3cCH\nQKl06/bCPrgSXOr4D7AN+Be4x1nnAqdsEdDLw7YHsNhjvhbwlbMf64BOHmUTgVHA58BB4HFgByAe\nNrcCq7xoisf+ujoBJDu6BBgMbHS2MwmI9ndfgWbAFuAp51htALr70Hu9r/0HEp06DzkaOznL2wIr\ngf3AEuBSZ/k3QApw1LG/EIgGpgC7sNfZM+nqWgKMBPYAzznL+zjXTjKwBqjvLK8IzHS29TcwwMc1\nPdFje2nH5VFgJ/Ya6+lhWxh41Tnu+4FvgUJuxxxoDHzn2K4Emnlsq6eH9r+Avh5lMcBcZ729QKJH\nmeu+AVcCPwFJwHbgFZd9bgZs9nZfY+/n6cBkR9tq4PJM1P+9o3sr8CZQwKM8Fbgf+BP422PZBVgf\n4dc9EQ5/QReQYzvi7vSnONNpF/8oIApogb2xP3Eu5ErOzdTUsW/vXAAXYX8RPQ1851J32ranOjff\npc6Fl3axDnQuuIpAQWA0MC3dupOAIkAhL9u/yblRajs2U7FhLF9O/1tnuij2IXc31iFfhnWmtZzy\nic6N0NiZL4R1Uq08tvcJ8LDLvp8+xs58L+e4xTp1f+zlHPja12bASeBl51hdh3XYNX3odd1/Zz4V\nqO4x38A51w2dY3KXc/0UdDmeU4BZzv7EAn8A93jUdRLrMCIcPZ2wDvpyx+YCoIpT13LgGSASqIZ1\nqje6HNv0Tv+kc7wjgdbAYaCkU/42sBCo4NTT2Dl+5xxz7LW+J+0cAzc48zHOfGugmjPd1Kkn7aH1\nAvYeinB0NHGW+9w37PV/h8c12chlnzNy+keAVk59LwBL/az/cqCRY1cV+0vuoXTXyHygFM51ydkN\nK7/viVD/C7qAHNsR/5z+KaCCR/kezm71zky7EIB5ODe2Mx/hXPxVvNSddmPV9Fj2EjDOmV4LNPco\nq4htHUd46Ir1sW/vAi94zNfEf6ffGY/WmLPsHeBZZ3oiMCld+RPA+850GWe/y7toS+/0vwb6e8xf\nlMl9bebYF/ZYNh2nde2i1x+nf4HH/ChgaLpt/M6ZB/7p7Tm6jwMXe9j2xcb80+ramG5bX+KlBY91\nOultBwHvuhyL9E7/MBDhUb6TM47sCM6vFS/X5lnH3Dm/k71ovstFx6y0/QGGOvM1MrNv2F9cQ3Ae\nLBmcf19O/yuPstrAYWf6qkwe24HAx+mukWbpbDydvt/3RKj/hfWL3Cyyy2P6KPbG8Zwv7kzHAq+L\nyKvOvGBjw+djW3HpMdjQSxqbsC3+tG3Ncl4opm3rJFDew95z3fRUwrZiPLftb7wxFmgsIvs86o7E\ntl7TSL8/7wNrRaQI9qHxrTFmJ/5RydHnqbUA/u8rwH5jzLF026jkQ29miQXuFpEBzrxgW8WVvNiW\nxerfnE7P+T70VMGGF7zVe366cxGBDcX4w15jTKrH/BHs9VoW24L39aLa85jHAp1FpJ2HjgLYXwqI\nSGvgv5z5lVsE+NWxfRkb1vtKRAy2YfOSH/vWCxgG/C4iG7APs8/93G9PdnhMHwEKO+91qvqqX0Rq\nYkNwDZ39KQD8nG7bvq7L7NwTIUV+dPr+sgUYbvx/ISzYm/1PZ74qNgYP1mH0MsYsPWclkVhn0vjY\n9nZn22nEprM/jP3JnEYFj+kt2DhuKx/bP6tuY8w2EVkK3AbciW0Z+8s2R5+n1pPYh2vaPvjaV4DS\nI
lLEGHPUma+Kjd961Yvv/ffGFuB541/vrj1Y/bHYXwM401t96NkC1HCpd4Mx5mI/6s0Me4BjTp2r\nXWw8NW7B/jrrl95IRKKwv3jvBOYYY1JFZBZOI8MYcwj73udxEbkEWCQiy8hg34wxfwPdnTpuA2aK\nSBmPc5xdMjq2o4EVQBdjzBERGYi9vs+S6bbxbN4TIUVY997JApl5G/8O8LRzYSMiJUXk9gzWeVZE\niohIHexLzQ+d5WOAF0SkqrOt80Tklkzo+gjoKSK1RaQothXmySrgVqfuC4F7Pco+Ay4SkTtFpICI\nFBSRhiKSkeN5D/uT9lJs/NJfPgAeEZFqIlIceB740KOF6s85EGCoo7Up0AZ7DNzwtf9gW4ee/a3H\nAf1FpBGAiBQTkZtFpFj6DTu6PwKeF5HizkP6EezxcWM81ile7my/hohUAZYBB0XkCef7hkgRqSMi\nDX1sK0OMjTlMBEaKSEURiRCRxiJS0DFJf8zfB9qJSEvHtrDTP74S9n1XFLDHcfitgZZpK4pIGxFJ\ne6AdxL70Ts1o30TkDhEp66yXhHWwnr9askravmV0bEsAyY7DrwVkpTtsVu+JkCIvOf2MWo/ebFzn\njTGzgReBD0XkAPbn7U0ZbD8R+/JoAfB/xphvnOWvA3OwP4mTsC+1Gvmr3RjzJfAa9uf3n9geJp78\nD9sa3YG9+d/3WPcQ9qbtim2Fb3P2q1AG+zIL26L9JF2oJSMmYG+Ob7EhjiPAQ56748c2tmNf1m5z\nttXPGLPex/qu++8QD0wRkX0icrsx5mds75q3nHDAn9jYvJvGh5z92ODs1/vGmIlu4o0xM7EPu2ki\nkow9lmWcB0hboD42Vr0L+wCKdttWBnjqfBzbyv8J26vmRc7c3+l/yf2L7ajwNPal/iZn/QjnenkI\nmOEcm67YazeNmsDXzjcy3wFvG2MS/di3m4DfnOPxP2yL+3gm99G13I/6HwfucOofw5kGma960i/L\n6j0RUojzYkLJBk7rbwO290dOtF78qTOzHxxlpY607noLc6sORQkn8sI9kZda+sEmT33I4cRdU8P5\n4laUnCSv3BP6IjfnCPRPplyrT0QWYbvD3ZlbdShKOJGX7gkN7yiKouQjNLyjKIqSjwh5px8qaW7D\nFRGpIiLJIpKn3jmEAyIyRER8de3M7vYXiUgvZ7q7iHzpUZYn0wIr2SfknT5n0tyWNMa8FWwx4YYx\nZouxWUcNnO0olICQpfipiPQQkcV+V2LMNGOMZ5dijdsGAedBPyVjy+ARDk4/ljya5tYb4aY53PSG\nEWlpP7Kzfr4ju9djvrieg538x9cfOZTmNt02hwAzsB9nJGNz2tR1yh4HZqazfwP4nzPdE/vBUbLz\nv1u6et8EDmATrF3vsY2euKeqbYb9hPwJ7EdJk8k4je5E4C3s17bJwFI8skim0x+L/fIxAhjuHM8j\nznpvuKxzNzZF725smuT0Sa9mYD+aOoDNqRKF/XhsKzZ/yf84k7EyAejoTDdxtLR25q8HVjrTNRzb\nA865/cBFWyGn7j3YD7h+BM5zyipiPyTai/3gqne68/6Rs24yNp12TWxSrp3YD5RaeNhHY7+s3eac\ni2Gc6fiwEWjgTN/h7FNtZ74X9uOdtDp9pQJ+0rke0tIvd3CW18Je8yexX73uczkWiziTGK4HZ6fT\n9kwWdi02Fch1Htv3mmrbSx09cbl2vdi6nkPgRqeu/dj7JMFD+xCcFOjpr9ms3D/O8vSps+v60O0t\nrfI12K98066xqz3svV5n2Oyfx52/gzjXdqj9BV1AhgKzmebWy/aGOCelIzbx2GPYD6sisTlbDnIm\n/3sk1iHUd+pLwn4QBTaBWO109T7krNPZufDTcub7SlXbzFn3BWzSr0JknEZ3ItYhX+Hs5/s4qZq9\n7G8sNstihLfj6cX+EucYXI3t0vuyc7yuT3f82jnzhYHnsF8Zxzh/3+FkscRmZXzdmX4KWA+M8ChL\ne6BOA55ypqOAa1z09XVuuELY1mwDoLhT9i3WoRTEppDeBcR56D6CTakdgXXEGxxNkUBvbO6WtHpm\nYfOrFMYmNPsB6OOUTQYecabHOPvUz6NsYLo6z0kF7JTfhpOpEZuO+ZDHfA88MoVmdG+kt+dMLvib\nsA+0K5zl3lJt78JJte2lDtdr14ut13PoXBPJnLnnHsZmUvV0+p6ZWtNfs5m9f3ymzvaiOy2tckln\n/dLY8TW6O9dKV2e+tJ/X2RRv9YTKX9AFZCgwm2luvWxvCPC9x7xgW3NpecE/B+51ptsCazxuln3O\nhVs43TZ7AP+mW/YjTv5wLxo8U9U2wybLKuhR3gyXNLrO9ERgrEdZa2CtS12ZdfrPAlM95otwrtNP\nSLfOX5yda7wljgPFtuZXOdNfYFvC3zvzCZxp3U7G5js6P4Pzdw9eWm5AZezNX9Rj2QvABA/d8z3K\n2mIdUVrrvbhznKKxD/RjeDQasDd+2nXWC5jtTK915tPGR9jIGYc0BJdUwC77tpIzD9OccPqDsM6u\ntsdyn6m2/bgfT1+7Xsq8nkOs0/0+3bIt+On0s3D/+Eyd7WV7Z6VVxvbF/yGdzffYB6U/11lIO/1w\niOl7kpU0t944bWPsmfqXM2l1p3DmA4w7cBJrGWOOAF2wiZq2ix0M3DNpmWfWxTRdlcCmqhWRpSKy\nV0T2Y510WQ/b3caYk+nWd0ujm0b6FLOeZdmhEmcfn6PYn7GepD/GlTj3nKQdz6XYhG/lsK2iKUAV\nEYnB5h9KS737H+xDfZmIrBaRe1z0TcG2yj4UkX9F5EUnDlsJGwY5kk6H57WRPo32Huf8p80L9jhW\nxbbitjv5evZjnVnaOUsEmopIBUfzR8C1TjqOaGPMKo963FIBIyJ3Oz3T9jt11OHs6yK7DAQ+Msas\n81gWi5Nq22PfuuOSmdSPa9cTt3N41jXl4Hd67CzcP7HAY+n2sTLeU2en4ZlWOX16cDhzLflznYU0\n4eb0PdPcphGL7zS33jidptjpyliZM2mQZwP1xGbKbIsdpcpu2JgFxpiW2BvkD2CsxzbTn/SqwDY5\nk6r2/7Cx59LYFq/nizZ/NOcUGdW1HXs8ABCbPzwmg21s5dxzsg1OPzR+xjqgNcaYFOyD4FHgL2PM\nPsdulzGmrzHmfKA/MMpbl0NjzCljzDBjTB1s3LUdtgW2DSiTLlNmVc59GPvDFmzrMcYYU8YYU9oY\nU8oYU8/R8Df2ITEA27o+hHXufbG/QjJEbMbVscD9zvZLYzsspF0X2b0mDDZk1FFEPBPepaXaLuOx\nb9HGmAe8aPTn2j1Tofs53I49F554pgpPnxq7YiY1pD9WaamzPfexuDFmujfdXraxDTvylidp11JG\n11kg7+UsEVZO32Qtza03rhCRDk4L8RHsDf6DU8dx7BB/04Afjc1IiIiUE5FbxKY2PomNv3q2xMuJ\nyACx6Ys7YV+WfU4GqWoDhOcNspOz0wynZyY27W5aat54P7
b/ITBYRMo66XOf5exz8i3wILaFDDas\n4zmPiNwuImkPzgPYY3tO8joRiRORS53W8iHsuTjlnKfvgREiUkhE6mFTLGe6n7wxZgf2Ref/RKSE\nWC4Qkes8zBIz2icX0s5FMWf/9ohNb3wPZwbdAXueKsuZ9MiZJS1seQPwkIj0d5a7pdqu5WUbmbp2\nfZzDz4FL0u45sbnsPQfVWQVc53xTUhIblsqSBge/U2e7MA+oKSJdHb1dsKG5uX5cZzuBaqH8XUw4\nOP30T85Mpbl1YQ42VLMfG8LpaIw55VE+GajL2aNLRWBbp1uxvziu4+yc3D9ie4Pswfb0uM0Yc8Bk\nnKrWX7LTgvBc93Wgk/NT+bVzDI1Zi23BTsc6jWTsiypfaXCHY3tB/YrtFbMcm1o4jURs2OTbdPOe\nDvJK4EexqW9nY4et3OilrgrYB1MStmW8iDOplLsB1R3dH2Pj1It86E6P53G6G+ts1mLf5czg7BCI\n2z5lNAqWAXBCLq9iGxs7sKEdz18JC7H7t0NEdqXfiBe9bvVswb68flJEehn3VNtR52wg89eu13No\njNmL/dXxEvb+qIF92Z9Wz9fY6+1XbGroudnQgMk4dfY5q6Rbfx/2V/7jjt7HgTbGmP2Oia/rbAb2\ngbtXRDxHuwsZ/M6947SslmNfWN4iIqWxJyoW+/KqszEmKbeE5hQiMgQ7vufdPmyqYLuXVXAuuoy2\n2QP78ve6jGzDDad1dADbayl9nFNRsoTYBGbvGWMmBFtLfiMzLf2B2FZPGoOAr40dnmwhtutb2OM8\n3B7DjvaUocPPi4hIW7GjUBXDtkZ/VYevKHkDv5y+iFQGbsZ+rJJGe2wYBOd/h5yVFniceH0Stpvh\nkCDLCSbtsT9d/8X+FO8aXDlKHiTkX3jmVfwK74jIDGyMtiTwmBPe2e+8SU+z2WeMKZN7UhVFUZTs\nkmFLX0TaADudvse+3kjrk1tRFCXE8WfkrCbALSJyM/brzBJi08XuEJHyxpidzkcqXnsYiIg+DBRF\nUbKAMSbHu35m2NI3xjxtjKlqjLmAM5+i34XtVtXTMeuBj25Uwfrc2NffkCFDgq4hL+kMJ63hojOc\ntKrOzP2lpqbyweoPiHkpxn4J4+0vl8jOGLkvAh+Jzc2+CZvTQ1EURfHBrsO7uP/z+/l43ceuNuWK\nlWOX9+BJtsnUx1nGmERjzC3O9D5jTAtjzMXGmJbGmAO5olBRFCWPMOO3GdQZVcenw+9Spwu/3Z+l\nIUT8Ijst/bAmLi4u2BL8Ilx0QvhoDRedED5aVadvdh/ezQPzHmDG2hmuNmWLlmV0m9HcfsntuarF\n7y9ys1yBiMntOhRFUUKVT9Z9Qv/P+rP7yG5Xm9svuZ23b36bcsXKnV4mIphceJEbtJZ+tWrV2LRJ\nP/IMNrGxsWzcuDHYMhQlz7H3yF4GfDGAD9Z84GoTUySGUW1G0blO4F6JBq2l7zzFcrVuJWP0PChK\nzjPn9zn0+6wfOw/vdLXpWKsjo9uMpnzx8l7L81xLX1EUJa+x7+g+HvriIaaunupqU7pwad66+S26\nXdqNYGRgVqevKIqSA8z9Yy59P+vLjkM7XG1uufgW3mnzDhVLVHS1yW3U6SuKomSD/Uf38/D8h5ny\nyxRXm1KFS/Fm6ze5o+4dQWnde6JOX1EUJYvMWz+PPnP7sO3gNlebNjXbMLbdWCqV8DVEb+AIh5Gz\n8h0jRoygb9++wZahKIoLSceS6DWnF22mtXF1+CULlWRi+4nM7TY3ZBw+aO+doJOYmMidd97Jli1b\nglK/ngdFyRzz/5pP77m9+Tf5X1ebmy68iXHtxlE5unKW68k3vXdyK9wVqn7NGBP0GJ+iKBmTfDyZ\nx+Y/xviV411togtF879W/+Oe+veE7H2t4R0XXnrpJSpXrkx0dDS1a9dm0aJFDB06lC5dutCjRw+i\no6OpW7cuK1asOL1O9erVefXVV7nssssoXbo03bp148SJE651HDlyhJtvvplt27ZRokQJoqOj2bFj\nB0OHDuWuu+4CYNOmTURERDBp0iSqVq1KTEwMY8aMYfny5Vx22WWUKVOGAQMGnLXdCRMmcMkllxAT\nE0Pr1q3ZvHlz7hwkRcknLPh7AZeOutSnw29ZoyVr7ltDrwa9Qtbhgzp9r/z555+8/fbb/PzzzyQn\nJzN//nyqVasGwNy5c+nevTtJSUm0a9eOBx544Kx1Z8yYwVdffcU///zDL7/8wqRJk1zrKVq0KF98\n8QWVKlXi4MGDJCcnU6FCBYBzLpply5bx119/MX36dB5++GFeeOEFFi5cyJo1a/joo49YvHgxAHPm\nzOHFF19k9uzZ7N69m6ZNm9KtW7ecOziKko84ePwg/T/rT8v3W7Il2XsItnhUcca2HcuXd3xJlZJV\nAqww86jT90JkZCQnTpxgzZo1pKSkULVqVapXrw7AtddeS6tWrRAR7rrrLn799dez1h04cCDly5en\nVKlStGvXjlWrVmVbj4jw3//+l6ioKFq0aEGxYsXo1q0bMTExVKpUiaZNm7Jy5UoAxowZw1NPPcVF\nF11EREQEgwYNYtWqVUF7Z6Ao4crCfxZSd3Rdxvw8xtXmhuo3sOa+NfS5ok9It+49UafvhRo1avDa\na68RHx9PuXLl6N69O9u3bwc43RIH21I/duwYqampp5eVL1/+rPJDhw7liKZy5c4kYipSpMhZ9RQp\nUuR0PZs2bWLgwIGUKVOGMmXKEBMTg4iwdevWHNGhKHmdQycO8cDnD3DDlBvYlOQ9P1ixgsUY3WY0\nC+5aQGyp2AArzB4h5/SNyZ2/zNK1a1cWL158Oh7+5JNP5vCeWnK6dVClShXGjBnDvn372LdvH/v3\n7+fQoUM0btw4R+tRlLxI4sZE6o2ux6jlo1xtmldrzur7VtO/Yf+wad17EnJOPxT4888/WbRoESdO\nnCAqKooiRYoQGRnp1Ta73R3Lly/P3r17SU5OdrXJTB39+/fnhRdeYO3atQAkJSUxc+bMbGlUlLzO\n4ROHeeiLh4ibHMc/B/7xalO0YFHeav0WX9/9NdVLVw+wwpxDnb4Xjh8/zqBBgzjvvPOoVKkSu3fv\nZsSIEV5tPZ/0WXnqX3zxxXTr1o0LLriAMmXKsGPHuXk70m/X13yHDh0YNGgQXbt2pVSpUtSrV48v\nv/wy07oUJb+weNNiLnvnMt5c9qarzXWx1/Fr/195oNEDREh4u039OCufo+dBya8cOXmEZ755htd/\nfB2D93ugSIEivNjiRR5s9GDAnX3QPs4SkULAt0CUYz/TGDNURIYAfeD06L1PG2O0SakoSsjz3ebv\nuGfOPazft97VpkmVJkxsP5GaMTUDqCz3yfDRZYw5DjQ3xjQA6gOtRaSRUzzSGHO586cO34URI0ac\n/vjK869NmzbBlqYo+YqjJ4/y+FeP03RiU1eHX7hAYUa2HEliz8Q85/Ahk+EdESmKbfXfB9wMHDLG\nvJrBOhreCWH0P
fN/hFt6Q/nV7obJlYj4YhTta7Wnf39o0UJb9Yqi\n5F/COw3DvVdDRKprcbHf+zLwkpd44OtSVKoUEEWKoij5kmw5fRG5CXgNiADeNca85NXQxeEXO3Yh\nz9QbxxOD44h0i+8riqIoOUaWAygiEgG8BbQC6gDdRKSWX+uaSO6r+yS7n/uVp7oFx+EnJCQEvtIs\nEC46IXy0hotOCB+tqjN8yE7UvBGw3hizyRhzEvgQaJ/RSvXLN2B5v2WMuvVFihQM3igl4XLyw0Un\nhI/WcNEJ4aNVdYYP2XH65wOeCXL+dZZ5pXCBwrx4w4ss6/Mjl1e8PBvVKoqiKFklIC9ym8U2Y1y7\ncdSMqRmI6hRFURQXstxlU0QaA/HGmJuc+UGASf8yV0Ryt0+ooihKHiWk+umLSCTwB3ADsB1YBnQz\nxvgY8URRFEUJJlkO7xhjTonIg8BXnOmyqQ5fURQlhMn1L3IVRVGU0CGsEx2IyLsislNEfvVYVlpE\nvhKRP0RkvoiU9Ch7SkTWi8g6EWnpsfxyEflVRP4Ukdc8lkeJyIfOOktFpGoWdVYWkYUi8puIrBaR\nh0JYayER+VFEVjpah4SqVmdbESKyQkQ+DVWdIrJRRH5xjumyUNXpbKukiMxw6v5NRK4KNa0icpFz\nLFc4/5NE5KFQ0+ls5xERWePUMdXZbnB1GmPC9g+4FqgP/Oqx7CXgCWf6SeBFZ/oSYCU2pFUN+Isz\nv3R+BK50pucBrZzp+4BRznQX4MMs6qwA1Hemi2PfhdQKRa3O+kWd/5HAD9hvMkJV6yPA+8CnIXz+\nNwCl0y0LOZ3O+pOAe5zpAkDJUNXqbCMC2AZUCTWdQCXn3Ec589OBHsHWGTSHnVN/QCxnO/3fgfLO\ndAXgd2d6EPCkh90XwFWOzVqP5V2B0c70l8BVznQksDuHNM8GWoS6VqAosBy4MhS1ApWBBUAcZ5x+\nKOr8B4hJtywUdUYDf3tZHnJaPbbdElgcijqxTn8TUBrryD8lBO77sA7vuFDOGLMTwBizAyjnLE//\nMdlWZ9n52A/L0vD8yOz0OsaYU8ABEcnWoIwiUg376+QH7IkPOa1OyGQlsANYYIz5KUS1/g/4D2A8\nloWiTgMsEJGfRKR3COusDuwRkYlO6GSsiBQNUa1pdAHSBtYOKZ3GmG3Aq8Bmp84kY8zXwdaZF51+\nekzGJn6TrT6zIlIcmAkMNMYc4lxtIaHVGJNqjGmAbUk3EpE6hJhWEWkD7DTGrMpg/VA4pk2MMZcD\nNwMPiEhTQux4OhQALgfedvQexrY+Q1ErIlIQuAWY4SwKKZ0iUgqbmiYW2+ovJiJ3eNEVUJ150env\nFJHyACJSAdjlLN+KjfulUdlZ5rb8rHXEfpcQbYzZlxVRIlIA6/DfM8bMCWWtaRhjkoEE4KYQ1NoE\nuEVENgAfANeLyHvAjhDTiTFmu/N/Nza014jQO55gW5BbjDHLnfmPsQ+BUNQK0Br42Riz5//bu3vW\nqIIwDMP3W0j8QBCx9otgpVgIIlpYRP0DImih0X/hR+FvCPaCrYWiNiKSWhQlJJoUgoKCjSCkTBHG\nYmbxaCUuZMd97wsCZ4fdsw8b8uzZmZOz7XZvOc8Bn0opP9pR+GPg9KRzTkPpB7+/uz0FrrfteeDJ\nYPxyW+0+BMwCr9vHq/WIOBkRAVz74zHzbfsSsDhGzvvUebmFnrNGxL7R2QQRsQM4D6z1lrWUcruU\nsr+Ucpg6x7lYSrkKPOspZ0TsbJ/wiIhd1DnoFTp7PQHalMPXiDjShuaADz1mba5Q3/BHesv5BTgV\nEdvb/ueA1YnnHGcRZdI/1Lm8b8BGe4FvUBdNXlLPkHkB7Bnc/xZ1RXwNuDAYP0H9Q/wILAzGZ4CH\nbfwVcPAfc54BNoEl6ur8O+rR894Osx5r+ZaAZeBOG+8u62B/Z/m1kNtVTuo8+ej3vgLc7DHnYF/H\ngTct8yPq2TvdZaWeZPAd2D0Y6zHn3facy8ADYNukc/rPWZKUyDRM70iS/pKlL0mJWPqSlIilL0mJ\nWPqSlIilL0mJWPqSlIilL0mJWPpKJyIORMRqu4rk+4h4HhEzk84lbQVLX1nNAvdKKUeBdeDihPNI\nW8LSV1afSykrbfst9ZuKpKln6SurjcH2JvVa8tLUs/SV1VhfiCP9ryx9ZeXlZZWSl1aWpEQ80pek\nRCx9SUrE0pekRCx9SUrE0pekRCx9SUrE0pekRCx9SUrkJ1b9/UB1A1asAAAAAElFTkSuQmCC\n", 607 | "text/plain": [ 608 | "" 609 | ] 610 | }, 611 | "metadata": {}, 612 | "output_type": "display_data" 613 | } 614 | ], 615 | "source": [ 616 | "df = pd.DataFrame(snn_results)[['n', 'snn_time', 'bf_time']]\n", 617 | "df.plot(x='n', linewidth=5,\n", 618 | " title='Time per query for bruteforce increases linearly\\nfor pysparnn it grows somewhat like a square root')" 619 | ] 620 | } 621 | ], 622 | "metadata": { 623 | "anaconda-cloud": {}, 624 | "kernelspec": { 625 | "display_name": "Python 2", 626 | "language": "python", 627 | "name": "python2" 628 | }, 629 | "language_info": { 630 | "codemirror_mode": { 631 | "name": "ipython", 632 | "version": 2 633 | }, 634 | "file_extension": ".py", 635 | "mimetype": "text/x-python", 636 | "name": "python", 637 | "nbconvert_exporter": "python", 638 | "pygments_lexer": "ipython2", 639 | "version": "2.7.12" 640 | } 641 | }, 642 | "nbformat": 4, 643 | "nbformat_minor": 0 644 | } 645 | -------------------------------------------------------------------------------- /examples/pysparnn_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright 2016-present, Facebook, Inc. 2 | # All rights reserved. 
3 | 4 | # This source code is licensed under the license found in the 5 | # LICENSE-examples file in the root directory of this source tree. 6 | import numpy as np 7 | 8 | 9 | # code that will measure query time and recall 10 | def recall(query, full_set): 11 | ret = [] 12 | for r_items, t_items in zip(query, full_set): 13 | result = 0.0 14 | for r in np.unique(r_items): 15 | result += 1 if r in t_items else 0 16 | if len(t_items) > 0: 17 | ret.append(result / len(t_items)) 18 | else: 19 | ret.append(0.0) 20 | return np.array(ret) 21 | -------------------------------------------------------------------------------- /examples/sparse_search_comparison.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "# Copyright 2016-present, Facebook, Inc.\n", 12 | "# All rights reserved.\n", 13 | "\n", 14 | "# This source code is licensed under the license found in the\n", 15 | "# LICENSE-examples file in the root directory of this source tree." 16 | ] 17 | }, 18 | { 19 | "cell_type": "markdown", 20 | "metadata": {}, 21 | "source": [ 22 | "# Evaluate pysparnn on 20 Newsgroups data" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": 1, 28 | "metadata": { 29 | "collapsed": true 30 | }, 31 | "outputs": [], 32 | "source": [ 33 | "import numpy as np\n", 34 | "import time\n", 35 | "from scipy.sparse import csr_matrix\n", 36 | "from sklearn.datasets import fetch_20newsgroups" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": 2, 42 | "metadata": { 43 | "collapsed": false 44 | }, 45 | "outputs": [], 46 | "source": [ 47 | "# make sure you run 'python setup.py install' first!\n", 48 | "import pysparnn.cluster_index as ci" 49 | ] 50 | }, 51 | { 52 | "cell_type": "markdown", 53 | "metadata": { 54 | "collapsed": false 55 | }, 56 | "source": [ 57 | "# Get data" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": 3, 63 | "metadata": { 64 | "collapsed": false 65 | }, 66 | "outputs": [], 67 | "source": [ 68 | "dataset = fetch_20newsgroups(subset='all', shuffle=True)" 69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": 4, 74 | "metadata": { 75 | "collapsed": false 76 | }, 77 | "outputs": [ 78 | { 79 | "name": "stdout", 80 | "output_type": "stream", 81 | "text": [ 82 | "Num docs: 18846\n", 83 | "Avg doc length: 283.6560012734798\n", 84 | "Num unique words: 386410\n" 85 | ] 86 | } 87 | ], 88 | "source": [ 89 | "print('Num docs: {}'.format(len(dataset.data)))\n", 90 | "print('Avg doc length: {}'.format(np.mean([len(x.split()) for x in dataset.data])))\n", 91 | "words = set()\n", 92 | "for doc in dataset.data:\n", 93 | " words.update(doc.split())\n", 94 | "print('Num unique words: {}'.format(len(words)))" 95 | ] 96 | }, 97 | { 98 | "cell_type": "markdown", 99 | "metadata": {}, 100 | "source": [ 101 | "## Turn documents into vectors" 102 | ] 103 | }, 104 | { 105 | "cell_type": "code", 106 | "execution_count": 5, 107 | "metadata": { 108 | "collapsed": false 109 | }, 110 | "outputs": [], 111 | "source": [ 112 | "from sklearn.neighbors import LSHForest, NearestNeighbors \n", 113 | "from sklearn.feature_extraction.text import TfidfVectorizer\n", 114 | "\n", 115 | "tv = TfidfVectorizer(decode_error='ignore')\n", 116 | "\n", 117 | "features = csr_matrix(tv.fit_transform(dataset.data))\n", 118 | "\n", 119 | "doc_index = 
np.array(range(len(dataset.data)))" 120 | ] 121 | }, 122 | { 123 | "cell_type": "code", 124 | "execution_count": 6, 125 | "metadata": { 126 | "collapsed": false 127 | }, 128 | "outputs": [], 129 | "source": [ 130 | "test_features = features[:200]\n", 131 | "train_features = features[200:]" 132 | ] 133 | }, 134 | { 135 | "cell_type": "code", 136 | "execution_count": 7, 137 | "metadata": { 138 | "collapsed": true 139 | }, 140 | "outputs": [], 141 | "source": [ 142 | "from sklearn.cluster import KMeans" 143 | ] 144 | }, 145 | { 146 | "cell_type": "code", 147 | "execution_count": 10, 148 | "metadata": { 149 | "collapsed": false 150 | }, 151 | "outputs": [ 152 | { 153 | "data": { 154 | "text/plain": [ 155 | "(18646, 173762)" 156 | ] 157 | }, 158 | "execution_count": 10, 159 | "metadata": {}, 160 | "output_type": "execute_result" 161 | } 162 | ], 163 | "source": [ 164 | "train_features.shape" 165 | ] 166 | }, 167 | { 168 | "cell_type": "code", 169 | "execution_count": 15, 170 | "metadata": { 171 | "collapsed": true 172 | }, 173 | "outputs": [], 174 | "source": [ 175 | "from sklearn.cluster import KMeans\n", 176 | "from sklearn import datasets\n", 177 | "\n", 178 | "np.random.seed(5)\n", 179 | "\n", 180 | "centers = [[1, 1], [-1, -1], [1, -1]]\n", 181 | "iris = datasets.load_iris()\n", 182 | "X = iris.data\n", 183 | "y = iris.target" 184 | ] 185 | }, 186 | { 187 | "cell_type": "code", 188 | "execution_count": 22, 189 | "metadata": { 190 | "collapsed": false 191 | }, 192 | "outputs": [], 193 | "source": [ 194 | "k_means = KMeans(n_clusters=int(np.sqrt(train_features.shape[0])), max_iter=20)" 195 | ] 196 | }, 197 | { 198 | "cell_type": "code", 199 | "execution_count": 23, 200 | "metadata": { 201 | "collapsed": true 202 | }, 203 | "outputs": [], 204 | "source": [ 205 | "import time" 206 | ] 207 | }, 208 | { 209 | "cell_type": "code", 210 | "execution_count": 24, 211 | "metadata": { 212 | "collapsed": false 213 | }, 214 | "outputs": [ 215 | { 216 | "data": { 217 | "text/plain": [ 218 | "1223.7469282150269" 219 | ] 220 | }, 221 | "execution_count": 24, 222 | "metadata": {}, 223 | "output_type": "execute_result" 224 | } 225 | ], 226 | "source": [ 227 | "t0 = time.time()\n", 228 | "\n", 229 | "k_means.fit(train_features)\n", 230 | "\n", 231 | "time.time() - t0" 232 | ] 233 | }, 234 | { 235 | "cell_type": "code", 236 | "execution_count": 25, 237 | "metadata": { 238 | "collapsed": false 239 | }, 240 | "outputs": [ 241 | { 242 | "data": { 243 | "text/plain": [ 244 | "16.347729921340942" 245 | ] 246 | }, 247 | "execution_count": 25, 248 | "metadata": {}, 249 | "output_type": "execute_result" 250 | } 251 | ], 252 | "source": [ 253 | "t0 = time.time()\n", 254 | "snn = ci.MultiClusterIndex(train_features, doc_index, num_indexes=2)\n", 255 | "time.time() - t0" 256 | ] 257 | }, 258 | { 259 | "cell_type": "markdown", 260 | "metadata": {}, 261 | "source": [ 262 | "## Create an answer key" 263 | ] 264 | }, 265 | { 266 | "cell_type": "code", 267 | "execution_count": 8, 268 | "metadata": { 269 | "collapsed": false 270 | }, 271 | "outputs": [], 272 | "source": [ 273 | "knn = NearestNeighbors()\n", 274 | " \n", 275 | "knn.fit(train_features)\n", 276 | "\n", 277 | "# get top 3 nearest neighbors for each document\n", 278 | "answers = knn.kneighbors(test_features, 3, return_distance=False)" 279 | ] 280 | }, 281 | { 282 | "cell_type": "markdown", 283 | "metadata": {}, 284 | "source": [ 285 | "## Build models to compare" 286 | ] 287 | }, 288 | { 289 | "cell_type": "code", 290 | "execution_count": 9, 291 | "metadata": { 292 
| "collapsed": false, 293 | "scrolled": false 294 | }, 295 | "outputs": [], 296 | "source": [ 297 | "snn = ci.MultiClusterIndex(train_features, doc_index, num_indexes=2)" 298 | ] 299 | }, 300 | { 301 | "cell_type": "code", 302 | "execution_count": 10, 303 | "metadata": { 304 | "collapsed": false 305 | }, 306 | "outputs": [ 307 | { 308 | "data": { 309 | "text/plain": [ 310 | "LSHForest(min_hash_match=4, n_candidates=50, n_estimators=10, n_neighbors=5,\n", 311 | " radius=1.0, radius_cutoff_ratio=0.9, random_state=None)" 312 | ] 313 | }, 314 | "execution_count": 10, 315 | "metadata": {}, 316 | "output_type": "execute_result" 317 | } 318 | ], 319 | "source": [ 320 | "lshf = LSHForest()\n", 321 | " \n", 322 | "lshf.fit(train_features)" 323 | ] 324 | }, 325 | { 326 | "cell_type": "markdown", 327 | "metadata": {}, 328 | "source": [ 329 | "## Compare results" 330 | ] 331 | }, 332 | { 333 | "cell_type": "code", 334 | "execution_count": 11, 335 | "metadata": { 336 | "collapsed": false 337 | }, 338 | "outputs": [], 339 | "source": [ 340 | "import pysparnn_utils" 341 | ] 342 | }, 343 | { 344 | "cell_type": "code", 345 | "execution_count": 12, 346 | "metadata": { 347 | "collapsed": false 348 | }, 349 | "outputs": [ 350 | { 351 | "name": "stdout", 352 | "output_type": "stream", 353 | "text": [ 354 | "Percent of time snn returns a top 3 result: 0.66\n" 355 | ] 356 | } 357 | ], 358 | "source": [ 359 | "t0 = time.time()\n", 360 | "\n", 361 | "results = snn.search(test_features, return_distance=False, num_indexes=1)\n", 362 | "\n", 363 | "print('Percent of time snn returns a top 3 result: {}'.format(pysparnn_utils.recall(answers, results).mean()))\n", 364 | "\n", 365 | "snn_time = time.time() - t0" 366 | ] 367 | }, 368 | { 369 | "cell_type": "code", 370 | "execution_count": 13, 371 | "metadata": { 372 | "collapsed": false 373 | }, 374 | "outputs": [ 375 | { 376 | "name": "stdout", 377 | "output_type": "stream", 378 | "text": [ 379 | "Percent of time lsh returns a top 3 result: 0.143\n" 380 | ] 381 | } 382 | ], 383 | "source": [ 384 | "t0 = time.time()\n", 385 | "\n", 386 | "results = lshf.kneighbors(test_features, return_distance=False)\n", 387 | "\n", 388 | "print('Percent of time lsh returns a top 3 result: {}'.format(pysparnn_utils.recall(answers, results).mean()))\n", 389 | "\n", 390 | "lsh_time = time.time() - t0" 391 | ] 392 | }, 393 | { 394 | "cell_type": "code", 395 | "execution_count": 14, 396 | "metadata": { 397 | "collapsed": false 398 | }, 399 | "outputs": [ 400 | { 401 | "data": { 402 | "text/plain": [ 403 | "5.112146987324278" 404 | ] 405 | }, 406 | "execution_count": 14, 407 | "metadata": {}, 408 | "output_type": "execute_result" 409 | } 410 | ], 411 | "source": [ 412 | "# LSH is x times slower than snn\n", 413 | "lsh_time / snn_time" 414 | ] 415 | } 416 | ], 417 | "metadata": { 418 | "anaconda-cloud": {}, 419 | "kernelspec": { 420 | "display_name": "Python 3", 421 | "language": "python", 422 | "name": "python3" 423 | }, 424 | "language_info": { 425 | "codemirror_mode": { 426 | "name": "ipython", 427 | "version": 3 428 | }, 429 | "file_extension": ".py", 430 | "mimetype": "text/x-python", 431 | "name": "python", 432 | "nbconvert_exporter": "python", 433 | "pygments_lexer": "ipython3", 434 | "version": "3.6.0" 435 | } 436 | }, 437 | "nbformat": 4, 438 | "nbformat_minor": 0 439 | } 440 | -------------------------------------------------------------------------------- /pysparnn/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 
2016-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the BSD-style license found in the 5 | # LICENSE file in the root directory of this source tree. An additional grant 6 | # of patent rights can be found in the PATENTS file in the same directory. 7 | -------------------------------------------------------------------------------- /pysparnn/cluster_index.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2016-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the BSD-style license found in the 5 | # LICENSE file in the root directory of this source tree. An additional grant 6 | # of patent rights can be found in the PATENTS file in the same directory. 7 | """Defines a cluster pruing search structure to do K-NN Queries""" 8 | 9 | from __future__ import absolute_import, division, print_function, unicode_literals 10 | 11 | import collections as _collections 12 | import random as _random 13 | 14 | import numpy as _np 15 | 16 | import pysparnn.matrix_distance 17 | 18 | 19 | def _k_best(tuple_list, k): 20 | """For a list of tuples [(distance, value), ...] - Get the k-best tuples by 21 | distance. 22 | Args: 23 | tuple_list: List of tuples. (distance, value) 24 | k: Number of tuples to return. 25 | """ 26 | tuple_lst = sorted(tuple_list, key=lambda x: x[0], 27 | reverse=False)[:k] 28 | 29 | return tuple_lst 30 | 31 | 32 | def _filter_unique(tuple_list): 33 | """For a list of tuples [(distance, value), ...] - filter out duplicate 34 | values. 35 | Args: 36 | tuple_list: List of tuples. (distance, value) 37 | """ 38 | 39 | added = set() 40 | ret = [] 41 | for distance, value in tuple_list: 42 | if not value in added: 43 | ret.append((distance, value)) 44 | added.add(value) 45 | return ret 46 | 47 | 48 | def _filter_distance(results, return_distance): 49 | """For a list of tuples [(distance, value), ...] - optionally filter out 50 | the distance elements. 51 | Args: 52 | tuple_list: List of tuples. (distance, value) 53 | return_distance: boolean to determine if distances should be returned. 54 | """ 55 | if return_distance: 56 | return results 57 | else: 58 | return list([x for y, x in results]) 59 | 60 | 61 | class ClusterIndex(object): 62 | """Search structure which gives speedup at slight loss of recall. 63 | 64 | Uses cluster pruning structure as defined in: 65 | http://nlp.stanford.edu/IR-book/html/htmledition/cluster-pruning-1.html 66 | 67 | tldr - searching for a document in an index of K documents is naievely 68 | O(K). However you can create a tree structure where the first level 69 | is O(sqrt(K)) and each of the leaves are also O(sqrt(K)). 70 | 71 | You randomly pick sqrt(K) items to be in the top level. Then for 72 | the K doccuments you assign it to the closest neighbor in the top 73 | level. 74 | 75 | This breaks up one O(K) search into O(2 * sqrt(K)) searches which 76 | is much much faster when K is big. 77 | 78 | This generalizes to h levels. The runtime becomes: 79 | O(h * h_root(K)) 80 | """ 81 | 82 | def __init__(self, features, records_data, 83 | distance_type=pysparnn.matrix_distance.CosineDistance, 84 | matrix_size=None, 85 | parent=None): 86 | """Create a search index composed of recursively defined 87 | matricies. Does recursive KNN search. See class docstring for a 88 | description of the method. 
89 | 90 | Args: 91 | features: A csr_matrix with rows that represent records 92 | (corresponding to the elements in records_data) and columns 93 | that describe a point in space for each row. 94 | records_data: Data to return when a doc is matched. Index of 95 | corresponds to records_features. 96 | distance_type: Class that defines the distance measure to use. 97 | matrix_size: Ideal size for matrix multiplication. This controls 98 | the depth of the tree. Defaults to 2 levels (approx). Highly 99 | reccomended that the default value is used. 100 | """ 101 | 102 | self.is_terminal = False 103 | self.parent = parent 104 | self.distance_type = distance_type 105 | self.desired_matrix_size = matrix_size 106 | features = distance_type.features_to_matrix(features) 107 | num_records = features.shape[0] 108 | 109 | if matrix_size is None: 110 | matrix_size = max(int(_np.sqrt(num_records)), 1000) 111 | else: 112 | matrix_size = int(matrix_size) 113 | 114 | self.matrix_size = matrix_size 115 | 116 | num_levels = _np.log(num_records) / _np.log(self.matrix_size) 117 | 118 | if num_levels <= 1.4: 119 | self.is_terminal = True 120 | self.root = distance_type(features, records_data) 121 | else: 122 | self.is_terminal = False 123 | records_data = _np.array(records_data) 124 | 125 | records_index = list(_np.arange(features.shape[0])) 126 | clusters_size = min(self.matrix_size, num_records) 127 | clusters_selection = _random.sample(records_index, clusters_size) 128 | clusters_selection = features[clusters_selection] 129 | 130 | item_to_clusters = _collections.defaultdict(list) 131 | 132 | root = distance_type(clusters_selection, 133 | list(_np.arange(clusters_selection.shape[0]))) 134 | 135 | root.remove_near_duplicates() 136 | root = distance_type(root.matrix, 137 | list(_np.arange(root.matrix.shape[0]))) 138 | 139 | rng_step = self.matrix_size 140 | for rng in range(0, features.shape[0], rng_step): 141 | max_rng = min(rng + rng_step, features.shape[0]) 142 | records_rng = features[rng:max_rng] 143 | for i, clstrs in enumerate(root.nearest_search(records_rng)): 144 | _random.shuffle(clstrs) 145 | for _, cluster in _k_best(clstrs, k=1): 146 | item_to_clusters[cluster].append(i + rng) 147 | 148 | clusters = [] 149 | cluster_keeps = [] 150 | for k, clust_sel in enumerate(clusters_selection): 151 | clustr = item_to_clusters[k] 152 | if len(clustr) > 0: 153 | index = ClusterIndex(self.distance_type.vstack(features[clustr]), 154 | records_data[clustr], 155 | distance_type=distance_type, 156 | matrix_size=self.matrix_size, 157 | parent=self) 158 | 159 | clusters.append(index) 160 | cluster_keeps.append(clust_sel) 161 | 162 | cluster_keeps = self.distance_type.vstack(cluster_keeps) 163 | clusters = _np.array(clusters) 164 | 165 | self.root = distance_type(cluster_keeps, clusters) 166 | 167 | def insert(self, feature, record): 168 | """Insert a single record into the index. 
169 | 170 | Args: 171 | feature: feature vector 172 | record: record to return as the result of a search 173 | """ 174 | feature = self.distance_type.features_to_matrix(feature) 175 | nearest = self 176 | while not nearest.is_terminal: 177 | nearest = nearest.root.nearest_search(feature) 178 | _, nearest = nearest[0][0] 179 | 180 | cluster_index = nearest 181 | parent_index = cluster_index.parent 182 | while parent_index and cluster_index.matrix_size * 2 < \ 183 | len(cluster_index.root.get_records()): 184 | cluster_index = parent_index 185 | parent_index = cluster_index.parent 186 | 187 | cluster_index._reindex(feature, record) 188 | 189 | def _get_child_data(self): 190 | """Get all of the features and corresponding records represented in the 191 | full tree structure. 192 | 193 | Returns: 194 | A tuple of (list(features), list(records)). 195 | """ 196 | 197 | if self.is_terminal: 198 | return [self.root.get_feature_matrix()], [self.root.get_records()] 199 | else: 200 | result_features = [] 201 | result_records = [] 202 | 203 | for c in self.root.get_records(): 204 | features, records = c._get_child_data() 205 | 206 | result_features.extend(features) 207 | result_records.extend(records) 208 | 209 | return result_features, result_records 210 | 211 | def _reindex(self, feature=None, record=None): 212 | """Rebuild the search index. Optionally add a record. This is used 213 | when inserting records to the index. 214 | 215 | Args: 216 | feature: feature vector 217 | record: record to return as the result of a search 218 | """ 219 | 220 | features, records = self._get_child_data() 221 | 222 | flat_rec = [] 223 | for x in records: 224 | flat_rec.extend(x) 225 | 226 | if feature is not None and record is not None: 227 | features.append(feature) 228 | flat_rec.append(record) 229 | 230 | self.__init__(self.distance_type.vstack(features), flat_rec, self.distance_type, 231 | self.desired_matrix_size, self.parent) 232 | 233 | def _search(self, features, k=1, k_clusters=1): 234 | """Find the closest item(s) for each feature_list in. 235 | 236 | Args: 237 | features: A matrix with rows that represent records 238 | (corresponding to the elements in records_data) and columns 239 | that describe a point in space for each row. 240 | k: Return the k closest results. 241 | k_clusters: number of branches (clusters) to search at each level. 242 | This increases recall at the cost of some speed. 243 | 244 | Returns: 245 | For each element in features_list, return the k-nearest items 246 | and their distance score 247 | [[(score1_1, item1_1), ..., (score1_k, item1_k)], 248 | [(score2_1, item2_1), ..., (score2_k, item2_k)], ...] 
249 | """ 250 | if self.is_terminal: 251 | nearest = self.root.nearest_search(features) 252 | return [r[:k] for r in nearest] 253 | else: 254 | ret = [] 255 | nearest = self.root.nearest_search(features) 256 | 257 | for search_i, nearest_clusters in enumerate(nearest): 258 | curr_ret = [] 259 | 260 | for cluster_i, distance_cluster in enumerate(nearest_clusters): 261 | distance, cluster = distance_cluster 262 | cluster_items = cluster.search(features[search_i], k=k, 263 | k_clusters=k_clusters) 264 | 265 | for elements in cluster_items: 266 | if len(elements) > 0: 267 | curr_ret.extend(elements) 268 | 269 | # if we have k elements and we have searched at least 270 | # k_clusters then we are done 271 | if len(curr_ret) >= k and cluster_i + 1 >= k_clusters: 272 | break 273 | 274 | ret.append(_k_best(curr_ret, k)) 275 | return ret 276 | 277 | def search(self, features, k=1, k_clusters=1, 278 | return_distance=True): 279 | """Find the closest item(s) for each feature_list in the index. 280 | 281 | Args: 282 | features: A matrix with rows that represent records 283 | (corresponding to the elements in records_data) and columns 284 | that describe a point in space for each row. 285 | k: Return the k closest results. 286 | k_clusters: number of branches (clusters) to search at each level. 287 | This increases recall at the cost of some speed. 288 | 289 | Returns: 290 | For each element in features_list, return the k-nearest items 291 | and (optionally) their distance score 292 | [[(score1_1, item1_1), ..., (score1_k, item1_k)], 293 | [(score2_1, item2_1), ..., (score2_k, item2_k)], ...] 294 | 295 | Note: if return_distance == False then the scores are omitted 296 | [[item1_1, ..., item1_k], 297 | [item2_1, ..., item2_k], ...] 298 | """ 299 | 300 | # search no more than 1k records at once 301 | # helps keap the matrix multiplies small 302 | batch_size = 1000 303 | results = [] 304 | rng_step = batch_size 305 | features = self.distance_type.features_to_matrix(features) 306 | for rng in range(0, features.shape[0], rng_step): 307 | max_rng = min(rng + rng_step, features.shape[0]) 308 | records_rng = features[rng:max_rng] 309 | 310 | results.extend(self._search(features=records_rng, 311 | k=k, 312 | k_clusters=k_clusters)) 313 | 314 | return [_filter_distance(res, return_distance) for res in results] 315 | 316 | def _print_structure(self, tabs=''): 317 | """Pretty print the tree index structure's matrix sizes""" 318 | print(tabs + str(self.root.matrix.shape[0])) 319 | if not self.is_terminal: 320 | for index in self.root.records_data: 321 | index._print_structure(tabs + ' ') 322 | 323 | def _max_depth(self): 324 | """Yield the max depth of the tree index""" 325 | if not self.is_terminal: 326 | max_dep = 0 327 | for index in self.root.records_data: 328 | max_dep = max(max_dep, index._max_depth()) 329 | return 1 + max_dep 330 | else: 331 | return 1 332 | 333 | def _matrix_sizes(self, ret=None): 334 | """Return all of the matrix sizes within the index""" 335 | if ret is None: 336 | ret = [] 337 | ret.append(len(self.root.records_data)) 338 | if not self.is_terminal: 339 | for index in self.root.records_data: 340 | ret.extend(index._matrix_sizes()) 341 | return ret 342 | 343 | 344 | class MultiClusterIndex(object): 345 | """Search structure which provides query speedup at the loss of recall. 346 | 347 | There are two components to this. 
348 | 349 | = Cluster Indexes = 350 | Uses cluster pruning index structure as defined in: 351 | http://nlp.stanford.edu/IR-book/html/htmledition/cluster-pruning-1.html 352 | 353 | Refer to ClusterIndex documentation. 354 | 355 | = Multiple Indexes = 356 | The MultiClusterIndex creates multiple ClusterIndexes. This method 357 | gives better recall at the cost of allocating more memory. The 358 | ClusterIndexes are created by randomly picking representative clusters. 359 | The randomization tends to do a pretty good job but it is not perfect. 360 | Elements can be assigned to clusters that are far from an optimal match. 361 | Creating more Indexes (random cluster allocations) increases the chances 362 | of finding a good match. 363 | 364 | There are three perameters that impact recall. Will discuss them all 365 | here: 366 | 1) MuitiClusterIndex(matrix_size) 367 | This impacts the tree structure (see cluster index documentation). 368 | Has a good default value. By increasing this value your index will 369 | behave increasingly like brute force search and you will loose query 370 | efficiency. If matrix_size is greater than your number of records 371 | you get brute force search. 372 | 2) MuitiClusterIndex.search(k_clusters) 373 | Number of clusters to check when looking for records. This increases 374 | recall at the cost of query speed. Can be specified dynamically. 375 | 3) MuitiClusterIndex(num_indexes) 376 | Number of indexes to generate. This increases recall at the cost of 377 | query speed. It also increases memory usage. It can only be 378 | specified at index construction time. 379 | 380 | Compared to (2) this argument gives better recall and has comparable 381 | speed. This statement assumes default (automatic) matrix_size is 382 | used. 383 | Scenario 1: 384 | 385 | (a) num_indexes=2, k_clusters=1 386 | (b) num_indexes=1, k_clusters=2 387 | 388 | (a) will have better recall but consume 2x the memory. (a) will be 389 | slightly slower than (b). 390 | 391 | Scenario 2: 392 | 393 | (a) num_indexes=2, k_clusters=1, matrix_size >> records 394 | (b) num_indexes=1, k_clusters=2, matrix_size >> records 395 | 396 | This means that each index does a brute force search. (a) and (b) 397 | will have the same recall. (a) will be 2x slower than (b). (a) will 398 | consume 2x the memory of (b). 399 | 400 | Scenario 1 will be much faster than Scenario 2 for large data. 401 | Scenario 2 will have better recall than Scenario 1. 402 | """ 403 | 404 | def __init__(self, features, records_data, 405 | distance_type=pysparnn.matrix_distance.CosineDistance, 406 | matrix_size=None, num_indexes=2): 407 | """Create a search index composed of multtiple ClusterIndexes. See 408 | class docstring for a description of the method. 409 | 410 | Args: 411 | features: A matrix with rows that represent records 412 | (corresponding to the elements in records_data) and columns 413 | that describe a point in space for each row. 414 | records_data: Data to return when a doc is matched. Index of 415 | corresponds to records_features. 416 | distance_type: Class that defines the distance measure to use. 417 | matrix_size: Ideal size for matrix multiplication. This controls 418 | the depth of the tree. Defaults to 2 levels (approx). Highly 419 | reccomended that the default value is used. 420 | num_indexes: Number of ClusterIndexes to construct. Improves recall 421 | at the cost of memory. 
422 | """ 423 | 424 | self.indexes = [] 425 | for _ in range(num_indexes): 426 | self.indexes.append((ClusterIndex(features, records_data, 427 | distance_type, matrix_size))) 428 | 429 | def insert(self, feature, record): 430 | """Insert a single record into the index. 431 | 432 | Args: 433 | feature: feature vector 434 | record: record to return as the result of a search 435 | """ 436 | for ind in self.indexes: 437 | ind.insert(feature, record) 438 | 439 | def search(self, features, k=1, k_clusters=1, 440 | return_distance=True, num_indexes=None): 441 | """Find the closest item(s) for each feature_list in the index. 442 | 443 | Args: 444 | features: A matrix with rows that represent records 445 | (corresponding to the elements in records_data) and columns 446 | that describe a point in space for each row. 447 | k: Return the k closest results. 448 | k_clusters: number of branches (clusters) to search at each level 449 | within each index. This increases recall at the cost of some 450 | speed. 451 | 452 | num_indexes: number of indexes to search. This increases recall at 453 | the cost of some speed. Can not be larger than the number of 454 | num_indexes that was specified in the constructor. Defaults to 455 | searching all indexes. 456 | 457 | Returns: 458 | For each element in features_list, return the k-nearest items 459 | and (optionally) their distance score 460 | [[(score1_1, item1_1), ..., (score1_k, item1_k)], 461 | [(score2_1, item2_1), ..., (score2_k, item2_k)], ...] 462 | 463 | Note: if return_distance == False then the scores are omitted 464 | [[item1_1, ..., item1_k], 465 | [item2_1, ..., item2_k], ...] 466 | """ 467 | results = [] 468 | if num_indexes is None: 469 | num_indexes = len(self.indexes) 470 | for ind in self.indexes[:num_indexes]: 471 | results.append(ind.search(features, k, k_clusters, True)) 472 | ret = [] 473 | for r in _np.hstack(results): 474 | ret.append( 475 | _filter_distance( 476 | _k_best(_filter_unique(r), k), 477 | return_distance 478 | ) 479 | ) 480 | 481 | return ret 482 | -------------------------------------------------------------------------------- /pysparnn/matrix_distance.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2016-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the BSD-style license found in the 5 | # LICENSE file in the root directory of this source tree. An additional grant 6 | # of patent rights can be found in the PATENTS file in the same directory. 7 | """Defines a distance search structure""" 8 | 9 | from __future__ import absolute_import, division, print_function, unicode_literals 10 | 11 | import abc as _abc 12 | 13 | import numpy as _np 14 | import scipy.sparse as _sparse 15 | import scipy.spatial.distance as _spatial_distance 16 | 17 | 18 | class MatrixMetricSearch(object): 19 | """A matrix representation out of features.""" 20 | __metaclass__ = _abc.ABCMeta 21 | 22 | def __init__(self, features, records_data): 23 | """ 24 | Args: 25 | features: A matrix with rows that represent records 26 | (corresponding to the elements in records_data) and columns 27 | that describe a point in space for each row. 28 | records_data: Data to return when a doc is matched. Index of 29 | corresponds to features. 
30 | """ 31 | self.matrix = features 32 | self.records_data = _np.array(records_data) 33 | 34 | def get_feature_matrix(self): 35 | return self.matrix 36 | 37 | def get_records(self): 38 | return self.records_data 39 | 40 | @staticmethod 41 | @_abc.abstractmethod 42 | def features_to_matrix(features): 43 | """ 44 | Args: 45 | val: A list of features to be formatted. 46 | Returns: 47 | The transformed matrix. 48 | """ 49 | return 50 | 51 | @staticmethod 52 | @_abc.abstractmethod 53 | def vstack(matrix_list): 54 | """ 55 | Args: 56 | val: A list of features to be formatted. 57 | Returns: 58 | The transformed matrix. 59 | """ 60 | return 61 | 62 | @_abc.abstractmethod 63 | def _transform_value(self, val): 64 | """ 65 | Args: 66 | val: A numeric value to be (potentially transformed). 67 | Returns: 68 | The transformed numeric value. 69 | """ 70 | return 71 | 72 | @_abc.abstractmethod 73 | def _distance(self, a_matrix): 74 | """ 75 | Args: 76 | a_matrix: A matrix with rows that represent records 77 | to search against. 78 | records_data: Data to return when a doc is matched. Index of 79 | corresponds to features. 80 | Returns: 81 | A dense array representing distance. 82 | """ 83 | return 84 | 85 | def nearest_search(self, features): 86 | """Find the closest item(s) for each set of features in features_list. 87 | 88 | Args: 89 | features: A matrix with rows that represent records 90 | (corresponding to the elements in records_data) and columns 91 | that describe a point in space for each row. 92 | 93 | Returns: 94 | For each element in features_list, return the k-nearest items 95 | and their distance scores 96 | [[(score1_1, item1_1), ..., (score1_k, item1_k)], 97 | [(score2_1, item2_1), ..., (score2_k, item2_k)], ...] 98 | """ 99 | 100 | dist_matrix = self._distance(features) 101 | 102 | ret = [] 103 | for i in range(dist_matrix.shape[0]): 104 | # replacing the for loop by matrix ops could speed things up 105 | 106 | scores = dist_matrix[i] 107 | records = self.records_data 108 | 109 | arg_index = _np.argsort(scores) 110 | 111 | curr_ret = list(zip(scores[arg_index], records[arg_index])) 112 | 113 | ret.append(curr_ret) 114 | 115 | return ret 116 | 117 | def remove_near_duplicates(self): 118 | """If there are 2 or more records with 0 distance from eachother - 119 | keep only one. 120 | """ 121 | 122 | dist_matrix = self._distance(self.matrix) 123 | 124 | keeps = [] 125 | dupes = set() 126 | for row_index in range(dist_matrix.shape[0]): 127 | max_dist = dist_matrix[row_index].max() 128 | for col_index in range(dist_matrix.shape[0]): 129 | if row_index < col_index: 130 | if dist_matrix[row_index, col_index] / max_dist <= 0.001: 131 | dupes.add(col_index) 132 | if not row_index in dupes: 133 | keeps.append(row_index) 134 | 135 | self.matrix = self.matrix[keeps] 136 | self.records = self.records_data[keeps] 137 | 138 | 139 | class CosineDistance(MatrixMetricSearch): 140 | """A matrix that implements cosine distance search against it. 141 | 142 | cosine_distance = 1 - cosine_similarity 143 | 144 | Note: We want items that are more similar to be closer to zero so we are 145 | going to instead return 1 - cosine_similarity. We do this so similarity 146 | and distance metrics can be treated the same way. 
147 | """ 148 | 149 | def __init__(self, features, records_data): 150 | super(CosineDistance, self).__init__(features, records_data) 151 | 152 | m_c = self.matrix.copy() 153 | m_c.data **= 2 154 | self.matrix_root_sum_square = \ 155 | _np.sqrt(_np.asarray(m_c.sum(axis=1)).reshape(-1)) 156 | 157 | @staticmethod 158 | def features_to_matrix(features): 159 | """ 160 | Args: 161 | val: A list of features to be formatted. 162 | Returns: 163 | The transformed matrix. 164 | """ 165 | return _sparse.csr_matrix(features) 166 | 167 | @staticmethod 168 | def vstack(matrix_list): 169 | """ 170 | Args: 171 | val: A list of features to be formatted. 172 | Returns: 173 | The transformed matrix. 174 | """ 175 | return _sparse.vstack(matrix_list) 176 | 177 | def _transform_value(self, v): 178 | return v 179 | 180 | def _distance(self, a_matrix): 181 | """Vectorised cosine distance""" 182 | # what is the implmentation of transpose? can i change the order? 183 | dprod = self.matrix.dot(a_matrix.transpose()).transpose() * 1.0 184 | 185 | a_c = a_matrix.copy() 186 | a_c.data **= 2 187 | a_root_sum_square = _np.asarray(a_c.sum(axis=1)).reshape(-1) 188 | a_root_sum_square = \ 189 | a_root_sum_square.reshape(len(a_root_sum_square), 1) 190 | a_root_sum_square = _np.sqrt(a_root_sum_square) 191 | 192 | magnitude = 1.0 / (a_root_sum_square * self.matrix_root_sum_square) 193 | 194 | return 1 - dprod.multiply(magnitude).toarray() 195 | 196 | 197 | class UnitCosineDistance(MatrixMetricSearch): 198 | """A matrix that implements cosine distance search against it. 199 | 200 | cosine_distance = 1 - cosine_similarity 201 | 202 | Note: We want items that are more similar to be closer to zero so we are 203 | going to instead return 1 - cosine_similarity. We do this so similarity 204 | and distance metrics can be treated the same way. 205 | 206 | Assumes unit-vectors and takes some shortucts: 207 | * Uses integers instead of floats 208 | * 1**2 == 1 so that operation can be skipped 209 | """ 210 | 211 | def __init__(self, features, records_data): 212 | super(UnitCosineDistance, self).__init__(features, records_data) 213 | self.matrix_root_sum_square = \ 214 | _np.sqrt(_np.asarray(self.matrix.sum(axis=1)).reshape(-1)) 215 | 216 | @staticmethod 217 | def features_to_matrix(features): 218 | """ 219 | Args: 220 | val: A list of features to be formatted. 221 | Returns: 222 | The transformed matrix. 223 | """ 224 | return _sparse.csr_matrix(features) 225 | 226 | @staticmethod 227 | def vstack(matrix_list): 228 | """ 229 | Args: 230 | val: A list of features to be formatted. 231 | Returns: 232 | The transformed matrix. 233 | """ 234 | return _sparse.vstack(matrix_list) 235 | 236 | def _transform_value(self, v): 237 | return 1 238 | 239 | def _distance(self, a_matrix): 240 | """Vectorised cosine distance""" 241 | # what is the implmentation of transpose? can i change the order? 242 | dprod = self.matrix.dot(a_matrix.transpose()).transpose() * 1.0 243 | 244 | a_root_sum_square = _np.asarray(a_matrix.sum(axis=1)).reshape(-1) 245 | a_root_sum_square = \ 246 | a_root_sum_square.reshape(len(a_root_sum_square), 1) 247 | a_root_sum_square = _np.sqrt(a_root_sum_square) 248 | 249 | magnitude = 1.0 / (a_root_sum_square * self.matrix_root_sum_square) 250 | 251 | return 1 - dprod.multiply(magnitude).toarray() 252 | 253 | 254 | class SlowEuclideanDistance(MatrixMetricSearch): 255 | """A matrix that implements euclidean distance search against it. 256 | WARNING: This is not optimized. 
257 | """ 258 | 259 | def __init__(self, features, records_data): 260 | super(SlowEuclideanDistance, self).__init__(features, records_data) 261 | self.matrix = self.matrix 262 | 263 | @staticmethod 264 | def features_to_matrix(features): 265 | """ 266 | Args: 267 | val: A list of features to be formatted. 268 | Returns: 269 | The transformed matrix. 270 | """ 271 | return _np.array(features, ndmin=2) 272 | 273 | @staticmethod 274 | def vstack(matrix_list): 275 | """ 276 | Args: 277 | val: A list of features to be formatted. 278 | Returns: 279 | The transformed matrix. 280 | """ 281 | return _np.vstack(matrix_list) 282 | 283 | def _transform_value(self, v): 284 | return v 285 | 286 | def _distance(self, a_matrix): 287 | """Euclidean distance""" 288 | 289 | return _spatial_distance.cdist(a_matrix, self.matrix, 'euclidean') 290 | 291 | 292 | class DenseCosineDistance(MatrixMetricSearch): 293 | """A matrix that implements cosine distance search against it. 294 | 295 | cosine_distance = 1 - cosine_similarity 296 | 297 | Note: We want items that are more similar to be closer to zero so we are 298 | going to instead return 1 - cosine_similarity. We do this so similarity 299 | and distance metrics can be treated the same way. 300 | """ 301 | 302 | def __init__(self, features, records_data): 303 | super(DenseCosineDistance, self).__init__(features, records_data) 304 | 305 | self.matrix_root_sum_square = \ 306 | _np.sqrt((self.matrix ** 2).sum(axis=1).reshape(-1)) 307 | 308 | @staticmethod 309 | def features_to_matrix(features): 310 | """ 311 | Args: 312 | val: A list of features to be formatted. 313 | Returns: 314 | The transformed matrix. 315 | """ 316 | return _np.array(features, ndmin=2) 317 | 318 | @staticmethod 319 | def vstack(matrix_list): 320 | """ 321 | Args: 322 | val: A list of features to be formatted. 323 | Returns: 324 | The transformed matrix. 325 | """ 326 | return _np.vstack(matrix_list) 327 | 328 | def _transform_value(self, v): 329 | return v 330 | 331 | def _distance(self, a_matrix): 332 | """Vectorised cosine distance""" 333 | # what is the implmentation of transpose? can i change the order? 334 | dprod = self.matrix.dot(a_matrix.transpose()).transpose() * 1.0 335 | 336 | a_root_sum_square = (a_matrix ** 2).sum(axis=1).reshape(-1) 337 | a_root_sum_square = a_root_sum_square.reshape(len(a_root_sum_square), 1) 338 | a_root_sum_square = _np.sqrt(a_root_sum_square) 339 | 340 | magnitude = 1.0 / (a_root_sum_square * self.matrix_root_sum_square) 341 | 342 | return 1 - (dprod * magnitude) 343 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | numpy == 1.11.2 2 | scipy == 0.18.1 3 | scikit_learn == 0.17.1 4 | -------------------------------------------------------------------------------- /run_tests.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | python -m unittest discover tests 3 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from distutils.core import setup 2 | 3 | # This is a list of files to install, and where 4 | # (relative to the 'root' dir, where setup.py is) 5 | # You could be more specific. 
6 | files = [] 7 | 8 | setup(name="pysparnn", 9 | version="0.4", 10 | description="Sparse (approximate) nearest neighbor search for python!", 11 | author="Spencer Beecher", 12 | author_email="spencebeecher@gmail.com", 13 | # url = "", 14 | # Name the folder where your packages live: 15 | # (If you have other packages (dirs) or modules (py files) then 16 | # put them into the package directory - they will be found 17 | # recursively.) 18 | packages=['pysparnn'], 19 | # 'package' package must contain files (see list above) 20 | # I called the package 'package' thus cleverly confusing the whole issue... 21 | # This dict maps the package name =to=> directories 22 | # It says, package *needs* these files. 23 | # package_data = {}, 24 | # 'runner' is in the root. 25 | # scripts = [], 26 | long_description="""Sparse (approximate) nearest neighbor search for python!""" 27 | # 28 | # This next part it for the Cheese Shop, look a little down the page. 29 | # classifiers = [] 30 | ) 31 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2016-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the BSD-style license found in the 5 | # LICENSE file in the root directory of this source tree. An additional grant 6 | # of patent rights can be found in the PATENTS file in the same directory. 7 | -------------------------------------------------------------------------------- /tests/test_pysparnn.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2016-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the BSD-style license found in the 5 | # LICENSE file in the root directory of this source tree. An additional grant 6 | # of patent rights can be found in the PATENTS file in the same directory. 
7 | """Test pysparn search""" 8 | 9 | import unittest 10 | import pysparnn.cluster_index as ci 11 | import numpy as np 12 | from scipy.sparse import csr_matrix 13 | from pysparnn.matrix_distance import SlowEuclideanDistance 14 | from pysparnn.matrix_distance import UnitCosineDistance 15 | from pysparnn.matrix_distance import DenseCosineDistance 16 | from sklearn.feature_extraction import DictVectorizer 17 | 18 | class PysparnnTest(unittest.TestCase): 19 | """End to end tests for pysparnn""" 20 | def setUp(self): 21 | np.random.seed(1) 22 | 23 | def test_remove_duplicates(self): 24 | """Do a quick basic test for index/search functionality""" 25 | data = [ 26 | 'hello world', 27 | 'hello world', 28 | 'oh hello there', 29 | 'oh hello there', 30 | 'oh hello there', 31 | 'Play it', 32 | 'Play it again Sam', 33 | ] 34 | 35 | features = [dict([(x, 1) for x in f.split()]) for f in data] 36 | features = DictVectorizer().fit_transform(features) 37 | dist = UnitCosineDistance(features, data) 38 | 39 | self.assertEqual(dist.matrix.shape[0], 7) 40 | 41 | dist.remove_near_duplicates() 42 | 43 | self.assertEqual(dist.matrix.shape[0], 4) 44 | 45 | def test_cosine(self): 46 | """Do a quick basic test for index/search functionality""" 47 | data = [ 48 | 'hello world', 49 | 'oh hello there', 50 | 'Play it', 51 | 'Play it again Sam', 52 | ] 53 | 54 | features = [dict([(x, 1) for x in f.split()]) for f in data] 55 | features = DictVectorizer().fit_transform(features) 56 | 57 | cluster_index = ci.ClusterIndex(features, data) 58 | 59 | ret = cluster_index.search(features, k=1, k_clusters=1, 60 | return_distance=False) 61 | self.assertEqual([[d] for d in data], ret) 62 | 63 | def test_dense_array(self): 64 | """Do a quick basic test for index/search functionality""" 65 | data = [ 66 | 'hello world', 67 | 'oh hello there', 68 | 'Play it', 69 | 'Play it again Sam', 70 | ] 71 | 72 | features = [dict([(x, 1) for x in f.split()]) for f in data] 73 | features = DictVectorizer().fit_transform(features) 74 | features = features.toarray() 75 | cluster_index = ci.ClusterIndex(features, data) 76 | 77 | ret = cluster_index.search(features, k=1, k_clusters=1, 78 | return_distance=False) 79 | self.assertEqual([[d] for d in data], ret) 80 | 81 | def test_dense_matrix(self): 82 | """Do a quick basic test for index/search functionality""" 83 | data = [ 84 | 'hello world', 85 | 'oh hello there', 86 | 'Play it', 87 | 'Play it again Sam', 88 | ] 89 | 90 | features = [dict([(x, 1) for x in f.split()]) for f in data] 91 | features = DictVectorizer().fit_transform(features) 92 | features = features.toarray() 93 | cluster_index = ci.ClusterIndex(features, data, DenseCosineDistance) 94 | 95 | ret = cluster_index.search(features, k=1, k_clusters=1, 96 | return_distance=False) 97 | self.assertEqual([[d] for d in data], ret) 98 | 99 | def test_euclidean(self): 100 | """Do a quick basic test for index/search functionality""" 101 | data = [ 102 | 'hello world', 103 | 'oh hello there', 104 | 'Play it', 105 | 'Play it again Sam', 106 | ] 107 | 108 | features = [dict([(x, 1) for x in f.split()]) for f in data] 109 | features = DictVectorizer().fit_transform(features) 110 | features = features.toarray() 111 | cluster_index = ci.ClusterIndex(features, data, SlowEuclideanDistance) 112 | 113 | ret = cluster_index.search(features, k=1, k_clusters=1, 114 | return_distance=False) 115 | self.assertEqual([[d] for d in data], ret) 116 | 117 | 118 | 119 | def test_levels(self): 120 | """Test multiple level indexes""" 121 | features = np.random.binomial(1, 
0.01, size=(1000, 20000)) 122 | features = csr_matrix(features) 123 | 124 | # build the search index! 125 | data_to_return = np.array(list(range(1000)), dtype=int) 126 | 127 | # matrix size smaller - this forces the index to have multiple levels 128 | cluster_index = ci.ClusterIndex(features, data_to_return, 129 | matrix_size=10) 130 | 131 | ret = cluster_index.search(features[0:10], k=1, k_clusters=1, 132 | return_distance=False) 133 | self.assertEqual([[x] for x in data_to_return[:10]], ret) 134 | 135 | def test_levels_multiindex(self): 136 | """Test multiple level indexes""" 137 | features = np.random.binomial(1, 0.01, size=(1000, 20000)) 138 | features = csr_matrix(features) 139 | 140 | # build the search index! 141 | data_to_return = np.array(list(range(1000)), dtype=int) 142 | 143 | # matrix size smaller - this forces the index to have multiple levels 144 | cluster_index = ci.MultiClusterIndex(features, data_to_return, 145 | matrix_size=10) 146 | 147 | ret = cluster_index.search(features[0:10], k=1, k_clusters=1, 148 | return_distance=False) 149 | self.assertEqual([[x] for x in data_to_return[:10]], ret) 150 | 151 | def test_large_k(self): 152 | """Test multiple level indexes""" 153 | features = np.random.binomial(1, 0.01, size=(1000, 20000)) 154 | features = csr_matrix(features) 155 | 156 | # build the search index! 157 | data_to_return = np.array(list(range(1000)), dtype=int) 158 | 159 | # matrix size smaller - this forces the index to have multiple levels 160 | cluster_index = ci.MultiClusterIndex(features, data_to_return, 161 | matrix_size=10) 162 | 163 | ret = cluster_index.search(features[0], k=100, k_clusters=1, 164 | return_distance=False) 165 | self.assertEqual(100, len(ret[0])) 166 | --------------------------------------------------------------------------------
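
For quick reference, below is a minimal usage sketch of the index API defined in pysparnn/cluster_index.py, distilled from examples/sparse_search_comparison.ipynb and tests/test_pysparnn.py. It assumes pysparnn has been installed (e.g. with "python setup.py install") along with numpy, scipy and scikit-learn; the toy sentences and query strings are illustrative placeholders, and exact results may vary since the index is approximate.

# Minimal sketch: build an index over TF-IDF vectors and query it.
import pysparnn.cluster_index as ci
from sklearn.feature_extraction.text import TfidfVectorizer

data = [
    'hello world',
    'oh hello there',
    'Play it',
    'Play it again Sam',
]

# Vectorize the records into a sparse matrix; row i corresponds to data[i].
tv = TfidfVectorizer()
features = tv.fit_transform(data)

# Build a multi-index; num_indexes trades extra memory for better recall.
index = ci.MultiClusterIndex(features, data, num_indexes=2)

# Records can also be added after construction.
index.insert(tv.transform(['hello again world']), 'hello again world')

# Ask for the 2 nearest records per query row; k_clusters widens the search
# at each tree level, trading a little speed for recall.
queries = tv.transform(['oh hello world', 'play it sam'])
print(index.search(queries, k=2, k_clusters=2, return_distance=True))

Each entry returned by search() is a list of (distance, record) pairs for the corresponding query row; passing return_distance=False drops the distances. As the MultiClusterIndex docstring notes, raising num_indexes improves recall at the cost of memory, raising k_clusters improves recall at the cost of query speed, and matrix_size is best left at its default.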