├── .gitignore ├── CONTRIBUTING.md ├── LICENSE-examples ├── LICENSE.md ├── PATENTS ├── README.md ├── examples ├── dense_matrix.ipynb ├── enron.ipynb ├── pysparnn_utils.py └── sparse_search_comparison.ipynb ├── pysparnn ├── __init__.py ├── cluster_index.py └── matrix_distance.py ├── requirements.txt ├── run_tests.sh ├── setup.py └── tests ├── __init__.py └── test_pysparnn.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | 27 | # PyInstaller 28 | # Usually these files are written by a python script from a template 29 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 30 | *.manifest 31 | *.spec 32 | 33 | # Installer logs 34 | pip-log.txt 35 | pip-delete-this-directory.txt 36 | 37 | # Unit test / coverage reports 38 | htmlcov/ 39 | .tox/ 40 | .coverage 41 | .coverage.* 42 | .cache 43 | nosetests.xml 44 | coverage.xml 45 | *,cover 46 | .hypothesis/ 47 | 48 | # Translations 49 | *.mo 50 | *.pot 51 | 52 | # Django stuff: 53 | *.log 54 | 55 | # Sphinx documentation 56 | docs/_build/ 57 | 58 | # PyBuilder 59 | target/ 60 | 61 | #Ipython Notebook 62 | .ipynb_checkpoints 63 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to PySparNN 2 | We want to make contributing to this project as easy and transparent as 3 | possible. 4 | 5 | ## Pull Requests 6 | We actively welcome your pull requests. 7 | 8 | 1. Fork the repo and create your branch from `master`. 9 | 2. If you've added code that should be tested, add tests. 10 | 3. If you've changed APIs, update the documentation. 11 | 4. Ensure the test suite passes. 12 | 5. Make sure your code lints. 13 | 6. If you haven't already, complete the Contributor License Agreement ("CLA"). 14 | 15 | ## Contributor License Agreement ("CLA") 16 | In order to accept your pull request, we need you to submit a CLA. You only need 17 | to do this once to work on any of Facebook's open source projects. 18 | 19 | Complete your CLA here: 20 | 21 | ## Issues 22 | We use GitHub issues to track public bugs. Please ensure your description is 23 | clear and has sufficient instructions to be able to reproduce the issue. 24 | 25 | Facebook has a [bounty program](https://www.facebook.com/whitehat/) for the safe 26 | disclosure of security bugs. In those cases, please go through the process 27 | outlined on that page and do not file a public issue. 28 | 29 | ## Coding Style 30 | Please use pylint with the default settings. 31 | 32 | ## License 33 | By contributing to PySparNN, you agree that your contributions will be licensed 34 | under its BSD license. 35 | -------------------------------------------------------------------------------- /LICENSE-examples: -------------------------------------------------------------------------------- 1 | Copyright (c) 20__-present, Facebook, Inc. All rights reserved. 2 | 3 | The examples provided by Facebook are for non-commercial testing and evaluation 4 | purposes only. Facebook reserves all rights not expressly granted. 
5 | 6 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 7 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 8 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 9 | FACEBOOK BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 10 | ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 11 | WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 12 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | BSD License 2 | 3 | For PySparNN software 4 | 5 | Copyright (c) 2016-present, Facebook, Inc. All rights reserved. 6 | 7 | Redistribution and use in source and binary forms, with or without modification, 8 | are permitted provided that the following conditions are met: 9 | 10 | * Redistributions of source code must retain the above copyright notice, this 11 | list of conditions and the following disclaimer. 12 | 13 | * Redistributions in binary form must reproduce the above copyright notice, 14 | this list of conditions and the following disclaimer in the documentation 15 | and/or other materials provided with the distribution. 16 | 17 | * Neither the name Facebook nor the names of its contributors may be used to 18 | endorse or promote products derived from this software without specific 19 | prior written permission. 20 | 21 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 22 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 23 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 24 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR 25 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 26 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 27 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 28 | ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 29 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 30 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 31 | -------------------------------------------------------------------------------- /PATENTS: -------------------------------------------------------------------------------- 1 | Additional Grant of Patent Rights Version 2 2 | 3 | "Software" means the PySparNN software distributed by Facebook, Inc. 4 | 5 | Facebook, Inc. ("Facebook") hereby grants to each recipient of the Software 6 | ("you") a perpetual, worldwide, royalty-free, non-exclusive, irrevocable 7 | (subject to the termination provision below) license under any Necessary 8 | Claims, to make, have made, use, sell, offer to sell, import, and otherwise 9 | transfer the Software. For avoidance of doubt, no license is granted under 10 | Facebook’s rights in any patent claims that are infringed by (i) modifications 11 | to the Software made by you or any third party or (ii) the Software in 12 | combination with any software or other technology. 
13 | 14 | The license granted hereunder will terminate, automatically and without notice, 15 | if you (or any of your subsidiaries, corporate affiliates or agents) initiate 16 | directly or indirectly, or take a direct financial interest in, any Patent 17 | Assertion: (i) against Facebook or any of its subsidiaries or corporate 18 | affiliates, (ii) against any party if such Patent Assertion arises in whole or 19 | in part from any software, technology, product or service of Facebook or any of 20 | its subsidiaries or corporate affiliates, or (iii) against any party relating 21 | to the Software. Notwithstanding the foregoing, if Facebook or any of its 22 | subsidiaries or corporate affiliates files a lawsuit alleging patent 23 | infringement against you in the first instance, and you respond by filing a 24 | patent infringement counterclaim in that lawsuit against that party that is 25 | unrelated to the Software, the license granted hereunder will not terminate 26 | under section (i) of this paragraph due to such counterclaim. 27 | 28 | A "Necessary Claim" is a claim of a patent owned by Facebook that is 29 | necessarily infringed by the Software standing alone. 30 | 31 | A "Patent Assertion" is any lawsuit or other action alleging direct, indirect, 32 | or contributory infringement or inducement to infringe any patent, including a 33 | cross-claim or counterclaim. 34 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # PySparNN 2 | Approximate Nearest Neighbor Search for Sparse Data in Python! This library is well suited to finding nearest neighbors in sparse, high dimensional spaces (like text documents). 3 | 4 | Out of the box, PySparNN supports Cosine Distance (i.e. 1 - cosine_similarity). 5 | 6 | PySparNN benefits: 7 | * Designed to be efficient on sparse data (memory & cpu). 8 | * Implemented leveraging existing python libraries (scipy & numpy). 9 | * Easily extended with other metrics: Manhattan, Euclidian, Jaccard, etc. 10 | * Supports incremental insertion of elements. 11 | 12 | If your data is NOT SPARSE - please consider [faiss](https://github.com/facebookresearch/faiss) or [annoy](https://github.com/spotify/annoy). They use similar methods and I am a big fan of both. You should expect better performance on dense vectors from both of those projects. 13 | 14 | The most comparable library to PySparNN is scikit-learn's LSHForest module. As of this writing, PySparNN is ~4x faster on the 20newsgroups dataset (as a sparse vector). A more robust benchmarking on sparse data is desired. [Here is the comparison.](https://github.com/facebookresearch/pysparnn/blob/master/examples/sparse_search_comparison.ipynb) [Here is another comparison](https://github.com/facebookresearch/pysparnn/blob/master/examples/enron.ipynb) on the larger Enron email dataset. 15 | 16 | 17 | ## Example Usage 18 | ### Simple Example 19 | ```python 20 | import pysparnn.cluster_index as ci 21 | 22 | import numpy as np 23 | from scipy.sparse import csr_matrix 24 | 25 | features = np.random.binomial(1, 0.01, size=(1000, 20000)) 26 | features = csr_matrix(features) 27 | 28 | # build the search index! 
29 | data_to_return = range(1000) 30 | cp = ci.MultiClusterIndex(features, data_to_return) 31 | 32 | cp.search(features[:5], k=1, return_distance=False) 33 | >> [[0], [1], [2], [3], [4]] 34 | ``` 35 | ### Text Example 36 | ```python 37 | import pysparnn.cluster_index as ci 38 | 39 | from sklearn.feature_extraction.text import TfidfVectorizer 40 | 41 | data = [ 42 | 'hello world', 43 | 'oh hello there', 44 | 'Play it', 45 | 'Play it again Sam', 46 | ] 47 | 48 | tv = TfidfVectorizer() 49 | tv.fit(data) 50 | 51 | features_vec = tv.transform(data) 52 | 53 | # build the search index! 54 | cp = ci.MultiClusterIndex(features_vec, data) 55 | 56 | # search the index with a sparse matrix 57 | search_data = [ 58 | 'oh there', 59 | 'Play it again Frank' 60 | ] 61 | 62 | search_features_vec = tv.transform(search_data) 63 | 64 | cp.search(search_features_vec, k=1, k_clusters=2, return_distance=False) 65 | >> [['oh hello there'], ['Play it again Sam']] 66 | 67 | ``` 68 | 69 | ## Requirements 70 | PySparNN requires numpy and scipy. Tested with numpy 1.11.2 and scipy 0.18.1. 71 | 72 | ## Installation 73 | ```bash 74 | # clone pysparnn 75 | cd pysparnn 76 | pip install -r requirements.txt 77 | python setup.py install 78 | ``` 79 | 80 | ## How PySparNN works 81 | Searching for a document in an collection of D documents is naively O(D) (assuming documents are constant sized). 82 | 83 | However! we can create a tree structure where the first level is O(sqrt(D)) and each of the leaves are also O(sqrt(D)) - on average. 84 | 85 | We randomly pick sqrt(D) candidate items to be in the top level. Then -- each document in the full list of D documents is assigned to the closest candidate in the top level. 86 | 87 | This breaks up one O(D) search into two O(sqrt(D)) searches which is much much faster when D is big! 88 | 89 | This generalizes to h levels. The runtime becomes: 90 | O(h * h_root(D)) 91 | 92 | ## Further Information 93 | http://nlp.stanford.edu/IR-book/html/htmledition/cluster-pruning-1.html 94 | 95 | See the CONTRIBUTING file for how to help out. 96 | 97 | ## License 98 | PySparNN is BSD-licensed. We also provide an additional patent grant. 99 | -------------------------------------------------------------------------------- /examples/dense_matrix.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "# Copyright 2016-present, Facebook, Inc.\n", 12 | "# All rights reserved.\n", 13 | "\n", 14 | "# This source code is licensed under the license found in the\n", 15 | "# LICENSE-examples file in the root directory of this source tree." 
16 | ] 17 | }, 18 | { 19 | "cell_type": "markdown", 20 | "metadata": {}, 21 | "source": [ 22 | "## Study Sparse vs Dense Matrix Implementations\n", 23 | "Pysparnn defaults to sparse matricies but you may also use a dense matrix to improve performance\n", 24 | "\n", 25 | "This is typically when the number of dimensions is small" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": 2, 31 | "metadata": { 32 | "collapsed": true 33 | }, 34 | "outputs": [], 35 | "source": [ 36 | "import numpy as np\n", 37 | "import time" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": 3, 43 | "metadata": { 44 | "collapsed": false 45 | }, 46 | "outputs": [], 47 | "source": [ 48 | "# make sure you run 'python setup.py install' first!\n", 49 | "import pysparnn.cluster_index as ci\n", 50 | "import pysparnn.matrix_distance" 51 | ] 52 | }, 53 | { 54 | "cell_type": "markdown", 55 | "metadata": { 56 | "collapsed": false 57 | }, 58 | "source": [ 59 | "# Get data" 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": 4, 65 | "metadata": { 66 | "collapsed": false 67 | }, 68 | "outputs": [], 69 | "source": [ 70 | "# feature vectors are ~10% full and there are only 100 dimensions\n", 71 | "features = np.random.binomial(1, 0.1, size=(100000, 100))" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": 5, 77 | "metadata": { 78 | "collapsed": true 79 | }, 80 | "outputs": [], 81 | "source": [ 82 | "test_features = features[:5000]\n", 83 | "train_features = features[5000:]\n", 84 | "\n", 85 | "data_to_return = range(train_features.shape[0])" 86 | ] 87 | }, 88 | { 89 | "cell_type": "markdown", 90 | "metadata": {}, 91 | "source": [ 92 | "## Build models to compare" 93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": 6, 98 | "metadata": { 99 | "collapsed": false 100 | }, 101 | "outputs": [ 102 | { 103 | "name": "stderr", 104 | "output_type": "stream", 105 | "text": [ 106 | "/Users/spencebeecher/anaconda2/lib/python2.7/site-packages/pysparnn/matrix_distance.py:189: RuntimeWarning: divide by zero encountered in true_divide\n", 107 | " magnitude = 1.0 / (a_root_sum_square * self.matrix_root_sum_square)\n" 108 | ] 109 | } 110 | ], 111 | "source": [ 112 | "cp = ci.MultiClusterIndex(train_features, data_to_return)" 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": 7, 118 | "metadata": { 119 | "collapsed": false 120 | }, 121 | "outputs": [ 122 | { 123 | "name": "stderr", 124 | "output_type": "stream", 125 | "text": [ 126 | "/Users/spencebeecher/anaconda2/lib/python2.7/site-packages/pysparnn/matrix_distance.py:334: RuntimeWarning: divide by zero encountered in true_divide\n", 127 | " magnitude = 1.0 / (a_root_sum_square * self.matrix_root_sum_square)\n", 128 | "/Users/spencebeecher/anaconda2/lib/python2.7/site-packages/pysparnn/matrix_distance.py:336: RuntimeWarning: invalid value encountered in multiply\n", 129 | " return 1 - (dprod * magnitude)\n", 130 | "/Users/spencebeecher/anaconda2/lib/python2.7/site-packages/pysparnn/matrix_distance.py:108: RuntimeWarning: invalid value encountered in less_equal\n", 131 | " dist_filter = (dist_matrix <= max_distance)\n" 132 | ] 133 | } 134 | ], 135 | "source": [ 136 | "dense_cp = ci.MultiClusterIndex(train_features, data_to_return, \n", 137 | " distance_type=pysparnn.matrix_distance.DenseCosineDistance)" 138 | ] 139 | }, 140 | { 141 | "cell_type": "markdown", 142 | "metadata": {}, 143 | "source": [ 144 | "## Answer Key" 145 | ] 146 | }, 147 | { 148 | "cell_type": "code", 149 | 
"execution_count": 8, 150 | "metadata": { 151 | "collapsed": false 152 | }, 153 | "outputs": [], 154 | "source": [ 155 | "import pysparnn_utils" 156 | ] 157 | }, 158 | { 159 | "cell_type": "code", 160 | "execution_count": 9, 161 | "metadata": { 162 | "collapsed": true 163 | }, 164 | "outputs": [], 165 | "source": [ 166 | "from sklearn.neighbors import NearestNeighbors \n", 167 | "knn = NearestNeighbors()\n", 168 | " \n", 169 | "knn.fit(train_features)\n", 170 | "\n", 171 | "# get top 3 nearest neighbors for each document\n", 172 | "answers = knn.kneighbors(test_features, 3, return_distance=False)" 173 | ] 174 | }, 175 | { 176 | "cell_type": "markdown", 177 | "metadata": {}, 178 | "source": [ 179 | "## Compare Performance\n", 180 | "Don't worry so much about the recall performance. There are many items in this space (congested). These methods should return close matches even if they arent the closest absolute matches." 181 | ] 182 | }, 183 | { 184 | "cell_type": "code", 185 | "execution_count": 13, 186 | "metadata": { 187 | "collapsed": false 188 | }, 189 | "outputs": [ 190 | { 191 | "name": "stdout", 192 | "output_type": "stream", 193 | "text": [ 194 | "Percent of time sparse returns a top 3 result: 0.2498\n" 195 | ] 196 | } 197 | ], 198 | "source": [ 199 | "t0 = time.time()\n", 200 | "\n", 201 | "results = cp.search(test_features, return_distance=False)\n", 202 | "\n", 203 | "print('Percent of time sparse returns a top 3 result: {}'.format(pysparnn_utils.recall(answers, results).mean()))\n", 204 | "\n", 205 | "cp_time = time.time() - t0" 206 | ] 207 | }, 208 | { 209 | "cell_type": "code", 210 | "execution_count": 14, 211 | "metadata": { 212 | "collapsed": false 213 | }, 214 | "outputs": [ 215 | { 216 | "name": "stdout", 217 | "output_type": "stream", 218 | "text": [ 219 | "Percent of time dense returns a top 3 result: 0.2458\n" 220 | ] 221 | } 222 | ], 223 | "source": [ 224 | "t0 = time.time()\n", 225 | "\n", 226 | "results = dense_cp.search(test_features, return_distance=False)\n", 227 | "\n", 228 | "print('Percent of time dense returns a top 3 result: {}'.format(pysparnn_utils.recall(answers, results).mean()))\n", 229 | "\n", 230 | "dense_cp_time = time.time() - t0" 231 | ] 232 | }, 233 | { 234 | "cell_type": "code", 235 | "execution_count": 15, 236 | "metadata": { 237 | "collapsed": false 238 | }, 239 | "outputs": [ 240 | { 241 | "data": { 242 | "text/plain": [ 243 | "4.979948311566905" 244 | ] 245 | }, 246 | "execution_count": 15, 247 | "metadata": {}, 248 | "output_type": "execute_result" 249 | } 250 | ], 251 | "source": [ 252 | "# sparse is x times slower than dense\n", 253 | "cp_time / dense_cp_time" 254 | ] 255 | }, 256 | { 257 | "cell_type": "markdown", 258 | "metadata": {}, 259 | "source": [ 260 | "**Analysis:** Equivalent performance (the indexes use random seeds for construction) and the dense version is ~4x faster in this case." 
261 | ] 262 | } 263 | ], 264 | "metadata": { 265 | "anaconda-cloud": {}, 266 | "kernelspec": { 267 | "display_name": "Python 2", 268 | "language": "python", 269 | "name": "python2" 270 | }, 271 | "language_info": { 272 | "codemirror_mode": { 273 | "name": "ipython", 274 | "version": 2 275 | }, 276 | "file_extension": ".py", 277 | "mimetype": "text/x-python", 278 | "name": "python", 279 | "nbconvert_exporter": "python", 280 | "pygments_lexer": "ipython2", 281 | "version": "2.7.12" 282 | } 283 | }, 284 | "nbformat": 4, 285 | "nbformat_minor": 0 286 | } 287 | -------------------------------------------------------------------------------- /examples/enron.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "# Copyright 2016-present, Facebook, Inc.\n", 12 | "# All rights reserved.\n", 13 | "\n", 14 | "# This source code is licensed under the license found in the\n", 15 | "# LICENSE-examples file in the root directory of this source tree." 16 | ] 17 | }, 18 | { 19 | "cell_type": "markdown", 20 | "metadata": {}, 21 | "source": [ 22 | "# Evaluate pysparnn on Enron data" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": 2, 28 | "metadata": { 29 | "collapsed": true 30 | }, 31 | "outputs": [], 32 | "source": [ 33 | "import numpy as np\n", 34 | "import time\n", 35 | "import inspect\n", 36 | "\n", 37 | "from scipy.sparse import csr_matrix\n", 38 | "from sklearn.datasets import fetch_20newsgroups\n", 39 | "from sklearn.neighbors import LSHForest\n", 40 | "from sklearn.feature_extraction import DictVectorizer" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": 3, 46 | "metadata": { 47 | "collapsed": true 48 | }, 49 | "outputs": [], 50 | "source": [ 51 | "# make sure you run 'python setup.py install' first!\n", 52 | "import pysparnn.cluster_index as ci" 53 | ] 54 | }, 55 | { 56 | "cell_type": "markdown", 57 | "metadata": {}, 58 | "source": [ 59 | "## Get data" 60 | ] 61 | }, 62 | { 63 | "cell_type": "raw", 64 | "metadata": { 65 | "collapsed": false 66 | }, 67 | "source": [ 68 | "# fetch data\n", 69 | "\n", 70 | "\n", 71 | "!wget https://www.cs.cmu.edu/~./enron/enron_mail_20150507.tgz\n", 72 | "\n", 73 | "\n", 74 | "_ = !tar -xzvf enron_mail_20150507.tgz" 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": 4, 80 | "metadata": { 81 | "collapsed": false 82 | }, 83 | "outputs": [], 84 | "source": [ 85 | "# load enron data \n", 86 | "import os\n", 87 | "import sys\n", 88 | "\n", 89 | "docs = []\n", 90 | "max_docs = 100000\n", 91 | "for folder, subs, files in os.walk('maildir'):\n", 92 | " for filename in files:\n", 93 | " with open(os.path.join(folder, filename), 'r') as src:\n", 94 | " try:\n", 95 | " txt = ' '.join(src.readlines())\n", 96 | " if len(txt) > 0:\n", 97 | " docs.append(txt)\n", 98 | " except:\n", 99 | " pass\n", 100 | " if len(docs) > max_docs:\n", 101 | " break \n", 102 | " if len(docs) > max_docs:\n", 103 | " break" 104 | ] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "execution_count": 5, 109 | "metadata": { 110 | "collapsed": false 111 | }, 112 | "outputs": [ 113 | { 114 | "name": "stdout", 115 | "output_type": "stream", 116 | "text": [ 117 | "Num docs: 100001\n", 118 | "Avg doc length: 413.757442426\n", 119 | "Num unique words: 942676\n" 120 | ] 121 | } 122 | ], 123 | "source": [ 124 | "print('Num docs: 
{}'.format(len(docs)))\n", 125 | "print('Avg doc length: {}'.format(np.mean([len(x.split()) for x in docs])))\n", 126 | "words = set()\n", 127 | "for doc in docs:\n", 128 | " words.update(doc.split())\n", 129 | "print('Num unique words: {}'.format(len(words)))\n", 130 | "del words" 131 | ] 132 | }, 133 | { 134 | "cell_type": "markdown", 135 | "metadata": {}, 136 | "source": [ 137 | "## Turn documents into vectors" 138 | ] 139 | }, 140 | { 141 | "cell_type": "code", 142 | "execution_count": 6, 143 | "metadata": { 144 | "collapsed": false 145 | }, 146 | "outputs": [], 147 | "source": [ 148 | "from sklearn.neighbors import LSHForest, NearestNeighbors \n", 149 | "from sklearn.feature_extraction.text import TfidfVectorizer\n", 150 | "\n", 151 | "tv = TfidfVectorizer(decode_error='ignore')\n", 152 | "\n", 153 | "features = csr_matrix(tv.fit_transform(docs))\n", 154 | "\n", 155 | "doc_index = np.array(range(len(docs)))" 156 | ] 157 | }, 158 | { 159 | "cell_type": "code", 160 | "execution_count": 7, 161 | "metadata": { 162 | "collapsed": true 163 | }, 164 | "outputs": [], 165 | "source": [ 166 | "test_features = features[:2000]\n", 167 | "train_features = features[2000:]" 168 | ] 169 | }, 170 | { 171 | "cell_type": "markdown", 172 | "metadata": {}, 173 | "source": [ 174 | "## Create an answer key" 175 | ] 176 | }, 177 | { 178 | "cell_type": "code", 179 | "execution_count": 8, 180 | "metadata": { 181 | "collapsed": false 182 | }, 183 | "outputs": [ 184 | { 185 | "data": { 186 | "text/plain": [ 187 | "NearestNeighbors(algorithm='brute', leaf_size=30, metric='cosine',\n", 188 | " metric_params=None, n_jobs=1, n_neighbors=5, p=2, radius=1.0)" 189 | ] 190 | }, 191 | "execution_count": 8, 192 | "metadata": {}, 193 | "output_type": "execute_result" 194 | } 195 | ], 196 | "source": [ 197 | "knn = NearestNeighbors(algorithm='brute', metric='cosine')\n", 198 | " \n", 199 | "knn.fit(train_features)" 200 | ] 201 | }, 202 | { 203 | "cell_type": "code", 204 | "execution_count": 9, 205 | "metadata": { 206 | "collapsed": false 207 | }, 208 | "outputs": [ 209 | { 210 | "data": { 211 | "text/plain": [ 212 | "67.97951602935791" 213 | ] 214 | }, 215 | "execution_count": 9, 216 | "metadata": {}, 217 | "output_type": "execute_result" 218 | } 219 | ], 220 | "source": [ 221 | "t0 = time.time()\n", 222 | "# get 1 NN for each document\n", 223 | "answers = knn.kneighbors(test_features, n_neighbors=1, return_distance=False)\n", 224 | "time.time() - t0" 225 | ] 226 | }, 227 | { 228 | "cell_type": "code", 229 | "execution_count": 10, 230 | "metadata": { 231 | "collapsed": false 232 | }, 233 | "outputs": [ 234 | { 235 | "name": "stderr", 236 | "output_type": "stream", 237 | "text": [ 238 | "/Users/spencebeecher/anaconda2/lib/python2.7/site-packages/sklearn/neighbors/base.py:211: UserWarning: cannot use tree with sparse input: using brute force\n", 239 | " warnings.warn(\"cannot use tree with sparse input: \"\n" 240 | ] 241 | }, 242 | { 243 | "data": { 244 | "text/plain": [ 245 | "52.95571780204773" 246 | ] 247 | }, 248 | "execution_count": 10, 249 | "metadata": {}, 250 | "output_type": "execute_result" 251 | } 252 | ], 253 | "source": [ 254 | "bknn = NearestNeighbors(algorithm='ball_tree')\n", 255 | " \n", 256 | "bknn.fit(train_features)\n", 257 | "\n", 258 | "t0 = time.time()\n", 259 | "# get 1 NN for each document\n", 260 | "_ = bknn.kneighbors(test_features, n_neighbors=1, return_distance=False)\n", 261 | "time.time() - t0" 262 | ] 263 | }, 264 | { 265 | "cell_type": "markdown", 266 | "metadata": {}, 267 | "source": [ 268 | 
"## Build models to compare" 269 | ] 270 | }, 271 | { 272 | "cell_type": "code", 273 | "execution_count": 11, 274 | "metadata": { 275 | "collapsed": false 276 | }, 277 | "outputs": [ 278 | { 279 | "data": { 280 | "text/plain": [ 281 | "114.92523193359375" 282 | ] 283 | }, 284 | "execution_count": 11, 285 | "metadata": {}, 286 | "output_type": "execute_result" 287 | } 288 | ], 289 | "source": [ 290 | "t0 = time.time()\n", 291 | "snn = ci.MultiClusterIndex(train_features, doc_index, num_indexes=2)\n", 292 | "time.time() - t0" 293 | ] 294 | }, 295 | { 296 | "cell_type": "code", 297 | "execution_count": 12, 298 | "metadata": { 299 | "collapsed": false 300 | }, 301 | "outputs": [ 302 | { 303 | "data": { 304 | "text/plain": [ 305 | "18.300570964813232" 306 | ] 307 | }, 308 | "execution_count": 12, 309 | "metadata": {}, 310 | "output_type": "execute_result" 311 | } 312 | ], 313 | "source": [ 314 | "t0 = time.time()\n", 315 | "lshf = LSHForest(n_neighbors=1)\n", 316 | " \n", 317 | "lshf.fit(train_features)\n", 318 | "time.time() - t0" 319 | ] 320 | }, 321 | { 322 | "cell_type": "markdown", 323 | "metadata": {}, 324 | "source": [ 325 | "## Compare results" 326 | ] 327 | }, 328 | { 329 | "cell_type": "code", 330 | "execution_count": 13, 331 | "metadata": { 332 | "collapsed": false 333 | }, 334 | "outputs": [], 335 | "source": [ 336 | "import pysparnn_utils\n", 337 | "import time " 338 | ] 339 | }, 340 | { 341 | "cell_type": "code", 342 | "execution_count": 14, 343 | "metadata": { 344 | "collapsed": false 345 | }, 346 | "outputs": [ 347 | { 348 | "name": "stdout", 349 | "output_type": "stream", 350 | "text": [ 351 | "Recall: 0.965\n" 352 | ] 353 | }, 354 | { 355 | "data": { 356 | "text/plain": [ 357 | "23.034273862838745" 358 | ] 359 | }, 360 | "execution_count": 14, 361 | "metadata": {}, 362 | "output_type": "execute_result" 363 | } 364 | ], 365 | "source": [ 366 | "t0 = time.time()\n", 367 | "\n", 368 | "results = snn.search(test_features, return_distance=False)\n", 369 | "\n", 370 | "print('Recall: {}'.format(pysparnn_utils.recall(answers, results).mean()))\n", 371 | "\n", 372 | "snn_time = time.time() - t0\n", 373 | "snn_time" 374 | ] 375 | }, 376 | { 377 | "cell_type": "code", 378 | "execution_count": 15, 379 | "metadata": { 380 | "collapsed": false 381 | }, 382 | "outputs": [ 383 | { 384 | "name": "stdout", 385 | "output_type": "stream", 386 | "text": [ 387 | "Recall: 0.9245\n" 388 | ] 389 | }, 390 | { 391 | "data": { 392 | "text/plain": [ 393 | "11.743115901947021" 394 | ] 395 | }, 396 | "execution_count": 15, 397 | "metadata": {}, 398 | "output_type": "execute_result" 399 | } 400 | ], 401 | "source": [ 402 | "# only search one index instead of 2\n", 403 | "t0 = time.time()\n", 404 | "\n", 405 | "results = snn.search(test_features, return_distance=False, num_indexes=1)\n", 406 | "\n", 407 | "print('Recall: {}'.format(pysparnn_utils.recall(answers, results).mean()))\n", 408 | "\n", 409 | "time.time() - t0" 410 | ] 411 | }, 412 | { 413 | "cell_type": "code", 414 | "execution_count": 16, 415 | "metadata": { 416 | "collapsed": false 417 | }, 418 | "outputs": [ 419 | { 420 | "name": "stdout", 421 | "output_type": "stream", 422 | "text": [ 423 | "Recall: 0.7185\n" 424 | ] 425 | }, 426 | { 427 | "data": { 428 | "text/plain": [ 429 | "77.58664608001709" 430 | ] 431 | }, 432 | "execution_count": 16, 433 | "metadata": {}, 434 | "output_type": "execute_result" 435 | } 436 | ], 437 | "source": [ 438 | "t0 = time.time()\n", 439 | "\n", 440 | "results = lshf.kneighbors(test_features, 
return_distance=False)\n", 441 | "\n", 442 | "print('Recall: {}'.format(pysparnn_utils.recall(answers, results).mean()))\n", 443 | "lsh_time = time.time() - t0\n", 444 | "lsh_time" 445 | ] 446 | }, 447 | { 448 | "cell_type": "code", 449 | "execution_count": 17, 450 | "metadata": { 451 | "collapsed": false 452 | }, 453 | "outputs": [ 454 | { 455 | "data": { 456 | "text/plain": [ 457 | "3.3683130860568533" 458 | ] 459 | }, 460 | "execution_count": 17, 461 | "metadata": {}, 462 | "output_type": "execute_result" 463 | } 464 | ], 465 | "source": [ 466 | "# LSH is x times slower than snn\n", 467 | "lsh_time / snn_time" 468 | ] 469 | }, 470 | { 471 | "cell_type": "markdown", 472 | "metadata": {}, 473 | "source": [ 474 | "# Track Pysparnn vs Bruteforce as a function of index size" 475 | ] 476 | }, 477 | { 478 | "cell_type": "code", 479 | "execution_count": 18, 480 | "metadata": { 481 | "collapsed": false 482 | }, 483 | "outputs": [ 484 | { 485 | "name": "stderr", 486 | "output_type": "stream", 487 | "text": [ 488 | "/Users/spencebeecher/anaconda2/lib/python2.7/site-packages/pysparnn/matrix_distance.py:117: VisibleDeprecationWarning: boolean index did not match indexed array along dimension 0; dimension is 100001 but corresponding boolean dimension is 1000\n", 489 | " records = self.records_data[index]\n" 490 | ] 491 | } 492 | ], 493 | "source": [ 494 | "snn_results = []\n", 495 | "\n", 496 | "for n in np.linspace(1000, 80000, 5):\n", 497 | " feats = train_features[:n]\n", 498 | " \n", 499 | " ########## brute force ############\n", 500 | " bf = NearestNeighbors(algorithm='brute', metric='cosine')\n", 501 | "\n", 502 | " bf.fit(feats)\n", 503 | "\n", 504 | " # get 1 NN for each document\n", 505 | " t0 = time.time()\n", 506 | " \n", 507 | " answers = bf.kneighbors(test_features, n_neighbors=1, return_distance=False)\n", 508 | " \n", 509 | " bf_time = time.time() - t0\n", 510 | " \n", 511 | " \n", 512 | " ########## snn ############\n", 513 | " snn = ci.MultiClusterIndex(feats, doc_index, num_indexes=2)\n", 514 | " # only search one index instead of 2\n", 515 | " t0 = time.time()\n", 516 | "\n", 517 | " results = snn.search(test_features, return_distance=False, num_indexes=1)\n", 518 | " \n", 519 | " snn_time = time.time() - t0\n", 520 | " snn_recall = pysparnn_utils.recall(answers, results).mean()\n", 521 | " \n", 522 | " # results\n", 523 | " snn_results.append({\n", 524 | " 'n': n,\n", 525 | " 'snn_recall': snn_recall , \n", 526 | " 'snn_time': snn_time,\n", 527 | " 'bf_recall': 1.0,\n", 528 | " 'bf_time': bf_time\n", 529 | " })" 530 | ] 531 | }, 532 | { 533 | "cell_type": "code", 534 | "execution_count": 19, 535 | "metadata": { 536 | "collapsed": false 537 | }, 538 | "outputs": [], 539 | "source": [ 540 | "import pandas as pd" 541 | ] 542 | }, 543 | { 544 | "cell_type": "code", 545 | "execution_count": 20, 546 | "metadata": { 547 | "collapsed": true 548 | }, 549 | "outputs": [], 550 | "source": [ 551 | "%matplotlib inline" 552 | ] 553 | }, 554 | { 555 | "cell_type": "code", 556 | "execution_count": 21, 557 | "metadata": { 558 | "collapsed": false 559 | }, 560 | "outputs": [ 561 | { 562 | "data": { 563 | "text/plain": [ 564 | "" 565 | ] 566 | }, 567 | "execution_count": 21, 568 | "metadata": {}, 569 | "output_type": "execute_result" 570 | }, 571 | { 572 | "data": { 573 | "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAYEAAAEZCAYAAABxbJkKAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAH2xJREFUeJzt3XuYFPWd7/H3d7gMEEGGOwPMMAF5IEhUBMIaN5mIi3gl\nz+EkAlGUuFmy0ehjyFkQzgqcPSdGE5+w2Xh54GHV1RgMulnBowu6MB5NRO4XuYs63EEychHDMMx8\nzx9VM/QMc+mZ6Z7umfq8nqefrq7+VdW3e6A+Xb9fVbe5OyIiEk0ZqS5ARERSRyEgIhJhCgERkQhT\nCIiIRJhCQEQkwhQCIiIRphCQtGFm3zSz/TGPPzaz62po+5SZzW666urPzOaY2fO1PF/j64tj3c+Y\nWZGZra76vonUh0JAqmVmn5jZF2Z2yswOhTudDk2w6bguXHH3v3f3/5PsYhIg4RfimNm1wBgg291H\n12c7ZnaXmb2T6Jqk+VIISE0cuNndOwFXAlcBD6W2pMQys1aprqGB+gOfuPvZBixrJCGYpPlSCEht\nDMDdjwHLCcIgeMKsrZn90swKzeywmT1pZpkxz483s41mdtLM9pjZ2HD+3Wa2PTzC+NDM/q5BhQVH\nJv8rnO5qZsvM7DMz+7OZvV3LcmVm9iMz2w3sDuddY2ZrwuXfN7O/imlfqcumahePmU0Jj5o+NbP/\nWU0XT6aZPRe+3q1mNryamnqa2Rkzy4qZN9zMjlUNKjP7PrAQ+KtwnXOqWd+M8L09ZWYfmNm3w/mD\ngafCZU+bWVHN77BEhUJA6mRmfYEbgT0xsx8FBgJfDe/7AA+H7UcBzwHT3f1S4BvAJ+FyR4GbwiOM\nqcCvzOxKGmc6sB/oCvQAZtXRfjwwEvhKuON9DZgfLv8r4P/G7pCr4QBm9hXgCWAS0Bu4FMiu0vZW\n4MXwuWVh+8orcz8KrAK+GzP7DuB37l5ape2/Aj8E3nP3Tu4+r5r6PgS+Hr7H84AXzKynu++MWbaj\nu3ep5TVKRCgEpDb/YWangH0EO++5Mc/9AHjQ3U+6+xng5wQ7Q4DvA4vcfSWAux92993h9Bvu/kk4\n/Q6wAvjrRtZZQrATznP3Unf/Yx3tfxbWXQzcDOx29xfdvczdFwM7CXbedZkALHX399z9PGEIVvGu\nuy/34Eu6nicIzer8G3AngJllELyXNQ4q18bdXwmDBXdfQhDeoxqyLmn5FAJSm/Hhp8lvAoOBbgBm\n1h3oAKwPz1ApAt4g+CQN0A/YW90KzexGM3sv7Lb5jOAIo1sj6/xFuL0VYTfIjDraH4iZzgYKqzxf\nSHBkU5dsgiMQANz9L8Cfq7Q5EjP9BdAu3MlX9SowxMxygbHACXdfF0cNFwm7qDaG3VufAUNp/Hss\nLZRCQGpTPibwDkH3zuPh/OMEO7Sh7t4lvHUOu34g2DEOuGhlZm2Bl4HHgO7unkUQHtaYIt39c3f/\nqbsPAG4DfmJm36ptkZjpQwQDrbFygIPh9BmCwCvXK2b6MNC3/IGZtedCENZLeFTye4KjgTto4FGA\nmeUAC4AfuXtW+B5v48J7rEFhqUQhIPGaD/yNmQ0LuzYWAvPDowLMrE/54C+wCJhqZt+yQLaZDQLa\nhrfj7l5mZjcSfOptFDO72czKQ+c0cB4oi3Px14HLzGyimbUys9uBIQTjBACbgIlm1trMRgD/PWbZ\nl4FbzWy0mbWhcndZjeXW8tzzwN0EXVENCgHgSwSv/biZZZjZVODymOePAn3DekUUAlKjSp8Y3f04\nwdFAeb/3TIIByNVmdoKgb39Q2HYtwaDvfOAkUADkuvvnwP3AkrALaSJBN0hcNdTiMuAtMzsN/BF4\nwt1rOkOo6usqAm4BfkpwhPNTglNjy8+c+UeCge8iYA7w25hltwM/Bl4iOKI4BRwDiuN8TVVr+RPB\nDnyDuzfo4i9330FwxLaaoCtqKPBuTJOVBEcGR8zsWEO2IS2L1fWjMma2iOA/yVF3r3ZQy8x+TdC3\newa42903JbpQkXRnZl8CTgAD3b3qOEO86/gv4LfhWUAiSRfPkcAzwA01PRke0g9w98uAacDTCapN\nJO2Z2S1m1j4MgMeBLY0IgJEEF+W9lMgaRWpTZwi4+7vAZ7U0GU9wehvu/j5wqZn1TEx5ImlvPEFX\n0AGCwfCJDVmJmT1L0KX2QHjKrUiTaJ2AdfQh5jQ5grMq+hAMQIm0aO7+A4JrJhq7nrsbX41I/Wlg\nWEQkwhJxJHCQ4OKgcn25cI51JWamc5RFRBrA3Rt1PU1N4g0Bo+bzm5cC9wIvmdlogisda+4Kmluf\n8prIKqC2S4vSSXOpVXUmXnOptbnUCc2n1rlJXLe713oj+PKrQwTnPu8jOP97GvB3MW1+Q3DO+GZg\neC3r8rIy9/373f/wB/dZs9zHjnXv0sUdGn7Lzna/7Tb3f/on9zfecD92zOtlzpw59VsghZpLraoz\n8ZpLrc2lTvfmU2uwq659X93QW51HAu4+OY4298UbOmbQt29w+/a3y5eHTz6Bdetg7drgfv16OHUq\nvnUeOgRLlwa3crm5MGIEjBwZ3F99NXTuHG+VIiLRkIgxgUYzg7y84Pad7wTzysrgww8vhMK6dbBh\nA3zxRXzrLCwMbq+8cmHewIEXQmHECLjqKujYEfLz8xP+mpKludSqOhOvudTaXOqE5lVrstR5xXBC\nN2bmjdleaSns3Fk5GDZtguLaLtKvtR4YPLhyMFx5JbRv3+ASRUQSzsySNjDcrEKgOiUl8MEHF0Jh\n3TrYsgXOn2/Y+lq1gssvvxAKI0bAV78KbdsmtGwRkbgpBOrp7NkgCGKDYdu2oIupIdq2DYIgNhiG\nDoXWadGZJiItnUIgAc6cCbqOYoNh165gULoh2rULxhTKQ2HkSBg0KDiSEBFJJIVAkpw6FQw2l4fC\n2rXw0UcNX98ll8Dw4ZWDYcCAYOxBRKShFAJNqKgoOD019ohh376Gr69z5+D01NjTVXNyFAwiEj+F\nQIodPVo5GNauhSNH6l6uJt26VT5aGDECsrMTV69Udv48/OUvwenFf/lL5Vv5vOLiYIynXTvIzKz7\nvk0bBbk0HYVAmnEPLlCLPVpYuxb+XPUnxuuhd+/KoTBiBHTvnria00lZ2cU74ZoexzuvtjYNPVOs\nNmbxhUUy7mOnFUbRoBBoBtyDi9Nig2HdOjh5suHrzMmpHApXXw1ZWYmruZx7cEZVsnbCVR+fO5f4\n1xBlqQiiqvdt2yqMkkkh0EyVlcHevZWPFjZsCM5UaqiBAy+EQu/eidlRnz2buNcs0ZWZWTkcWrcO\njlSqu2/oc41dviHrbtUq9QGnEGhBSkuDU1Njg2HTJu2Ik8ksuAq8Q4fgvvwW+7hdu6Db6OzZYHwg\n9r66eaWlqX5V0pTKwyFV4TV9ukKgRSspge3bK3+B3pYtwfyWKnZnXN0OOpHzktFVcf58EAhVw6Gp\n75Mx3iHpSCEQOcXFsHVr5WDYti15n0AzM5tmh1z+qTvV
h9ctRWlp04RRXW1a8geW9KAQEIL++82b\ng0DYuDHo00/ETrp9e8jQD41KI5SVVQ6F4uLgKKWkpPJ9dfPS/bn06PpTCIiIpERZWRAEqQqkkhKY\nP18hICISWck8O0idACIiEaYQEBGJMIWAiEiEKQRERCJMISAiEmEKARGRCFMIiIhEmEJARCTCFAIi\nIhGmEBARiTCFgIhIhCkEREQiTCEgIhJhCgERkQhTCIiIRJhCQEQkwhQCIiIRphAQEYkwhYCISITF\nFQJmNs7MdprZbjObUc3zncxsqZltMrOtZnZ3wisVEZGEq/OH5s0sA9gNjAEOAWuBie6+M6bNQ0An\nd3/IzLoBu4Ce7n6+yrr0Q/MiIvWU6h+aHwXscfdCdy8BFgPjq7RxoGM43RH4c9UAEBGR9BNPCPQB\n9sc8PhDOi/Ub4CtmdgjYDDyQmPJERCSZWidoPTcAG939OjMbALxpZl9198+rNpw7d27FdH5+Pvn5\n+QkqQUSkZSgoKKCgoKBJthXPmMBoYK67jwsfzwTc3R+NafMa8Ii7/zF8/F/ADHdfV2VdGhMQEamn\nVI8JrAUGmlmumbUFJgJLq7QpBK4HMLOewCDgo0QWKiIiiVdnd5C7l5rZfcAKgtBY5O47zGxa8LQv\nAP438KyZbQkX+wd3L0pa1SIikhB1dgcldGPqDhIRqbdUdweJiEgLpRAQEYkwhYCISIQpBEREIkwh\nICISYQoBEZEIUwiIiESYQkBEJMIUAiIiEaYQEBGJMIWAiEiEKQRERCJMISAiEmEKARGRCFMIiIhE\nmEJARCTCFAIiIhGmEBARiTCFgIhIhCkEREQiTCEgIhJhCgERkQhTCIiIRJhCQEQkwhQCIiIRphAQ\nEYkwhYCISIQpBEREIkwhICISYQoBEZEIUwiIiESYQkBEJMIUAiIiEaYQEBGJMIWAiEiExRUCZjbO\nzHaa2W4zm1FDm3wz22hmH5jZqsSWKSIiyWDuXnsDswxgNzAGOASsBSa6+86YNpcCfwLGuvtBM+vm\n7serWZfXtT0REanMzHB3S8a64zkSGAXscfdCdy8BFgPjq7SZDLzi7gcBqgsAERFJP/GEQB9gf8zj\nA+G8WIOALma2yszWmtmdiSpQRESSp3UC1zMcuA74EvCemb3n7h8maP0iIpIE8YTAQSAn5nHfcF6s\nA8Bxdz8LnDWz/wdcAVwUAnPnzq2Yzs/PJz8/v34Vi4i0cAUFBRQUFDTJtuIZGG4F7CIYGD4MrAEm\nufuOmDaDgX8BxgGZwPvA7e6+vcq6NDAsIlJPyRwYrvNIwN1Lzew+YAXBGMIid99hZtOCp32Bu+80\ns+XAFqAUWFA1AEREJP3UeSSQ0I3pSEBEpN5SfYqoiIi0UAoBEZEIUwiIiESYQkBEJMIUAiIiEaYQ\nEBGJMIWAiEiEKQRERCJMISAiEmEKARGRCFMIiIhEmEJARCTCFAIiIhGmEBARiTCFgIhIhCkEREQi\nTCEgIhJhCgERkQhTCIiIRJhCQEQkwhQCIiIRphAQEYkwhYCISIQpBEREIkwhICISYQoBEZEIUwiI\niESYQkBEJMIUAiIiEaYQEBGJMIWAiEiEKQRERCJMISAiEmEKARGRCFMIiIhEmEJARCTC4goBMxtn\nZjvNbLeZzail3UgzKzGz/5a4EkVEJFnqDAEzywB+A9wADAUmmdngGtr9HFie6CJFRCQ54jkSGAXs\ncfdCdy8BFgPjq2n3Y+Bl4FgC6xMRkSSKJwT6APtjHh8I51Uws2zg2+7+FGCJK09ERJIpUQPD84HY\nsQIFgYhIM9A6jjYHgZyYx33DebFGAIvNzIBuwI1mVuLuS6uubO7cuRXT+fn55Ofn17NkEZGWraCg\ngIKCgibZlrl77Q3MWgG7gDHAYWANMMndd9TQ/hlgmbv/ezXPeV3bExGRyswMd09KD0udRwLuXmpm\n9wErCLqPFrn7DjObFjztC6oukoQ6RUQkCeo8EkjoxnQkICJSb8k8EtAVwyIiEaYQEBGJMIWAiEiE\nKQRERCJMISAiEmEKARGRCFMIiIhEmEJARCTCFAIiIhGmEBARiTCFgIhIhCkEREQiTCEgIhJhCgER\nkQhTCIiIRJhCQEQkwhQCIiIRphAQEYkwhYCISIQpBEREIkwhICISYQoBEZEIUwiIiESYQkBEJMIU\nAiIiEaYQEBGJMIWAiEiEKQRERCJMISAiEmEKARGRCFMIiIhEmEJARCTCFAIiIhGmEBARiTCFgIhI\nhCkEREQiLK4QMLNxZrbTzHab2Yxqnp9sZpvD27tmNizxpYqISKKZu9fewCwD2A2MAQ4Ba4GJ7r4z\nps1oYIe7nzSzccBcdx9dzbq8uu3179+fwsLCRr0QSZ7c3Fw++eSTVJchEllmhrtbMtbdOo42o4A9\n7l4YFrMYGA9UhIC7r45pvxroU58iCgsLqSuMJHXMkvJvT0TSQDzdQX2A/TGPD1D7Tv5vgTcaU5SI\niDSNeI4E4mZm3wKmAtfW1Gbu3LkV0/n5+eTn5yeyBBGRZq+goICCgoIm2VY8YwKjCfr4x4WPZwLu\n7o9WafdV4BVgnLvvrWFd1Y4JhP1dDXsFknT6+4ikVjLHBOLpDloLDDSzXDNrC0wEllYpMIcgAO6s\nKQBERCT91Nkd5O6lZnYfsIIgNBa5+w4zmxY87QuAfwS6AE9aMIpY4u6jklm4iIg0Xp3dQQndmLqD\n0kZeXh6LFi3iuuuuY968eXz44Yc8//zz1bbV30cktVLdHSQRoNNARaIpoWcHJUOi903N6QNtaWkp\nrVq1SnUZItKC6UggDo8++ih9+/alU6dODBkyhFWrVjFv3jxuv/127rrrLjp16sSwYcPYsGFDxTJ5\neXk8/vjjXHHFFWRlZTFp0iTOnTtX63befvtt+vXrx2OPPUbv3r35/ve/D8Brr73GVVddRVZWFtde\ney1bt26tWObAgQNMmDCBHj160L17d+6//34APvroI8aMGUO3bt3o0aMHd9xxB6dOnUrCuyMizZlC\noA67d+/miSeeYP369Zw6dYrly5fTv39/AJYtW8bkyZM5efIkt956K/fee2+lZZcsWcKKFSv4+OOP\n2bx5M88++2yd2zty5AgnTpxg3759LFiwgI0bN3LPPfewcOFCioqKmDZtGrfddhslJSWUlZVxyy23\nkJeXx759+zh48CATJ04EwN2ZNWsWR44cYceOHRw4cKDSNRoiIqAQqFOrVq04d+4cH3zwAefPnycn\nJ4e8vDwArr32Wm644QbMjDvvvJMtW7ZUWvaBBx6gZ8+edO7cmVtvvZVNmzbFtb158+bRpk0bMjMz\nWbhwIT/84Q8ZMWJExXYyMzNZvXo1a9as4fDhwzz22GO0a9eOtm3bcs011wAwYMAAxowZQ+vWrena\ntSsPPvggb7/9duLfIBFp1tI+BNwTe6uvAQMGMH/+fObOnUuPHj2YPHkyhw8fBqBXr14V7Tp06MDZ\ns2cpKyurmNe
zZ89Kz3/++ed1bq979+60adOm4nFhYSGPP/44Xbp0oUuXLmRlZXHgwAEOHTrE/v37\nyc3NJSPj4j/jsWPHmDRpEn379qVz587ccccdHD9+vP5vgIi0aGkfAulg4sSJvPPOO+zbtw+AGTMu\n+jbthKl6lk6/fv2YPXs2RUVFFBUV8dlnn/H5559z++23069fP/bt21cpeMrNmjWLjIwMtm3bxokT\nJ3jhhRd0mqeIXEQhUIfdu3ezatUqzp07R9u2bWnfvn2NZ+wkYyf7gx/8gKeffpo1a9YAcObMGV5/\n/XXOnDnDqFGj6N27NzNnzuSLL76guLiYP/3pTwCcPn2aSy65hI4dO3Lw4EF+8YtfJLw2EWn+FAJ1\nKC4uZubMmXTv3p3s7Gw+/fRTHnnkkWrbxn6KT9R591dffTULFy7kvvvuo0uXLgwaNIjnnnsOgIyM\nDJYtW8aePXvIycmhX79+/P73vwdgzpw5rF+/vmI8YsKECTXWKiLRpSuGpU76+4iklq4YFhGRpFAI\nNLFHHnmEjh070qlTp0q3m2++OdWliUgEqTtI6qS/j0hqqTtIRESSQiEgIhJhCgERkQhTCIiIRJhC\nQEQkwhQCccjLy2PlypXVPvfUU0/Rq1cvOnXqxGeffdbElV1s6tSpPPzww8CF3ycQEalJ2v+yWDo7\nf/4806dPZ82aNVx++eWpLqda+noIEalN2oeAzUvsTsznJO589yNHjlBcXMyQIUPqtZx+NlJE0oW6\ng+K0Zs0ahg4dSteuXbnnnnvYunUrgwcPBiArK4vrr7++1uUzMjJ48sknGTRoEIMGDQJg586djB07\nlq5duzJkyBCWLFlS0f7s2bNMnz6d/v37k5WVxTe+8Q2Ki4sB+O53v0vv3r3JysoiPz+f7du3J+lV\ni0hLpxCI04svvsibb77J3r172bVrF0uWLKnY+Z48eZK33nqrznW8+uqrrFmzhu3bt/PFF18wduzY\nih97Wbx4MT/60Y/YuXMnANOnT2fjxo2sXr2aoqIiHnvssYofj7npppvYu3cvx44dY/jw4Xzve99L\n3gsXkRZNIRCnH//4x2RnZ9O5c2dmz57N7373u4qvUoj3KxVmzZpF586dyczM5LXXXiMvL48pU6Zg\nZlxxxRVMmDCBJUuW4O4888wz/PrXv6ZXr16YGaNHj674xbG7776bDh060KZNGx5++GE2b97M6dOn\nk/baRaTlSvsxgUT24TdG3759K6Zzc3M5dOgQUL8fkoldR2FhIatXr6ZLly4V6yktLWXKlCkcP36c\ns2fP8uUvf/midZSVlTFr1ixefvlljh8/jplhZhw/fpyOHTs29OWJSESlfQiki/3791dMFxYWkp2d\nDdTv7JvYtv369SM/P5/ly5df1M7dad++PXv37mXYsGGVnnvxxRdZtmwZK1euJCcnh5MnT5KVlaUv\neBORBlF3UJyeeOIJDh48SFFRET/72c+YOHEi0PCflLzlllvYvXs3L7zwAufPn6ekpIR169axa9cu\nzIypU6fyk5/8hMOHD1NWVsbq1as5d+4cp0+fJjMzk6ysLM6cOcNDDz2k00BFpMEUAnEwMyZPnszY\nsWMZOHAgl112GbNnz654Lt51xLrkkktYsWIFixcvJjs7m+zsbGbOnFlxBtAvf/lLhg0bxsiRI+na\ntSszZ87E3ZkyZQo5OTn06dOHyy+/nGuuuSaxL1ZEIkW/JyB10t9HJLX0ewIiIpIUCoEEeffddy/6\n2cjyxyIi6UrdQVIn/X1EUkvdQSIikhQKARGRCEuLi8Vyc3N1rnsay83NTXUJIpIkcY0JmNk4YD7B\nkcMid3+0mja/Bm4EzgB3u/umatpUOyYgIiI1S+mYgJllAL8BbgCGApPMbHCVNjcCA9z9MmAa8HQS\nak2agoKCVJcQt+ZSq+pMvOZSa3OpE5pXrckSz5jAKGCPuxe6ewmwGBhfpc144N8A3P194FIz65nQ\nSpOoOf1DaC61qs7Eay61Npc6oXnVmizxhEAfYH/M4wPhvNraHKymjYiIpBmdHSQiEmF1Dgyb2Whg\nrruPCx/PBDx2cNjMngZWuftL4eOdwDfd/WiVdWlUWESkAZI1MBzPKaJrgYFmlgscBiYCk6q0WQrc\nC7wUhsaJqgEAyXsRIiLSMHWGgLuXmtl9wAounCK6w8ymBU/7And/3cxuMrMPCU4RnZrcskVEJBGa\n9LuDREQkvbSogWEzW2RmR81sS8y8LDNbYWa7zGy5mV0a89xDZrbHzHaY2diY+cPNbIuZ7Taz+THz\n25rZ4nCZ98wsp4F19jWzlWa2zcy2mtn96VirmWWa2ftmtjGsc0461hmzrgwz22BmS9O8zk/MbHP4\nvq5J11rN7FIzWxJud5uZfS3d6jSzQeH7uCG8P2lm96dbnTHretDMPgi389tw3amt1d1bzA24FrgS\n2BIz71HgH8LpGcDPw+mvABsJusT6Ax9y4cjofWBkOP06cEM4/ffAk+H07cDiBtbZC7gynL4E2AUM\nTtNaO4T3rYDVBNeNpF2d4fIPAi8AS9P1bx8u/xGQVWVe2tUKPAtMDadbA5emY50x9WYAh4B+6Vgn\nkB3+7duGj18C7kp1rSnbYSfrBuRSOQR2Aj3D6V7AznB6JjAjpt0bwNfCNttj5k8Engqn/xP4Wjjd\nCvg0QTX/B3B9OtcKdADWASPTsU6gL/AmkM+FEEi7OsPlPwa6VpmXVrUCnYC91cxPqzqr1DYWeCdd\n6yQIgUIgi2DHvpQ0+H/forqDatDDwzOV3P0I0COcX9MFbn0ILogrF3txXMUy7l4KnDCzLo0pzsz6\nExy9rCb4h5BWtYZdLBuBI8Cb7r42HesEfgX8D8Bj5qVjnYQ1vmlma83sb9O01jzguJk9E3a1LDCz\nDmlYZ6zbgRfD6bSr090PAY8D+8LtnnT3t1JdaxRCoCqvu0ncGnXKq5ldArwMPODun3NxbSmv1d3L\n3P0qgk/ao8xsaDV1pbROM7sZOOrBlxbWtnzK38/Q1919OHATcK+Z/TVp9p4SfFIdDjwR1nqG4JNp\nutUZLGjWBrgNWBLOSrs6zawzwVfs5BIcFXzJzL5XTW1NWmsUQuCohd9jZGa9gGPh/IMEfYfl+obz\nappfaRkzawV0cveihhRlZq0JAuB5d381nWsFcPdTQAEwLg3r/Dpwm5l9BPwOuM7MngeOpFmdALj7\n4fD+U4KuwFGk33t6ANjv7uvCx68QhEK61VnuRmC9ux8PH6djndcDH7l7Ufgp/Q/ANamutSWGgFE5\n/ZYCd4fTdwGvxsyfGI6m5wEDgTXh4dhJMxtlZgZMqbLMXeH0d4CVjajzXwn69f45XWs1s27lZyqY\nWXvgb4Ad6Vanu89y9xx3/zJB/+hKd78TWJZOdQKYWYfwCBAz+xJBP/ZW0u89PQrsN7NB4awxwLZ0\nqzPGJIIPAOXSsc59wGgzaxduYwywPeW1NmYgJt1uBP2Bh4Di8A2fSjAI
8xbBGTgrgM4x7R8iGHHf\nAYyNmX81wX/MPcA/x8zPBH4fzl8N9G9gnV8HSoFNBKP/Gwg+YXdJp1qBYWFtm4AtwOxwflrVWaXm\nb3JhYDjt6iToay//u28FZqZxrVcQfGPAJuDfCc4OSsc6OwCfAh1j5qVdneG65oTb3QI8B7RJda26\nWExEJMJaYneQiIjESSEgIhJhCgERkQhTCIiIRJhCQEQkwhQCIiIRphAQEYkwhYCISIQpBCTSzCzX\nzLaH35L5gZn9p5llproukaaiEBAJvpPlX9z9cuAkMCHF9Yg0GYWACHzs7lvD6fUEv+IkEgkKAZHg\nCwfLlRJ8l75IJCgERBr540AizZlCQCSxv+Qk0qzoq6RFRCJMRwIiIhGmEBARiTCFgIhIhCkEREQi\nTCEgIhJhCgERkQhTCIiIRJhCQEQkwv4/WW1egacM/bEAAAAASUVORK5CYII=\n", 574 | "text/plain": [ 575 | "" 576 | ] 577 | }, 578 | "metadata": {}, 579 | "output_type": "display_data" 580 | } 581 | ], 582 | "source": [ 583 | "pd.DataFrame(snn_results)[['n', 'snn_recall', 'bf_recall']].plot(x='n', ylim=(0, 1), linewidth=4,\n", 584 | " title='Recall is roughly flat')" 585 | ] 586 | }, 587 | { 588 | "cell_type": "code", 589 | "execution_count": 22, 590 | "metadata": { 591 | "collapsed": false 592 | }, 593 | "outputs": [ 594 | { 595 | "data": { 596 | "text/plain": [ 597 | "" 598 | ] 599 | }, 600 | "execution_count": 22, 601 | "metadata": {}, 602 | "output_type": "execute_result" 603 | }, 604 | { 605 | "data": { 606 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAX0AAAEoCAYAAAC0OiEVAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAIABJREFUeJzsnXmcTuX7x9/XDGMfy8gWhqRIRElKMkokhMraQrK1SNu3\nVPoaUepX+baSJVtRopBSUsxEKQlFVEqW7OuMnTH374/7DI/xnGee2Z5l5nq/XvOac859nXN/znad\n+7nOfa5bjDEoiqIo+YOIYAtQFEVRAoc6fUVRlHyEOn1FUZR8hDp9RVGUfIQ6fUVRlHyEOn1FUZR8\nRJ52+iLylIiMDbaOvIyIFBaRuSJyQESm58D2monIlpzQlg0NF4nIShFJEpEHg6hjtIg8E6z6g0H6\n8y8ia0TkuiBpSRWRC4JRd25SINgCsoOIHATSPjQoBhwHTjnL+hljRgRLWz7iduA8oLTJuY8+cvTj\nERFJBS40xmzwc5UngIXGmAY5qSOzGGPuC2b9QeT0+TfGXBoKOvISYd3SN8aUMMZEG2OigU1AG49l\nHwRbX1YQkcgg15/ZayIW+DMrDj8n9tVPvZnVFgv8lgU5QT9/GRHq+kIBj2MkQRWSWxhj8sQf8A9w\nfbplQ4D3nOlYIBXoCWwG9gL9gIbAL8A+4M106/cC1jq2XwBVXepO23YfYKvz95hHuQCDgL+A3cCH\nQKl06/bCPrgSXOr4D7AN+Be4x1nnAqdsEdDLw7YHsNhjvhbwlbMf64BOHmUTgVHA58BB4HFgByAe\nNrcCq7xoisf+ujoBJDu6BBgMbHS2MwmI9ndfgWbAFuAp51htALr70Hu9r/0HEp06DzkaOznL2wIr\ngf3AEuBSZ/k3QApw1LG/EIgGpgC7sNfZM+nqWgKMBPYAzznL+zjXTjKwBqjvLK8IzHS29TcwwMc1\nPdFje2nH5VFgJ/Ya6+lhWxh41Tnu+4FvgUJuxxxoDHzn2K4Emnlsq6eH9r+Avh5lMcBcZ729QKJH\nmeu+AVcCPwFJwHbgFZd9bgZs9nZfY+/n6cBkR9tq4PJM1P+9o3sr8CZQwKM8Fbgf+BP422PZBVgf\n4dc9EQ5/QReQYzvi7vSnONNpF/8oIApogb2xP3Eu5ErOzdTUsW/vXAAXYX8RPQ1851J32ranOjff\npc6Fl3axDnQuuIpAQWA0MC3dupOAIkAhL9u/yblRajs2U7FhLF9O/1tnuij2IXc31iFfhnWmtZzy\nic6N0NiZL4R1Uq08tvcJ8LDLvp8+xs58L+e4xTp1f+zlHPja12bASeBl51hdh3XYNX3odd1/Zz4V\nqO4x38A51w2dY3KXc/0UdDmeU4BZzv7EAn8A93jUdRLrMCIcPZ2wDvpyx+YCoIpT13LgGSASqIZ1\nqje6HNv0Tv+kc7wjgdbAYaCkU/42sBCo4NTT2Dl+5xxz7LW+J+0cAzc48zHOfGugmjPd1Kkn7aH1\nAvYeinB0NHGW+9w37PV/h8c12chlnzNy+keAVk59LwBL/az/cqCRY1cV+0vuoXTXyHygFM51ydkN\nK7/viVD/C7qAHNsR/5z+KaCCR/kezm71zky7EIB5ODe2Mx/hXPxVvNSddmPV9Fj2EjDOmV4LNPco\nq4htHUd46Ir1sW/vAi94zNfEf6ffGY/WmLPsHeBZZ3oiMCld+RPA+850GWe/y7toS+/0vwb6e8xf\nlMl9bebYF/ZYNh2nde2i1x+nf4HH/ChgaLpt/M6ZB/7p7Tm6jwMXe9j2xcb80+ramG5bX+KlBY91\nOultBwHvuhyL9E7/MBDhUb6TM47sCM6vFS/X5lnH3Dm/k71ovstFx6y0/QGGOvM1MrNv2F9cQ3Ae\nLBmcf19O/yuPstrAYWf6qkwe24HAx+mukWbpbDydvt/3RKj/hfWL3Cyyy2P6KPbG8Zwv7kzHAq+L\nyKvOvGBjw+djW3HpMdjQSxqbsC3+tG3Ncl4opm3rJFDew95z3fRUwrZiPLftb7wxFmgsIvs86o7E\ntl7TSL8/7wNrRaQI9qHxrTFmJ/5RydHnqbUA/u8rwH5jzLF026jkQ29miQXuFpEBzrxgW8WVvNiW\nxerfnE7P+T70VMGGF7zVe366cxGBDcX4w15jTKrH/BHs9VoW24L39aLa85jHAp1FpJ2HjgLYXwqI\nSGvgv5z5lVsE+NWxfRkb1vtKRAy2YfOSH/vWCxgG/C4iG7APs8/93G9PdnhMHwEKO+91qvqqX0Rq\nYkNwDZ39KQD8nG7bvq7L7NwTIUV+dPr+sgUYbvx/ISzYm/1PZ74qNgYP1mH0MsYsPWclkVhn0vjY\n9nZn22nEprM/jP3JnEYFj+kt2DhuKx/bP6tuY8w2EVkK3AbciW0Z+8s2R5+n1pPYh2vaPvjaV4DS\nI
lLEGHPUma+Kjd961Yvv/ffGFuB541/vrj1Y/bHYXwM401t96NkC1HCpd4Mx5mI/6s0Me4BjTp2r\nXWw8NW7B/jrrl95IRKKwv3jvBOYYY1JFZBZOI8MYcwj73udxEbkEWCQiy8hg34wxfwPdnTpuA2aK\nSBmPc5xdMjq2o4EVQBdjzBERGYi9vs+S6bbxbN4TIUVY997JApl5G/8O8LRzYSMiJUXk9gzWeVZE\niohIHexLzQ+d5WOAF0SkqrOt80Tklkzo+gjoKSK1RaQothXmySrgVqfuC4F7Pco+Ay4SkTtFpICI\nFBSRhiKSkeN5D/uT9lJs/NJfPgAeEZFqIlIceB740KOF6s85EGCoo7Up0AZ7DNzwtf9gW4ee/a3H\nAf1FpBGAiBQTkZtFpFj6DTu6PwKeF5HizkP6EezxcWM81ile7my/hohUAZYBB0XkCef7hkgRqSMi\nDX1sK0OMjTlMBEaKSEURiRCRxiJS0DFJf8zfB9qJSEvHtrDTP74S9n1XFLDHcfitgZZpK4pIGxFJ\ne6AdxL70Ts1o30TkDhEp66yXhHWwnr9askravmV0bEsAyY7DrwVkpTtsVu+JkCIvOf2MWo/ebFzn\njTGzgReBD0XkAPbn7U0ZbD8R+/JoAfB/xphvnOWvA3OwP4mTsC+1Gvmr3RjzJfAa9uf3n9geJp78\nD9sa3YG9+d/3WPcQ9qbtim2Fb3P2q1AG+zIL26L9JF2oJSMmYG+Ob7EhjiPAQ56748c2tmNf1m5z\nttXPGLPex/qu++8QD0wRkX0icrsx5mds75q3nHDAn9jYvJvGh5z92ODs1/vGmIlu4o0xM7EPu2ki\nkow9lmWcB0hboD42Vr0L+wCKdttWBnjqfBzbyv8J26vmRc7c3+l/yf2L7ajwNPal/iZn/QjnenkI\nmOEcm67YazeNmsDXzjcy3wFvG2MS/di3m4DfnOPxP2yL+3gm99G13I/6HwfucOofw5kGma960i/L\n6j0RUojzYkLJBk7rbwO290dOtF78qTOzHxxlpY607noLc6sORQkn8sI9kZda+sEmT33I4cRdU8P5\n4laUnCSv3BP6IjfnCPRPplyrT0QWYbvD3ZlbdShKOJGX7gkN7yiKouQjNLyjKIqSjwh5px8qaW7D\nFRGpIiLJIpKn3jmEAyIyRER8de3M7vYXiUgvZ7q7iHzpUZYn0wIr2SfknT5n0tyWNMa8FWwx4YYx\nZouxWUcNnO0olICQpfipiPQQkcV+V2LMNGOMZ5dijdsGAedBPyVjy+ARDk4/ljya5tYb4aY53PSG\nEWlpP7Kzfr4ju9djvrieg538x9cfOZTmNt02hwAzsB9nJGNz2tR1yh4HZqazfwP4nzPdE/vBUbLz\nv1u6et8EDmATrF3vsY2euKeqbYb9hPwJ7EdJk8k4je5E4C3s17bJwFI8skim0x+L/fIxAhjuHM8j\nznpvuKxzNzZF725smuT0Sa9mYD+aOoDNqRKF/XhsKzZ/yf84k7EyAejoTDdxtLR25q8HVjrTNRzb\nA865/cBFWyGn7j3YD7h+BM5zyipiPyTai/3gqne68/6Rs24yNp12TWxSrp3YD5RaeNhHY7+s3eac\ni2Gc6fiwEWjgTN/h7FNtZ74X9uOdtDp9pQJ+0rke0tIvd3CW18Je8yexX73uczkWiziTGK4HZ6fT\n9kwWdi02Fch1Htv3mmrbSx09cbl2vdi6nkPgRqeu/dj7JMFD+xCcFOjpr9ms3D/O8vSps+v60O0t\nrfI12K98066xqz3svV5n2Oyfx52/gzjXdqj9BV1AhgKzmebWy/aGOCelIzbx2GPYD6sisTlbDnIm\n/3sk1iHUd+pLwn4QBTaBWO109T7krNPZufDTcub7SlXbzFn3BWzSr0JknEZ3ItYhX+Hs5/s4qZq9\n7G8sNstihLfj6cX+EucYXI3t0vuyc7yuT3f82jnzhYHnsF8Zxzh/3+FkscRmZXzdmX4KWA+M8ChL\ne6BOA55ypqOAa1z09XVuuELY1mwDoLhT9i3WoRTEppDeBcR56D6CTakdgXXEGxxNkUBvbO6WtHpm\nYfOrFMYmNPsB6OOUTQYecabHOPvUz6NsYLo6z0kF7JTfhpOpEZuO+ZDHfA88MoVmdG+kt+dMLvib\nsA+0K5zl3lJt78JJte2lDtdr14ut13PoXBPJnLnnHsZmUvV0+p6ZWtNfs5m9f3ymzvaiOy2tckln\n/dLY8TW6O9dKV2e+tJ/X2RRv9YTKX9AFZCgwm2luvWxvCPC9x7xgW3NpecE/B+51ptsCazxuln3O\nhVs43TZ7AP+mW/YjTv5wLxo8U9U2wybLKuhR3gyXNLrO9ERgrEdZa2CtS12ZdfrPAlM95otwrtNP\nSLfOX5yda7wljgPFtuZXOdNfYFvC3zvzCZxp3U7G5js6P4Pzdw9eWm5AZezNX9Rj2QvABA/d8z3K\n2mIdUVrrvbhznKKxD/RjeDQasDd+2nXWC5jtTK915tPGR9jIGYc0BJdUwC77tpIzD9OccPqDsM6u\ntsdyn6m2/bgfT1+7Xsq8nkOs0/0+3bIt+On0s3D/+Eyd7WV7Z6VVxvbF/yGdzffYB6U/11lIO/1w\niOl7kpU0t944bWPsmfqXM2l1p3DmA4w7cBJrGWOOAF2wiZq2ix0M3DNpmWfWxTRdlcCmqhWRpSKy\nV0T2Y510WQ/b3caYk+nWd0ujm0b6FLOeZdmhEmcfn6PYn7GepD/GlTj3nKQdz6XYhG/lsK2iKUAV\nEYnB5h9KS737H+xDfZmIrBaRe1z0TcG2yj4UkX9F5EUnDlsJGwY5kk6H57WRPo32Huf8p80L9jhW\nxbbitjv5evZjnVnaOUsEmopIBUfzR8C1TjqOaGPMKo963FIBIyJ3Oz3T9jt11OHs6yK7DAQ+Msas\n81gWi5Nq22PfuuOSmdSPa9cTt3N41jXl4Hd67CzcP7HAY+n2sTLeU2en4ZlWOX16cDhzLflznYU0\n4eb0PdPcphGL7zS33jidptjpyliZM2mQZwP1xGbKbIsdpcpu2JgFxpiW2BvkD2CsxzbTn/SqwDY5\nk6r2/7Cx59LYFq/nizZ/NOcUGdW1HXs8ABCbPzwmg21s5dxzsg1OPzR+xjqgNcaYFOyD4FHgL2PM\nPsdulzGmrzHmfKA/MMpbl0NjzCljzDBjTB1s3LUdtgW2DSiTLlNmVc59GPvDFmzrMcYYU8YYU9oY\nU8oYU8/R8Df2ITEA27o+hHXufbG/QjJEbMbVscD9zvZLYzsspF0X2b0mDDZk1FFEPBPepaXaLuOx\nb9HGmAe8aPTn2j1Tofs53I49F554pgpPnxq7YiY1pD9WaamzPfexuDFmujfdXraxDTvylidp11JG\n11kg7+UsEVZO32Qtza03rhCRDk4L8RHsDf6DU8dx7BB/04Afjc1IiIiUE5FbxKY2PomNv3q2xMuJ\nyACx6Ys7YV+WfU4GqWoDhOcNspOz0wynZyY27W5aat54P7
b/ITBYRMo66XOf5exz8i3wILaFDDas\n4zmPiNwuImkPzgPYY3tO8joRiRORS53W8iHsuTjlnKfvgREiUkhE6mFTLGe6n7wxZgf2Ref/RKSE\nWC4Qkes8zBIz2icX0s5FMWf/9ohNb3wPZwbdAXueKsuZ9MiZJS1seQPwkIj0d5a7pdqu5WUbmbp2\nfZzDz4FL0u45sbnsPQfVWQVc53xTUhIblsqSBge/U2e7MA+oKSJdHb1dsKG5uX5cZzuBaqH8XUw4\nOP30T85Mpbl1YQ42VLMfG8LpaIw55VE+GajL2aNLRWBbp1uxvziu4+yc3D9ie4Pswfb0uM0Yc8Bk\nnKrWX7LTgvBc93Wgk/NT+bVzDI1Zi23BTsc6jWTsiypfaXCHY3tB/YrtFbMcm1o4jURs2OTbdPOe\nDvJK4EexqW9nY4et3OilrgrYB1MStmW8iDOplLsB1R3dH2Pj1It86E6P53G6G+ts1mLf5czg7BCI\n2z5lNAqWAXBCLq9iGxs7sKEdz18JC7H7t0NEdqXfiBe9bvVswb68flJEehn3VNtR52wg89eu13No\njNmL/dXxEvb+qIF92Z9Wz9fY6+1XbGroudnQgMk4dfY5q6Rbfx/2V/7jjt7HgTbGmP2Oia/rbAb2\ngbtXRDxHuwsZ/M6947SslmNfWN4iIqWxJyoW+/KqszEmKbeE5hQiMgQ7vufdPmyqYLuXVXAuuoy2\n2QP78ve6jGzDDad1dADbayl9nFNRsoTYBGbvGWMmBFtLfiMzLf2B2FZPGoOAr40dnmwhtutb2OM8\n3B7DjvaUocPPi4hIW7GjUBXDtkZ/VYevKHkDv5y+iFQGbsZ+rJJGe2wYBOd/h5yVFniceH0Stpvh\nkCDLCSbtsT9d/8X+FO8aXDlKHiTkX3jmVfwK74jIDGyMtiTwmBPe2e+8SU+z2WeMKZN7UhVFUZTs\nkmFLX0TaADudvse+3kjrk1tRFCXE8WfkrCbALSJyM/brzBJi08XuEJHyxpidzkcqXnsYiIg+DBRF\nUbKAMSbHu35m2NI3xjxtjKlqjLmAM5+i34XtVtXTMeuBj25Uwfrc2NffkCFDgq4hL+kMJ63hojOc\ntKrOzP2lpqbyweoPiHkpxn4J4+0vl8jOGLkvAh+Jzc2+CZvTQ1EURfHBrsO7uP/z+/l43ceuNuWK\nlWOX9+BJtsnUx1nGmERjzC3O9D5jTAtjzMXGmJbGmAO5olBRFCWPMOO3GdQZVcenw+9Spwu/3Z+l\nIUT8Ijst/bAmLi4u2BL8Ilx0QvhoDRedED5aVadvdh/ezQPzHmDG2hmuNmWLlmV0m9HcfsntuarF\n7y9ys1yBiMntOhRFUUKVT9Z9Qv/P+rP7yG5Xm9svuZ23b36bcsXKnV4mIphceJEbtJZ+tWrV2LRJ\nP/IMNrGxsWzcuDHYMhQlz7H3yF4GfDGAD9Z84GoTUySGUW1G0blO4F6JBq2l7zzFcrVuJWP0PChK\nzjPn9zn0+6wfOw/vdLXpWKsjo9uMpnzx8l7L81xLX1EUJa+x7+g+HvriIaaunupqU7pwad66+S26\nXdqNYGRgVqevKIqSA8z9Yy59P+vLjkM7XG1uufgW3mnzDhVLVHS1yW3U6SuKomSD/Uf38/D8h5ny\nyxRXm1KFS/Fm6ze5o+4dQWnde6JOX1EUJYvMWz+PPnP7sO3gNlebNjXbMLbdWCqV8DVEb+AIh5Gz\n8h0jRoygb9++wZahKIoLSceS6DWnF22mtXF1+CULlWRi+4nM7TY3ZBw+aO+doJOYmMidd97Jli1b\nglK/ngdFyRzz/5pP77m9+Tf5X1ebmy68iXHtxlE5unKW68k3vXdyK9wVqn7NGBP0GJ+iKBmTfDyZ\nx+Y/xviV411togtF879W/+Oe+veE7H2t4R0XXnrpJSpXrkx0dDS1a9dm0aJFDB06lC5dutCjRw+i\no6OpW7cuK1asOL1O9erVefXVV7nssssoXbo03bp148SJE651HDlyhJtvvplt27ZRokQJoqOj2bFj\nB0OHDuWuu+4CYNOmTURERDBp0iSqVq1KTEwMY8aMYfny5Vx22WWUKVOGAQMGnLXdCRMmcMkllxAT\nE0Pr1q3ZvHlz7hwkRcknLPh7AZeOutSnw29ZoyVr7ltDrwa9Qtbhgzp9r/z555+8/fbb/PzzzyQn\nJzN//nyqVasGwNy5c+nevTtJSUm0a9eOBx544Kx1Z8yYwVdffcU///zDL7/8wqRJk1zrKVq0KF98\n8QWVKlXi4MGDJCcnU6FCBYBzLpply5bx119/MX36dB5++GFeeOEFFi5cyJo1a/joo49YvHgxAHPm\nzOHFF19k9uzZ7N69m6ZNm9KtW7ecOziKko84ePwg/T/rT8v3W7Il2XsItnhUcca2HcuXd3xJlZJV\nAqww86jT90JkZCQnTpxgzZo1pKSkULVqVapXrw7AtddeS6tWrRAR7rrrLn799dez1h04cCDly5en\nVKlStGvXjlWrVmVbj4jw3//+l6ioKFq0aEGxYsXo1q0bMTExVKpUiaZNm7Jy5UoAxowZw1NPPcVF\nF11EREQEgwYNYtWqVUF7Z6Ao4crCfxZSd3Rdxvw8xtXmhuo3sOa+NfS5ok9It+49UafvhRo1avDa\na68RHx9PuXLl6N69O9u3bwc43RIH21I/duwYqampp5eVL1/+rPJDhw7liKZy5c4kYipSpMhZ9RQp\nUuR0PZs2bWLgwIGUKVOGMmXKEBMTg4iwdevWHNGhKHmdQycO8cDnD3DDlBvYlOQ9P1ixgsUY3WY0\nC+5aQGyp2AArzB4h5/SNyZ2/zNK1a1cWL158Oh7+5JNP5vCeWnK6dVClShXGjBnDvn372LdvH/v3\n7+fQoUM0btw4R+tRlLxI4sZE6o2ux6jlo1xtmldrzur7VtO/Yf+wad17EnJOPxT4888/WbRoESdO\nnCAqKooiRYoQGRnp1Ta73R3Lly/P3r17SU5OdrXJTB39+/fnhRdeYO3atQAkJSUxc+bMbGlUlLzO\n4ROHeeiLh4ibHMc/B/7xalO0YFHeav0WX9/9NdVLVw+wwpxDnb4Xjh8/zqBBgzjvvPOoVKkSu3fv\nZsSIEV5tPZ/0WXnqX3zxxXTr1o0LLriAMmXKsGPHuXk70m/X13yHDh0YNGgQXbt2pVSpUtSrV48v\nv/wy07oUJb+weNNiLnvnMt5c9qarzXWx1/Fr/195oNEDREh4u039OCufo+dBya8cOXmEZ755htd/\nfB2D93ugSIEivNjiRR5s9GDAnX3QPs4SkULAt0CUYz/TGDNURIYAfeD06L1PG2O0SakoSsjz3ebv\nuGfOPazft97VpkmVJkxsP5GaMTUDqCz3yfDRZYw5DjQ3xjQA6gOtRaSRUzzSGHO586cO34URI0ac\n/vjK869NmzbBlqYo+YqjJ4/y+FeP03RiU1eHX7hAYUa2HEliz8Q85/Ahk+EdESmKbfXfB9wMHDLG\nvJrBOhreCWH0P
fN/hFt6Q/nV7obJlYj4YhTta7Wnf39o0UJb9Yqi\n5F/COw3DvVdDRKprcbHf+zLwkpd44OtSVKoUEEWKoij5kmw5fRG5CXgNiADeNca85NXQxeEXO3Yh\nz9QbxxOD44h0i+8riqIoOUaWAygiEgG8BbQC6gDdRKSWX+uaSO6r+yS7n/uVp7oFx+EnJCQEvtIs\nEC46IXy0hotOCB+tqjN8yE7UvBGw3hizyRhzEvgQaJ/RSvXLN2B5v2WMuvVFihQM3igl4XLyw0Un\nhI/WcNEJ4aNVdYYP2XH65wOeCXL+dZZ5pXCBwrx4w4ss6/Mjl1e8PBvVKoqiKFklIC9ym8U2Y1y7\ncdSMqRmI6hRFURQXstxlU0QaA/HGmJuc+UGASf8yV0Ryt0+ooihKHiWk+umLSCTwB3ADsB1YBnQz\nxvgY8URRFEUJJlkO7xhjTonIg8BXnOmyqQ5fURQlhMn1L3IVRVGU0CGsEx2IyLsislNEfvVYVlpE\nvhKRP0RkvoiU9Ch7SkTWi8g6EWnpsfxyEflVRP4Ukdc8lkeJyIfOOktFpGoWdVYWkYUi8puIrBaR\nh0JYayER+VFEVjpah4SqVmdbESKyQkQ+DVWdIrJRRH5xjumyUNXpbKukiMxw6v5NRK4KNa0icpFz\nLFc4/5NE5KFQ0+ls5xERWePUMdXZbnB1GmPC9g+4FqgP/Oqx7CXgCWf6SeBFZ/oSYCU2pFUN+Isz\nv3R+BK50pucBrZzp+4BRznQX4MMs6qwA1Hemi2PfhdQKRa3O+kWd/5HAD9hvMkJV6yPA+8CnIXz+\nNwCl0y0LOZ3O+pOAe5zpAkDJUNXqbCMC2AZUCTWdQCXn3Ec589OBHsHWGTSHnVN/QCxnO/3fgfLO\ndAXgd2d6EPCkh90XwFWOzVqP5V2B0c70l8BVznQksDuHNM8GWoS6VqAosBy4MhS1ApWBBUAcZ5x+\nKOr8B4hJtywUdUYDf3tZHnJaPbbdElgcijqxTn8TUBrryD8lBO77sA7vuFDOGLMTwBizAyjnLE//\nMdlWZ9n52A/L0vD8yOz0OsaYU8ABEcnWoIwiUg376+QH7IkPOa1OyGQlsANYYIz5KUS1/g/4D2A8\nloWiTgMsEJGfRKR3COusDuwRkYlO6GSsiBQNUa1pdAHSBtYOKZ3GmG3Aq8Bmp84kY8zXwdaZF51+\nekzGJn6TrT6zIlIcmAkMNMYc4lxtIaHVGJNqjGmAbUk3EpE6hJhWEWkD7DTGrMpg/VA4pk2MMZcD\nNwMPiEhTQux4OhQALgfedvQexrY+Q1ErIlIQuAWY4SwKKZ0iUgqbmiYW2+ovJiJ3eNEVUJ150env\nFJHyACJSAdjlLN+KjfulUdlZ5rb8rHXEfpcQbYzZlxVRIlIA6/DfM8bMCWWtaRhjkoEE4KYQ1NoE\nuEVENgAfANeLyHvAjhDTiTFmu/N/Nza014jQO55gW5BbjDHLnfmPsQ+BUNQK0Br42Riz5//bu3vW\nqIIwDMP3W0j8QBCx9otgpVgIIlpYRP0DImih0X/hR+FvCPaCrYWiNiKSWhQlJJoUgoKCjSCkTBHG\nYmbxaCUuZMd97wsCZ4fdsw8b8uzZmZOz7XZvOc8Bn0opP9pR+GPg9KRzTkPpB7+/uz0FrrfteeDJ\nYPxyW+0+BMwCr9vHq/WIOBkRAVz74zHzbfsSsDhGzvvUebmFnrNGxL7R2QQRsQM4D6z1lrWUcruU\nsr+Ucpg6x7lYSrkKPOspZ0TsbJ/wiIhd1DnoFTp7PQHalMPXiDjShuaADz1mba5Q3/BHesv5BTgV\nEdvb/ueA1YnnHGcRZdI/1Lm8b8BGe4FvUBdNXlLPkHkB7Bnc/xZ1RXwNuDAYP0H9Q/wILAzGZ4CH\nbfwVcPAfc54BNoEl6ur8O+rR894Osx5r+ZaAZeBOG+8u62B/Z/m1kNtVTuo8+ej3vgLc7DHnYF/H\ngTct8yPq2TvdZaWeZPAd2D0Y6zHn3facy8ADYNukc/rPWZKUyDRM70iS/pKlL0mJWPqSlIilL0mJ\nWPqSlIilL0mJWPqSlIilL0mJWPpKJyIORMRqu4rk+4h4HhEzk84lbQVLX1nNAvdKKUeBdeDihPNI\nW8LSV1afSykrbfst9ZuKpKln6SurjcH2JvVa8tLUs/SV1VhfiCP9ryx9ZeXlZZWSl1aWpEQ80pek\nRCx9SUrE0pekRCx9SUrE0pekRCx9SUrE0pekRCx9SUrkJ1b9/UB1A1asAAAAAElFTkSuQmCC\n", 607 | "text/plain": [ 608 | "" 609 | ] 610 | }, 611 | "metadata": {}, 612 | "output_type": "display_data" 613 | } 614 | ], 615 | "source": [ 616 | "df = pd.DataFrame(snn_results)[['n', 'snn_time', 'bf_time']]\n", 617 | "df.plot(x='n', linewidth=5,\n", 618 | " title='Time per query for bruteforce increases linearly\\nfor pysparnn it grows somewhat like a square root')" 619 | ] 620 | } 621 | ], 622 | "metadata": { 623 | "anaconda-cloud": {}, 624 | "kernelspec": { 625 | "display_name": "Python 2", 626 | "language": "python", 627 | "name": "python2" 628 | }, 629 | "language_info": { 630 | "codemirror_mode": { 631 | "name": "ipython", 632 | "version": 2 633 | }, 634 | "file_extension": ".py", 635 | "mimetype": "text/x-python", 636 | "name": "python", 637 | "nbconvert_exporter": "python", 638 | "pygments_lexer": "ipython2", 639 | "version": "2.7.12" 640 | } 641 | }, 642 | "nbformat": 4, 643 | "nbformat_minor": 0 644 | } 645 | -------------------------------------------------------------------------------- /examples/pysparnn_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright 2016-present, Facebook, Inc. 2 | # All rights reserved. 
3 | 4 | # This source code is licensed under the license found in the 5 | # LICENSE-examples file in the root directory of this source tree. 6 | import numpy as np 7 | 8 | 9 | # code that will measure query time and recall 10 | def recall(query, full_set): 11 | ret = [] 12 | for r_items, t_items in zip(query, full_set): 13 | result = 0.0 14 | for r in np.unique(r_items): 15 | result += 1 if r in t_items else 0 16 | if len(t_items) > 0: 17 | ret.append(result / len(t_items)) 18 | else: 19 | ret.append(0.0) 20 | return np.array(ret) 21 | -------------------------------------------------------------------------------- /examples/sparse_search_comparison.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "# Copyright 2016-present, Facebook, Inc.\n", 12 | "# All rights reserved.\n", 13 | "\n", 14 | "# This source code is licensed under the license found in the\n", 15 | "# LICENSE-examples file in the root directory of this source tree." 16 | ] 17 | }, 18 | { 19 | "cell_type": "markdown", 20 | "metadata": {}, 21 | "source": [ 22 | "# Evaluate pysparnn on 20 Newsgroups data" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": 1, 28 | "metadata": { 29 | "collapsed": true 30 | }, 31 | "outputs": [], 32 | "source": [ 33 | "import numpy as np\n", 34 | "import time\n", 35 | "from scipy.sparse import csr_matrix\n", 36 | "from sklearn.datasets import fetch_20newsgroups" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": 2, 42 | "metadata": { 43 | "collapsed": false 44 | }, 45 | "outputs": [], 46 | "source": [ 47 | "# make sure you run 'python setup.py install' first!\n", 48 | "import pysparnn.cluster_index as ci" 49 | ] 50 | }, 51 | { 52 | "cell_type": "markdown", 53 | "metadata": { 54 | "collapsed": false 55 | }, 56 | "source": [ 57 | "# Get data" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": 3, 63 | "metadata": { 64 | "collapsed": false 65 | }, 66 | "outputs": [], 67 | "source": [ 68 | "dataset = fetch_20newsgroups(subset='all', shuffle=True)" 69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": 4, 74 | "metadata": { 75 | "collapsed": false 76 | }, 77 | "outputs": [ 78 | { 79 | "name": "stdout", 80 | "output_type": "stream", 81 | "text": [ 82 | "Num docs: 18846\n", 83 | "Avg doc length: 283.6560012734798\n", 84 | "Num unique words: 386410\n" 85 | ] 86 | } 87 | ], 88 | "source": [ 89 | "print('Num docs: {}'.format(len(dataset.data)))\n", 90 | "print('Avg doc length: {}'.format(np.mean([len(x.split()) for x in dataset.data])))\n", 91 | "words = set()\n", 92 | "for doc in dataset.data:\n", 93 | " words.update(doc.split())\n", 94 | "print('Num unique words: {}'.format(len(words)))" 95 | ] 96 | }, 97 | { 98 | "cell_type": "markdown", 99 | "metadata": {}, 100 | "source": [ 101 | "## Turn documents into vectors" 102 | ] 103 | }, 104 | { 105 | "cell_type": "code", 106 | "execution_count": 5, 107 | "metadata": { 108 | "collapsed": false 109 | }, 110 | "outputs": [], 111 | "source": [ 112 | "from sklearn.neighbors import LSHForest, NearestNeighbors \n", 113 | "from sklearn.feature_extraction.text import TfidfVectorizer\n", 114 | "\n", 115 | "tv = TfidfVectorizer(decode_error='ignore')\n", 116 | "\n", 117 | "features = csr_matrix(tv.fit_transform(dataset.data))\n", 118 | "\n", 119 | "doc_index = 
np.array(range(len(dataset.data)))" 120 | ] 121 | }, 122 | { 123 | "cell_type": "code", 124 | "execution_count": 6, 125 | "metadata": { 126 | "collapsed": false 127 | }, 128 | "outputs": [], 129 | "source": [ 130 | "test_features = features[:200]\n", 131 | "train_features = features[200:]" 132 | ] 133 | }, 134 | { 135 | "cell_type": "code", 136 | "execution_count": 7, 137 | "metadata": { 138 | "collapsed": true 139 | }, 140 | "outputs": [], 141 | "source": [ 142 | "from sklearn.cluster import KMeans" 143 | ] 144 | }, 145 | { 146 | "cell_type": "code", 147 | "execution_count": 10, 148 | "metadata": { 149 | "collapsed": false 150 | }, 151 | "outputs": [ 152 | { 153 | "data": { 154 | "text/plain": [ 155 | "(18646, 173762)" 156 | ] 157 | }, 158 | "execution_count": 10, 159 | "metadata": {}, 160 | "output_type": "execute_result" 161 | } 162 | ], 163 | "source": [ 164 | "train_features.shape" 165 | ] 166 | }, 167 | { 168 | "cell_type": "code", 169 | "execution_count": 15, 170 | "metadata": { 171 | "collapsed": true 172 | }, 173 | "outputs": [], 174 | "source": [ 175 | "from sklearn.cluster import KMeans\n", 176 | "from sklearn import datasets\n", 177 | "\n", 178 | "np.random.seed(5)\n", 179 | "\n", 180 | "centers = [[1, 1], [-1, -1], [1, -1]]\n", 181 | "iris = datasets.load_iris()\n", 182 | "X = iris.data\n", 183 | "y = iris.target" 184 | ] 185 | }, 186 | { 187 | "cell_type": "code", 188 | "execution_count": 22, 189 | "metadata": { 190 | "collapsed": false 191 | }, 192 | "outputs": [], 193 | "source": [ 194 | "k_means = KMeans(n_clusters=int(np.sqrt(train_features.shape[0])), max_iter=20)" 195 | ] 196 | }, 197 | { 198 | "cell_type": "code", 199 | "execution_count": 23, 200 | "metadata": { 201 | "collapsed": true 202 | }, 203 | "outputs": [], 204 | "source": [ 205 | "import time" 206 | ] 207 | }, 208 | { 209 | "cell_type": "code", 210 | "execution_count": 24, 211 | "metadata": { 212 | "collapsed": false 213 | }, 214 | "outputs": [ 215 | { 216 | "data": { 217 | "text/plain": [ 218 | "1223.7469282150269" 219 | ] 220 | }, 221 | "execution_count": 24, 222 | "metadata": {}, 223 | "output_type": "execute_result" 224 | } 225 | ], 226 | "source": [ 227 | "t0 = time.time()\n", 228 | "\n", 229 | "k_means.fit(train_features)\n", 230 | "\n", 231 | "time.time() - t0" 232 | ] 233 | }, 234 | { 235 | "cell_type": "code", 236 | "execution_count": 25, 237 | "metadata": { 238 | "collapsed": false 239 | }, 240 | "outputs": [ 241 | { 242 | "data": { 243 | "text/plain": [ 244 | "16.347729921340942" 245 | ] 246 | }, 247 | "execution_count": 25, 248 | "metadata": {}, 249 | "output_type": "execute_result" 250 | } 251 | ], 252 | "source": [ 253 | "t0 = time.time()\n", 254 | "snn = ci.MultiClusterIndex(train_features, doc_index, num_indexes=2)\n", 255 | "time.time() - t0" 256 | ] 257 | }, 258 | { 259 | "cell_type": "markdown", 260 | "metadata": {}, 261 | "source": [ 262 | "## Create an answer key" 263 | ] 264 | }, 265 | { 266 | "cell_type": "code", 267 | "execution_count": 8, 268 | "metadata": { 269 | "collapsed": false 270 | }, 271 | "outputs": [], 272 | "source": [ 273 | "knn = NearestNeighbors()\n", 274 | " \n", 275 | "knn.fit(train_features)\n", 276 | "\n", 277 | "# get top 3 nearest neighbors for each document\n", 278 | "answers = knn.kneighbors(test_features, 3, return_distance=False)" 279 | ] 280 | }, 281 | { 282 | "cell_type": "markdown", 283 | "metadata": {}, 284 | "source": [ 285 | "## Build models to compare" 286 | ] 287 | }, 288 | { 289 | "cell_type": "code", 290 | "execution_count": 9, 291 | "metadata": { 292 
| "collapsed": false, 293 | "scrolled": false 294 | }, 295 | "outputs": [], 296 | "source": [ 297 | "snn = ci.MultiClusterIndex(train_features, doc_index, num_indexes=2)" 298 | ] 299 | }, 300 | { 301 | "cell_type": "code", 302 | "execution_count": 10, 303 | "metadata": { 304 | "collapsed": false 305 | }, 306 | "outputs": [ 307 | { 308 | "data": { 309 | "text/plain": [ 310 | "LSHForest(min_hash_match=4, n_candidates=50, n_estimators=10, n_neighbors=5,\n", 311 | " radius=1.0, radius_cutoff_ratio=0.9, random_state=None)" 312 | ] 313 | }, 314 | "execution_count": 10, 315 | "metadata": {}, 316 | "output_type": "execute_result" 317 | } 318 | ], 319 | "source": [ 320 | "lshf = LSHForest()\n", 321 | " \n", 322 | "lshf.fit(train_features)" 323 | ] 324 | }, 325 | { 326 | "cell_type": "markdown", 327 | "metadata": {}, 328 | "source": [ 329 | "## Compare results" 330 | ] 331 | }, 332 | { 333 | "cell_type": "code", 334 | "execution_count": 11, 335 | "metadata": { 336 | "collapsed": false 337 | }, 338 | "outputs": [], 339 | "source": [ 340 | "import pysparnn_utils" 341 | ] 342 | }, 343 | { 344 | "cell_type": "code", 345 | "execution_count": 12, 346 | "metadata": { 347 | "collapsed": false 348 | }, 349 | "outputs": [ 350 | { 351 | "name": "stdout", 352 | "output_type": "stream", 353 | "text": [ 354 | "Percent of time snn returns a top 3 result: 0.66\n" 355 | ] 356 | } 357 | ], 358 | "source": [ 359 | "t0 = time.time()\n", 360 | "\n", 361 | "results = snn.search(test_features, return_distance=False, num_indexes=1)\n", 362 | "\n", 363 | "print('Percent of time snn returns a top 3 result: {}'.format(pysparnn_utils.recall(answers, results).mean()))\n", 364 | "\n", 365 | "snn_time = time.time() - t0" 366 | ] 367 | }, 368 | { 369 | "cell_type": "code", 370 | "execution_count": 13, 371 | "metadata": { 372 | "collapsed": false 373 | }, 374 | "outputs": [ 375 | { 376 | "name": "stdout", 377 | "output_type": "stream", 378 | "text": [ 379 | "Percent of time lsh returns a top 3 result: 0.143\n" 380 | ] 381 | } 382 | ], 383 | "source": [ 384 | "t0 = time.time()\n", 385 | "\n", 386 | "results = lshf.kneighbors(test_features, return_distance=False)\n", 387 | "\n", 388 | "print('Percent of time lsh returns a top 3 result: {}'.format(pysparnn_utils.recall(answers, results).mean()))\n", 389 | "\n", 390 | "lsh_time = time.time() - t0" 391 | ] 392 | }, 393 | { 394 | "cell_type": "code", 395 | "execution_count": 14, 396 | "metadata": { 397 | "collapsed": false 398 | }, 399 | "outputs": [ 400 | { 401 | "data": { 402 | "text/plain": [ 403 | "5.112146987324278" 404 | ] 405 | }, 406 | "execution_count": 14, 407 | "metadata": {}, 408 | "output_type": "execute_result" 409 | } 410 | ], 411 | "source": [ 412 | "# LSH is x times slower than snn\n", 413 | "lsh_time / snn_time" 414 | ] 415 | } 416 | ], 417 | "metadata": { 418 | "anaconda-cloud": {}, 419 | "kernelspec": { 420 | "display_name": "Python 3", 421 | "language": "python", 422 | "name": "python3" 423 | }, 424 | "language_info": { 425 | "codemirror_mode": { 426 | "name": "ipython", 427 | "version": 3 428 | }, 429 | "file_extension": ".py", 430 | "mimetype": "text/x-python", 431 | "name": "python", 432 | "nbconvert_exporter": "python", 433 | "pygments_lexer": "ipython3", 434 | "version": "3.6.0" 435 | } 436 | }, 437 | "nbformat": 4, 438 | "nbformat_minor": 0 439 | } 440 | -------------------------------------------------------------------------------- /pysparnn/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 
2016-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the BSD-style license found in the 5 | # LICENSE file in the root directory of this source tree. An additional grant 6 | # of patent rights can be found in the PATENTS file in the same directory. 7 | -------------------------------------------------------------------------------- /pysparnn/cluster_index.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2016-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the BSD-style license found in the 5 | # LICENSE file in the root directory of this source tree. An additional grant 6 | # of patent rights can be found in the PATENTS file in the same directory. 7 | """Defines a cluster pruing search structure to do K-NN Queries""" 8 | 9 | from __future__ import absolute_import, division, print_function, unicode_literals 10 | 11 | import collections as _collections 12 | import random as _random 13 | 14 | import numpy as _np 15 | 16 | import pysparnn.matrix_distance 17 | 18 | 19 | def _k_best(tuple_list, k): 20 | """For a list of tuples [(distance, value), ...] - Get the k-best tuples by 21 | distance. 22 | Args: 23 | tuple_list: List of tuples. (distance, value) 24 | k: Number of tuples to return. 25 | """ 26 | tuple_lst = sorted(tuple_list, key=lambda x: x[0], 27 | reverse=False)[:k] 28 | 29 | return tuple_lst 30 | 31 | 32 | def _filter_unique(tuple_list): 33 | """For a list of tuples [(distance, value), ...] - filter out duplicate 34 | values. 35 | Args: 36 | tuple_list: List of tuples. (distance, value) 37 | """ 38 | 39 | added = set() 40 | ret = [] 41 | for distance, value in tuple_list: 42 | if not value in added: 43 | ret.append((distance, value)) 44 | added.add(value) 45 | return ret 46 | 47 | 48 | def _filter_distance(results, return_distance): 49 | """For a list of tuples [(distance, value), ...] - optionally filter out 50 | the distance elements. 51 | Args: 52 | tuple_list: List of tuples. (distance, value) 53 | return_distance: boolean to determine if distances should be returned. 54 | """ 55 | if return_distance: 56 | return results 57 | else: 58 | return list([x for y, x in results]) 59 | 60 | 61 | class ClusterIndex(object): 62 | """Search structure which gives speedup at slight loss of recall. 63 | 64 | Uses cluster pruning structure as defined in: 65 | http://nlp.stanford.edu/IR-book/html/htmledition/cluster-pruning-1.html 66 | 67 | tldr - searching for a document in an index of K documents is naievely 68 | O(K). However you can create a tree structure where the first level 69 | is O(sqrt(K)) and each of the leaves are also O(sqrt(K)). 70 | 71 | You randomly pick sqrt(K) items to be in the top level. Then for 72 | the K doccuments you assign it to the closest neighbor in the top 73 | level. 74 | 75 | This breaks up one O(K) search into O(2 * sqrt(K)) searches which 76 | is much much faster when K is big. 77 | 78 | This generalizes to h levels. The runtime becomes: 79 | O(h * h_root(K)) 80 | """ 81 | 82 | def __init__(self, features, records_data, 83 | distance_type=pysparnn.matrix_distance.CosineDistance, 84 | matrix_size=None, 85 | parent=None): 86 | """Create a search index composed of recursively defined 87 | matricies. Does recursive KNN search. See class docstring for a 88 | description of the method. 
89 | 90 | Args: 91 | features: A csr_matrix with rows that represent records 92 | (corresponding to the elements in records_data) and columns 93 | that describe a point in space for each row. 94 | records_data: Data to return when a doc is matched. Index of 95 | corresponds to records_features. 96 | distance_type: Class that defines the distance measure to use. 97 | matrix_size: Ideal size for matrix multiplication. This controls 98 | the depth of the tree. Defaults to 2 levels (approx). Highly 99 | reccomended that the default value is used. 100 | """ 101 | 102 | self.is_terminal = False 103 | self.parent = parent 104 | self.distance_type = distance_type 105 | self.desired_matrix_size = matrix_size 106 | features = distance_type.features_to_matrix(features) 107 | num_records = features.shape[0] 108 | 109 | if matrix_size is None: 110 | matrix_size = max(int(_np.sqrt(num_records)), 1000) 111 | else: 112 | matrix_size = int(matrix_size) 113 | 114 | self.matrix_size = matrix_size 115 | 116 | num_levels = _np.log(num_records) / _np.log(self.matrix_size) 117 | 118 | if num_levels <= 1.4: 119 | self.is_terminal = True 120 | self.root = distance_type(features, records_data) 121 | else: 122 | self.is_terminal = False 123 | records_data = _np.array(records_data) 124 | 125 | records_index = list(_np.arange(features.shape[0])) 126 | clusters_size = min(self.matrix_size, num_records) 127 | clusters_selection = _random.sample(records_index, clusters_size) 128 | clusters_selection = features[clusters_selection] 129 | 130 | item_to_clusters = _collections.defaultdict(list) 131 | 132 | root = distance_type(clusters_selection, 133 | list(_np.arange(clusters_selection.shape[0]))) 134 | 135 | root.remove_near_duplicates() 136 | root = distance_type(root.matrix, 137 | list(_np.arange(root.matrix.shape[0]))) 138 | 139 | rng_step = self.matrix_size 140 | for rng in range(0, features.shape[0], rng_step): 141 | max_rng = min(rng + rng_step, features.shape[0]) 142 | records_rng = features[rng:max_rng] 143 | for i, clstrs in enumerate(root.nearest_search(records_rng)): 144 | _random.shuffle(clstrs) 145 | for _, cluster in _k_best(clstrs, k=1): 146 | item_to_clusters[cluster].append(i + rng) 147 | 148 | clusters = [] 149 | cluster_keeps = [] 150 | for k, clust_sel in enumerate(clusters_selection): 151 | clustr = item_to_clusters[k] 152 | if len(clustr) > 0: 153 | index = ClusterIndex(self.distance_type.vstack(features[clustr]), 154 | records_data[clustr], 155 | distance_type=distance_type, 156 | matrix_size=self.matrix_size, 157 | parent=self) 158 | 159 | clusters.append(index) 160 | cluster_keeps.append(clust_sel) 161 | 162 | cluster_keeps = self.distance_type.vstack(cluster_keeps) 163 | clusters = _np.array(clusters) 164 | 165 | self.root = distance_type(cluster_keeps, clusters) 166 | 167 | def insert(self, feature, record): 168 | """Insert a single record into the index. 
169 | 170 | Args: 171 | feature: feature vector 172 | record: record to return as the result of a search 173 | """ 174 | feature = self.distance_type.features_to_matrix(feature) 175 | nearest = self 176 | while not nearest.is_terminal: 177 | nearest = nearest.root.nearest_search(feature) 178 | _, nearest = nearest[0][0] 179 | 180 | cluster_index = nearest 181 | parent_index = cluster_index.parent 182 | while parent_index and cluster_index.matrix_size * 2 < \ 183 | len(cluster_index.root.get_records()): 184 | cluster_index = parent_index 185 | parent_index = cluster_index.parent 186 | 187 | cluster_index._reindex(feature, record) 188 | 189 | def _get_child_data(self): 190 | """Get all of the features and corresponding records represented in the 191 | full tree structure. 192 | 193 | Returns: 194 | A tuple of (list(features), list(records)). 195 | """ 196 | 197 | if self.is_terminal: 198 | return [self.root.get_feature_matrix()], [self.root.get_records()] 199 | else: 200 | result_features = [] 201 | result_records = [] 202 | 203 | for c in self.root.get_records(): 204 | features, records = c._get_child_data() 205 | 206 | result_features.extend(features) 207 | result_records.extend(records) 208 | 209 | return result_features, result_records 210 | 211 | def _reindex(self, feature=None, record=None): 212 | """Rebuild the search index. Optionally add a record. This is used 213 | when inserting records to the index. 214 | 215 | Args: 216 | feature: feature vector 217 | record: record to return as the result of a search 218 | """ 219 | 220 | features, records = self._get_child_data() 221 | 222 | flat_rec = [] 223 | for x in records: 224 | flat_rec.extend(x) 225 | 226 | if feature is not None and record is not None: 227 | features.append(feature) 228 | flat_rec.append(record) 229 | 230 | self.__init__(self.distance_type.vstack(features), flat_rec, self.distance_type, 231 | self.desired_matrix_size, self.parent) 232 | 233 | def _search(self, features, k=1, k_clusters=1): 234 | """Find the closest item(s) for each feature_list in. 235 | 236 | Args: 237 | features: A matrix with rows that represent records 238 | (corresponding to the elements in records_data) and columns 239 | that describe a point in space for each row. 240 | k: Return the k closest results. 241 | k_clusters: number of branches (clusters) to search at each level. 242 | This increases recall at the cost of some speed. 243 | 244 | Returns: 245 | For each element in features_list, return the k-nearest items 246 | and their distance score 247 | [[(score1_1, item1_1), ..., (score1_k, item1_k)], 248 | [(score2_1, item2_1), ..., (score2_k, item2_k)], ...] 
249 | """ 250 | if self.is_terminal: 251 | nearest = self.root.nearest_search(features) 252 | return [r[:k] for r in nearest] 253 | else: 254 | ret = [] 255 | nearest = self.root.nearest_search(features) 256 | 257 | for search_i, nearest_clusters in enumerate(nearest): 258 | curr_ret = [] 259 | 260 | for cluster_i, distance_cluster in enumerate(nearest_clusters): 261 | distance, cluster = distance_cluster 262 | cluster_items = cluster.search(features[search_i], k=k, 263 | k_clusters=k_clusters) 264 | 265 | for elements in cluster_items: 266 | if len(elements) > 0: 267 | curr_ret.extend(elements) 268 | 269 | # if we have k elements and we have searched at least 270 | # k_clusters then we are done 271 | if len(curr_ret) >= k and cluster_i + 1 >= k_clusters: 272 | break 273 | 274 | ret.append(_k_best(curr_ret, k)) 275 | return ret 276 | 277 | def search(self, features, k=1, k_clusters=1, 278 | return_distance=True): 279 | """Find the closest item(s) for each feature_list in the index. 280 | 281 | Args: 282 | features: A matrix with rows that represent records 283 | (corresponding to the elements in records_data) and columns 284 | that describe a point in space for each row. 285 | k: Return the k closest results. 286 | k_clusters: number of branches (clusters) to search at each level. 287 | This increases recall at the cost of some speed. 288 | 289 | Returns: 290 | For each element in features_list, return the k-nearest items 291 | and (optionally) their distance score 292 | [[(score1_1, item1_1), ..., (score1_k, item1_k)], 293 | [(score2_1, item2_1), ..., (score2_k, item2_k)], ...] 294 | 295 | Note: if return_distance == False then the scores are omitted 296 | [[item1_1, ..., item1_k], 297 | [item2_1, ..., item2_k], ...] 298 | """ 299 | 300 | # search no more than 1k records at once 301 | # helps keap the matrix multiplies small 302 | batch_size = 1000 303 | results = [] 304 | rng_step = batch_size 305 | features = self.distance_type.features_to_matrix(features) 306 | for rng in range(0, features.shape[0], rng_step): 307 | max_rng = min(rng + rng_step, features.shape[0]) 308 | records_rng = features[rng:max_rng] 309 | 310 | results.extend(self._search(features=records_rng, 311 | k=k, 312 | k_clusters=k_clusters)) 313 | 314 | return [_filter_distance(res, return_distance) for res in results] 315 | 316 | def _print_structure(self, tabs=''): 317 | """Pretty print the tree index structure's matrix sizes""" 318 | print(tabs + str(self.root.matrix.shape[0])) 319 | if not self.is_terminal: 320 | for index in self.root.records_data: 321 | index._print_structure(tabs + ' ') 322 | 323 | def _max_depth(self): 324 | """Yield the max depth of the tree index""" 325 | if not self.is_terminal: 326 | max_dep = 0 327 | for index in self.root.records_data: 328 | max_dep = max(max_dep, index._max_depth()) 329 | return 1 + max_dep 330 | else: 331 | return 1 332 | 333 | def _matrix_sizes(self, ret=None): 334 | """Return all of the matrix sizes within the index""" 335 | if ret is None: 336 | ret = [] 337 | ret.append(len(self.root.records_data)) 338 | if not self.is_terminal: 339 | for index in self.root.records_data: 340 | ret.extend(index._matrix_sizes()) 341 | return ret 342 | 343 | 344 | class MultiClusterIndex(object): 345 | """Search structure which provides query speedup at the loss of recall. 346 | 347 | There are two components to this. 
348 | 349 | = Cluster Indexes = 350 | Uses cluster pruning index structure as defined in: 351 | http://nlp.stanford.edu/IR-book/html/htmledition/cluster-pruning-1.html 352 | 353 | Refer to ClusterIndex documentation. 354 | 355 | = Multiple Indexes = 356 | The MultiClusterIndex creates multiple ClusterIndexes. This method 357 | gives better recall at the cost of allocating more memory. The 358 | ClusterIndexes are created by randomly picking representative clusters. 359 | The randomization tends to do a pretty good job but it is not perfect. 360 | Elements can be assigned to clusters that are far from an optimal match. 361 | Creating more Indexes (random cluster allocations) increases the chances 362 | of finding a good match. 363 | 364 | There are three perameters that impact recall. Will discuss them all 365 | here: 366 | 1) MuitiClusterIndex(matrix_size) 367 | This impacts the tree structure (see cluster index documentation). 368 | Has a good default value. By increasing this value your index will 369 | behave increasingly like brute force search and you will loose query 370 | efficiency. If matrix_size is greater than your number of records 371 | you get brute force search. 372 | 2) MuitiClusterIndex.search(k_clusters) 373 | Number of clusters to check when looking for records. This increases 374 | recall at the cost of query speed. Can be specified dynamically. 375 | 3) MuitiClusterIndex(num_indexes) 376 | Number of indexes to generate. This increases recall at the cost of 377 | query speed. It also increases memory usage. It can only be 378 | specified at index construction time. 379 | 380 | Compared to (2) this argument gives better recall and has comparable 381 | speed. This statement assumes default (automatic) matrix_size is 382 | used. 383 | Scenario 1: 384 | 385 | (a) num_indexes=2, k_clusters=1 386 | (b) num_indexes=1, k_clusters=2 387 | 388 | (a) will have better recall but consume 2x the memory. (a) will be 389 | slightly slower than (b). 390 | 391 | Scenario 2: 392 | 393 | (a) num_indexes=2, k_clusters=1, matrix_size >> records 394 | (b) num_indexes=1, k_clusters=2, matrix_size >> records 395 | 396 | This means that each index does a brute force search. (a) and (b) 397 | will have the same recall. (a) will be 2x slower than (b). (a) will 398 | consume 2x the memory of (b). 399 | 400 | Scenario 1 will be much faster than Scenario 2 for large data. 401 | Scenario 2 will have better recall than Scenario 1. 402 | """ 403 | 404 | def __init__(self, features, records_data, 405 | distance_type=pysparnn.matrix_distance.CosineDistance, 406 | matrix_size=None, num_indexes=2): 407 | """Create a search index composed of multtiple ClusterIndexes. See 408 | class docstring for a description of the method. 409 | 410 | Args: 411 | features: A matrix with rows that represent records 412 | (corresponding to the elements in records_data) and columns 413 | that describe a point in space for each row. 414 | records_data: Data to return when a doc is matched. Index of 415 | corresponds to records_features. 416 | distance_type: Class that defines the distance measure to use. 417 | matrix_size: Ideal size for matrix multiplication. This controls 418 | the depth of the tree. Defaults to 2 levels (approx). Highly 419 | reccomended that the default value is used. 420 | num_indexes: Number of ClusterIndexes to construct. Improves recall 421 | at the cost of memory. 
422 | """ 423 | 424 | self.indexes = [] 425 | for _ in range(num_indexes): 426 | self.indexes.append((ClusterIndex(features, records_data, 427 | distance_type, matrix_size))) 428 | 429 | def insert(self, feature, record): 430 | """Insert a single record into the index. 431 | 432 | Args: 433 | feature: feature vector 434 | record: record to return as the result of a search 435 | """ 436 | for ind in self.indexes: 437 | ind.insert(feature, record) 438 | 439 | def search(self, features, k=1, k_clusters=1, 440 | return_distance=True, num_indexes=None): 441 | """Find the closest item(s) for each feature_list in the index. 442 | 443 | Args: 444 | features: A matrix with rows that represent records 445 | (corresponding to the elements in records_data) and columns 446 | that describe a point in space for each row. 447 | k: Return the k closest results. 448 | k_clusters: number of branches (clusters) to search at each level 449 | within each index. This increases recall at the cost of some 450 | speed. 451 | 452 | num_indexes: number of indexes to search. This increases recall at 453 | the cost of some speed. Can not be larger than the number of 454 | num_indexes that was specified in the constructor. Defaults to 455 | searching all indexes. 456 | 457 | Returns: 458 | For each element in features_list, return the k-nearest items 459 | and (optionally) their distance score 460 | [[(score1_1, item1_1), ..., (score1_k, item1_k)], 461 | [(score2_1, item2_1), ..., (score2_k, item2_k)], ...] 462 | 463 | Note: if return_distance == False then the scores are omitted 464 | [[item1_1, ..., item1_k], 465 | [item2_1, ..., item2_k], ...] 466 | """ 467 | results = [] 468 | if num_indexes is None: 469 | num_indexes = len(self.indexes) 470 | for ind in self.indexes[:num_indexes]: 471 | results.append(ind.search(features, k, k_clusters, True)) 472 | ret = [] 473 | for r in _np.hstack(results): 474 | ret.append( 475 | _filter_distance( 476 | _k_best(_filter_unique(r), k), 477 | return_distance 478 | ) 479 | ) 480 | 481 | return ret 482 | -------------------------------------------------------------------------------- /pysparnn/matrix_distance.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2016-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the BSD-style license found in the 5 | # LICENSE file in the root directory of this source tree. An additional grant 6 | # of patent rights can be found in the PATENTS file in the same directory. 7 | """Defines a distance search structure""" 8 | 9 | from __future__ import absolute_import, division, print_function, unicode_literals 10 | 11 | import abc as _abc 12 | 13 | import numpy as _np 14 | import scipy.sparse as _sparse 15 | import scipy.spatial.distance as _spatial_distance 16 | 17 | 18 | class MatrixMetricSearch(object): 19 | """A matrix representation out of features.""" 20 | __metaclass__ = _abc.ABCMeta 21 | 22 | def __init__(self, features, records_data): 23 | """ 24 | Args: 25 | features: A matrix with rows that represent records 26 | (corresponding to the elements in records_data) and columns 27 | that describe a point in space for each row. 28 | records_data: Data to return when a doc is matched. Index of 29 | corresponds to features. 
30 | """ 31 | self.matrix = features 32 | self.records_data = _np.array(records_data) 33 | 34 | def get_feature_matrix(self): 35 | return self.matrix 36 | 37 | def get_records(self): 38 | return self.records_data 39 | 40 | @staticmethod 41 | @_abc.abstractmethod 42 | def features_to_matrix(features): 43 | """ 44 | Args: 45 | val: A list of features to be formatted. 46 | Returns: 47 | The transformed matrix. 48 | """ 49 | return 50 | 51 | @staticmethod 52 | @_abc.abstractmethod 53 | def vstack(matrix_list): 54 | """ 55 | Args: 56 | val: A list of features to be formatted. 57 | Returns: 58 | The transformed matrix. 59 | """ 60 | return 61 | 62 | @_abc.abstractmethod 63 | def _transform_value(self, val): 64 | """ 65 | Args: 66 | val: A numeric value to be (potentially transformed). 67 | Returns: 68 | The transformed numeric value. 69 | """ 70 | return 71 | 72 | @_abc.abstractmethod 73 | def _distance(self, a_matrix): 74 | """ 75 | Args: 76 | a_matrix: A matrix with rows that represent records 77 | to search against. 78 | records_data: Data to return when a doc is matched. Index of 79 | corresponds to features. 80 | Returns: 81 | A dense array representing distance. 82 | """ 83 | return 84 | 85 | def nearest_search(self, features): 86 | """Find the closest item(s) for each set of features in features_list. 87 | 88 | Args: 89 | features: A matrix with rows that represent records 90 | (corresponding to the elements in records_data) and columns 91 | that describe a point in space for each row. 92 | 93 | Returns: 94 | For each element in features_list, return the k-nearest items 95 | and their distance scores 96 | [[(score1_1, item1_1), ..., (score1_k, item1_k)], 97 | [(score2_1, item2_1), ..., (score2_k, item2_k)], ...] 98 | """ 99 | 100 | dist_matrix = self._distance(features) 101 | 102 | ret = [] 103 | for i in range(dist_matrix.shape[0]): 104 | # replacing the for loop by matrix ops could speed things up 105 | 106 | scores = dist_matrix[i] 107 | records = self.records_data 108 | 109 | arg_index = _np.argsort(scores) 110 | 111 | curr_ret = list(zip(scores[arg_index], records[arg_index])) 112 | 113 | ret.append(curr_ret) 114 | 115 | return ret 116 | 117 | def remove_near_duplicates(self): 118 | """If there are 2 or more records with 0 distance from eachother - 119 | keep only one. 120 | """ 121 | 122 | dist_matrix = self._distance(self.matrix) 123 | 124 | keeps = [] 125 | dupes = set() 126 | for row_index in range(dist_matrix.shape[0]): 127 | max_dist = dist_matrix[row_index].max() 128 | for col_index in range(dist_matrix.shape[0]): 129 | if row_index < col_index: 130 | if dist_matrix[row_index, col_index] / max_dist <= 0.001: 131 | dupes.add(col_index) 132 | if not row_index in dupes: 133 | keeps.append(row_index) 134 | 135 | self.matrix = self.matrix[keeps] 136 | self.records = self.records_data[keeps] 137 | 138 | 139 | class CosineDistance(MatrixMetricSearch): 140 | """A matrix that implements cosine distance search against it. 141 | 142 | cosine_distance = 1 - cosine_similarity 143 | 144 | Note: We want items that are more similar to be closer to zero so we are 145 | going to instead return 1 - cosine_similarity. We do this so similarity 146 | and distance metrics can be treated the same way. 
147 | """ 148 | 149 | def __init__(self, features, records_data): 150 | super(CosineDistance, self).__init__(features, records_data) 151 | 152 | m_c = self.matrix.copy() 153 | m_c.data **= 2 154 | self.matrix_root_sum_square = \ 155 | _np.sqrt(_np.asarray(m_c.sum(axis=1)).reshape(-1)) 156 | 157 | @staticmethod 158 | def features_to_matrix(features): 159 | """ 160 | Args: 161 | val: A list of features to be formatted. 162 | Returns: 163 | The transformed matrix. 164 | """ 165 | return _sparse.csr_matrix(features) 166 | 167 | @staticmethod 168 | def vstack(matrix_list): 169 | """ 170 | Args: 171 | val: A list of features to be formatted. 172 | Returns: 173 | The transformed matrix. 174 | """ 175 | return _sparse.vstack(matrix_list) 176 | 177 | def _transform_value(self, v): 178 | return v 179 | 180 | def _distance(self, a_matrix): 181 | """Vectorised cosine distance""" 182 | # what is the implmentation of transpose? can i change the order? 183 | dprod = self.matrix.dot(a_matrix.transpose()).transpose() * 1.0 184 | 185 | a_c = a_matrix.copy() 186 | a_c.data **= 2 187 | a_root_sum_square = _np.asarray(a_c.sum(axis=1)).reshape(-1) 188 | a_root_sum_square = \ 189 | a_root_sum_square.reshape(len(a_root_sum_square), 1) 190 | a_root_sum_square = _np.sqrt(a_root_sum_square) 191 | 192 | magnitude = 1.0 / (a_root_sum_square * self.matrix_root_sum_square) 193 | 194 | return 1 - dprod.multiply(magnitude).toarray() 195 | 196 | 197 | class UnitCosineDistance(MatrixMetricSearch): 198 | """A matrix that implements cosine distance search against it. 199 | 200 | cosine_distance = 1 - cosine_similarity 201 | 202 | Note: We want items that are more similar to be closer to zero so we are 203 | going to instead return 1 - cosine_similarity. We do this so similarity 204 | and distance metrics can be treated the same way. 205 | 206 | Assumes unit-vectors and takes some shortucts: 207 | * Uses integers instead of floats 208 | * 1**2 == 1 so that operation can be skipped 209 | """ 210 | 211 | def __init__(self, features, records_data): 212 | super(UnitCosineDistance, self).__init__(features, records_data) 213 | self.matrix_root_sum_square = \ 214 | _np.sqrt(_np.asarray(self.matrix.sum(axis=1)).reshape(-1)) 215 | 216 | @staticmethod 217 | def features_to_matrix(features): 218 | """ 219 | Args: 220 | val: A list of features to be formatted. 221 | Returns: 222 | The transformed matrix. 223 | """ 224 | return _sparse.csr_matrix(features) 225 | 226 | @staticmethod 227 | def vstack(matrix_list): 228 | """ 229 | Args: 230 | val: A list of features to be formatted. 231 | Returns: 232 | The transformed matrix. 233 | """ 234 | return _sparse.vstack(matrix_list) 235 | 236 | def _transform_value(self, v): 237 | return 1 238 | 239 | def _distance(self, a_matrix): 240 | """Vectorised cosine distance""" 241 | # what is the implmentation of transpose? can i change the order? 242 | dprod = self.matrix.dot(a_matrix.transpose()).transpose() * 1.0 243 | 244 | a_root_sum_square = _np.asarray(a_matrix.sum(axis=1)).reshape(-1) 245 | a_root_sum_square = \ 246 | a_root_sum_square.reshape(len(a_root_sum_square), 1) 247 | a_root_sum_square = _np.sqrt(a_root_sum_square) 248 | 249 | magnitude = 1.0 / (a_root_sum_square * self.matrix_root_sum_square) 250 | 251 | return 1 - dprod.multiply(magnitude).toarray() 252 | 253 | 254 | class SlowEuclideanDistance(MatrixMetricSearch): 255 | """A matrix that implements euclidean distance search against it. 256 | WARNING: This is not optimized. 
257 | """ 258 | 259 | def __init__(self, features, records_data): 260 | super(SlowEuclideanDistance, self).__init__(features, records_data) 261 | self.matrix = self.matrix 262 | 263 | @staticmethod 264 | def features_to_matrix(features): 265 | """ 266 | Args: 267 | val: A list of features to be formatted. 268 | Returns: 269 | The transformed matrix. 270 | """ 271 | return _np.array(features, ndmin=2) 272 | 273 | @staticmethod 274 | def vstack(matrix_list): 275 | """ 276 | Args: 277 | val: A list of features to be formatted. 278 | Returns: 279 | The transformed matrix. 280 | """ 281 | return _np.vstack(matrix_list) 282 | 283 | def _transform_value(self, v): 284 | return v 285 | 286 | def _distance(self, a_matrix): 287 | """Euclidean distance""" 288 | 289 | return _spatial_distance.cdist(a_matrix, self.matrix, 'euclidean') 290 | 291 | 292 | class DenseCosineDistance(MatrixMetricSearch): 293 | """A matrix that implements cosine distance search against it. 294 | 295 | cosine_distance = 1 - cosine_similarity 296 | 297 | Note: We want items that are more similar to be closer to zero so we are 298 | going to instead return 1 - cosine_similarity. We do this so similarity 299 | and distance metrics can be treated the same way. 300 | """ 301 | 302 | def __init__(self, features, records_data): 303 | super(DenseCosineDistance, self).__init__(features, records_data) 304 | 305 | self.matrix_root_sum_square = \ 306 | _np.sqrt((self.matrix ** 2).sum(axis=1).reshape(-1)) 307 | 308 | @staticmethod 309 | def features_to_matrix(features): 310 | """ 311 | Args: 312 | val: A list of features to be formatted. 313 | Returns: 314 | The transformed matrix. 315 | """ 316 | return _np.array(features, ndmin=2) 317 | 318 | @staticmethod 319 | def vstack(matrix_list): 320 | """ 321 | Args: 322 | val: A list of features to be formatted. 323 | Returns: 324 | The transformed matrix. 325 | """ 326 | return _np.vstack(matrix_list) 327 | 328 | def _transform_value(self, v): 329 | return v 330 | 331 | def _distance(self, a_matrix): 332 | """Vectorised cosine distance""" 333 | # what is the implmentation of transpose? can i change the order? 334 | dprod = self.matrix.dot(a_matrix.transpose()).transpose() * 1.0 335 | 336 | a_root_sum_square = (a_matrix ** 2).sum(axis=1).reshape(-1) 337 | a_root_sum_square = a_root_sum_square.reshape(len(a_root_sum_square), 1) 338 | a_root_sum_square = _np.sqrt(a_root_sum_square) 339 | 340 | magnitude = 1.0 / (a_root_sum_square * self.matrix_root_sum_square) 341 | 342 | return 1 - (dprod * magnitude) 343 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | numpy == 1.11.2 2 | scipy == 0.18.1 3 | scikit_learn == 0.17.1 4 | -------------------------------------------------------------------------------- /run_tests.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | python -m unittest discover tests 3 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from distutils.core import setup 2 | 3 | # This is a list of files to install, and where 4 | # (relative to the 'root' dir, where setup.py is) 5 | # You could be more specific. 
6 | files = [] 7 | 8 | setup(name="pysparnn", 9 | version="0.4", 10 | description="Sparse (approximate) nearest neighbor search for python!", 11 | author="Spencer Beecher", 12 | author_email="spencebeecher@gmail.com", 13 | # url = "", 14 | # Name the folder where your packages live: 15 | # (If you have other packages (dirs) or modules (py files) then 16 | # put them into the package directory - they will be found 17 | # recursively.) 18 | packages=['pysparnn'], 19 | # 'package' package must contain files (see list above) 20 | # I called the package 'package' thus cleverly confusing the whole issue... 21 | # This dict maps the package name =to=> directories 22 | # It says, package *needs* these files. 23 | # package_data = {}, 24 | # 'runner' is in the root. 25 | # scripts = [], 26 | long_description="""Sparse (approximate) nearest neighbor search for python!""" 27 | # 28 | # This next part it for the Cheese Shop, look a little down the page. 29 | # classifiers = [] 30 | ) 31 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2016-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the BSD-style license found in the 5 | # LICENSE file in the root directory of this source tree. An additional grant 6 | # of patent rights can be found in the PATENTS file in the same directory. 7 | -------------------------------------------------------------------------------- /tests/test_pysparnn.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2016-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the BSD-style license found in the 5 | # LICENSE file in the root directory of this source tree. An additional grant 6 | # of patent rights can be found in the PATENTS file in the same directory. 
7 | """Test pysparn search""" 8 | 9 | import unittest 10 | import pysparnn.cluster_index as ci 11 | import numpy as np 12 | from scipy.sparse import csr_matrix 13 | from pysparnn.matrix_distance import SlowEuclideanDistance 14 | from pysparnn.matrix_distance import UnitCosineDistance 15 | from pysparnn.matrix_distance import DenseCosineDistance 16 | from sklearn.feature_extraction import DictVectorizer 17 | 18 | class PysparnnTest(unittest.TestCase): 19 | """End to end tests for pysparnn""" 20 | def setUp(self): 21 | np.random.seed(1) 22 | 23 | def test_remove_duplicates(self): 24 | """Do a quick basic test for index/search functionality""" 25 | data = [ 26 | 'hello world', 27 | 'hello world', 28 | 'oh hello there', 29 | 'oh hello there', 30 | 'oh hello there', 31 | 'Play it', 32 | 'Play it again Sam', 33 | ] 34 | 35 | features = [dict([(x, 1) for x in f.split()]) for f in data] 36 | features = DictVectorizer().fit_transform(features) 37 | dist = UnitCosineDistance(features, data) 38 | 39 | self.assertEqual(dist.matrix.shape[0], 7) 40 | 41 | dist.remove_near_duplicates() 42 | 43 | self.assertEqual(dist.matrix.shape[0], 4) 44 | 45 | def test_cosine(self): 46 | """Do a quick basic test for index/search functionality""" 47 | data = [ 48 | 'hello world', 49 | 'oh hello there', 50 | 'Play it', 51 | 'Play it again Sam', 52 | ] 53 | 54 | features = [dict([(x, 1) for x in f.split()]) for f in data] 55 | features = DictVectorizer().fit_transform(features) 56 | 57 | cluster_index = ci.ClusterIndex(features, data) 58 | 59 | ret = cluster_index.search(features, k=1, k_clusters=1, 60 | return_distance=False) 61 | self.assertEqual([[d] for d in data], ret) 62 | 63 | def test_dense_array(self): 64 | """Do a quick basic test for index/search functionality""" 65 | data = [ 66 | 'hello world', 67 | 'oh hello there', 68 | 'Play it', 69 | 'Play it again Sam', 70 | ] 71 | 72 | features = [dict([(x, 1) for x in f.split()]) for f in data] 73 | features = DictVectorizer().fit_transform(features) 74 | features = features.toarray() 75 | cluster_index = ci.ClusterIndex(features, data) 76 | 77 | ret = cluster_index.search(features, k=1, k_clusters=1, 78 | return_distance=False) 79 | self.assertEqual([[d] for d in data], ret) 80 | 81 | def test_dense_matrix(self): 82 | """Do a quick basic test for index/search functionality""" 83 | data = [ 84 | 'hello world', 85 | 'oh hello there', 86 | 'Play it', 87 | 'Play it again Sam', 88 | ] 89 | 90 | features = [dict([(x, 1) for x in f.split()]) for f in data] 91 | features = DictVectorizer().fit_transform(features) 92 | features = features.toarray() 93 | cluster_index = ci.ClusterIndex(features, data, DenseCosineDistance) 94 | 95 | ret = cluster_index.search(features, k=1, k_clusters=1, 96 | return_distance=False) 97 | self.assertEqual([[d] for d in data], ret) 98 | 99 | def test_euclidean(self): 100 | """Do a quick basic test for index/search functionality""" 101 | data = [ 102 | 'hello world', 103 | 'oh hello there', 104 | 'Play it', 105 | 'Play it again Sam', 106 | ] 107 | 108 | features = [dict([(x, 1) for x in f.split()]) for f in data] 109 | features = DictVectorizer().fit_transform(features) 110 | features = features.toarray() 111 | cluster_index = ci.ClusterIndex(features, data, SlowEuclideanDistance) 112 | 113 | ret = cluster_index.search(features, k=1, k_clusters=1, 114 | return_distance=False) 115 | self.assertEqual([[d] for d in data], ret) 116 | 117 | 118 | 119 | def test_levels(self): 120 | """Test multiple level indexes""" 121 | features = np.random.binomial(1, 
0.01, size=(1000, 20000)) 122 | features = csr_matrix(features) 123 | 124 | # build the search index! 125 | data_to_return = np.array(list(range(1000)), dtype=int) 126 | 127 | # matrix size smaller - this forces the index to have multiple levels 128 | cluster_index = ci.ClusterIndex(features, data_to_return, 129 | matrix_size=10) 130 | 131 | ret = cluster_index.search(features[0:10], k=1, k_clusters=1, 132 | return_distance=False) 133 | self.assertEqual([[x] for x in data_to_return[:10]], ret) 134 | 135 | def test_levels_multiindex(self): 136 | """Test multiple level indexes""" 137 | features = np.random.binomial(1, 0.01, size=(1000, 20000)) 138 | features = csr_matrix(features) 139 | 140 | # build the search index! 141 | data_to_return = np.array(list(range(1000)), dtype=int) 142 | 143 | # matrix size smaller - this forces the index to have multiple levels 144 | cluster_index = ci.MultiClusterIndex(features, data_to_return, 145 | matrix_size=10) 146 | 147 | ret = cluster_index.search(features[0:10], k=1, k_clusters=1, 148 | return_distance=False) 149 | self.assertEqual([[x] for x in data_to_return[:10]], ret) 150 | 151 | def test_large_k(self): 152 | """Test multiple level indexes""" 153 | features = np.random.binomial(1, 0.01, size=(1000, 20000)) 154 | features = csr_matrix(features) 155 | 156 | # build the search index! 157 | data_to_return = np.array(list(range(1000)), dtype=int) 158 | 159 | # matrix size smaller - this forces the index to have multiple levels 160 | cluster_index = ci.MultiClusterIndex(features, data_to_return, 161 | matrix_size=10) 162 | 163 | ret = cluster_index.search(features[0], k=100, k_clusters=1, 164 | return_distance=False) 165 | self.assertEqual(100, len(ret[0])) 166 | --------------------------------------------------------------------------------
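
For quick reference, below is a minimal usage sketch of the index API defined in pysparnn/cluster_index.py, distilled from examples/sparse_search_comparison.ipynb and tests/test_pysparnn.py. It assumes pysparnn has been installed (e.g. with "python setup.py install") along with numpy, scipy and scikit-learn; the toy sentences and query strings are illustrative placeholders, and exact results may vary since the index is approximate.

# Minimal sketch: build an index over TF-IDF vectors and query it.
import pysparnn.cluster_index as ci
from sklearn.feature_extraction.text import TfidfVectorizer

data = [
    'hello world',
    'oh hello there',
    'Play it',
    'Play it again Sam',
]

# Vectorize the records into a sparse matrix; row i corresponds to data[i].
tv = TfidfVectorizer()
features = tv.fit_transform(data)

# Build a multi-index; num_indexes trades extra memory for better recall.
index = ci.MultiClusterIndex(features, data, num_indexes=2)

# Records can also be added after construction.
index.insert(tv.transform(['hello again world']), 'hello again world')

# Ask for the 2 nearest records per query row; k_clusters widens the search
# at each tree level, trading a little speed for recall.
queries = tv.transform(['oh hello world', 'play it sam'])
print(index.search(queries, k=2, k_clusters=2, return_distance=True))

Each entry returned by search() is a list of (distance, record) pairs for the corresponding query row; passing return_distance=False drops the distances. As the MultiClusterIndex docstring notes, raising num_indexes improves recall at the cost of memory, raising k_clusters improves recall at the cost of query speed, and matrix_size is best left at its default.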