├── .gitignore ├── .travis.yml ├── LICENSE ├── README.md ├── SRP ├── SRP.py ├── SRP_files.py ├── __init__.py └── flycheck_SRP.py ├── bld.bat ├── build.sh ├── docs ├── Build Fiction Set.ipynb ├── Classification Using Tensorflow Estimators.ipynb ├── Find Text Lab Books in Hathi.ipynb ├── Hash a corpus of text files into SRP space.ipynb ├── Increasing Speed through batch processing.ipynb ├── Recursive SRP tests.ipynb └── Splitting Ids.ipynb ├── meta.yaml ├── pyproject.toml ├── requirements.txt ├── run_test.sh ├── setup.py ├── tests └── test.py └── utils ├── clean_file.py └── expand_half-precision.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | 27 | # PyInstaller 28 | # Usually these files are written by a python script from a template 29 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 30 | *.manifest 31 | *.spec 32 | 33 | # Installer logs 34 | pip-log.txt 35 | pip-delete-this-directory.txt 36 | 37 | # Unit test / coverage reports 38 | htmlcov/ 39 | .tox/ 40 | .coverage 41 | .coverage.* 42 | .cache 43 | nosetests.xml 44 | coverage.xml 45 | *,cover 46 | .hypothesis/ 47 | 48 | # Translations 49 | *.mo 50 | *.pot 51 | 52 | # Django stuff: 53 | *.log 54 | local_settings.py 55 | 56 | # Flask instance folder 57 | instance/ 58 | 59 | # Scrapy stuff: 60 | .scrapy 61 | 62 | # Sphinx documentation 63 | docs/_build/ 64 | 65 | # PyBuilder 66 | target/ 67 | 68 | # IPython Notebook 69 | .ipynb_checkpoints 70 | 71 | # pyenv 72 | .python-version 73 | 74 | # celery beat schedule file 75 | celerybeat-schedule 76 | 77 | # dotenv 78 | .env 79 | 80 | # virtualenv 81 | venv/ 82 | ENV/ 83 | 84 | # Spyder project settings 85 | .spyderprojecttests/ 86 | test.bin 87 | .DS_Store 88 | nohup.out 89 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | python: 3 | - "3.6" 4 | - "3.7" 5 | - "3.8" 6 | # command to install dependencies 7 | install: 8 | - pip install -r requirements.txt 9 | - pip install . 10 | # command to run tests 11 | script: cd tests && python test.py 12 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT LICENSE 2 | 3 | Copyright (c) 2016-2021 Benjamin Schmidt 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # pySRP 2 | 3 | Python module implementing Stable Random Projections, as described in 4 | [Cultural Analytics Vol. 1, Issue 2 (October 04, 2018): Stable Random Projection: Lightweight, General-Purpose Dimensionality Reduction for Digitized Libraries](https://doi.org/10.22148/16.025) 5 | 6 | These create interchangeable, data-agnostic vectorized representations of text suitable for a variety of contexts. Unlike most vectorizations, they can represent texts in any language that uses space tokenization, or even non-linguistic content, since they contain no implicit language model beyond the words themselves. 7 | 8 | You may want to use them in concert with the pre-distributed Hathi SRP features 9 | described further here. 10 | 11 | ## Installation 12 | 13 | Requires Python 3. 14 | 15 | ```bash 16 | pip install pysrp 17 | ``` 18 | ## Changelog 19 | 20 | **Version 2.0 (July 2022) slightly changes the default tokenization algorithm!** 21 | 22 | Previously it was `\w`; now it is `[\p{L}\p{Pc}\p{N}\p{M}]+`. 23 | 24 | I also no longer recommend use 25 | 26 | ## Usage 27 | 28 | ### Examples 29 | 30 | See the [docs folder](https://github.com/bmschmidt/pySRP/tree/master/docs) 31 | for some IPython notebooks demonstrating: 32 | 33 | * [Taking a subset of the full Hathi collection (100,000 works of fiction) based on 34 | identifiers, and exploring the major clusters within fiction.](https://github.com/bmschmidt/pySRP/blob/master/docs/Build%20Fiction%20Set.ipynb) 35 | * [Creating a new SRP representation of text files and plotting dimensionality reductions of them by language and time](https://github.com/bmschmidt/pySRP/blob/master/docs/Hash%20a%20corpus%20of%20text%20files%20into%20SRP%20space.ipynb) 36 | * [Searching for copies of one set of books in the full HathiTrust collection, and using Hathi metadata to identify duplicates and find errors in local item descriptions.](https://github.com/bmschmidt/pySRP/blob/master/docs/Find%20Text%20Lab%20Books%20in%20Hathi.ipynb) 37 | * [Training a classifier based on library metadata using TensorFlow, and then applying that classification to other sorts of text.](https://github.com/bmschmidt/pySRP/blob/master/docs/Classification%20Using%20Tensorflow%20Estimators.ipynb) 38 | 39 | ### Basic Usage 40 | 41 | Use the SRP class to build an object that performs transformations. 42 | 43 | This is a class, rather than a function, because the object builds a cache of previously seen words. 44 | 45 | ```python 46 | import srp 47 | # initialize with desired number of dimensions 48 | hasher = srp.SRP(640) 49 | ``` 50 | 51 | The most important method is `stable_transform`. 52 | 53 | It can tokenize a string and then compute its SRP. 54 | 55 | ```python 56 | hasher.stable_transform(words = "foo bar bar") 57 | ``` 58 | 59 | If counts are already computed, word and count vectors can be passed separately.
60 | 61 | ```python 62 | hasher.stable_transform(words = ["foo","bar"],counts = [1,2]) 63 | ``` 64 | 65 | 66 | ## Read/write tools 67 | 68 | SRP files are stored in a binary file format to save space. 69 | This format is the same used by the binary word2vec format. 70 | 71 | **DEPRECATION NOTICE** 72 | 73 | This format is now deprecated--I recommend the Apache Arrow binary serialization format instead. 74 | 75 | ```python 76 | file = SRP.Vector_file("hathivectors.bin") 77 | 78 | for (key, vector) in file: 79 | pass 80 | # 'key' is a unique identifier for a document in a corpus 81 | # 'vector' is a `numpy.array` of type `= self.cache_limit: 126 | # Clear the cache; maybe things have changed. 127 | self._hash_dict = self._last_hash_dict 128 | self._hash_dict = {} 129 | 130 | if cache and self._cache_size() < self.cache_limit: 131 | self._hash_dict[string] = value 132 | 133 | return value 134 | 135 | def tokenlist(self, string, regex = tokenregex, lower = True): 136 | if isinstance(string, bytes): 137 | string = string.decode("utf-8") 138 | return regex.findall(string) 139 | 140 | def tokenize(self, string, regex=tokenregex, lower = True): 141 | parts = self.tokenlist(string, regex, lower) 142 | count = dict() 143 | for part in parts: 144 | if lower: 145 | part = part.lower() 146 | try: 147 | count[part] += 1 148 | except KeyError: 149 | count[part] = 1 150 | return count 151 | 152 | def standardize(self, words, counts, unzip = True): 153 | full = dict() 154 | 155 | for i in range(len(words)): 156 | """ 157 | Here we retokenize each token. A full text can be tokenized 158 | at a single pass 159 | by passing words = [string], counts=[1] 160 | """ 161 | subCounts = self.tokenize(words[i]) 162 | for (part, partCounts) in subCounts.items(): 163 | part = regex.sub(u'\d', "#", part) 164 | addition = counts[i] * partCounts 165 | try: 166 | full[part] += addition 167 | except KeyError: 168 | full[part] = addition 169 | words = [] 170 | counts = np.zeros(len(full), "= limit: 76 | break 77 | 78 | output.close() 79 | 80 | import pyarrow as pa 81 | from pyarrow import ipc 82 | from pyarrow import feather 83 | 84 | class Arrow_File(object): 85 | """ 86 | Store in an arrow file. 87 | """ 88 | 89 | def __init__(self, filename, dims=float("Inf"), mode="r", max_rows=float("Inf"), precision = "float", offset_cache = False): 90 | """ 91 | Creates an SRP object. 92 | 93 | filename: The location on disk. 94 | dims: The number of vectors to store for each document. Typically ~100 to ~1000. 95 | Need not be specified if working with an existing file. 96 | mode: One of: 'r' (read an existing file); 'w' (create a new file); 'a' (append to the 97 | end of an existing file) 98 | max_rows: clip the document to a fixed length. Best left unused. 99 | precision: bytes to use for each. 4 (single-precision) is standard; 2 (half precision) is also reasonable. 0 embeds not as floats, but instead into binary hamming space. 100 | offset_cache: Whether to store the byte offset lookup information for vectors. By default, 101 | this is False, which means the offset table is built on load and kept in memory. 
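        Example (an illustrative sketch only; it assumes this class mirrors the
        `Vector_file` interface used elsewhere in this module -- `add_row`,
        iteration, and `with`-statement support -- and uses a made-up filename):

            with Arrow_File("vectors.feather", dims=640, mode="w") as outfile:
                outfile.add_row("doc-1", np.ones(640, dtype="<f4"))

            for key, vector in Arrow_File("vectors.feather", mode="r"):
                print(key, vector.shape)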
102 | """ 103 | 104 | self.filename = filename 105 | self.dims = dims 106 | self.mode = mode 107 | self.max_rows = max_rows 108 | if precision == 2: 109 | precision = "half" 110 | elif precision == 4: 111 | precision = "float" 112 | try: 113 | assert precision in {"half", "float", "binary"} 114 | except: 115 | e = "Only `4` (single) and `2` (half) bytes are valid options for `precision`" 116 | raise ValueError(e) 117 | self.precision = precision 118 | if self.precision == "half": 119 | self.float_format = ' 0) 267 | else: 268 | raise TypeError("Numpy array must be of type '= self._BATCH_SIZE: 277 | self.flush() 278 | 279 | try: 280 | self._prefix_lookup[identifier.split(self.sep, 1)[0]].append((identifier, self.file.tell())) 281 | except AttributeError: 282 | pass 283 | try: 284 | self._offset_lookup[identifier] = self.file.tell() 285 | except AttributeError: 286 | pass 287 | 288 | def close(self): 289 | """ 290 | Close the file. It's extremely important to call this method in write modes: 291 | not just that the last few files will be missing. 292 | If it isn't, the header will have out-of-date information and files won't be read. 293 | """ 294 | self.flush() 295 | self.file.close() 296 | 297 | if self.offset_cache: 298 | self._offset_lookup.close() 299 | 300 | def _regex_search(self, regex): 301 | 302 | self._build_offset_lookup() 303 | values = [(i, k) for k, i in self._offset_lookup.items() if re.search(regex, k)] 304 | # Sort to ensure values are returned in disk order. 305 | values.sort() 306 | for i, k in values: 307 | yield (k, self[k]) 308 | 309 | 310 | self._build_offset_lookup() 311 | values = [(i, k) for k, i in self._offset_lookup.items() if re.search(regex, k)] 312 | # Sort to ensure values are returned in disk order. 313 | values.sort() 314 | for i, k in values: 315 | yield (k, self[k]) 316 | 317 | def __getitem__(self, label): 318 | """ 319 | Attributes can be accessed in three ways. 320 | 321 | 322 | With a string: this returns just the vector for that string. 323 | With a list of strings: this returns a multidimensional array for each query passed. 324 | If any of the requested items do not exist, this will fail. 325 | With a single *compiled* regular expression (from either the regex or re module). This 326 | will return an iterator over key, value pairs of keys that match the regex. 327 | """ 328 | 329 | self._build_offset_lookup() 330 | 331 | if self.mode == 'a' or self.mode == 'w': 332 | self.file.flush() 333 | 334 | if isinstance(label, original_regex_type): 335 | # Convert from re type since that's 336 | # more standard 337 | label = re.compile(label.pattern) 338 | 339 | if isinstance(label, regex_type): 340 | return self._regex_search(label) 341 | 342 | if isinstance(label, original_regex_type): 343 | label = re.compile(label.pattern) 344 | 345 | if isinstance(label, regex_type): 346 | return self._regex_search(label) 347 | 348 | if isinstance(label, MutableSequence): 349 | is_iterable = True 350 | else: 351 | is_iterable = False 352 | label = [label] 353 | 354 | vecs = [] 355 | # Will fail on any missing labels 356 | 357 | # Prefill and sort so that any block are done in disk-order. 358 | # This may make a big difference if you're on a tape drive! 
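        # An illustrative sketch of that idea (hedged; the code below differs in
        # detail): resolve each requested label to its byte offset, visit the
        # offsets in increasing order so the file is read in a single forward
        # sweep, then place each row back at the caller's requested position, e.g.
        #
        #     order = sorted(range(len(label)),
        #                    key=lambda i: self._offset_lookup[label[i]])
        #     for i in order:
        #         self.file.seek(self._offset_lookup[label[i]])
        #         vecs[i] = ...  # read one row at the current position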
359 | 360 | vecs = np.zeros((len(label), self.vector_size), ' 0) 702 | else: 703 | raise TypeError("Numpy array must be of type ' self.vector_size: 758 | warnings.warn( 759 | "WARNING: data has only {} columns but call requested top {}".format( 760 | self.vector_size, self.dims)) 761 | if self.dims == float("Inf") or self.dims == self.vector_size: 762 | self.dims = self.vector_size 763 | self.slice_and_dice = False 764 | else: 765 | self.slice_and_dice = True 766 | 767 | self.set_binary_len() 768 | 769 | self.remaining_words = min([self.vocab_size, self.max_rows]) 770 | 771 | def _check_if_half_precision(self): 772 | 773 | body_start = self.file.tell() 774 | word, weights = (self._read_row_name(), self._read_binary_row()) 775 | 776 | meanval = np.mean(np.abs(weights)) 777 | 778 | if meanval > 1e10: 779 | warning = "Average size is extremely large" + \ 780 | "did you mean to specify 'precision = half or precision = binary'?" 781 | warnings.warn(warning) 782 | 783 | def _read_row_name(self): 784 | buffer = [] 785 | while True: 786 | ch = self.file.read(1) 787 | if not ch and self.remaining_words > 0: 788 | print("Ran out of data with {} words left".format( 789 | self.remaining_words)) 790 | return 791 | if ch == b' ': 792 | break 793 | if ch != b'\n': 794 | # ignore newlines in front of words (some binary files have em) 795 | buffer.append(ch) 796 | try: 797 | word = b''.join(buffer).decode() 798 | except: 799 | print("Couldn't export:") 800 | print(buffer) 801 | raise 802 | return word 803 | 804 | 805 | def _build_offset_lookup(self, force=False, sep = None): 806 | if hasattr(self, "_offset_lookup") and not force and not sep: 807 | return 808 | if hasattr(self, "_prefix_lookup") and not force and sep: 809 | return 810 | 811 | if sep is not None: 812 | prefix_lookup = defaultdict(list) 813 | else: 814 | offset_lookup = {} 815 | 816 | self._preload_metadata() 817 | # Add warning for duplicate ids. 818 | i = 0 819 | while i < self.vocab_size: 820 | label = self._read_row_name() 821 | if sep is None and label in offset_lookup: 822 | warnings.warn( 823 | "Warning: this vector file has duplicate identifiers " + 824 | "(words) The last vector representation of each " + 825 | "identifier will be used, and earlier ones ignored.") 826 | if sep: 827 | key = label.split(sep, 1)[0] 828 | loc = self.file.tell() 829 | prefix_lookup[key].append((label, loc)) 830 | else: 831 | offset_lookup[label] = self.file.tell() 832 | # Skip to the next name without reading. 833 | self.file.seek(self.binary_len, 1) 834 | i += 1 835 | 836 | if self.offset_cache: 837 | # While building the full dict in memory then saving to cache should be quicker 838 | # (for prefix lookup), this defeats the primary value of the cache in avoid holding 839 | # huge objects in memory. An intermediate write when the dict is getting big 840 | # will be needed for scale. 
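            # One possible shape for that intermediate write (a hedged sketch only;
            # it is not implemented here): write into the SqliteDict inside the scan
            # loop above and commit every N rows, so no full dict is ever held in
            # memory:
            #
            #     cache[label] = self.file.tell()
            #     if i and i % 100_000 == 0:
            #         cache.commit()   # flush accumulated offsets to disk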
841 | if sep: 842 | self._prefix_lookup = SqliteDict(self.filename + '.prefix.db', 843 | autocommit=False, journal_mode ='OFF') 844 | for key, value in prefix_lookup.items(): 845 | self._prefix_lookup[key] = value 846 | self._prefix_lookup.commit() 847 | else: 848 | self._offset_lookup = SqliteDict(self.filename + '.offset.db', encode=int, decode=int, 849 | autocommit=False, journal_mode ='OFF') 850 | for key, value in offset_lookup.items(): 851 | self._offset_lookup[key] = value 852 | self._offset_lookup.commit() 853 | else: 854 | if sep: 855 | self._prefix_lookup = prefix_lookup 856 | else: 857 | self._offset_lookup = offset_lookup 858 | 859 | def sort(self, destination, sort = "names", safe = True, chunk_size = 2000): 860 | """ 861 | This method sorts a vector file by its keys without reading it into memory. 862 | 863 | It also cleans 864 | 865 | destination: A new file to be written. 866 | 867 | sort: one of 'names' (default sort by the filenames), 'random' 868 | (sort randomly), or 'none' (keep the current order) 869 | 870 | safe: whether to check for (and eliminate) duplicate keys and 871 | 872 | chunk_size: How many vectors to read into memory at a time. Larger numbers 873 | may improve performance, especially on hard drives, 874 | by keeping the disk head from moving around. 875 | """ 876 | 877 | self._build_offset_lookup() 878 | ks = list(self._offset_lookup.keys()) 879 | if sort == 'names': 880 | ks.sort() 881 | elif sort == 'random': 882 | random.shuffle(ks) 883 | elif sort == 'none': 884 | pass 885 | else: 886 | raise NotImplementedError("sort type must be one of [names, random, none]") 887 | # Chunk size matters because we can pull the vectors 888 | # from the disk in order within each chunk. 889 | 890 | last_written = None 891 | with Vector_file(destination, 892 | dims = self.dims, 893 | mode = "w", 894 | precision = self.precision) as output: 895 | for i in range(0, len(ks), chunk_size): 896 | keys = ks[i:(i + chunk_size)] 897 | for key, row in zip(keys, self[keys]): 898 | if safe: 899 | norm = np.linalg.norm(row) 900 | if np.isinf(norm) or np.isnan(norm) or norm == 0: 901 | continue 902 | if key == last_written: 903 | continue 904 | last_written = key 905 | output.add_row(key, row) 906 | 907 | def _regex_search(self, regex): 908 | 909 | self._build_offset_lookup() 910 | values = [(i, k) for k, i in self._offset_lookup.items() if re.search(regex, k)] 911 | # Sort to ensure values are returned in disk order. 912 | values.sort() 913 | for i, k in values: 914 | yield (k, self[k]) 915 | 916 | 917 | self._build_offset_lookup() 918 | values = [(i, k) for k, i in self._offset_lookup.items() if re.search(regex, k)] 919 | # Sort to ensure values are returned in disk order. 920 | values.sort() 921 | for i, k in values: 922 | yield (k, self[k]) 923 | 924 | def flush(self): 925 | """ 926 | Flushing requires rewriting the metadata at the head as well as flushing the file buffer. 927 | """ 928 | self.file.flush() 929 | self._rewrite_header() 930 | 931 | def __getitem__(self, label): 932 | """ 933 | Attributes can be accessed in three ways. 934 | 935 | 936 | With a string: this returns just the vector for that string. 937 | With a list of strings: this returns a multidimensional array for each query passed. 938 | If any of the requested items do not exist, this will fail. 939 | With a single *compiled* regular expression (from either the regex or re module). This 940 | will return an iterator over key, value pairs of keys that match the regex. 
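        An illustrative sketch of the three access modes (assuming `vf` is an
        already-open, read-mode instance of this class):

            one = vf["doc-1"]                         # a single vector
            block = vf[["doc-1", "doc-2"]]            # 2-D array, one row per key
            for key, vec in vf[re.compile("^doc-")]:  # iterate over regex matches
                print(key, vec[:5])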
941 | """ 942 | 943 | self._build_offset_lookup() 944 | 945 | if self.mode == 'a' or self.mode == 'w': 946 | self.file.flush() 947 | 948 | if isinstance(label, original_regex_type): 949 | # Convert from re type since that's 950 | # more standard 951 | label = re.compile(label.pattern) 952 | 953 | if isinstance(label, regex_type): 954 | return self._regex_search(label) 955 | 956 | if isinstance(label, original_regex_type): 957 | label = re.compile(label.pattern) 958 | 959 | if isinstance(label, regex_type): 960 | return self._regex_search(label) 961 | 962 | if isinstance(label, MutableSequence): 963 | is_iterable = True 964 | else: 965 | is_iterable = False 966 | label = [label] 967 | 968 | vecs = [] 969 | # Will fail on any missing labels 970 | 971 | # Prefill and sort so that any block are done in disk-order. 972 | # This may make a big difference if you're on a tape drive! 973 | 974 | vecs = np.zeros((len(label), self.vector_size), '= 2 and what != \"train\":\n", 171 | " continue\n", 172 | " if id in lookup:\n", 173 | " cat = lookup[id]\n", 174 | " # Normalize vectors to unit length.\n", 175 | " row = row/np.linalg.norm(row.astype('
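The notebook cell above normalizes each SRP row to unit length before it is handed to the classifier. A minimal, self-contained sketch of that step (the variable names and the `<f4` target dtype are assumptions for illustration, not taken from the notebook):

```python
import numpy as np

def unit_normalize(rows):
    """L2-normalize each row so that dot products between rows are cosine similarities."""
    rows = np.asarray(rows, dtype="<f4")                # promote e.g. half-precision input
    norms = np.linalg.norm(rows, axis=1, keepdims=True)
    norms[norms == 0] = 1.0                             # leave any all-zero rows unchanged
    return rows / norms

# e.g. a (n_documents, 640) block of SRP features
vectors = np.random.randn(3, 640).astype("<f4")
normalized = unit_normalize(vectors)
print(np.allclose(np.linalg.norm(normalized, axis=1), 1.0))   # True
```

Normalizing first makes downstream dot products behave like cosine similarities, which is the usual way SRP vectors are compared.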