├── .gitignore
├── .travis.yml
├── LICENSE.md
├── README.md
├── checkstyle.xml
├── lsh-minhash.png
├── lsh-superbit.png
├── pom.xml
└── src
├── main
└── java
│ └── info
│ └── debatty
│ └── java
│ └── lsh
│ ├── LSH.java
│ ├── LSHMinHash.java
│ ├── LSHSuperBit.java
│ ├── MinHash.java
│ ├── SuperBit.java
│ └── examples
│ ├── InitialSeed.java
│ ├── LSHMinHashExample.java
│ ├── LSHSuperBitExample.java
│ ├── MinHashExample.java
│ ├── SerializeExample.java
│ ├── SimpleLSHMinHashExample.java
│ └── SuperBitExample.java
└── test
└── java
└── info
└── debatty
└── java
└── lsh
├── LSHMinHashTest.java
├── MinHashTest.java
└── SuperBitTest.java
/.gitignore:
--------------------------------------------------------------------------------
1 | *.class
2 |
3 | # Mobile Tools for Java (J2ME)
4 | .mtj.tmp/
5 |
6 | # Package Files #
7 | *.jar
8 | *.war
9 | *.ear
10 |
11 | # virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml
12 | hs_err_pid*
13 | /nbproject/private/
14 | /dist/
15 | /build/
16 | /target/
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | language: java
2 | install: mvn install -DskipTests=true -Dmaven.javadoc.skip=true -Dgpg.skip=true
3 |
--------------------------------------------------------------------------------
/LICENSE.md:
--------------------------------------------------------------------------------
1 | # License
2 |
3 | This project is licensed under the terms of the **MIT license**.
4 |
5 | https://opensource.org/licenses/MIT
6 |
7 | > Copyright 2015 Thibault Debatty.
8 | >
9 | > Permission is hereby granted, free of charge, to any person obtaining
10 | > a copy of this software and associated documentation files (the
11 | > "Software"), to deal in the Software without restriction, including
12 | > without limitation the rights to use, copy, modify, merge, publish,
13 | > distribute, sublicense, and/or sell copies of the Software, and to
14 | > permit persons to whom the Software is furnished to do so, subject to
15 | > the following conditions:
16 | >
17 | > The above copyright notice and this permission notice shall be
18 | > included in all copies or substantial portions of the Software.
19 | >
20 | > THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
21 | > EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
22 | > MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
23 | > NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
24 | > LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
25 | > OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
26 | > WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # java-LSH
2 | [](https://maven-badges.herokuapp.com/maven-central/info.debatty/java-lsh) [](https://travis-ci.org/tdebatty/java-LSH) [](http://www.javadoc.io/doc/info.debatty/java-lsh)
3 |
4 |
5 | A Java implementation of Locality Sensitive Hashing (LSH).
6 |
7 | * [Download](#download)
8 | * [MinHash](#minhash)
9 | * [Super-Bit](#super-bit)
10 | * [Comparable signatures](#comparable-signatures)
11 | * [Initial seed](#initial-seed)
12 | * [Serialization](#serialization)
13 |
14 |
15 | Locality Sensitive Hashing (LSH) is a family of hashing methods that tend to produce the same hash (or signature) for similar items. There exist different LSH functions, each corresponding to a similarity metric. For example, the MinHash algorithm is designed for Jaccard similarity (the relative number of elements that two sets have in common). For cosine similarity, the traditional LSH algorithm used is Random Projection, but others exist, like Super-Bit, that deliver better results.
16 |
17 | LSH functions have two main use cases:
18 | * Compute the signature of large input vectors. These signatures can be used to quickly estimate the similarity between vectors.
19 | * With a given number of buckets, bin similar vectors together.
20 |
21 | This library implements Locality Sensitive Hashing (LSH), as described in Leskovec, Rajaraman & Ullman (2014), "Mining of Massive Datasets", Cambridge University Press.
22 |
23 | Currently implemented:
24 | * MinHash algorithm for Jaccard index;
25 | * Super-Bit algorithm for cosine similarity.
26 |
27 | The coefficients of the hashing functions are randomly chosen when the LSH object is instantiated. You can thus only compare signatures or bucket binning generated by the same LSH object. To reuse your LSH object between executions, you have to serialize it and save it to a file (see below the [example of LSH object serialization](https://github.com/tdebatty/java-LSH#serialization)).
28 |
29 | ## Download
30 |
31 | Using maven:
32 | ```
33 | <dependency>
34 |     <groupId>info.debatty</groupId>
35 |     <artifactId>java-lsh</artifactId>
36 |     <version>RELEASE</version>
37 | </dependency>
38 | ```
39 |
40 | Or see the [releases](https://github.com/tdebatty/java-LSH/releases) page.
41 |
42 | ## MinHash
43 |
44 | MinHash is a hashing scheme that tends to produce similar signatures for sets that have a high Jaccard similarity.
45 |
46 | The Jaccard similarity between two sets is the relative number of elements these sets have in common: J(A, B) = |A ∩ B| / |A ∪ B|. A MinHash signature is a sequence of numbers produced by multiple hash functions h_i. It can be shown that the Jaccard similarity between two sets is also the probability that this hash result is the same for the two sets: J(A, B) = Pr[h_i(A) = h_i(B)]. Therefore, MinHash signatures can be used to estimate Jaccard similarity between two sets. Moreover, it can be shown that the expected estimation error is O(1 / sqrt(n)), where n is the size of the signature (the number of hash functions that are used to produce the signature).
47 |
48 | ### Binning
49 |
50 | ```java
51 | import info.debatty.java.lsh.LSHMinHash;
52 | import java.util.Random;
53 |
54 | public class SimpleLSHMinHashExample {
55 |
56 | public static void main(String[] args) {
57 | // proportion of 0's in the vectors
58 | // if the vectors are dense (lots of 1's), the average jaccard similarity
59 | // will be very high (especially for large vectors), and LSH
60 | // won't be able to distinguish them
61 | // as a result, all vectors will be binned in the same bucket...
62 | double sparsity = 0.75;
63 |
64 | // Number of sets
65 | int count = 10000;
66 |
67 | // Size of vectors
68 | int n = 100;
69 |
70 | // LSH parameters
71 | // the number of stages is also sometimes called thge number of bands
72 | int stages = 2;
73 |
74 | // Attention: to get relevant results, the number of elements per bucket
75 | // should be at least 100
76 | int buckets = 10;
77 |
78 | // Let's generate some random sets
79 | boolean[][] vectors = new boolean[count][n];
80 | Random rand = new Random();
81 |
82 | for (int i = 0; i < count; i++) {
83 | for (int j = 0; j < n; j++) {
84 | vectors[i][j] = rand.nextDouble() > sparsity;
85 | }
86 | }
87 |
88 | // Create and configure LSH algorithm
89 | LSHMinHash lsh = new LSHMinHash(stages, buckets, n);
90 |
91 | int[][] counts = new int[stages][buckets];
92 |
93 | // Perform hashing
94 | for (boolean[] vector : vectors) {
95 | int[] hash = lsh.hash(vector);
96 |
97 | for (int i = 0; i < hash.length; i++) {
98 | counts[i][hash[i]]++;
99 | }
100 |
101 | print(vector);
102 | System.out.print(" : ");
103 | print(hash);
104 | System.out.print("\n");
105 | }
106 |
107 | System.out.println("Number of elements per bucket at each stage:");
108 | for (int i = 0; i < stages; i++) {
109 | print(counts[i]);
110 | System.out.print("\n");
111 | }
112 | }
113 |
114 | static void print(int[] array) {
115 | System.out.print("[");
116 | for (int v : array) {
117 | System.out.print("" + v + " ");
118 | }
119 | System.out.print("]");
120 | }
121 |
122 | static void print(boolean[] array) {
123 | System.out.print("[");
124 | for (boolean v : array) {
125 | System.out.print(v ? "1" : "0");
126 | }
127 | System.out.print("]");
128 | }
129 | }
130 | ```
131 |
132 | Pay attention, LSH using MinHash is very sensitive to the average Jaccard similarity in your dataset! If most vectors in your dataset have a Jaccard similarity above or below 0.5, they might all fall in the same bucket. This is illustrated by the example below:
133 |
134 | ```java
135 | import info.debatty.java.lsh.LSHMinHash;
136 | import info.debatty.java.lsh.MinHash;
137 | import java.util.Random;
138 |
139 | public class LSHMinHashExample {
140 |
141 | public static void main(String[] args) {
142 | // Number of sets
143 | int count = 2000;
144 |
145 | // Size of dictionary
146 | int n = 100;
147 |
148 | // Number of buckets
149 | // Attention: to get relevant results, the number of elements per bucket
150 | // should be at least 100
151 | int buckets = 10;
152 |
153 | // Let's generate some random sets
154 | boolean[][] vectors = new boolean[count][];
155 | Random r = new Random();
156 |
157 | // To get some interesting measures, we first generate a single
158 | // sparse random vector
159 | vectors[0] = new boolean[n];
160 | for (int j = 0; j < n; j++) {
161 | vectors[0][j] = (r.nextInt(10) == 0);
162 | }
163 |
164 | // Then we generate the other vectors, which have a reasonable chance
165 | // to look like the first one...
166 | for (int i = 1; i < count; i++) {
167 | vectors[i] = new boolean[n];
168 |
169 | for (int j = 0; j < n; j++) {
170 | vectors[i][j] = (r.nextDouble() <= 0.7 ? vectors[0][j] : (r.nextInt(10) == 0));
171 | }
172 | }
173 |
174 | // Now we can proceed to LSH binning
175 | // We will test multiple stages
176 | for (int stages = 1; stages <= 10; stages++) {
177 |
178 | // Compute the LSH hash of each vector
179 | LSHMinHash lsh = new LSHMinHash(stages, buckets, n);
180 | int[][] hashes = new int[count][];
181 | for (int i = 0; i < count; i++) {
182 | boolean[] vector = vectors[i];
183 | hashes[i] = lsh.hash(vector);
184 | }
185 |
186 | // We now have the LSH hash for each input set
187 | // Let's have a look at how similar sets (according to Jaccard
188 | // index) were binned...
189 | int[][] results = new int[11][2];
190 | for (int i = 0; i < vectors.length; i++) {
191 | boolean[] vector1 = vectors[i];
192 | int[] hash1 = hashes[i];
193 |
194 | for (int j = 0; j < i; j++) {
195 | boolean[] vector2 = vectors[j];
196 | int[] hash2 = hashes[j];
197 |
198 | // We compute the similarity between each pair of sets
199 | double similarity = MinHash.jaccardIndex(vector1, vector2);
200 |
201 | // We count the number of pairs with similarity 0.1, 0.2,
202 | // 0.3, etc.
203 | results[(int) (10 * similarity)][0]++;
204 |
205 | // Do they fall in the same bucket for one of the stages?
206 | for (int stage = 0; stage < stages; stage++) {
207 | if (hash1[stage] == hash2[stage]) {
208 | results[(int) (10 * similarity)][1]++;
209 | break;
210 | }
211 | }
212 | }
213 | }
214 |
215 | // Now we can display (and plot in Gnuplot) the result:
216 | // For pairs that have a similarity x, the probability of falling
217 | // in the same bucket for at least one of the stages is y
218 | for (int i = 0; i < results.length; i++) {
219 | double similarity = (double) i / 10;
220 |
221 | double probability = 0;
222 | if (results[i][0] != 0) {
223 | probability = (double) results[i][1] / results[i][0];
224 | }
225 | System.out.println("" + similarity + "\t" + probability + "\t" + stages);
226 | }
227 |
228 | // Separate the series for Gnuplot...
229 | System.out.print("\n");
230 | }
231 | }
232 | }
233 | ```
234 |
235 | This example will run LSH binning for different numbers of stages. At each step, for each value of Jaccard similarity between pairs of sets (in the range [0, 0.1, 0.2, ... 1.0]), the program computes the probability that the two sets of a pair fall in the same bucket for at least one stage. The results can be plotted with Gnuplot for example:
236 |
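237 | ![LSH with MinHash: probability of sharing a bucket as a function of Jaccard similarity](lsh-minhash.png)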
238 |
239 | On this figure, the x-axis is the Jaccard similarity between sets, the y-axis is the probability that these pairs fall in the same bucket for at least one stage. The different series represent different values for the number of stages (from 1 to 10).
240 |
241 | We can clearly recognize the typical S curve of MinHash, with the threshold (the point where the curve is the steepest) located around x = 0.5.
242 |
243 | This curve is very important! It shows that if all your sets are similar (similarity above 0.6), all sets will most probably fall in a single bucket. And all other buckets will thus most probably be empty. This can happen for example if your dataset is skewed and presents some sort of principal direction.
244 |
245 | At the opposite, if your sets are all different from each other (similarity below 0.2), the curve is nearly flat. This means that pairs of sets have the same probability of falling in the same bucket, independently of their similarity. The items are then randomly binned into the buckets. If using B buckets and S stages, computing the probability that two items are binned in the same bucket is similar to the problem of rolling S times a die with B values. The resulting probability is 1 - [(B-1) / B]^S. The computed probability for 10 buckets is presented in the table below, and roughly corresponds to the above graph.
246 |
247 |
248 | | Stages | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 |
249 | |--------|--------|--------|--------|--------|--------|--------|--------|--------|--------|----|
250 | | Pr | 0.1 | 0.19 | 0.27 | 0.34 | 0.41 | 0.47 | 0.52 | 0.57 | 0.61 | 0.65 |
251 |
252 | ### Signatures
253 |
254 | If you simply wish to compute MinHash signatures (without performing LSH binning), you can directly use the MinHash class:
255 |
256 | ```java
257 | import info.debatty.java.lsh.MinHash;
258 | import java.util.TreeSet;
259 |
260 | public class MinHashExample {
261 |
262 | public static void main(String[] args) {
263 | // Initialize the hash function for an similarity error of 0.1
264 | // For sets built from a dictionary of 5 items
265 | MinHash minhash = new MinHash(0.1, 5);
266 |
267 | // Sets can be defined as an vector of booleans:
268 | // [1 0 0 1 0]
269 | boolean[] vector1 = {true, false, false, true, false};
270 | int[] sig1 = minhash.signature(vector1);
271 |
272 | // Or as a set of integers:
273 | // set2 = [1 0 1 1 0]
274 | TreeSet set2 = new TreeSet();
275 | set2.add(0);
276 | set2.add(2);
277 | set2.add(3);
278 | int[] sig2 = minhash.signature(set2);
279 |
280 | System.out.println("Signature similarity: " + minhash.similarity(sig1, sig2));
281 | System.out.println("Real similarity (Jaccard index)" +
282 | MinHash.JaccardIndex(MinHash.Convert2Set(vector1), set2));
283 | }
284 | }
285 | ```
286 |
287 | Which will produce:
288 |
289 | ```
290 | Signature similarity: 0.6767676767676768
291 | Real similarity (Jaccard index)0.6666666666666666
292 | ```
293 |
294 | [Read Javadoc...](http://www.javadoc.io/doc/info.debatty/java-lsh)
295 |
296 | ## Super-Bit
297 |
298 | Super-Bit is an improvement of Random Projection LSH. It computes an estimation of cosine similarity. In Super-Bit, the K random vectors are orthogonalized in L batches of N vectors, where
299 | * N is called the Super-Bit depth
300 | * L is called the number of Super-Bits
301 | * K = L * N is the code length (the size of the signature)
302 |
303 | Super-Bit Locality-Sensitive Hashing, Jianqiu Ji, Jianmin Li, Shuicheng Yan, Bo Zhang, Qi Tian
304 | http://papers.nips.cc/paper/4847-super-bit-locality-sensitive-hashing.pdf
305 | Published in Advances in Neural Information Processing Systems 25, 2012
306 |
307 | The cosine similarity between two vectors in R^n is the cosine of their angle. It is computed as v1 . v2 / (|v1| * |v2|).
308 | Two vectors with the same orientation have a Cosine similarity of 1, two vectors at 90° have a similarity of 0, and two vectors diametrically opposed have a similarity of -1, independent of their magnitude.
309 |
310 | Here is an example of how to quickly bin together vectors that have a high cosine similarity using LSH + Super-Bit:
311 |
312 | ```java
313 | import info.debatty.java.lsh.LSHSuperBit;
314 | import java.util.Random;
315 |
316 | public class LSHSuperBitExample {
317 |
318 | public static void main(String[] args) {
319 | int count = 100;
320 |
321 | // R^n
322 | int n = 3;
323 |
324 | int stages = 2;
325 | int buckets = 4;
326 |
327 | // Produce some vectors in R^n
328 | Random r = new Random();
329 | double[][] vectors = new double[count][];
330 | for (int i = 0; i < count; i++) {
331 | vectors[i] = new double[n];
332 |
333 | for (int j = 0; j < n; j++) {
334 | vectors[i][j] = r.nextGaussian();
335 | }
336 | }
337 |
338 | LSHSuperBit lsh = new LSHSuperBit(stages, buckets, n);
339 |
340 | // Compute a SuperBit signature, and a LSH hash
341 | for (int i = 0; i < count; i++) {
342 | double[] vector = vectors[i];
343 | int[] hash = lsh.hash(vector);
344 | for (double v : vector) {
345 | System.out.printf("%6.2f\t", v);
346 | }
347 | System.out.print(hash[0]);
348 | System.out.print("\n");
349 | }
350 | }
351 | }
352 | ```
353 |
354 | This will produce something like the following, where the last column is the bucket in which this vector was binned (at first stage):
355 |
356 | ```
357 | -0.48 -0.68 1.87 1
358 | 0.77 0.11 2.20 1
359 | -0.05 0.23 -1.12 2
360 | 1.30 0.02 1.44 3
361 | -0.34 -1.51 0.78 3
362 | 1.64 0.02 0.84 3
363 | -0.74 1.58 -0.79 0
364 | -0.17 -1.27 -1.25 2
365 | ...
366 | ```
367 |
368 | This can be plotted with Gnuplot for example:
369 |
370 | ![LSH with Super-Bit: vectors coloured by the bucket they were binned in](lsh-superbit.png)
371 |
372 | If you only wish to compute super-bit signatures of vectors (without performing LSH binning), you can directly use the SuperBit class:
373 | ```java
374 | import info.debatty.java.lsh.SuperBit;
375 |
376 | public class MyApp {
377 |
378 | public static void main(String[] args) {
379 |
380 | int n = 10;
381 |
382 | // Initialize Super-Bit
383 | SuperBit sb = new SuperBit(n);
384 |
385 | Random rand = new Random();
386 | double[] v1 = new double[n];
387 | double[] v2 = new double[n];
388 | for (int i = 0; i < n; i++) {
389 | v1[i] = rand.nextInt();
390 | v2[i] = rand.nextInt();
391 | }
392 |
393 | boolean[] sig1 = sb.signature(v1);
394 | boolean[] sig2 = sb.signature(v2);
395 |
396 | System.out.println("Signature (estimated) similarity: " + sb.similarity(sig1, sig2));
397 | System.out.println("Real (cosine) similarity: " + cosineSimilarity(v1, v2));
398 | }
399 | ```
400 |
401 | [Read Javadoc...](http://www.javadoc.io/doc/info.debatty/java-lsh)
402 |
403 | ## Comparable signatures
404 |
405 |
406 | As the parameters of the hashing function are randomly initialized when the LSH object is instantiated:
407 | * two LSH objects will produce different hashes and signatures for the same input vector;
408 | * two executions of your program will produce different hashes and signatures for the same input vector;
409 | * the signatures produced by two different LSH objects can not be used to estimate the similarity between vectors.
410 |
411 | There are two possibilities to produce comparable signatures: provide an initial seed or serialize your hash object.
412 |
413 | ### Initial seed
414 |
415 | ```java
416 | import info.debatty.java.lsh.MinHash;
417 | import java.util.Random;
418 |
419 | public class InitialSeed {
420 |
421 | public static void main(String[] args) {
422 |
423 | // Initialize two minhash objects, with the same seed
424 | int signature_size = 20;
425 | int dictionary_size = 100;
426 | long initial_seed = 123456;
427 |
428 | MinHash mh = new MinHash(signature_size, dictionary_size, initial_seed);
429 | MinHash mh2 = new MinHash(signature_size, dictionary_size, initial_seed);
430 |
431 | // Create a single vector of size dictionary_size
432 | Random r = new Random();
433 | boolean[] vector = new boolean[dictionary_size];
434 | for (int i = 0; i < dictionary_size; i++) {
435 | vector[i] = r.nextBoolean();
436 | }
437 |
438 | // The two minhash objects will produce the same signature
439 | println(mh.signature(vector));
440 | println(mh2.signature(vector));
441 | }
442 |
443 | static void println(final int[] array) {
444 | System.out.print("[");
445 | for (int v : array) {
446 | System.out.print("" + v + " ");
447 | }
448 | System.out.println("]");
449 | }
450 | }
451 | ```
452 |
453 | Will output:
454 |
455 | ```
456 | [0 0 1 1 3 3 0 1 0 2 0 0 9 1 0 0 0 1 7 0 ]
457 | [0 0 1 1 3 3 0 1 0 2 0 0 9 1 0 0 0 1 7 0 ]
458 | ```
459 |
460 | ### Serialization
461 |
462 | ```java
463 | import info.debatty.java.lsh.LSHMinHash;
464 | import java.io.File;
465 | import java.io.FileInputStream;
466 | import java.io.FileOutputStream;
467 | import java.io.IOException;
468 | import java.io.ObjectInputStream;
469 | import java.io.ObjectOutputStream;
470 | import java.util.Random;
471 |
472 | public class SerializeExample {
473 |
474 | public static void main(String[] args)
475 | throws IOException, ClassNotFoundException {
476 |
477 | // Create a single random boolean vector
478 | int n = 100;
479 | double sparsity = 0.75;
480 | boolean[] vector = new boolean[n];
481 | Random rand = new Random();
482 | for (int j = 0; j < n; j++) {
483 | vector[j] = rand.nextDouble() > sparsity;
484 | }
485 |
486 | // Create and configure LSH
487 | int stages = 2;
488 | int buckets = 10;
489 | LSHMinHash lsh = new LSHMinHash(stages, buckets, n);
490 | println(lsh.hash(vector));
491 |
492 | // Create another LSH object
493 | // as the parameters of the hashing function are randomly initialized
494 | // these two LSH objects will produce different hashes for the same
495 | // input vector!
496 | LSHMinHash other_lsh = new LSHMinHash(stages, buckets, n);
497 | println(other_lsh.hash(vector));
498 |
499 | // Moreover, signatures produced by different LSH objects cannot
500 | // be used to compute estimated similarity!
501 | // The solution is to serialize and save the object, so it can be
502 | // reused later...
503 | File tempfile = File.createTempFile("lshobject", ".ser");
504 | FileOutputStream fout = new FileOutputStream(tempfile);
505 | ObjectOutputStream oos = new ObjectOutputStream(fout);
506 | oos.writeObject(lsh);
507 | oos.close();
508 | System.out.println(
509 | "LSH object serialized to " + tempfile.getAbsolutePath());
510 |
511 | FileInputStream fin = new FileInputStream(tempfile);
512 | ObjectInputStream ois = new ObjectInputStream(fin);
513 | LSHMinHash saved_lsh = (LSHMinHash) ois.readObject();
514 | println(saved_lsh.hash(vector));
515 | }
516 |
517 | static void println(int[] array) {
518 | System.out.print("[");
519 | for (int v : array) {
520 | System.out.print("" + v + " ");
521 | }
522 | System.out.println("]");
523 | }
524 | }
525 | ```
526 |
527 | Will produce something like:
528 | ```
529 | [5 5 ]
530 | [3 1 ]
531 | LSH object serialized to /tmp/lshobject5903174677942358274.ser
532 | [5 5 ]
533 | ```
534 |
535 | [Check the examples](https://github.com/tdebatty/java-LSH/tree/master/src/main/java/info/debatty/java/lsh/examples) or [read Javadoc](http://www.javadoc.io/doc/info.debatty/java-lsh)
536 |
--------------------------------------------------------------------------------
/checkstyle.xml:
--------------------------------------------------------------------------------
1 |
2 |
5 |
6 |
35 |
36 |
37 |
44 |
45 |
46 |
47 |
50 |
51 |
52 |
53 |
54 |
55 |
56 |
57 |
58 |
59 |
60 |
61 |
62 |
63 |
64 |
65 |
66 |
67 |
68 |
69 |
70 |
71 |
72 |
73 |
74 |
75 |
76 |
77 |
78 |
79 |
80 |
81 |
82 |
83 |
84 |
85 |
86 |
87 |
88 |
89 |
90 |
91 |
92 |
93 |
94 |
95 |
96 |
97 |
98 |
99 |
100 |
101 |
102 |
103 |
104 |
105 |
106 |
107 |
108 |
109 |
110 |
111 |
112 |
113 |
114 |
115 |
116 |
117 |
120 |
121 |
122 |
123 |
124 |
125 |
126 |
127 |
128 |
129 |
130 |
131 |
132 |
133 |
134 |
135 |
136 |
137 |
138 |
139 |
140 |
141 |
142 |
143 |
144 |
145 |
146 |
147 |
148 |
149 |
150 |
151 |
152 |
153 |
154 |
155 |
156 |
157 |
158 |
159 |
160 |
161 |
162 |
163 |
164 |
165 |
166 |
167 |
168 |
169 |
170 |
171 |
172 |
173 |
174 |
175 |
176 |
177 |
178 |
179 |
180 |
181 |
182 |
183 |
184 |
185 |
186 |
187 |
188 |
189 |
190 |
191 |
192 |
193 |
194 |
195 |
196 |
197 |
198 |
199 |
200 |
201 |
202 |
203 |
204 |
205 |
206 |
207 |
208 |
209 |
210 |
211 |
212 |
213 |
214 |
215 |
216 |
217 |
218 |
219 |
220 |
221 |
222 |
223 |
224 |
225 |
226 |
227 |
228 |
229 |
--------------------------------------------------------------------------------
/lsh-minhash.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tdebatty/java-LSH/423fc19894f464968a0968bda7869d8532ffd3c0/lsh-minhash.png
--------------------------------------------------------------------------------
/lsh-superbit.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tdebatty/java-LSH/423fc19894f464968a0968bda7869d8532ffd3c0/lsh-superbit.png
--------------------------------------------------------------------------------
/pom.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 | 4.0.0
6 | info.debatty
7 | java-lsh
8 | 0.13-SNAPSHOT
9 | jar
10 |
11 | ${project.artifactId}
12 | https://github.com/tdebatty/java-LSH
13 | A Java implementation of Locality Sensitive Hashing (LSH)
14 |
15 |
16 | UTF-8
17 |
18 |
19 |
20 |
21 | MIT License
22 | http://www.opensource.org/licenses/mit-license.php
23 |
24 |
25 |
26 |
27 |
28 | Thibault Debatty
29 | thibault@debatty.info
30 | debatty.info
31 | http://debatty.info
32 |
33 |
34 |
35 |
36 | scm:git:git@github.com:tdebatty/java-LSH.git
37 | scm:git:git@github.com:tdebatty/java-LSH.git
38 | git@github.com:tdebatty/java-LSH.git
39 | HEAD
40 |
41 |
42 |
43 |
44 | ossrh
45 | https://oss.sonatype.org/content/repositories/snapshots
46 |
47 |
48 | ossrh
49 | https://oss.sonatype.org/service/local/staging/deploy/maven2/
50 |
51 |
52 |
53 |
54 |
55 |
56 | org.sonatype.plugins
57 | nexus-staging-maven-plugin
58 | 1.6.3
59 | true
60 |
61 | ossrh
62 | https://oss.sonatype.org/
63 | true
64 |
65 |
66 |
67 |
68 | org.apache.maven.plugins
69 | maven-source-plugin
70 | 2.2.1
71 |
72 |
73 | attach-sources
74 |
75 | jar-no-fork
76 |
77 |
78 |
79 |
80 |
81 |
82 | org.apache.maven.plugins
83 | maven-javadoc-plugin
84 | 2.9.1
85 |
86 |
87 | attach-javadocs
88 |
89 | jar
90 |
91 |
92 |
93 |
94 |
95 |
96 | org.apache.maven.plugins
97 | maven-gpg-plugin
98 | 1.5
99 |
100 |
101 | sign-artifacts
102 | verify
103 |
104 | sign
105 |
106 |
107 |
108 |
109 |
110 | org.apache.maven.plugins
111 | maven-compiler-plugin
112 | 2.3.2
113 |
114 | 1.5
115 | 1.5
116 |
117 |
118 |
119 |
120 |
121 | org.apache.maven.plugins
122 | maven-release-plugin
123 | 2.5.1
124 |
125 | v@{project.version}
126 |
127 |
128 |
129 |
130 |
131 | org.apache.maven.plugins
132 | maven-checkstyle-plugin
133 | 2.17
134 |
135 | checkstyle.xml
136 | **\/examples\/*.java
137 | false
138 |
139 |
140 |
141 |
142 | test
143 | test
144 |
145 | true
146 | true
147 |
148 |
149 | check
150 |
151 |
152 |
153 |
154 |
155 |
156 |
157 |
158 |
159 | junit
160 | junit
161 | 4.10
162 | test
163 |
164 |
165 |
166 |
167 |
168 |
--------------------------------------------------------------------------------
/src/main/java/info/debatty/java/lsh/LSH.java:
--------------------------------------------------------------------------------
1 | package info.debatty.java.lsh;
2 |
3 | import java.io.Serializable;
4 |
/**
 * Implementation of Locality Sensitive Hashing (LSH) principle, as described in
 * Leskovec, Rajaraman &amp; Ullman (2014), "Mining of Massive Datasets",
 * Cambridge University Press.
 *
 * <p>This base class only implements the banding step: a signature is cut
 * into {@code stages} consecutive groups of rows, and each group is reduced
 * to a bucket index. How the signature itself is produced is left to the
 * subclasses.
 *
 * <p>The number of stages and buckets are stored as provided; they are not
 * validated here and are expected to be strictly positive.
 *
 * @author Thibault Debatty http://www.debatty.info
 */
public abstract class LSH implements Serializable {

    protected static final long LARGE_PRIME = 433494437;
    private static final int DEFAULT_STAGES = 3;
    private static final int DEFAULT_BUCKETS = 10;

    private int stages = DEFAULT_STAGES;
    private int buckets = DEFAULT_BUCKETS;

    /**
     * Instantiates a LSH instance with s stages (or bands) and b buckets (per
     * stage).
     *
     * @param stages stages
     * @param buckets buckets (per stage)
     */
    public LSH(final int stages, final int buckets) {
        this.stages = stages;
        this.buckets = buckets;
    }

    /**
     * Instantiate an empty LSH instance (useful only for serialization).
     * The default number of stages and buckets are used.
     */
    public LSH() {

    }

    /**
     * Hash a signature.
     * The signature is divided in s stages (or bands). Each stage is hashed to
     * one of the b buckets. If the signature length is not a multiple of s,
     * the trailing values are folded into the last stage. If the signature
     * has fewer values than s, each value gets its own stage and the remaining
     * stages hash to bucket 0.
     *
     * @param signature the signature to hash (values may be negative)
     * @return An vector of s integers (between 0 and b-1)
     */
    public final int[] hashSignature(final int[] signature) {

        // One accumulator per stage, implicitly initialized to 0
        int[] hash = new int[stages];

        int rows = rowsPerStage(signature.length);

        for (int i = 0; i < signature.length; i++) {
            int stage = Math.min(i / rows, stages - 1);

            // The product is computed as a long to avoid int overflow
            long remainder =
                    (hash[stage] + (long) signature[i] * LARGE_PRIME)
                    % buckets;

            // Java's % keeps the sign of the dividend: shift negative
            // remainders back into [0, b-1] so the result is a valid bucket
            if (remainder < 0) {
                remainder += Math.abs((long) buckets);
            }

            hash[stage] = (int) remainder;
        }

        return hash;
    }

    /**
     * Hash a signature.
     * The signature is divided in s stages (or bands). Each stage is hashed to
     * one of the b buckets. If the signature length is not a multiple of s,
     * the trailing values are folded into the last stage. If the signature
     * has fewer values than s, each value gets its own stage and the remaining
     * stages hash to bucket 0.
     *
     * @param signature the signature to hash
     * @return An vector of s integers (between 0 and b-1)
     */
    public final int[] hashSignature(final boolean[] signature) {

        // One accumulator per stage, implicitly initialized to 0
        long[] acc = new long[stages];

        int rows = rowsPerStage(signature.length);

        for (int i = 0; i < signature.length; i++) {
            // A set bit contributes a value that depends on its position,
            // an unset bit contributes nothing
            long v = 0;
            if (signature[i]) {
                v = (i + 1) * LARGE_PRIME;
            }

            // current stage
            int j = Math.min(i / rows, stages - 1);
            acc[j] = (acc[j] + v) % Integer.MAX_VALUE;
        }

        int[] r = new int[stages];
        for (int i = 0; i < stages; i++) {
            r[i] = (int) (acc[i] % buckets);
        }

        return r;
    }

    /**
     * Compute the number of signature rows assigned to each stage.
     * The result is at least 1, so that a signature shorter than the number
     * of stages does not cause a division by zero when rows are mapped to
     * stages.
     *
     * @param length the signature length
     * @return the number of rows per stage (at least 1)
     */
    private int rowsPerStage(final int length) {
        return Math.max(1, length / stages);
    }
}
103 |
--------------------------------------------------------------------------------
/src/main/java/info/debatty/java/lsh/LSHMinHash.java:
--------------------------------------------------------------------------------
1 | /*
2 | * The MIT License
3 | *
4 | * Copyright 2015 Thibault Debatty.
5 | *
6 | * Permission is hereby granted, free of charge, to any person obtaining a copy
7 | * of this software and associated documentation files (the "Software"), to deal
8 | * in the Software without restriction, including without limitation the rights
9 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 | * copies of the Software, and to permit persons to whom the Software is
11 | * furnished to do so, subject to the following conditions:
12 | *
13 | * The above copyright notice and this permission notice shall be included in
14 | * all copies or substantial portions of the Software.
15 | *
16 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22 | * THE SOFTWARE.
23 | */
24 |
25 | package info.debatty.java.lsh;
26 |
27 | /**
28 | *
29 | * @author Thibault Debatty
30 | */
31 | public class LSHMinHash extends LSH {
32 | private final MinHash mh;
33 | private static final double THRESHOLD = 0.5;
34 |
35 | /**
36 | * Instantiates a LSH instance that internally uses MinHash,
37 | * with s stages (or bands) and b buckets (per stage), for sets out of a
38 | * dictionary of n elements.
39 | *
40 | * Attention: the number of buckets should be chosen such that we have at
41 | * least 100 items per bucket.
42 | *
43 | * @param s stages
44 | * @param b buckets (per stage)
45 | * @param n dictionary size
46 | */
47 | public LSHMinHash(final int s, final int b, final int n) {
48 | super(s, b);
49 | int signature_size = computeSignatureSize(s, n);
50 | this.mh = new MinHash(signature_size, n);
51 | }
52 |
53 | /**
54 | * Instantiates a LSH instance that internally uses MinHash,
55 | * with s stages (or bands) and b buckets (per stage), for sets out of a
56 | * dictionary of n elements.
57 | *
58 | * Attention: the number of buckets should be chosen such that we have at
59 | * least 100 items per bucket.
60 | *
61 | * @param s stages
62 | * @param b buckets (per stage)
63 | * @param n dictionary size
64 | * @param seed random number generator seed. using the same value will
65 | * guarantee identical hashes across object instantiations
66 | */
67 | public LSHMinHash(final int s, final int b, final int n, final long seed) {
68 | super(s, b);
69 | int signature_size = computeSignatureSize(s, n);
70 | this.mh = new MinHash(signature_size, n, seed);
71 | }
72 |
73 | /**
74 | * Compute the size of the signature according to "Mining of Massive
75 | * Datasets" p88.
76 | * It can be shown that, using MinHash, the probability that the
77 | * signatures of 2 sets with Jaccard similarity s agree in all the
78 | * rows of at least one stage (band), and therefore become a candidate
79 | * pair, is 1−(1−s^R)^b
80 | * where R = signature_size / b (number of rows in a stage/band)
81 | * Thus, the curve that shows the probability that 2 items fall in the
82 | * same bucket for at least one of the stages, as a function of their
83 | * Jaccard index similarity, has a S shape.
84 | * The threshold (the value of similarity at which the probability of
85 | * becoming a candidate is 1/2) is a function of the number of stages
86 | * (s, or bands b in the book) and the signature size:
87 | * threshold ≃ (1/s)^(1/R)
88 | * Hence the signature size can be computed as:
89 | * R = ln(1/s) / ln(threshold)
90 | * signature_size = R * b
91 | */
92 | private int computeSignatureSize(final int s, final int n) {
93 |
94 | int r = (int) Math.ceil(Math.log(1.0 / s) / Math.log(THRESHOLD)) + 1;
95 | return r * s;
96 | }
97 |
98 | /**
99 | * Bin this vector to corresponding buckets.
100 | * @param vector
101 | * @return
102 | */
103 | public final int[] hash(final boolean[] vector) {
104 | return hashSignature(this.mh.signature(vector));
105 | }
106 |
107 | /**
108 | * Get the coefficients used by internal hashing functions.
109 | * @return
110 | */
111 | public final long[][] getCoefficients() {
112 | return mh.getCoefficients();
113 | }
114 | }
115 |
--------------------------------------------------------------------------------
/src/main/java/info/debatty/java/lsh/LSHSuperBit.java:
--------------------------------------------------------------------------------
1 | /*
2 | * The MIT License
3 | *
4 | * Copyright 2015 Thibault Debatty.
5 | *
6 | * Permission is hereby granted, free of charge, to any person obtaining a copy
7 | * of this software and associated documentation files (the "Software"), to deal
8 | * in the Software without restriction, including without limitation the rights
9 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 | * copies of the Software, and to permit persons to whom the Software is
11 | * furnished to do so, subject to the following conditions:
12 | *
13 | * The above copyright notice and this permission notice shall be included in
14 | * all copies or substantial portions of the Software.
15 | *
16 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22 | * THE SOFTWARE.
23 | */
24 |
25 | package info.debatty.java.lsh;
26 |
27 | import java.io.Serializable;
28 |
29 | /**
30 | *
31 | * @author Thibault Debatty
32 | */
33 | public class LSHSuperBit extends LSH implements Serializable {
34 | private SuperBit sb;
35 |
36 | /**
37 | * LSH implementation relying on SuperBit, to bin vectors s times (stages)
38 | * in b buckets (per stage), in a space with n dimensions. Input vectors
39 | * with a high cosine similarity have a high probability of falling in the
40 | * same bucket...
41 | *
42 | * Supported input types:
43 | * - double[]
44 | * - int[]
45 | * - others to come...
46 | *
47 | * @param stages stages
48 | * @param buckets buckets (per stage)
49 | * @param dimensions dimensionality
50 | */
51 | public LSHSuperBit(
52 | final int stages, final int buckets, final int dimensions) {
53 |
54 | super(stages, buckets);
55 |
56 | int code_length = stages * buckets / 2;
57 | int superbit = computeSuperBit(stages, buckets, dimensions);
58 |
59 | this.sb = new SuperBit(dimensions, superbit, code_length / superbit);
60 | }
61 |
62 | /**
63 | * LSH implementation relying on SuperBit, to bin vectors s times (stages)
64 | * in b buckets (per stage), in a space with n dimensions. Input vectors
65 | * with a high cosine similarity have a high probability of falling in the
66 | * same bucket...
67 | *
68 | * Supported input types:
69 | * - double[]
70 | * - int[]
71 | * - others to come...
72 | *
73 | * @param stages stages
74 | * @param buckets buckets (per stage)
75 | * @param dimensions dimensionality
76 | * @param seed random number generator seed. using the same value will
77 | * guarantee identical hashes across object instantiations
78 | *
79 | */
80 | public LSHSuperBit(
81 | final int stages,
82 | final int buckets,
83 | final int dimensions,
84 | final long seed) {
85 |
86 | super(stages, buckets);
87 |
88 | int code_length = stages * buckets / 2;
89 | int superbit = computeSuperBit(stages, buckets, dimensions);
90 |
91 | this.sb = new SuperBit(
92 | dimensions, superbit, code_length / superbit, seed);
93 | }
94 |
95 | /**
96 | * Compute the superbit value.
97 | * @param stages
98 | * @param buckets
99 | * @param dimensions
100 | * @return
101 | */
102 | private int computeSuperBit(
103 | final int stages, final int buckets, final int dimensions) {
104 |
105 | // SuperBit code length
106 | int code_length = stages * buckets / 2;
107 | int superbit; // superbit value
108 | for (superbit = dimensions; superbit >= 1; superbit--) {
109 | if (code_length % superbit == 0) {
110 | break;
111 | }
112 | }
113 |
114 | if (superbit == 0) {
115 | throw new IllegalArgumentException(
116 | "Superbit is 0 with parameters: s=" + stages
117 | + " b=" + buckets + " n=" + dimensions);
118 | }
119 |
120 | return superbit;
121 | }
122 |
123 | /**
124 | * Empty constructor, used only for serialization.
125 | */
126 | public LSHSuperBit() {
127 | }
128 |
129 | /**
130 | * Hash (bin) a vector in s stages into b buckets.
131 | * @param vector
132 | * @return
133 | */
134 | public final int[] hash(final double[] vector) {
135 | return hashSignature(sb.signature(vector));
136 | }
137 |
138 | /**
139 | * Hash (bin) a vector in s stages into b buckets.
140 | * @param vector
141 | * @return
142 | */
143 | public final int[] hash(final int[] vector) {
144 |
145 | double[] d = new double[vector.length];
146 | for (int i = 0; i < vector.length; i++) {
147 | d[i] = (double) vector[i];
148 | }
149 | return hash(d);
150 | }
151 | }
152 |
--------------------------------------------------------------------------------
/src/main/java/info/debatty/java/lsh/MinHash.java:
--------------------------------------------------------------------------------
1 | package info.debatty.java.lsh;
2 |
3 | import java.io.Serializable;
4 | import java.security.InvalidParameterException;
5 | import java.util.ArrayList;
6 | import java.util.Collections;
7 | import java.util.HashSet;
8 | import java.util.List;
9 | import java.util.Random;
10 | import java.util.Set;
11 | import java.util.TreeSet;
12 |
/**
 * MinHash is a hashing scheme that tends to produce similar signatures for sets
 * that have a high Jaccard similarity.
 *
 * The Jaccard similarity between two sets is the relative number of elements
 * these sets have in common: J(A, B) = |A ∩ B| / |A ∪ B|. A MinHash signature
 * is a sequence of numbers produced by multiple hash functions hi. It can be
 * shown that the Jaccard similarity between two sets is also the probability
 * that this hash result is the same for the two sets:
 * J(A, B) = Pr[hi(A) = hi(B)].
 * Therefore, MinHash signatures can be used to estimate Jaccard similarity
 * between two sets. Moreover, it can be shown that the expected estimation
 * error is O(1 / sqrt(n)), where n is the size of the signature (the number of
 * hash functions that are used to produce the signature).
 *
 * @author Thibault Debatty http://www.debatty.info
 */
public class MinHash implements Serializable {

    private static final int LARGE_PRIME = 2147483647; // = 2^31 - 1 !

    /**
     * Compute the jaccard index between two sets.
     *
     * @param s1 first set
     * @param s2 second set
     * @return |s1 ∩ s2| / |s1 ∪ s2|, or 0 when both sets are empty
     */
    public static double jaccardIndex(
            final Set<Integer> s1, final Set<Integer> s2) {

        Set<Integer> intersection = new HashSet<Integer>(s1);
        intersection.retainAll(s2);

        Set<Integer> union = new HashSet<Integer>(s1);
        union.addAll(s2);

        // Two empty sets: avoid 0 / 0.
        if (union.isEmpty()) {
            return 0;
        }

        return (double) intersection.size() / union.size();
    }

    /**
     * Compute the exact jaccard index between two sets, represented as
     * arrays of booleans.
     *
     * @param s1 first set
     * @param s2 second set
     * @return the jaccard index of the two sets
     * @throws InvalidParameterException if the arrays have different lengths
     */
    public static double jaccardIndex(final boolean[] s1, final boolean[] s2) {
        if (s1.length != s2.length) {
            throw new InvalidParameterException("sets must be same size!");
        }
        return jaccardIndex(convert2Set(s1), convert2Set(s2));
    }

    /**
     * Convert a set represented as an array of booleans to a set of integer.
     * For example [true, false, true, true, false] becomes {0, 2, 3}.
     *
     * @param array the set, one boolean per dictionary element
     * @return the indices of the true values
     */
    public static Set<Integer> convert2Set(final boolean[] array) {
        Set<Integer> set = new TreeSet<Integer>();
        for (int i = 0; i < array.length; i++) {
            if (array[i]) {
                set.add(i);
            }
        }
        return set;
    }

    /**
     * Computes the size of the signature required to achieve a given error in
     * similarity estimation. (1 / error^2)
     *
     * @param error the accepted estimation error, in (0 .. 1]
     * @return size of the signature
     * @throws IllegalArgumentException if error is not in (0 .. 1]
     */
    public static int size(final double error) {
        // Written as a negated range test so that NaN is rejected as well.
        // An error of 0 is refused too: 1 / 0 would silently turn into
        // Integer.MAX_VALUE.
        if (!(error > 0 && error <= 1)) {
            throw new IllegalArgumentException("error should be in (0 .. 1]");
        }
        return (int) (1 / (error * error));
    }

    /**
     * Signature size.
     */
    private int n;

    /**
     * Random a and b coefficients for the random hash functions.
     */
    private long[][] hash_coefs;

    /**
     * Dictionary size (is also the size of vectors if the sets are provided
     * as vectors).
     */
    private int dict_size;

    /**
     * Initializes hash functions to compute MinHash signatures for sets built
     * from a dictionary of dict_size elements.
     *
     * @param size the number of hash functions (and the size of resulting
     * signatures)
     * @param dict_size the dictionary size
     */
    public MinHash(final int size, final int dict_size) {
        init(size, dict_size, new Random());
    }

    /**
     * Initializes hash function to compute MinHash signatures for sets built
     * from a dictionary of dict_size elements, with a given similarity
     * estimation error.
     *
     * @param error the accepted estimation error, in (0 .. 1]
     * @param dict_size the dictionary size
     */
    public MinHash(final double error, final int dict_size) {
        init(size(error), dict_size, new Random());
    }

    /**
     * Initializes hash functions to compute MinHash signatures for sets built
     * from a dictionary of dict_size elements.
     *
     * @param size the number of hash functions (and the size of resulting
     * signatures)
     * @param dict_size the dictionary size
     * @param seed random number generator seed. using the same value will
     * guarantee identical hashes across object instantiations
     */
    public MinHash(final int size, final int dict_size, final long seed) {
        init(size, dict_size, new Random(seed));
    }

    /**
     * Initializes hash function to compute MinHash signatures for sets built
     * from a dictionary of dict_size elements, with a given similarity
     * estimation error.
     *
     * @param error the accepted estimation error, in (0 .. 1]
     * @param dict_size the dictionary size
     * @param seed random number generator seed. using the same value will
     * guarantee identical hashes across object instantiations
     */
    public MinHash(final double error, final int dict_size, final long seed) {
        init(size(error), dict_size, new Random(seed));
    }

    /**
     * Computes the signature for this set The input set is represented as an
     * vector of booleans.
     * For example the array [true, false, true, true, false]
     * corresponds to the set {0, 2, 3}
     *
     * @param vector the set, one boolean per dictionary element
     * @return the signature
     * @throws IllegalArgumentException if the vector length differs from the
     * dictionary size
     */
    public final int[] signature(final boolean[] vector) {
        if (vector.length != dict_size) {
            throw new IllegalArgumentException(
                    "Size of array should be dict_size");
        }

        return signature(convert2Set(vector));
    }

    /**
     * Computes the signature for this set. For example set = {0, 2, 3}
     *
     * @param set the elements (dictionary indices) present in the set
     * @return the signature; every entry is Integer.MAX_VALUE for an empty set
     */
    public final int[] signature(final Set<Integer> set) {
        int[] sig = new int[n];

        for (int i = 0; i < n; i++) {
            sig[i] = Integer.MAX_VALUE;
        }

        // Only the rows that are present in the set are visited, instead of
        // every row of the dictionary, to speed up the computation.
        final List<Integer> list = new ArrayList<Integer>(set);
        Collections.sort(list);

        for (final int r : list) {
            // For each hash function i, keep the smaller of the current
            // value of sig[i] and hi(r).
            for (int i = 0; i < n; i++) {
                sig[i] = Math.min(sig[i], h(i, r));
            }
        }

        return sig;
    }

    /**
     * Computes an estimation of Jaccard similarity (the number of elements in
     * common) between two sets, using the MinHash signatures of these two sets.
     *
     * @param sig1 MinHash signature of set1
     * @param sig2 MinHash signature of set2 (produced using the same
     * coefficients)
     * @return the estimated similarity
     * @throws IllegalArgumentException if the signatures have different sizes
     */
    public final double similarity(final int[] sig1, final int[] sig2) {
        if (sig1.length != sig2.length) {
            throw new IllegalArgumentException(
                    "Size of signatures should be the same");
        }

        double sim = 0;
        for (int i = 0; i < sig1.length; i++) {
            if (sig1[i] == sig2[i]) {
                sim += 1;
            }
        }

        return sim / sig1.length;
    }

    /**
     * Computes the expected error of similarity computed using signatures.
     *
     * @return the expected error
     */
    public final double error() {
        return 1.0 / Math.sqrt(n);
    }

    /**
     * Compute hash function coefficients using provided Random.
     *
     * @param size the signature size
     * @param dict_size the dictionary size
     * @param r the random generator the coefficients are drawn from
     * @throws InvalidParameterException if size or dict_size is not positive
     */
    private void init(final int size, final int dict_size, final Random r) {
        if (size <= 0) {
            throw new InvalidParameterException(
                    "Signature size should be positive");
        }

        if (dict_size <= 0) {
            throw new InvalidParameterException(
                    "Dictionary size (or vector size) should be positive");
        }

        // No overflow check is needed for h(i, x): a and b are below 2^31
        // and x is an int, so a * x + b always fits in a long.

        this.dict_size = dict_size;
        this.n = size;

        // h = (a * x) + b
        // a and b are randomly generated in [1, PRIME - 1]
        hash_coefs = new long[n][2];
        for (int i = 0; i < n; i++) {
            hash_coefs[i][0] = r.nextInt(LARGE_PRIME - 1) + 1; // a
            hash_coefs[i][1] = r.nextInt(LARGE_PRIME - 1) + 1; // b
        }
    }

    /**
     * Computes hi(x) as (a_i * x + b_i) % LARGE_PRIME .
     *
     * @param i index of the hash function
     * @param x the value to hash
     * @return the hashed value of x, using ith hash function
     */
    private int h(final int i, final int x) {
        return (int)
                ((hash_coefs[i][0] * (long) x + hash_coefs[i][1])
                % LARGE_PRIME);
    }

    /**
     * Get the coefficients used by hash function hi.
     *
     * @return the internal coefficient array: one {a, b} pair per hash
     * function (not a copy)
     */
    public final long[][] getCoefficients() {
        return hash_coefs;
    }
}
317 |
--------------------------------------------------------------------------------
/src/main/java/info/debatty/java/lsh/SuperBit.java:
--------------------------------------------------------------------------------
1 | /*
2 | * The MIT License
3 | *
4 | * Copyright 2015 Thibault Debatty.
5 | *
6 | * Permission is hereby granted, free of charge, to any person obtaining a copy
7 | * of this software and associated documentation files (the "Software"), to deal
8 | * in the Software without restriction, including without limitation the rights
9 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 | * copies of the Software, and to permit persons to whom the Software is
11 | * furnished to do so, subject to the following conditions:
12 | *
13 | * The above copyright notice and this permission notice shall be included in
14 | * all copies or substantial portions of the Software.
15 | *
16 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22 | * THE SOFTWARE.
23 | */
24 |
25 | package info.debatty.java.lsh;
26 |
27 | import java.io.Serializable;
28 | import java.util.Random;
29 |
/**
 * Implementation of Super-Bit Locality-Sensitive Hashing.
 * Super-Bit is an improvement of Random Projection LSH.
 * It computes an estimation of cosine similarity.
 *
 * Super-Bit Locality-Sensitive Hashing
 * Jianqiu Ji, Jianmin Li, Shuicheng Yan, Bo Zhang, Qi Tian
 * http://papers.nips.cc/paper/4847-super-bit-locality-sensitive-hashing.pdf
 * Advances in Neural Information Processing Systems 25, 2012
 *
 * Supported input types:
 * - double[]
 * - others to come...
 *
 * @author Thibault Debatty
 */
public class SuperBit implements Serializable {

    private double[][] hyperplanes;
    private static final int DEFAULT_CODE_LENGTH = 10000;

    /**
     * Initialize SuperBit algorithm.
     * Super-Bit depth n must be in [1 .. d] and number of Super-Bit l must
     * be >= 1.
     * The resulting code length k = n * l
     * The K vectors are orthogonalized in L batches of N vectors
     *
     * @param d data space dimension
     * @param n Super-Bit depth [1 .. d]
     * @param l number of Super-Bit (>= 1)
     * @throws IllegalArgumentException if a parameter is out of range
     */
    public SuperBit(final int d, final int n, final int l) {
        this(d, n, l, new Random());
    }

    /**
     * Initialize SuperBit algorithm.
     * Super-Bit depth n must be in [1 .. d] and number of Super-Bit l must
     * be >= 1.
     * The resulting code length k = n * l
     * The K vectors are orthogonalized in L batches of N vectors
     *
     * @param d data space dimension
     * @param n Super-Bit depth [1 .. d]
     * @param l number of Super-Bit (>= 1)
     * @param seed to use for the random number generator
     * @throws IllegalArgumentException if a parameter is out of range
     */
    public SuperBit(final int d, final int n, final int l, final long seed) {
        this(d, n, l, new Random(seed));
    }

    /**
     * Draws n * l random unit vectors of dimension d from the provided
     * generator and orthogonalizes them in l batches of n vectors.
     *
     * @param d data space dimension
     * @param n Super-Bit depth [1 .. d]
     * @param l number of Super-Bit (>= 1)
     * @param rand source of the Gaussian samples
     */
    private SuperBit(final int d, final int n, final int l, final Random rand) {
        if (d <= 0) {
            throw new IllegalArgumentException("Dimension d must be >= 1");
        }

        if (n < 1 || n > d) {
            throw new IllegalArgumentException(
                    "Super-Bit depth N must be 1 <= N <= d");
        }

        if (l < 1) {
            throw new IllegalArgumentException(
                    "Number of Super-Bit L must be >= 1");
        }

        // Input: Data space dimension d, Super-Bit depth 1 <= N <= d,
        // number of Super-Bit L >= 1,
        // resulting code length K = N * L

        // Generate a random matrix H with each element sampled independently
        // from the normal distribution N(0, 1), with each vector normalized
        // to unit length. Denote H = [v1, v2, ..., vK].
        int code_length = n * l;

        double[][] v = new double[code_length][];
        for (int i = 0; i < code_length; i++) {
            double[] vector = new double[d];
            for (int j = 0; j < d; j++) {
                vector[j] = rand.nextGaussian();
            }

            normalize(vector);
            v[i] = vector;
        }

        // Gram-Schmidt orthogonalization inside each batch:
        // for i = 0 to L - 1 do
        //   for j = 1 to N do
        //     w_{iN+j} = v_{iN+j}
        //     for k = 1 to j - 1 do
        //       w_{iN+j} = w_{iN+j} - w_{iN+k} w^T_{iN+k} v_{iN+j}
        //     end for
        //     w_{iN+j} = w_{iN+j} / | w_{iN+j} |
        //   end for
        // end for
        // Output: H~ = [w1, w2, ..., wK]
        double[][] w = new double[code_length][];
        for (int batch = 0; batch < l; batch++) {
            int offset = batch * n;

            for (int j = 0; j < n; j++) {
                int current = offset + j;
                w[current] = v[current].clone();

                // Remove the component along each vector already
                // orthogonalized in this batch.
                for (int k = 0; k < j; k++) {
                    int previous = offset + k;
                    w[current] = sub(
                            w[current],
                            product(
                                    dotProduct(w[previous], v[current]),
                                    w[previous]));
                }

                normalize(w[current]);
            }
        }

        this.hyperplanes = w;
    }

    /**
     * Initialize SuperBit algorithm with Super-Bit depth d and 10000 / d
     * Super-Bits, which gives a code length of d * (10000 / d).
     * The original author notes a resulting mean error of about 0.01 (not
     * verified here).
     *
     * @param d data space dimension
     * @throws IllegalArgumentException if d is not in [1 .. 10000]
     */
    public SuperBit(final int d) {
        this(d, d, DEFAULT_CODE_LENGTH / d);
    }

    /**
     * Initialize SuperBit algorithm without parameters
     * (used only for serialization).
     */
    public SuperBit() {

    }

    /**
     * Compute the signature of this vector: one bit per hyperplane, true when
     * the dot product of the vector with the hyperplane is >= 0.
     *
     * @param vector the vector to sign
     * @return the signature, one boolean per hyperplane
     */
    public final boolean[] signature(final double[] vector) {
        boolean[] sig = new boolean[this.hyperplanes.length];
        for (int i = 0; i < this.hyperplanes.length; i++) {
            sig[i] = (dotProduct(this.hyperplanes[i], vector) >= 0);
        }
        return sig;
    }

    /**
     * Compute the similarity between two signature, which is also an
     * estimation of the cosine similarity between the two vectors.
     *
     * @param sig1 signature of the first vector
     * @param sig2 signature of the second vector
     * @return estimated cosine similarity
     * @throws IllegalArgumentException if the signatures have different sizes
     */
    public final double similarity(final boolean[] sig1, final boolean[] sig2) {
        // Signatures of different sizes cannot be compared bit by bit.
        if (sig1.length != sig2.length) {
            throw new IllegalArgumentException(
                    "Size of signatures should be the same");
        }

        double agg = 0;
        for (int i = 0; i < sig1.length; i++) {
            if (sig1[i] == sig2[i]) {
                agg++;
            }
        }

        // Fraction of agreeing bits, mapped to an angle in [0 .. PI].
        agg = agg / sig1.length;

        return Math.cos((1 - agg) * Math.PI);
    }

    /**
     * Get the hyperplanes coefficients used to compute signatures.
     *
     * @return the internal hyperplane array (not a copy)
     */
    public final double[][] getHyperplanes() {
        return this.hyperplanes;
    }

    /* ---------------------- STATIC ---------------------- */

    /**
     * Computes the cosine similarity, computed as v1 dot v2 / (|v1| * |v2|).
     * Cosine similarity of two vectors is the cosine of the angle between them.
     * It ranges between -1 and +1
     *
     * @param v1 first vector
     * @param v2 second vector
     * @return the cosine similarity of v1 and v2
     */
    public static double cosineSimilarity(final double[]v1, final double[] v2) {

        return dotProduct(v1, v2) / (norm(v1) * norm(v2));
    }

    /**
     * Multiplies a vector by a scalar.
     *
     * @param x the scalar
     * @param v the vector
     * @return a new array holding x * v
     */
    private static double[] product(final double x, final double[] v) {
        double[] r = new double[v.length];
        for (int i = 0; i < v.length; i++) {
            r[i] = x * v[i];
        }
        return r;
    }

    /**
     * Subtracts two vectors element by element.
     *
     * @param a the vector to subtract from
     * @param b the vector to subtract
     * @return a new array holding a - b
     */
    private static double[] sub(final double[] a, final double[] b) {
        double[] r = new double[a.length];
        for (int i = 0; i < a.length; i++) {
            r[i] = a[i] - b[i];
        }
        return r;
    }

    /**
     * Divides, in place, every element of the vector by its L2 norm.
     *
     * @param vector the vector to normalize
     */
    private static void normalize(final double[] vector) {
        double norm = norm(vector);
        for (int i = 0; i < vector.length; i++) {
            vector[i] = vector[i] / norm;
        }

    }

    /**
     * Returns the norm L2. sqrt(sum_i(v_i^2))
     *
     * @param v the vector
     * @return the L2 norm of v
     */
    private static double norm(final double[] v) {
        double agg = 0;

        for (int i = 0; i < v.length; i++) {
            agg += (v[i] * v[i]);
        }

        return Math.sqrt(agg);
    }

    /**
     * Computes the dot product of two vectors, over the length of v1.
     *
     * @param v1 first vector
     * @param v2 second vector
     * @return sum_i(v1_i * v2_i)
     */
    private static double dotProduct(final double[] v1, final double[] v2) {
        double agg = 0;

        for (int i = 0; i < v1.length; i++) {
            agg += (v1[i] * v2[i]);
        }

        return agg;
    }
}
284 |
--------------------------------------------------------------------------------
/src/main/java/info/debatty/java/lsh/examples/InitialSeed.java:
--------------------------------------------------------------------------------
1 | /*
2 | * The MIT License
3 | *
4 | * Copyright 2016 Thibault Debatty.
5 | *
6 | * Permission is hereby granted, free of charge, to any person obtaining a copy
7 | * of this software and associated documentation files (the "Software"), to deal
8 | * in the Software without restriction, including without limitation the rights
9 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 | * copies of the Software, and to permit persons to whom the Software is
11 | * furnished to do so, subject to the following conditions:
12 | *
13 | * The above copyright notice and this permission notice shall be included in
14 | * all copies or substantial portions of the Software.
15 | *
16 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22 | * THE SOFTWARE.
23 | */
24 |
25 | package info.debatty.java.lsh.examples;
26 |
27 | import info.debatty.java.lsh.MinHash;
28 | import java.util.Random;
29 |
30 | /**
31 | *
32 | * @author Thibault Debatty
33 | */
34 | public class InitialSeed {
35 |
36 | /**
37 | * @param args the command line arguments
38 | */
39 | public static void main(String[] args) {
40 |
41 | // Initialize two minhash objects, with the same seed
42 | int signature_size = 20;
43 | int dictionary_size = 100;
44 | long initial_seed = 123456;
45 |
46 | MinHash mh = new MinHash(signature_size, dictionary_size, initial_seed);
47 | MinHash mh2 = new MinHash(signature_size, dictionary_size, initial_seed);
48 |
49 | // Create a single vector of size dictionary_size
50 | Random r = new Random();
51 | boolean[] vector = new boolean[dictionary_size];
52 | for (int i = 0; i < dictionary_size; i++) {
53 | vector[i] = r.nextBoolean();
54 | }
55 |
56 | // The two minhash objects will produce the same signature
57 | println(mh.signature(vector));
58 | println(mh2.signature(vector));
59 | }
60 |
61 | static void println(final int[] array) {
62 | System.out.print("[");
63 | for (int v : array) {
64 | System.out.print("" + v + " ");
65 | }
66 | System.out.println("]");
67 | }
68 | }
69 |
--------------------------------------------------------------------------------
/src/main/java/info/debatty/java/lsh/examples/LSHMinHashExample.java:
--------------------------------------------------------------------------------
1 | /*
2 | * The MIT License
3 | *
4 | * Copyright 2015 Thibault Debatty.
5 | *
6 | * Permission is hereby granted, free of charge, to any person obtaining a copy
7 | * of this software and associated documentation files (the "Software"), to deal
8 | * in the Software without restriction, including without limitation the rights
9 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 | * copies of the Software, and to permit persons to whom the Software is
11 | * furnished to do so, subject to the following conditions:
12 | *
13 | * The above copyright notice and this permission notice shall be included in
14 | * all copies or substantial portions of the Software.
15 | *
16 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22 | * THE SOFTWARE.
23 | */
24 |
25 | package info.debatty.java.lsh.examples;
26 |
27 | import info.debatty.java.lsh.LSHMinHash;
28 | import info.debatty.java.lsh.MinHash;
29 | import java.util.Random;
30 |
31 | /**
32 | *
33 | * @author Thibault Debatty
34 | */
35 | public class LSHMinHashExample {
36 |
37 | /**
38 | * @param args the command line arguments
39 | */
40 | public static void main(String[] args) {
41 | // Number of sets
42 | int count = 2000;
43 |
44 | // Size of dictionary
45 | int n = 100;
46 |
47 | // Number of buckets
48 | // Attention: to get relevant results, the number of elements per bucket
49 | // should be at least 100
50 | int buckets = 10;
51 |
52 | // Let's generate some random sets
53 | boolean[][] vectors = new boolean[count][];
54 | Random r = new Random();
55 |
56 | // To get some interesting measures, we first generate a single
57 | // sparse random vector
58 | vectors[0] = new boolean[n];
59 | for (int j = 0; j < n; j++) {
60 | vectors[0][j] = (r.nextInt(10) == 0);
61 | }
62 |
63 | // Then we generate the other vectors, which have a reasonable chance
64 | // to look like the first one...
65 | for (int i = 1; i < count; i++) {
66 | vectors[i] = new boolean[n];
67 |
68 | for (int j = 0; j < n; j++) {
69 | vectors[i][j] = (r.nextDouble() <= 0.7 ? vectors[0][j] : (r.nextInt(10) == 0));
70 | }
71 | }
72 |
73 | // Now we can proceed to LSH binning
74 | // We will test multiple stages
75 | for (int stages = 1; stages <= 10; stages++) {
76 |
77 | // Compute the LSH hash of each vector
78 | LSHMinHash lsh = new LSHMinHash(stages, buckets, n);
79 | int[][] hashes = new int[count][];
80 | for (int i = 0; i < count; i++) {
81 | boolean[] vector = vectors[i];
82 | hashes[i] = lsh.hash(vector);
83 | }
84 |
85 | // We now have the LSH hash for each input set
86 | // Let's have a look at how similar sets (according to Jaccard
87 | // index) were binned...
88 | int[][] results = new int[11][2];
89 | for (int i = 0; i < vectors.length; i++) {
90 | boolean[] vector1 = vectors[i];
91 | int[] hash1 = hashes[i];
92 |
93 | for (int j = 0; j < i; j++) {
94 | boolean[] vector2 = vectors[j];
95 | int[] hash2 = hashes[j];
96 |
97 | // We compute the similarity between each pair of sets
98 | double similarity = MinHash.jaccardIndex(vector1, vector2);
99 |
100 | // We count the number of pairs with similarity 0.1, 0.2,
101 | // 0.3, etc.
102 | results[(int) (10 * similarity)][0]++;
103 |
104 | // Do they fall in the same bucket for one of the stages?
105 | for (int stage = 0; stage < stages; stage++) {
106 | if (hash1[stage] == hash2[stage]) {
107 | results[(int) (10 * similarity)][1]++;
108 | break;
109 | }
110 | }
111 | }
112 | }
113 |
114 | // Now we can display (and plot in Gnuplot) the result:
115 | // For pairs that have a similarity x, the probability of falling
116 | // in the same bucket for at least one of the stages is y
117 | for (int i = 0; i < results.length; i++) {
118 | double similarity = (double) i / 10;
119 |
120 | double probability = 0;
121 | if (results[i][0] != 0) {
122 | probability = (double) results[i][1] / results[i][0];
123 | }
124 | System.out.println("" + similarity + "\t" + probability + "\t" + stages);
125 | }
126 |
127 | // Separate the series for Gnuplot...
128 | System.out.print("\n");
129 | }
130 | }
131 | }
132 |
--------------------------------------------------------------------------------
/src/main/java/info/debatty/java/lsh/examples/LSHSuperBitExample.java:
--------------------------------------------------------------------------------
1 | /*
2 | * The MIT License
3 | *
4 | * Copyright 2015 Thibault Debatty.
5 | *
6 | * Permission is hereby granted, free of charge, to any person obtaining a copy
7 | * of this software and associated documentation files (the "Software"), to deal
8 | * in the Software without restriction, including without limitation the rights
9 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 | * copies of the Software, and to permit persons to whom the Software is
11 | * furnished to do so, subject to the following conditions:
12 | *
13 | * The above copyright notice and this permission notice shall be included in
14 | * all copies or substantial portions of the Software.
15 | *
16 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22 | * THE SOFTWARE.
23 | */
24 |
25 | package info.debatty.java.lsh.examples;
26 |
27 | import info.debatty.java.lsh.LSHSuperBit;
28 | import java.util.Random;
29 | import java.util.logging.Level;
30 | import java.util.logging.Logger;
31 |
32 | /**
33 |  * Hashes random Gaussian vectors with LSHSuperBit and prints each vector
34 |  * followed by the bucket it gets at the first stage.
35 |  *
36 |  * @author Thibault Debatty
37 |  */
38 | public class LSHSuperBitExample {
39 |
40 |     /**
41 |      * @param args the command line arguments
42 |      */
43 |     public static void main(String[] args) {
44 |
45 |         final int vectorCount = 100;
46 |
47 |         // Vectors live in R^dimension
48 |         final int dimension = 3;
49 |
50 |         final int stageCount = 2;
51 |         final int bucketCount = 4;
52 |
53 |         // Draw every coordinate with Random.nextGaussian()
54 |         Random random = new Random();
55 |         double[][] points = new double[vectorCount][dimension];
56 |         for (double[] point : points) {
57 |             for (int k = 0; k < dimension; k++) {
58 |                 point[k] = random.nextGaussian();
59 |             }
60 |         }
61 |
62 |         try {
63 |             LSHSuperBit lsh = new LSHSuperBit(stageCount, bucketCount, dimension);
64 |
65 |             // One output line per vector: its coordinates, then the bucket
66 |             // assigned by the first stage
67 |             for (double[] point : points) {
68 |                 int[] hash = lsh.hash(point);
69 |                 for (int k = 0; k < point.length; k++) {
70 |                     System.out.printf("%6.2f\t", point[k]);
71 |                 }
72 |                 System.out.print(hash[0]);
73 |                 System.out.print("\n");
74 |             }
75 |         } catch (Exception ex) {
76 |             // Any failure is only logged; the example then ends
77 |             Logger.getLogger(LSHSuperBitExample.class.getName()).log(Level.SEVERE, null, ex);
78 |         }
79 |     }
80 | }
81 |
--------------------------------------------------------------------------------
/src/main/java/info/debatty/java/lsh/examples/MinHashExample.java:
--------------------------------------------------------------------------------
1 | /*
2 | * The MIT License
3 | *
4 | * Copyright 2015 Thibault Debatty.
5 | *
6 | * Permission is hereby granted, free of charge, to any person obtaining a copy
7 | * of this software and associated documentation files (the "Software"), to deal
8 | * in the Software without restriction, including without limitation the rights
9 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 | * copies of the Software, and to permit persons to whom the Software is
11 | * furnished to do so, subject to the following conditions:
12 | *
13 | * The above copyright notice and this permission notice shall be included in
14 | * all copies or substantial portions of the Software.
15 | *
16 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22 | * THE SOFTWARE.
23 | */
24 |
25 | package info.debatty.java.lsh.examples;
26 |
27 | import info.debatty.java.lsh.MinHash;
28 | import java.util.TreeSet;
29 |
30 | /**
31 | *
32 | * @author Thibault Debatty
33 | */
34 | public class MinHashExample {
35 |
36 | public static void main(String[] args) {
37 | // Initialize the hash function for an similarity error of 0.1
38 | // For sets built from a dictionary of 5 items
39 | MinHash minhash = new MinHash(0.1, 5);
40 |
41 | // Sets can be defined as an vector of booleans:
42 | // [1 0 0 1 0]
43 | boolean[] vector1 = {true, false, false, true, false};
44 | int[] sig1 = minhash.signature(vector1);
45 |
46 | // Or as a set of integers:
47 | // set2 = [1 0 1 1 0]
48 | TreeSet set2 = new TreeSet();
49 | set2.add(0);
50 | set2.add(2);
51 | set2.add(3);
52 | int[] sig2 = minhash.signature(set2);
53 |
54 | System.out.println("Signature similarity: " + minhash.similarity(sig1, sig2));
55 | System.out.println("Real similarity (Jaccard index)" +
56 | MinHash.jaccardIndex(MinHash.convert2Set(vector1), set2));
57 | }
58 | }
59 |
--------------------------------------------------------------------------------
/src/main/java/info/debatty/java/lsh/examples/SerializeExample.java:
--------------------------------------------------------------------------------
1 | /*
2 | * The MIT License
3 | *
4 | * Copyright 2015 Thibault Debatty.
5 | *
6 | * Permission is hereby granted, free of charge, to any person obtaining a copy
7 | * of this software and associated documentation files (the "Software"), to deal
8 | * in the Software without restriction, including without limitation the rights
9 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 | * copies of the Software, and to permit persons to whom the Software is
11 | * furnished to do so, subject to the following conditions:
12 | *
13 | * The above copyright notice and this permission notice shall be included in
14 | * all copies or substantial portions of the Software.
15 | *
16 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22 | * THE SOFTWARE.
23 | */
24 | package info.debatty.java.lsh.examples;
25 |
26 | import info.debatty.java.lsh.LSHMinHash;
27 | import java.io.File;
28 | import java.io.FileInputStream;
29 | import java.io.FileOutputStream;
30 | import java.io.IOException;
31 | import java.io.ObjectInputStream;
32 | import java.io.ObjectOutputStream;
33 | import java.nio.file.Files;
34 | import java.util.Random;
35 |
36 | /**
37 | *
38 | * @author Thibault Debatty
39 | */
40 | public class SerializeExample {
41 |
42 | /**
43 | * @param args the command line arguments
44 | * @throws java.io.IOException
45 | * @throws java.lang.ClassNotFoundException
46 | */
47 | public static void main(String[] args)
48 | throws IOException, ClassNotFoundException {
49 |
50 | // Create a single random boolean vector
51 | int n = 100;
52 | double sparsity = 0.75;
53 | boolean[] vector = new boolean[n];
54 | Random rand = new Random();
55 | for (int j = 0; j < n; j++) {
56 | vector[j] = rand.nextDouble() > sparsity;
57 | }
58 |
59 | // Create and configure LSH
60 | int stages = 2;
61 | int buckets = 10;
62 | LSHMinHash lsh = new LSHMinHash(stages, buckets, n);
63 | println(lsh.hash(vector));
64 |
65 | // Create another LSH object
66 | // as the parameters of the hashing function are randomly initialized
67 | // these two LSH objects will produce different hashes for the same
68 | // input vector!
69 | LSHMinHash other_lsh = new LSHMinHash(stages, buckets, n);
70 | println(other_lsh.hash(vector));
71 |
72 | // Moreover, signatures produced by different LSH objects cannot
73 | // be used to compute estimated similarity!
74 | // The solution is to serialize and save the object, so it can be
75 | // reused later...
76 | File tempfile = Files.createTempFile("lshobject", ".ser").toFile();
77 | FileOutputStream fout = new FileOutputStream(tempfile);
78 | ObjectOutputStream oos = new ObjectOutputStream(fout);
79 | oos.writeObject(lsh);
80 | oos.close();
81 | System.out.println(
82 | "LSH object serialized to " + tempfile.getAbsolutePath());
83 |
84 | FileInputStream fin = new FileInputStream(tempfile);
85 | ObjectInputStream ois = new ObjectInputStream(fin);
86 | LSHMinHash saved_lsh = (LSHMinHash) ois.readObject();
87 | println(saved_lsh.hash(vector));
88 | }
89 |
90 | static void println(int[] array) {
91 | System.out.print("[");
92 | for (int v : array) {
93 | System.out.print("" + v + " ");
94 | }
95 | System.out.println("]");
96 | }
97 | }
98 |
--------------------------------------------------------------------------------
/src/main/java/info/debatty/java/lsh/examples/SimpleLSHMinHashExample.java:
--------------------------------------------------------------------------------
1 | /*
2 | * The MIT License
3 | *
4 | * Copyright 2015 tibo.
5 | *
6 | * Permission is hereby granted, free of charge, to any person obtaining a copy
7 | * of this software and associated documentation files (the "Software"), to deal
8 | * in the Software without restriction, including without limitation the rights
9 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 | * copies of the Software, and to permit persons to whom the Software is
11 | * furnished to do so, subject to the following conditions:
12 | *
13 | * The above copyright notice and this permission notice shall be included in
14 | * all copies or substantial portions of the Software.
15 | *
16 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22 | * THE SOFTWARE.
23 | */
24 |
25 | package info.debatty.java.lsh.examples;
26 |
27 | import info.debatty.java.lsh.LSHMinHash;
28 | import java.util.Random;
29 |
30 | /**
31 | *
32 | * @author tibo
33 | */
34 | public class SimpleLSHMinHashExample {
35 |
36 | /**
37 | * @param args the command line arguments
38 | */
39 | public static void main(String[] args) {
40 | // proportion of 0's in the vectors
41 | // if the vectors are dense (lots of 1's), the average jaccard similarity
42 | // will be very high (especially for large vectors), and LSH
43 | // won't be able to distinguish them
44 | // as a result, all vectors will be binned in the same bucket...
45 | double sparsity = 0.75;
46 |
47 | // Number of sets
48 | int count = 10000;
49 |
50 | // Size of vectors
51 | int n = 100;
52 |
53 | // LSH parameters
54 | // the number of stages is also sometimes called thge number of bands
55 | int stages = 2;
56 |
57 | // Attention: to get relevant results, the number of elements per bucket
58 | // should be at least 100
59 | int buckets = 10;
60 |
61 | // Let's generate some random sets
62 | boolean[][] vectors = new boolean[count][n];
63 | Random rand = new Random();
64 |
65 | for (int i = 0; i < count; i++) {
66 | for (int j = 0; j < n; j++) {
67 | vectors[i][j] = rand.nextDouble() > sparsity;
68 | }
69 | }
70 |
71 | // Create and configure LSH algorithm
72 | LSHMinHash lsh = new LSHMinHash(stages, buckets, n);
73 |
74 | int[][] counts = new int[stages][buckets];
75 |
76 | // Perform hashing
77 | for (boolean[] vector : vectors) {
78 | int[] hash = lsh.hash(vector);
79 |
80 | for (int i = 0; i < hash.length; i++) {
81 | counts[i][hash[i]]++;
82 | }
83 |
84 | print(vector);
85 | System.out.print(" : ");
86 | print(hash);
87 | System.out.print("\n");
88 | }
89 |
90 | System.out.println("Number of elements per bucket at each stage:");
91 | for (int i = 0; i < stages; i++) {
92 | print(counts[i]);
93 | System.out.print("\n");
94 | }
95 | }
96 |
97 | static void print(int[] array) {
98 | System.out.print("[");
99 | for (int v : array) {
100 | System.out.print("" + v + " ");
101 | }
102 | System.out.print("]");
103 | }
104 |
105 | static void print(boolean[] array) {
106 | System.out.print("[");
107 | for (boolean v : array) {
108 | System.out.print(v ? "1" : "0");
109 | }
110 | System.out.print("]");
111 | }
112 | }
113 |
--------------------------------------------------------------------------------
/src/main/java/info/debatty/java/lsh/examples/SuperBitExample.java:
--------------------------------------------------------------------------------
1 | /*
2 | * The MIT License
3 | *
4 | * Copyright 2015 tibo.
5 | *
6 | * Permission is hereby granted, free of charge, to any person obtaining a copy
7 | * of this software and associated documentation files (the "Software"), to deal
8 | * in the Software without restriction, including without limitation the rights
9 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 | * copies of the Software, and to permit persons to whom the Software is
11 | * furnished to do so, subject to the following conditions:
12 | *
13 | * The above copyright notice and this permission notice shall be included in
14 | * all copies or substantial portions of the Software.
15 | *
16 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22 | * THE SOFTWARE.
23 | */
24 |
25 | package info.debatty.java.lsh.examples;
26 |
27 | import info.debatty.java.lsh.SuperBit;
28 | import java.util.Random;
29 |
30 | /**
31 | * @author Thibault Debatty
32 | */
33 | public class SuperBitExample {
34 |
35 | /**
36 | * @param args the command line arguments
37 | */
38 | public static void main(String[] args) {
39 |
40 | int n = 100;
41 |
42 | SuperBit sb = new SuperBit(n);
43 |
44 | Random rand = new Random();
45 | double[] v1 = new double[n];
46 | double[] v2 = new double[n];
47 | for (int i = 0; i < n; i++) {
48 | v1[i] = rand.nextInt();
49 | v2[i] = rand.nextInt();
50 | }
51 |
52 | boolean[] sig1 = sb.signature(v1);
53 | boolean[] sig2 = sb.signature(v2);
54 |
55 | System.out.println("Signature (estimated) similarity: " +
56 | sb.similarity(sig1, sig2));
57 | System.out.println("Real (cosine) similarity: " +
58 | SuperBit.cosineSimilarity(v1, v2));
59 |
60 | }
61 |
62 | }
63 |
--------------------------------------------------------------------------------
/src/test/java/info/debatty/java/lsh/LSHMinHashTest.java:
--------------------------------------------------------------------------------
1 | /*
2 | * The MIT License
3 | *
4 | * Copyright 2016 Thibault Debatty.
5 | *
6 | * Permission is hereby granted, free of charge, to any person obtaining a copy
7 | * of this software and associated documentation files (the "Software"), to deal
8 | * in the Software without restriction, including without limitation the rights
9 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 | * copies of the Software, and to permit persons to whom the Software is
11 | * furnished to do so, subject to the following conditions:
12 | *
13 | * The above copyright notice and this permission notice shall be included in
14 | * all copies or substantial portions of the Software.
15 | *
16 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22 | * THE SOFTWARE.
23 | */
24 | package info.debatty.java.lsh;
25 |
26 | import java.util.Random;
27 | import org.junit.Test;
28 |
29 | /**
30 |  * Unit test of the LSHMinHash class.
31 |  *
32 |  * @author Thibault Debatty
33 |  */
34 | public class LSHMinHashTest {
35 |
36 |     /**
37 |      * Test of hash method, of class LSHMinHash: hashes random sparse
38 |      * vectors and uses every returned value as an index into a
39 |      * stages x buckets table.
40 |      */
41 |     @Test
42 |     public void testHash() {
43 |         System.out.println("hash");
44 |
45 |         // Proportion of 0's in the vectors.
46 |         // With dense vectors (lots of 1's) the average Jaccard similarity
47 |         // is very high (especially for large vectors) and LSH cannot tell
48 |         // them apart: all vectors would be binned in the same bucket...
49 |         final double zeroRatio = 0.75;
50 |
51 |         // Number and size of vectors
52 |         final int vectorCount = 1000;
53 |         final int vectorSize = 10000;
54 |
55 |         final int stageCount = 2;
56 |         final int bucketCount = 10;
57 |
58 |         // Let's generate some random sets
59 |         Random random = new Random();
60 |         boolean[][] sets = new boolean[vectorCount][vectorSize];
61 |         for (boolean[] set : sets) {
62 |             for (int pos = 0; pos < vectorSize; pos++) {
63 |                 set[pos] = random.nextDouble() > zeroRatio;
64 |             }
65 |         }
66 |
67 |         LSHMinHash lsh = new LSHMinHash(stageCount, bucketCount, vectorSize);
68 |         int[][] occupancy = new int[stageCount][bucketCount];
69 |
70 |         // Perform hashing
71 |         for (boolean[] set : sets) {
72 |             int stage = 0;
73 |             for (int bucket : lsh.hash(set)) {
74 |                 // this will raise an ArrayIndexOutOfBoundsException
75 |                 // if the bin values are negative or too large
76 |                 occupancy[stage][bucket]++;
77 |                 stage++;
78 |             }
79 |         }
80 |     }
81 | }
82 |
--------------------------------------------------------------------------------
/src/test/java/info/debatty/java/lsh/MinHashTest.java:
--------------------------------------------------------------------------------
1 | package info.debatty.java.lsh;
2 |
3 | import static org.junit.Assert.assertArrayEquals;
4 |
5 | import java.util.HashSet;
6 | import java.util.Random;
7 | import java.util.Set;
8 |
9 | import org.junit.Test;
10 |
11 | /**
12 |  * Unit test of the MinHash class.
13 |  *
14 |  * @author Thibault Debatty
15 |  */
16 | public class MinHashTest {
17 |
18 |     /**
19 |      * Test with initial seed: two MinHash objects built with the same
20 |      * parameters and the same seed must compute the same signature for
21 |      * the same set.
22 |      */
23 |     @Test
24 |     public void testSeed() {
25 |         MinHash mh = new MinHash(100, 100, 123456);
26 |         MinHash mh2 = new MinHash(100, 100, 123456);
27 |
28 |         Random r = new Random();
29 |
30 |         // Typed set: a raw Set would accept any object
31 |         Set<Integer> ints = new HashSet<Integer>();
32 |         for (int i = 0; i < 50; i++) {
33 |             ints.add(r.nextInt());
34 |         }
35 |
36 |         assertArrayEquals(mh.signature(ints), mh2.signature(ints));
37 |     }
38 | }
39 |
--------------------------------------------------------------------------------
/src/test/java/info/debatty/java/lsh/SuperBitTest.java:
--------------------------------------------------------------------------------
1 | package info.debatty.java.lsh;
2 |
3 | import static org.junit.Assert.assertEquals;
4 |
5 | import java.util.Random;
6 |
7 | import org.junit.Test;
8 |
9 | /**
10 |  * Unit test of the SuperBit class.
11 |  *
12 |  * @author Thibault Debatty
13 |  */
14 | public class SuperBitTest {
15 |
16 |     /**
17 |      * Test with initial seed: two SuperBit objects built with the same
18 |      * parameters and the same seed must compute the same signature for
19 |      * the same vector.
20 |      */
21 |     @Test
22 |     public final void testSeed() {
23 |         final int dimension = 50;
24 |         SuperBit first = new SuperBit(dimension, 25, 100, 123456);
25 |         SuperBit second = new SuperBit(dimension, 25, 100, 123456);
26 |
27 |         Random random = new Random();
28 |
29 |         double[] input = new double[dimension];
30 |         for (int k = 0; k < input.length; k++) {
31 |             input[k] = random.nextDouble();
32 |         }
33 |
34 |         boolean[] expected = first.signature(input);
35 |         boolean[] actual = second.signature(input);
36 |
37 |         // Compare bit by bit, so that a failure reports the faulty index
38 |         int index = 0;
39 |         for (boolean bit : expected) {
40 |             assertEquals(
41 |                     "Signatures are different at index " + index, bit, actual[index]);
42 |             index++;
43 |         }
44 |     }
45 | }
46 |
--------------------------------------------------------------------------------