├── .gitignore ├── .travis.yml ├── LICENSE.md ├── README.md ├── checkstyle.xml ├── lsh-minhash.png ├── lsh-superbit.png ├── pom.xml └── src ├── main └── java │ └── info │ └── debatty │ └── java │ └── lsh │ ├── LSH.java │ ├── LSHMinHash.java │ ├── LSHSuperBit.java │ ├── MinHash.java │ ├── SuperBit.java │ └── examples │ ├── InitialSeed.java │ ├── LSHMinHashExample.java │ ├── LSHSuperBitExample.java │ ├── MinHashExample.java │ ├── SerializeExample.java │ ├── SimpleLSHMinHashExample.java │ └── SuperBitExample.java └── test └── java └── info └── debatty └── java └── lsh ├── LSHMinHashTest.java ├── MinHashTest.java └── SuperBitTest.java /.gitignore: -------------------------------------------------------------------------------- 1 | *.class 2 | 3 | # Mobile Tools for Java (J2ME) 4 | .mtj.tmp/ 5 | 6 | # Package Files # 7 | *.jar 8 | *.war 9 | *.ear 10 | 11 | # virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml 12 | hs_err_pid* 13 | /nbproject/private/ 14 | /dist/ 15 | /build/ 16 | /target/ -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: java 2 | install: mvn install -DskipTests=true -Dmaven.javadoc.skip=true -Dgpg.skip=true 3 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | # License 2 | 3 | This project is licensed under the terms of the **MIT license**. 4 | 5 | https://opensource.org/licenses/MIT 6 | 7 | > Copyright 2015 Thibault Debatty. 
8 | > 9 | > Permission is hereby granted, free of charge, to any person obtaining 10 | > a copy of this software and associated documentation files (the 11 | > "Software"), to deal in the Software without restriction, including 12 | > without limitation the rights to use, copy, modify, merge, publish, 13 | > distribute, sublicense, and/or sell copies of the Software, and to 14 | > permit persons to whom the Software is furnished to do so, subject to 15 | > the following conditions: 16 | > 17 | > The above copyright notice and this permission notice shall be 18 | > included in all copies or substantial portions of the Software. 19 | > 20 | > THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 21 | > EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 22 | > MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 23 | > NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 24 | > LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 25 | > OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 26 | > WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # java-LSH 2 | [![Maven Central](https://maven-badges.herokuapp.com/maven-central/info.debatty/java-lsh/badge.svg)](https://maven-badges.herokuapp.com/maven-central/info.debatty/java-lsh) [![Build Status](https://travis-ci.org/tdebatty/java-LSH.svg?branch=master)](https://travis-ci.org/tdebatty/java-LSH) [![Javadocs](http://www.javadoc.io/badge/info.debatty/java-lsh.svg)](http://www.javadoc.io/doc/info.debatty/java-lsh) 3 | 4 | 5 | A Java implementation of Locality Sensitive Hashing (LSH). 
6 | 7 | * [Download](#download) 8 | * [MinHash](#minhash) 9 | * [Super-Bit](#super-bit) 10 | * [Comparable signatures](#comparable-signatures) 11 | * [Initial seed](#initial-seed) 12 | * [Serialization](#serialization) 13 | 14 | 15 | Locality Sensitive Hashing (LSH) is a family of hashing methods that tend to produce the same hash (or signature) for similar items. There exist different LSH functions, each corresponding to a similarity metric. For example, the MinHash algorithm is designed for Jaccard similarity (the relative number of elements that two sets have in common). For cosine similarity, the traditional LSH algorithm used is Random Projection, but others exist, like Super-Bit, that deliver better results. 16 | 17 | LSH functions have two main use cases: 18 | * Compute the signature of large input vectors. These signatures can be used to quickly estimate the similarity between vectors. 19 | * With a given number of buckets, bin similar vectors together. 20 | 21 | This library implements Locality Sensitive Hashing (LSH), as described in Leskovec, Rajaraman & Ullman (2014), "Mining of Massive Datasets", Cambridge University Press. 22 | 23 | Currently implemented: 24 | * MinHash algorithm for Jaccard index; 25 | * Super-Bit algorithm for cosine similarity. 26 | 27 | The coefficients of the hashing functions are randomly chosen when the LSH object is instantiated. You can thus only compare signatures or bucket binning generated by the same LSH object. To reuse your LSH object between executions, you have to serialize it and save it to a file (see below the [example of LSH object serialization](https://github.com/tdebatty/java-LSH#serialization)). 28 | 29 | ## Download 30 | 31 | Using maven: 32 | ``` 33 | <dependency> 34 | <groupId>info.debatty</groupId> 35 | <artifactId>java-lsh</artifactId> 36 | <version>RELEASE</version> 37 | </dependency> 38 | ``` 39 | 40 | Or see the [releases](https://github.com/tdebatty/java-LSH/releases) page. 
41 | 42 | ## MinHash 43 | 44 | MinHash is a hashing scheme that tends to produce similar signatures for sets that have a high Jaccard similarity. 45 | 46 | The Jaccard similarity between two sets is the relative number of elements these sets have in common: J(A, B) = |A ∩ B| / |A ∪ B|. A MinHash signature is a sequence of numbers produced by multiple hash functions h_i. It can be shown that the Jaccard similarity between two sets is also the probability that this hash result is the same for the two sets: J(A, B) = Pr[h_i(A) = h_i(B)]. Therefore, MinHash signatures can be used to estimate Jaccard similarity between two sets. Moreover, it can be shown that the expected estimation error is O(1 / sqrt(n)), where n is the size of the signature (the number of hash functions that are used to produce the signature). 47 | 48 | ### Binning 49 | 50 | ```java 51 | import info.debatty.java.lsh.LSHMinHash; 52 | import java.util.Random; 53 | 54 | public class SimpleLSHMinHashExample { 55 | 56 | public static void main(String[] args) { 57 | // proportion of 0's in the vectors 58 | // if the vectors are dense (lots of 1's), the average jaccard similarity 59 | // will be very high (especially for large vectors), and LSH 60 | // won't be able to distinguish them 61 | // as a result, all vectors will be binned in the same bucket... 
62 | double sparsity = 0.75; 63 | 64 | // Number of sets 65 | int count = 10000; 66 | 67 | // Size of vectors 68 | int n = 100; 69 | 70 | // LSH parameters 71 | // the number of stages is also sometimes called thge number of bands 72 | int stages = 2; 73 | 74 | // Attention: to get relevant results, the number of elements per bucket 75 | // should be at least 100 76 | int buckets = 10; 77 | 78 | // Let's generate some random sets 79 | boolean[][] vectors = new boolean[count][n]; 80 | Random rand = new Random(); 81 | 82 | for (int i = 0; i < count; i++) { 83 | for (int j = 0; j < n; j++) { 84 | vectors[i][j] = rand.nextDouble() > sparsity; 85 | } 86 | } 87 | 88 | // Create and configure LSH algorithm 89 | LSHMinHash lsh = new LSHMinHash(stages, buckets, n); 90 | 91 | int[][] counts = new int[stages][buckets]; 92 | 93 | // Perform hashing 94 | for (boolean[] vector : vectors) { 95 | int[] hash = lsh.hash(vector); 96 | 97 | for (int i = 0; i < hash.length; i++) { 98 | counts[i][hash[i]]++; 99 | } 100 | 101 | print(vector); 102 | System.out.print(" : "); 103 | print(hash); 104 | System.out.print("\n"); 105 | } 106 | 107 | System.out.println("Number of elements per bucket at each stage:"); 108 | for (int i = 0; i < stages; i++) { 109 | print(counts[i]); 110 | System.out.print("\n"); 111 | } 112 | } 113 | 114 | static void print(int[] array) { 115 | System.out.print("["); 116 | for (int v : array) { 117 | System.out.print("" + v + " "); 118 | } 119 | System.out.print("]"); 120 | } 121 | 122 | static void print(boolean[] array) { 123 | System.out.print("["); 124 | for (boolean v : array) { 125 | System.out.print(v ? "1" : "0"); 126 | } 127 | System.out.print("]"); 128 | } 129 | } 130 | ``` 131 | 132 | Pay attention, LSH using MinHash is very sensitive to the average Jaccard similarity in your dataset! If most vectors in your dataset have a Jaccard similarity above or below 0.5, they might all fall in the same bucket. 
This is illustrated by example below: 133 | 134 | ```java 135 | import info.debatty.java.lsh.LSHMinHash; 136 | import info.debatty.java.lsh.MinHash; 137 | import java.util.Random; 138 | 139 | public class LSHMinHashExample { 140 | 141 | public static void main(String[] args) { 142 | // Number of sets 143 | int count = 2000; 144 | 145 | // Size of dictionary 146 | int n = 100; 147 | 148 | // Number of buckets 149 | // Attention: to get relevant results, the number of elements per bucket 150 | // should be at least 100 151 | int buckets = 10; 152 | 153 | // Let's generate some random sets 154 | boolean[][] vectors = new boolean[count][]; 155 | Random r = new Random(); 156 | 157 | // To get some interesting measures, we first generate a single 158 | // sparse random vector 159 | vectors[0] = new boolean[n]; 160 | for (int j = 0; j < n; j++) { 161 | vectors[0][j] = (r.nextInt(10) == 0); 162 | } 163 | 164 | // Then we generate the other vectors, which have a reasonable chance 165 | // to look like the first one... 166 | for (int i = 1; i < count; i++) { 167 | vectors[i] = new boolean[n]; 168 | 169 | for (int j = 0; j < n; j++) { 170 | vectors[i][j] = (r.nextDouble() <= 0.7 ? vectors[0][j] : (r.nextInt(10) == 0)); 171 | } 172 | } 173 | 174 | // Now we can proceed to LSH binning 175 | // We will test multiple stages 176 | for (int stages = 1; stages <= 10; stages++) { 177 | 178 | // Compute the LSH hash of each vector 179 | LSHMinHash lsh = new LSHMinHash(stages, buckets, n); 180 | int[][] hashes = new int[count][]; 181 | for (int i = 0; i < count; i++) { 182 | boolean[] vector = vectors[i]; 183 | hashes[i] = lsh.hash(vector); 184 | } 185 | 186 | // We now have the LSH hash for each input set 187 | // Let's have a look at how similar sets (according to Jaccard 188 | // index) were binned... 
189 | int[][] results = new int[11][2]; 190 | for (int i = 0; i < vectors.length; i++) { 191 | boolean[] vector1 = vectors[i]; 192 | int[] hash1 = hashes[i]; 193 | 194 | for (int j = 0; j < i; j++) { 195 | boolean[] vector2 = vectors[j]; 196 | int[] hash2 = hashes[j]; 197 | 198 | // We compute the similarity between each pair of sets 199 | double similarity = MinHash.jaccardIndex(vector1, vector2); 200 | 201 | // We count the number of pairs with similarity 0.1, 0.2, 202 | // 0.3, etc. 203 | results[(int) (10 * similarity)][0]++; 204 | 205 | // Do they fall in the same bucket for one of the stages? 206 | for (int stage = 0; stage < stages; stage++) { 207 | if (hash1[stage] == hash2[stage]) { 208 | results[(int) (10 * similarity)][1]++; 209 | break; 210 | } 211 | } 212 | } 213 | } 214 | 215 | // Now we can display (and plot in Gnuplot) the result: 216 | // For pairs that have a similarity x, the probability of falling 217 | // in the same bucket for at least one of the stages is y 218 | for (int i = 0; i < results.length; i++) { 219 | double similarity = (double) i / 10; 220 | 221 | double probability = 0; 222 | if (results[i][0] != 0) { 223 | probability = (double) results[i][1] / results[i][0]; 224 | } 225 | System.out.println("" + similarity + "\t" + probability + "\t" + stages); 226 | } 227 | 228 | // Separate the series for Gnuplot... 229 | System.out.print("\n"); 230 | } 231 | } 232 | } 233 | ``` 234 | 235 | This example will run LSH binning for different number of stages. At each step, for each value of Jaccard similarity between pairs of sets (in the range [0, 0.1, 0.2, ... 1.0]), the program computes the probability that these two pairs fall in the same bucket for at least one stage. 
The results can be plotted with Gnuplot for example: 236 | 237 | ![alt tag](https://raw.githubusercontent.com/tdebatty/java-LSH/master/lsh-minhash.png) 238 | 239 | On this figure, the x-axis is the Jaccard similarity between sets, the y-axis is the probability that these pairs fall in the same bucket for at least one stage. The different series represent different values for the number of stages (from 1 to 10). 240 | 241 | We can clearly recognize the typical S curve of MinHash, with the threshold (the point where the curve is the steepest) located around x = 0.5. 242 | 243 | This curve is very important! It shows that if all your sets are similar (similarity above 0.6), all sets will most probably fall in a single bucket. And all other buckets will thus most probably be empty. This can happen for example if your dataset is skewed and presents some sort of principal direction. 244 | 245 | Conversely, if your sets are all different from each other (similarity below 0.2), the curve is nearly flat. This means that pairs of sets have the same probability of falling in the same bucket, independently of their similarity. The items are then randomly binned into the buckets. If using B buckets and S stages, computing the probability that two items are binned in the same bucket is similar to the problem of rolling a die with B values S times. The resulting probability is 1 - [(B-1) / B]^S. The computed probability for 10 buckets is presented in the table below, and roughly corresponds to the above graph. 
246 | 247 | 248 | | Stages | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 249 | |--------|--------|--------|--------|--------|--------|--------|--------|--------|--------|----| 250 | | Pr | 0.1 | 0.19 | 0.27 | 0.34 | 0.41 | 0.47 | 0.52 | 0.57 | 0.61 | 0.65 | 251 | 252 | ### Signatures 253 | 254 | If you simply wish to compute MinHash signatures (without performing LSH binning), you can directly use the MinHash class: 255 | 256 | ```java 257 | import info.debatty.java.lsh.MinHash; 258 | import java.util.TreeSet; 259 | 260 | public class MinHashExample { 261 | 262 | public static void main(String[] args) { 263 | // Initialize the hash function for an similarity error of 0.1 264 | // For sets built from a dictionary of 5 items 265 | MinHash minhash = new MinHash(0.1, 5); 266 | 267 | // Sets can be defined as an vector of booleans: 268 | // [1 0 0 1 0] 269 | boolean[] vector1 = {true, false, false, true, false}; 270 | int[] sig1 = minhash.signature(vector1); 271 | 272 | // Or as a set of integers: 273 | // set2 = [1 0 1 1 0] 274 | TreeSet set2 = new TreeSet(); 275 | set2.add(0); 276 | set2.add(2); 277 | set2.add(3); 278 | int[] sig2 = minhash.signature(set2); 279 | 280 | System.out.println("Signature similarity: " + minhash.similarity(sig1, sig2)); 281 | System.out.println("Real similarity (Jaccard index)" + 282 | MinHash.JaccardIndex(MinHash.Convert2Set(vector1), set2)); 283 | } 284 | } 285 | ``` 286 | 287 | Which will produce: 288 | 289 | ``` 290 | Signature similarity: 0.6767676767676768 291 | Real similarity (Jaccard index)0.6666666666666666 292 | ``` 293 | 294 | [Read Javadoc...](http://www.javadoc.io/doc/info.debatty/java-lsh) 295 | 296 | ## Super-Bit 297 | 298 | Super-Bit is an improvement of Random Projection LSH. It computes an estimation of cosine similarity. 
In Super-Bit, the K random vectors are orthogonalized in L batches of N vectors, where 299 | * N is called the Super-Bit depth 300 | * L is called the number of Super-Bits 301 | * K = L * N is the code length (the size of the signature) 302 | 303 | Super-Bit Locality-Sensitive Hashing, Jianqiu Ji, Jianmin Li, Shuicheng Yan, Bo Zhang, Qi Tian 304 | http://papers.nips.cc/paper/4847-super-bit-locality-sensitive-hashing.pdf 305 | Published in Advances in Neural Information Processing Systems 25, 2012 306 | 307 | The cosine similarity between two vectors in R^n is the cosine of the angle between them. It is computed as v1 . v2 / (|v1| * |v2|). 308 | Two vectors with the same orientation have a cosine similarity of 1, two vectors at 90° have a similarity of 0, and two vectors diametrically opposed have a similarity of -1, independent of their magnitude. 309 | 310 | Here is an example of how to quickly bin together vectors that have a high cosine similarity using LSH + Super-Bit: 311 | 312 | ```java 313 | import info.debatty.java.lsh.LSHSuperBit; 314 | import java.util.Random; 315 | 316 | public class LSHSuperBitExample { 317 | 318 | public static void main(String[] args) { 319 | int count = 100; 320 | 321 | // R^n 322 | int n = 3; 323 | 324 | int stages = 2; 325 | int buckets = 4; 326 | 327 | // Produce some vectors in R^n 328 | Random r = new Random(); 329 | double[][] vectors = new double[count][]; 330 | for (int i = 0; i < count; i++) { 331 | vectors[i] = new double[n]; 332 | 333 | for (int j = 0; j < n; j++) { 334 | vectors[i][j] = r.nextGaussian(); 335 | } 336 | } 337 | 338 | LSHSuperBit lsh = new LSHSuperBit(stages, buckets, n); 339 | 340 | // Compute a SuperBit signature, and a LSH hash 341 | for (int i = 0; i < count; i++) { 342 | double[] vector = vectors[i]; 343 | int[] hash = lsh.hash(vector); 344 | for (double v : vector) { 345 | System.out.printf("%6.2f\t", v); 346 | } 347 | System.out.print(hash[0]); 348 | System.out.print("\n"); 349 | } 350 | } 351 | } 352 | 
``` 353 | 354 | This will produce something like, where the last column is the bucket in which this vector was binned (at first stage): 355 | 356 | ``` 357 | -0.48 -0.68 1.87 1 358 | 0.77 0.11 2.20 1 359 | -0.05 0.23 -1.12 2 360 | 1.30 0.02 1.44 3 361 | -0.34 -1.51 0.78 3 362 | 1.64 0.02 0.84 3 363 | -0.74 1.58 -0.79 0 364 | -0.17 -1.27 -1.25 2 365 | ... 366 | ``` 367 | 368 | This can be plotted with Gnuplot for example: 369 | 370 | ![alt tag](https://raw.githubusercontent.com/tdebatty/java-LSH/master/lsh-superbit.png) 371 | 372 | If you only wish to compute super-bit signatures of vectors (without performing LSH binning), you can directly use the SuperBit class: 373 | ```java 374 | import info.debatty.lsh.SuperBit; 375 | 376 | public class MyApp { 377 | 378 | public static void main(String[] args) { 379 | 380 | int n = 10; 381 | 382 | // Initialize Super-Bit 383 | SuperBit sb = new SuperBit(n); 384 | 385 | Random rand = new Random(); 386 | double[] v1 = new double[n]; 387 | double[] v2 = new double[n]; 388 | for (int i = 0; i < n; i++) { 389 | v1[i] = rand.nextInt(); 390 | v2[i] = rand.nextInt(); 391 | } 392 | 393 | boolean[] sig1 = sb.signature(v1); 394 | boolean[] sig2 = sb.signature(v2); 395 | 396 | System.out.println("Signature (estimated) similarity: " + sb.similarity(sig1, sig2)); 397 | System.out.println("Real (cosine) similarity: " + cosineSimilarity(v1, v2)); 398 | } 399 | ``` 400 | 401 | [Read Javadoc...](http://www.javadoc.io/doc/info.debatty/java-lsh) 402 | 403 | ## Comparable signatures 404 | 405 | 406 | As the parameters of the hashing function are randomly initialized when the LSH object is instantiated: 407 | * two LSH objects will produce different hashes and signatures for the same input vector; 408 | * two executions of your program will produce different hashes and signatures for the same input vector; 409 | * the signatures produced by two different LSH objects can not be used to estimate the similarity between vectors. 
410 | 411 | There are two possibilities to produce comparable signatures: provide an initial seed or serialize your hash object. 412 | 413 | ### Initial seed 414 | 415 | ```java 416 | import info.debatty.java.lsh.MinHash; 417 | import java.util.Random; 418 | 419 | public class InitialSeed { 420 | 421 | public static void main(String[] args) { 422 | 423 | // Initialize two minhash objects, with the same seed 424 | int signature_size = 20; 425 | int dictionary_size = 100; 426 | long initial_seed = 123456; 427 | 428 | MinHash mh = new MinHash(signature_size, dictionary_size, initial_seed); 429 | MinHash mh2 = new MinHash(signature_size, dictionary_size, initial_seed); 430 | 431 | // Create a single vector of size dictionary_size 432 | Random r = new Random(); 433 | boolean[] vector = new boolean[dictionary_size]; 434 | for (int i = 0; i < dictionary_size; i++) { 435 | vector[i] = r.nextBoolean(); 436 | } 437 | 438 | // The two minhash objects will produce the same signature 439 | println(mh.signature(vector)); 440 | println(mh2.signature(vector)); 441 | } 442 | 443 | static void println(final int[] array) { 444 | System.out.print("["); 445 | for (int v : array) { 446 | System.out.print("" + v + " "); 447 | } 448 | System.out.println("]"); 449 | } 450 | } 451 | ``` 452 | 453 | Will output: 454 | 455 | ``` 456 | [0 0 1 1 3 3 0 1 0 2 0 0 9 1 0 0 0 1 7 0 ] 457 | [0 0 1 1 3 3 0 1 0 2 0 0 9 1 0 0 0 1 7 0 ] 458 | ``` 459 | 460 | ### Serialization 461 | 462 | ```java 463 | import info.debatty.java.lsh.LSHMinHash; 464 | import java.io.File; 465 | import java.io.FileInputStream; 466 | import java.io.FileOutputStream; 467 | import java.io.IOException; 468 | import java.io.ObjectInputStream; 469 | import java.io.ObjectOutputStream; 470 | import java.util.Random; 471 | 472 | public class SerializeExample { 473 | 474 | public static void main(String[] args) 475 | throws IOException, ClassNotFoundException { 476 | 477 | // Create a single random boolean vector 478 | int n = 100; 479 
| double sparsity = 0.75; 480 | boolean[] vector = new boolean[n]; 481 | Random rand = new Random(); 482 | for (int j = 0; j < n; j++) { 483 | vector[j] = rand.nextDouble() > sparsity; 484 | } 485 | 486 | // Create and configure LSH 487 | int stages = 2; 488 | int buckets = 10; 489 | LSHMinHash lsh = new LSHMinHash(stages, buckets, n); 490 | println(lsh.hash(vector)); 491 | 492 | // Create another LSH object 493 | // as the parameters of the hashing function are randomly initialized 494 | // these two LSH objects will produce different hashes for the same 495 | // input vector! 496 | LSHMinHash other_lsh = new LSHMinHash(stages, buckets, n); 497 | println(other_lsh.hash(vector)); 498 | 499 | // Moreover, signatures produced by different LSH objects cannot 500 | // be used to compute estimated similarity! 501 | // The solution is to serialize and save the object, so it can be 502 | // reused later... 503 | File tempfile = File.createTempFile("lshobject", ".ser"); 504 | FileOutputStream fout = new FileOutputStream(tempfile); 505 | ObjectOutputStream oos = new ObjectOutputStream(fout); 506 | oos.writeObject(lsh); 507 | oos.close(); 508 | System.out.println( 509 | "LSH object serialized to " + tempfile.getAbsolutePath()); 510 | 511 | FileInputStream fin = new FileInputStream(tempfile); 512 | ObjectInputStream ois = new ObjectInputStream(fin); 513 | LSHMinHash saved_lsh = (LSHMinHash) ois.readObject(); 514 | println(saved_lsh.hash(vector)); 515 | } 516 | 517 | static void println(int[] array) { 518 | System.out.print("["); 519 | for (int v : array) { 520 | System.out.print("" + v + " "); 521 | } 522 | System.out.println("]"); 523 | } 524 | } 525 | ``` 526 | 527 | Will produce something like: 528 | ``` 529 | [5 5 ] 530 | [3 1 ] 531 | LSH object serialized to /tmp/lshobject5903174677942358274.ser 532 | [5 5 ] 533 | ``` 534 | 535 | [Check the examples](https://github.com/tdebatty/java-LSH/tree/master/src/main/java/info/debatty/java/lsh/examples) or [read 
Javadoc](http://www.javadoc.io/doc/info.debatty/java-lsh) 536 | -------------------------------------------------------------------------------- /checkstyle.xml: -------------------------------------------------------------------------------- 1 | 2 | 5 | 6 | 35 | 36 | 37 | 44 | 45 | 46 | 47 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | 142 | 143 | 144 | 145 | 146 | 147 | 148 | 149 | 150 | 151 | 152 | 153 | 154 | 155 | 156 | 157 | 158 | 159 | 160 | 161 | 162 | 163 | 164 | 165 | 166 | 167 | 168 | 169 | 170 | 171 | 172 | 173 | 174 | 175 | 176 | 177 | 178 | 179 | 180 | 181 | 182 | 183 | 184 | 185 | 186 | 187 | 188 | 189 | 190 | 191 | 192 | 193 | 194 | 195 | 196 | 197 | 198 | 199 | 200 | 201 | 202 | 203 | 204 | 205 | 206 | 207 | 208 | 209 | 210 | 211 | 212 | 213 | 214 | 215 | 216 | 217 | 218 | 219 | 220 | 221 | 222 | 223 | 224 | 225 | 226 | 227 | 228 | 229 | -------------------------------------------------------------------------------- /lsh-minhash.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tdebatty/java-LSH/423fc19894f464968a0968bda7869d8532ffd3c0/lsh-minhash.png -------------------------------------------------------------------------------- /lsh-superbit.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tdebatty/java-LSH/423fc19894f464968a0968bda7869d8532ffd3c0/lsh-superbit.png -------------------------------------------------------------------------------- /pom.xml: 
-------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 4.0.0 6 | info.debatty 7 | java-lsh 8 | 0.13-SNAPSHOT 9 | jar 10 | 11 | ${project.artifactId} 12 | https://github.com/tdebatty/java-LSH 13 | A Java implementation of Locality Sensitive Hashing (LSH) 14 | 15 | 16 | UTF-8 17 | 18 | 19 | 20 | 21 | MIT License 22 | http://www.opensource.org/licenses/mit-license.php 23 | 24 | 25 | 26 | 27 | 28 | Thibault Debatty 29 | thibault@debatty.info 30 | debatty.info 31 | http://debatty.info 32 | 33 | 34 | 35 | 36 | scm:git:git@github.com:tdebatty/java-LSH.git 37 | scm:git:git@github.com:tdebatty/java-LSH.git 38 | git@github.com:tdebatty/java-LSH.git 39 | HEAD 40 | 41 | 42 | 43 | 44 | ossrh 45 | https://oss.sonatype.org/content/repositories/snapshots 46 | 47 | 48 | ossrh 49 | https://oss.sonatype.org/service/local/staging/deploy/maven2/ 50 | 51 | 52 | 53 | 54 | 55 | 56 | org.sonatype.plugins 57 | nexus-staging-maven-plugin 58 | 1.6.3 59 | true 60 | 61 | ossrh 62 | https://oss.sonatype.org/ 63 | true 64 | 65 | 66 | 67 | 68 | org.apache.maven.plugins 69 | maven-source-plugin 70 | 2.2.1 71 | 72 | 73 | attach-sources 74 | 75 | jar-no-fork 76 | 77 | 78 | 79 | 80 | 81 | 82 | org.apache.maven.plugins 83 | maven-javadoc-plugin 84 | 2.9.1 85 | 86 | 87 | attach-javadocs 88 | 89 | jar 90 | 91 | 92 | 93 | 94 | 95 | 96 | org.apache.maven.plugins 97 | maven-gpg-plugin 98 | 1.5 99 | 100 | 101 | sign-artifacts 102 | verify 103 | 104 | sign 105 | 106 | 107 | 108 | 109 | 110 | org.apache.maven.plugins 111 | maven-compiler-plugin 112 | 2.3.2 113 | 114 | 1.5 115 | 1.5 116 | 117 | 118 | 119 | 120 | 121 | org.apache.maven.plugins 122 | maven-release-plugin 123 | 2.5.1 124 | 125 | v@{project.version} 126 | 127 | 128 | 129 | 130 | 131 | org.apache.maven.plugins 132 | maven-checkstyle-plugin 133 | 2.17 134 | 135 | checkstyle.xml 136 | **\/examples\/*.java 137 | false 138 | 139 | 140 | 141 | 142 | test 143 | test 144 | 145 | true 146 | true 147 | 148 | 149 
/**
 * Implementation of Locality Sensitive Hashing (LSH) principle, as described in
 * Leskovec, Rajaraman &amp; Ullman (2014), "Mining of Massive Datasets",
 * Cambridge University Press.
 *
 * <p>A signature is cut into {@code stages} consecutive bands; each band is
 * hashed independently to one of {@code buckets} buckets.
 *
 * @author Thibault Debatty http://www.debatty.info
 */
public abstract class LSH implements java.io.Serializable {

    protected static final long LARGE_PRIME = 433494437;
    private static final int DEFAULT_STAGES = 3;
    private static final int DEFAULT_BUCKETS = 10;

    // Not final: the no-argument constructor leaves the defaults in place.
    private int stages = DEFAULT_STAGES;
    private int buckets = DEFAULT_BUCKETS;

    /**
     * Instantiates a LSH instance with s stages (or bands) and b buckets (per
     * stage).
     *
     * @param stages number of stages (bands), must be strictly positive
     * @param buckets number of buckets per stage, must be strictly positive
     * @throws IllegalArgumentException if stages or buckets is not strictly
     * positive (such values would otherwise only fail later, while hashing)
     */
    public LSH(final int stages, final int buckets) {
        if (stages <= 0) {
            throw new IllegalArgumentException(
                    "stages must be > 0, got " + stages);
        }
        if (buckets <= 0) {
            throw new IllegalArgumentException(
                    "buckets must be > 0, got " + buckets);
        }
        this.stages = stages;
        this.buckets = buckets;
    }

    /**
     * Instantiate an empty LSH instance (useful only for serialization).
     * The default number of stages and buckets is used.
     */
    public LSH() {

    }

    /**
     * Hash a signature.
     * The signature is divided in s stages (or bands). Each stage is hashed to
     * one of the b buckets. Elements left over when the signature length is
     * not a multiple of s are folded into the last stage. If the signature is
     * shorter than s, each element gets its own stage and the remaining
     * stages hash to bucket 0.
     *
     * @param signature the signature to hash
     * @return An vector of s integers (between 0 and b-1)
     */
    public final int[] hashSignature(final int[] signature) {

        // One accumulator per stage, already reduced modulo buckets
        int[] hash = new int[stages];

        int rows = rowsPerStage(signature.length);

        for (int i = 0; i < signature.length; i++) {
            int stage = Math.min(i / rows, stages - 1);

            // Widen to long before multiplying to avoid int overflow
            long mixed = (hash[stage] + (long) signature[i] * LARGE_PRIME)
                    % buckets;

            // Java's % keeps the sign of the dividend: a negative signature
            // value would otherwise produce a negative bucket index
            if (mixed < 0) {
                mixed += buckets;
            }
            hash[stage] = (int) mixed;
        }

        return hash;
    }

    /**
     * Hash a signature.
     * The signature is divided in s stages (or bands). Each stage is hashed to
     * one of the b buckets. Elements left over when the signature length is
     * not a multiple of s are folded into the last stage. If the signature is
     * shorter than s, each element gets its own stage and the remaining
     * stages hash to bucket 0.
     *
     * @param signature the signature to hash
     * @return An vector of s integers (between 0 and b-1)
     */
    public final int[] hashSignature(final boolean[] signature) {

        // One accumulator per stage (Java zero-initializes the array)
        long[] acc = new long[stages];

        int rows = rowsPerStage(signature.length);

        for (int i = 0; i < signature.length; i++) {
            // Only set bits contribute, weighted by their (1-based) position
            long v = 0;
            if (signature[i]) {
                v = (i + 1) * LARGE_PRIME;
            }

            int stage = Math.min(i / rows, stages - 1);
            acc[stage] = (acc[stage] + v) % Integer.MAX_VALUE;
        }

        int[] r = new int[stages];
        for (int i = 0; i < stages; i++) {
            r[i] = (int) (acc[i] % buckets);
        }

        return r;
    }

    /**
     * Number of signature elements assigned to each stage. Never returns 0,
     * so that a signature shorter than the number of stages does not cause a
     * division by zero when elements are mapped to their stage.
     */
    private int rowsPerStage(final int signatureLength) {
        return Math.max(1, signatureLength / stages);
    }
}
5 | * 6 | * Permission is hereby granted, free of charge, to any person obtaining a copy 7 | * of this software and associated documentation files (the "Software"), to deal 8 | * in the Software without restriction, including without limitation the rights 9 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | * copies of the Software, and to permit persons to whom the Software is 11 | * furnished to do so, subject to the following conditions: 12 | * 13 | * The above copyright notice and this permission notice shall be included in 14 | * all copies or substantial portions of the Software. 15 | * 16 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 22 | * THE SOFTWARE. 23 | */ 24 | 25 | package info.debatty.java.lsh; 26 | 27 | /** 28 | * 29 | * @author Thibault Debatty 30 | */ 31 | public class LSHMinHash extends LSH { 32 | private final MinHash mh; 33 | private static final double THRESHOLD = 0.5; 34 | 35 | /** 36 | * Instantiates a LSH instance that internally uses MinHash, 37 | * with s stages (or bands) and b buckets (per stage), for sets out of a 38 | * dictionary of n elements. 39 | * 40 | * Attention: the number of buckets should be chosen such that we have at 41 | * least 100 items per bucket. 
42 | * 43 | * @param s stages 44 | * @param b buckets (per stage) 45 | * @param n dictionary size 46 | */ 47 | public LSHMinHash(final int s, final int b, final int n) { 48 | super(s, b); 49 | int signature_size = computeSignatureSize(s, n); 50 | this.mh = new MinHash(signature_size, n); 51 | } 52 | 53 | /** 54 | * Instantiates a LSH instance that internally uses MinHash, 55 | * with s stages (or bands) and b buckets (per stage), for sets out of a 56 | * dictionary of n elements. 57 | * 58 | * Attention: the number of buckets should be chosen such that we have at 59 | * least 100 items per bucket. 60 | * 61 | * @param s stages 62 | * @param b buckets (per stage) 63 | * @param n dictionary size 64 | * @param seed random number generator seed. using the same value will 65 | * guarantee identical hashes across object instantiations 66 | */ 67 | public LSHMinHash(final int s, final int b, final int n, final long seed) { 68 | super(s, b); 69 | int signature_size = computeSignatureSize(s, n); 70 | this.mh = new MinHash(signature_size, n, seed); 71 | } 72 | 73 | /** 74 | * Compute the size of the signature according to "Mining of Massive 75 | * Datasets" p88. 76 | * It can be shown that, using MinHash, the probability that the 77 | * signatures of 2 sets with Jaccard similarity s agree in all the 78 | * rows of at least one stage (band), and therefore become a candidate 79 | * pair, is 1−(1−s^R)^b 80 | * where R = signature_size / b (number of rows in a stage/band) 81 | * Thus, the curve that shows the probability that 2 items fall in the 82 | * same bucket for at least one of the stages, as a function of their 83 | * Jaccard index similarity, has a S shape. 
84 | * The threshold (the value of similarity at which the probability of 85 | * becoming a candidate is 1/2) is a function of the number of stages 86 | * (s, or bands b in the book) and the signature size: 87 | * threshold ≃ (1/s)^(1/R) 88 | * Hence the signature size can be computed as: 89 | * R = ln(1/s) / ln(threshold) 90 | * signature_size = R * b 91 | */ 92 | private int computeSignatureSize(final int s, final int n) { 93 | 94 | int r = (int) Math.ceil(Math.log(1.0 / s) / Math.log(THRESHOLD)) + 1; 95 | return r * s; 96 | } 97 | 98 | /** 99 | * Bin this vector to corresponding buckets. 100 | * @param vector 101 | * @return 102 | */ 103 | public final int[] hash(final boolean[] vector) { 104 | return hashSignature(this.mh.signature(vector)); 105 | } 106 | 107 | /** 108 | * Get the coefficients used by internal hashing functions. 109 | * @return 110 | */ 111 | public final long[][] getCoefficients() { 112 | return mh.getCoefficients(); 113 | } 114 | } 115 | -------------------------------------------------------------------------------- /src/main/java/info/debatty/java/lsh/LSHSuperBit.java: -------------------------------------------------------------------------------- 1 | /* 2 | * The MIT License 3 | * 4 | * Copyright 2015 Thibault Debatty. 5 | * 6 | * Permission is hereby granted, free of charge, to any person obtaining a copy 7 | * of this software and associated documentation files (the "Software"), to deal 8 | * in the Software without restriction, including without limitation the rights 9 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | * copies of the Software, and to permit persons to whom the Software is 11 | * furnished to do so, subject to the following conditions: 12 | * 13 | * The above copyright notice and this permission notice shall be included in 14 | * all copies or substantial portions of the Software. 
15 | * 16 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 22 | * THE SOFTWARE. 23 | */ 24 | 25 | package info.debatty.java.lsh; 26 | 27 | import java.io.Serializable; 28 | 29 | /** 30 | * 31 | * @author Thibault Debatty 32 | */ 33 | public class LSHSuperBit extends LSH implements Serializable { 34 | private SuperBit sb; 35 | 36 | /** 37 | * LSH implementation relying on SuperBit, to bin vectors s times (stages) 38 | * in b buckets (per stage), in a space with n dimensions. Input vectors 39 | * with a high cosine similarity have a high probability of falling in the 40 | * same bucket... 41 | * 42 | * Supported input types: 43 | * - double[] 44 | * - int[] 45 | * - others to come... 46 | * 47 | * @param stages stages 48 | * @param buckets buckets (per stage) 49 | * @param dimensions dimensionality 50 | */ 51 | public LSHSuperBit( 52 | final int stages, final int buckets, final int dimensions) { 53 | 54 | super(stages, buckets); 55 | 56 | int code_length = stages * buckets / 2; 57 | int superbit = computeSuperBit(stages, buckets, dimensions); 58 | 59 | this.sb = new SuperBit(dimensions, superbit, code_length / superbit); 60 | } 61 | 62 | /** 63 | * LSH implementation relying on SuperBit, to bin vectors s times (stages) 64 | * in b buckets (per stage), in a space with n dimensions. Input vectors 65 | * with a high cosine similarity have a high probability of falling in the 66 | * same bucket... 67 | * 68 | * Supported input types: 69 | * - double[] 70 | * - int[] 71 | * - others to come... 
72 | * 73 | * @param stages stages 74 | * @param buckets buckets (per stage) 75 | * @param dimensions dimensionality 76 | * @param seed random number generator seed. using the same value will 77 | * guarantee identical hashes across object instantiations 78 | * 79 | */ 80 | public LSHSuperBit( 81 | final int stages, 82 | final int buckets, 83 | final int dimensions, 84 | final long seed) { 85 | 86 | super(stages, buckets); 87 | 88 | int code_length = stages * buckets / 2; 89 | int superbit = computeSuperBit(stages, buckets, dimensions); 90 | 91 | this.sb = new SuperBit( 92 | dimensions, superbit, code_length / superbit, seed); 93 | } 94 | 95 | /** 96 | * Compute the superbit value. 97 | * @param stages 98 | * @param buckets 99 | * @param dimensions 100 | * @return 101 | */ 102 | private int computeSuperBit( 103 | final int stages, final int buckets, final int dimensions) { 104 | 105 | // SuperBit code length 106 | int code_length = stages * buckets / 2; 107 | int superbit; // superbit value 108 | for (superbit = dimensions; superbit >= 1; superbit--) { 109 | if (code_length % superbit == 0) { 110 | break; 111 | } 112 | } 113 | 114 | if (superbit == 0) { 115 | throw new IllegalArgumentException( 116 | "Superbit is 0 with parameters: s=" + stages 117 | + " b=" + buckets + " n=" + dimensions); 118 | } 119 | 120 | return superbit; 121 | } 122 | 123 | /** 124 | * Empty constructor, used only for serialization. 125 | */ 126 | public LSHSuperBit() { 127 | } 128 | 129 | /** 130 | * Hash (bin) a vector in s stages into b buckets. 131 | * @param vector 132 | * @return 133 | */ 134 | public final int[] hash(final double[] vector) { 135 | return hashSignature(sb.signature(vector)); 136 | } 137 | 138 | /** 139 | * Hash (bin) a vector in s stages into b buckets. 
140 | * @param vector 141 | * @return 142 | */ 143 | public final int[] hash(final int[] vector) { 144 | 145 | double[] d = new double[vector.length]; 146 | for (int i = 0; i < vector.length; i++) { 147 | d[i] = (double) vector[i]; 148 | } 149 | return hash(d); 150 | } 151 | } 152 | -------------------------------------------------------------------------------- /src/main/java/info/debatty/java/lsh/MinHash.java: -------------------------------------------------------------------------------- 1 | package info.debatty.java.lsh; 2 | 3 | import java.io.Serializable; 4 | import java.security.InvalidParameterException; 5 | import java.util.ArrayList; 6 | import java.util.Collections; 7 | import java.util.HashSet; 8 | import java.util.List; 9 | import java.util.Random; 10 | import java.util.Set; 11 | import java.util.TreeSet; 12 | 13 | /** 14 | * MinHash is a hashing scheme that tents to produce similar signatures for sets 15 | * that have a high Jaccard similarity. 16 | * 17 | * The Jaccard similarity between two sets is the relative number of elements 18 | * these sets have in common: J(A, B) = |A ∩ B| / |A ∪ B| A MinHash signature is 19 | * a sequence of numbers produced by multiple hash functions hi. It can be shown 20 | * that the Jaccard similarity between two sets is also the probability that 21 | * this hash result is the same for the two sets: J(A, B) = Pr[hi(A) = hi(B)]. 22 | * Therefore, MinHash signatures can be used to estimate Jaccard similarity 23 | * between two sets. Moreover, it can be shown that the expected estimation 24 | * error is O(1 / sqrt(n)), where n is the size of the signature (the number of 25 | * hash functions that are used to produce the signature). 26 | * 27 | * @author Thibault Debatty http://www.debatty.info 28 | */ 29 | public class MinHash implements Serializable { 30 | 31 | private static final int LARGE_PRIME = 2147483647; // = 2^31 - 1 ! 32 | 33 | /** 34 | * Compute the jaccard index between two sets. 
35 | * @param s1 36 | * @param s2 37 | * @return 38 | */ 39 | public static double jaccardIndex( 40 | final Set s1, final Set s2) { 41 | 42 | Set intersection = new HashSet(s1); 43 | intersection.retainAll(s2); 44 | 45 | Set union = new HashSet(s1); 46 | union.addAll(s2); 47 | 48 | if (union.isEmpty()) { 49 | return 0; 50 | } 51 | 52 | return (double) intersection.size() / union.size(); 53 | } 54 | 55 | /** 56 | * Compute the exact jaccard index between two sets, represented as 57 | * arrays of booleans. 58 | * @param s1 59 | * @param s2 60 | * @return 61 | */ 62 | public static double jaccardIndex(final boolean[] s1, final boolean[] s2) { 63 | if (s1.length != s2.length) { 64 | throw new InvalidParameterException("sets must be same size!"); 65 | } 66 | return jaccardIndex(convert2Set(s1), convert2Set(s2)); 67 | } 68 | 69 | /** 70 | * Convert a set represented as an array of booleans to a set of integer. 71 | * 72 | * @param array 73 | * @return 74 | */ 75 | public static Set convert2Set(final boolean[] array) { 76 | Set set = new TreeSet(); 77 | for (int i = 0; i < array.length; i++) { 78 | if (array[i]) { 79 | set.add(i); 80 | } 81 | } 82 | return set; 83 | } 84 | 85 | /** 86 | * Computes the size of the signature required to achieve a given error in 87 | * similarity estimation. (1 / error^2) 88 | * 89 | * @param error 90 | * @return size of the signature 91 | */ 92 | public static int size(final double error) { 93 | if (error < 0 && error > 1) { 94 | throw new IllegalArgumentException("error should be in [0 .. 1]"); 95 | } 96 | return (int) (1 / (error * error)); 97 | } 98 | 99 | /** 100 | * Signature size. 101 | */ 102 | private int n; 103 | 104 | /** 105 | * Random a and b coefficients for the random hash functions. 106 | */ 107 | private long[][] hash_coefs; 108 | 109 | /** 110 | * Dictionary size (is also the size of vectors if the sets are provided 111 | * as vectors). 
112 | */ 113 | private int dict_size; 114 | 115 | /** 116 | * Initializes hash functions to compute MinHash signatures for sets built 117 | * from a dictionary of dict_size elements. 118 | * 119 | * @param size the number of hash functions (and the size of resulting 120 | * signatures) 121 | * @param dict_size 122 | */ 123 | public MinHash(final int size, final int dict_size) { 124 | init(size, dict_size, new Random()); 125 | } 126 | 127 | /** 128 | * Initializes hash function to compute MinHash signatures for sets built 129 | * from a dictionary of dict_size elements, with a given similarity 130 | * estimation error. 131 | * 132 | * @param error 133 | * @param dict_size 134 | */ 135 | public MinHash(final double error, final int dict_size) { 136 | init(size(error), dict_size, new Random()); 137 | } 138 | 139 | /** 140 | * Initializes hash functions to compute MinHash signatures for sets built 141 | * from a dictionary of dict_size elements. 142 | * 143 | * @param size the number of hash functions (and the size of resulting 144 | * signatures) 145 | * @param dict_size 146 | * @param seed random number generator seed. using the same value will 147 | * guarantee identical hashes across object instantiations 148 | */ 149 | public MinHash(final int size, final int dict_size, final long seed) { 150 | init(size, dict_size, new Random(seed)); 151 | } 152 | 153 | /** 154 | * Initializes hash function to compute MinHash signatures for sets built 155 | * from a dictionary of dict_size elements, with a given similarity 156 | * estimation error. 157 | * 158 | * @param error 159 | * @param dict_size 160 | * @param seed random number generator seed. 
using the same value will 161 | * guarantee identical hashes across object instantiations 162 | */ 163 | public MinHash(final double error, final int dict_size, final long seed) { 164 | init(size(error), dict_size, new Random(seed)); 165 | } 166 | 167 | /** 168 | * Computes the signature for this set The input set is represented as an 169 | * vector of booleans. 170 | * For example the array [true, false, true, true, false] 171 | * corresponds to the set {0, 2, 3} 172 | * 173 | * @param vector 174 | * @return the signature 175 | */ 176 | public final int[] signature(final boolean[] vector) { 177 | if (vector.length != dict_size) { 178 | throw new IllegalArgumentException( 179 | "Size of array should be dict_size"); 180 | } 181 | 182 | return signature(convert2Set(vector)); 183 | } 184 | 185 | /** 186 | * Computes the signature for this set. For example set = {0, 2, 3} 187 | * 188 | * @param set 189 | * @return the signature 190 | */ 191 | public final int[] signature(final Set set) { 192 | int[] sig = new int[n]; 193 | 194 | for (int i = 0; i < n; i++) { 195 | sig[i] = Integer.MAX_VALUE; 196 | } 197 | 198 | // For each row r: 199 | //for (int r = 0; r < dict_size; r++) { 200 | // if set has 0 in row r, do nothing 201 | // if (!set.contains(r)) { 202 | // continue; 203 | // } 204 | // Loop over true values, instead of loop over all values of dictionary 205 | // to speedup computation 206 | final List list = new ArrayList(set); 207 | Collections.sort(list); 208 | 209 | for (final int r : list) { 210 | 211 | // However, if c has 1 in row r, then for each i = 1, 2, . . . 
,n 212 | // set SIG(i, c) to the smaller of the current value of 213 | // SIG(i, c) and hi(r) 214 | for (int i = 0; i < n; i++) { 215 | sig[i] = Math.min( 216 | sig[i], 217 | h(i, r)); 218 | } 219 | } 220 | 221 | return sig; 222 | } 223 | 224 | /** 225 | * Computes an estimation of Jaccard similarity (the number of elements in 226 | * common) between two sets, using the MinHash signatures of these two sets. 227 | * 228 | * @param sig1 MinHash signature of set1 229 | * @param sig2 MinHash signature of set2 (produced using the same 230 | * coefficients) 231 | * @return the estimated similarity 232 | */ 233 | public final double similarity(final int[] sig1, final int[] sig2) { 234 | if (sig1.length != sig2.length) { 235 | throw new IllegalArgumentException( 236 | "Size of signatures should be the same"); 237 | } 238 | 239 | double sim = 0; 240 | for (int i = 0; i < sig1.length; i++) { 241 | if (sig1[i] == sig2[i]) { 242 | sim += 1; 243 | } 244 | } 245 | 246 | return sim / sig1.length; 247 | } 248 | 249 | /** 250 | * Computes the expected error of similarity computed using signatures. 251 | * 252 | * @return the expected error 253 | */ 254 | public final double error() { 255 | return 1.0 / Math.sqrt(n); 256 | } 257 | 258 | /** 259 | * Compute hash function coefficients using provided Random. 
260 | * @param size 261 | * @param dict_size 262 | * @param r 263 | */ 264 | private void init(final int size, final int dict_size, final Random r) { 265 | if (size <= 0) { 266 | throw new InvalidParameterException( 267 | "Signature size should be positive"); 268 | } 269 | 270 | if (dict_size <= 0) { 271 | throw new InvalidParameterException( 272 | "Dictionary size (or vector size) should be positive"); 273 | } 274 | 275 | // In function h(i, x) the largest value could be 276 | // dict_size * dict_size + dict_size 277 | // throw an error if dict_size * dict_size + dict_size > Long.MAX_VALUE 278 | if (dict_size > (Long.MAX_VALUE - dict_size) / dict_size) { 279 | throw new InvalidParameterException( 280 | "Dictionary size (or vector size) is too big and will " 281 | + "cause a multiplication overflow"); 282 | } 283 | 284 | this.dict_size = dict_size; 285 | this.n = size; 286 | 287 | // h = (a * x) + b 288 | // a and b should be randomly generated in [1,PRIME-1] 289 | hash_coefs = new long[n][2]; 290 | for (int i = 0; i < n; i++) { 291 | hash_coefs[i][0] = r.nextInt(LARGE_PRIME - 1) + 1; // a 292 | hash_coefs[i][1] = r.nextInt(LARGE_PRIME - 1) + 1; // b 293 | } 294 | } 295 | 296 | /** 297 | * Computes hi(x) as (a_i * x + b_i) % LARGE_PRIME . 298 | * 299 | * @param i 300 | * @param x 301 | * @return the hashed value of x, using ith hash function 302 | */ 303 | private int h(final int i, final int x) { 304 | return (int) 305 | ((hash_coefs[i][0] * (long) x + hash_coefs[i][1]) 306 | % LARGE_PRIME); 307 | } 308 | 309 | /** 310 | * Get the coefficients used by hash function hi. 
311 | * @return 312 | */ 313 | public final long[][] getCoefficients() { 314 | return hash_coefs; 315 | } 316 | } 317 | -------------------------------------------------------------------------------- /src/main/java/info/debatty/java/lsh/SuperBit.java: -------------------------------------------------------------------------------- 1 | /* 2 | * The MIT License 3 | * 4 | * Copyright 2015 Thibault Debatty. 5 | * 6 | * Permission is hereby granted, free of charge, to any person obtaining a copy 7 | * of this software and associated documentation files (the "Software"), to deal 8 | * in the Software without restriction, including without limitation the rights 9 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | * copies of the Software, and to permit persons to whom the Software is 11 | * furnished to do so, subject to the following conditions: 12 | * 13 | * The above copyright notice and this permission notice shall be included in 14 | * all copies or substantial portions of the Software. 15 | * 16 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 22 | * THE SOFTWARE. 23 | */ 24 | 25 | package info.debatty.java.lsh; 26 | 27 | import java.io.Serializable; 28 | import java.util.Random; 29 | 30 | /** 31 | * Implementation of Super-Bit Locality-Sensitive Hashing. 32 | * Super-Bit is an improvement of Random Projection LSH. 33 | * It computes an estimation of cosine similarity. 
34 | * 35 | * Super-Bit Locality-Sensitive Hashing 36 | * Jianqiu Ji, Jianmin Li, Shuicheng Yan, Bo Zhang, Qi Tian 37 | * http://papers.nips.cc/paper/4847-super-bit-locality-sensitive-hashing.pdf 38 | * Advances in Neural Information Processing Systems 25, 2012 39 | * 40 | * Supported input types: 41 | * - double[] 42 | * - others to come... 43 | * 44 | * @author Thibault Debatty 45 | */ 46 | public class SuperBit implements Serializable { 47 | 48 | private double[][] hyperplanes; 49 | private static final int DEFAULT_CODE_LENGTH = 10000; 50 | 51 | /** 52 | * Initialize SuperBit algorithm. 53 | * Super-Bit depth n must be [1 .. d] and number of Super-Bit l in [1 .. 54 | * The resulting code length k = n * l 55 | * The K vectors are orthogonalized in L batches of N vectors 56 | * 57 | * @param d data space dimension 58 | * @param n Super-Bit depth [1 .. d] 59 | * @param l number of Super-Bit [1 .. 60 | */ 61 | public SuperBit(final int d, final int n, final int l) { 62 | this(d, n, l, new Random()); 63 | } 64 | 65 | /** 66 | * Initialize SuperBit algorithm. 67 | * Super-Bit depth n must be [1 .. d] and number of Super-Bit l in [1 .. 68 | * The resulting code length k = n * l 69 | * The K vectors are orthogonalized in L batches of N vectors 70 | * 71 | * @param d data space dimension 72 | * @param n Super-Bit depth [1 .. d] 73 | * @param l number of Super-Bit [1 .. 
74 | * @param seed to use for the random number generator 75 | */ 76 | public SuperBit(final int d, final int n, final int l, final long seed) { 77 | this(d, n, l, new Random(seed)); 78 | } 79 | 80 | private SuperBit(final int d, final int n, final int l, final Random rand) { 81 | if (d <= 0) { 82 | throw new IllegalArgumentException("Dimension d must be >= 1"); 83 | } 84 | 85 | if (n < 1 || n > d) { 86 | throw new IllegalArgumentException( 87 | "Super-Bit depth N must be 1 <= N <= d"); 88 | } 89 | 90 | if (l < 1) { 91 | throw new IllegalArgumentException( 92 | "Number of Super-Bit L must be >= 1"); 93 | } 94 | 95 | // Input: Data space dimension d, Super-Bit depth 1 <= N <= d, 96 | // number of Super-Bit L >= 1, 97 | // resulting code length K = N * L 98 | 99 | // Generate a random matrix H with each element sampled independently 100 | // from the normal distribution 101 | // N (0, 1), with each column normalized to unit length. 102 | // Denote H = [v1, v2, ..., vK]. 103 | int code_length = n * l; 104 | 105 | double[][] v = new double[code_length][d]; 106 | 107 | for (int i = 0; i < code_length; i++) { 108 | double[] vector = new double[d]; 109 | for (int j = 0; j < d; j++) { 110 | vector[j] = rand.nextGaussian(); 111 | } 112 | 113 | normalize(vector); 114 | v[i] = vector; 115 | } 116 | 117 | 118 | // for i = 0 to L - 1 do 119 | // for j = 1 to N do 120 | // w_{iN+j} = v_{iN+j} 121 | // for k = 1 to j - 1 do 122 | // w_{iN+j} = w_{iN+j} - w_{iN+k} w^T_{iN+k} v_{iN+j} 123 | // end for 124 | // wiN+j = wiN+j / | wiN+j | 125 | // end for 126 | // end for 127 | // Output: H˜ = [w1, w2, ..., wK] 128 | 129 | double[][] w = new double[code_length][d]; 130 | for (int i = 0; i <= l - 1; i++) { 131 | for (int j = 1; j <= n; j++) { 132 | java.lang.System.arraycopy( 133 | v[i * n + j - 1], 134 | 0, 135 | w[i * n + j - 1], 136 | 0, 137 | d); 138 | 139 | for (int k = 1; k <= (j - 1); k++) { 140 | w[i * n + j - 1] = sub( 141 | w[i * n + j - 1], 142 | product( 143 | dotProduct( 
144 | w[i * n + k - 1], 145 | v[ i * n + j - 1]), 146 | w[i * n + k - 1])); 147 | } 148 | 149 | normalize(w[i * n + j - 1]); 150 | 151 | } 152 | } 153 | 154 | this.hyperplanes = w; 155 | } 156 | 157 | /** 158 | * Initialize SuperBit algorithm. 159 | * With code length K = 10000 160 | * The K vectors are orthogonalized in d batches of 10000/d vectors 161 | * The resulting mean error is 0.01 162 | * @param d 163 | */ 164 | public SuperBit(final int d) { 165 | this(d, d, DEFAULT_CODE_LENGTH / d); 166 | } 167 | 168 | /** 169 | * Initialize SuperBit algorithm without parameters 170 | * (used only for serialization). 171 | */ 172 | public SuperBit() { 173 | 174 | } 175 | 176 | /** 177 | * Compute the signature of this vector. 178 | * @param vector 179 | * @return 180 | */ 181 | public final boolean[] signature(final double[] vector) { 182 | boolean[] sig = new boolean[this.hyperplanes.length]; 183 | for (int i = 0; i < this.hyperplanes.length; i++) { 184 | sig[i] = (dotProduct(this.hyperplanes[i], vector) >= 0); 185 | } 186 | return sig; 187 | } 188 | 189 | /** 190 | * Compute the similarity between two signature, which is also an 191 | * estimation of the cosine similarity between the two vectors. 192 | * 193 | * @param sig1 194 | * @param sig2 195 | * @return estimated cosine similarity 196 | */ 197 | public final double similarity(final boolean[] sig1, final boolean[] sig2) { 198 | 199 | double agg = 0; 200 | for (int i = 0; i < sig1.length; i++) { 201 | if (sig1[i] == sig2[i]) { 202 | agg++; 203 | } 204 | } 205 | 206 | agg = agg / sig1.length; 207 | 208 | return Math.cos((1 - agg) * Math.PI); 209 | } 210 | 211 | /** 212 | * Get the hyperplanes coefficients used to compute signatures. 213 | * @return 214 | */ 215 | public final double[][] getHyperplanes() { 216 | return this.hyperplanes; 217 | } 218 | 219 | /* ---------------------- STATIC ---------------------- */ 220 | 221 | /** 222 | * Computes the cosine similarity, computed as v1 dot v2 / (|v1| * |v2|). 
223 | * Cosine similarity of two vectors is the cosine of the angle between them. 224 | * It ranges between -1 and +1 225 | * 226 | * @param v1 227 | * @param v2 228 | * @return 229 | */ 230 | public static double cosineSimilarity(final double[]v1, final double[] v2) { 231 | 232 | return dotProduct(v1, v2) / (norm(v1) * norm(v2)); 233 | } 234 | 235 | private static double[] product(final double x, final double[] v) { 236 | double[] r = new double[v.length]; 237 | for (int i = 0; i < v.length; i++) { 238 | r[i] = x * v[i]; 239 | } 240 | return r; 241 | } 242 | 243 | private static double[] sub(final double[] a, final double[] b) { 244 | double[] r = new double[a.length]; 245 | for (int i = 0; i < a.length; i++) { 246 | r[i] = a[i] - b[i]; 247 | } 248 | return r; 249 | } 250 | 251 | private static void normalize(final double[] vector) { 252 | double norm = norm(vector); 253 | for (int i = 0; i < vector.length; i++) { 254 | vector[i] = vector[i] / norm; 255 | } 256 | 257 | } 258 | 259 | /** 260 | * Returns the norm L2. sqrt(sum_i(v_i^2)) 261 | * @param v 262 | * @return 263 | */ 264 | private static double norm(final double[] v) { 265 | double agg = 0; 266 | 267 | for (int i = 0; i < v.length; i++) { 268 | agg += (v[i] * v[i]); 269 | } 270 | 271 | return Math.sqrt(agg); 272 | } 273 | 274 | private static double dotProduct(final double[] v1, final double[] v2) { 275 | double agg = 0; 276 | 277 | for (int i = 0; i < v1.length; i++) { 278 | agg += (v1[i] * v2[i]); 279 | } 280 | 281 | return agg; 282 | } 283 | } 284 | -------------------------------------------------------------------------------- /src/main/java/info/debatty/java/lsh/examples/InitialSeed.java: -------------------------------------------------------------------------------- 1 | /* 2 | * The MIT License 3 | * 4 | * Copyright 2016 Thibault Debatty. 
5 | * 6 | * Permission is hereby granted, free of charge, to any person obtaining a copy 7 | * of this software and associated documentation files (the "Software"), to deal 8 | * in the Software without restriction, including without limitation the rights 9 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | * copies of the Software, and to permit persons to whom the Software is 11 | * furnished to do so, subject to the following conditions: 12 | * 13 | * The above copyright notice and this permission notice shall be included in 14 | * all copies or substantial portions of the Software. 15 | * 16 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 22 | * THE SOFTWARE. 
23 | */ 24 | 25 | package info.debatty.java.lsh.examples; 26 | 27 | import info.debatty.java.lsh.MinHash; 28 | import java.util.Random; 29 | 30 | /** 31 | * 32 | * @author Thibault Debatty 33 | */ 34 | public class InitialSeed { 35 | 36 | /** 37 | * @param args the command line arguments 38 | */ 39 | public static void main(String[] args) { 40 | 41 | // Initialize two minhash objects, with the same seed 42 | int signature_size = 20; 43 | int dictionary_size = 100; 44 | long initial_seed = 123456; 45 | 46 | MinHash mh = new MinHash(signature_size, dictionary_size, initial_seed); 47 | MinHash mh2 = new MinHash(signature_size, dictionary_size, initial_seed); 48 | 49 | // Create a single vector of size dictionary_size 50 | Random r = new Random(); 51 | boolean[] vector = new boolean[dictionary_size]; 52 | for (int i = 0; i < dictionary_size; i++) { 53 | vector[i] = r.nextBoolean(); 54 | } 55 | 56 | // The two minhash objects will produce the same signature 57 | println(mh.signature(vector)); 58 | println(mh2.signature(vector)); 59 | } 60 | 61 | static void println(final int[] array) { 62 | System.out.print("["); 63 | for (int v : array) { 64 | System.out.print("" + v + " "); 65 | } 66 | System.out.println("]"); 67 | } 68 | } 69 | -------------------------------------------------------------------------------- /src/main/java/info/debatty/java/lsh/examples/LSHMinHashExample.java: -------------------------------------------------------------------------------- 1 | /* 2 | * The MIT License 3 | * 4 | * Copyright 2015 Thibault Debatty. 
5 | * 6 | * Permission is hereby granted, free of charge, to any person obtaining a copy 7 | * of this software and associated documentation files (the "Software"), to deal 8 | * in the Software without restriction, including without limitation the rights 9 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | * copies of the Software, and to permit persons to whom the Software is 11 | * furnished to do so, subject to the following conditions: 12 | * 13 | * The above copyright notice and this permission notice shall be included in 14 | * all copies or substantial portions of the Software. 15 | * 16 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 22 | * THE SOFTWARE. 
23 | */ 24 | 25 | package info.debatty.java.lsh.examples; 26 | 27 | import info.debatty.java.lsh.LSHMinHash; 28 | import info.debatty.java.lsh.MinHash; 29 | import java.util.Random; 30 | 31 | /** 32 | * 33 | * @author Thibault Debatty 34 | */ 35 | public class LSHMinHashExample { 36 | 37 | /** 38 | * @param args the command line arguments 39 | */ 40 | public static void main(String[] args) { 41 | // Number of sets 42 | int count = 2000; 43 | 44 | // Size of dictionary 45 | int n = 100; 46 | 47 | // Number of buckets 48 | // Attention: to get relevant results, the number of elements per bucket 49 | // should be at least 100 50 | int buckets = 10; 51 | 52 | // Let's generate some random sets 53 | boolean[][] vectors = new boolean[count][]; 54 | Random r = new Random(); 55 | 56 | // To get some interesting measures, we first generate a single 57 | // sparse random vector 58 | vectors[0] = new boolean[n]; 59 | for (int j = 0; j < n; j++) { 60 | vectors[0][j] = (r.nextInt(10) == 0); 61 | } 62 | 63 | // Then we generate the other vectors, which have a reasonable chance 64 | // to look like the first one... 65 | for (int i = 1; i < count; i++) { 66 | vectors[i] = new boolean[n]; 67 | 68 | for (int j = 0; j < n; j++) { 69 | vectors[i][j] = (r.nextDouble() <= 0.7 ? vectors[0][j] : (r.nextInt(10) == 0)); 70 | } 71 | } 72 | 73 | // Now we can proceed to LSH binning 74 | // We will test multiple stages 75 | for (int stages = 1; stages <= 10; stages++) { 76 | 77 | // Compute the LSH hash of each vector 78 | LSHMinHash lsh = new LSHMinHash(stages, buckets, n); 79 | int[][] hashes = new int[count][]; 80 | for (int i = 0; i < count; i++) { 81 | boolean[] vector = vectors[i]; 82 | hashes[i] = lsh.hash(vector); 83 | } 84 | 85 | // We now have the LSH hash for each input set 86 | // Let's have a look at how similar sets (according to Jaccard 87 | // index) were binned... 
88 | int[][] results = new int[11][2]; 89 | for (int i = 0; i < vectors.length; i++) { 90 | boolean[] vector1 = vectors[i]; 91 | int[] hash1 = hashes[i]; 92 | 93 | for (int j = 0; j < i; j++) { 94 | boolean[] vector2 = vectors[j]; 95 | int[] hash2 = hashes[j]; 96 | 97 | // We compute the similarity between each pair of sets 98 | double similarity = MinHash.jaccardIndex(vector1, vector2); 99 | 100 | // We count the number of pairs with similarity 0.1, 0.2, 101 | // 0.3, etc. 102 | results[(int) (10 * similarity)][0]++; 103 | 104 | // Do they fall in the same bucket for one of the stages? 105 | for (int stage = 0; stage < stages; stage++) { 106 | if (hash1[stage] == hash2[stage]) { 107 | results[(int) (10 * similarity)][1]++; 108 | break; 109 | } 110 | } 111 | } 112 | } 113 | 114 | // Now we can display (and plot in Gnuplot) the result: 115 | // For pairs that have a similarity x, the probability of falling 116 | // in the same bucket for at least one of the stages is y 117 | for (int i = 0; i < results.length; i++) { 118 | double similarity = (double) i / 10; 119 | 120 | double probability = 0; 121 | if (results[i][0] != 0) { 122 | probability = (double) results[i][1] / results[i][0]; 123 | } 124 | System.out.println("" + similarity + "\t" + probability + "\t" + stages); 125 | } 126 | 127 | // Separate the series for Gnuplot... 128 | System.out.print("\n"); 129 | } 130 | } 131 | } 132 | -------------------------------------------------------------------------------- /src/main/java/info/debatty/java/lsh/examples/LSHSuperBitExample.java: -------------------------------------------------------------------------------- 1 | /* 2 | * The MIT License 3 | * 4 | * Copyright 2015 Thibault Debatty. 
5 | * 6 | * Permission is hereby granted, free of charge, to any person obtaining a copy 7 | * of this software and associated documentation files (the "Software"), to deal 8 | * in the Software without restriction, including without limitation the rights 9 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | * copies of the Software, and to permit persons to whom the Software is 11 | * furnished to do so, subject to the following conditions: 12 | * 13 | * The above copyright notice and this permission notice shall be included in 14 | * all copies or substantial portions of the Software. 15 | * 16 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 22 | * THE SOFTWARE. 
23 | */ 24 | 25 | package info.debatty.java.lsh.examples; 26 | 27 | import info.debatty.java.lsh.LSHSuperBit; 28 | import java.util.Random; 29 | import java.util.logging.Level; 30 | import java.util.logging.Logger; 31 | 32 | /** 33 | * 34 | * @author Thibault Debatty 35 | */ 36 | public class LSHSuperBitExample { 37 | 38 | /** 39 | * @param args the command line arguments 40 | */ 41 | public static void main(String[] args) { 42 | 43 | int count = 100; 44 | 45 | // R^n 46 | int n = 3; 47 | 48 | int stages = 2; 49 | int buckets = 4; 50 | 51 | // Produce some vectors in R^n 52 | Random r = new Random(); 53 | double[][] vectors = new double[count][]; 54 | for (int i = 0; i < count; i++) { 55 | vectors[i] = new double[n]; 56 | 57 | for (int j = 0; j < n; j++) { 58 | vectors[i][j] = r.nextGaussian(); 59 | } 60 | } 61 | try { 62 | LSHSuperBit lsh = new LSHSuperBit(stages, buckets, n); 63 | 64 | // Compute a SuperBit signature, and a LSH hash 65 | for (int i = 0; i < count; i++) { 66 | double[] vector = vectors[i]; 67 | int[] hash = lsh.hash(vector); 68 | for (double v : vector) { 69 | System.out.printf("%6.2f\t", v); 70 | } 71 | System.out.print(hash[0]); 72 | System.out.print("\n"); 73 | } 74 | } catch (Exception ex) { 75 | Logger.getLogger(LSHSuperBitExample.class.getName()).log(Level.SEVERE, null, ex); 76 | } 77 | } 78 | } 79 | -------------------------------------------------------------------------------- /src/main/java/info/debatty/java/lsh/examples/MinHashExample.java: -------------------------------------------------------------------------------- 1 | /* 2 | * The MIT License 3 | * 4 | * Copyright 2015 Thibault Debatty. 
5 | * 6 | * Permission is hereby granted, free of charge, to any person obtaining a copy 7 | * of this software and associated documentation files (the "Software"), to deal 8 | * in the Software without restriction, including without limitation the rights 9 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | * copies of the Software, and to permit persons to whom the Software is 11 | * furnished to do so, subject to the following conditions: 12 | * 13 | * The above copyright notice and this permission notice shall be included in 14 | * all copies or substantial portions of the Software. 15 | * 16 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 22 | * THE SOFTWARE. 
23 | */ 24 | 25 | package info.debatty.java.lsh.examples; 26 | 27 | import info.debatty.java.lsh.MinHash; 28 | import java.util.TreeSet; 29 | 30 | /** 31 | * 32 | * @author Thibault Debatty 33 | */ 34 | public class MinHashExample { 35 | 36 | public static void main(String[] args) { 37 | // Initialize the hash function for an similarity error of 0.1 38 | // For sets built from a dictionary of 5 items 39 | MinHash minhash = new MinHash(0.1, 5); 40 | 41 | // Sets can be defined as an vector of booleans: 42 | // [1 0 0 1 0] 43 | boolean[] vector1 = {true, false, false, true, false}; 44 | int[] sig1 = minhash.signature(vector1); 45 | 46 | // Or as a set of integers: 47 | // set2 = [1 0 1 1 0] 48 | TreeSet set2 = new TreeSet(); 49 | set2.add(0); 50 | set2.add(2); 51 | set2.add(3); 52 | int[] sig2 = minhash.signature(set2); 53 | 54 | System.out.println("Signature similarity: " + minhash.similarity(sig1, sig2)); 55 | System.out.println("Real similarity (Jaccard index)" + 56 | MinHash.jaccardIndex(MinHash.convert2Set(vector1), set2)); 57 | } 58 | } 59 | -------------------------------------------------------------------------------- /src/main/java/info/debatty/java/lsh/examples/SerializeExample.java: -------------------------------------------------------------------------------- 1 | /* 2 | * The MIT License 3 | * 4 | * Copyright 2015 Thibault Debatty. 5 | * 6 | * Permission is hereby granted, free of charge, to any person obtaining a copy 7 | * of this software and associated documentation files (the "Software"), to deal 8 | * in the Software without restriction, including without limitation the rights 9 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | * copies of the Software, and to permit persons to whom the Software is 11 | * furnished to do so, subject to the following conditions: 12 | * 13 | * The above copyright notice and this permission notice shall be included in 14 | * all copies or substantial portions of the Software. 
15 | * 16 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 22 | * THE SOFTWARE. 23 | */ 24 | package info.debatty.java.lsh.examples; 25 | 26 | import info.debatty.java.lsh.LSHMinHash; 27 | import java.io.File; 28 | import java.io.FileInputStream; 29 | import java.io.FileOutputStream; 30 | import java.io.IOException; 31 | import java.io.ObjectInputStream; 32 | import java.io.ObjectOutputStream; 33 | import java.nio.file.Files; 34 | import java.util.Random; 35 | 36 | /** 37 | * 38 | * @author Thibault Debatty 39 | */ 40 | public class SerializeExample { 41 | 42 | /** 43 | * @param args the command line arguments 44 | * @throws java.io.IOException 45 | * @throws java.lang.ClassNotFoundException 46 | */ 47 | public static void main(String[] args) 48 | throws IOException, ClassNotFoundException { 49 | 50 | // Create a single random boolean vector 51 | int n = 100; 52 | double sparsity = 0.75; 53 | boolean[] vector = new boolean[n]; 54 | Random rand = new Random(); 55 | for (int j = 0; j < n; j++) { 56 | vector[j] = rand.nextDouble() > sparsity; 57 | } 58 | 59 | // Create and configure LSH 60 | int stages = 2; 61 | int buckets = 10; 62 | LSHMinHash lsh = new LSHMinHash(stages, buckets, n); 63 | println(lsh.hash(vector)); 64 | 65 | // Create another LSH object 66 | // as the parameters of the hashing function are randomly initialized 67 | // these two LSH objects will produce different hashes for the same 68 | // input vector! 
69 | LSHMinHash other_lsh = new LSHMinHash(stages, buckets, n); 70 | println(other_lsh.hash(vector)); 71 | 72 | // Moreover, signatures produced by different LSH objects cannot 73 | // be used to compute estimated similarity! 74 | // The solution is to serialize and save the object, so it can be 75 | // reused later... 76 | File tempfile = Files.createTempFile("lshobject", ".ser").toFile(); 77 | FileOutputStream fout = new FileOutputStream(tempfile); 78 | ObjectOutputStream oos = new ObjectOutputStream(fout); 79 | oos.writeObject(lsh); 80 | oos.close(); 81 | System.out.println( 82 | "LSH object serialized to " + tempfile.getAbsolutePath()); 83 | 84 | FileInputStream fin = new FileInputStream(tempfile); 85 | ObjectInputStream ois = new ObjectInputStream(fin); 86 | LSHMinHash saved_lsh = (LSHMinHash) ois.readObject(); 87 | println(saved_lsh.hash(vector)); 88 | } 89 | 90 | static void println(int[] array) { 91 | System.out.print("["); 92 | for (int v : array) { 93 | System.out.print("" + v + " "); 94 | } 95 | System.out.println("]"); 96 | } 97 | } 98 | -------------------------------------------------------------------------------- /src/main/java/info/debatty/java/lsh/examples/SimpleLSHMinHashExample.java: -------------------------------------------------------------------------------- 1 | /* 2 | * The MIT License 3 | * 4 | * Copyright 2015 tibo. 5 | * 6 | * Permission is hereby granted, free of charge, to any person obtaining a copy 7 | * of this software and associated documentation files (the "Software"), to deal 8 | * in the Software without restriction, including without limitation the rights 9 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | * copies of the Software, and to permit persons to whom the Software is 11 | * furnished to do so, subject to the following conditions: 12 | * 13 | * The above copyright notice and this permission notice shall be included in 14 | * all copies or substantial portions of the Software. 
15 | * 16 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 22 | * THE SOFTWARE. 23 | */ 24 | 25 | package info.debatty.java.lsh.examples; 26 | 27 | import info.debatty.java.lsh.LSHMinHash; 28 | import java.util.Random; 29 | 30 | /** 31 | * 32 | * @author tibo 33 | */ 34 | public class SimpleLSHMinHashExample { 35 | 36 | /** 37 | * @param args the command line arguments 38 | */ 39 | public static void main(String[] args) { 40 | // proportion of 0's in the vectors 41 | // if the vectors are dense (lots of 1's), the average jaccard similarity 42 | // will be very high (especially for large vectors), and LSH 43 | // won't be able to distinguish them 44 | // as a result, all vectors will be binned in the same bucket... 
45 | double sparsity = 0.75; 46 | 47 | // Number of sets 48 | int count = 10000; 49 | 50 | // Size of vectors 51 | int n = 100; 52 | 53 | // LSH parameters 54 | // the number of stages is also sometimes called thge number of bands 55 | int stages = 2; 56 | 57 | // Attention: to get relevant results, the number of elements per bucket 58 | // should be at least 100 59 | int buckets = 10; 60 | 61 | // Let's generate some random sets 62 | boolean[][] vectors = new boolean[count][n]; 63 | Random rand = new Random(); 64 | 65 | for (int i = 0; i < count; i++) { 66 | for (int j = 0; j < n; j++) { 67 | vectors[i][j] = rand.nextDouble() > sparsity; 68 | } 69 | } 70 | 71 | // Create and configure LSH algorithm 72 | LSHMinHash lsh = new LSHMinHash(stages, buckets, n); 73 | 74 | int[][] counts = new int[stages][buckets]; 75 | 76 | // Perform hashing 77 | for (boolean[] vector : vectors) { 78 | int[] hash = lsh.hash(vector); 79 | 80 | for (int i = 0; i < hash.length; i++) { 81 | counts[i][hash[i]]++; 82 | } 83 | 84 | print(vector); 85 | System.out.print(" : "); 86 | print(hash); 87 | System.out.print("\n"); 88 | } 89 | 90 | System.out.println("Number of elements per bucket at each stage:"); 91 | for (int i = 0; i < stages; i++) { 92 | print(counts[i]); 93 | System.out.print("\n"); 94 | } 95 | } 96 | 97 | static void print(int[] array) { 98 | System.out.print("["); 99 | for (int v : array) { 100 | System.out.print("" + v + " "); 101 | } 102 | System.out.print("]"); 103 | } 104 | 105 | static void print(boolean[] array) { 106 | System.out.print("["); 107 | for (boolean v : array) { 108 | System.out.print(v ? "1" : "0"); 109 | } 110 | System.out.print("]"); 111 | } 112 | } 113 | -------------------------------------------------------------------------------- /src/main/java/info/debatty/java/lsh/examples/SuperBitExample.java: -------------------------------------------------------------------------------- 1 | /* 2 | * The MIT License 3 | * 4 | * Copyright 2015 tibo. 
5 | * 6 | * Permission is hereby granted, free of charge, to any person obtaining a copy 7 | * of this software and associated documentation files (the "Software"), to deal 8 | * in the Software without restriction, including without limitation the rights 9 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | * copies of the Software, and to permit persons to whom the Software is 11 | * furnished to do so, subject to the following conditions: 12 | * 13 | * The above copyright notice and this permission notice shall be included in 14 | * all copies or substantial portions of the Software. 15 | * 16 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 22 | * THE SOFTWARE. 
23 | */ 24 | 25 | package info.debatty.java.lsh.examples; 26 | 27 | import info.debatty.java.lsh.SuperBit; 28 | import java.util.Random; 29 | 30 | /** 31 | * @author Thibault Debatty 32 | */ 33 | public class SuperBitExample { 34 | 35 | /** 36 | * @param args the command line arguments 37 | */ 38 | public static void main(String[] args) { 39 | 40 | int n = 100; 41 | 42 | SuperBit sb = new SuperBit(n); 43 | 44 | Random rand = new Random(); 45 | double[] v1 = new double[n]; 46 | double[] v2 = new double[n]; 47 | for (int i = 0; i < n; i++) { 48 | v1[i] = rand.nextInt(); 49 | v2[i] = rand.nextInt(); 50 | } 51 | 52 | boolean[] sig1 = sb.signature(v1); 53 | boolean[] sig2 = sb.signature(v2); 54 | 55 | System.out.println("Signature (estimated) similarity: " + 56 | sb.similarity(sig1, sig2)); 57 | System.out.println("Real (cosine) similarity: " + 58 | SuperBit.cosineSimilarity(v1, v2)); 59 | 60 | } 61 | 62 | } 63 | -------------------------------------------------------------------------------- /src/test/java/info/debatty/java/lsh/LSHMinHashTest.java: -------------------------------------------------------------------------------- 1 | /* 2 | * The MIT License 3 | * 4 | * Copyright 2016 Thibault Debatty. 5 | * 6 | * Permission is hereby granted, free of charge, to any person obtaining a copy 7 | * of this software and associated documentation files (the "Software"), to deal 8 | * in the Software without restriction, including without limitation the rights 9 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | * copies of the Software, and to permit persons to whom the Software is 11 | * furnished to do so, subject to the following conditions: 12 | * 13 | * The above copyright notice and this permission notice shall be included in 14 | * all copies or substantial portions of the Software. 
15 | * 16 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 22 | * THE SOFTWARE. 23 | */ 24 | package info.debatty.java.lsh; 25 | 26 | import java.util.Random; 27 | import org.junit.Test; 28 | 29 | /** 30 | * 31 | * @author Thibault Debatty 32 | */ 33 | public class LSHMinHashTest { 34 | 35 | /** 36 | * Test of hash method, of class LSHMinHash. 37 | */ 38 | @Test 39 | public void testHash() { 40 | System.out.println("hash"); 41 | 42 | // proportion of 0's in the vectors 43 | // if the vectors are dense (lots of 1's), the average jaccard similarity 44 | // will be very high (especially for large vectors), and LSH 45 | // won't be able to distinguish them 46 | // as a result, all vectors will be binned in the same bucket... 
47 | double sparsity = 0.75; 48 | 49 | // Number and size of vectors 50 | int count = 1000; 51 | int n = 10000; 52 | 53 | int stages = 2; 54 | int buckets = 10; 55 | 56 | // Let's generate some random sets 57 | boolean[][] vectors = new boolean[count][n]; 58 | Random rand = new Random(); 59 | 60 | for (int i = 0; i < count; i++) { 61 | for (int j = 0; j < n; j++) { 62 | vectors[i][j] = rand.nextDouble() > sparsity; 63 | } 64 | } 65 | 66 | LSHMinHash lsh = new LSHMinHash(stages, buckets, n); 67 | int[][] counts = new int[stages][buckets]; 68 | 69 | // Perform hashing 70 | for (boolean[] vector : vectors) { 71 | int[] hash = lsh.hash(vector); 72 | 73 | for (int i = 0; i < hash.length; i++) { 74 | // this will raise an ArrayIndexOutOfBoundsException 75 | // if the bin values are negatives or too large 76 | counts[i][hash[i]]++; 77 | } 78 | } 79 | } 80 | } 81 | -------------------------------------------------------------------------------- /src/test/java/info/debatty/java/lsh/MinHashTest.java: -------------------------------------------------------------------------------- 1 | package info.debatty.java.lsh; 2 | 3 | import static org.junit.Assert.assertArrayEquals; 4 | 5 | import java.util.HashSet; 6 | import java.util.Random; 7 | import java.util.Set; 8 | 9 | import org.junit.Test; 10 | 11 | /** 12 | * 13 | * @author Thibault Debatty 14 | */ 15 | public class MinHashTest { 16 | 17 | /** 18 | * Test with initial seed. 
19 | */ 20 | @Test 21 | public void testSeed() { 22 | MinHash mh = new MinHash(100, 100, 123456); 23 | MinHash mh2 = new MinHash(100, 100, 123456); 24 | 25 | Random r = new Random(); 26 | 27 | Set ints = new HashSet(); 28 | for (int i = 0; i < 50; i++) { 29 | ints.add(r.nextInt()); 30 | } 31 | 32 | assertArrayEquals(mh.signature(ints), mh2.signature(ints)); 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /src/test/java/info/debatty/java/lsh/SuperBitTest.java: -------------------------------------------------------------------------------- 1 | package info.debatty.java.lsh; 2 | 3 | import static org.junit.Assert.assertEquals; 4 | 5 | import java.util.Random; 6 | 7 | import org.junit.Test; 8 | 9 | /** 10 | * 11 | * @author Thibault Debatty 12 | */ 13 | public class SuperBitTest { 14 | 15 | /** 16 | * Test with initial seed. 17 | */ 18 | @Test 19 | public final void testSeed() { 20 | int d = 50; 21 | SuperBit sb = new SuperBit(d, 25, 100, 123456); 22 | SuperBit sb2 = new SuperBit(d, 25, 100, 123456); 23 | 24 | Random r = new Random(); 25 | 26 | double[] vector = new double[d]; 27 | for (int i = 0; i < d; i++) { 28 | vector[i] = r.nextDouble(); 29 | } 30 | 31 | boolean[] sig1 = sb.signature(vector); 32 | boolean[] sig2 = sb2.signature(vector); 33 | 34 | for (int i = 0; i < sig1.length; i++) { 35 | assertEquals( 36 | "Signatures are different at index " + i, sig1[i], sig2[i]); 37 | } 38 | } 39 | } 40 | --------------------------------------------------------------------------------