├── LSH.java ├── MinHash.java ├── MinHashAccuracy.java ├── MinHashSpeed.java ├── NearDuplicates.java ├── ProcessingFunctions.java └── README.md /LSH.java: -------------------------------------------------------------------------------- 1 | import java.util.ArrayList; 2 | import java.util.Arrays; 3 | import java.util.HashMap; 4 | import java.util.HashSet; 5 | import java.util.Map; 6 | import java.util.Random; 7 | import java.util.Set; 8 | 9 | /** 10 | * Uses LSH to find near duplicates for documents. This is done by splitting the 11 | * MinHash matrix into bands and hashing each band. Any documents where a band 12 | * is hashed to the same bucket are considered near duplicates. 13 | * 14 | * @author Alex Shum 15 | */ 16 | public class LSH { 17 | private int n; //number of documents 18 | int rows; //number of rows per band 19 | private int[][] minHashMatrix; //min hash mtx 20 | private String[] docNames; //docnames 21 | private Map hashTable; //Key = 22 | 23 | int p; //hash table modulous 24 | int a; //hash function ax + b % p 25 | int b; //hash function ax + b % p 26 | String name; 27 | 28 | /** 29 | * Creates a new LSH object. 30 | * @param minHashMatrix MinHash matrix where rows are documents, columns are the hash functions. 31 | * @param docNames Array of document names in collection. 32 | * @param bands Number of bands to split minHash matrix into. 
33 | */ 34 | public LSH(int[][] minHashMatrix, String[] docNames, int bands) { 35 | Random r = new Random(); 36 | 37 | n = minHashMatrix.length; 38 | rows = minHashMatrix[0].length / bands; 39 | this.minHashMatrix = minHashMatrix; 40 | this.docNames = docNames; 41 | 42 | p = ProcessingFunctions.nextPrime(5 * n); 43 | a = r.nextInt(p); 44 | b = r.nextInt(p); 45 | hashTable = new HashMap(); 46 | 47 | int currBand = 0; 48 | int currProd = 1; 49 | for(int i = 0; i < n; i++) { //all documents 50 | for(int j = 0; j < minHashMatrix[0].length; j++) { //rows in document 51 | currBand = j / rows; 52 | currProd = currProd + (a * minHashMatrix[i][j] + b); 53 | currProd = currProd % p; 54 | 55 | if((j + 1) % rows == 0 || (j + 1) == n) { 56 | Pair pa = new Pair(currBand, currProd); 57 | String names = hashTable.get(pa); 58 | names = names == null ? docNames[i] : names + "~::~" + docNames[i]; 59 | 60 | hashTable.put(pa, names); 61 | currProd = 1; 62 | } 63 | } 64 | } 65 | } 66 | 67 | /** 68 | * Computes a list of near duplicate documents. Near duplicate documents are 69 | * documents where at least one of the bands hash to the same bucket. 70 | * @param docName The document to find near duplicates for. 71 | * @return List of near duplicates for docName. 
72 | */ 73 | public ArrayList nearDuplicatesOf(String docName) { 74 | Set setDuplicates = new HashSet(); 75 | ArrayList nearDuplicates = new ArrayList(); 76 | 77 | int docIndex = 0; 78 | for(int i = 0; i < n; i++) { 79 | if(docNames[i].equals(docName)) { 80 | docIndex = i; 81 | } 82 | } 83 | 84 | int currBand = 0; 85 | int currProd = 1; 86 | String[] currString; 87 | for(int i = 0; i < minHashMatrix[docIndex].length; i++) { 88 | currBand = i / rows; 89 | currProd = currProd + (a * minHashMatrix[docIndex][i] + b); 90 | currProd = currProd % p; 91 | 92 | if((i + 1) % rows == 0 || (i + 1) == minHashMatrix[docIndex].length) { 93 | Pair pa = new Pair(currBand, currProd); 94 | String names = hashTable.get(pa); 95 | names = names == null ? "" : names; 96 | 97 | if(!names.equals("")) { 98 | currString = names.split("~::~"); 99 | setDuplicates.addAll(Arrays.asList(currString)); 100 | } 101 | currProd = 1; 102 | } 103 | } 104 | 105 | nearDuplicates.addAll(setDuplicates); 106 | return(nearDuplicates); 107 | } 108 | 109 | /** 110 | * Container object to store the band number and hash value. 111 | * In LSH, the MinHash matrix is split into B bands: 1,2,...,B 112 | * and each band is R-rows. The R-rows in a band are hashed to 113 | * get a hash value for that band. This object stores the band 114 | * Number and the hash value for that band. 115 | * 116 | * @author Alex Shum 117 | */ 118 | public class Pair { 119 | int band; 120 | int hashVal; 121 | 122 | /** 123 | * Creates a new container object. 124 | * @param band Band integer. 125 | * @param hashVal Hash value for that band. 126 | */ 127 | public Pair(int band, int hashVal) { 128 | this.band = band; 129 | this.hashVal = hashVal; 130 | } 131 | 132 | 133 | /** 134 | * Returns the hash code of container for HashMaps. 135 | * @return HashCode of Pair container. 
136 | */ 137 | @Override 138 | public int hashCode() { 139 | int hash = 5; 140 | hash = hash * band * hashVal; 141 | 142 | return(hash); 143 | } 144 | 145 | 146 | /** 147 | * Compares equality against other Pair containers. 148 | * @param The other object to check for equality 149 | * @return True if band and hashvalue are equal, otherwise false. 150 | */ 151 | @Override 152 | public boolean equals(Object other) { 153 | if(other == null) return(false); 154 | if(other == this) return(true); 155 | if(!(other instanceof Pair)) return(false); 156 | 157 | Pair p = (Pair) other; 158 | return(band == p.band && hashVal == p.hashVal); 159 | } 160 | 161 | /** 162 | * Gives a string representation of this object. 163 | * @return String representation with band and hashvalue. 164 | */ 165 | @Override 166 | public String toString() { 167 | String s = "(" + band + "," + hashVal + ")"; 168 | 169 | return(s); 170 | } 171 | } 172 | } 173 | -------------------------------------------------------------------------------- /MinHash.java: -------------------------------------------------------------------------------- 1 | import java.io.BufferedReader; 2 | import java.io.File; 3 | import java.io.FileReader; 4 | import java.io.IOException; 5 | import java.util.ArrayList; 6 | import java.util.Arrays; 7 | import java.util.List; 8 | import java.util.Random; 9 | import java.util.Set; 10 | 11 | /** 12 | * Generates minhash for a collection of documents. 13 | * 14 | * The permutations are represented by randomized hash functions: ax + b % p. 15 | * p is a prime such that p >= n where n is the number of terms in the collection. 16 | * a and b are chosen uniformly at random from {1,2,...,p-1}. 17 | * 18 | * MinHash matrix generated will by M * N. M = number documents and N = number of permutations. 19 | * Each element in the MinHash matrix will be the MinHash value of the document. 20 | * @note: MinHash matrix has documents as rows and permutations as columns. 
21 | * 22 | * There is minimal preprocessing 23 | * @author Alex Shum 24 | */ 25 | public class MinHash { 26 | File folder; 27 | int numPermutations; 28 | int numTerms; 29 | int mod; //p: ax + b % p 30 | List AB; //a, b: ax + b % p 31 | 32 | /** 33 | * Constructor that takes a folder and number of permutations. 34 | * @param folder Folder with documents. 35 | * @param numPermutations Number of permutations for MinHash. 36 | * @throws IOException If folder cannot be opened. 37 | */ 38 | public MinHash(String folder, int numPermutations) throws IOException { 39 | this.folder = new File(folder); 40 | this.numPermutations = numPermutations; 41 | 42 | numTerms = ProcessingFunctions.numUnique(this.folder); 43 | mod = ProcessingFunctions.nextPrime(numTerms); 44 | AB = generateCoefficients(mod); 45 | } 46 | 47 | /** 48 | * Returns names of documents in collection. 49 | * @return String array of document names. 50 | */ 51 | public String[] allDocs() { 52 | return(folder.list()); 53 | } 54 | 55 | /** 56 | * Calculates the exact jaccard simularity between two documents. 57 | * @param file1 Filename of first document. 58 | * @param file2 Filename of second document. 59 | * @return Jaccard simularity 60 | * @throws IOException If files cannot be opened. 61 | */ 62 | public double exactJaccard(String file1, String file2) throws IOException { 63 | Set words1 = ProcessingFunctions.UniqueWordList(folder + File.separator + file1); 64 | Set words2 = ProcessingFunctions.UniqueWordList(folder + File.separator + file2); 65 | 66 | int a = words1.size(); 67 | int b = words2.size(); 68 | 69 | words1.retainAll(words2); 70 | int intersect = words1.size(); 71 | 72 | return((double) intersect / (a + b - intersect)); 73 | } 74 | 75 | /** 76 | * Calculates the MinHash signature. 77 | * @param fileName Filename of document. 78 | * @return MinHash signature as int array. 79 | * @throws IOException If file cannot be opened. 
80 | */ 81 | public int[] minHashSig(String fileName) throws IOException { 82 | FileReader fr = new FileReader(folder + File.separator + fileName); 83 | BufferedReader b = new BufferedReader(fr); 84 | 85 | String line; 86 | String[] words; 87 | int hashVal; 88 | int[] minHashVals = new int[numPermutations]; 89 | Arrays.fill(minHashVals, Integer.MAX_VALUE); 90 | while((line = b.readLine()) != null) { //iterate through lines 91 | words = line.replaceAll("[.,:;']", "").toLowerCase().split("\\s+"); //remove punctuation 92 | 93 | for(int j = 0; j < words.length; j++) { //iterate through words 94 | if(!ProcessingFunctions.isStopWord(words[j])) { 95 | for(int i = 0; i < numPermutations; i++) { //hash through k-functions 96 | hashVal = word2int(words[j], AB.get(i).a, AB.get(i).b, mod); 97 | if(hashVal < minHashVals[i]) minHashVals[i] = hashVal; 98 | } 99 | } 100 | } 101 | } 102 | b.close(); 103 | 104 | return(minHashVals); 105 | } 106 | 107 | /** 108 | * Computes the approximate jaccard simularity by using the MinHash signatures. 109 | * @param file1 Filename of first document. 110 | * @param file2 Filename of second document. 111 | * @return Approximate jaccard simularity. 112 | * @throws IOException If files cannot be opened. 113 | */ 114 | public double approximateJaccard(String file1, String file2) throws IOException { 115 | int[] hash1 = minHashSig(file1); 116 | int[] hash2 = minHashSig(file2); 117 | 118 | return(approximateJaccard(hash1, hash2)); 119 | } 120 | 121 | /** 122 | * Computes the approximate jaccard simularity by using the MinHash signatures. 123 | * @param d1 MinHash signature of first document. 124 | * @param d2 MinHash signature of second document. 125 | * @return Approximate jaccard simularity. 
126 | */ 127 | public double approximateJaccard(int[] d1, int[] d2) { 128 | double numMatch = 0.0; 129 | for(int i = 0; i < numPermutations; i++) { 130 | if(d1[i] == d2[i]) numMatch++; 131 | } 132 | 133 | return(numMatch / numPermutations); 134 | } 135 | 136 | /** 137 | * Computes the MinHash signature for all documents in the collection. 138 | * @note The rows of the matrix are the documents and columns are the permutations (hash functions). 139 | * @return MinHash signatures as a 2d int array. 140 | * @throws IOException If files cannot be read. 141 | */ 142 | public int[][] minHashMatrix() throws IOException { 143 | File[] contents = folder.listFiles(); 144 | int[][] minHashMatrix = new int[contents.length][numPermutations]; //documents are rows 145 | 146 | int[] doc; 147 | for(int i = 0; i < contents.length; i++) { 148 | if(contents[i].isFile()) { 149 | doc = minHashSig(contents[i].getName()); 150 | 151 | for(int j = 0; j < numPermutations; j++) { 152 | minHashMatrix[i][j] = doc[j]; //documents are rows 153 | } 154 | } 155 | } 156 | 157 | return(minHashMatrix); 158 | } 159 | 160 | /** 161 | * Gives the total number of unique terms in the collection of documents after basic preprocessing. 162 | * See the isStopWord function in PreprocessingFunctions.java for more details. 163 | * @return Number of terms in the collection of documents. 164 | */ 165 | public int numTerms() { 166 | return(numTerms); 167 | } 168 | 169 | /** 170 | * Gives the number of permutations used for MinHash matrix. 171 | * @return Number of permutations 172 | */ 173 | public int numPermutations() { 174 | return(numPermutations); 175 | } 176 | 177 | /** 178 | * Hashes a word into an integer using ax + b % p hash function. 179 | * @param s Word to hash. 180 | * @param a First coefficient in hash function. 181 | * @param b Second coefficient in hash function. 182 | * @param mod Modulus of hash function. 183 | * @return Hash value of word. 
184 | */ 185 | private int word2int(String s, int a, int b, int mod) { 186 | int hashed = 0; 187 | 188 | for(int i = 0; i < s.length(); i++) { 189 | hashed ^= s.charAt(i); 190 | hashed = a + b * hashed; 191 | hashed = hashed % mod; 192 | } 193 | 194 | return(hashed); 195 | } 196 | 197 | /** 198 | * Container object for a pair of coefficients to be used as part of the hash function. 199 | * Hash functions of form ax + b % p, this object will store coefficients a and b. 200 | * 201 | * This is mainly used as a quick way to check if any of the k-hash functions used for 202 | * the MinHash matrix are duplicated. 203 | * @author Alex Shum 204 | */ 205 | public class Pair { 206 | int a, b; 207 | 208 | /** 209 | * Creates a new coefficient pair container. 210 | * @param a The first coefficient. 211 | * @param b The second coefficient. 212 | */ 213 | public Pair(int a, int b) { 214 | this.a = a; 215 | this.b = b; 216 | } 217 | 218 | /** 219 | * Checks if another pair container is equal to this one. 220 | * @param other The other pair to check for equality. 221 | * @return true if both coefficients are equal. Otherwise false. 222 | */ 223 | @Override 224 | public boolean equals(Object other) { 225 | if(other == null) return(false); 226 | if(other == this) return(true); 227 | if(!(other instanceof Pair)) return(false); 228 | 229 | Pair p = (Pair) other; 230 | return(a == p.a && b == p.b); 231 | } 232 | } 233 | 234 | /** 235 | * Generates k-random hash functions. k is equal to the number of permutations. 236 | * @param mod The modulus for the hash function. 237 | * @return List of pairs of coefficients for hash functions. 
238 | */ 239 | private List generateCoefficients(int mod) { 240 | Random r = new Random(); 241 | List coef = new ArrayList(); 242 | 243 | Pair p = new Pair(r.nextInt(mod), r.nextInt(mod)); 244 | for(int i = 0; i < numPermutations; i++) { 245 | while(coef.contains(p)) { 246 | p = new Pair(r.nextInt(mod), r.nextInt(mod)); 247 | } 248 | coef.add(p); 249 | } 250 | 251 | return(coef); 252 | } 253 | 254 | } 255 | -------------------------------------------------------------------------------- /MinHashAccuracy.java: -------------------------------------------------------------------------------- 1 | import java.io.IOException; 2 | 3 | /** 4 | * Compares the accuracy of using the MinHash matrix for jaccard similarity with 5 | * various number of permutations. User must specify with collection of documents, 6 | * for use with MinHash matrix, and the . This will 7 | * print out the number of document pairs where the approximate jaccard similarity and exact 8 | * jaccard similarity differ by more than the . 9 | * 10 | * @author Alex Shum 11 | */ 12 | public class MinHashAccuracy { 13 | 14 | /** 15 | * Calculates the approximate jaccard similarity and exact jaccard similarity for all 16 | * pairs of documents and then counts the number of times the difference between 17 | * the exact and approximate jaccard similarities differs by more than the user 18 | * specified error parameter. 19 | * 20 | * @param args folder, number of permutations, error parameter. 21 | * @throws NumberFormatException If number of permutations not formatted correctly or 22 | * error paramter not formatted correctly. 23 | * @throws IOException If files cannot be opened. 
24 | */ 25 | public static void main(String[] args) throws NumberFormatException, IOException { 26 | if(args.length != 3) throw new IllegalArgumentException("Enter "); 27 | 28 | MinHash mh = new MinHash(args[0], Integer.parseInt(args[1])); 29 | double eps = Double.parseDouble(args[2]); 30 | String[] files = mh.allDocs(); 31 | int[][] minHashMat = mh.minHashMatrix(); 32 | 33 | int count = 0; 34 | int comp = 0; 35 | double exact; 36 | double approx; 37 | for(int i = 0; i < files.length; i++) { 38 | System.out.println(files[i]); 39 | for(int j = i + 1; j < files.length; j++) { 40 | exact = mh.exactJaccard(files[i], files[j]); 41 | approx = mh.approximateJaccard(minHashMat[i], minHashMat[j]); 42 | 43 | if(approx < exact && approx + eps < exact) { 44 | count++; 45 | } else if(approx > exact && approx - eps > exact) { 46 | count++; 47 | } 48 | comp++; 49 | } 50 | } 51 | System.out.println("Total number of comparisons: " + comp); 52 | System.out.println("Number of times exact and approximate jaccard differ more than epsilon: " + count); 53 | } 54 | } 55 | -------------------------------------------------------------------------------- /MinHashSpeed.java: -------------------------------------------------------------------------------- 1 | import java.io.IOException; 2 | 3 | /** 4 | * Compares the runtime for calculating approximate jaccard similarity using MinHash matrix and 5 | * calculating the exact jaccard similarity. User must specify with collection of 6 | * documents, for use with MinHash matrix. 7 | * 8 | * @author Alex Shum 9 | */ 10 | public class MinHashSpeed { 11 | 12 | /** 13 | * Calculates approximate jaccard similarity and exact jaccard similarity between all 14 | * pairs of documents. Prints time it takes to calculate exact jaccard similarity and 15 | * time it takes to calculate approximate jaccard similarity. 16 | * 17 | * @param args folder and number of permutations 18 | * @throws NumberFormatException If number of permutations not formatted correctly. 
19 | * @throws IOException If files cannot be opened. 20 | */ 21 | public static void main(String[] args) throws NumberFormatException, IOException { 22 | if(args.length != 2) throw new IllegalArgumentException("Enter "); 23 | MinHash mh = new MinHash(args[0], Integer.parseInt(args[1])); 24 | String[] allDocs = mh.allDocs(); 25 | 26 | long startTime; 27 | long endTime; 28 | double sec; 29 | startTime = System.currentTimeMillis(); 30 | for(int i = 0; i < allDocs.length; i++) { 31 | for(int j = i + 1; j < allDocs.length; j++) { 32 | mh.exactJaccard(allDocs[i], allDocs[j]); 33 | } 34 | } 35 | endTime = System.currentTimeMillis() - startTime; 36 | sec = (double) endTime / 1000; 37 | System.out.println("Exact jaccard total time: " + endTime + " (ms)"); 38 | System.out.println("Exact jaccard total time: " + sec + " seconds"); 39 | System.out.println("------------------------------"); 40 | 41 | startTime = System.currentTimeMillis(); 42 | int[][] minHashMat = mh.minHashMatrix(); 43 | for(int i = 0; i < minHashMat.length; i++) { 44 | for(int j = i + 1; j < minHashMat.length; j++) { 45 | mh.approximateJaccard(minHashMat[i], minHashMat[j]); 46 | } 47 | } 48 | endTime = System.currentTimeMillis() - startTime; 49 | sec = (double) endTime / 1000; 50 | System.out.println("Approx jaccard total time: " + endTime + " (ms)"); 51 | System.out.println("Exact jaccard total time: " + sec + " seconds"); 52 | } 53 | } 54 | -------------------------------------------------------------------------------- /NearDuplicates.java: -------------------------------------------------------------------------------- 1 | import java.io.IOException; 2 | import java.util.List; 3 | 4 | /** 5 | * Calculates the number of false positives that were hashed together into the same bucket in LSH. 6 | * User specifies with collection of documents, for MinHash Matrix, 7 | * for LSH, and to find near duplicates for. 
8 | * Afterwards this will print out the number of false positives; documents that were hashed together 9 | * in the same bucket for LSH. 10 | * 11 | * @author Alex Shum 12 | */ 13 | public class NearDuplicates { 14 | 15 | /** 16 | * Calculates the MinHash Matrix and hashes each band. Documents that are hashed to same bucket 17 | * for any of the bands is considered a near duplicate. After this LSH procedure, it will remove 18 | * false posistives by calculating the exact jaccard similarities and consider documents duplicates 19 | * if the jaccard similarity between the documents is greater than the user specified simularity 20 | * threshold. 21 | * 22 | * @param args Folder, number of permutations, number of bands, simularity threshold, file. 23 | * @throws IOException 24 | * @throws NumberFormatException 25 | */ 26 | public static void main(String[] args) throws NumberFormatException, IOException { 27 | if(args.length < 5) throw new IllegalArgumentException( 28 | "Enter "); 29 | MinHash mh = new MinHash(args[0], Integer.parseInt(args[1])); 30 | int[][] hashMtx = mh.minHashMatrix(); 31 | String[] docNames = mh.allDocs(); 32 | LSH lsh = new LSH(hashMtx, docNames, Integer.parseInt(args[2])); 33 | List nearDuplicates = lsh.nearDuplicatesOf(args[4]); 34 | 35 | int FP = 0; 36 | for(String s : nearDuplicates) { 37 | 38 | double sim = mh.exactJaccard(args[4], s); 39 | if(sim > Double.parseDouble(args[3])) { 40 | System.out.println(s); 41 | } else { 42 | FP++; 43 | } 44 | } 45 | 46 | System.out.println("Number of false positives: " + FP); 47 | } 48 | 49 | } -------------------------------------------------------------------------------- /ProcessingFunctions.java: -------------------------------------------------------------------------------- 1 | import java.io.BufferedReader; 2 | import java.io.File; 3 | import java.io.FileReader; 4 | import java.io.IOException; 5 | import java.util.ArrayList; 6 | import java.util.Arrays; 7 | import java.util.HashSet; 8 | import 
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

/**
 * Miscellaneous functions used to process text documents
 * @author Alex Shum
 */
public class ProcessingFunctions {
    static final List<String> stopWords = new ArrayList<String>(Arrays.asList("the"));

    /**
     * Finds the next prime number larger than a starting integer.
     * @param n Starting integer.
     * @return The smallest prime strictly larger than the starting integer
     *         (2 for any starting integer below 2).
     * @throws ArithmeticException If no larger prime fits in an int.
     */
    public static int nextPrime(int n) {
        if (n < 2) {
            return 2;
        }
        if (n == Integer.MAX_VALUE) {
            throw new ArithmeticException("no prime larger than " + n + " fits in an int");
        }

        // Integer.MAX_VALUE is itself prime, so this loop stops before m overflows.
        int m = n;
        do {
            m++;
        } while (!isPrime(m));
        return m;
    }

    /**
     * Checks if integer is prime or not.
     * Uses trial division by 2, 3 and then by the numbers of the form 6k - 1
     * and 6k + 1 up to the square root of n, as described at:
     * http://en.wikipedia.org/wiki/Primality_test
     *
     * @param n Integer to check for primality.
     * @return true if n is prime otherwise false. Numbers below 2 are not prime.
     */
    public static boolean isPrime(int n) {
        if (n < 2) {
            return false;
        }
        if (n < 4) {
            return true; // 2 and 3
        }
        if (n % 2 == 0 || n % 3 == 0) {
            return false;
        }
        // long counter: i * i must not overflow for n close to Integer.MAX_VALUE
        for (long i = 5; i * i <= n; i += 6) {
            if (n % i == 0 || n % (i + 2) == 0) {
                return false;
            }
        }
        return true;
    }

    /**
     * Checks if a string is a stop word or not.
     * Stop words include 'the' and words less than length 3.
     *
     * @param s String to check
     * @return true if the word is shorter than 3 characters or in the list of stop words.
     */
    public static boolean isStopWord(String s) {
        return s.length() < 3 || stopWords.contains(s);
    }

    /**
     * Counts the number of unique words in a collection of text documents.
     * Does minimal processing to remove words less than 3 characters and
     * 'the'. Only regular files directly inside the folder are read.
     * @param folder with the collection of text documents
     * @return number of unique words in all documents
     * @throws IOException if folder cannot be opened
     */
    public static int numUnique(File folder) throws IOException {
        File[] contents = folder.listFiles();
        if (contents == null) {
            // listFiles() signals "not a directory" or an I/O error with null
            throw new IOException("Cannot list folder: " + folder);
        }

        Set<String> unique = new HashSet<String>();
        for (File entry : contents) { // iterate through the documents
            if (entry.isFile()) {
                collectWords(entry, unique);
            }
        }
        return unique.size();
    }

    /**
     * Returns a set of unique words in a text file. Does minimal processing
     * to remove words less than 3 characters and 'the'.
     * @param fileName The file name of the text document.
     * @return Set of unique words in a text file.
     * @throws IOException if file cannot be opened
     */
    public static Set<String> UniqueWordList(String fileName) throws IOException {
        Set<String> unique = new HashSet<String>();
        collectWords(new File(fileName), unique);
        return unique;
    }

    /**
     * Adds every non-stop word of a text file to a set. Punctuation (. , : ; ')
     * is removed and words are lower-cased before they are added.
     * @param file Text file to read.
     * @param sink Set that receives the words.
     * @throws IOException if file cannot be opened or read
     */
    private static void collectWords(File file, Set<String> sink) throws IOException {
        // try-with-resources: the reader is closed even if reading fails
        try (BufferedReader reader = new BufferedReader(new FileReader(file))) {
            String line;
            while ((line = reader.readLine()) != null) { // iterate through lines
                String[] words = line.replaceAll("[.,:;']", "").toLowerCase().split("\\s+");
                for (String word : words) {
                    if (!isStopWord(word)) {
                        sink.add(word);
                    }
                }
            }
        }
    }
}