├── .gitignore ├── .settings ├── org.eclipse.m2e.core.prefs ├── org.eclipse.core.resources.prefs └── org.eclipse.jdt.core.prefs ├── src ├── main │ ├── java │ │ └── org │ │ │ └── rabinfingerprint │ │ │ ├── polynomial │ │ │ ├── Arithmetic.java │ │ │ ├── Polynomials.java │ │ │ └── Polynomial.java │ │ │ ├── fingerprint │ │ │ ├── Fingerprint.java │ │ │ ├── AbstractFingerprint.java │ │ │ ├── RabinFingerprintLongWindowed.java │ │ │ ├── RabinFingerprintLong.java │ │ │ └── RabinFingerprintPolynomial.java │ │ │ ├── scanner │ │ │ ├── FileListing.java │ │ │ ├── TokenReader.java │ │ │ ├── IOUtils.java │ │ │ ├── LineNumberIndex.java │ │ │ ├── StringFinder.java │ │ │ ├── FileFinder.java │ │ │ ├── FilesStringFinder.java │ │ │ └── MatchModel.java │ │ │ ├── handprint │ │ │ ├── BoundaryDetectors.java │ │ │ ├── Handprints.java │ │ │ ├── Handprint.java │ │ │ └── FingerFactory.java │ │ │ ├── datastructures │ │ │ ├── CircularByteQueue.java │ │ │ └── Interval.java │ │ │ ├── Samples.java │ │ │ ├── Args.java │ │ │ └── Main.java │ └── resources │ │ └── usage.txt └── test │ └── java │ └── org │ └── rabinfingerprint │ ├── polynomial │ ├── Stats.java │ └── PolynomialTest.java │ ├── handprint │ ├── TestDataGenerator.java │ └── HandprintTest.java │ └── fingerprint │ └── RabinFingerprintTest.java ├── .classpath ├── LICENSE.txt ├── .project ├── README.md └── pom.xml /.gitignore: -------------------------------------------------------------------------------- 1 | target 2 | .DS_Store 3 | -------------------------------------------------------------------------------- /.settings/org.eclipse.m2e.core.prefs: -------------------------------------------------------------------------------- 1 | activeProfiles= 2 | eclipse.preferences.version=1 3 | resolveWorkspaceProjects=true 4 | version=1 5 | -------------------------------------------------------------------------------- /.settings/org.eclipse.core.resources.prefs: -------------------------------------------------------------------------------- 1 | 
eclipse.preferences.version=1 2 | encoding//src/main/java=UTF-8 3 | encoding//src/test/java=UTF-8 4 | encoding/=UTF-8 5 | -------------------------------------------------------------------------------- /src/main/java/org/rabinfingerprint/polynomial/Arithmetic.java: -------------------------------------------------------------------------------- 1 | package org.rabinfingerprint.polynomial; 2 | 3 | public interface Arithmetic { 4 | public T add(T that); 5 | public T subtract(T that); 6 | public T multiply(T that); 7 | public T and(T that); 8 | public T or(T that); 9 | public T xor(T that); 10 | public T mod(T that); 11 | public T gcd(T that); 12 | } -------------------------------------------------------------------------------- /src/test/java/org/rabinfingerprint/polynomial/Stats.java: -------------------------------------------------------------------------------- 1 | package org.rabinfingerprint.polynomial; 2 | 3 | public class Stats { 4 | private long count = 0L; 5 | private double sum = 0.0; 6 | 7 | public synchronized void add(double value) { 8 | sum += value; 9 | count++; 10 | } 11 | 12 | public synchronized double average() { 13 | if (count == 0) { 14 | return 0; 15 | } 16 | return sum / count; 17 | } 18 | 19 | public synchronized double sum() { 20 | return sum; 21 | } 22 | 23 | public synchronized long count() { 24 | return count; 25 | } 26 | } -------------------------------------------------------------------------------- /.classpath: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Copyright 2012 Bill Dwyer 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 
5 | You may obtain a copy of the License at 6 | 7 | http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 14 | 15 | -------------------------------------------------------------------------------- /.project: -------------------------------------------------------------------------------- 1 | 2 | 3 | rabinfingerprint 4 | 5 | 6 | 7 | 8 | 9 | org.eclipse.jdt.core.javabuilder 10 | 11 | 12 | 13 | 14 | org.eclipse.m2e.core.maven2Builder 15 | 16 | 17 | 18 | 19 | 20 | org.eclipse.m2e.core.maven2Nature 21 | org.eclipse.jdt.core.javanature 22 | 23 | 24 | -------------------------------------------------------------------------------- /.settings/org.eclipse.jdt.core.prefs: -------------------------------------------------------------------------------- 1 | eclipse.preferences.version=1 2 | org.eclipse.jdt.core.compiler.codegen.inlineJsrBytecode=enabled 3 | org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.6 4 | org.eclipse.jdt.core.compiler.codegen.unusedLocal=preserve 5 | org.eclipse.jdt.core.compiler.compliance=1.6 6 | org.eclipse.jdt.core.compiler.debug.lineNumber=generate 7 | org.eclipse.jdt.core.compiler.debug.localVariable=generate 8 | org.eclipse.jdt.core.compiler.debug.sourceFile=generate 9 | org.eclipse.jdt.core.compiler.problem.assertIdentifier=error 10 | org.eclipse.jdt.core.compiler.problem.enumIdentifier=error 11 | org.eclipse.jdt.core.compiler.problem.forbiddenReference=warning 12 | org.eclipse.jdt.core.compiler.source=1.6 13 | -------------------------------------------------------------------------------- /src/main/java/org/rabinfingerprint/fingerprint/Fingerprint.java: 
-------------------------------------------------------------------------------- 1 | package org.rabinfingerprint.fingerprint; 2 | 3 | 4 | /** 5 | * Overview of Rabin's scheme given by Broder 6 | * 7 | * Some Applications of Rabin's Fingerprinting Method 8 | * http://citeseer.ist.psu.edu/cache/papers/cs/752/ftp:zSzzSzftp.digital.comzSzpubzSzDECzSzSRCzSzpublicationszSzbroderzSzfing-appl.pdf/broder93some.pdf 9 | */ 10 | public interface Fingerprint { 11 | public void pushBytes(byte[] bytes); 12 | public void pushBytes(byte[] bytes, int offset, int length); 13 | public void pushByte(byte b); 14 | public void reset(); 15 | 16 | public T getFingerprint(); 17 | 18 | public static interface WindowedFingerprint extends Fingerprint { 19 | public void popByte(); 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /src/main/java/org/rabinfingerprint/scanner/FileListing.java: -------------------------------------------------------------------------------- 1 | package org.rabinfingerprint.scanner; 2 | 3 | import java.io.File; 4 | import java.io.FileNotFoundException; 5 | import java.util.ArrayList; 6 | import java.util.Arrays; 7 | import java.util.List; 8 | 9 | public class FileListing { 10 | 11 | public static List getFileListing( File directory ) throws FileNotFoundException { 12 | // get files 13 | ArrayList files = new ArrayList( Arrays.asList( directory.listFiles() ) ); 14 | ArrayList result = new ArrayList( files ); 15 | 16 | // recurse 17 | for ( File file : files ) { 18 | if ( !file.isDirectory() ) continue; 19 | result.addAll( getFileListing( file ) ); 20 | } 21 | 22 | // return 23 | return result; 24 | } 25 | 26 | } 27 | -------------------------------------------------------------------------------- /src/main/java/org/rabinfingerprint/handprint/BoundaryDetectors.java: -------------------------------------------------------------------------------- 1 | package org.rabinfingerprint.handprint; 2 | 3 | import 
org.rabinfingerprint.fingerprint.RabinFingerprintLong; 4 | import org.rabinfingerprint.handprint.FingerFactory.ChunkBoundaryDetector; 5 | 6 | public class BoundaryDetectors { 7 | public static class BitmaskBoundaryDetector implements ChunkBoundaryDetector { 8 | private final long chunkBoundaryMask; 9 | private final long chunkPattern; 10 | 11 | public BitmaskBoundaryDetector(long chunkBoundaryMask, long chunkPattern) { 12 | this.chunkBoundaryMask = chunkBoundaryMask; 13 | this.chunkPattern = chunkPattern; 14 | } 15 | 16 | public boolean isBoundary(RabinFingerprintLong fingerprint) { 17 | return (fingerprint.getFingerprintLong() & chunkBoundaryMask) == chunkPattern; 18 | } 19 | } 20 | 21 | public static final ChunkBoundaryDetector DEFAULT_BOUNDARY_DETECTOR = new BitmaskBoundaryDetector( 22 | 0x1FFF, 0); 23 | } 24 | -------------------------------------------------------------------------------- /src/main/java/org/rabinfingerprint/fingerprint/AbstractFingerprint.java: -------------------------------------------------------------------------------- 1 | package org.rabinfingerprint.fingerprint; 2 | 3 | import org.rabinfingerprint.polynomial.Polynomial; 4 | 5 | public abstract class AbstractFingerprint implements Fingerprint { 6 | protected final Polynomial poly; 7 | 8 | public AbstractFingerprint(Polynomial poly) { 9 | this.poly = poly; 10 | } 11 | 12 | public void pushBytes(final byte[] bytes) { 13 | for (byte b : bytes) { 14 | pushByte(b); 15 | } 16 | } 17 | 18 | public void pushBytes(final byte[] bytes, final int offset, final int length) { 19 | final int max = offset + length; 20 | int i = offset; 21 | while (i < max) { 22 | pushByte(bytes[i++]); 23 | } 24 | } 25 | 26 | public abstract void pushByte(byte b); 27 | public abstract void reset(); 28 | public abstract Polynomial getFingerprint(); 29 | 30 | @Override 31 | public String toString() { 32 | return getFingerprint().toHexString(); 33 | } 34 | } 
/**
 * Walks a string token by token, where a token is one match of a regular
 * expression. With the default pattern a token is one line of text including
 * its trailing line separator (the final line may have none).
 */
public class TokenReader {
    public static final String LINE_SEPARATOR_PATTERN = "\r\n|[\n\r\u2028\u2029\u0085]";
    public static final String LINE_PATTERN = ".*(?:" + LINE_SEPARATOR_PATTERN + ")|.+$";

    /** Compiled once; Pattern instances are immutable and safe to share. */
    private static final Pattern DEFAULT_PATTERN = Pattern.compile(LINE_PATTERN);

    private final String string;
    private final Matcher matcher;
    private int offset;
    private int count;
    /** Set once an empty match at the very end of the input has been returned. */
    private boolean exhausted;

    /**
     * Creates a reader that splits {@code string} into lines.
     */
    public TokenReader(String string) {
        this(DEFAULT_PATTERN, string);
    }

    /**
     * Creates a reader that returns successive matches of {@code pattern} in
     * {@code string}.
     */
    public TokenReader(Pattern pattern, String string) {
        this.offset = 0;
        this.count = 0;
        this.exhausted = false;
        this.string = string;
        this.matcher = pattern.matcher(string);
    }

    /**
     * Returns the next token, or {@code null} when no further match exists.
     *
     * After every match the matcher's region is restarted right behind the
     * token, so region-anchored constructs in the pattern are evaluated
     * relative to the start of each token.
     */
    public String get() {
        if (exhausted || !matcher.find()) {
            return null;
        }
        offset = matcher.start();
        count += 1;
        final String token = matcher.group(0);

        int next = matcher.end();
        if (next == matcher.start()) {
            // Empty match: restarting the region at the same index would find
            // the same empty match forever, so step one character forward (or
            // stop if the end of the input has been reached).
            if (next >= string.length()) {
                exhausted = true;
            } else {
                next += 1;
            }
        }
        if (!exhausted) {
            matcher.region(next, string.length());
        }
        return token;
    }

    /**
     * Returns the start index of the token most recently returned by
     * {@link #get()} (0 before the first call).
     */
    public int getOffset() {
        return offset;
    }

    /**
     * Returns how many tokens have been returned so far.
     */
    public int getCount() {
        return count;
    }

}
8 | 9 | Output is: 10 | [polynomial] 11 | 12 | % java -jar rabinfingerprint.jar -p xxxxxx file.test 13 | Generates the fingerprint using the rabin fingerprint method using the specified irreducible 14 | polynomial (in hexadecimal) on the specified file. 15 | 16 | Output for fingerprints are: 17 | [fingerprint 0] [file 0] 18 | [fingerprint 1] [file 1] 19 | [fingerprint 2] [file 2] 20 | ... 21 | 22 | % cat file.test | java -jar rabinfingerprint.jar 23 | Generates the fingerprint of stdin using the rabin fingerprint method using a randomly generated 24 | irreducible polynomial. 25 | 26 | Output is: 27 | [fingerprint of stdin] 28 | 29 | % java -jar rabinfingerprint.jar -hand n file.test 30 | Generates the handprint of the file using the rabin fingerprint method. The handprint contains 31 | the sorted fingerprints of chunks of the file. The file is chunked by using a 13-bit bitmask 32 | test (== 0) on a 48-byte sliding window fingerprint of the file. 33 | 34 | Output for fingerprint is the first "n" fingers of the handprint: 35 | [finger 0] 36 | [finger 1] 37 | [finger 2] 38 | ... 
/**
 * Produces deterministic pseudo-random input streams for the handprint tests.
 */
public class TestDataGenerator {
    /** Size in bytes of every generated stream (4 MB). */
    public static final int TEST_RESOURCE_BYTES = 4 * (1 << 20);

    /**
     * Returns {@code n} streams that all derive from the same random base
     * data. Each stream is a copy of the base in which 100 randomly chosen
     * positions have been overwritten with random bytes.
     *
     * The generator is seeded with a fixed value, so repeated calls yield the
     * same data.
     *
     * @param n number of streams to create; values below 1 yield an empty list
     * @return the generated streams, each {@link #TEST_RESOURCE_BYTES} long
     */
    public static List<InputStream> getSimilarRandomBytes(int n) throws IOException {
        final Random rand = new Random(42);
        final List<InputStream> list = new java.util.ArrayList<InputStream>();
        final byte[] base = new byte[TEST_RESOURCE_BYTES];
        rand.nextBytes(base);

        for (int copy = 0; copy < n; copy++) {
            final byte[] variant = base.clone();

            // change 100 random bytes; position is drawn before the value so
            // the random sequence stays identical to earlier versions
            for (int i = 0; i < 100; i++) {
                final int position = (int) (rand.nextDouble() * TEST_RESOURCE_BYTES);
                variant[position] = (byte) rand.nextInt();
            }

            list.add(new ByteArrayInputStream(variant));
        }
        return list;
    }

    /**
     * Returns {@code n} streams of independent random data, generated from a
     * fixed seed.
     *
     * @param n number of streams to create; values below 1 yield an empty list
     * @return the generated streams, each {@link #TEST_RESOURCE_BYTES} long
     */
    public static List<InputStream> getDifferentRandomBytes(int n) throws IOException {
        final List<InputStream> list = new java.util.ArrayList<InputStream>();
        final Random rand = new Random(54);
        for (int i = 0; i < n; i++) {
            final byte[] bytes = new byte[TEST_RESOURCE_BYTES];
            rand.nextBytes(bytes);
            list.add(new ByteArrayInputStream(bytes));
        }
        return list;
    }
}
/**
 * Small file and string helpers.
 */
public class IOUtils {

    /**
     * Reads the whole file and decodes it with the JVM's default charset.
     *
     * @throws IOException if the file cannot be opened or read, or is larger
     *         than a single byte array can hold
     */
    public static String readEntireFile(File file) throws IOException {
        return new String(readBytes(file));
    }

    /**
     * Reads the whole file and decodes it as UTF-8.
     *
     * @throws IOException if the file cannot be opened or read, or is larger
     *         than a single byte array can hold
     */
    public static String readEntireUnicodeFile(File file) throws IOException {
        return new String(readBytes(file), "UTF8");
    }

    /**
     * Reads the complete contents of {@code file} into a byte array.
     */
    private static byte[] readBytes(File file) throws FileNotFoundException, IOException {
        final long length = file.length();
        if (length > Integer.MAX_VALUE) {
            // the (int) cast below would silently wrap around otherwise
            throw new IOException("File too large to read into memory: " + file);
        }

        final byte[] buffer = new byte[(int) length];
        final BufferedInputStream in = new BufferedInputStream(new FileInputStream(file));
        try {
            // A single read() may deliver fewer bytes than requested, so keep
            // reading until the buffer is full or the stream ends.
            int filled = 0;
            while (filled < buffer.length) {
                final int n = in.read(buffer, filled, buffer.length - filled);
                if (n < 0) {
                    break;
                }
                filled += n;
            }
            if (filled < buffer.length) {
                // the file shrank after its length was queried; drop the unused tail
                return java.util.Arrays.copyOf(buffer, filled);
            }
            return buffer;
        } finally {
            in.close();
        }
    }

    /**
     * Replaces the contents of {@code file} with {@code contents}, encoded
     * with the JVM's default charset.
     *
     * @throws IOException if the file cannot be opened or written
     */
    public static void writeEntireFile(File file, String contents) throws IOException {
        // write entire string into file
        final BufferedOutputStream out = new BufferedOutputStream(new FileOutputStream(file));
        try {
            out.write(contents.getBytes());
        } finally {
            out.close();
        }
    }

    /**
     * Lower-cases the first character of {@code s} if it is upper case; the
     * rest of the string is left untouched.
     *
     * @return the converted string; {@code s} itself when it is null, empty,
     *         or does not start with an upper-case character
     */
    public static String toCamelCase(String s) {
        if (s == null || s.length() == 0) {
            return s;
        }
        final char first = s.charAt(0);
        if (!Character.isUpperCase(first)) {
            return s;
        }
        return Character.toLowerCase(first) + s.substring(1);
    }

}
factory.newHandprint(sims.get(1)); 22 | Handprint hand3 = factory.newHandprint(sims.get(2)); 23 | Handprint hand4 = factory.newHandprint(sims.get(3)); 24 | 25 | assertTrue(Math.abs(0.70 - hand1.getSimilarity(hand2)) < 0.05); 26 | assertTrue(Math.abs(0.70 - hand1.getSimilarity(hand3)) < 0.05); 27 | assertTrue(Math.abs(0.70 - hand1.getSimilarity(hand4)) < 0.05); 28 | 29 | List diffs = TestDataGenerator.getDifferentRandomBytes(3); 30 | Handprint hand5 = factory.newHandprint(diffs.get(0)); 31 | Handprint hand6 = factory.newHandprint(diffs.get(1)); 32 | Handprint hand7 = factory.newHandprint(diffs.get(2)); 33 | 34 | assertTrue(Math.abs(0.00 - hand1.getSimilarity(hand5)) < 0.05); 35 | assertTrue(Math.abs(0.00 - hand1.getSimilarity(hand6)) < 0.05); 36 | assertTrue(Math.abs(0.00 - hand1.getSimilarity(hand7)) < 0.05); 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /src/main/java/org/rabinfingerprint/datastructures/CircularByteQueue.java: -------------------------------------------------------------------------------- 1 | package org.rabinfingerprint.datastructures; 2 | 3 | /** 4 | * A fast but unsafe circular byte queue. 5 | * 6 | * There is no enforcement that the indices are valid, and it is easily possible 7 | * to overflow when adding or polling. But, this is faster than Queue by a 8 | * factor of 5 or so. 
9 | */ 10 | public class CircularByteQueue { 11 | private int size = 0; 12 | private int head = 0; 13 | private int tail = 0; 14 | private final int capacity; 15 | private final byte[] bytes; 16 | 17 | public CircularByteQueue(int capacity) { 18 | this.capacity = capacity; 19 | this.bytes = new byte[capacity]; 20 | } 21 | 22 | /** 23 | * Adds the byte to the queue 24 | */ 25 | public void add(byte b) { 26 | bytes[head] = b; 27 | head++; 28 | head %= capacity; 29 | size++; 30 | } 31 | 32 | /** 33 | * Removes and returns the next byte in the queue 34 | */ 35 | public byte poll() { 36 | byte b = bytes[tail]; 37 | tail++; 38 | tail %= capacity; 39 | size--; 40 | return b; 41 | } 42 | 43 | /** 44 | * Resets the queue to its original state but DOES NOT clear the array of 45 | * bytes. 46 | */ 47 | public void clear() { 48 | head = 0; 49 | tail = 0; 50 | size = 0; 51 | } 52 | 53 | /** 54 | * Returns the number of elements that have been added to this queue minus 55 | * those that have been removed. 56 | */ 57 | public int size() { 58 | return size; 59 | } 60 | 61 | /** 62 | * Returns the capacity of this queue -- i.e. the total number of bytes that 63 | * can be stored without overflowing. 
64 | */ 65 | public int capacity() { 66 | return capacity; 67 | } 68 | 69 | public boolean isFull() { 70 | return size == capacity; 71 | } 72 | } 73 | -------------------------------------------------------------------------------- /src/main/java/org/rabinfingerprint/Samples.java: -------------------------------------------------------------------------------- 1 | package org.rabinfingerprint; 2 | 3 | import java.io.FileInputStream; 4 | import java.io.FileNotFoundException; 5 | import java.io.IOException; 6 | 7 | import org.rabinfingerprint.fingerprint.RabinFingerprintLong; 8 | import org.rabinfingerprint.fingerprint.RabinFingerprintLongWindowed; 9 | import org.rabinfingerprint.polynomial.Polynomial; 10 | 11 | import com.google.common.io.ByteStreams; 12 | 13 | public class Samples { 14 | public static void fingerprint() throws FileNotFoundException, IOException { 15 | // Create new random irreducible polynomial 16 | // These can also be created from Longs or hex Strings 17 | Polynomial polynomial = Polynomial.createIrreducible(53); 18 | 19 | // Create a fingerprint object 20 | RabinFingerprintLong rabin = new RabinFingerprintLong(polynomial); 21 | 22 | // Push bytes from a file stream 23 | rabin.pushBytes(ByteStreams.toByteArray(new FileInputStream("file.test"))); 24 | 25 | // Get fingerprint value and output 26 | System.out.println(Long.toString(rabin.getFingerprintLong(), 16)); 27 | } 28 | 29 | public static void slidingWindowFingerprint() throws FileNotFoundException, IOException { 30 | // Create new random irreducible polynomial 31 | // These can also be created from Longs or hex Strings 32 | Polynomial polynomial = Polynomial.createIrreducible(53); 33 | 34 | // Create a windowed fingerprint object with a window size of 48 bytes. 35 | RabinFingerprintLongWindowed window = new RabinFingerprintLongWindowed(polynomial, 48); 36 | for (byte b : ByteStreams.toByteArray(new FileInputStream("file.test"))) { 37 | // Push in one byte. Old bytes are automatically popped. 
/**
 * Maps character offsets within a string to 1-based line numbers and to the
 * text of the line containing the offset.
 */
public class LineNumberIndex {
    public static final String LINE_SEPARATOR = "\r\n|[\n\r\u2028\u2029\u0085]";
    public static final String LINE = ".*(?:" + LINE_SEPARATOR + ")|.+$";
    public static final String TERMINAL_LINE = ";\\s*((//|/\\*).*)?$";
    public static final Pattern LINE_SEPARATOR_PATTERN = Pattern.compile(LINE_SEPARATOR);
    public static final Pattern LINE_PATTERN = Pattern.compile(LINE);
    public static final Pattern TERMINAL_LINE_PATTERN = Pattern.compile(TERMINAL_LINE);

    /** Start offset of each line mapped to its 1-based line number. */
    private final TreeMap<Long, Long> index;
    private final String fileString;

    /**
     * Builds the index by scanning {@code fileString} once with
     * {@link #LINE_PATTERN}.
     *
     * Besides the start of every line, the offset just past the last matched
     * line is recorded as well, mapped to the next line number.
     */
    public LineNumberIndex(String fileString) {
        this.index = new TreeMap<Long, Long>();
        this.fileString = fileString;

        long lines = 1;
        index.put(0L, lines);

        // Lines matched by LINE_PATTERN are contiguous, so the end of one
        // match is the start offset of the following line.
        final java.util.regex.Matcher lineMatcher = LINE_PATTERN.matcher(fileString);
        while (lineMatcher.find()) {
            lines += 1;
            index.put((long) lineMatcher.end(), lines);
        }
    }

    /**
     * Returns the 1-based number of the line containing
     * {@code characterOffset}, or -1 if the offset lies before the start of
     * the text.
     */
    public int getLineNumber(long characterOffset) {
        final Long lineStart = index.floorKey(characterOffset);
        if (lineStart == null) {
            return -1;
        }
        return index.get(lineStart).intValue();
    }

    /**
     * Returns the text of the line containing {@code characterOffset},
     * including its line separator if it has one. Offsets at or beyond the
     * end of the text yield an empty string.
     */
    public String getLine(long characterOffset) {
        // The line runs from the greatest indexed offset <= characterOffset up
        // to the next indexed offset strictly greater than it. Using a strict
        // bound matters: an offset exactly at a line start must not be taken
        // as that line's end as well.
        final Long lineStart = index.floorKey(characterOffset);
        final Long nextLineStart = index.higherKey(characterOffset);
        final int i0 = (lineStart == null) ? 0 : lineStart.intValue();
        final int i1 = (nextLineStart == null) ? fileString.length() : nextLineStart.intValue();
        return fileString.substring(i0, i1);
    }
}
36 | */ 37 | public void testIrreducibleSpread() { 38 | int degree = 15; 39 | Stats stats = getSpread(degree, 200); 40 | double spread = Math.abs(stats.average() - degree); 41 | assertTrue("Spread of irreducible polynomials is out of expected range: " + spread, spread < 3); 42 | } 43 | 44 | public Stats getSpread(int degree, int tests) { 45 | int i = 0; 46 | int last_i = 0; 47 | Stats stats = new Stats(); 48 | while (tests > 0) { 49 | Polynomial f = Polynomial.createRandom(degree); 50 | Reducibility r = f.getReducibility(); 51 | if (r == Reducibility.IRREDUCIBLE) { 52 | int spread = i - last_i; 53 | stats.add(spread); 54 | last_i = i; 55 | tests--; 56 | } 57 | i++; 58 | } 59 | return stats; 60 | } 61 | 62 | } 63 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | rabinfingerprint 2 | ================ 3 | 4 | A Java implementation of the rabin fingerprinting method. (http://en.wikipedia.org/wiki/Rabin_fingerprint) 5 | 6 | Optimized for use on a stream including a sliding window fingerprint. 7 | 8 | Includes arbitrary-precision-polynomial hashing as well as very fast long-based hashing implementations, which are best for most hashing uses. 
9 | 10 | ### Generating a fingerprint ### 11 | ```Java 12 | // Create new random irreducible polynomial 13 | // These can also be created from Longs or hex Strings 14 | Polynomial polynomial = Polynomial.createIrreducible(53); 15 | 16 | // Create a fingerprint object 17 | RabinFingerprintLong rabin = new RabinFingerprintLong(polynomial); 18 | 19 | // Push bytes from a file stream 20 | rabin.pushBytes(ByteStreams.toByteArray(new FileInputStream("file.test"))); 21 | 22 | // Get fingerprint value and output 23 | System.out.println(Long.toString(rabin.getFingerprintLong(), 16)); 24 | ``` 25 | 26 | ### Generating a sliding-window fingerprint ### 27 | ```Java 28 | // Create new random irreducible polynomial 29 | // These can also be created from Longs or hex Strings 30 | Polynomial polynomial = Polynomial.createIrreducible(53); 31 | 32 | // Create a windowed fingerprint object with a window size of 48 bytes. 33 | RabinFingerprintLongWindowed window = new RabinFingerprintLongWindowed(polynomial, 48); 34 | for (byte b : ByteStreams.toByteArray(new FileInputStream("file.test"))) { 35 | // Push in one byte. Old bytes are automatically popped. 36 | window.pushByte(b); 37 | // Output current window's fingerprint 38 | System.out.println(Long.toString(window.getFingerprintLong(), 16)); 39 | } 40 | ``` 41 | 42 | ### Building ### 43 | 44 | This project uses Maven for dependency management. 
To build this project's runnable jar, sources and javadoc, run this command: 45 | 46 | ``` 47 | % mvn clean install 48 | ``` 49 | 50 | ### Command line ### 51 | 52 | [Full Usage](https://github.com/themadcreator/rabinfingerprint/blob/master/src/main/resources/usage.txt) 53 | 54 | Generate a new irreducible polynomial 55 | ``` 56 | % java -jar rabinfingerprint.jar -polygen 53 57 | 3DE9DD57CA448B 58 | ``` 59 | 60 | Fingerprint a file 61 | ``` 62 | % java -jar rabinfingerprint.jar -p 3DE9DD57CA448B file.test 63 | 43A39C59491F /[path to file]/file.test 64 | ``` 65 | 66 | Fingerprint STDIN 67 | ``` 68 | % cat file.test | java -jar rabinfingerprint.jar -p 3DE9DD57CA448B 69 | 43A39C59491F 70 | ``` 71 | -------------------------------------------------------------------------------- /src/main/java/org/rabinfingerprint/polynomial/Polynomials.java: -------------------------------------------------------------------------------- 1 | package org.rabinfingerprint.polynomial; 2 | 3 | public class Polynomials { 4 | public static final long DEFAULT_POLYNOMIAL_LONG = 0x375AD14A67FC7BL; 5 | 6 | /** 7 | * Generates a handful of irreducible polynomials of the specified degree. 8 | */ 9 | public static void printIrreducibles(final int degree) { 10 | for (int i = 0; i < 10; i++) { 11 | Polynomial p = Polynomial.createIrreducible(degree); 12 | System.out.println(p.toPolynomialString()); 13 | } 14 | } 15 | 16 | /** 17 | * Generates a large irreducible polynomial and prints out its 18 | * representation in ascii and hex. 19 | */ 20 | public static void printLargeIrreducible() { 21 | Polynomial p = Polynomial.createIrreducible(127); 22 | System.out.println(p.toPolynomialString()); 23 | System.out.println(p.toHexString()); 24 | } 25 | 26 | /** 27 | * Computes (a mod b) using synthetic division where a and b represent 28 | * polynomials in GF(2^k). 
29 | */ 30 | public static long mod(long a, long b) { 31 | int ma = getMaxBit(a); 32 | int mb = getMaxBit(b); 33 | for (int i = ma - mb; i >= 0; i--) { 34 | if (getBit(a, (i + mb))) { 35 | long shifted = b << i; 36 | a = a ^ shifted; 37 | } 38 | } 39 | return a; 40 | } 41 | 42 | /** 43 | * Returns the index of the maximum set bit. If no bits are set, returns -1. 44 | */ 45 | public static int getMaxBit(long l) { 46 | for (int i = 64 - 1; i >= 0; i--) { 47 | if (getBit(l, i)) 48 | return i; 49 | } 50 | return -1; 51 | } 52 | 53 | /** 54 | * Returns the value of the bit at index of the long. The right most bit is 55 | * at index 0. 56 | */ 57 | public static boolean getBit(long l, int index) { 58 | return (((l >> index) & 1) == 1); 59 | } 60 | 61 | /** 62 | * Returns the value of the bit at index of the byte. The right most bit is 63 | * at index 0. 64 | */ 65 | public static boolean getBit(byte b, int index) { 66 | return (((b >> index) & 1) == 1); 67 | } 68 | 69 | /** 70 | * Returns the value of the bit at index of the byte. The right most bit is 71 | * at index 0 of the last byte in the array. 
72 | */ 73 | public static boolean getBit(byte[] bytes, int index) { 74 | // byte array index 75 | final int aidx = bytes.length - 1 - (index / 8); 76 | // bit index 77 | final int bidx = index % 8; 78 | // byte 79 | final byte b = bytes[aidx]; 80 | // bit 81 | return getBit(b, bidx); 82 | } 83 | 84 | } 85 | -------------------------------------------------------------------------------- /src/test/java/org/rabinfingerprint/fingerprint/RabinFingerprintTest.java: -------------------------------------------------------------------------------- 1 | package org.rabinfingerprint.fingerprint; 2 | 3 | 4 | import java.util.Random; 5 | 6 | import junit.framework.TestCase; 7 | 8 | import org.rabinfingerprint.fingerprint.Fingerprint; 9 | import org.rabinfingerprint.fingerprint.RabinFingerprintLong; 10 | import org.rabinfingerprint.fingerprint.RabinFingerprintLongWindowed; 11 | import org.rabinfingerprint.fingerprint.RabinFingerprintPolynomial; 12 | import org.rabinfingerprint.polynomial.Polynomial; 13 | 14 | public class RabinFingerprintTest extends TestCase { 15 | 16 | public static void testPolynomialsAndLongs() { 17 | // generate random data 18 | byte[] data = new byte[1024]; 19 | Random random = new Random(System.currentTimeMillis()); 20 | random.nextBytes(data); 21 | 22 | // generate random irreducible polynomial 23 | Polynomial p = Polynomial.createIrreducible(53); 24 | final Fingerprint rabin0 = new RabinFingerprintPolynomial(p); 25 | final Fingerprint rabin1 = new RabinFingerprintLong(p); 26 | rabin0.pushBytes(data); 27 | rabin1.pushBytes(data); 28 | assertEquals(0, rabin0.getFingerprint().compareTo(rabin1.getFingerprint())); 29 | } 30 | 31 | public static void testWindowing() { 32 | doTestWindowing(true, 5); 33 | doTestWindowing(false, 5); 34 | } 35 | 36 | public static void doTestWindowing(boolean usePolynomials, int times) { 37 | Random random = new Random(System.currentTimeMillis()); 38 | int windowSize = 8; 39 | 40 | for (int i = 0; i < times; i++) { 41 | // 
Generate Random Irreducible Polynomial 42 | Polynomial p = Polynomial.createIrreducible(53); 43 | 44 | final Fingerprint rabin0, rabin1; 45 | if (usePolynomials) { 46 | rabin0 = new RabinFingerprintPolynomial(p, windowSize); 47 | rabin1 = new RabinFingerprintPolynomial(p); 48 | } else { 49 | rabin0 = new RabinFingerprintLongWindowed(p, windowSize); 50 | rabin1 = new RabinFingerprintLong(p); 51 | } 52 | 53 | // Generate Random Data 54 | byte[] data = new byte[windowSize * 5]; 55 | random.nextBytes(data); 56 | 57 | // Read 3 windows of data to populate one fingerprint 58 | for (int j = 0; j < windowSize * 3; j++) { 59 | rabin0.pushByte(data[j]); 60 | } 61 | 62 | // Starting from same offset, continue fingerprinting for 1 more window 63 | for (int j = windowSize * 3; j < windowSize * 4; j++) { 64 | rabin0.pushByte(data[j]); 65 | rabin1.pushByte(data[j]); 66 | } 67 | 68 | assertEquals(0, rabin0.getFingerprint().compareTo(rabin1.getFingerprint())); 69 | } 70 | } 71 | } 72 | -------------------------------------------------------------------------------- /src/main/java/org/rabinfingerprint/fingerprint/RabinFingerprintLongWindowed.java: -------------------------------------------------------------------------------- 1 | package org.rabinfingerprint.fingerprint; 2 | 3 | import java.math.BigInteger; 4 | 5 | import org.rabinfingerprint.datastructures.CircularByteQueue; 6 | import org.rabinfingerprint.fingerprint.Fingerprint.WindowedFingerprint; 7 | import org.rabinfingerprint.polynomial.Polynomial; 8 | 9 | public class RabinFingerprintLongWindowed extends RabinFingerprintLong implements WindowedFingerprint { 10 | 11 | protected final CircularByteQueue byteWindow; 12 | protected final long bytesPerWindow; 13 | protected final long[] popTable; 14 | 15 | public RabinFingerprintLongWindowed(Polynomial poly, long bytesPerWindow) { 16 | super(poly); 17 | this.bytesPerWindow = bytesPerWindow; 18 | this.byteWindow = new CircularByteQueue((int) bytesPerWindow + 1); 19 | this.popTable 
= new long[256]; 20 | precomputePopTable(); 21 | } 22 | 23 | public RabinFingerprintLongWindowed(RabinFingerprintLongWindowed that) { 24 | super(that); 25 | this.bytesPerWindow = that.bytesPerWindow; 26 | this.byteWindow = new CircularByteQueue((int) bytesPerWindow + 1); 27 | this.popTable = that.popTable; 28 | } 29 | 30 | private void precomputePopTable() { 31 | for (int i = 0; i < 256; i++) { 32 | Polynomial f = Polynomial.createFromLong(i); 33 | f = f.shiftLeft(BigInteger.valueOf(bytesPerWindow * 8)); 34 | f = f.mod(poly); 35 | popTable[i] = f.toBigInteger().longValue(); 36 | } 37 | } 38 | 39 | @Override 40 | public void pushBytes(final byte[] bytes) { 41 | for (byte b : bytes) { 42 | int j = (int) ((fingerprint >> shift) & 0x1FF); 43 | fingerprint = ((fingerprint << 8) | (b & 0xFF)) ^ pushTable[j]; 44 | byteWindow.add(b); 45 | if (byteWindow.isFull()) popByte(); 46 | } 47 | } 48 | 49 | @Override 50 | public void pushBytes(final byte[] bytes, final int offset, final int length) { 51 | final int max = offset + length; 52 | int i = offset; 53 | while (i < max) { 54 | byte b = bytes[i++]; 55 | int j = (int) ((fingerprint >> shift) & 0x1FF); 56 | fingerprint = ((fingerprint << 8) | (b & 0xFF)) ^ pushTable[j]; 57 | byteWindow.add(b); 58 | if (byteWindow.isFull()) popByte(); 59 | } 60 | } 61 | 62 | @Override 63 | public void pushByte(byte b) { 64 | int j = (int) ((fingerprint >> shift) & 0x1FF); 65 | fingerprint = ((fingerprint << 8) | (b & 0xFF)) ^ pushTable[j]; 66 | byteWindow.add(b); 67 | if (byteWindow.isFull()) popByte(); 68 | } 69 | 70 | /** 71 | * Removes the contribution of the first byte in the byte queue from the 72 | * fingerprint. 
73 | * 74 | * {@link RabinFingerprintPolynomial#popByte} 75 | */ 76 | public void popByte() { 77 | byte b = byteWindow.poll(); 78 | fingerprint ^= popTable[(b & 0xFF)]; 79 | } 80 | 81 | @Override 82 | public void reset() { 83 | super.reset(); 84 | byteWindow.clear(); 85 | } 86 | } 87 | -------------------------------------------------------------------------------- /src/main/java/org/rabinfingerprint/scanner/StringFinder.java: -------------------------------------------------------------------------------- 1 | package org.rabinfingerprint.scanner; 2 | 3 | import java.io.File; 4 | 5 | import org.rabinfingerprint.fingerprint.RabinFingerprintLongWindowed; 6 | import org.rabinfingerprint.polynomial.Polynomial; 7 | 8 | public class StringFinder { 9 | 10 | private final Polynomial polynomial; 11 | private final String target; 12 | private final long targetFingerprint; 13 | private final RabinFingerprintLongWindowed rabin; 14 | 15 | public StringFinder(String target) { 16 | this.polynomial = Polynomial.createIrreducible(53); 17 | this.target = target; 18 | 19 | // calculate target fingerprint 20 | this.rabin = new RabinFingerprintLongWindowed(polynomial, target.length()); 21 | rabin.pushBytes(target.getBytes()); 22 | this.targetFingerprint = rabin.getFingerprintLong(); 23 | } 24 | 25 | public StringMatcher matcher(String string) { 26 | return new StringMatcher(string); 27 | } 28 | 29 | public final class StringMatcher { 30 | private final RabinFingerprintLongWindowed localRabin; 31 | private final String string; 32 | private final byte[] bytes; 33 | private int offset = 0; 34 | private int start = -1; 35 | private int end = -1; 36 | 37 | private StringMatcher(String string) { 38 | this.localRabin = new RabinFingerprintLongWindowed(rabin); 39 | this.string = string; 40 | this.bytes = string.getBytes(); 41 | } 42 | 43 | public boolean find() { 44 | for (; offset < bytes.length; offset++) { 45 | localRabin.pushByte(bytes[offset]); 46 | if (localRabin.getFingerprintLong() == 
targetFingerprint) { 47 | final int i0 = offset - target.length() + 1; 48 | final int i1 = offset + 1; 49 | if (i0 < 0 || i1 > string.length()) continue; // i1 is an exclusive end index, so i1 == length is a valid match at the very end 50 | if (string.substring(i0, i1).equals(target)) { 51 | start = i0; 52 | end = i1; 53 | offset++; // consume this byte so the next find() does not push it into the window twice 54 | return true; 55 | } 56 | } 57 | } 58 | return false; 59 | } 60 | 61 | public int getStart() { 62 | return start; 63 | } 64 | 65 | public int getEnd() { 66 | return end; 67 | } 68 | } 69 | 70 | public static class StringMatch { 71 | private final File file; 72 | private final String line; 73 | private final int lineOffset; 74 | private final int characterOffset; 75 | 76 | public StringMatch(File file, String line, int lineOffset, int characterOffset) { 77 | this.file = file; 78 | this.line = line; 79 | this.lineOffset = lineOffset; 80 | this.characterOffset = characterOffset; 81 | } 82 | 83 | public File getFile() { 84 | return file; 85 | } 86 | 87 | public String getLine() { 88 | return line; 89 | } 90 | 91 | public int getLineOffset() { 92 | return lineOffset; 93 | } 94 | 95 | public int getCharacterOffset() { 96 | return characterOffset; 97 | } 98 | 99 | @Override 100 | public String toString() { 101 | return String.format("%d:%s", lineOffset, line); 102 | } 103 | } 104 | 105 | public static interface StringMatchVisitor { 106 | public void found(StringMatch match); 107 | } 108 | 109 | } 110 | -------------------------------------------------------------------------------- /src/main/java/org/rabinfingerprint/fingerprint/RabinFingerprintLong.java: -------------------------------------------------------------------------------- 1 | package org.rabinfingerprint.fingerprint; 2 | 3 | import org.rabinfingerprint.polynomial.Polynomial; 4 | 5 | /** 6 | * A {@link Fingerprint} builder that uses longs and lookup tables to increase 7 | * performance. 8 | * 9 | * Note, the polynomial must be of degree 64 - 8 - 1 - 1 = 54 or less! 10 | * 11 | *
 12 |  *   64 for the size of a long
 13 |  *    8 for the space we need when shifting
 14 |  *    1 for the sign bit (Java doesn't support unsigned longs)
 15 |  *    1 for the conversion between degree and bit offset.
 16 |  * 
17 | * 18 | * Some good choices are 53, 47, 31, 15 19 | * 20 | * @see RabinFingerprintPolynomial for a rundown of the math 21 | */ 22 | public class RabinFingerprintLong extends AbstractFingerprint { 23 | protected final long[] pushTable; 24 | protected final int degree; 25 | protected final int shift; 26 | 27 | protected long fingerprint; 28 | 29 | public RabinFingerprintLong(Polynomial poly) { 30 | super(poly); 31 | this.degree = poly.degree().intValue(); 32 | this.shift = degree - 8; 33 | this.fingerprint = 0; 34 | this.pushTable = new long[512]; 35 | precomputePushTable(); 36 | } 37 | 38 | public RabinFingerprintLong(RabinFingerprintLong that) { 39 | super(that.poly); 40 | this.degree = that.degree; 41 | this.shift = that.shift; 42 | this.pushTable = that.pushTable; 43 | this.fingerprint = 0; 44 | } 45 | 46 | /** 47 | * Precomputes the results of pushing and popping bytes. These use the more 48 | * accurate Polynomial methods (they won't overflow like longs, and they 49 | * compute in GF(2^k)). 
50 | * 51 | * These algorithms should be synonymous with 52 | * {@link RabinFingerprintPolynomial#pushByte} and 53 | * {@link RabinFingerprintPolynomial#popByte}, but the results are stored to 54 | * be xor'red with the fingerprint in the inner loop of our own 55 | * {@link #pushByte} and {@link #popByte} 56 | */ 57 | private void precomputePushTable() { 58 | for (int i = 0; i < 512; i++) { 59 | Polynomial f = Polynomial.createFromLong(i); 60 | f = f.shiftLeft(poly.degree()); 61 | f = f.xor(f.mod(poly)); 62 | pushTable[i] = f.toBigInteger().longValue(); 63 | } 64 | } 65 | 66 | @Override 67 | public void pushBytes(final byte[] bytes) { 68 | for (byte b : bytes) { 69 | int j = (int) ((fingerprint >> shift) & 0x1FF); 70 | fingerprint = ((fingerprint << 8) | (b & 0xFF)) ^ pushTable[j]; 71 | } 72 | } 73 | 74 | @Override 75 | public void pushBytes(final byte[] bytes, final int offset, final int length) { 76 | final int max = offset + length; 77 | int i = offset; 78 | while (i < max) { 79 | int j = (int) ((fingerprint >> shift) & 0x1FF); 80 | fingerprint = ((fingerprint << 8) | (bytes[i++] & 0xFF)) ^ pushTable[j]; 81 | } 82 | } 83 | 84 | @Override 85 | public void pushByte(byte b) { 86 | int j = (int) ((fingerprint >> shift) & 0x1FF); 87 | fingerprint = ((fingerprint << 8) | (b & 0xFF)) ^ pushTable[j]; 88 | } 89 | 90 | @Override 91 | public void reset() { 92 | this.fingerprint = 0L; 93 | } 94 | 95 | @Override 96 | public Polynomial getFingerprint() { 97 | return Polynomial.createFromLong(fingerprint); 98 | } 99 | 100 | public long getFingerprintLong() { 101 | return fingerprint; 102 | } 103 | } 104 | -------------------------------------------------------------------------------- /src/main/java/org/rabinfingerprint/handprint/Handprint.java: -------------------------------------------------------------------------------- 1 | package org.rabinfingerprint.handprint; 2 | 3 | import java.io.IOException; 4 | import java.io.InputStream; 5 | import java.util.Comparator; 6 | 
import java.util.TreeSet; 7 | 8 | import org.rabinfingerprint.datastructures.Interval; 9 | import org.rabinfingerprint.handprint.FingerFactory.ChunkVisitor; 10 | import org.rabinfingerprint.handprint.Handprints.HandprintException; 11 | 12 | import com.google.common.collect.ArrayListMultimap; 13 | import com.google.common.collect.Iterables; 14 | import com.google.common.collect.Multimap; 15 | import com.google.common.collect.Sets; 16 | 17 | public class Handprint { 18 | private final InputStream stream; 19 | private final FingerFactory factory; 20 | private final int fingersPerHand; 21 | 22 | private Long palm; 23 | private Multimap fingers; 24 | private Multimap hand; 25 | 26 | public Handprint(InputStream stream, int fingersPerHand, FingerFactory factory) { 27 | this.stream = stream; 28 | this.factory = factory; 29 | this.fingersPerHand = fingersPerHand; 30 | } 31 | 32 | public void buildAll() { 33 | getPalm(); 34 | getAllFingers(); 35 | getHandFingers(); 36 | } 37 | 38 | public Long getPalm() { 39 | if (palm != null) 40 | return palm; 41 | try { 42 | palm = factory.getFullFingerprint(stream); 43 | } catch (IOException e) { 44 | throw new HandprintException("Error while computing fingerprints", e); 45 | } 46 | return palm; 47 | } 48 | 49 | public Multimap getAllFingers() { 50 | if (fingers != null) 51 | return fingers; 52 | try { 53 | fingers = ArrayListMultimap.create(); 54 | factory.getChunkFingerprints(stream, new ChunkVisitor() { 55 | public void visit(long fingerprint, long chunkStart, long chunkEnd) { 56 | fingers.put(fingerprint, new Interval(chunkStart, chunkEnd)); 57 | } 58 | }); 59 | } catch (IOException e) { 60 | throw new HandprintException("Error while computing fingerprints", e); 61 | } 62 | return fingers; 63 | } 64 | 65 | public static final Comparator REVERSE_LONG_SORT = new Comparator() { 66 | public int compare(Long o1, Long o2) { 67 | return o2.compareTo(o1); 68 | } 69 | }; 70 | 71 | public Multimap getHandFingers() { 72 | if (hand != null) 73 
| return hand; 74 | hand = ArrayListMultimap.create(); 75 | Multimap all = getAllFingers(); 76 | TreeSet keys = Sets.newTreeSet(REVERSE_LONG_SORT); 77 | keys.addAll(all.keySet()); 78 | for (Long key : Iterables.limit(keys, fingersPerHand)) { 79 | hand.putAll(key, all.get(key)); 80 | } 81 | return hand; 82 | } 83 | 84 | public int getFingerCount() { 85 | return getAllFingers().size(); 86 | } 87 | 88 | public int getIntersectingFingerCount(Handprint other) { 89 | return Sets.intersection(getAllFingers().keySet(), other.getAllFingers().keySet()).size(); 90 | } 91 | 92 | public double getSimilarity(Handprint other) { 93 | int maxFingers = Math.max(getFingerCount(), other.getFingerCount()); 94 | if (maxFingers == 0) { 95 | return 1.0; 96 | } 97 | return (double) getIntersectingFingerCount(other) / (double) maxFingers; 98 | } 99 | 100 | @Override 101 | public String toString() { 102 | return getHandFingers().toString(); 103 | } 104 | } 105 | -------------------------------------------------------------------------------- /src/main/java/org/rabinfingerprint/handprint/FingerFactory.java: -------------------------------------------------------------------------------- 1 | package org.rabinfingerprint.handprint; 2 | 3 | import java.io.IOException; 4 | import java.io.InputStream; 5 | 6 | import org.rabinfingerprint.fingerprint.RabinFingerprintLong; 7 | import org.rabinfingerprint.fingerprint.RabinFingerprintLongWindowed; 8 | import org.rabinfingerprint.polynomial.Polynomial; 9 | 10 | import com.google.common.io.ByteStreams; 11 | 12 | public class FingerFactory { 13 | public static interface ChunkBoundaryDetector { 14 | public boolean isBoundary(RabinFingerprintLong fingerprint); 15 | } 16 | 17 | public static interface ChunkVisitor { 18 | public void visit(long fingerprint, long chunkStart, long chunkEnd); 19 | } 20 | 21 | private final RabinFingerprintLong finger; 22 | private final RabinFingerprintLongWindowed fingerWindow; 23 | private final ChunkBoundaryDetector 
boundaryDetector; 24 | 25 | public FingerFactory(Polynomial p, long bytesPerWindow, ChunkBoundaryDetector boundaryDetector) { 26 | this.finger = new RabinFingerprintLong(p); 27 | this.fingerWindow = new RabinFingerprintLongWindowed(p, bytesPerWindow); 28 | this.boundaryDetector = boundaryDetector; 29 | } 30 | 31 | private RabinFingerprintLong newFingerprint() { 32 | return new RabinFingerprintLong(finger); 33 | } 34 | 35 | private RabinFingerprintLongWindowed newWindowedFingerprint() { 36 | return new RabinFingerprintLongWindowed(fingerWindow); 37 | } 38 | 39 | /** 40 | * Fingerprint the file into chunks called "Fingers". The chunk boundaries 41 | * are determined using a windowed fingerprinter 42 | * {@link RabinFingerprintLongWindowed}. 43 | * 44 | * The chunk detector is position independent. Therefore, even if a file is 45 | * rearranged or partially corrupted, the untouched chunks can be 46 | * efficiently discovered. 47 | */ 48 | public void getChunkFingerprints(InputStream is, ChunkVisitor visitor) throws IOException { 49 | // windowing fingerprinter for finding chunk boundaries. this is only 50 | // reset at the beginning of the file 51 | final RabinFingerprintLong window = newWindowedFingerprint(); 52 | 53 | // fingerprinter for chunks. this is reset after each chunk 54 | final RabinFingerprintLong finger = newFingerprint(); 55 | 56 | // counters 57 | long chunkStart = 0; 58 | long chunkEnd = 0; 59 | 60 | /* 61 | * fingerprint one byte at a time. 
we have to use this granularity to 62 | * ensure that, for example, a one byte offset at the beginning of the 63 | * file won't affect the chunk boundaries 64 | */ 65 | for (byte b : ByteStreams.toByteArray(is)) { 66 | // push byte into fingerprints 67 | window.pushByte(b); 68 | finger.pushByte(b); 69 | chunkEnd++; 70 | 71 | /* 72 | * if we've reached a boundary (which we will at some probability 73 | * based on the boundary pattern and the size of the fingerprint 74 | * window), we store the current chunk fingerprint and reset the 75 | * chunk fingerprinter. 76 | */ 77 | if (boundaryDetector.isBoundary(window)) { 78 | visitor.visit(finger.getFingerprintLong(), chunkStart, chunkEnd); 79 | finger.reset(); 80 | 81 | // store last chunk offset 82 | chunkStart = chunkEnd; 83 | } 84 | } 85 | 86 | // final chunk 87 | visitor.visit(finger.getFingerprintLong(), chunkStart, chunkEnd); 88 | } 89 | 90 | /** 91 | * Rapidly fingerprint an entire stream's contents. 92 | */ 93 | public long getFullFingerprint(InputStream is) throws IOException { 94 | final RabinFingerprintLong finger = newFingerprint(); 95 | finger.pushBytes(ByteStreams.toByteArray(is)); 96 | return finger.getFingerprintLong(); 97 | } 98 | } -------------------------------------------------------------------------------- /src/main/java/org/rabinfingerprint/scanner/FileFinder.java: -------------------------------------------------------------------------------- 1 | package org.rabinfingerprint.scanner; 2 | 3 | import java.io.File; 4 | import java.util.ArrayList; 5 | import java.util.List; 6 | 7 | public class FileFinder { 8 | 9 | public static interface FileVisitor { 10 | public void visitFile(File file); 11 | public void visitDirectory(File file); 12 | } 13 | 14 | public static void visitFilesRecursively(FileVisitor visitor, File directory, String filePattern, String directoryPattern, boolean recursively) { 15 | if (!directory.isDirectory()) directory = directory.getParentFile(); 16 | if
(!directory.getName().matches(directoryPattern)) return; 17 | visitor.visitDirectory(directory); 18 | final ArrayList childDirectories = new ArrayList(); 19 | for (File file : directory.listFiles()) { 20 | if (file.isDirectory()) { 21 | childDirectories.add(file); 22 | } else if (file.isFile()) { 23 | if (!file.getName().matches(filePattern)) continue; 24 | visitor.visitFile(file); 25 | } 26 | } 27 | if (recursively) { 28 | for (File childDirectory : childDirectories) { 29 | visitFilesRecursively(visitor, childDirectory, filePattern, directoryPattern, recursively); 30 | } 31 | } 32 | } 33 | 34 | public static void visitDirectoriesRecursively(FileVisitor visitor, File directory, String pattern, boolean recursively) { 35 | if (!directory.isDirectory()) directory = directory.getParentFile(); 36 | if (!directory.getName().matches(pattern)) return; 37 | visitor.visitDirectory(directory); 38 | final ArrayList childDirectories = new ArrayList(); 39 | for (File file : directory.listFiles()) { 40 | if (file.isDirectory()) { 41 | childDirectories.add(file); 42 | } 43 | } 44 | if (recursively) { 45 | for (File childDirectory : childDirectories) { 46 | visitDirectoriesRecursively(visitor, childDirectory, pattern, recursively); 47 | } 48 | } 49 | 50 | } 51 | 52 | public static List getDirectoriesMatching(String basePath, String pattern, boolean recursively) { 53 | File dir = new File(basePath); 54 | final List dirs = new ArrayList(); 55 | if (!dir.exists()) return dirs; 56 | visitDirectoriesRecursively(new FileVisitor() { 57 | public void visitDirectory(File d) { 58 | dirs.add(d); 59 | } 60 | 61 | public void visitFile(File f) { 62 | // only finding directories 63 | } 64 | }, dir, pattern, recursively); 65 | return dirs; 66 | } 67 | 68 | public static List getFilesMatching(String basePath, String filePattern) { 69 | return getFilesMatching(basePath, filePattern, "[^.].*", true); 70 | } 71 | 72 | public static List getFilesMatching(String basePath, String filePattern, String 
directoryPattern, boolean recursively) { 73 | File file = new File(basePath); 74 | final List files = new ArrayList(); 75 | if (!file.exists()) return files; 76 | visitFilesRecursively(new FileVisitor() { 77 | public void visitDirectory(File d) { 78 | // only finding files 79 | } 80 | 81 | public void visitFile(File f) { 82 | files.add(f); 83 | } 84 | }, file, filePattern, directoryPattern, recursively); 85 | return files; 86 | } 87 | 88 | public static String getExtensionPatterns(List exts) { 89 | final StringBuilder sb = new StringBuilder(); 90 | sb.append("("); 91 | for (String ext : exts) { 92 | sb.append(".*\\." + ext + "|"); 93 | } 94 | sb.append(")"); 95 | return sb.toString(); 96 | } 97 | 98 | public static void main(String[] args) throws Exception { 99 | final String curDir = System.getProperty("user.dir"); 100 | final List files = getFilesMatching(curDir, ".*\\.java", "[^.].*", true); 101 | for (File file : files) { 102 | System.out.println("Found " + file.getName()); // (authorized) 103 | } 104 | } 105 | } 106 | -------------------------------------------------------------------------------- /src/main/java/org/rabinfingerprint/scanner/FilesStringFinder.java: -------------------------------------------------------------------------------- 1 | package org.rabinfingerprint.scanner; 2 | 3 | import java.io.File; 4 | import java.io.IOException; 5 | import java.util.List; 6 | import java.util.concurrent.LinkedBlockingQueue; 7 | import java.util.concurrent.ThreadPoolExecutor; 8 | import java.util.concurrent.TimeUnit; 9 | 10 | import org.rabinfingerprint.scanner.StringFinder.StringMatch; 11 | import org.rabinfingerprint.scanner.StringFinder.StringMatchVisitor; 12 | import org.rabinfingerprint.scanner.StringFinder.StringMatcher; 13 | 14 | public class FilesStringFinder { 15 | 16 | public static void find( 17 | final String baseDirectory, 18 | final String filePattern, 19 | final String targetString, 20 | final StringMatchVisitor visitor) 21 | throws IOException { 
22 | final List files = FileFinder.getFilesMatching(baseDirectory, filePattern); 23 | find(files, targetString, visitor); 24 | } 25 | 26 | public static void find( 27 | final List files, 28 | final String targetString, 29 | final StringMatchVisitor visitor) 30 | throws IOException { 31 | 32 | // create finder 33 | final StringFinder scanner = new StringFinder(targetString); 34 | 35 | // find 36 | for (final File file : files) { 37 | final String str = IOUtils.readEntireFile(file); 38 | final StringMatcher sm = scanner.matcher(str); 39 | LineNumberIndex index = null; 40 | 41 | // find matching strings 42 | while (sm.find()) { 43 | if (index == null) index = new LineNumberIndex(str); 44 | final int off = sm.getStart(); 45 | final int lineOffset = index.getLineNumber(off); 46 | final String line = index.getLine(off); 47 | final StringMatch match = new StringMatch(file, line, lineOffset, off); 48 | visitor.found(match); 49 | } 50 | } 51 | } 52 | 53 | public static void findThreaded( 54 | final int threadCount, 55 | final List files, 56 | final String targetString, 57 | final StringMatchVisitor visitor) { 58 | 59 | // create finder 60 | final StringFinder scanner = new StringFinder(targetString); 61 | 62 | // create thread pool 63 | final ThreadPoolExecutor executor = new ThreadPoolExecutor(threadCount, threadCount, 200, TimeUnit.MILLISECONDS, new LinkedBlockingQueue()); 64 | 65 | // find 66 | for (final File file : files) { 67 | executor.submit(new Runnable() { 68 | public void run() { 69 | try { 70 | final String str = IOUtils.readEntireFile(file); 71 | final StringMatcher sm = scanner.matcher(str); 72 | LineNumberIndex index = null; 73 | 74 | // find matching strings 75 | while (sm.find()) { 76 | if (index == null) index = new LineNumberIndex(str); 77 | final int off = sm.getStart(); 78 | final int lineOffset = index.getLineNumber(off); 79 | final String line = index.getLine(off); 80 | final StringMatch match = new StringMatch(file, line, lineOffset, off); 81 | 
visitor.found(match); 82 | } 83 | } catch (IOException e) { 84 | e.printStackTrace(); 85 | } 86 | } 87 | }); 88 | } 89 | 90 | try { 91 | executor.shutdown(); 92 | executor.awaitTermination(10*60, TimeUnit.SECONDS); 93 | } catch (InterruptedException e) { 94 | e.printStackTrace(); 95 | } 96 | } 97 | 98 | public static void main(String[] args) throws IOException { 99 | final String curDir = System.getProperty("user.dir"); 100 | final String filePattern = ".*\\.java"; 101 | final String targetString = "asdf fdsa asdf"; 102 | 103 | System.out.println(String.format("Searching for \"%s\"", targetString)); 104 | find(curDir, filePattern, targetString, new StringMatchVisitor() { 105 | public void found(final StringMatch match) { 106 | System.out.println(String.format("%s line %d (%s)", match.getFile().getName(), match.getLineOffset(), match.getFile().getAbsolutePath())); 107 | } 108 | }); 109 | } 110 | } 111 | -------------------------------------------------------------------------------- /src/main/java/org/rabinfingerprint/Args.java: -------------------------------------------------------------------------------- 1 | package org.rabinfingerprint; 2 | 3 | import java.util.Arrays; 4 | import java.util.List; 5 | import java.util.Set; 6 | 7 | import org.rabinfingerprint.Args.ArgsModel.InputMode; 8 | import org.rabinfingerprint.Args.ArgsModel.Mode; 9 | import org.rabinfingerprint.polynomial.Polynomials; 10 | 11 | import com.google.common.collect.Lists; 12 | import com.google.common.collect.Sets; 13 | 14 | public class Args { 15 | @SuppressWarnings("serial") 16 | public static class ArgParseException extends Exception{ 17 | public ArgParseException(String arg0, Throwable arg1) { 18 | super(arg0, arg1); 19 | } 20 | 21 | public ArgParseException(String arg0) { 22 | super(arg0); 23 | } 24 | } 25 | 26 | public static abstract class Arg { 27 | private final Set flags; 28 | private final int argValueCount; 29 | 30 | public Arg(int argValueCount, String... 
flags) { 31 | this.argValueCount = argValueCount; 32 | this.flags = Sets.newHashSet(flags); 33 | } 34 | 35 | public boolean isFlagOf(String arg) { 36 | return flags.contains(arg); 37 | } 38 | 39 | public int getArgValueCount() { 40 | return argValueCount; 41 | } 42 | 43 | public abstract void parse(ArgsModel model, String[] strs) throws ArgParseException; 44 | } 45 | 46 | public static class ArgsModel{ 47 | public static enum Mode{ 48 | HELP, POLYGEN, FINGERPRINT, HANDPRINT; 49 | } 50 | 51 | public static enum InputMode { 52 | STDIN, FILES; 53 | } 54 | 55 | public Mode mode = null; 56 | public InputMode inputModel = InputMode.STDIN; 57 | public int degree = 53; 58 | public int fingerPerHand = 10; 59 | public long polynomial = Polynomials.DEFAULT_POLYNOMIAL_LONG; 60 | public List unflagged = Lists.newArrayList(); 61 | } 62 | 63 | private final List args = Lists.newArrayList(); 64 | 65 | public Args() { 66 | args.add(new Arg(0, "-h", "--help") { 67 | @Override 68 | public void parse(ArgsModel model, String[] strs) { 69 | model.mode = Mode.HELP; 70 | } 71 | }); 72 | args.add(new Arg(1, "-polygen") { 73 | @Override 74 | public void parse(ArgsModel model, String[] strs) throws ArgParseException { 75 | model.mode = Mode.POLYGEN; 76 | try { 77 | model.degree = Integer.parseInt(strs[0]); 78 | } catch (NumberFormatException e) { 79 | throw new ArgParseException("Could not parse polynomial degree."); 80 | } 81 | } 82 | }); 83 | args.add(new Arg(1, "-p") { 84 | @Override 85 | public void parse(ArgsModel model, String[] strs) throws ArgParseException { 86 | try { 87 | model.polynomial = Long.parseLong(strs[0], 16); 88 | } catch (NumberFormatException e) { 89 | throw new ArgParseException("Could not parse polynomial."); 90 | } 91 | } 92 | }); 93 | args.add(new Arg(1, "-hand") { 94 | @Override 95 | public void parse(ArgsModel model, String[] strs) throws ArgParseException { 96 | model.mode = Mode.HANDPRINT; 97 | try { 98 | model.fingerPerHand = Integer.parseInt(strs[0]); 99 | } 
catch (NumberFormatException e) { 100 | throw new ArgParseException("Could not fingers-per-hand parameter."); 101 | } 102 | } 103 | }); 104 | } 105 | 106 | public ArgsModel parse(String[] strs) throws ArgParseException { 107 | ArgsModel model = new ArgsModel(); 108 | 109 | for(int i = 0; i < strs.length;){ 110 | String str = strs[i]; 111 | boolean flagged = false; 112 | for (Arg arg : args) { 113 | if (arg.isFlagOf(str)) { 114 | arg.parse(model, Arrays.copyOfRange(strs, i + 1, i + 1 + arg.getArgValueCount())); 115 | i += 1 + arg.getArgValueCount(); 116 | flagged = true; 117 | break; 118 | } 119 | } 120 | 121 | if(!flagged){ 122 | model.unflagged.addAll(Lists.newArrayList(Arrays.copyOfRange(strs, i, strs.length))); 123 | break; 124 | } 125 | } 126 | 127 | if (model.mode == null) { 128 | model.mode = Mode.FINGERPRINT; 129 | } 130 | 131 | if (model.unflagged.size() == 0) { 132 | model.inputModel = InputMode.STDIN; 133 | } else { 134 | model.inputModel = InputMode.FILES; 135 | } 136 | 137 | return model; 138 | } 139 | } 140 | -------------------------------------------------------------------------------- /src/main/java/org/rabinfingerprint/Main.java: -------------------------------------------------------------------------------- 1 | package org.rabinfingerprint; 2 | 3 | import java.io.File; 4 | import java.io.FileInputStream; 5 | import java.io.FileNotFoundException; 6 | import java.io.IOException; 7 | import java.util.List; 8 | 9 | import org.rabinfingerprint.Args.ArgParseException; 10 | import org.rabinfingerprint.Args.ArgsModel; 11 | import org.rabinfingerprint.Args.ArgsModel.InputMode; 12 | import org.rabinfingerprint.fingerprint.RabinFingerprintLong; 13 | import org.rabinfingerprint.handprint.Handprint; 14 | import org.rabinfingerprint.handprint.Handprints; 15 | import org.rabinfingerprint.handprint.Handprints.HandPrintFactory; 16 | import org.rabinfingerprint.polynomial.Polynomial; 17 | 18 | import com.google.common.io.ByteStreams; 19 | 20 | public class Main { 
21 | 22 | public void fingerprintFiles(List paths, Polynomial p) throws FileNotFoundException, 23 | IOException { 24 | final RabinFingerprintLong rabin = new RabinFingerprintLong(p); 25 | for (String path : paths) { 26 | File file = new File(path); 27 | if (file.exists()) { 28 | rabin.reset(); 29 | rabin.pushBytes(ByteStreams.toByteArray(new FileInputStream(file))); 30 | System.out.println(String.format("%X %s", rabin.getFingerprintLong(), file.getAbsolutePath())); 31 | System.out.flush(); 32 | } else { 33 | System.err.print(String.format("Could not find file %s", path)); 34 | System.err.flush(); 35 | } 36 | } 37 | } 38 | 39 | public void fingerprintStdin(Polynomial p) throws IOException { 40 | final RabinFingerprintLong rabin = new RabinFingerprintLong(p); 41 | rabin.pushBytes(ByteStreams.toByteArray(System.in)); 42 | System.out.println(String.format("%X", rabin.getFingerprintLong())); 43 | } 44 | 45 | public void handprintStdin(Polynomial p) throws IOException { 46 | HandPrintFactory factory = Handprints.newFactory(p); 47 | Handprint hand = factory.newHandprint(System.in); 48 | for (Long finger : hand.getHandFingers().keySet()) { 49 | System.out.println(String.format("%X", finger)); 50 | } 51 | } 52 | 53 | public void handprintFiles(List paths, Polynomial p) throws IOException { 54 | HandPrintFactory factory = Handprints.newFactory(p); 55 | for (String path : paths) { 56 | File file = new File(path); 57 | if (file.exists()) { 58 | Handprint hand = factory.newHandprint(new FileInputStream(file)); 59 | for (Long finger : hand.getHandFingers().keySet()) { 60 | System.out.println(String.format("%X", finger)); 61 | } 62 | System.out.flush(); 63 | } else { 64 | System.err.print(String.format("Could not find file %s", path)); 65 | System.err.flush(); 66 | } 67 | } 68 | } 69 | 70 | public void generatePolynomial(int deg) { 71 | Polynomial p = Polynomial.createIrreducible(deg); 72 | System.out.println(p.toHexString()); 73 | } 74 | 75 | public void printUsage() throws 
IOException { 76 | ByteStreams.copy(getClass().getResourceAsStream("/usage.txt"), System.out); 77 | } 78 | 79 | public Polynomial checkPolynomial(Long l) throws ArgParseException { 80 | Polynomial p = Polynomial.createFromLong(l); 81 | if (p.isReducible()) { 82 | throw new ArgParseException( 83 | "The specified polynomial is not irreducible and therefore invalid for the rabin fingerprint method. Please use -polygen to generate an irreducible polynomial."); 84 | } 85 | return p; 86 | } 87 | 88 | private ArgsModel model; 89 | 90 | private Main(ArgsModel model) { 91 | this.model = model; 92 | } 93 | 94 | private void run() throws Exception { 95 | switch (model.mode) { 96 | case FINGERPRINT: 97 | if (model.inputModel == InputMode.STDIN) { 98 | fingerprintStdin(checkPolynomial(model.polynomial)); 99 | } else { 100 | fingerprintFiles(model.unflagged, checkPolynomial(model.polynomial)); 101 | } 102 | break; 103 | case HANDPRINT: 104 | if (model.inputModel == InputMode.STDIN) { 105 | handprintStdin(checkPolynomial(model.polynomial)); 106 | } else { 107 | handprintFiles(model.unflagged, checkPolynomial(model.polynomial)); 108 | } 109 | break; 110 | case HELP: 111 | printUsage(); 112 | break; 113 | case POLYGEN: 114 | generatePolynomial(model.degree); 115 | break; 116 | } 117 | } 118 | 119 | public static void main(String[] args) { 120 | try { 121 | try { 122 | new Main(new Args().parse(args)).run(); 123 | } catch (ArgParseException e) { 124 | System.err.println(e.getMessage()); 125 | new Main(null).printUsage(); 126 | } 127 | } catch (Exception e) { 128 | e.printStackTrace(); 129 | System.exit(1); 130 | } 131 | System.exit(0); 132 | } 133 | } 134 | -------------------------------------------------------------------------------- /src/main/java/org/rabinfingerprint/fingerprint/RabinFingerprintPolynomial.java: -------------------------------------------------------------------------------- 1 | package org.rabinfingerprint.fingerprint; 2 | 3 | import java.math.BigInteger; 4 | 5 
| import org.rabinfingerprint.datastructures.CircularByteQueue; 6 | import org.rabinfingerprint.fingerprint.Fingerprint.WindowedFingerprint; 7 | import org.rabinfingerprint.polynomial.Polynomial; 8 | 9 | /** 10 | * This class implements Rabin's Fingerprinting scheme using a Polynomial class 11 | * which handles calculations in the finite field GF(2). This is slower than 12 | * {@link RabinFingerprintLong}, but can support monic irreducible polynomials 13 | * of nearly any degree. 14 | * 15 | *
 16 |  *     Given an n-bit message m_0, ..., m_n-1, we view it as a polynomial of degree
 17 |  *     n-1 over the finite field GF(2).
 18 |  * 
 19 |  *         m(x) = m_0 + m_1 x + ... + m_n-1 x_n-1
 20 |  * 
 21 |  *     We then pick a random irreducible polynomial p(x) of degree k over GF(2), and
 22 |  *     we define the fingerprint of m to be
 23 |  * 
 24 |  *         f(x) = m(x) mod p(x)
 25 |  *   
 26 |  *     which can be viewed as a polynomial of degree k-1 or as a k-bit number.
 27 |  * 
28 | * 29 | * Because we are operating in the space defined by mod p(x), we only ever need 30 | * to store a fingerprint of k bits. All new bits are shifted in and modded with 31 | * p(x), resulting in a new k-bit number. 32 | * 33 | *
 34 |  *     This follows from the fact that given an n-bit message:
 35 |  * 
 36 |  *         m(x) = m_0 x_n-1 + .... + m_n-2 x + m_n-1; 
 37 |  *         
 38 |  *     and its fingerprint
 39 |  *     
 40 |  *         f(x) = m(x) mod p(x)
 41 |  *              = r_0 x_k-1 + ... + r_k-2 x + r_k-1
 42 |  *         
 43 |  *     appending one more bit simplifies to
 44 |  *         
 45 |  *         f({m(x)*x + m_n}) = {m(x)*x + m_n} mod p(x);
 46 |  * 
47 | * 48 | * This means that to add one bit, we need merely shift in that bit and re-mod 49 | * with p(x). Similarly, we can add entire bytes, words, etc, by shifting them 50 | * in and modding with p(x). This can become untennable and slow if we shift too 51 | * much at once, as the mod calculation is done with synthetic division and will 52 | * take on the order of {number of bits shifted in} to complete. 53 | * 54 | * A table lookup method is obviously possible, and that is exactly what we do 55 | * in {@link RabinFingerprintLong}. 56 | * 57 | * 58 | * "Rabin Fingerprint" 59 | * http://en.wikipedia.org/wiki/Rabin_fingerprint 60 | * 61 | * Michael O. Rabin, "Fingerprinting by Random Polynomials" (1981) 62 | * http://www.xmailserver.org/rabin.pdf 63 | * 64 | * Andrei Z. Broder, "Some applications of Rabin's fingerprinting method" (1993) 65 | * http://citeseer.ist.psu.edu/broder93some.html 66 | * 67 | */ 68 | public class RabinFingerprintPolynomial extends AbstractFingerprint implements WindowedFingerprint { 69 | 70 | private final BigInteger byteShift; 71 | private final BigInteger windowShift; 72 | 73 | private final CircularByteQueue byteWindow; 74 | private final long bytesPerWindow; 75 | 76 | private Polynomial fingerprint; 77 | 78 | public RabinFingerprintPolynomial(Polynomial poly) { 79 | this(poly, 0); 80 | } 81 | 82 | public RabinFingerprintPolynomial(Polynomial poly, long bytesPerWindow) { 83 | super(poly); 84 | this.byteShift = BigInteger.valueOf(8); 85 | this.windowShift = BigInteger.valueOf(bytesPerWindow * 8); 86 | this.bytesPerWindow = bytesPerWindow; 87 | this.byteWindow = new CircularByteQueue((int) bytesPerWindow + 1); 88 | this.fingerprint = new Polynomial(); 89 | } 90 | 91 | /** 92 | * Shifts in byte b and mods with poly 93 | * 94 | * If we have passed overflowed our window, we pop a byte 95 | */ 96 | @Override 97 | public synchronized void pushByte(byte b) { 98 | Polynomial f = fingerprint; 99 | f = f.shiftLeft(byteShift); 100 | f = 
f.or(Polynomial.createFromLong(b & 0xFFL)); 101 | f = f.mod(poly); 102 | 103 | fingerprint = f; 104 | 105 | if (bytesPerWindow > 0) { 106 | byteWindow.add(b); 107 | if (byteWindow.isFull()) popByte(); 108 | } 109 | } 110 | 111 | /** 112 | * Removes the contribution of the first byte in the byte queue from the 113 | * fingerprint. 114 | * 115 | * Note that the shift necessary to calculate it's contribution can be 116 | * sizeable, and the mod calculation will be similarly slow. It is therefore 117 | * done with the Polynomial to support arbitrary sizes. 118 | * 119 | * Note that despite this massive shift, the fingerprint will still result 120 | * in a k-bit number at the end of the calculation. 121 | */ 122 | public synchronized void popByte() { 123 | byte b = byteWindow.poll(); 124 | Polynomial f = Polynomial.createFromLong(b & 0xFFL); 125 | f = f.shiftLeft(windowShift); 126 | f = f.mod(poly); 127 | 128 | fingerprint = fingerprint.xor(f); 129 | } 130 | 131 | @Override 132 | public synchronized void reset() { 133 | this.fingerprint = new Polynomial(); 134 | this.byteWindow.clear(); 135 | } 136 | 137 | @Override 138 | public synchronized Polynomial getFingerprint() { 139 | return fingerprint; 140 | } 141 | } 142 | -------------------------------------------------------------------------------- /src/main/java/org/rabinfingerprint/datastructures/Interval.java: -------------------------------------------------------------------------------- 1 | package org.rabinfingerprint.datastructures; 2 | 3 | import java.util.Comparator; 4 | 5 | /** 6 | * A Parameter-Style Object that contains a start and end index. 7 | * 8 | * The indices are in common set notation where the start index is inclusive and 9 | * the end offset is exclusive. This allows us to easily represent zero-width 10 | * intervals -- in this case, anything where start == end; 11 | * 12 | * The default comparator sorts first be the start index, then by the end index. 
/**
 * A parameter-style value object that contains a start and an end index.
 *
 * The indices use half-open notation: the start index is inclusive and the
 * end index is exclusive. This allows zero-width intervals to be represented
 * easily -- any interval where start equals end.
 *
 * The natural ordering sorts first by the start index, then by the end index.
 */
public class Interval implements Comparable<Interval> {

    private final Long start;
    private final Long end;

    /**
     * The default comparator. Sorts first by the start index, then by the end
     * index. A null interval sorts before any non-null interval.
     */
    public static final Comparator<Interval> START_END_COMPARATOR = new Comparator<Interval>() {
        public int compare(Interval o1, Interval o2) {
            if (o1 == o2)
                return 0;
            if (o1 == null)
                return -1;
            if (o2 == null)
                return 1;
            int cmp = o1.start.compareTo(o2.start);
            if (cmp != 0)
                return cmp;
            return o1.end.compareTo(o2.end);
        }
    };

    /**
     * Sorts first by the start index, then by the end index in descending
     * order, so that of two intervals sharing a start the longer one comes
     * first. A null interval sorts before any non-null interval.
     */
    public static final Comparator<Interval> START_END_INV_COMPARATOR = new Comparator<Interval>() {
        public int compare(Interval o1, Interval o2) {
            if (o1 == o2)
                return 0;
            if (o1 == null)
                return -1;
            if (o2 == null)
                return 1;
            int cmp = o1.start.compareTo(o2.start);
            if (cmp != 0)
                return cmp;
            return o2.end.compareTo(o1.end);
        }
    };

    /**
     * Creates an interval from two indices given in either order.
     *
     * @param start one end of the interval
     * @param end the other end of the interval
     * @return the interval spanning from the smaller to the larger index
     */
    public static Interval createUndirected(Long start, Long end) {
        if (start.compareTo(end) > 0)
            return new Interval(end, start);
        return new Interval(start, end);
    }

    /**
     * Creates the interval [start, end).
     *
     * @param start inclusive start index
     * @param end exclusive end index
     * @throws IllegalArgumentException if an index is null or start is greater
     *         than end
     */
    public Interval(Long start, Long end) {
        if (start == null || end == null)
            throw new IllegalArgumentException("Interval indeces cannot be null");
        if (start.compareTo(end) > 0)
            throw new IllegalArgumentException("Interval indeces out of order");

        this.start = start;
        this.end = end;
    }

    /**
     * Returns the inclusive start offset
     */
    public Long getStart() {
        return start;
    }

    /**
     * Returns the exclusive end offset
     */
    public Long getEnd() {
        return end;
    }

    /**
     * Returns the number of indices covered by this interval
     */
    public Long getSize() {
        return end - start;
    }

    /**
     * Return the overlapping region of this interval and the input. If the
     * intervals do not overlap, null is returned.
     */
    public Interval intersection(Interval interval) {
        Long istart = interval.getStart();
        Long iend = interval.getEnd();
        if (istart >= end || start >= iend)
            return null; // no overlap
        return new Interval(Math.max(start, istart), Math.min(end, iend));
    }

    /**
     * Returns the smallest interval that contains both this interval and the
     * input. Note that this is not a strict union, since indices not included
     * in either interval can be included in the resulting interval.
     */
    public Interval union(Interval interval) {
        Long istart = interval.getStart();
        Long iend = interval.getEnd();
        return new Interval(Math.min(start, istart), Math.max(end, iend));
    }

    /**
     * Tests whether this is an empty (a.k.a. zero-length) interval
     */
    public boolean isEmpty() {
        // Compare values, not references: == on boxed Longs is only reliable
        // for values inside the small-value box cache.
        return start.equals(end);
    }

    /**
     * Tests whether the input interval overlaps this interval. Adjacency does
     * not count as overlapping.
     */
    public boolean isOverlap(Interval interval) {
        if (interval.start >= this.end)
            return false;
        if (this.start >= interval.end)
            return false;
        return true;
    }

    /**
     * Tests whether this interval completely contains the input interval.
     */
    public boolean contains(Interval interval) {
        return (this.start <= interval.start && this.end >= interval.end);
    }

    /**
     * Tests whether this interval contains the input index.
     */
    public boolean contains(Long index) {
        return (this.start <= index && this.end > index);
    }

    /**
     * Prints the interval in half-open notation, e.g. "[0, 10)"
     */
    @Override
    public String toString() {
        return "[" + start + ", " + end + ")";
    }

    /**
     * Comparable implementation, delegating to {@link #START_END_COMPARATOR}
     */
    public int compareTo(Interval o) {
        return START_END_COMPARATOR.compare(this, o);
    }

    @Override
    public int hashCode() {
        final int prime = 31;
        int result = 1;
        result = prime * result + end.hashCode();
        result = prime * result + start.hashCode();
        return result;
    }

    @Override
    public boolean equals(Object obj) {
        if (this == obj)
            return true;
        if (obj == null)
            return false;
        if (getClass() != obj.getClass())
            return false;
        final Interval other = (Interval) obj;
        // The constructor rejects null indices, so both fields are non-null.
        return end.equals(other.end) && start.equals(other.start);
    }
}
org.rabinfingerprint.handprint.Handprints; 16 | import org.rabinfingerprint.handprint.Handprints.HandPrintFactory; 17 | import org.rabinfingerprint.polynomial.Polynomial; 18 | 19 | public class MatchModel { 20 | 21 | public static abstract class Match { 22 | protected final Handprint a, b; 23 | 24 | private Match(Handprint a, Handprint b) { 25 | this.a = a; 26 | this.b = b; 27 | } 28 | 29 | public Handprint getHandA() { 30 | return a; 31 | } 32 | 33 | public Handprint getHandB() { 34 | return b; 35 | } 36 | 37 | public abstract double getSimilarity(); 38 | } 39 | 40 | public static class ExactMatch extends Match { 41 | private ExactMatch(Handprint a, Handprint b) { 42 | super(a, b); 43 | } 44 | 45 | @Override 46 | public double getSimilarity() { 47 | return 1.0; 48 | } 49 | } 50 | 51 | public static class PartialMatch extends Match { 52 | protected Double similarity; 53 | 54 | private PartialMatch(Handprint a, Handprint b) { 55 | super(a, b); 56 | } 57 | 58 | @Override 59 | public double getSimilarity() { 60 | if (similarity == null) 61 | similarity = a.getSimilarity(b); 62 | return similarity; 63 | } 64 | } 65 | 66 | public static class NonMatch extends Match { 67 | private NonMatch(Handprint a, Handprint b) { 68 | super(a, b); 69 | } 70 | 71 | @Override 72 | public double getSimilarity() { 73 | return 0.0; 74 | } 75 | } 76 | 77 | protected List matches = new ArrayList(); 78 | 79 | public void getMatches(String pathA, String pathB) throws FileNotFoundException { 80 | final Polynomial p = Polynomial.createIrreducible(53); 81 | 82 | Collection handsA = getHandsFromPath(p, pathA); 83 | Collection handsB = getHandsFromPath(p, pathB); 84 | 85 | findExactMatches(handsA, handsB); 86 | findPartialMatches(handsA, handsB); 87 | findNonMatches(handsA, handsB); 88 | } 89 | 90 | private void findExactMatches(Collection handsA, Collection handsB) { 91 | System.out.println("thumbprinting " + (handsA.size() + handsB.size()) + " files"); 92 | 93 | // thumbprint files 94 | TreeMap 
thumbMapA = new TreeMap(); 95 | TreeMap thumbMapB = new TreeMap(); 96 | 97 | thumbprintTasks(handsA, handsB, thumbMapA, thumbMapB); 98 | System.out.print("\n"); 99 | 100 | List thumbsA = new ArrayList(thumbMapA.keySet()); 101 | 102 | // print intersection 103 | for (Long thumb : thumbsA) { 104 | if (thumbMapB.containsKey(thumb)) { 105 | Handprint matchA = thumbMapA.get(thumb); 106 | Handprint matchB = thumbMapB.get(thumb); 107 | 108 | // found exact match 109 | handsA.remove(matchA); 110 | handsB.remove(matchB); 111 | 112 | matches.add(new ExactMatch(matchA, matchB)); 113 | 114 | StringBuffer str = new StringBuffer(); 115 | str.append("Found exact match between "); 116 | // str.append(matchA.getFile().toString()); 117 | str.append(" and "); 118 | // str.append(matchB.getFile().toString()); 119 | System.out.println(str.toString()); 120 | } 121 | } 122 | } 123 | 124 | private void thumbprintTasks(final Collection handsA, 125 | final Collection handsB, final TreeMap thumbMapA, 126 | final TreeMap thumbMapB) { 127 | 128 | final CountDownLatch doneSignal = new CountDownLatch(2); 129 | final ExecutorService executor = Executors.newFixedThreadPool(2); 130 | 131 | final class ThumbRunnable implements Runnable { 132 | private final Collection hands; 133 | private final TreeMap map; 134 | 135 | public ThumbRunnable(Collection hands, TreeMap map) { 136 | super(); 137 | this.hands = hands; 138 | this.map = map; 139 | } 140 | 141 | public void run() { 142 | for (Handprint hand : hands) { 143 | map.put(hand.getPalm(), hand); 144 | System.out.print("."); 145 | System.out.flush(); 146 | } 147 | doneSignal.countDown(); 148 | } 149 | } 150 | 151 | executor.execute(new ThumbRunnable(handsA, thumbMapA)); 152 | executor.execute(new ThumbRunnable(handsB, thumbMapB)); 153 | 154 | try { 155 | doneSignal.await(); // wait for all to finish 156 | } catch (InterruptedException ie) { 157 | } 158 | executor.shutdown(); 159 | 160 | } 161 | 162 | private void findPartialMatches(Collection handsA, 
Collection handsB) { 163 | System.out.println("handprinting " + (handsA.size() + handsB.size()) + " files"); 164 | 165 | // build all fingers 166 | TreeMap handMapA = new TreeMap(); 167 | TreeMap handMapB = new TreeMap(); 168 | 169 | handprintTasks(handsA, handsB, handMapA, handMapB); 170 | 171 | // print intersection 172 | List fingersA = new ArrayList(handMapA.keySet()); 173 | for (Long finger : fingersA) { 174 | if (handMapB.containsKey(finger)) { 175 | Handprint matchA = handMapA.get(finger); 176 | Handprint matchB = handMapB.get(finger); 177 | 178 | // found partial match 179 | handsA.remove(matchA); 180 | handsB.remove(matchB); 181 | 182 | matches.add(new PartialMatch(matchA, matchB)); 183 | 184 | StringBuffer str = new StringBuffer(); 185 | str.append("Found partial match between "); 186 | //str.append(matchA.getFile().toString()); 187 | str.append(" and "); 188 | //str.append(matchB.getFile().toString()); 189 | str.append(" with similarity " + (100.0 * matchA.getSimilarity(matchB))); 190 | System.out.println(str.toString()); 191 | } 192 | } 193 | } 194 | 195 | private void handprintTasks(Collection handsA, Collection handsB, 196 | TreeMap handMapA, TreeMap handMapB) { 197 | 198 | final CountDownLatch doneSignal = new CountDownLatch(2); 199 | final ExecutorService executor = Executors.newFixedThreadPool(2); 200 | 201 | final class HandRunnable implements Runnable { 202 | private final Collection hands; 203 | private final TreeMap map; 204 | 205 | public HandRunnable(Collection hands, TreeMap map) { 206 | super(); 207 | this.hands = hands; 208 | this.map = map; 209 | } 210 | 211 | public void run() { 212 | for (Handprint hand : hands) { 213 | for (Long finger : hand.getHandFingers().keySet()) { 214 | map.put(finger, hand); 215 | } 216 | System.out.print("."); 217 | System.out.flush(); 218 | } 219 | doneSignal.countDown(); 220 | } 221 | } 222 | 223 | executor.execute(new HandRunnable(handsA, handMapA)); 224 | executor.execute(new HandRunnable(handsB, 
handMapB)); 225 | 226 | try { 227 | doneSignal.await(); // wait for all to finish 228 | } catch (InterruptedException ie) { 229 | } 230 | executor.shutdown(); 231 | } 232 | 233 | private void findNonMatches(Collection handsA, Collection handsB) { 234 | for (Handprint hand : handsA) { 235 | matches.add(new NonMatch(hand, null)); 236 | 237 | StringBuffer str = new StringBuffer(); 238 | str.append("Found no match for "); 239 | //str.append(hand.getFile().toString()); 240 | System.out.println(str.toString()); 241 | } 242 | 243 | for (Handprint hand : handsB) { 244 | matches.add(new NonMatch(null, hand)); 245 | 246 | StringBuffer str = new StringBuffer(); 247 | str.append("Found no match for "); 248 | //str.append(hand.getFile().toString()); 249 | System.out.println(str.toString()); 250 | } 251 | } 252 | 253 | private static Collection getHandsFromPath(final Polynomial p, final String path) 254 | throws FileNotFoundException { 255 | File dir = new File(path); 256 | List files = FileListing.getFileListing(dir); 257 | List hands = new ArrayList(); 258 | HandPrintFactory factory = Handprints.newFactory(p); 259 | for (File file : files) { 260 | if (!file.isFile()) 261 | continue; 262 | hands.add(factory.newHandprint(new FileInputStream(file))); 263 | } 264 | 265 | return hands; 266 | } 267 | } 268 | -------------------------------------------------------------------------------- /pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4.0.0 4 | 5 | 6 | org.sonatype.oss 7 | oss-parent 8 | 7 9 | 10 | 11 | Rabin Fingerprint 12 | A collision resistent fingerprinting method. 
13 | org.rabinfingerprint 14 | rabinfingerprint 15 | 1.0.0-SNAPSHOT 16 | https://github.com/themadcreator/rabinfingerprint 17 | 2010 18 | 19 | 20 | 21 | http://www.apache.org/licenses/LICENSE-2.0 22 | Apache License, Version 2.0 23 | 24 | 25 | 26 | 27 | https://github.com/themadcreator/rabinfingerprint/issues 28 | 29 | 30 | 31 | scm:git:git@github.com:themadcreator/rabinfingerprint.git 32 | scm:git:git@github.com:themadcreator/rabinfingerprint.git 33 | scm:git:git@github.com:themadcreator/rabinfingerprint.git 34 | 35 | 36 | 37 | 38 | themadcreator 39 | Bill Dwyer 40 | themadcreator@gmail.com 41 | 42 | creator 43 | developer 44 | 45 | 46 | 47 | ianbrandt 48 | Ian Brandt 49 | ian@ianbrandt.com 50 | http://ianbrandt.com/ 51 | http://ianbrandt.com/ 52 | 53 | developer 54 | 55 | -8 56 | 57 | 58 | 59 | 60 | 2.2.1 61 | 62 | 63 | 64 | yyyyMMdd-HHmmss 65 | UTF-8 66 | UTF-8 67 | 68 | 69 | 70 | 71 | release-sign-artifacts 72 | 73 | 74 | performRelease 75 | true 76 | 77 | 78 | 79 | 80 | 81 | org.apache.maven.plugins 82 | maven-gpg-plugin 83 | 1.5 84 | 85 | 86 | sign-artifacts 87 | verify 88 | 89 | sign 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | com.google.guava 102 | guava 103 | 16.0.1 104 | 105 | 106 | junit 107 | junit 108 | 4.11 109 | test 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | maven-assembly-plugin 118 | 2.4 119 | 120 | 121 | maven-clean-plugin 122 | 2.5 123 | 124 | 125 | maven-compiler-plugin 126 | 3.1 127 | 128 | 129 | maven-dependency-plugin 130 | 2.8 131 | 132 | 133 | maven-deploy-plugin 134 | 2.8.1 135 | 136 | 137 | maven-enforcer-plugin 138 | 1.3.1 139 | 140 | 141 | maven-failsafe-plugin 142 | 2.16 143 | 144 | 145 | maven-install-plugin 146 | 2.5.1 147 | 148 | 149 | maven-jar-plugin 150 | 2.4 151 | 152 | 153 | maven-javadoc-plugin 154 | 2.10.3 155 | 156 | 157 | maven-release-plugin 158 | 2.4.2 159 | 160 | forked-path 161 | false 162 | ${arguments} -Psonatype-oss-release 163 | 164 | 165 | 166 | maven-resources-plugin 167 | 2.6 168 | 169 | 
170 | maven-site-plugin 171 | 3.3 172 | 173 | 174 | maven-source-plugin 175 | 2.2.1 176 | 177 | 178 | maven-surefire-plugin 179 | 2.16 180 | 181 | 182 | org.codehaus.mojo 183 | animal-sniffer-maven-plugin 184 | 1.10 185 | 186 | 187 | org.codehaus.mojo 188 | versions-maven-plugin 189 | 2.1 190 | 191 | 193 | 194 | org.eclipse.m2e 195 | lifecycle-mapping 196 | 1.0.0 197 | 198 | 199 | 200 | 201 | 202 | org.apache.maven.plugins 203 | maven-enforcer-plugin 204 | [1.0,) 205 | 206 | enforce 207 | 208 | 209 | 210 | 211 | 212 | 213 | 214 | 215 | 216 | 217 | 218 | org.apache.maven.plugins 219 | maven-gpg-plugin 220 | 1.6 221 | 222 | 223 | 224 | 225 | 226 | 227 | maven-enforcer-plugin 228 | 229 | 230 | validate 231 | 232 | enforce 233 | 234 | 235 | 236 | 237 | 1.6 238 | 239 | 240 | 241 | 242 | 243 | 244 | 245 | maven-compiler-plugin 246 | 247 | 1.6 248 | 1.6 249 | 250 | 251 | 252 | maven-source-plugin 253 | 254 | 255 | attach-sources 256 | 257 | jar 258 | 259 | 260 | 261 | 262 | 263 | maven-javadoc-plugin 264 | 265 | true 266 | false 267 | 268 | 269 | 270 | attach-javadocs 271 | 272 | jar 273 | 274 | 275 | 276 | 277 | 278 | org.codehaus.mojo 279 | animal-sniffer-maven-plugin 280 | 281 | 282 | check-java16 283 | test 284 | 285 | check 286 | 287 | 288 | 289 | org.codehaus.mojo.signature 290 | java16 291 | 1.0 292 | 293 | 294 | 295 | 296 | 297 | 298 | maven-failsafe-plugin 299 | 300 | 301 | default-integration-test 302 | integration-test 303 | 304 | integration-test 305 | 306 | 307 | 308 | default-verify 309 | verify 310 | 311 | verify 312 | 313 | 314 | 315 | 316 | 317 | maven-assembly-plugin 318 | 319 | 320 | 321 | org.rabinfingerprint.Main 322 | 323 | 324 | 325 | jar-with-dependencies 326 | 327 | 328 | 329 | 330 | assemble-executable-jar 331 | package 332 | 333 | single 334 | 335 | 336 | 337 | 338 | 339 | maven-release-plugin 340 | 341 | 342 | maven-site-plugin 343 | 344 | 345 | 346 | 347 | 348 | 349 | 350 | maven-project-info-reports-plugin 351 | 2.7 352 | 353 | 354 | 355 | 
356 | 357 | -------------------------------------------------------------------------------- /src/main/java/org/rabinfingerprint/polynomial/Polynomial.java: -------------------------------------------------------------------------------- 1 | package org.rabinfingerprint.polynomial; 2 | 3 | import java.math.BigInteger; 4 | import java.util.Comparator; 5 | import java.util.Random; 6 | import java.util.TreeSet; 7 | 8 | /** 9 | * An immutable polynomial in the finite field GF(2^k) 10 | * 11 | * Supports standard arithmetic in the field, as well as reducibility tests. 12 | */ 13 | public class Polynomial implements Arithmetic< Polynomial >, Comparable< Polynomial > { 14 | 15 | /** number of elements in the finite field GF(2^k) */ 16 | public static final BigInteger Q = BigInteger.valueOf(2L); 17 | 18 | /** the polynomial "x" */ 19 | public static final Polynomial X = Polynomial.createFromLong(2L); 20 | 21 | /** the polynomial "1" */ 22 | public static final Polynomial ONE = Polynomial.createFromLong(1L); 23 | 24 | /** a reverse comparator so that polynomials are printed out correctly */ 25 | private static final class ReverseComparator implements Comparator { 26 | public int compare(BigInteger o1, BigInteger o2) { 27 | return -1 * o1.compareTo(o2); 28 | } 29 | } 30 | 31 | /** 32 | * Constructs a polynomial using the bits from a long. Note that Java does 33 | * not support unsigned longs. 
34 | */ 35 | public static Polynomial createFromLong(long l) { 36 | TreeSet dgrs = createDegreesCollection(); 37 | for (int i = 0; i < 64; i++) { 38 | if (((l >> i) & 1) == 1) 39 | dgrs.add(BigInteger.valueOf(i)); 40 | } 41 | return new Polynomial(dgrs); 42 | } 43 | 44 | public static Polynomial createFromBytes(byte[] bytes) { 45 | TreeSet dgrs = createDegreesCollection(); 46 | int degree = 0; 47 | for (int i = bytes.length - 1; i >= 0; i--) { 48 | for (int j = 0; j < 8; j++) { 49 | if ((((bytes[i] >> j) & 1) == 1)) { 50 | dgrs.add(BigInteger.valueOf(degree)); 51 | } 52 | degree++; 53 | } 54 | } 55 | return new Polynomial(dgrs); 56 | } 57 | 58 | /** 59 | * Constructs a polynomial using the bits from an array of bytes, limiting 60 | * the degree to the specified size. 61 | * 62 | * We set the final degree to ensure a monic polynomial of the correct 63 | * degree. 64 | */ 65 | public static Polynomial createFromBytes(byte[] bytes, int degree) { 66 | TreeSet dgrs = createDegreesCollection(); 67 | for (int i = 0; i < degree; i++) { 68 | if (Polynomials.getBit(bytes, i)) 69 | dgrs.add(BigInteger.valueOf(i)); 70 | } 71 | dgrs.add(BigInteger.valueOf(degree)); 72 | return new Polynomial(dgrs); 73 | } 74 | 75 | /** 76 | * Constructs a random polynomial of degree "degree" 77 | */ 78 | public static Polynomial createRandom(int degree) { 79 | Random random = new Random(); 80 | byte[] bytes = new byte[(degree / 8) + 1]; 81 | random.nextBytes(bytes); 82 | return createFromBytes(bytes, degree); 83 | } 84 | 85 | /** 86 | * Finds a random irreducible polynomial of degree "degree" 87 | */ 88 | public static Polynomial createIrreducible(int degree) { 89 | while (true) { 90 | Polynomial p = createRandom(degree); 91 | if (p.getReducibility() == Reducibility.IRREDUCIBLE) 92 | return p; 93 | } 94 | } 95 | 96 | /** 97 | * An enumeration representing the reducibility of the polynomial 98 | * 99 | * A polynomial p(x) in GF(2^k) is called irreducible over GF[2^k] if it is 100 | * 
non-constant and cannot be represented as the product of two or more 101 | * non-constant polynomials from GF(2^k). 102 | * 103 | * http://en.wikipedia.org/wiki/Irreducible_element 104 | */ 105 | public static enum Reducibility { 106 | REDUCIBLE, IRREDUCIBLE 107 | }; 108 | 109 | /** 110 | * A (sorted) set of the degrees of the terms of the polynomial. The 111 | * sortedness helps quickly compute the degree as well as print out the 112 | * terms in order. The O(nlogn) performance of insertions and deletions 113 | * might actually hurt us, though, so we might consider moving to a HashSet 114 | */ 115 | private final TreeSet degrees; 116 | 117 | /** 118 | * Construct a new, empty polynomial 119 | */ 120 | public Polynomial() { 121 | this.degrees = createDegreesCollection(); 122 | } 123 | 124 | /** 125 | * Construct a new polynomial copy of the input argument 126 | */ 127 | public Polynomial(Polynomial p) { 128 | this(p.degrees); 129 | } 130 | 131 | /** 132 | * Construct a new polynomial from a collection of degrees 133 | */ 134 | @SuppressWarnings("unchecked") 135 | protected Polynomial(TreeSet degrees) { 136 | this.degrees = (TreeSet) degrees.clone(); 137 | } 138 | 139 | /** 140 | * Factory for create the degrees collection. 141 | */ 142 | protected static TreeSet createDegreesCollection() { 143 | return new TreeSet(new ReverseComparator()); 144 | } 145 | 146 | /** 147 | * Factory for create the copy of current degrees collection. 148 | */ 149 | @SuppressWarnings("unchecked") 150 | protected TreeSet createDegreesCollectionCopy() { 151 | return (TreeSet) this.degrees.clone(); 152 | } 153 | 154 | /** 155 | * Returns the degree of the highest term or -1 otherwise. 156 | */ 157 | public BigInteger degree() { 158 | if (degrees.isEmpty()) 159 | return BigInteger.ONE.negate(); 160 | return degrees.first(); 161 | } 162 | 163 | /** 164 | * Tests if the polynomial is empty, i.e. 
it has no terms 165 | */ 166 | public boolean isEmpty() { 167 | return degrees.isEmpty(); 168 | } 169 | 170 | /** 171 | * Computes (this + that) in GF(2^k) 172 | */ 173 | public Polynomial add(Polynomial that) { 174 | return xor(that); 175 | } 176 | 177 | /** 178 | * Computes (this - that) in GF(2^k) 179 | */ 180 | public Polynomial subtract(Polynomial that) { 181 | return xor(that); 182 | } 183 | 184 | /** 185 | * Computes (this * that) in GF(2^k) 186 | */ 187 | public Polynomial multiply(Polynomial that) { 188 | TreeSet dgrs = createDegreesCollection(); 189 | for (BigInteger pa : this.degrees) { 190 | for (BigInteger pb : that.degrees) { 191 | BigInteger sum = pa.add(pb); 192 | // xor the result 193 | if (dgrs.contains(sum)) 194 | dgrs.remove(sum); 195 | else 196 | dgrs.add(sum); 197 | } 198 | } 199 | return new Polynomial(dgrs); 200 | } 201 | 202 | /** 203 | * Computes (this & that) in GF(2^k) 204 | */ 205 | public Polynomial and(Polynomial that) { 206 | TreeSet dgrs = this.createDegreesCollectionCopy(); 207 | dgrs.retainAll(that.degrees); 208 | return new Polynomial(dgrs); 209 | } 210 | 211 | /** 212 | * Computes (this | that) in GF(2^k) 213 | */ 214 | public Polynomial or(Polynomial that) { 215 | TreeSet dgrs = this.createDegreesCollectionCopy(); 216 | dgrs.addAll(that.degrees); 217 | return new Polynomial(dgrs); 218 | } 219 | 220 | /** 221 | * Computes (this ^ that) in GF(2^k) 222 | */ 223 | public Polynomial xor(Polynomial that) { 224 | TreeSet dgrs0 = this.createDegreesCollectionCopy(); 225 | dgrs0.removeAll(that.degrees); 226 | TreeSet dgrs1 = that.createDegreesCollectionCopy(); 227 | dgrs1.removeAll(this.degrees); 228 | dgrs1.addAll(dgrs0); 229 | return new Polynomial(dgrs1); 230 | } 231 | 232 | /** 233 | * Computes (this mod that) in GF(2^k) using synthetic division 234 | */ 235 | public Polynomial mod(Polynomial that) { 236 | BigInteger da = this.degree(); 237 | BigInteger db = that.degree(); 238 | Polynomial register = new Polynomial(this.degrees); 239 
| for (BigInteger i = da.subtract(db); i.compareTo(BigInteger.ZERO) >= 0; i = i.subtract(BigInteger.ONE)) { 240 | if (register.hasDegree(i.add(db))) { 241 | Polynomial shifted = that.shiftLeft(i); 242 | register = register.xor(shifted); 243 | } 244 | } 245 | return register; 246 | } 247 | 248 | /** 249 | * Computes (this << shift) in GF(2^k) 250 | */ 251 | public Polynomial shiftLeft(BigInteger shift) { 252 | TreeSet dgrs = createDegreesCollection(); 253 | for (BigInteger degree : degrees) { 254 | BigInteger shifted = degree.add(shift); 255 | dgrs.add(shifted); 256 | } 257 | return new Polynomial(dgrs); 258 | } 259 | 260 | /** 261 | * Computes (this >> shift) in GF(2^k) 262 | */ 263 | public Polynomial shiftRight(BigInteger shift) { 264 | TreeSet dgrs = createDegreesCollection(); 265 | for (BigInteger degree : degrees) { 266 | BigInteger shifted = degree.subtract(shift); 267 | if (shifted.compareTo(BigInteger.ZERO) < 0) 268 | continue; 269 | dgrs.add(shifted); 270 | } 271 | return new Polynomial(dgrs); 272 | } 273 | 274 | /** 275 | * Tests if there exists a term with degree k 276 | */ 277 | public boolean hasDegree(BigInteger k) { 278 | return degrees.contains(k); 279 | } 280 | 281 | /** 282 | * Sets the coefficient of the term with degree k to 1 283 | */ 284 | public Polynomial setDegree(BigInteger k) { 285 | TreeSet dgrs = createDegreesCollection(); 286 | dgrs.addAll(this.degrees); 287 | dgrs.add(k); 288 | return new Polynomial(dgrs); 289 | } 290 | 291 | /** 292 | * Sets the coefficient of the term with degree k to 0 293 | */ 294 | public Polynomial clearDegree(BigInteger k) { 295 | TreeSet dgrs = createDegreesCollection(); 296 | dgrs.addAll(this.degrees); 297 | dgrs.remove(k); 298 | return new Polynomial(dgrs); 299 | } 300 | 301 | /** 302 | * Toggles the coefficient of the term with degree k 303 | */ 304 | public Polynomial toggleDegree(BigInteger k) { 305 | TreeSet dgrs = createDegreesCollection(); 306 | dgrs.addAll(this.degrees); 307 | if (dgrs.contains(k)) { 
308 | dgrs.remove(k); 309 | } else { 310 | dgrs.add(k); 311 | } 312 | return new Polynomial(dgrs); 313 | } 314 | 315 | /** 316 | * Computes (this^e mod m). 317 | * 318 | * This algorithm requires at most this.degree() + m.degree() space. 319 | * 320 | * http://en.wikipedia.org/wiki/Modular_exponentiation 321 | */ 322 | public Polynomial modPow(BigInteger e, Polynomial m) { 323 | Polynomial result = Polynomial.ONE; 324 | Polynomial b = new Polynomial(this); 325 | while (e.bitCount() != 0) { 326 | if (e.testBit(0)) { 327 | result = result.multiply(b).mod(m); 328 | } 329 | e = e.shiftRight(1); 330 | b = b.multiply(b).mod(m); 331 | } 332 | return result; 333 | } 334 | 335 | /** 336 | * Computes the greatest common divisor between polynomials using Euclid's 337 | * algorithm 338 | * 339 | * http://en.wikipedia.org/wiki/Euclids_algorithm 340 | */ 341 | public Polynomial gcd(Polynomial that) { 342 | Polynomial a = new Polynomial(this); 343 | while (!that.isEmpty()) { 344 | Polynomial t = new Polynomial(that); 345 | that = a.mod(that); 346 | a = t; 347 | } 348 | return a; 349 | } 350 | 351 | /** 352 | * Construct a BigInteger whose value represents this polynomial. This can 353 | * lose information if the degrees of the terms are larger than 354 | * Integer.MAX_VALUE; 355 | */ 356 | public BigInteger toBigInteger() { 357 | BigInteger b = BigInteger.ZERO; 358 | for (BigInteger degree : degrees) { 359 | b = b.setBit((int) degree.longValue()); 360 | } 361 | return b; 362 | } 363 | 364 | /** 365 | * Technically accurate but slow as hell. 
366 | */ 367 | public BigInteger toBigIntegerAccurate() { 368 | BigInteger b = BigInteger.ZERO; 369 | for (BigInteger degree : degrees) { 370 | BigInteger term = BigInteger.ONE; 371 | for (BigInteger i = BigInteger.ONE; i.compareTo(degree) <= 0; i = i.add(BigInteger.ONE)) { 372 | term = term.shiftLeft(1); 373 | } 374 | b.add(term); 375 | } 376 | return b; 377 | } 378 | 379 | /** 380 | * Returns a string of hex characters representing this polynomial 381 | */ 382 | public String toHexString() { 383 | return toBigInteger().toString(16).toUpperCase(); 384 | } 385 | 386 | /** 387 | * Returns a string of digits presenting this polynomial 388 | */ 389 | public String toDecimalString() { 390 | return toBigInteger().toString(); 391 | } 392 | 393 | /** 394 | * Returns a string of binary digits presenting this polynomial 395 | */ 396 | public String toBinaryString() { 397 | StringBuffer str = new StringBuffer(); 398 | for (BigInteger deg = degree(); deg.compareTo(BigInteger.ZERO) >= 0; deg = deg 399 | .subtract(BigInteger.ONE)) { 400 | if (degrees.contains(deg)) { 401 | str.append("1"); 402 | } else { 403 | str.append("0"); 404 | } 405 | } 406 | return str.toString(); 407 | } 408 | 409 | /** 410 | * Returns standard ascii representation of this polynomial in the form: 411 | * 412 | * e.g.: x^8 + x^4 + x^3 + x + 1 413 | */ 414 | public String toPolynomialString() { 415 | StringBuffer str = new StringBuffer(); 416 | for (BigInteger degree : degrees) { 417 | if (str.length() != 0) { 418 | str.append(" + "); 419 | } 420 | if (degree.compareTo(BigInteger.ZERO) == 0) { 421 | str.append("1"); 422 | } else { 423 | str.append("x^" + degree); 424 | } 425 | } 426 | return str.toString(); 427 | } 428 | 429 | /** 430 | * Default toString override uses the ascii representation 431 | */ 432 | @Override 433 | public String toString() { 434 | return toPolynomialString(); 435 | } 436 | 437 | /** 438 | * Tests the reducibility of the polynomial 439 | */ 440 | public boolean isReducible() { 441 
| return getReducibility() == Reducibility.REDUCIBLE; 442 | } 443 | 444 | /** 445 | * Tests the reducibility of the polynomial 446 | */ 447 | public Reducibility getReducibility() { 448 | // test trivial cases 449 | if (this.compareTo(Polynomial.ONE) == 0) 450 | return Reducibility.REDUCIBLE; 451 | if (this.compareTo(Polynomial.X) == 0) 452 | return Reducibility.REDUCIBLE; 453 | 454 | // do full-on reducibility test 455 | return getReducibilityBenOr(); 456 | } 457 | 458 | /** 459 | * BenOr Reducibility Test 460 | * 461 | * Tests and Constructions of Irreducible Polynomials over Finite Fields 462 | * (1997) Shuhong Gao, Daniel Panario 463 | * 464 | * http://citeseer.ist.psu.edu/cache/papers/cs/27167/http:zSzzSzwww.math.clemson.eduzSzfacultyzSzGaozSzpaperszSzGP97a.pdf/gao97tests.pdf 465 | */ 466 | protected Reducibility getReducibilityBenOr() { 467 | final long degree = this.degree().longValue(); 468 | for (int i = 1; i <= (int) (degree / 2); i++) { 469 | Polynomial b = reduceExponent(i); 470 | Polynomial g = this.gcd(b); 471 | if (g.compareTo(Polynomial.ONE) != 0) 472 | return Reducibility.REDUCIBLE; 473 | } 474 | 475 | return Reducibility.IRREDUCIBLE; 476 | } 477 | 478 | /** 479 | * Rabin's Reducibility Test 480 | * 481 | * This requires the distinct prime factors of the degree, so we don't use 482 | * it. 
But this could be faster for prime degree polynomials 483 | */ 484 | protected Reducibility getReducibilityRabin(int[] factors) { 485 | int degree = (int) degree().longValue(); 486 | for (int i = 0; i < factors.length; i++) { 487 | int n_i = factors[i]; 488 | Polynomial b = reduceExponent(n_i); 489 | Polynomial g = this.gcd(b); 490 | if (g.compareTo(Polynomial.ONE) != 0) 491 | return Reducibility.REDUCIBLE; 492 | } 493 | 494 | Polynomial g = reduceExponent(degree); 495 | if (!g.isEmpty()) 496 | return Reducibility.REDUCIBLE; 497 | 498 | return Reducibility.IRREDUCIBLE; 499 | } 500 | 501 | /** 502 | * Computes ( x^(2^p) - x ) mod f 503 | * 504 | * This function is useful for computing the reducibility of the polynomial 505 | */ 506 | private Polynomial reduceExponent(final int p) { 507 | // compute (x^q^p mod f) 508 | BigInteger q_to_p = Q.pow(p); 509 | Polynomial x_to_q_to_p = X.modPow(q_to_p, this); 510 | 511 | // subtract (x mod f) 512 | return x_to_q_to_p.xor(X).mod(this); 513 | } 514 | 515 | /** 516 | * Compares this polynomial to the other 517 | */ 518 | public int compareTo(Polynomial o) { 519 | int cmp = degree().compareTo(o.degree()); 520 | if (cmp != 0) return cmp; 521 | // get first degree difference 522 | Polynomial x = this.xor(o); 523 | if (x.isEmpty()) return 0; 524 | return this.hasDegree(x.degree()) ? 1 : -1; 525 | } 526 | } 527 | --------------------------------------------------------------------------------