├── README.txt
├── src
│   └── util
│       └── hash
│           └── MurmurHash3.java
└── test
    └── util
        └── hash
            ├── TestHashSpeed.java
            └── TestMurmurHash3.java
/README.txt:
--------------------------------------------------------------------------------
1 | Some random useful Java classes.
2 |
3 | These are things too small for actual jars or releases
4 | and are meant to just be copied into your own project.
5 | As such, unless otherwise noted, this code has been
6 | authored by Yonik Seeley and placed into the public domain
7 | to allow people to use or license as they see fit.
8 |
9 | util/hash/MurmurHash3:
10 | A fast, high quality hash function. This version can also calculate
11 | the hash of the UTF-8 encoding of a String without converting to a UTF-8 byte[].
12 |
13 |
--------------------------------------------------------------------------------
/src/util/hash/MurmurHash3.java:
--------------------------------------------------------------------------------
1 | package util.hash;
2 |
3 | /**
4 | * The MurmurHash3 algorithm was created by Austin Appleby and placed in the public domain.
5 | * This Java port was authored by Yonik Seeley and also placed into the public domain.
6 | * The author hereby disclaims copyright to this source code.
7 | *
8 | * This produces exactly the same hash values as the final C++
9 | * version of MurmurHash3 and is thus suitable for producing the same hash values across
10 | * platforms.
11 | *
12 | * The 32 bit x86 version of this hash should be the fastest variant for relatively short keys like ids.
13 | * murmurhash3_x64_128 is a good choice for longer strings or if you need more than 32 bits of hash.
14 | *
15 | * Note - The x86 and x64 versions do _not_ produce the same results, as the
16 | * algorithms are optimized for their respective platforms.
17 | *
18 | * See http://github.com/yonik/java_util for future updates to this file.
19 | */
20 | public final class MurmurHash3 {
21 |
22 | /** 128 bits of state */
23 | public static final class LongPair {
24 | public long val1;
25 | public long val2;
26 | }
27 |
28 | public static final int fmix32(int h) {
29 | h ^= h >>> 16;
30 | h *= 0x85ebca6b;
31 | h ^= h >>> 13;
32 | h *= 0xc2b2ae35;
33 | h ^= h >>> 16;
34 | return h;
35 | }
36 |
37 | public static final long fmix64(long k) {
38 | k ^= k >>> 33;
39 | k *= 0xff51afd7ed558ccdL;
40 | k ^= k >>> 33;
41 | k *= 0xc4ceb9fe1a85ec53L;
42 | k ^= k >>> 33;
43 | return k;
44 | }
45 |
46 | /** Gets a long from a byte buffer in little endian byte order. */
47 | public static final long getLongLittleEndian(byte[] buf, int offset) {
48 | return ((long)buf[offset+7] << 56) // no mask needed
49 | | ((buf[offset+6] & 0xffL) << 48)
50 | | ((buf[offset+5] & 0xffL) << 40)
51 | | ((buf[offset+4] & 0xffL) << 32)
52 | | ((buf[offset+3] & 0xffL) << 24)
53 | | ((buf[offset+2] & 0xffL) << 16)
54 | | ((buf[offset+1] & 0xffL) << 8)
55 | | ((buf[offset ] & 0xffL)); // no shift needed
56 | }
57 |
58 |
59 | /** Returns the MurmurHash3_x86_32 hash. */
60 | @SuppressWarnings("fallthrough")
61 | public static int murmurhash3_x86_32(byte[] data, int offset, int len, int seed) {
62 |
63 | final int c1 = 0xcc9e2d51;
64 | final int c2 = 0x1b873593;
65 |
66 | int h1 = seed;
67 | int roundedEnd = offset + (len & 0xfffffffc); // round down to 4 byte block
68 |
69 | for (int i=offset; i>> 17); // ROTL32(k1,15);
74 | k1 *= c2;
75 |
76 | h1 ^= k1;
77 | h1 = (h1 << 13) | (h1 >>> 19); // ROTL32(h1,13);
78 | h1 = h1*5+0xe6546b64;
79 | }
80 |
81 | // tail
82 | int k1 = 0;
83 |
84 | switch(len & 0x03) {
85 | case 3:
86 | k1 = (data[roundedEnd + 2] & 0xff) << 16;
87 | // fallthrough
88 | case 2:
89 | k1 |= (data[roundedEnd + 1] & 0xff) << 8;
90 | // fallthrough
91 | case 1:
92 | k1 |= (data[roundedEnd] & 0xff);
93 | k1 *= c1;
94 | k1 = (k1 << 15) | (k1 >>> 17); // ROTL32(k1,15);
95 | k1 *= c2;
96 | h1 ^= k1;
97 | }
98 |
99 | // finalization
100 | h1 ^= len;
101 |
102 | // fmix(h1);
103 | h1 ^= h1 >>> 16;
104 | h1 *= 0x85ebca6b;
105 | h1 ^= h1 >>> 13;
106 | h1 *= 0xc2b2ae35;
107 | h1 ^= h1 >>> 16;
108 |
109 | return h1;
110 | }
111 |
112 |
113 | /** Returns the MurmurHash3_x86_32 hash of the UTF-8 bytes of the String without actually encoding
114 | * the string to a temporary buffer. This is more than 2x faster than hashing the result
115 | * of String.getBytes().
116 | */
117 | public static int murmurhash3_x86_32(CharSequence data, int offset, int len, int seed) {
118 |
119 | final int c1 = 0xcc9e2d51;
120 | final int c2 = 0x1b873593;
121 |
122 | int h1 = seed;
123 |
124 | int pos = offset;
125 | int end = offset + len;
126 | int k1 = 0;
127 | int k2 = 0;
128 | int shift = 0;
129 | int bits = 0;
130 | int nBytes = 0; // length in UTF8 bytes
131 |
132 |
133 | while (pos < end) {
134 | int code = data.charAt(pos++);
135 | if (code < 0x80) {
136 | k2 = code;
137 | bits = 8;
138 |
139 | /***
140 | // optimized ascii implementation (currently slower!!! code size?)
141 | if (shift == 24) {
142 | k1 = k1 | (code << 24);
143 |
144 | k1 *= c1;
145 | k1 = (k1 << 15) | (k1 >>> 17); // ROTL32(k1,15);
146 | k1 *= c2;
147 |
148 | h1 ^= k1;
149 | h1 = (h1 << 13) | (h1 >>> 19); // ROTL32(h1,13);
150 | h1 = h1*5+0xe6546b64;
151 |
152 | shift = 0;
153 | nBytes += 4;
154 | k1 = 0;
155 | } else {
156 | k1 |= code << shift;
157 | shift += 8;
158 | }
159 | continue;
160 | ***/
161 |
162 | }
163 | else if (code < 0x800) {
164 | k2 = (0xC0 | (code >> 6))
165 | | ((0x80 | (code & 0x3F)) << 8);
166 | bits = 16;
167 | }
168 | else if (code < 0xD800 || code > 0xDFFF || pos>=end) {
169 | // we check for pos>=end to encode an unpaired surrogate as 3 bytes.
170 | k2 = (0xE0 | (code >> 12))
171 | | ((0x80 | ((code >> 6) & 0x3F)) << 8)
172 | | ((0x80 | (code & 0x3F)) << 16);
173 | bits = 24;
174 | } else {
175 | // surrogate pair
176 | // int utf32 = pos < end ? (int) data.charAt(pos++) : 0;
177 | int utf32 = (int) data.charAt(pos++);
178 | utf32 = ((code - 0xD7C0) << 10) + (utf32 & 0x3FF);
179 | k2 = (0xff & (0xF0 | (utf32 >> 18)))
180 | | ((0x80 | ((utf32 >> 12) & 0x3F))) << 8
181 | | ((0x80 | ((utf32 >> 6) & 0x3F))) << 16
182 | | (0x80 | (utf32 & 0x3F)) << 24;
183 | bits = 32;
184 | }
185 |
186 |
187 | k1 |= k2 << shift;
188 |
189 | // int used_bits = 32 - shift; // how many bits of k2 were used in k1.
190 | // int unused_bits = bits - used_bits; // (bits-(32-shift)) == bits+shift-32 == bits-newshift
191 |
192 | shift += bits;
193 | if (shift >= 32) {
194 | // mix after we have a complete word
195 |
196 | k1 *= c1;
197 | k1 = (k1 << 15) | (k1 >>> 17); // ROTL32(k1,15);
198 | k1 *= c2;
199 |
200 | h1 ^= k1;
201 | h1 = (h1 << 13) | (h1 >>> 19); // ROTL32(h1,13);
202 | h1 = h1*5+0xe6546b64;
203 |
204 | shift -= 32;
205 | // unfortunately, java won't let you shift 32 bits off, so we need to check for 0
206 | if (shift != 0) {
207 | k1 = k2 >>> (bits-shift); // bits used == bits - newshift
208 | } else {
209 | k1 = 0;
210 | }
211 | nBytes += 4;
212 | }
213 |
214 | } // inner
215 |
216 | // handle tail
217 | if (shift > 0) {
218 | nBytes += shift >> 3;
219 | k1 *= c1;
220 | k1 = (k1 << 15) | (k1 >>> 17); // ROTL32(k1,15);
221 | k1 *= c2;
222 | h1 ^= k1;
223 | }
224 |
225 | // finalization
226 | h1 ^= nBytes;
227 |
228 | // fmix(h1);
229 | h1 ^= h1 >>> 16;
230 | h1 *= 0x85ebca6b;
231 | h1 ^= h1 >>> 13;
232 | h1 *= 0xc2b2ae35;
233 | h1 ^= h1 >>> 16;
234 |
235 | return h1;
236 | }
237 |
238 |
239 | /** Returns the MurmurHash3_x64_128 hash, placing the result in "out". */
240 | @SuppressWarnings("fallthrough")
241 | public static void murmurhash3_x64_128(byte[] key, int offset, int len, int seed, LongPair out) {
242 | // The original algorithm does have a 32 bit unsigned seed.
243 | // We have to mask to match the behavior of the unsigned types and prevent sign extension.
244 | long h1 = seed & 0x00000000FFFFFFFFL;
245 | long h2 = seed & 0x00000000FFFFFFFFL;
246 |
247 | final long c1 = 0x87c37b91114253d5L;
248 | final long c2 = 0x4cf5ad432745937fL;
249 |
250 | int roundedEnd = offset + (len & 0xFFFFFFF0); // round down to 16 byte block
251 | for (int i=offset; i>3)&0x03);
34 | ret += MurmurHash3.murmurhash3_x86_32(arr, offset, len, i);
35 | }
36 | } else if (method.equals("slow_string")) {
37 | for (int i = 0; i>3)&0x03);
41 | byte[] utf8 = s.getBytes(utf8Charset);
42 | ret += MurmurHash3.murmurhash3_x86_32(utf8, offset, len, i);
43 | }
44 | } else if (method.equals("fast_string")) {
45 | for (int i = 0; i>3)&0x03);
49 | ret += MurmurHash3.murmurhash3_x86_32(s, offset, len, i);
50 | }
51 | } else {
52 | throw new RuntimeException("Unknown method " + method);
53 | }
54 |
55 | long end = System.currentTimeMillis();
56 |
57 | System.out.println("method="+method + " result="+ ret + " throughput=" + 1000 * ((double)size)*iter/(end-start) );
58 |
59 | }
60 |
61 | }
62 |
--------------------------------------------------------------------------------
/test/util/hash/TestMurmurHash3.java:
--------------------------------------------------------------------------------
1 | package util.hash;
2 |
3 | import junit.framework.TestCase;
4 |
5 | import java.nio.charset.Charset;
6 | import java.util.Random;
7 |
8 | /**
9 | * @author yonik
10 | */
11 | public class TestMurmurHash3 extends TestCase {
12 |
13 | public void testCorrectValues() throws Exception {
14 | byte[] bytes = "Now is the time for all good men to come to the aid of their country".getBytes("UTF-8");
15 | int hash=0;
16 | for (int i=0; i
118 | #include "MurmurHash3.h"
119 | using namespace std;
120 |
121 | int main(int argc, char** argv) {
122 | char* val = strdup("Now is the time for all good men to come to the aid of their country");
123 | int max = strlen(val);
124 | int hash=0;
125 | for (int i=0; i