├── README.txt ├── src └── util │ └── hash │ └── MurmurHash3.java └── test └── util └── hash ├── TestHashSpeed.java └── TestMurmurHash3.java /README.txt: -------------------------------------------------------------------------------- 1 | Some random useful Java classes. 2 | 3 | These are things too small for actual jars or releases 4 | and are meant to just be copied into your own project. 5 | As such, unless otherwise noted, this code has been 6 | authored by Yonik Seeley and placed into the public domain 7 | to allow people to use or license as they see fit. 8 | 9 | util/hash/MurmurHash3: 10 | A fast, high quality hash function. This version can also calculate 11 | the hash of the UTF-8 encoding of a String without converting to a UTF-8 byte[]. 12 | 13 | -------------------------------------------------------------------------------- /src/util/hash/MurmurHash3.java: -------------------------------------------------------------------------------- 1 | package util.hash; 2 | 3 | /** 4 | * The MurmurHash3 algorithm was created by Austin Appleby and placed in the public domain. 5 | * This java port was authored by Yonik Seeley and also placed into the public domain. 6 | * The author hereby disclaims copyright to this source code. 7 | *

8 | * This produces exactly the same hash values as the final C++ 9 | * version of MurmurHash3 and is thus suitable for producing the same hash values across 10 | * platforms. 11 | *

12 | * The 32 bit x86 version of this hash should be the fastest variant for relatively short keys like ids. 13 | * murmurhash3_x64_128 is a good choice for longer strings or if you need more than 32 bits of hash. 14 | *

15 | * Note - The x86 and x64 versions do _not_ produce the same results, as the 16 | * algorithms are optimized for their respective platforms. 17 | *

18 | * See http://github.com/yonik/java_util for future updates to this file. 19 | */ 20 | public final class MurmurHash3 { 21 | 22 | /** 128 bits of state */ 23 | public static final class LongPair { 24 | public long val1; 25 | public long val2; 26 | } 27 | 28 | public static final int fmix32(int h) { 29 | h ^= h >>> 16; 30 | h *= 0x85ebca6b; 31 | h ^= h >>> 13; 32 | h *= 0xc2b2ae35; 33 | h ^= h >>> 16; 34 | return h; 35 | } 36 | 37 | public static final long fmix64(long k) { 38 | k ^= k >>> 33; 39 | k *= 0xff51afd7ed558ccdL; 40 | k ^= k >>> 33; 41 | k *= 0xc4ceb9fe1a85ec53L; 42 | k ^= k >>> 33; 43 | return k; 44 | } 45 | 46 | /** Gets a long from a byte buffer in little endian byte order. */ 47 | public static final long getLongLittleEndian(byte[] buf, int offset) { 48 | return ((long)buf[offset+7] << 56) // no mask needed 49 | | ((buf[offset+6] & 0xffL) << 48) 50 | | ((buf[offset+5] & 0xffL) << 40) 51 | | ((buf[offset+4] & 0xffL) << 32) 52 | | ((buf[offset+3] & 0xffL) << 24) 53 | | ((buf[offset+2] & 0xffL) << 16) 54 | | ((buf[offset+1] & 0xffL) << 8) 55 | | ((buf[offset ] & 0xffL)); // no shift needed 56 | } 57 | 58 | 59 | /** Returns the MurmurHash3_x86_32 hash. */ 60 | @SuppressWarnings("fallthrough") 61 | public static int murmurhash3_x86_32(byte[] data, int offset, int len, int seed) { 62 | 63 | final int c1 = 0xcc9e2d51; 64 | final int c2 = 0x1b873593; 65 | 66 | int h1 = seed; 67 | int roundedEnd = offset + (len & 0xfffffffc); // round down to 4 byte block 68 | 69 | for (int i=offset; i>> 17); // ROTL32(k1,15); 74 | k1 *= c2; 75 | 76 | h1 ^= k1; 77 | h1 = (h1 << 13) | (h1 >>> 19); // ROTL32(h1,13); 78 | h1 = h1*5+0xe6546b64; 79 | } 80 | 81 | // tail 82 | int k1 = 0; 83 | 84 | switch(len & 0x03) { 85 | case 3: 86 | k1 = (data[roundedEnd + 2] & 0xff) << 16; 87 | // fallthrough 88 | case 2: 89 | k1 |= (data[roundedEnd + 1] & 0xff) << 8; 90 | // fallthrough 91 | case 1: 92 | k1 |= (data[roundedEnd] & 0xff); 93 | k1 *= c1; 94 | k1 = (k1 << 15) | (k1 >>> 17); // ROTL32(k1,15); 95 | k1 *= c2; 96 | h1 ^= k1; 97 | } 98 | 99 | // finalization 100 | h1 ^= len; 101 | 102 | // fmix(h1); 103 | h1 ^= h1 >>> 16; 104 | h1 *= 0x85ebca6b; 105 | h1 ^= h1 >>> 13; 106 | h1 *= 0xc2b2ae35; 107 | h1 ^= h1 >>> 16; 108 | 109 | return h1; 110 | } 111 | 112 | 113 | /** Returns the MurmurHash3_x86_32 hash of the UTF-8 bytes of the String without actually encoding 114 | * the string to a temporary buffer. This is more than 2x faster than hashing the result 115 | * of String.getBytes(). 116 | */ 117 | public static int murmurhash3_x86_32(CharSequence data, int offset, int len, int seed) { 118 | 119 | final int c1 = 0xcc9e2d51; 120 | final int c2 = 0x1b873593; 121 | 122 | int h1 = seed; 123 | 124 | int pos = offset; 125 | int end = offset + len; 126 | int k1 = 0; 127 | int k2 = 0; 128 | int shift = 0; 129 | int bits = 0; 130 | int nBytes = 0; // length in UTF8 bytes 131 | 132 | 133 | while (pos < end) { 134 | int code = data.charAt(pos++); 135 | if (code < 0x80) { 136 | k2 = code; 137 | bits = 8; 138 | 139 | /*** 140 | // optimized ascii implementation (currently slower!!! code size?) 141 | if (shift == 24) { 142 | k1 = k1 | (code << 24); 143 | 144 | k1 *= c1; 145 | k1 = (k1 << 15) | (k1 >>> 17); // ROTL32(k1,15); 146 | k1 *= c2; 147 | 148 | h1 ^= k1; 149 | h1 = (h1 << 13) | (h1 >>> 19); // ROTL32(h1,13); 150 | h1 = h1*5+0xe6546b64; 151 | 152 | shift = 0; 153 | nBytes += 4; 154 | k1 = 0; 155 | } else { 156 | k1 |= code << shift; 157 | shift += 8; 158 | } 159 | continue; 160 | ***/ 161 | 162 | } 163 | else if (code < 0x800) { 164 | k2 = (0xC0 | (code >> 6)) 165 | | ((0x80 | (code & 0x3F)) << 8); 166 | bits = 16; 167 | } 168 | else if (code < 0xD800 || code > 0xDFFF || pos>=end) { 169 | // we check for pos>=end to encode an unpaired surrogate as 3 bytes. 170 | k2 = (0xE0 | (code >> 12)) 171 | | ((0x80 | ((code >> 6) & 0x3F)) << 8) 172 | | ((0x80 | (code & 0x3F)) << 16); 173 | bits = 24; 174 | } else { 175 | // surrogate pair 176 | // int utf32 = pos < end ? (int) data.charAt(pos++) : 0; 177 | int utf32 = (int) data.charAt(pos++); 178 | utf32 = ((code - 0xD7C0) << 10) + (utf32 & 0x3FF); 179 | k2 = (0xff & (0xF0 | (utf32 >> 18))) 180 | | ((0x80 | ((utf32 >> 12) & 0x3F))) << 8 181 | | ((0x80 | ((utf32 >> 6) & 0x3F))) << 16 182 | | (0x80 | (utf32 & 0x3F)) << 24; 183 | bits = 32; 184 | } 185 | 186 | 187 | k1 |= k2 << shift; 188 | 189 | // int used_bits = 32 - shift; // how many bits of k2 were used in k1. 190 | // int unused_bits = bits - used_bits; // (bits-(32-shift)) == bits+shift-32 == bits-newshift 191 | 192 | shift += bits; 193 | if (shift >= 32) { 194 | // mix after we have a complete word 195 | 196 | k1 *= c1; 197 | k1 = (k1 << 15) | (k1 >>> 17); // ROTL32(k1,15); 198 | k1 *= c2; 199 | 200 | h1 ^= k1; 201 | h1 = (h1 << 13) | (h1 >>> 19); // ROTL32(h1,13); 202 | h1 = h1*5+0xe6546b64; 203 | 204 | shift -= 32; 205 | // unfortunately, java won't let you shift 32 bits off, so we need to check for 0 206 | if (shift != 0) { 207 | k1 = k2 >>> (bits-shift); // bits used == bits - newshift 208 | } else { 209 | k1 = 0; 210 | } 211 | nBytes += 4; 212 | } 213 | 214 | } // inner 215 | 216 | // handle tail 217 | if (shift > 0) { 218 | nBytes += shift >> 3; 219 | k1 *= c1; 220 | k1 = (k1 << 15) | (k1 >>> 17); // ROTL32(k1,15); 221 | k1 *= c2; 222 | h1 ^= k1; 223 | } 224 | 225 | // finalization 226 | h1 ^= nBytes; 227 | 228 | // fmix(h1); 229 | h1 ^= h1 >>> 16; 230 | h1 *= 0x85ebca6b; 231 | h1 ^= h1 >>> 13; 232 | h1 *= 0xc2b2ae35; 233 | h1 ^= h1 >>> 16; 234 | 235 | return h1; 236 | } 237 | 238 | 239 | /** Returns the MurmurHash3_x64_128 hash, placing the result in "out". */ 240 | @SuppressWarnings("fallthrough") 241 | public static void murmurhash3_x64_128(byte[] key, int offset, int len, int seed, LongPair out) { 242 | // The original algorithm does have a 32 bit unsigned seed. 243 | // We have to mask to match the behavior of the unsigned types and prevent sign extension. 244 | long h1 = seed & 0x00000000FFFFFFFFL; 245 | long h2 = seed & 0x00000000FFFFFFFFL; 246 | 247 | final long c1 = 0x87c37b91114253d5L; 248 | final long c2 = 0x4cf5ad432745937fL; 249 | 250 | int roundedEnd = offset + (len & 0xFFFFFFF0); // round down to 16 byte block 251 | for (int i=offset; i>3)&0x03); 34 | ret += MurmurHash3.murmurhash3_x86_32(arr, offset, len, i); 35 | } 36 | } else if (method.equals("slow_string")) { 37 | for (int i = 0; i>3)&0x03); 41 | byte[] utf8 = s.getBytes(utf8Charset); 42 | ret += MurmurHash3.murmurhash3_x86_32(utf8, offset, len, i); 43 | } 44 | } else if (method.equals("fast_string")) { 45 | for (int i = 0; i>3)&0x03); 49 | ret += MurmurHash3.murmurhash3_x86_32(s, offset, len, i); 50 | } 51 | } else { 52 | throw new RuntimeException("Unknown method " + method); 53 | } 54 | 55 | long end = System.currentTimeMillis(); 56 | 57 | System.out.println("method="+method + " result="+ ret + " throughput=" + 1000 * ((double)size)*iter/(end-start) ); 58 | 59 | } 60 | 61 | } 62 | -------------------------------------------------------------------------------- /test/util/hash/TestMurmurHash3.java: -------------------------------------------------------------------------------- 1 | package util.hash; 2 | 3 | import junit.framework.TestCase; 4 | 5 | import java.nio.charset.Charset; 6 | import java.util.Random; 7 | 8 | /** 9 | * @author yonik 10 | */ 11 | public class TestMurmurHash3 extends TestCase { 12 | 13 | public void testCorrectValues() throws Exception { 14 | byte[] bytes = "Now is the time for all good men to come to the aid of their country".getBytes("UTF-8"); 15 | int hash=0; 16 | for (int i=0; i 118 | #include "MurmurHash3.h" 119 | using namespace std; 120 | 121 | int main(int argc, char** argv) { 122 | char* val = strdup("Now is the time for all good men to come to the aid of their country"); 123 | int max = strlen(val); 124 | int hash=0; 125 | for (int i=0; i