├── README.md ├── pom.xml └── src └── main └── java └── jmh └── BytesHashcode.java /README.md: -------------------------------------------------------------------------------- 1 | Example using the Vector API to implement a vectorized version of Arrays.hashCode. -------------------------------------------------------------------------------- /pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 5 | 4.0.0 6 | 7 | com.oracle 8 | vector-api-dev-live-10-2021 9 | 1.0-SNAPSHOT 10 | 11 | 12 | UTF-8 13 | 1.32 14 | 17 15 | 17 16 | 17 | 18 | 19 | 20 | org.openjdk.jmh 21 | jmh-core 22 | ${jmh.version} 23 | compile 24 | 25 | 26 | 27 | org.openjdk.jmh 28 | jmh-generator-annprocess 29 | ${jmh.version} 30 | compile 31 | 32 | 33 | 34 | 35 | 36 | 37 | org.apache.maven.plugins 38 | maven-compiler-plugin 39 | 3.8.1 40 | 41 | 42 | --add-modules 43 | jdk.incubator.vector 44 | --add-modules 45 | jdk.incubator.foreign 46 | 47 | true 48 | true 49 | true 50 | true 51 | true 52 | true 53 | 54 | 55 | 56 | 57 | org.apache.maven.plugins 58 | maven-shade-plugin 59 | 3.0.0 60 | 61 | 62 | 63 | package 64 | 65 | shade 66 | 67 | 68 | 69 | 70 | 72 | org.openjdk.jmh.Main 73 | 74 | 75 | benchmarks 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | -------------------------------------------------------------------------------- /src/main/java/jmh/BytesHashcode.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2021, Oracle and/or its affiliates. All rights reserved. 3 | * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 | * 5 | * This code is free software; you can redistribute it and/or modify it 6 | * under the terms of the GNU General Public License version 2 only, as 7 | * published by the Free Software Foundation. 
8 | * 9 | * This code is distributed in the hope that it will be useful, but WITHOUT 10 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 11 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 12 | * version 2 for more details (a copy is included in the LICENSE file that 13 | * accompanied this code). 14 | * 15 | * You should have received a copy of the GNU General Public License version 16 | * 2 along with this work; if not, write to the Free Software Foundation, 17 | * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 18 | * 19 | * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 20 | * or visit www.oracle.com if you need additional information or have any 21 | * questions. 22 | */ 23 | package jmh; 24 | 25 | import jdk.incubator.vector.ByteVector; 26 | import jdk.incubator.vector.IntVector; 27 | import jdk.incubator.vector.VectorOperators; 28 | import jdk.incubator.vector.VectorSpecies; 29 | import org.openjdk.jmh.annotations.Benchmark; 30 | import org.openjdk.jmh.annotations.BenchmarkMode; 31 | import org.openjdk.jmh.annotations.Fork; 32 | import org.openjdk.jmh.annotations.Measurement; 33 | import org.openjdk.jmh.annotations.Mode; 34 | import org.openjdk.jmh.annotations.OutputTimeUnit; 35 | import org.openjdk.jmh.annotations.Param; 36 | import org.openjdk.jmh.annotations.Scope; 37 | import org.openjdk.jmh.annotations.Setup; 38 | import org.openjdk.jmh.annotations.State; 39 | import org.openjdk.jmh.annotations.Warmup; 40 | 41 | import java.util.Arrays; 42 | import java.util.concurrent.ThreadLocalRandom; 43 | import java.util.concurrent.TimeUnit; 44 | 45 | @BenchmarkMode(Mode.Throughput) 46 | @OutputTimeUnit(TimeUnit.MILLISECONDS) 47 | @State(Scope.Benchmark) 48 | @Warmup(iterations = 3, time = 1) 49 | @Measurement(iterations = 5, time = 1) 50 | @Fork(value = 1, jvmArgsPrepend = {"--add-modules=jdk.incubator.vector"}) 51 | public class BytesHashcode { 52 | static final VectorSpecies 
INT_256_SPECIES = IntVector.SPECIES_256; 53 | 54 | static final VectorSpecies BYTE_64_SPECIES = ByteVector.SPECIES_64; 55 | static final VectorSpecies BYTE_128_SPECIES = ByteVector.SPECIES_128; 56 | static final VectorSpecies BYTE_256_SPECIES = ByteVector.SPECIES_256; 57 | 58 | static final int COEFF_31_TO_8; 59 | static final int COEFF_31_TO_16; 60 | static final int COEFF_31_TO_32; 61 | 62 | static final IntVector H_COEFF_31_TO_8; 63 | static final IntVector H_COEFF_31_TO_16; 64 | static final IntVector H_COEFF_31_TO_32; 65 | 66 | static final IntVector H_COEFF_8; 67 | static final IntVector H_COEFF_16; 68 | static final IntVector H_COEFF_24; 69 | static final IntVector H_COEFF_32; 70 | 71 | 72 | static { 73 | int[] x = new int[INT_256_SPECIES.length() * 4]; 74 | x[x.length - 1] = 1; 75 | for (int i = 1; i < x.length; i++) { 76 | x[x.length - 1 - i] = x[x.length - 1 - i + 1] * 31; 77 | } 78 | 79 | COEFF_31_TO_8 = x[24] * 31; 80 | COEFF_31_TO_16 = x[16] * 31; 81 | COEFF_31_TO_32 = x[0] * 31; 82 | 83 | H_COEFF_31_TO_8 = IntVector.broadcast(INT_256_SPECIES, COEFF_31_TO_8); 84 | H_COEFF_31_TO_16 = IntVector.broadcast(INT_256_SPECIES, COEFF_31_TO_16); 85 | H_COEFF_31_TO_32 = IntVector.broadcast(INT_256_SPECIES, COEFF_31_TO_32); 86 | 87 | H_COEFF_8 = IntVector.fromArray(INT_256_SPECIES, x, 24); 88 | H_COEFF_16 = IntVector.fromArray(INT_256_SPECIES, x, 16); 89 | H_COEFF_24 = IntVector.fromArray(INT_256_SPECIES, x, 8); 90 | H_COEFF_32 = IntVector.fromArray(INT_256_SPECIES, x, 0); 91 | } 92 | 93 | @Param("1024") 94 | int size; 95 | 96 | byte[] a; 97 | 98 | @Setup 99 | public void init() { 100 | a = new byte[size]; 101 | ThreadLocalRandom.current().nextBytes(a); 102 | } 103 | 104 | 105 | @Benchmark 106 | public int scalar() { 107 | return Arrays.hashCode(a); 108 | } 109 | 110 | /* 111 | Hashcode calculation can be represented a polynomial 112 | 113 | h = 31^l 114 | + 31^(l - 1) * a[0] 115 | + 31^(l - 2) * a[1] 116 | + ... 
117 | + 31^2 * a[l - 3] 118 | + 31 * a[l - 2] 119 | + a[l - 1] 120 | 121 | */ 122 | 123 | 124 | @Benchmark 125 | public int scalarUnrolled() { 126 | if (a == null) 127 | return 0; 128 | 129 | int h = 1; 130 | int i = 0; 131 | for (; i < (a.length & ~(8 - 1)); i += 8) { 132 | h = h * 31 * 31 * 31 * 31 * 31 * 31 * 31 * 31 + 133 | a[i + 0] * 31 * 31 * 31 * 31 * 31 * 31 * 31 + 134 | a[i + 1] * 31 * 31 * 31 * 31 * 31 * 31 + 135 | a[i + 2] * 31 * 31 * 31 * 31 * 31 + 136 | a[i + 3] * 31 * 31 * 31 * 31 + 137 | a[i + 4] * 31 * 31 * 31 + 138 | a[i + 5] * 31 * 31 + 139 | a[i + 6] * 31 + 140 | a[i + 7]; 141 | } 142 | 143 | for (; i < a.length; i++) { 144 | h = 31 * h + a[i]; 145 | } 146 | return h; 147 | } 148 | 149 | @Benchmark 150 | public int vector64ReduceInLoop() { 151 | int h = 1; 152 | int i = 0; 153 | for (; i < BYTE_64_SPECIES.loopBound(a.length); i += BYTE_64_SPECIES.length()) { 154 | // load 8 bytes, into a 64-bit vector 155 | ByteVector b = ByteVector.fromArray(BYTE_64_SPECIES, a, i); 156 | // convert 8 bytes into 8 ints, into a 256-bit vector 157 | IntVector x = (IntVector) b.castShape(INT_256_SPECIES, 0); 158 | h = h * COEFF_31_TO_8 + x.mul(H_COEFF_8).reduceLanes(VectorOperators.ADD); 159 | } 160 | 161 | for (; i < a.length; i++) { 162 | h = 31 * h + a[i]; 163 | } 164 | return h; 165 | } 166 | 167 | @Benchmark 168 | public int vector64() { 169 | IntVector h = IntVector.fromArray(INT_256_SPECIES, new int[]{1, 0, 0, 0, 0, 0, 0, 0}, 0); 170 | int i = 0; 171 | for (; i < BYTE_64_SPECIES.loopBound(a.length); i += BYTE_64_SPECIES.length()) { 172 | ByteVector b = ByteVector.fromArray(BYTE_64_SPECIES, a, i); 173 | IntVector x = (IntVector) b.castShape(INT_256_SPECIES, 0); 174 | h = h.mul(H_COEFF_31_TO_8).add(x.mul(H_COEFF_8)); 175 | } 176 | 177 | int sh = h.reduceLanes(VectorOperators.ADD); 178 | for (; i < a.length; i++) { 179 | sh = 31 * sh + a[i]; 180 | } 181 | return sh; 182 | } 183 | 184 | @Benchmark 185 | public int vector64Unrolledx2() { 186 | IntVector h1 = 
IntVector.fromArray(INT_256_SPECIES, new int[]{1, 0, 0, 0, 0, 0, 0, 0}, 0); 187 | IntVector h2 = IntVector.zero(INT_256_SPECIES); 188 | int i = 0; 189 | for (; i < (a.length & ~(BYTE_128_SPECIES.length() - 1)); i += BYTE_128_SPECIES.length()) { 190 | ByteVector b = ByteVector.fromArray(BYTE_64_SPECIES, a, i); 191 | IntVector x = (IntVector) b.castShape(INT_256_SPECIES, 0); 192 | h1 = h1.mul(H_COEFF_31_TO_16).add(x.mul(H_COEFF_16)); 193 | 194 | b = ByteVector.fromArray(BYTE_64_SPECIES, a, i + BYTE_64_SPECIES.length()); 195 | x = (IntVector) b.castShape(INT_256_SPECIES, 0); 196 | h2 = h2.mul(H_COEFF_31_TO_16).add(x.mul(H_COEFF_8)); 197 | } 198 | 199 | int sh = h1.reduceLanes(VectorOperators.ADD) + h2.reduceLanes(VectorOperators.ADD); 200 | for (; i < a.length; i++) { 201 | sh = 31 * sh + a[i]; 202 | } 203 | return sh; 204 | } 205 | 206 | 207 | @Benchmark 208 | public int vector64Unrolledx4() { 209 | IntVector h1 = IntVector.fromArray(INT_256_SPECIES, new int[]{1, 0, 0, 0, 0, 0, 0, 0}, 0); 210 | IntVector h2 = IntVector.zero(INT_256_SPECIES); 211 | IntVector h3 = IntVector.zero(INT_256_SPECIES); 212 | IntVector h4 = IntVector.zero(INT_256_SPECIES); 213 | int i = 0; 214 | for (; i < (a.length & ~(BYTE_256_SPECIES.length() - 1)); i += BYTE_256_SPECIES.length()) { 215 | ByteVector b = ByteVector.fromArray(BYTE_64_SPECIES, a, i); 216 | IntVector x = (IntVector) b.castShape(INT_256_SPECIES, 0); 217 | h1 = h1.mul(H_COEFF_31_TO_32).add(x.mul(H_COEFF_32)); 218 | 219 | b = ByteVector.fromArray(BYTE_64_SPECIES, a, i + BYTE_64_SPECIES.length()); 220 | x = (IntVector) b.castShape(INT_256_SPECIES, 0); 221 | h2 = h2.mul(H_COEFF_31_TO_32).add(x.mul(H_COEFF_24)); 222 | 223 | b = ByteVector.fromArray(BYTE_64_SPECIES, a, i + BYTE_64_SPECIES.length() * 2); 224 | x = (IntVector) b.castShape(INT_256_SPECIES, 0); 225 | h3 = h3.mul(H_COEFF_31_TO_32).add(x.mul(H_COEFF_16)); 226 | 227 | b = ByteVector.fromArray(BYTE_64_SPECIES, a, i + BYTE_64_SPECIES.length() * 3); 228 | x = (IntVector) 
b.castShape(INT_256_SPECIES, 0); 229 | h4 = h4.mul(H_COEFF_31_TO_32).add(x.mul(H_COEFF_8)); 230 | } 231 | 232 | int sh = h1.reduceLanes(VectorOperators.ADD) + 233 | h2.reduceLanes(VectorOperators.ADD) + 234 | h3.reduceLanes(VectorOperators.ADD) + 235 | h4.reduceLanes(VectorOperators.ADD); 236 | for (; i < a.length; i++) { 237 | sh = 31 * sh + a[i]; 238 | } 239 | return sh; 240 | } 241 | } 242 | /* 243 | 244 | $ git rev-parse --short HEAD 245 | dfacda488bf 246 | 247 | java -XX:-TieredCompilation -jar target/benchmarks.jar BytesHashcode 248 | 249 | # JMH version: 1.31 250 | # VM version: JDK 17-internal, OpenJDK 64-Bit Server VM, 17-internal+0-adhoc.sandoz.jdk17 251 | # VM invoker: /Users/sandoz/Projects/jdk/jdk17/build/macosx-x86_64-server-release/images/jdk/bin/java 252 | # VM options: --add-modules=jdk.incubator.vector -XX:-TieredCompilation 253 | 254 | Benchmark (size) Mode Cnt Score Error Units 255 | BytesHashcode.scalar 1024 thrpt 5 1280.707 ± 3.982 ops/ms 256 | BytesHashcode.scalarUnrolled 1024 thrpt 5 573.338 ± 22.649 ops/ms 257 | BytesHashcode.vector64 1024 thrpt 5 2636.888 ± 8.721 ops/ms 258 | BytesHashcode.vector64ReduceInLoop 1024 thrpt 5 3855.475 ± 28.767 ops/ms 259 | BytesHashcode.vector64Unrolledx2 1024 thrpt 5 5134.652 ± 42.714 ops/ms 260 | BytesHashcode.vector64Unrolledx4 1024 thrpt 5 8733.027 ± 255.440 ops/ms 261 | 262 | 263 | java -XX:-TieredCompilation -XX:LoopUnrollLimit=0 -jar target/benchmarks.jar -prof dtraceasm BytesHashcode.vector64$ 264 | 265 | Hot loop: 266 | 267 | 5.96% ↗│ 0x000000011a1dff50: vmovq 0x10(%r9,%r8,1),%xmm1 // b = ByteVector.fromArray 268 | ││ 0x000000011a1dff57: movabs $0x61d418178,%rdi // Address of vector constant H_COEFF_31_TO_8 269 | 2.56% ││ 0x000000011a1dff61: vpmulld 0x10(%rdi),%ymm0,%ymm0 // h = h.mul(H_COEFF_31_TO_8) 270 | 69.97% ││ 0x000000011a1dff67: vpmovsxbd %xmm1,%ymm1 // x = b.castShape(INT_256_SPECIES, 0) 271 | ││ 0x000000011a1dff6c: movabs $0x61d4181a8,%rdi // Address of vector constant H_COEFF_8 272 | 0.02% 
││ 0x000000011a1dff76: vpmulld 0x10(%rdi),%ymm1,%ymm1 // x = x.mul(H_COEFF_8) 273 | 8.58% ││ 0x000000011a1dff7c: vpaddd %ymm1,%ymm0,%ymm0 // h = h + x 274 | 8.50% ││ 0x000000011a1dff80: add $0x8,%r8d // 275 | ││ 0x000000011a1dff84: cmp %r11d,%r8d // next 8 bytes 276 | ╰│ 0x000000011a1dff87: jl 0x000000011a1dff50 // 277 | 278 | Loop unrolling can make it harder to identify the hot loop; turning it off is 279 | sometimes useful. 280 | 281 | Printing assembly requires the hsdis [1] library be accessible to the java 282 | executable. On Mac with SIP enabled: 283 | - the easiest approach is to place the hsdis library in the current working 284 | directory where "java" is executed; or 285 | - set up the library path and refer to the full path of the java executable 286 | directly 287 | 288 | [1] https://github.com/openjdk/jdk/blob/master/src/utils/hsdis/README 289 | https://github.com/AdoptOpenJDK/jitwatch/wiki/Building-hsdis 290 | 291 | */ --------------------------------------------------------------------------------