├── README.md
├── pom.xml
└── src
└── main
└── java
└── jmh
└── BytesHashcode.java
/README.md:
--------------------------------------------------------------------------------
1 | Example using the Vector API to implement a vectorized version of Arrays.hashCode.
--------------------------------------------------------------------------------
/pom.xml:
--------------------------------------------------------------------------------
1 |
2 |
5 | 4.0.0
6 |
7 | com.oracle
8 | vector-api-dev-live-10-2021
9 | 1.0-SNAPSHOT
10 |
11 |
12 | UTF-8
13 | 1.32
14 | 17
15 | 17
16 |
17 |
18 |
19 |
20 | org.openjdk.jmh
21 | jmh-core
22 | ${jmh.version}
23 | compile
24 |
25 |
26 |
27 | org.openjdk.jmh
28 | jmh-generator-annprocess
29 | ${jmh.version}
30 | compile
31 |
32 |
33 |
34 |
35 |
36 |
37 | org.apache.maven.plugins
38 | maven-compiler-plugin
39 | 3.8.1
40 |
41 |
42 | --add-modules
43 | jdk.incubator.vector
44 | --add-modules
45 | jdk.incubator.foreign
46 |
47 | true
48 | true
49 | true
50 | true
51 | true
52 | true
53 |
54 |
55 |
56 |
57 | org.apache.maven.plugins
58 | maven-shade-plugin
59 | 3.0.0
60 |
61 |
62 |
63 | package
64 |
65 | shade
66 |
67 |
68 |
69 |
70 |
72 | org.openjdk.jmh.Main
73 |
74 |
75 | benchmarks
76 |
77 |
78 |
79 |
80 |
81 |
82 |
83 |
--------------------------------------------------------------------------------
/src/main/java/jmh/BytesHashcode.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (c) 2021, Oracle and/or its affiliates. All rights reserved.
3 | * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 | *
5 | * This code is free software; you can redistribute it and/or modify it
6 | * under the terms of the GNU General Public License version 2 only, as
7 | * published by the Free Software Foundation.
8 | *
9 | * This code is distributed in the hope that it will be useful, but WITHOUT
10 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
12 | * version 2 for more details (a copy is included in the LICENSE file that
13 | * accompanied this code).
14 | *
15 | * You should have received a copy of the GNU General Public License version
16 | * 2 along with this work; if not, write to the Free Software Foundation,
17 | * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
18 | *
19 | * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
20 | * or visit www.oracle.com if you need additional information or have any
21 | * questions.
22 | */
23 | package jmh;
24 |
25 | import jdk.incubator.vector.ByteVector;
26 | import jdk.incubator.vector.IntVector;
27 | import jdk.incubator.vector.VectorOperators;
28 | import jdk.incubator.vector.VectorSpecies;
29 | import org.openjdk.jmh.annotations.Benchmark;
30 | import org.openjdk.jmh.annotations.BenchmarkMode;
31 | import org.openjdk.jmh.annotations.Fork;
32 | import org.openjdk.jmh.annotations.Measurement;
33 | import org.openjdk.jmh.annotations.Mode;
34 | import org.openjdk.jmh.annotations.OutputTimeUnit;
35 | import org.openjdk.jmh.annotations.Param;
36 | import org.openjdk.jmh.annotations.Scope;
37 | import org.openjdk.jmh.annotations.Setup;
38 | import org.openjdk.jmh.annotations.State;
39 | import org.openjdk.jmh.annotations.Warmup;
40 |
41 | import java.util.Arrays;
42 | import java.util.concurrent.ThreadLocalRandom;
43 | import java.util.concurrent.TimeUnit;
44 |
45 | @BenchmarkMode(Mode.Throughput)
46 | @OutputTimeUnit(TimeUnit.MILLISECONDS)
47 | @State(Scope.Benchmark)
48 | @Warmup(iterations = 3, time = 1)
49 | @Measurement(iterations = 5, time = 1)
50 | @Fork(value = 1, jvmArgsPrepend = {"--add-modules=jdk.incubator.vector"})
51 | public class BytesHashcode {
52 | static final VectorSpecies INT_256_SPECIES = IntVector.SPECIES_256;
53 |
54 | static final VectorSpecies BYTE_64_SPECIES = ByteVector.SPECIES_64;
55 | static final VectorSpecies BYTE_128_SPECIES = ByteVector.SPECIES_128;
56 | static final VectorSpecies BYTE_256_SPECIES = ByteVector.SPECIES_256;
57 |
58 | static final int COEFF_31_TO_8;
59 | static final int COEFF_31_TO_16;
60 | static final int COEFF_31_TO_32;
61 |
62 | static final IntVector H_COEFF_31_TO_8;
63 | static final IntVector H_COEFF_31_TO_16;
64 | static final IntVector H_COEFF_31_TO_32;
65 |
66 | static final IntVector H_COEFF_8;
67 | static final IntVector H_COEFF_16;
68 | static final IntVector H_COEFF_24;
69 | static final IntVector H_COEFF_32;
70 |
71 |
72 | static {
73 | int[] x = new int[INT_256_SPECIES.length() * 4];
74 | x[x.length - 1] = 1;
75 | for (int i = 1; i < x.length; i++) {
76 | x[x.length - 1 - i] = x[x.length - 1 - i + 1] * 31;
77 | }
78 |
79 | COEFF_31_TO_8 = x[24] * 31;
80 | COEFF_31_TO_16 = x[16] * 31;
81 | COEFF_31_TO_32 = x[0] * 31;
82 |
83 | H_COEFF_31_TO_8 = IntVector.broadcast(INT_256_SPECIES, COEFF_31_TO_8);
84 | H_COEFF_31_TO_16 = IntVector.broadcast(INT_256_SPECIES, COEFF_31_TO_16);
85 | H_COEFF_31_TO_32 = IntVector.broadcast(INT_256_SPECIES, COEFF_31_TO_32);
86 |
87 | H_COEFF_8 = IntVector.fromArray(INT_256_SPECIES, x, 24);
88 | H_COEFF_16 = IntVector.fromArray(INT_256_SPECIES, x, 16);
89 | H_COEFF_24 = IntVector.fromArray(INT_256_SPECIES, x, 8);
90 | H_COEFF_32 = IntVector.fromArray(INT_256_SPECIES, x, 0);
91 | }
92 |
93 | @Param("1024")
94 | int size;
95 |
96 | byte[] a;
97 |
98 | @Setup
99 | public void init() {
100 | a = new byte[size];
101 | ThreadLocalRandom.current().nextBytes(a);
102 | }
103 |
104 |
105 | @Benchmark
106 | public int scalar() {
107 | return Arrays.hashCode(a);
108 | }
109 |
110 | /*
111 | Hashcode calculation can be represented a polynomial
112 |
113 | h = 31^l
114 | + 31^(l - 1) * a[0]
115 | + 31^(l - 2) * a[1]
116 | + ...
117 | + 31^2 * a[l - 3]
118 | + 31 * a[l - 2]
119 | + a[l - 1]
120 |
121 | */
122 |
123 |
124 | @Benchmark
125 | public int scalarUnrolled() {
126 | if (a == null)
127 | return 0;
128 |
129 | int h = 1;
130 | int i = 0;
131 | for (; i < (a.length & ~(8 - 1)); i += 8) {
132 | h = h * 31 * 31 * 31 * 31 * 31 * 31 * 31 * 31 +
133 | a[i + 0] * 31 * 31 * 31 * 31 * 31 * 31 * 31 +
134 | a[i + 1] * 31 * 31 * 31 * 31 * 31 * 31 +
135 | a[i + 2] * 31 * 31 * 31 * 31 * 31 +
136 | a[i + 3] * 31 * 31 * 31 * 31 +
137 | a[i + 4] * 31 * 31 * 31 +
138 | a[i + 5] * 31 * 31 +
139 | a[i + 6] * 31 +
140 | a[i + 7];
141 | }
142 |
143 | for (; i < a.length; i++) {
144 | h = 31 * h + a[i];
145 | }
146 | return h;
147 | }
148 |
149 | @Benchmark
150 | public int vector64ReduceInLoop() {
151 | int h = 1;
152 | int i = 0;
153 | for (; i < BYTE_64_SPECIES.loopBound(a.length); i += BYTE_64_SPECIES.length()) {
154 | // load 8 bytes, into a 64-bit vector
155 | ByteVector b = ByteVector.fromArray(BYTE_64_SPECIES, a, i);
156 | // convert 8 bytes into 8 ints, into a 256-bit vector
157 | IntVector x = (IntVector) b.castShape(INT_256_SPECIES, 0);
158 | h = h * COEFF_31_TO_8 + x.mul(H_COEFF_8).reduceLanes(VectorOperators.ADD);
159 | }
160 |
161 | for (; i < a.length; i++) {
162 | h = 31 * h + a[i];
163 | }
164 | return h;
165 | }
166 |
167 | @Benchmark
168 | public int vector64() {
169 | IntVector h = IntVector.fromArray(INT_256_SPECIES, new int[]{1, 0, 0, 0, 0, 0, 0, 0}, 0);
170 | int i = 0;
171 | for (; i < BYTE_64_SPECIES.loopBound(a.length); i += BYTE_64_SPECIES.length()) {
172 | ByteVector b = ByteVector.fromArray(BYTE_64_SPECIES, a, i);
173 | IntVector x = (IntVector) b.castShape(INT_256_SPECIES, 0);
174 | h = h.mul(H_COEFF_31_TO_8).add(x.mul(H_COEFF_8));
175 | }
176 |
177 | int sh = h.reduceLanes(VectorOperators.ADD);
178 | for (; i < a.length; i++) {
179 | sh = 31 * sh + a[i];
180 | }
181 | return sh;
182 | }
183 |
184 | @Benchmark
185 | public int vector64Unrolledx2() {
186 | IntVector h1 = IntVector.fromArray(INT_256_SPECIES, new int[]{1, 0, 0, 0, 0, 0, 0, 0}, 0);
187 | IntVector h2 = IntVector.zero(INT_256_SPECIES);
188 | int i = 0;
189 | for (; i < (a.length & ~(BYTE_128_SPECIES.length() - 1)); i += BYTE_128_SPECIES.length()) {
190 | ByteVector b = ByteVector.fromArray(BYTE_64_SPECIES, a, i);
191 | IntVector x = (IntVector) b.castShape(INT_256_SPECIES, 0);
192 | h1 = h1.mul(H_COEFF_31_TO_16).add(x.mul(H_COEFF_16));
193 |
194 | b = ByteVector.fromArray(BYTE_64_SPECIES, a, i + BYTE_64_SPECIES.length());
195 | x = (IntVector) b.castShape(INT_256_SPECIES, 0);
196 | h2 = h2.mul(H_COEFF_31_TO_16).add(x.mul(H_COEFF_8));
197 | }
198 |
199 | int sh = h1.reduceLanes(VectorOperators.ADD) + h2.reduceLanes(VectorOperators.ADD);
200 | for (; i < a.length; i++) {
201 | sh = 31 * sh + a[i];
202 | }
203 | return sh;
204 | }
205 |
206 |
207 | @Benchmark
208 | public int vector64Unrolledx4() {
209 | IntVector h1 = IntVector.fromArray(INT_256_SPECIES, new int[]{1, 0, 0, 0, 0, 0, 0, 0}, 0);
210 | IntVector h2 = IntVector.zero(INT_256_SPECIES);
211 | IntVector h3 = IntVector.zero(INT_256_SPECIES);
212 | IntVector h4 = IntVector.zero(INT_256_SPECIES);
213 | int i = 0;
214 | for (; i < (a.length & ~(BYTE_256_SPECIES.length() - 1)); i += BYTE_256_SPECIES.length()) {
215 | ByteVector b = ByteVector.fromArray(BYTE_64_SPECIES, a, i);
216 | IntVector x = (IntVector) b.castShape(INT_256_SPECIES, 0);
217 | h1 = h1.mul(H_COEFF_31_TO_32).add(x.mul(H_COEFF_32));
218 |
219 | b = ByteVector.fromArray(BYTE_64_SPECIES, a, i + BYTE_64_SPECIES.length());
220 | x = (IntVector) b.castShape(INT_256_SPECIES, 0);
221 | h2 = h2.mul(H_COEFF_31_TO_32).add(x.mul(H_COEFF_24));
222 |
223 | b = ByteVector.fromArray(BYTE_64_SPECIES, a, i + BYTE_64_SPECIES.length() * 2);
224 | x = (IntVector) b.castShape(INT_256_SPECIES, 0);
225 | h3 = h3.mul(H_COEFF_31_TO_32).add(x.mul(H_COEFF_16));
226 |
227 | b = ByteVector.fromArray(BYTE_64_SPECIES, a, i + BYTE_64_SPECIES.length() * 3);
228 | x = (IntVector) b.castShape(INT_256_SPECIES, 0);
229 | h4 = h4.mul(H_COEFF_31_TO_32).add(x.mul(H_COEFF_8));
230 | }
231 |
232 | int sh = h1.reduceLanes(VectorOperators.ADD) +
233 | h2.reduceLanes(VectorOperators.ADD) +
234 | h3.reduceLanes(VectorOperators.ADD) +
235 | h4.reduceLanes(VectorOperators.ADD);
236 | for (; i < a.length; i++) {
237 | sh = 31 * sh + a[i];
238 | }
239 | return sh;
240 | }
241 | }
242 | /*
243 |
244 | $ git rev-parse --short HEAD
245 | dfacda488bf
246 |
247 | java -XX:-TieredCompilation -jar target/benchmarks.jar BytesHashcode
248 |
249 | # JMH version: 1.31
250 | # VM version: JDK 17-internal, OpenJDK 64-Bit Server VM, 17-internal+0-adhoc.sandoz.jdk17
251 | # VM invoker: /Users/sandoz/Projects/jdk/jdk17/build/macosx-x86_64-server-release/images/jdk/bin/java
252 | # VM options: --add-modules=jdk.incubator.vector -XX:-TieredCompilation
253 |
254 | Benchmark (size) Mode Cnt Score Error Units
255 | BytesHashcode.scalar 1024 thrpt 5 1280.707 ± 3.982 ops/ms
256 | BytesHashcode.scalarUnrolled 1024 thrpt 5 573.338 ± 22.649 ops/ms
257 | BytesHashcode.vector64 1024 thrpt 5 2636.888 ± 8.721 ops/ms
258 | BytesHashcode.vector64ReduceInLoop 1024 thrpt 5 3855.475 ± 28.767 ops/ms
259 | BytesHashcode.vector64Unrolledx2 1024 thrpt 5 5134.652 ± 42.714 ops/ms
260 | BytesHashcode.vector64Unrolledx4 1024 thrpt 5 8733.027 ± 255.440 ops/ms
261 |
262 |
263 | java -XX:-TieredCompilation -XX:LoopUnrollLimit=0 -jar target/benchmarks.jar -prof dtraceasm BytesHashcode.vector64$
264 |
265 | Hot loop:
266 |
267 | 5.96% ↗│ 0x000000011a1dff50: vmovq 0x10(%r9,%r8,1),%xmm1 // b = ByteVector.fromArray
268 | ││ 0x000000011a1dff57: movabs $0x61d418178,%rdi // Address of vector constant H_COEFF_31_TO_8
269 | 2.56% ││ 0x000000011a1dff61: vpmulld 0x10(%rdi),%ymm0,%ymm0 // h = h.mul(H_COEFF_31_TO_8)
270 | 69.97% ││ 0x000000011a1dff67: vpmovsxbd %xmm1,%ymm1 // x = b.castShape(INT_256_SPECIES, 0)
271 | ││ 0x000000011a1dff6c: movabs $0x61d4181a8,%rdi // Address of vector constant H_COEFF_8
272 | 0.02% ││ 0x000000011a1dff76: vpmulld 0x10(%rdi),%ymm1,%ymm1 // x = x.mul(H_COEFF_8)
273 | 8.58% ││ 0x000000011a1dff7c: vpaddd %ymm1,%ymm0,%ymm0 // h = h.add(x)
274 | 8.50% ││ 0x000000011a1dff80: add $0x8,%r8d //
275 | ││ 0x000000011a1dff84: cmp %r11d,%r8d // next 8 bytes
276 | ╰│ 0x000000011a1dff87: jl 0x000000011a1dff50 //
277 |
278 | Loop unrolling can make it harder to identify the hot loop, so turning it off
279 | is sometimes useful.
280 |
281 | Printing assembly requires the hsdis [1] library be accessible to the java
282 | executable. On Mac with SIP enabled:
283 | - the easiest approach is to place the hsdis library in the current working
284 | directory where "java" is executed; or
285 | - set up the library path and refer to the full path of the java executable
286 | directly
287 |
288 | [1] https://github.com/openjdk/jdk/blob/master/src/utils/hsdis/README
289 | https://github.com/AdoptOpenJDK/jitwatch/wiki/Building-hsdis
290 |
291 | */
--------------------------------------------------------------------------------