├── .gitignore
├── .travis.yml
├── CMakeLists.txt
├── LICENSE
├── README.md
├── appveyor.yml
├── benchmark.cpp
└── libalgebra.h
/.gitignore:
--------------------------------------------------------------------------------
1 | # Prerequisites
2 | *.d
3 |
4 | # Object files
5 | *.o
6 | *.ko
7 | *.obj
8 | *.elf
9 |
10 | # Linker output
11 | *.ilk
12 | *.map
13 | *.exp
14 |
15 | # Precompiled Headers
16 | *.gch
17 | *.pch
18 |
19 | # Libraries
20 | *.lib
21 | *.a
22 | *.la
23 | *.lo
24 |
25 | # Shared objects (inc. Windows DLLs)
26 | *.dll
27 | *.so
28 | *.so.*
29 | *.dylib
30 |
31 | # Executables
32 | *.exe
33 | *.out
34 | *.app
35 | *.i*86
36 | *.x86_64
37 | *.hex
38 |
39 | # Debug files
40 | *.dSYM/
41 | *.su
42 | *.idb
43 | *.pdb
44 |
45 | # Kernel Module Compile Results
46 | *.mod*
47 | *.cmd
48 | .tmp_versions/
49 | modules.order
50 | Module.symvers
51 | Mkfile.old
52 | dkms.conf
53 |
54 | # Executable
55 | benchmark
56 | # VSCode
57 | settings.json
58 | # CMake
59 | CMakeFiles
60 | cmake_*
61 | Makefile
62 | CMakeCache.txt
63 |
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | # Test configuration for Travis CI. See https://docs.travis-ci.com/ for reference.
2 | language: cpp
3 |
4 | matrix:
5 | fast_finish: true
6 |
7 | include:
8 | - name: Xenial (g++-4.9)
9 | os: linux
10 | dist: xenial
11 | addons:
12 | apt:
13 | packages:
14 | - g++-4.9
15 | sources: &sources
16 | - ubuntu-toolchain-r-test
17 | env:
18 | - COMPILERS="CC=gcc-4.9 && CXX=g++-4.9"
19 |
20 | - name: Xenial (g++-5)
21 | os: linux
22 | dist: xenial
23 | addons:
24 | apt:
25 | packages:
26 | - g++-5
27 | sources: &sources
28 | - ubuntu-toolchain-r-test
29 | env:
30 | - COMPILERS="CC=gcc-5 && CXX=g++-5"
31 |
32 | - name: Xenial (g++-6)
33 | os: linux
34 | dist: xenial
35 | addons:
36 | apt:
37 | packages:
38 | - g++-6
39 | sources: &sources
40 | - ubuntu-toolchain-r-test
41 | env:
42 | - COMPILERS="CC=gcc-6 && CXX=g++-6"
43 |
44 | - name: Xenial (g++-7)
45 | os: linux
46 | dist: xenial
47 | addons:
48 | apt:
49 | packages:
50 | - g++-7
51 | - cmake
52 | sources: &sources
53 | - ubuntu-toolchain-r-test
54 | env:
55 | - COMPILERS="CC=gcc-7 && CXX=g++-7"
56 |
57 | - name: Xenial (g++-8)
58 | os: linux
59 | dist: xenial
60 | addons:
61 | apt:
62 | packages:
63 | - g++-8
64 | sources: &sources
65 | - ubuntu-toolchain-r-test
66 | env:
67 | - COMPILERS="CC=gcc-8 && CXX=g++-8"
68 |
69 | - name: Xenial (clang++-3.7)
70 | os: linux
71 | dist: xenial
72 | addons:
73 | apt:
74 | sources:
75 | - ubuntu-toolchain-r-test
76 | - llvm-toolchain-xenial-3.7
77 | packages:
78 | - clang-3.7
79 | env:
80 | - MATRIX_EVAL="CC=clang-3.7 && CXX=clang++-3.7"
81 | compiler: clang
82 |
83 | - name: Xenial (clang++-3.8)
84 | os: linux
85 | dist: xenial
86 | addons:
87 | apt:
88 | sources:
89 | - ubuntu-toolchain-r-test
90 | - llvm-toolchain-xenial-3.8
91 | packages:
92 | - clang-3.8
93 | env:
94 | - MATRIX_EVAL="CC=clang-3.8 && CXX=clang++-3.8"
95 | compiler: clang
96 |
97 | - name: Xenial (clang++-3.9)
98 | os: linux
99 | dist: xenial
100 | addons:
101 | apt:
102 | sources:
103 | - llvm-toolchain-xenial-3.9
104 | packages:
105 | - clang-3.9
106 | env:
107 | - MATRIX_EVAL="CC=clang-3.9 && CXX=clang++-3.9"
108 | compiler: clang
109 |
110 | - name: Xenial (clang++-4.0)
111 | os: linux
112 | dist: xenial
113 | addons:
114 | apt:
115 | sources:
116 | - llvm-toolchain-xenial-4.0
117 | packages:
118 | - clang-4.0
119 | env:
120 | - MATRIX_EVAL="CC=clang-4.0 && CXX=clang++-4.0"
121 | compiler: clang
122 |
123 | - name: Xenial (clang++-5.0)
124 | os: linux
125 | dist: xenial
126 | addons:
127 | apt:
128 | sources:
129 | - llvm-toolchain-xenial-5.0
130 | - ubuntu-toolchain-r-test
131 | packages:
132 | - clang-5.0
133 | env:
134 | - MATRIX_EVAL="CC=clang-5.0 && CXX=clang++-5.0"
135 | compiler: clang
136 |
137 | - name: Xenial (clang++-6.0)
138 | os: linux
139 | dist: xenial
140 | addons:
141 | apt:
142 | sources:
143 | - ubuntu-toolchain-r-test
144 | - llvm-toolchain-xenial-6.0
145 | packages:
146 | - clang-6.0
147 | - libstdc++-6-dev
148 | env:
149 | - MATRIX_EVAL="CC=clang-6.0 && CXX=clang++-6.0"
150 | compiler: clang
151 |
152 | - name: Xenial (clang++-7.0)
153 | os: linux
154 | dist: xenial
155 | addons:
156 | apt:
157 | sources:
158 | - ubuntu-toolchain-r-test
159 | - llvm-toolchain-xenial-7
160 | packages:
161 | - clang-7
162 | - libstdc++-7-dev
163 | env:
164 | - MATRIX_EVAL="CC=clang-7 && CXX=clang++-7"
165 | compiler: clang
166 |
167 | # OSX begin
168 | - os: osx
169 | compiler: clang
170 |
171 | install:
172 | ############################################################################
173 | # Install a recent CMake (unless already installed on OS X)
174 | ############################################################################
175 | - CMAKE_VERSION=3.7.2
176 | - |
177 | if [[ "${TRAVIS_OS_NAME}" == "linux" ]]; then
178 | CMAKE_URL="https://cmake.org/files/v${CMAKE_VERSION%.[0-9]}/cmake-${CMAKE_VERSION}-Linux-x86_64.tar.gz"
179 | mkdir cmake && travis_retry wget --no-check-certificate -O - ${CMAKE_URL} | tar --strip-components=1 -xz -C cmake
180 | export PATH=${DEPS_DIR}/cmake/bin:${PATH}
181 | else
182 | brew install cmake || brew upgrade cmake
183 | fi
184 | - cmake --version
185 |
186 | before_script:
187 | - cd "${TRAVIS_BUILD_DIR}"
188 | # Overwrite default CC and CXX
189 | - eval "${COMPILERS}"
190 |
191 | script:
192 | - cmake .
193 | - make
194 | - |
195 | if [[ "${TRAVIS_OS_NAME}" == "linux" ]]; then
196 | sudo ./benchmark -r 10
197 | else
198 | ./benchmark -r 10
199 | fi
--------------------------------------------------------------------------------
/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | cmake_minimum_required(VERSION 2.8 FATAL_ERROR)
2 | project(libalgebra C CXX)
3 | set (CMAKE_BUILD_TYPE Release)
4 | set (CMAKE_CXX_FLAGS "-std=c++11")
5 |
6 | include_directories(.)
7 |
8 | add_executable(benchmark benchmark.cpp)
9 |
10 | install(FILES libalgebra.h DESTINATION ${CMAKE_INSTALL_PREFIX}/include)
11 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "[]"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright [yyyy] [name of copyright owner]
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | [](https://travis-ci.com/mklarqvist/libalgebra)
2 | [](https://ci.appveyor.com/project/mklarqvist/libalgebra)
3 | [](https://github.com/mklarqvist/libalgebra/releases)
4 | [](LICENSE)
5 |
6 | # libalgebra
7 |
8 | ```libalgebra.h``` is a header-only C/C++ library for:
9 | * counting the number of set bits ("population count", `popcnt`) in an array
10 | * counting the number of set bits at each position ("positional population count", `pospopcnt`) in an array
11 | * performing set algebraic operations on bitmaps, including union, intersection, and diff cardinalities
12 |
13 | using specialized CPU instructions i.e.
14 | [POPCNT](https://en.wikipedia.org/wiki/SSE4#POPCNT_and_LZCNT),
15 | [SSE4.2](https://en.wikipedia.org/wiki/SSE4#SSE4.2),
16 | [AVX2](https://en.wikipedia.org/wiki/Advanced_Vector_Extensions),
17 | [AVX512BW](https://en.wikipedia.org/wiki/Advanced_Vector_Extensions),
18 | [NEON](https://en.wikipedia.org/wiki/ARM_architecture#Advanced_SIMD_.28NEON.29). ```libalgebra.h``` has been tested successfully using the GCC,
19 | Clang and MSVC compilers.
20 |
21 | The core algorithms are described in the papers:
22 |
23 | * [Faster Population Counts using AVX2 Instructions](https://arxiv.org/abs/1611.07612) by Daniel Lemire, Nathan Kurz
24 | and Wojciech Muła (23 Nov 2016).
25 | * [Efficient Computation of Positional Population Counts Using SIMD Instructions](https://arxiv.org/abs/1911.02696) by Marcus D. R. Klarqvist and Wojciech Muła and Daniel Lemire
26 | * [Consistently faster and smaller compressed bitmaps with Roaring](https://arxiv.org/abs/1603.06549) by D. Lemire, G. Ssi-Yan-Kai,
27 | and O. Kaser (21 Mar 2016).
28 |
29 | ### Speedup
30 |
31 | Sample performance metrics (practical upper limit) on an AVX512BW machine. We simulate a single data array or pairs of data arrays in an aligned memory location and compute the same statistics many times using the command `benchmark -p -r 10000` (requires the Linux `perf` subsystem). This reflects the fastest possible throughput if you never have to leave the destination cache-level.
32 | The host architecture used is a 10 nm Cannon Lake [Core i3-8121U](https://ark.intel.com/content/www/us/en/ark/products/136863/intel-core-i3-8121u-processor-4m-cache-up-to-3-20-ghz.html) with gcc (GCC) 8.2.1 20180905 (Red Hat 8.2.1-3).
33 |
34 | ### POSPOPCNT
35 |
36 | This benchmark shows the speedup of the four `pospopcnt` algorithms used on x86
37 | CPUs compared to a naive unvectorized solution
38 | (`pospopcnt_u16_scalar_naive_nosimd`) for different array sizes (in number of
39 | 2-byte values).
40 |
41 | | Algorithm | 128 | 256 | 512 | 1024 | 2048 | 4096 | 8192 | 65536 |
42 | |-----------------------------------|------|-------|-------|-------|-------|-------|-------|--------|
43 | | pospopcnt_u16_sse_blend_popcnt_unroll8 | **8.28** | 9.84 | 10.55 | 11 | 11.58 | 11.93 | 12.13 | 12.28 |
44 | | pospopcnt_u16_avx512_blend_popcnt_unroll8 | 7.07 | **11.25** | **16.21** | 21 | 25.49 | 27.91 | 29.73 | 31.55 |
45 | | pospopcnt_u16_avx512_adder_forest | 3.05 | 2.82 | 14.53 | **23.13** | **34.37** | 44.91 | 52.78 | 61.68 |
46 | | pospopcnt_u16_avx512_harvey_seal | 2.07 | 2.3 | 8.21 | 15.41 | 28.17 | **49.14** | **76.11** | **138.71** |
47 |
48 | ### POPCNT
49 |
50 | Fold speedup compared to a naive unvectorized algorithm
51 | (`popcount_scalar_naive_nosimd`) for different array sizes as (CPU cycles/64-bit word, Instructions/64-bit word):
52 |
53 | | Words | libalgebra.h | Scalar | Speedup |
54 | |---------|--------------|---------------|---------|
55 | | 4 | 27.75 (37) | 26.75 (33.5) | 1 |
56 | | 8 | 16.38 (25.5) | 17.38 (30.25) | 1.1 |
57 | | 16 | 10.5 (19.94) | 12.75 (28.63) | 1.2 |
58 | | 32 | 7.72 (17.16) | 10.69 (27.81) | 1.4 |
59 | | 64 | 3.09 (4.36) | 9.61 (27.41) | 3.1 |
60 | | 128 | 2.53 (2.73) | 8.84 (27.2) | 3.5 |
61 | | 256 | 1.35 (1.7) | 8.5 (27.1) | 6.3 |
62 | | 512 | 0.67 (1.18) | 8.33 (27.05) | 12.4 |
63 | | 1024 | 0.5 (0.92) | 8.25 (27.03) | 16.4 |
64 | | 2048 | 0.41 (0.79) | 8.15 (27.01) | 20.1 |
65 | | 4096 | 0.46 (0.72) | 8.12 (27.01) | 17.8 |
66 | | 8192 | 0.39 (0.69) | 8.11 (27) | 21 |
67 | | 16384 | 0.39 (0.67) | 8.1 (27) | 20.6 |
68 | | 32768 | 0.89 (0.66) | 8.1 (27) | 9.1 |
69 | | 65536 | 0.84 (0.66) | 8.1 (27) | 9.6 |
70 | | 131072 | 0.68 (0.66) | 8.09 (27) | 11.9 |
71 | | 262144 | 1.11 (0.66) | 8.09 (27) | 7.3 |
72 | | 524288 | 1.84 (0.66) | 8.12 (27) | 4.4 |
73 | | 1048576 | 1.95 (0.66) | 8.15 (27) | 4.2 |
74 |
75 | ### Set algebra
76 |
77 | Fold speedup compared to a naive unvectorized solution (`*_scalar_naive_nosimd`)
78 | for different array sizes (in number of _pairs_ of 64-bit words, but results reported per _single_ 64-bit word). These
79 | functions are identical with the exception of the bitwise operator used (AND,
80 | OR, or XOR), which all have identical latency and throughput (CPI).
81 |
82 | | Words | libalgebra.h | Scalar | Speedup |
83 | |---------|--------------|---------------|---------|
84 | | 4 | 17.63 (8.63) | 14.63 (22.75) | 0.8 |
85 | | 8 | 8.13 (5.44) | 10 (20.88) | 1.2 |
86 | | 16 | 4.69 (3.84) | 7.91 (19.94) | 1.7 |
87 | | 32 | 2.38 (2.56) | 6.59 (19.47) | 2.8 |
88 | | 64 | 1.82 (2.06) | 5.87 (19.23) | 3.2 |
89 | | 128 | 0.88 (0.89) | 5.43 (19.12) | 6.2 |
90 | | 256 | 0.57 (0.64) | 5.18 (19.06) | 9.2 |
91 | | 512 | 0.41 (0.51) | 5.11 (19.03) | 12.4 |
92 | | 1024 | 0.33 (0.45) | 5.06 (19.02) | 15.3 |
93 | | 2048 | 0.39 (0.41) | 5.03 (19.01) | 13.1 |
94 | | 4096 | 0.36 (0.4) | 5.02 (19) | 13.9 |
95 | | 8192 | 0.37 (0.39) | 5.01 (19) | 13.7 |
96 | | 16384 | 0.55 (0.39) | 5.01 (19) | 9.1 |
97 | | 32768 | 0.55 (0.39) | 5 (19) | 9.2 |
98 | | 65536 | 0.52 (0.38) | 5 (19) | 9.7 |
99 | | 131072 | 0.56 (0.38) | 5.01 (19) | 9 |
100 | | 262144 | 1.25 (0.38) | 5.02 (19) | 4 |
101 | | 524288 | 1.76 (0.38) | 5.03 (19) | 2.9 |
102 | | 1048576 | 1.81 (0.38) | 5.07 (19) | 2.8 |
103 |
104 | ## C/C++ API
105 |
106 | ```C
107 | #include "libalgebra.h"
108 |
109 | /*
110 | * Count the number of 1 bits in the data array
111 | * @data: An array
112 | * @size: Size of data in bytes
113 | */
114 | uint64_t STORM_popcnt(const void* data, uint64_t size);
115 | ```
116 |
117 | ```C
118 | #include "libalgebra.h"
119 |
120 | /*
121 | * Count the number of 1 bits for each position in the data array
122 | * @data: A 16-bit array
123 | * @size: Size of data in bytes
124 | * @flags: Output vector[16]
125 | */
126 | uint32_t flags[16];
127 | int STORM_pospopcnt_u16(const uint16_t* data, uint32_t size, uint32_t* flags);
128 | ```
129 |
130 | ```C
131 | #include "libalgebra.h"
132 |
133 | /*
134 | * Compute the intersection, union, or diff cardinality between pairs of bitmaps
135 | * @data1: A 64-bit array
136 | * @data2: A 64-bit array
137 | * @size: Size of data in 64-bit words
138 | */
139 | // Intersect cardinality
140 | uint64_t STORM_intersect_count(const uint64_t* data1, const uint64_t* data2, const uint32_t size);
141 | // Union cardinality
142 | uint64_t STORM_union_count(const uint64_t* data1, const uint64_t* data2, const uint32_t size);
143 | // Diff cardinality
144 | uint64_t STORM_diff_count(const uint64_t* data1, const uint64_t* data2, const uint32_t size);
145 | ```
146 |
147 | ### Advanced use
148 |
149 | Retrieve a function pointer to the optimal function given the target length.
150 |
151 | ```C
152 | STORM_compute_func STORM_get_intersection_count_func(const size_t n_bitmaps_vector);
153 | STORM_compute_func STORM_get_union_count_func(const size_t n_bitmaps_vector);
154 | STORM_compute_func STORM_get_diff_count_func(const size_t n_bitmaps_vector);
155 | ```
156 |
157 | Portable memory alignment.
158 |
159 | ```C
160 | #include "libalgebra.h"
161 |
162 | void* STORM_aligned_malloc(size_t alignment, size_t size);
163 | void STORM_aligned_free(void* memblock);
164 | ```
165 |
166 | ## How it works
167 |
168 | On x86 CPUs ```libalgebra.h``` uses a combination of algorithms depending on the input vector size and what instruction set your CPU supports. These checks are performed during **run-time**.
--------------------------------------------------------------------------------
/appveyor.yml:
--------------------------------------------------------------------------------
1 | # Automated Windows (MSVC++) testing using appveyor.com
2 | # https://ci.appveyor.com/projects
3 |
4 | version: 1.0.{build}
5 |
6 | branches:
7 | except:
8 | - gh-pages
9 |
10 | os: Visual Studio 2017
11 |
12 | platform:
13 | - x86
14 | - x64
15 |
16 | build_script:
17 | - if "%platform%" == "x86" cmake -G "Visual Studio 15 2017" .
18 | - if "%platform%" == "x64" cmake -G "Visual Studio 15 2017 Win64" .
19 | - cmake --build . --config Release
20 |
21 | test_script:
22 | - ps: C:\projects\libalgebra\Release\benchmark.exe
--------------------------------------------------------------------------------
/benchmark.cpp:
--------------------------------------------------------------------------------
1 | #include "libalgebra.h"
2 |
3 | #include
4 | #include
5 | #include
6 | #include
7 | #include
8 | #if !defined(_MSC_VER)
9 | #include "getopt.h"
10 | #endif
11 |
// Allocates an aligned buffer of `n_bitmaps` 64-bit words and fills it with
// pseudo-random bits. Each word is assembled from two 32-bit draws of a
// Mersenne Twister seeded from std::random_device. Ownership passes to the
// caller, who must release the buffer with STORM_aligned_free().
// NOTE(review): this dump has stripped all angle-bracket contents, so the
// #include targets and template arguments are missing throughout the file;
// the distribution below was presumably
// std::uniform_int_distribution<uint32_t> -- confirm against the original.
12 | uint64_t* generate_random_data(uint32_t n_bitmaps) {
13 |     // Clear data
14 |     // uint32_t n_bitmaps = ceil(n / 64.0);
15 |     // memset(data, 0, sizeof(uint64_t)*n_bitmaps);
16 |     uint64_t* mem = (uint64_t*)STORM_aligned_malloc(STORM_get_alignment(), n_bitmaps*sizeof(uint64_t));
17 |
18 |     // PRNG
19 |     std::uniform_int_distribution distr(0, std::numeric_limits::max()-1); // right inclusive
20 |     std::random_device rd; // obtain a random number from hardware
21 |     std::mt19937 eng(rd()); // seed the generator
22 |
23 |     // Generate some random data.
24 |     uint32_t n_unique = 0; // NOTE(review): unused leftover from the commented-out uniqueness loop below
25 |     // while (n_unique < n) {
26 |     for (int i = 0; i < n_bitmaps; ++i) { // NOTE(review): int vs uint32_t comparison (signed/unsigned mismatch)
27 |         uint32_t val1 = distr(eng);
28 |         uint32_t val2 = distr(eng);
29 |         uint64_t x = ((uint64_t)val1 << 32) | val2; // pack two 32-bit draws into one 64-bit word
30 |         mem[i] = x;
31 |     }
32 |
33 |     return mem;
34 | }
35 |
// Baseline scalar population count over `len` bytes using the byte lookup
// table STORM_popcnt_lookup8bit, manually unrolled 8 bytes per iteration.
// The GCC-only `optimize("no-tree-vectorize")` attribute stops the compiler
// from auto-vectorizing this loop, so it stays an honest "no SIMD" reference
// for the speedup tables in the README (clang/MSVC do not accept it).
// NOTE(review): the loop assumes `len` is a multiple of 8 (see the
// commented-out assert); a non-multiple would index past `len` in the last
// iteration -- confirm all callers pass whole 64-bit words.
36 | #if !defined(__clang__) && !defined(_MSC_VER)
37 | __attribute__((optimize("no-tree-vectorize")))
38 | #endif
39 | uint64_t popcount_scalar_naive_nosimd(const uint8_t* data, size_t len) {
40 |     uint64_t total = 0;
41 |     // for (int i = 0; i < len; ++i) {
42 |     //     total += STORM_popcount64(data1[i] & data2[i]);
43 |     // }
44 |     // assert(len % 8 == 0);
45 |
46 |     for (int j = 0; j < len; j += 8) { // NOTE(review): int vs size_t comparison (signed/unsigned mismatch)
47 |         // total += STORM_popcount64(data[i]);
48 |         // diff = data1[i] & data2[i];
49 |         total += STORM_popcnt_lookup8bit[data[j+0]];
50 |         total += STORM_popcnt_lookup8bit[data[j+1]];
51 |         total += STORM_popcnt_lookup8bit[data[j+2]];
52 |         total += STORM_popcnt_lookup8bit[data[j+3]];
53 |         total += STORM_popcnt_lookup8bit[data[j+4]];
54 |         total += STORM_popcnt_lookup8bit[data[j+5]];
55 |         total += STORM_popcnt_lookup8bit[data[j+6]];
56 |         total += STORM_popcnt_lookup8bit[data[j+7]];
57 |     }
58 |
59 |     return total;
60 | }
61 |
62 | #ifdef __linux__
63 |
64 | #include // for __NR_perf_event_open
65 | #include // for perf event constants
66 | #include // for ioctl
67 | #include // for syscall
68 | #include
69 | #include // for errno
70 | #include // for memset
71 | #include
72 |
73 | #include
74 |
// RAII-ish wrapper around the Linux perf_event_open(2) syscall: opens one
// counter per requested config, grouped under a single leader fd, so all
// counters can be reset/enabled/disabled/read together with one ioctl/read.
// NOTE(review): angle-bracket contents were stripped from this dump -- the
// template parameter (presumably `template <int TYPE>`) and the element
// types of the vectors below are missing; confirm against the original.
75 | template
76 | class LinuxEvents {
77 |   int fd;                          // fd of the group leader (last fd opened; see NOTE in ctor)
78 |   bool working;                    // set false after the first reported error, to avoid log spam
79 |   perf_event_attr attribs;
80 |   int num_events;
81 |   std::vector temp_result_vec;     // raw PERF_FORMAT_GROUP read buffer: [nr, val0, id0, val1, id1, ...]
82 |   std::vector ids;                 // perf event ids, one per counter
83 |
84 | public:
85 |   explicit LinuxEvents(std::vector config_vec) : fd(0), working(true) {
86 |     memset(&attribs, 0, sizeof(attribs));
87 |     attribs.type = TYPE;
88 |     attribs.size = sizeof(attribs);
89 |     attribs.disabled = 1;          // counters start disabled; enabled explicitly in start()
90 |     attribs.exclude_kernel = 1;
91 |     attribs.exclude_hv = 1;
92 |
93 |     attribs.sample_period = 0;
94 |     attribs.read_format = PERF_FORMAT_GROUP | PERF_FORMAT_ID;
95 |     const int pid = 0; // the current process
96 |     const int cpu = -1; // all CPUs
97 |     const unsigned long flags = 0;
98 |
99 |     int group = -1; // no group
100 |     num_events = config_vec.size();
101 |     uint32_t i = 0;
102 |     for (auto config : config_vec) {
103 |       attribs.config = config;
104 |       fd = syscall(__NR_perf_event_open, &attribs, pid, cpu, group, flags);
105 |       if (fd == -1) {
106 |         report_error("perf_event_open");
107 |       }
108 |       // NOTE(review): `ids` is never resized before this write -- &ids[i++]
109 |       // indexes an empty vector, which is out-of-bounds / UB; it should be
110 |       // sized to config_vec.size() first. Confirm against the original.
111 |       ioctl(fd, PERF_EVENT_IOC_ID, &ids[i++]);
112 |       if (group == -1) {
113 |         group = fd; // first fd opened becomes the group leader for subsequent opens
114 |       }
115 |     }
116 |
117 |     temp_result_vec.resize(num_events * 2 + 1); // nr + (value, id) pair per event
118 |   }
119 |
120 |   // NOTE(review): `fd` holds only the most recently opened fd, so the
121 |   // earlier counters' fds (including the group leader) are never closed.
122 |   ~LinuxEvents() { close(fd); }
123 |
124 |   // Resets all counters in the group to zero and starts counting.
125 |   inline void start() {
126 |     if (ioctl(fd, PERF_EVENT_IOC_RESET, PERF_IOC_FLAG_GROUP) == -1) {
127 |       report_error("ioctl(PERF_EVENT_IOC_RESET)");
128 |     }
129 |
130 |     if (ioctl(fd, PERF_EVENT_IOC_ENABLE, PERF_IOC_FLAG_GROUP) == -1) {
131 |       report_error("ioctl(PERF_EVENT_IOC_ENABLE)");
132 |     }
133 |   }
134 |
135 |   // Stops the group and copies one counter value per event into `results`.
136 |   // `results` must already be sized to the number of events.
137 |   inline void end(std::vector &results) {
138 |     if (ioctl(fd, PERF_EVENT_IOC_DISABLE, PERF_IOC_FLAG_GROUP) == -1) {
139 |       report_error("ioctl(PERF_EVENT_IOC_DISABLE)");
140 |     }
141 |
142 |     if (read(fd, &temp_result_vec[0], temp_result_vec.size() * 8) == -1) {
143 |       report_error("read");
144 |     }
145 |     // our actual results are in slots 1,3,5, ... of this structure
146 |     // we really should be checking our ids obtained earlier to be safe
147 |     for (uint32_t i = 1; i < temp_result_vec.size(); i += 2) {
148 |       results[i / 2] = temp_result_vec[i];
149 |     }
150 |   }
151 |
152 | private:
153 |   // Prints `context: strerror(errno)` once, then silences further errors.
154 |   void report_error(const std::string &context) {
155 |     if (working)
156 |       std::cerr << (context + ": " + std::string(strerror(errno))) << std::endl;
157 |     working = false;
158 |   }
159 | };
151 |
// Element-wise minimum across all measurement runs: allresults[k] holds the
// counter readings of run k; the result has one entry per counter. The
// minimum over runs is the conventional "best case" perf metric (least noise).
// Returns an empty vector when there are no runs.
// NOTE(review): the matrix is taken by value (full copy); `const&` would do --
// left unchanged here. Element types were stripped with the angle brackets.
152 | std::vector
153 | compute_mins(std::vector< std::vector > allresults) {
154 |   if (allresults.size() == 0)
155 |     return std::vector();
156 |
157 |   std::vector answer = allresults[0];
158 |
159 |   for (size_t k = 1; k < allresults.size(); k++) {
160 |     assert(allresults[k].size() == answer.size());
161 |     for (size_t z = 0; z < answer.size(); z++) {
162 |       if (allresults[k][z] < answer[z])
163 |         answer[z] = allresults[k][z];
164 |     }
165 |   }
166 |   return answer;
167 | }
168 |
// Element-wise mean across all measurement runs (companion to compute_mins).
// Returns an empty vector when there are no runs.
// NOTE(review): element types were stripped with the angle brackets; if the
// elements are integers (as the perf counters suggest), the division below
// truncates -- confirm whether that is intended.
169 | std::vector
170 | compute_averages(std::vector< std::vector > allresults) {
171 |   if (allresults.size() == 0)
172 |     return std::vector();
173 |
174 |   std::vector answer(allresults[0].size());
175 |
176 |   // Accumulate per-counter sums over every run.
177 |   for (size_t k = 0; k < allresults.size(); k++) {
178 |     assert(allresults[k].size() == answer.size());
179 |     for (size_t z = 0; z < answer.size(); z++) {
180 |       answer[z] += allresults[k][z];
181 |     }
182 |   }
183 |
184 |   // Divide each sum by the number of runs.
185 |   for (size_t z = 0; z < answer.size(); z++) {
186 |     answer[z] /= allresults.size();
187 |   }
188 |   return answer;
189 | }
188 |
// Benchmark a pairwise set-algebra counting kernel `f` (intersect/union/diff
// cardinality) under the Linux perf_event subsystem. Each iteration
// generates fresh random input, measures exactly one call to `f`, and
// records the raw counter values; the per-counter minima and averages over
// all iterations are then printed.
//
// Counter slots are fixed by the registration order below:
//   0 = cycles, 1 = instructions, 2 = branch misses,
//   3 = cache references, 4 = cache misses, 5 = reference cycles.
//
// `verbose` selects one tab-separated machine-readable row versus a
// multi-line human-readable report. Always returns 1.
//
// NOTE(review): `range` and the <random> generator set up below are never
// used in this function — presumably leftovers; confirm before removing.
int linux_set_algebra_wrapper(std::string name,
    STORM_compute_func f,
    int iterations,
    uint32_t range,
    uint32_t n_values,
    uint32_t n_bitmaps,
    bool verbose)
{
    // Hardware events to capture; this order defines the result slots.
    std::vector evts;
    evts.push_back(PERF_COUNT_HW_CPU_CYCLES);
    evts.push_back(PERF_COUNT_HW_INSTRUCTIONS);
    evts.push_back(PERF_COUNT_HW_BRANCH_MISSES);
    evts.push_back(PERF_COUNT_HW_CACHE_REFERENCES);
    evts.push_back(PERF_COUNT_HW_CACHE_MISSES);
    evts.push_back(PERF_COUNT_HW_REF_CPU_CYCLES);
    LinuxEvents unified(evts);
    std::vector results; // tmp buffer
    std::vector< std::vector > allresults;
    results.resize(evts.size());

    // NOTE(review): unused random machinery (see function comment above).
    std::random_device rd;
    std::mt19937 gen(rd());
    std::uniform_int_distribution<> dis(0, 0xFFFF);

    volatile uint64_t total = 0; // volatile prevents the compiler from removing the work through optimization
    for (uint32_t i = 0; i < iterations; i++) { // NOTE(review): signed/unsigned comparison (iterations is int)
        uint64_t* mem1 = generate_random_data(n_values);
        uint64_t* mem2 = generate_random_data(n_values);

        unified.start();
        // Call argument subroutine pointer; only this call is measured.
        total += (*f)(mem1, mem2, n_bitmaps);
        unified.end(results);
        allresults.push_back(results);

        STORM_aligned_free(mem1);
        STORM_aligned_free(mem2);
    }

    std::vector mins = compute_mins(allresults);
    std::vector avg = compute_averages(allresults);

    if (verbose) {
        // One machine-readable row: derived columns first (instructions per
        // cycle, cycles/word, instructions/word), then the raw minima and
        // averages. The 2*n_bitmaps divisor reflects that two input arrays
        // are streamed per call.
        printf("%s\t%u\t%.2f\t%.3f\t%.3f\t%llu\t%llu\t%llu\t%llu\t%llu\t%.1f\t%.1f\t%.1f\t%.1f\t%.1f\n",
            name.c_str(),
            n_bitmaps,
            double(mins[1]) / mins[0],
            double(mins[0]) / (2*n_bitmaps),
            double(mins[1]) / (2*n_bitmaps),
            mins[0],
            mins[1],
            mins[2],
            mins[3],
            mins[4],
            avg[0],
            avg[1],
            avg[2],
            avg[3],
            avg[4]);
    } else {
        printf("%s-%u:\n",name.c_str(),n_bitmaps);
        printf("instructions per cycle %4.2f, cycles per 64-bit word: %4.3f, "
               "instructions per 64-bit word %4.3f \n",
               double(mins[1]) / mins[0], double(mins[0]) / (2*n_bitmaps), double(mins[1]) / (2*n_bitmaps));
        // first we display mins
        printf("min: %8llu cycles, %8llu instructions, \t%8llu branch mis., %8llu "
               "cache ref., %8llu cache mis.\n",
               mins[0], mins[1], mins[2], mins[3], mins[4]);
        printf("avg: %8.1f cycles, %8.1f instructions, \t%8.1f branch mis., %8.1f "
               "cache ref., %8.1f cache mis.\n",
               avg[0], avg[1], avg[2], avg[3], avg[4]);
    }

    return 1;
}
264 |
// Benchmark a population-count kernel `f` with Linux perf hardware counters.
// Structurally identical to linux_set_algebra_wrapper() but runs on a single
// input array and cross-checks every result against the scalar reference
// implementation. Always returns 1.
//
// Counter slots are fixed by the registration order below:
//   0 = cycles, 1 = instructions, 2 = branch misses,
//   3 = cache references, 4 = cache misses, 5 = reference cycles.
//
// NOTE(review): `range` and the <random> generator set up below are never
// used in this function.
int linux_popcount_wrapper(std::string name,
    STORM_popcnt_func f,
    int iterations,
    uint32_t range,
    uint32_t n_values,
    uint32_t n_bitmaps,
    bool verbose)
{
    std::vector evts;
    evts.push_back(PERF_COUNT_HW_CPU_CYCLES);
    evts.push_back(PERF_COUNT_HW_INSTRUCTIONS);
    evts.push_back(PERF_COUNT_HW_BRANCH_MISSES);
    evts.push_back(PERF_COUNT_HW_CACHE_REFERENCES);
    evts.push_back(PERF_COUNT_HW_CACHE_MISSES);
    evts.push_back(PERF_COUNT_HW_REF_CPU_CYCLES);
    LinuxEvents unified(evts);
    std::vector results; // tmp buffer
    std::vector< std::vector > allresults;
    results.resize(evts.size());

    // NOTE(review): unused random machinery (see function comment above).
    std::random_device rd;
    std::mt19937 gen(rd());
    std::uniform_int_distribution<> dis(0, 0xFFFF);

    volatile uint64_t total = 0; // volatile prevents the compiler from removing the work through optimization
    for (uint32_t i = 0; i < iterations; i++) { // NOTE(review): signed/unsigned comparison (iterations is int)
        uint64_t* mem1 = generate_random_data(n_values);

        unified.start();
        // Call argument subroutine pointer; n_bitmaps*8 converts 64-bit
        // words into the byte count the popcount kernels expect.
        uint64_t a = (*f)((uint8_t*)mem1, n_bitmaps*8);
        unified.end(results);
        allresults.push_back(results);

        // Validate against the scalar reference. This check (and the call
        // feeding it) vanishes entirely in NDEBUG builds.
        uint64_t b = popcount_scalar_naive_nosimd((uint8_t*)mem1, n_bitmaps*8);
        assert(a == b);
        total += a;

        STORM_aligned_free(mem1);
    }

    std::vector mins = compute_mins(allresults);
    std::vector avg = compute_averages(allresults);

    if (verbose) {
        // One machine-readable row. Unlike the set-algebra wrapper, the
        // per-word divisors use plain n_bitmaps: only one array is streamed.
        printf("%s\t%u\t%.2f\t%.3f\t%.3f\t%llu\t%llu\t%llu\t%llu\t%llu\t%.1f\t%.1f\t%.1f\t%.1f\t%.1f\n",
            name.c_str(),
            n_bitmaps,
            double(mins[1]) / mins[0],
            double(mins[0]) / (n_bitmaps),
            double(mins[1]) / (n_bitmaps),
            mins[0],
            mins[1],
            mins[2],
            mins[3],
            mins[4],
            avg[0],
            avg[1],
            avg[2],
            avg[3],
            avg[4]);
    } else {
        printf("%s-%u:\n",name.c_str(),n_bitmaps);
        printf("instructions per cycle %4.2f, cycles per 64-bit word: %4.3f, "
               "instructions per 64-bit word %4.3f \n",
               double(mins[1]) / mins[0], double(mins[0]) / (n_bitmaps), double(mins[1]) / (n_bitmaps));
        // first we display mins
        printf("min: %8llu cycles, %8llu instructions, \t%8llu branch mis., %8llu "
               "cache ref., %8llu cache mis.\n",
               mins[0], mins[1], mins[2], mins[3], mins[4]);
        printf("avg: %8.1f cycles, %8.1f instructions, \t%8.1f branch mis., %8.1f "
               "cache ref., %8.1f cache mis.\n",
               avg[0], avg[1], avg[2], avg[3], avg[4]);
    }

    return 1;
}
342 | #endif // end is linux
343 |
// Accumulator for per-kernel benchmark statistics. The plain members
// accumulate across runs (`+=` in the wrappers); the `*_local` members hold
// only the most recent run's value.
struct bench_unit {
    bench_unit() = default;

    bool     valid        = false; // set once a measurement has been recorded
    float    cycles       = 0;     // accumulated mean cycles per iteration
    float    cycles_local = 0;     // mean cycles from the latest run
    uint64_t times        = 0;     // accumulated mean time per iteration
    uint64_t times_local  = 0;     // mean time from the latest run
};
353 |
// Read the CPU time-stamp counter.
// The original encoded RDTSC as raw bytes (".byte 15;.byte 49" == 0F 31);
// the mnemonic is identical and far more readable. RDTSC leaves the low
// 32 bits in EAX and the high 32 bits in EDX, which the shift/or folds
// into a single 64-bit value. (Also drops the stray ';' that followed the
// function body.)
uint64_t get_cpu_cycles() {
    uint64_t result;
#ifndef _MSC_VER
    __asm__ volatile("rdtsc; shlq $32, %%rdx; orq %%rdx, %%rax"
                     : "=a" (result)
                     :
                     : "%rdx");
#else
    result = __rdtsc();
#endif
    return result;
}
364 |
365 | #if !defined(__clang__) && !defined(_MSC_VER)
366 | __attribute__((optimize("no-tree-vectorize")))
367 | #endif
368 | uint64_t intersect_scalar_naive_nosimd(const uint64_t* STORM_RESTRICT data1,const uint64_t* STORM_RESTRICT data2, size_t len) {
369 | uint64_t total = 0;
370 | // for (int i = 0; i < len; ++i) {
371 | // total += STORM_popcount64(data1[i] & data2[i]);
372 | // }
373 |
374 | uint64_t diff;
375 | uint8_t* b8 = (uint8_t*)&diff;
376 | for (int i = 0; i < len; ++i) {
377 | // total += STORM_popcount64(data1[i] & data2[i]);
378 | diff = data1[i] & data2[i];
379 | total += STORM_popcnt_lookup8bit[b8[0]];
380 | total += STORM_popcnt_lookup8bit[b8[1]];
381 | total += STORM_popcnt_lookup8bit[b8[2]];
382 | total += STORM_popcnt_lookup8bit[b8[3]];
383 | total += STORM_popcnt_lookup8bit[b8[4]];
384 | total += STORM_popcnt_lookup8bit[b8[5]];
385 | total += STORM_popcnt_lookup8bit[b8[6]];
386 | total += STORM_popcnt_lookup8bit[b8[7]];
387 | }
388 |
389 | return total;
390 | }
391 |
392 | #if !defined(__clang__) && !defined(_MSC_VER)
393 | __attribute__((optimize("no-tree-vectorize")))
394 | #endif
395 | uint64_t union_scalar_naive_nosimd(const uint64_t* STORM_RESTRICT data1,const uint64_t* STORM_RESTRICT data2, size_t len) {
396 | uint64_t total = 0;
397 | // for (int i = 0; i < len; ++i) {
398 | // total += STORM_popcount64(data1[i] | data2[i]);
399 | // }
400 |
401 | uint64_t diff;
402 | uint8_t* b8 = (uint8_t*)&diff;
403 | for (int i = 0; i < len; ++i) {
404 | // total += STORM_popcount64(data1[i] | data2[i]);
405 | diff = data1[i] | data2[i];
406 | total += STORM_popcnt_lookup8bit[b8[0]];
407 | total += STORM_popcnt_lookup8bit[b8[1]];
408 | total += STORM_popcnt_lookup8bit[b8[2]];
409 | total += STORM_popcnt_lookup8bit[b8[3]];
410 | total += STORM_popcnt_lookup8bit[b8[4]];
411 | total += STORM_popcnt_lookup8bit[b8[5]];
412 | total += STORM_popcnt_lookup8bit[b8[6]];
413 | total += STORM_popcnt_lookup8bit[b8[7]];
414 | }
415 |
416 | return total;
417 | }
418 |
419 | #if !defined(__clang__) && !defined(_MSC_VER)
420 | __attribute__((optimize("no-tree-vectorize")))
421 | #endif
422 | uint64_t diff_scalar_naive_nosimd(const uint64_t* STORM_RESTRICT data1,const uint64_t* STORM_RESTRICT data2, size_t len) {
423 | uint64_t total = 0;
424 | // for (int i = 0; i < len; ++i) {
425 | // total += STORM_popcount64(data1[i] ^ data2[i]);
426 | // }
427 |
428 | uint64_t diff;
429 | uint8_t* b8 = (uint8_t*)&diff;
430 | for (int i = 0; i < len; ++i) {
431 | // total += STORM_popcount64(data1[i] ^ data2[i]);
432 | diff = data1[i] ^ data2[i];
433 | total += STORM_popcnt_lookup8bit[b8[0]];
434 | total += STORM_popcnt_lookup8bit[b8[1]];
435 | total += STORM_popcnt_lookup8bit[b8[2]];
436 | total += STORM_popcnt_lookup8bit[b8[3]];
437 | total += STORM_popcnt_lookup8bit[b8[4]];
438 | total += STORM_popcnt_lookup8bit[b8[5]];
439 | total += STORM_popcnt_lookup8bit[b8[6]];
440 | total += STORM_popcnt_lookup8bit[b8[7]];
441 | }
442 |
443 | return total;
444 | }
445 |
// Time-point alias used by the timing wrappers below. (The old comment
// called this a "microsecond timer" — it is a high_resolution_clock
// time_point; the wrappers convert differences of these to a duration
// themselves.)
using clockdef = std::chrono::high_resolution_clock::time_point;
448 |
// Time a pairwise set-algebra counting kernel `f` over `iterations`
// back-to-back calls using the serialized CPUID/RDTSC ... RDTSCP/CPUID
// sequence from Intel's benchmarking white paper, alongside a wall-clock
// measurement. Prints one tab-separated row: name, words, checksum,
// mean cycles/iter, mean time/iter (presumably nanoseconds given the 1e9
// divisor below — the duration_cast argument was lost in extraction;
// confirm), cycles per word, and MB/s throughput. Accumulates the means
// into `unit` and returns 0.
//
// NOTE(review): `range` and the `clocks`/`times` vectors below are never
// used in this function.
int set_algebra_wrapper(std::string name,
    STORM_compute_func f,
    int iterations,
    uint32_t range,
    uint32_t n_values,
    size_t n_bitmaps,
    bench_unit& unit)
{
    uint32_t cycles_low = 0, cycles_high = 0;
    uint32_t cycles_low1 = 0, cycles_high1 = 0;
    // Start timer.

    std::vector clocks;
    std::vector times;

#ifndef _MSC_VER
    // Intel guide:
    // @see: https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/ia-32-ia-64-benchmark-code-execution-paper.pdf
    // Two warm-up rounds of the serialized timestamp sequences so the
    // measured reads below do not pay first-use penalties.
    asm volatile ("CPUID\n\t"
        "RDTSC\n\t"
        "mov %%edx, %0\n\t"
        "mov %%eax, %1\n\t": "=r" (cycles_high), "=r" (cycles_low):: "%rax", "%rbx", "%rcx", "%rdx");
    asm volatile("RDTSCP\n\t"
        "mov %%edx, %0\n\t"
        "mov %%eax, %1\n\t"
        "CPUID\n\t": "=r" (cycles_high1), "=r" (cycles_low1):: "%rax", "%rbx", "%rcx", "%rdx");
    asm volatile ("CPUID\n\t"
        "RDTSC\n\t"
        "mov %%edx, %0\n\t"
        "mov %%eax, %1\n\t": "=r" (cycles_high), "=r" (cycles_low):: "%rax", "%rbx", "%rcx", "%rdx");
    asm volatile("RDTSCP\n\t"
        "mov %%edx, %0\n\t"
        "mov %%eax, %1\n\t"
        "CPUID\n\t": "=r" (cycles_high1), "=r" (cycles_low1):: "%rax", "%rbx", "%rcx", "%rdx");
#endif
    uint64_t* mem1 = generate_random_data(n_values);
    uint64_t* mem2 = generate_random_data(n_values);

    volatile uint64_t total = 0; // volatile prevents the compiler from removing the work through optimization
    clockdef t1 = std::chrono::high_resolution_clock::now();

#ifdef __linux__
    // unsigned long flags;
    // preempt_disable(); /*we disable preemption on our CPU*/
    // raw_local_irq_save(flags); /*we disable hard interrupts on our CPU*/
    /*at this stage we exclusively own the CPU*/
#endif

#ifndef _MSC_VER
    // CPUID serializes the pipeline before RDTSC reads the start timestamp.
    asm volatile ("CPUID\n\t"
        "RDTSC\n\t"
        "mov %%edx, %0\n\t"
        "mov %%eax, %1\n\t": "=r" (cycles_high), "=r" (cycles_low):: "%rax", "%rbx", "%rcx", "%rdx");
#endif

    // All iterations run inside one timed region (no per-call overhead).
    for (int i = 0; i < iterations; ++i) {
        // Call argument subroutine pointer.
        total += (*f)(mem1, mem2, n_bitmaps);
    }

#ifndef _MSC_VER
    // RDTSCP waits for prior instructions to retire before reading the end
    // timestamp; the trailing CPUID stops later code from drifting upward.
    asm volatile("RDTSCP\n\t"
        "mov %%edx, %0\n\t"
        "mov %%eax, %1\n\t"
        "CPUID\n\t": "=r" (cycles_high1), "=r" (cycles_low1):: "%rax", "%rbx", "%rcx", "%rdx");
#endif
#ifdef __linux__
    // raw_local_irq_restore(flags);/*we enable hard interrupts on our CPU*/
    // preempt_enable();/*we enable preemption*/
#endif

    clockdef t2 = std::chrono::high_resolution_clock::now();
    auto time_span = std::chrono::duration_cast(t2 - t1);

    STORM_aligned_free(mem1);
    STORM_aligned_free(mem2);

    // Reassemble the 64-bit timestamps from their EDX:EAX halves.
    uint64_t start = ( ((uint64_t)cycles_high << 32) | cycles_low );
    uint64_t end = ( ((uint64_t)cycles_high1 << 32) | cycles_low1 );

    double mean_cycles = (end - start) / (double)iterations;
    uint32_t mean_time = time_span.count() / (double)iterations; // NOTE(review): truncates to an integer

    // Row: name, words, checksum, cycles/iter, time/iter, cycles/word, MB/s.
    // The factor 2 accounts for the two input arrays streamed per call.
    std::cout << name << "\t" << n_bitmaps << "\t" << total << "\t" <<
        mean_cycles << "\t" <<
        mean_time << "\t" <<
        mean_cycles / n_bitmaps << "\t" <<
        ((n_bitmaps*2*sizeof(uint64_t)) / (1024*1024.0)) / (mean_time / 1000000000.0) << std::endl;

    unit.times += mean_time;
    unit.times_local = mean_time;
    unit.cycles += mean_cycles;
    unit.cycles_local = mean_cycles;
    unit.valid = 1;

    return 0;
}
546 |
547 | int popcount_wrapper(std::string name,
548 | STORM_popcnt_func f,
549 | int iterations,
550 | uint32_t range,
551 | uint32_t n_values,
552 | uint32_t n_bitmaps,
553 | bench_unit& unit)
554 | {
555 | uint32_t cycles_low = 0, cycles_high = 0;
556 | uint32_t cycles_low1 = 0, cycles_high1 = 0;
557 | // Start timer.
558 |
559 | std::vector clocks;
560 | std::vector times;
561 |
562 | #ifndef _MSC_VER
563 | // Intel guide:
564 | // @see: https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/ia-32-ia-64-benchmark-code-execution-paper.pdf
565 | asm volatile ("CPUID\n\t"
566 | "RDTSC\n\t"
567 | "mov %%edx, %0\n\t"
568 | "mov %%eax, %1\n\t": "=r" (cycles_high), "=r" (cycles_low):: "%rax", "%rbx", "%rcx", "%rdx");
569 | asm volatile("RDTSCP\n\t"
570 | "mov %%edx, %0\n\t"
571 | "mov %%eax, %1\n\t"
572 | "CPUID\n\t": "=r" (cycles_high1), "=r" (cycles_low1):: "%rax", "%rbx", "%rcx", "%rdx");
573 | asm volatile ("CPUID\n\t"
574 | "RDTSC\n\t"
575 | "mov %%edx, %0\n\t"
576 | "mov %%eax, %1\n\t": "=r" (cycles_high), "=r" (cycles_low):: "%rax", "%rbx", "%rcx", "%rdx");
577 | asm volatile("RDTSCP\n\t"
578 | "mov %%edx, %0\n\t"
579 | "mov %%eax, %1\n\t"
580 | "CPUID\n\t": "=r" (cycles_high1), "=r" (cycles_low1):: "%rax", "%rbx", "%rcx", "%rdx");
581 | #endif
582 | uint64_t* mem = generate_random_data(n_values);
583 |
584 | volatile uint64_t total = 0; // voltatile to prevent compiler to remove work through optimization
585 | clockdef t1 = std::chrono::high_resolution_clock::now();
586 |
587 | #ifdef __linux__
588 | // unsigned long flags;
589 | // preempt_disable(); /*we disable preemption on our CPU*/
590 | // raw_local_irq_save(flags); /*we disable hard interrupts on our CPU*/
591 | /*at this stage we exclusively own the CPU*/
592 | #endif
593 |
594 | #ifndef _MSC_VER
595 | asm volatile ("CPUID\n\t"
596 | "RDTSC\n\t"
597 | "mov %%edx, %0\n\t"
598 | "mov %%eax, %1\n\t": "=r" (cycles_high), "=r" (cycles_low):: "%rax", "%rbx", "%rcx", "%rdx");
599 | #endif
600 |
601 | size_t n_b = n_bitmaps*8;
602 | for (int i = 0; i < iterations; ++i) {
603 | // Call argument subroutine pointer.
604 | total += (*f)((uint8_t*)mem, n_b);
605 | }
606 |
607 | #ifndef _MSC_VER
608 | asm volatile("RDTSCP\n\t"
609 | "mov %%edx, %0\n\t"
610 | "mov %%eax, %1\n\t"
611 | "CPUID\n\t": "=r" (cycles_high1), "=r" (cycles_low1):: "%rax", "%rbx", "%rcx", "%rdx");
612 | #endif
613 | #ifdef __linux__
614 | // raw_local_irq_restore(flags);/*we enable hard interrupts on our CPU*/
615 | // preempt_enable();/*we enable preemption*/
616 | #endif
617 |
618 | clockdef t2 = std::chrono::high_resolution_clock::now();
619 | auto time_span = std::chrono::duration_cast(t2 - t1);
620 |
621 | STORM_aligned_free(mem);
622 |
623 | uint64_t start = ( ((uint64_t)cycles_high << 32) | cycles_low );
624 | uint64_t end = ( ((uint64_t)cycles_high1 << 32) | cycles_low1 );
625 |
626 | double mean_cycles = (end - start) / (double)iterations;
627 | uint32_t mean_time = time_span.count() / (double)iterations;
628 |
629 | std::cout << name << "\t" << n_bitmaps << "\t" << total << "\t" <<
630 | mean_cycles << "\t" <<
631 | mean_time << "\t" <<
632 | mean_cycles / n_bitmaps << "\t" <<
633 | ((n_bitmaps*2*sizeof(uint64_t)) / (1024*1024.0)) / (mean_time / 1000000000.0) << std::endl;
634 |
635 | unit.times += mean_time;
636 | unit.times_local = mean_time;
637 | unit.cycles += mean_cycles;
638 | unit.cycles_local = mean_cycles;
639 | unit.valid = 1;
640 |
641 | return 0;
642 | }
643 |
644 | int benchmark(int n_repetitions, bool use_perf = false) {
645 | // Align some bitmaps.
646 | uint64_t* bitmaps = (uint64_t*)STORM_aligned_malloc(STORM_get_alignment(), 1048576*sizeof(uint64_t));
647 | uint64_t* bitmaps2 = (uint64_t*)STORM_aligned_malloc(STORM_get_alignment(), 1048576*sizeof(uint64_t));
648 |
649 | std::vector ranges = {4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768,65536,131072,262144,524288,1048576};
650 | std::vector reps;
651 | if (n_repetitions <= 0) {
652 | reps = {5000,5000,5000,5000,5000,2500,2500,2500,2500,2500,150,150,150,150,150,150,150,100,100,100};
653 | } else {
654 | reps = std::vector(ranges.size(), n_repetitions);
655 | }
656 |
657 | if (use_perf) {
658 | #ifndef __linux__
659 | std::cerr << "perf counter are only available on Linux systems!" << std::endl;
660 | exit(EXIT_FAILURE);
661 | #endif
662 | printf("Algorithm\tWords\tInstructions/cycle\tCycles/word\tInstructions/word\tMinCycles\tMinInstructions\tMinBranchMiss\tMinCacheRef\tminCacheMiss\tAvgCycles\tAvgInstructions\tAvgBranchMiss\tAvgCacheRef\tAvgCacheMiss\n");
663 | }
664 |
665 |
666 | for (int i = 0; i < ranges.size(); ++i) {
667 | bench_unit unit_intsec, unit_union, unit_diff;
668 |
669 | if (use_perf) {
670 | #ifdef __linux__
671 | linux_popcount_wrapper("popcount-naive",&popcount_scalar_naive_nosimd, reps[i], ranges[i], ranges[i], ranges[i], true);
672 | linux_popcount_wrapper("popcount",&STORM_popcnt, reps[i], ranges[i], ranges[i], ranges[i], true);
673 | linux_set_algebra_wrapper("intersect-naive",&intersect_scalar_naive_nosimd, reps[i], ranges[i], ranges[i], ranges[i], true);
674 | linux_set_algebra_wrapper("intersect",STORM_get_intersect_count_func(ranges[i]), reps[i], ranges[i], ranges[i], ranges[i], true);
675 | linux_set_algebra_wrapper("union-naive",&union_scalar_naive_nosimd, reps[i], ranges[i], ranges[i], ranges[i], true);
676 | linux_set_algebra_wrapper("union",STORM_get_union_count_func(ranges[i]), reps[i], ranges[i], ranges[i], ranges[i], true);
677 | linux_set_algebra_wrapper("diff-naive",&diff_scalar_naive_nosimd, reps[i], ranges[i], ranges[i], ranges[i], true);
678 | linux_set_algebra_wrapper("diff",STORM_get_diff_count_func(ranges[i]), reps[i], ranges[i], ranges[i], ranges[i], true);
679 | #else
680 | std::cerr << "perf counter are only available on Linux systems!" << std::endl;
681 | exit(EXIT_FAILURE);
682 | #endif
683 | } else {
684 | popcount_wrapper("popcount-naive",&popcount_scalar_naive_nosimd, reps[i], ranges[i], ranges[i], ranges[i], unit_intsec);
685 | popcount_wrapper("popcount",&STORM_popcnt, reps[i], ranges[i], ranges[i], ranges[i], unit_intsec);
686 | set_algebra_wrapper("intersect-naive",&intersect_scalar_naive_nosimd, reps[i], ranges[i], ranges[i], ranges[i], unit_intsec);
687 | set_algebra_wrapper("intersect",STORM_get_intersect_count_func(ranges[i]), reps[i], ranges[i], ranges[i], ranges[i], unit_intsec);
688 | set_algebra_wrapper("union-naive",&union_scalar_naive_nosimd, reps[i], ranges[i], ranges[i], ranges[i], unit_intsec);
689 | set_algebra_wrapper("union",STORM_get_union_count_func(ranges[i]), reps[i], ranges[i], ranges[i], ranges[i], unit_union);
690 | set_algebra_wrapper("diff-naive",&diff_scalar_naive_nosimd, reps[i], ranges[i], ranges[i], ranges[i], unit_intsec);
691 | set_algebra_wrapper("diff",STORM_get_diff_count_func(ranges[i]), reps[i], ranges[i], ranges[i], ranges[i], unit_diff);
692 | }
693 | }
694 |
695 | // Clean up.
696 | STORM_aligned_free(bitmaps);
697 | STORM_aligned_free(bitmaps2);
698 |
699 | return 1;
700 | }
701 |
702 | int main(int argc, char **argv) {
703 | #if !defined(_MSC_VER)
704 | bool verbose = false;
705 | bool perf_subsystem = false;
706 | int c;
707 | int n_repetitions = -1;
708 |
709 | while ((c = getopt(argc, argv, "vpr:")) != -1) {
710 | switch (c) {
711 | case 'r':
712 | n_repetitions = atoi(optarg);
713 | break;
714 | case 'v':
715 | verbose = true;
716 | break;
717 | case 'p':
718 | perf_subsystem = true;
719 | break;
720 | default:
721 | abort();
722 | }
723 | }
724 |
725 | benchmark(n_repetitions, perf_subsystem);
726 | #else
727 | int n_repetitions = -1;
728 | if (argc > 2) {
729 | n_repetitions = std::atoi(argv[1]);
730 | }
731 | benchmark(n_repetitions, false);
732 | #endif
733 |
734 | return EXIT_SUCCESS;
735 | }
--------------------------------------------------------------------------------
/libalgebra.h:
--------------------------------------------------------------------------------
1 | // License for libalgebra.h
2 | /*
3 | * Copyright (c) 2019 Marcus D. R. Klarqvist
4 | * Author(s): Marcus D. R. Klarqvist
5 | *
6 | * Licensed under the Apache License, Version 2.0 (the "License");
7 | * you may not use this file except in compliance with the License.
8 | * You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing,
13 | * software distributed under the License is distributed on an
14 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15 | * KIND, either express or implied. See the License for the
16 | * specific language governing permissions and limitations
17 | * under the License.
18 | */
19 | // License for pospopcnt.h
20 | /*
21 | * Copyright (c) 2019
22 | * Author(s): Marcus D. R. Klarqvist, Wojciech Muła, and Daniel Lemire
23 | *
24 | * Licensed under the Apache License, Version 2.0 (the "License");
25 | * you may not use this file except in compliance with the License.
26 | * You may obtain a copy of the License at
27 | *
28 | * http://www.apache.org/licenses/LICENSE-2.0
29 | *
30 | * Unless required by applicable law or agreed to in writing,
31 | * software distributed under the License is distributed on an
32 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
33 | * KIND, either express or implied. See the License for the
34 | * specific language governing permissions and limitations
35 | * under the License.
36 | */
37 | // License for libpopcnt.h
38 | /*
39 | * libpopcnt.h - C/C++ library for counting the number of 1 bits (bit
40 | * population count) in an array as quickly as possible using
41 | * specialized CPU instructions i.e. POPCNT, AVX2, AVX512, NEON.
42 | *
43 | * Copyright (c) 2016 - 2018, Kim Walisch
44 | * Copyright (c) 2016 - 2018, Wojciech Muła
45 | *
46 | * All rights reserved.
47 | *
48 | * Redistribution and use in source and binary forms, with or without
49 | * modification, are permitted provided that the following conditions are met:
50 | *
51 | * 1. Redistributions of source code must retain the above copyright notice, this
52 | * list of conditions and the following disclaimer.
53 | * 2. Redistributions in binary form must reproduce the above copyright notice,
54 | * this list of conditions and the following disclaimer in the documentation
55 | * and/or other materials provided with the distribution.
56 | *
57 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
58 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
59 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
60 | * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
61 | * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
62 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
63 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
64 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
65 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
66 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
67 | */
68 | #ifndef LIBALGEBRA_H_8723467365934
69 | #define LIBALGEBRA_H_8723467365934
70 |
71 | /* *************************************
72 | * Includes
73 | ***************************************/
74 | #include
75 | #include
76 | #include
77 | #include
78 | #include
79 |
80 | /* *************************************
81 | * Safety
82 | ***************************************/
83 |
84 | #if !(defined(__APPLE__)) && !(defined(__FreeBSD__))
85 | #include // this should never be needed but there are some reports that it is needed.
86 | #endif
87 |
88 | #if defined(__SIZEOF_LONG_LONG__) && __SIZEOF_LONG_LONG__ != 8
89 | #error This code assumes 64-bit long longs (by use of the GCC intrinsics). Your system is not currently supported.
90 | #endif
91 |
92 | /****************************
93 | * Memory management
94 | *
95 | * The subroutines aligned_malloc and aligned_free had to be renamed to
96 | * STORM_aligned_malloc and STORM_aligned_free to prevent clashing with the
97 | * same subroutines in Roaring. These subroutines are included here
98 | * since there is no hard dependency on using Roaring bitmaps.
99 | *
100 | * These subroutines and definitions are taken from the CRoaring repo
101 | * by Daniel Lemire et al. available under the Apache 2.0 License
102 | * (same as libalgebra.h):
103 | * https://github.com/RoaringBitmap/CRoaring/
104 | ****************************/
105 | // portable version of posix_memalign
106 | #ifndef _MSC_VER
107 | #include
108 | #endif
109 |
#ifndef STORM_aligned_malloc
// Allocate `size` bytes aligned to `alignment`, dispatching to the
// platform's aligned allocator (MSVC, MinGW, or posix_memalign elsewhere).
// posix_memalign requires `alignment` to be a power of two and a multiple
// of sizeof(void*). Returns NULL on failure; release the result with
// STORM_aligned_free — never plain free() on MSVC/MinGW.
static
void* STORM_aligned_malloc(size_t alignment, size_t size) {
    void *p;
#ifdef _MSC_VER
    p = _aligned_malloc(size, alignment);
#elif defined(__MINGW32__) || defined(__MINGW64__)
    p = __mingw_aligned_malloc(size, alignment);
#else
    // somehow, if this is used before including "x86intrin.h", it creates an
    // implicit defined warning.
    if (posix_memalign(&p, alignment, size) != 0)
        return NULL;
#endif
    return p;
}
#endif
127 |
#ifndef STORM_aligned_free
// Release memory obtained from STORM_aligned_malloc, using the matching
// deallocator for whichever allocator the platform branch above selected.
// Passing memory from any other allocator is undefined on MSVC/MinGW.
static
void STORM_aligned_free(void* memblock) {
#ifdef _MSC_VER
    _aligned_free(memblock);
#elif defined(__MINGW32__) || defined(__MINGW64__)
    __mingw_aligned_free(memblock);
#else
    free(memblock);
#endif
}
#endif
140 |
141 | // portable alignment
142 | #if defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) /* C11+ */
143 | # include
144 | # define STORM_ALIGN(n) alignas(n)
145 | #elif defined(__GNUC__)
146 | # define STORM_ALIGN(n) __attribute__ ((aligned(n)))
147 | #elif defined(_MSC_VER)
148 | # define STORM_ALIGN(n) __declspec(align(n))
149 | #else
150 | # define STORM_ALIGN(n) /* disabled */
151 | #endif
152 |
153 | /* *************************************
154 | * Compiler Specific Options
155 | ***************************************/
156 | // Taken from XXHASH
157 | #ifdef _MSC_VER /* Visual Studio */
158 | # pragma warning(disable : 4127) /* disable: C4127: conditional expression is constant */
159 | # define STORM_FORCE_INLINE static __forceinline
160 | # define STORM_NO_INLINE static __declspec(noinline)
161 | #else
162 | # if defined (__cplusplus) || defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L /* C99 */
163 | # ifdef __GNUC__
164 | # define STORM_FORCE_INLINE static inline __attribute__((always_inline))
165 | # define STORM_NO_INLINE static __attribute__((noinline))
166 | # else
167 | # define STORM_FORCE_INLINE static inline
168 | # define STORM_NO_INLINE static
169 | # endif
170 | # else
171 | # define STORM_FORCE_INLINE static
172 | # define STORM_NO_INLINE static
173 | # endif /* __STDC_VERSION__ */
174 | #endif
175 |
176 | /****************************
177 | * General checks
178 | ****************************/
179 |
180 | #ifndef __has_builtin
181 | #define STORM_HAS_BUILTIN(x) 0
182 | #else
183 | #define STORM_HAS_BUILTIN(x) __has_builtin(x)
184 | #endif
185 |
186 | #ifndef __has_attribute
187 | #define STORM_HAS_ATTRIBUTE(x) 0
188 | #else
189 | #define STORM_HAS_ATTRIBUTE(x) __has_attribute(x)
190 | #endif
191 |
192 | // disable noise
193 | #ifdef __GNUC__
194 | #define STORM_WARN_UNUSED __attribute__((warn_unused_result))
195 | #else
196 | #define STORM_WARN_UNUSED
197 | #endif
198 |
199 | #if defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L /* >= C99 */
200 | # define STORM_RESTRICT restrict
201 | #else
202 | /* note : it might be useful to define __restrict or STORM_RESTRICT for some C++ compilers */
203 | # define STORM_RESTRICT /* disable */
204 | #endif
205 |
206 | #ifdef __GNUC__
207 | #define GNUC_PREREQ(x, y) \
208 | (__GNUC__ > x || (__GNUC__ == x && __GNUC_MINOR__ >= y))
209 | #else
210 | #define GNUC_PREREQ(x, y) 0
211 | #endif
212 |
213 | #ifdef __clang__
214 | #define CLANG_PREREQ(x, y) \
215 | (__clang_major__ > x || (__clang_major__ == x && __clang_minor__ >= y))
216 | #else
217 | #define CLANG_PREREQ(x, y) 0
218 | #endif
219 |
220 | #if (defined(__i386__) || \
221 | defined(__x86_64__) || \
222 | defined(_M_IX86) || \
223 | defined(_M_X64))
224 | #define X86_OR_X64
225 | #endif
226 |
227 | #if defined(X86_OR_X64) && \
228 | (defined(__cplusplus) || \
229 | defined(_MSC_VER) || \
230 | (GNUC_PREREQ(4, 2) || \
231 | STORM_HAS_BUILTIN(__sync_val_compare_and_swap)))
232 | #define STORM_HAVE_CPUID
233 | #endif
234 |
235 | #if GNUC_PREREQ(4, 2) || \
236 | STORM_HAS_BUILTIN(__builtin_popcount)
237 | #define STORM_HAVE_BUILTIN_POPCOUNT
238 | #endif
239 |
240 | #if GNUC_PREREQ(4, 2) || \
241 | CLANG_PREREQ(3, 0)
242 | #define STORM_HAVE_ASM_POPCNT
243 | #endif
244 |
245 | #if defined(STORM_HAVE_CPUID) && \
246 | (defined(STORM_HAVE_ASM_POPCNT) || \
247 | defined(_MSC_VER))
248 | #define STORM_HAVE_POPCNT
249 | #endif
250 |
251 | #if defined(STORM_HAVE_CPUID) && \
252 | GNUC_PREREQ(4, 9)
253 | #define STORM_HAVE_SSE42
254 | #define STORM_HAVE_AVX2
255 | #endif
256 |
257 | #if defined(STORM_HAVE_CPUID) && \
258 | GNUC_PREREQ(5, 0)
259 | #define STORM_HAVE_AVX512
260 | #endif
261 |
262 | #if defined(STORM_HAVE_CPUID) && \
263 | defined(_MSC_VER) && \
264 | defined(__AVX2__)
265 | #define STORM_HAVE_SSE42
266 | #define STORM_HAVE_AVX2
267 | #endif
268 |
269 | #if defined(STORM_HAVE_CPUID) && \
270 | defined(_MSC_VER) && \
271 | defined(__AVX512__)
272 | #define STORM_HAVE_AVX512
273 | #endif
274 |
275 | #if defined(STORM_HAVE_CPUID) && \
276 | CLANG_PREREQ(3, 8) && \
277 | STORM_HAS_ATTRIBUTE(target) && \
278 | (!defined(_MSC_VER) || defined(__AVX2__)) && \
279 | (!defined(__apple_build_version__) || __apple_build_version__ >= 8000000)
280 | #define STORM_HAVE_SSE42
281 | #define STORM_HAVE_AVX2
282 | #define STORM_HAVE_AVX512
283 | #endif
284 |
285 | // Target attribute
286 | #if !defined(_MSC_VER)
287 | #define STORM_TARGET(x) __attribute__ ((target (x)))
288 | #else
289 | #define STORM_TARGET(x) 0
290 | #endif
291 |
292 |
293 | /****************************
294 | * CPUID and SIMD
295 | ****************************/
296 |
297 | #define STORM_SSE_ALIGNMENT 16
298 | #define STORM_AVX2_ALIGNMENT 32
299 | #define STORM_AVX512_ALIGNMENT 64
300 |
301 | #ifdef __cplusplus
302 | extern "C" {
303 | #endif
304 |
305 | #if defined(STORM_HAVE_CPUID)
306 |
307 | #if defined(_MSC_VER)
308 | #include
309 | #include
310 | #endif
311 |
// CPUID flags. See https://en.wikipedia.org/wiki/CPUID for more info.
/* %ecx bit flags */
#define STORM_CPUID_runtime_bit_POPCNT (1 << 23) // POPCNT instruction
#define STORM_CPUID_runtime_bit_SSE41 (1 << 19) // CPUID.01H:ECX.SSE41[Bit 19]
#define STORM_CPUID_runtime_bit_SSE42 (1 << 20) // CPUID.01H:ECX.SSE42[Bit 20] (comment previously said SSE41)

/* %ebx bit flags */
#define STORM_CPUID_runtime_bit_AVX2 (1 << 5) // CPUID.(EAX=07H, ECX=0H):EBX.AVX2[bit 5]
#define STORM_CPUID_runtime_bit_AVX512BW (1 << 30) // AVX-512 Byte and Word Instructions

/* xgetbv bit flags (XCR0 state-component masks) */
#define STORM_XSTATE_SSE (1 << 1) // XMM register state managed by the OS
#define STORM_XSTATE_YMM (1 << 2) // YMM register state managed by the OS
#define STORM_XSTATE_ZMM (7 << 5) // opmask + upper-ZMM + high-16-ZMM state
326 |
// Execute the CPUID instruction for leaf `eax` / subleaf `ecx` and store the
// resulting EAX, EBX, ECX, EDX values into abcd[0..3].
static
void STORM_run_cpuid(int eax, int ecx, int* abcd) {
#if defined(_MSC_VER)
    __cpuidex(abcd, eax, ecx);
#else
    int ebx = 0;
    int edx = 0;

#if defined(__i386__) && \
    defined(__PIC__)
    /* in case of PIC under 32-bit EBX cannot be clobbered */
    // (EBX holds the GOT pointer, so it is saved to EDI around CPUID.)
    __asm__ ("movl %%ebx, %%edi;"
             "cpuid;"
             "xchgl %%ebx, %%edi;"
             : "=D" (ebx),
               "+a" (eax),
               "+c" (ecx),
               "=d" (edx));
#else
    __asm__ ("cpuid;"
             : "+b" (ebx),
               "+a" (eax),
               "+c" (ecx),
               "=d" (edx));
#endif

    // Copy out in register order: EAX, EBX, ECX, EDX.
    abcd[0] = eax;
    abcd[1] = ebx;
    abcd[2] = ecx;
    abcd[3] = edx;
#endif
}
359 |
#if defined(STORM_HAVE_AVX2) || \
    defined(STORM_HAVE_AVX512)

// Read XCR0 (extended control register 0) via XGETBV to learn which
// register states (SSE/YMM/ZMM) the OS saves and restores. Only needed
// when AVX2/AVX-512 paths may be selected at runtime.
static
int STORM_get_xcr0() {
    int xcr0;

#if defined(_MSC_VER)
    xcr0 = (int) _xgetbv(0);
#else
    // XGETBV with ECX=0 returns XCR0 in EDX:EAX; we only need EAX.
    __asm__ ("xgetbv" : "=a" (xcr0) : "c" (0) : "%edx" );
#endif

    return xcr0;
}

#endif
377 |
// Detect the CPU (and OS) supported instruction-set features at runtime.
// Returns a bitmask composed of the STORM_CPUID_runtime_bit_* flags.
// Returns 0 early if the OS does not support extended state management
// (OSXSAVE), since AVX registers would then not be preserved.
static
int STORM_get_cpuid() {
    int flags = 0;
    int abcd[4];

    // Leaf 1: feature bits in ECX (abcd[2]).
    STORM_run_cpuid(1, 0, abcd);

    // Check for POPCNT instruction
    if ((abcd[2] & STORM_CPUID_runtime_bit_POPCNT) == STORM_CPUID_runtime_bit_POPCNT)
        flags |= STORM_CPUID_runtime_bit_POPCNT;

    // Check for SSE4.1 instruction set
    if ((abcd[2] & STORM_CPUID_runtime_bit_SSE41) == STORM_CPUID_runtime_bit_SSE41)
        flags |= STORM_CPUID_runtime_bit_SSE41;

    // Check for SSE4.2 instruction set
    if ((abcd[2] & STORM_CPUID_runtime_bit_SSE42) == STORM_CPUID_runtime_bit_SSE42)
        flags |= STORM_CPUID_runtime_bit_SSE42;

#if defined(STORM_HAVE_AVX2) || \
    defined(STORM_HAVE_AVX512)

    // CPUID.01H:ECX.OSXSAVE[bit 27] — OS has enabled XGETBV.
    int osxsave_mask = (1 << 27);

    /* ensure OS supports extended processor state management */
    if ((abcd[2] & osxsave_mask) != osxsave_mask)
        return 0;

    // XCR0 masks: YMM requires SSE+YMM state; ZMM additionally requires
    // the opmask/ZMM-hi/hi16-ZMM state bits.
    int ymm_mask = STORM_XSTATE_SSE | STORM_XSTATE_YMM;
    int zmm_mask = STORM_XSTATE_SSE | STORM_XSTATE_YMM | STORM_XSTATE_ZMM;

    int xcr0 = STORM_get_xcr0();

    if ((xcr0 & ymm_mask) == ymm_mask) {
        // Leaf 7 sub-leaf 0: extended feature bits in EBX (abcd[1]).
        STORM_run_cpuid(7, 0, abcd);

        if ((abcd[1] & STORM_CPUID_runtime_bit_AVX2) == STORM_CPUID_runtime_bit_AVX2)
            flags |= STORM_CPUID_runtime_bit_AVX2;

        if ((xcr0 & zmm_mask) == zmm_mask) {
            if ((abcd[1] & STORM_CPUID_runtime_bit_AVX512BW) == STORM_CPUID_runtime_bit_AVX512BW)
                flags |= STORM_CPUID_runtime_bit_AVX512BW;
        }
    }

#endif

    return flags;
}
427 | #endif // defined(STORM_HAVE_CPUID)
428 |
429 | /// Taken from libpopcnt.h
430 | #if defined(STORM_HAVE_ASM_POPCNT) && \
431 | defined(__x86_64__)
432 |
433 | STORM_FORCE_INLINE
434 | uint64_t STORM_POPCOUNT(uint64_t x)
435 | {
436 | __asm__ ("popcnt %1, %0" : "=r" (x) : "0" (x));
437 | return x;
438 | }
439 |
440 | #elif defined(STORM_HAVE_ASM_POPCNT) && \
441 | defined(__i386__)
442 |
443 | STORM_FORCE_INLINE
444 | uint32_t STORM_popcnt32(uint32_t x)
445 | {
446 | __asm__ ("popcnt %1, %0" : "=r" (x) : "0" (x));
447 | return x;
448 | }
449 |
450 | STORM_FORCE_INLINE
451 | uint64_t STORM_POPCOUNT(uint64_t x)
452 | {
453 | return STORM_popcnt32((uint32_t) x) +
454 | STORM_popcnt32((uint32_t)(x >> 32));
455 | }
456 |
457 | #elif defined(_MSC_VER) && \
458 | defined(_M_X64)
459 |
460 | #include
461 |
462 | STORM_FORCE_INLINE
463 | uint64_t STORM_POPCOUNT(uint64_t x) {
464 | return _mm_popcnt_u64(x);
465 | }
466 |
467 | #elif defined(_MSC_VER) && \
468 | defined(_M_IX86)
469 |
470 | #include
471 |
472 | STORM_FORCE_INLINE
473 | uint64_t STORM_POPCOUNT(uint64_t x)
474 | {
475 | return _mm_popcnt_u32((uint32_t) x) +
476 | _mm_popcnt_u32((uint32_t)(x >> 32));
477 | }
478 |
479 | /* non x86 CPUs */
480 | #elif defined(STORM_HAVE_BUILTIN_POPCOUNT)
481 |
482 | STORM_FORCE_INLINE
483 | uint64_t STORM_POPCOUNT(uint64_t x) {
484 | return __builtin_popcountll(x);
485 | }
486 |
487 | /* no hardware POPCNT,
488 | * use pure integer algorithm */
489 | #else
490 |
491 | STORM_FORCE_INLINE
492 | uint64_t STORM_POPCOUNT(uint64_t x) {
493 | return STORM_popcount64(x);
494 | }
495 |
496 | #endif
497 |
498 |
// Popcount of the pairwise AND of two uint64_t buffers (intersection
// cardinality). Processes four words per iteration, then a scalar tail.
static
uint64_t STORM_intersect_count_unrolled(const uint64_t* STORM_RESTRICT data1,
                                        const uint64_t* STORM_RESTRICT data2,
                                        size_t size)
{
    const uint64_t unrolled_end = size - size % 4;
    uint64_t total = 0;
    uint64_t idx   = 0;

    while (idx < unrolled_end) {
        total += STORM_POPCOUNT(data1[idx    ] & data2[idx    ]);
        total += STORM_POPCOUNT(data1[idx + 1] & data2[idx + 1]);
        total += STORM_POPCOUNT(data1[idx + 2] & data2[idx + 2]);
        total += STORM_POPCOUNT(data1[idx + 3] & data2[idx + 3]);
        idx += 4;
    }

    while (idx < size) {
        total += STORM_POPCOUNT(data1[idx] & data2[idx]);
        ++idx;
    }

    return total;
}
520 |
// Popcount of the pairwise OR of two uint64_t buffers (union
// cardinality). Processes four words per iteration, then a scalar tail.
static
uint64_t STORM_union_count_unrolled(const uint64_t* STORM_RESTRICT data1,
                                    const uint64_t* STORM_RESTRICT data2,
                                    size_t size)
{
    const uint64_t unrolled_end = size - size % 4;
    uint64_t total = 0;
    uint64_t idx   = 0;

    while (idx < unrolled_end) {
        total += STORM_POPCOUNT(data1[idx    ] | data2[idx    ]);
        total += STORM_POPCOUNT(data1[idx + 1] | data2[idx + 1]);
        total += STORM_POPCOUNT(data1[idx + 2] | data2[idx + 2]);
        total += STORM_POPCOUNT(data1[idx + 3] | data2[idx + 3]);
        idx += 4;
    }

    while (idx < size) {
        total += STORM_POPCOUNT(data1[idx] | data2[idx]);
        ++idx;
    }

    return total;
}
542 |
// Popcount of the pairwise XOR of two uint64_t buffers (symmetric
// difference cardinality). Four words per iteration, then a scalar tail.
static
uint64_t STORM_diff_count_unrolled(const uint64_t* STORM_RESTRICT data1,
                                   const uint64_t* STORM_RESTRICT data2,
                                   size_t size)
{
    const uint64_t unrolled_end = size - size % 4;
    uint64_t total = 0;
    uint64_t idx   = 0;

    while (idx < unrolled_end) {
        total += STORM_POPCOUNT(data1[idx    ] ^ data2[idx    ]);
        total += STORM_POPCOUNT(data1[idx + 1] ^ data2[idx + 1]);
        total += STORM_POPCOUNT(data1[idx + 2] ^ data2[idx + 2]);
        total += STORM_POPCOUNT(data1[idx + 3] ^ data2[idx + 3]);
        idx += 4;
    }

    while (idx < size) {
        total += STORM_POPCOUNT(data1[idx] ^ data2[idx]);
        ++idx;
    }

    return total;
}
564 |
// Positional popcount, scalar reference implementation: for each bit
// position j in [0,16), adds to out[j] the number of input words in
// data[0..len) that have bit j set. Returns 0 (kept for API symmetry
// with the SIMD variants).
static
int STORM_pospopcnt_u16_scalar_naive(const uint16_t* data, size_t len, uint32_t* out) {
    // size_t index: the previous `int` counter mixed signed/unsigned
    // comparison with `len` and would overflow for len > INT_MAX.
    for (size_t i = 0; i < len; ++i) {
        for (int j = 0; j < 16; ++j) {
            out[j] += ((data[i] & (1 << j)) >> j);
        }
    }

    return 0;
}
575 |
576 | #ifndef _MSC_VER
577 |
// Full 64x64 -> 128-bit unsigned multiply (GCC/Clang __int128 path).
// Returns the low 64 bits of the product; the high 64 bits go to *hi.
STORM_FORCE_INLINE
uint64_t STORM_pospopcnt_umul128(uint64_t a, uint64_t b, uint64_t* hi) {
    const unsigned __int128 product = (unsigned __int128)a * (unsigned __int128)b;
    *hi = (uint64_t)(product >> 64);
    return (uint64_t)product;
}
584 |
// Unaligned 64-bit load. memcpy sidesteps strict-aliasing and alignment
// UB and typically lowers to a plain load instruction.
STORM_FORCE_INLINE
uint64_t STORM_pospopcnt_loadu_u64(const void* ptr) {
    uint64_t value;
    memcpy(&value, ptr, sizeof(value));
    return value;
}
591 |
// By @aqrit (https://github.com/aqrit)
// @see: https://gist.github.com/aqrit/c729815b0165c139d0bac642ab7ee104
// Positional popcount using 128-bit multiplies: packs per-position bit
// sums into four 64-bit accumulators of 12-bit counters, draining them
// before they can overflow. Tail (< 8 words) handled by a multiply-based
// bit-spread trick.
static
int STORM_pospopcnt_u16_scalar_umul128_unroll2(const uint16_t* in, size_t n, uint32_t* out) {
    while (n >= 8) {
        uint64_t counter_a = 0; // 4 packed 12-bit counters
        uint64_t counter_b = 0;
        uint64_t counter_c = 0;
        uint64_t counter_d = 0;

        // end before overflowing the counters
        // (at most 0x0FFF words per outer pass, rounded down to a multiple of 8)
        uint32_t len = ((n < 0x0FFF) ? n : 0x0FFF) & ~7;
        n -= len;
        for (const uint16_t* end = &in[len]; in != end; in += 8) {
            // mask_a/b/c select every 4th bit at offsets 0/1/2.
            const uint64_t mask_a = UINT64_C(0x1111111111111111);
            const uint64_t mask_b = mask_a + mask_a;
            const uint64_t mask_c = mask_b + mask_b;
            const uint64_t mask_0001 = UINT64_C(0x0001000100010001);
            const uint64_t mask_cnts = UINT64_C(0x000000F00F00F00F);

            uint64_t v0 = STORM_pospopcnt_loadu_u64(&in[0]);
            uint64_t v1 = STORM_pospopcnt_loadu_u64(&in[4]);

            // Partial sums per bit-position class (positions j, j+4, j+8, j+12).
            uint64_t a = (v0 & mask_a) + (v1 & mask_a);
            uint64_t b = ((v0 & mask_b) + (v1 & mask_b)) >> 1;
            uint64_t c = ((v0 & mask_c) + (v1 & mask_c)) >> 2;
            uint64_t d = ((v0 >> 3) & mask_a) + ((v1 >> 3) & mask_a);

            // Multiply by 0x0001000100010001 to horizontally sum the four
            // 16-bit lanes; low + high halves give the broadcast sums.
            uint64_t hi;
            a = STORM_pospopcnt_umul128(a, mask_0001, &hi);
            a += hi; // broadcast 4-bit counts
            b = STORM_pospopcnt_umul128(b, mask_0001, &hi);
            b += hi;
            c = STORM_pospopcnt_umul128(c, mask_0001, &hi);
            c += hi;
            d = STORM_pospopcnt_umul128(d, mask_0001, &hi);
            d += hi;

            // Accumulate the packed per-position counts.
            counter_a += a & mask_cnts;
            counter_b += b & mask_cnts;
            counter_c += c & mask_cnts;
            counter_d += d & mask_cnts;
        }

        // Unpack the four 12-bit fields of each accumulator into out[0..15].
        out[0]  += counter_a & 0x0FFF;
        out[1]  += counter_b & 0x0FFF;
        out[2]  += counter_c & 0x0FFF;
        out[3]  += counter_d & 0x0FFF;
        out[4]  += (counter_a >> 36);
        out[5]  += (counter_b >> 36);
        out[6]  += (counter_c >> 36);
        out[7]  += (counter_d >> 36);
        out[8]  += (counter_a >> 24) & 0x0FFF;
        out[9]  += (counter_b >> 24) & 0x0FFF;
        out[10] += (counter_c >> 24) & 0x0FFF;
        out[11] += (counter_d >> 24) & 0x0FFF;
        out[12] += (counter_a >> 12) & 0x0FFF;
        out[13] += (counter_b >> 12) & 0x0FFF;
        out[14] += (counter_c >> 12) & 0x0FFF;
        out[15] += (counter_d >> 12) & 0x0FFF;
    }

    // assert(n < 8)
    if (n != 0) {
        uint64_t tail_counter_a = 0;
        uint64_t tail_counter_b = 0;
        do { // zero-extend a bit to 8-bits (emulate pdep) then accumulate
            const uint64_t mask_01 = UINT64_C(0x0101010101010101);
            const uint64_t magic   = UINT64_C(0x0000040010004001); // 1+(1<<14)+(1<<28)+(1<<42)
            uint64_t x = *in++;
            // Even bit positions spread into bytes of tail_counter_a,
            // odd positions into tail_counter_b.
            tail_counter_a += ((x & 0x5555) * magic) & mask_01; // 0101010101010101
            tail_counter_b += (((x >> 1) & 0x5555) * magic) & mask_01;
        } while (--n);

        out[0]  += tail_counter_a & 0xFF;
        out[8]  += (tail_counter_a >>  8) & 0xFF;
        out[2]  += (tail_counter_a >> 16) & 0xFF;
        out[10] += (tail_counter_a >> 24) & 0xFF;
        out[4]  += (tail_counter_a >> 32) & 0xFF;
        out[12] += (tail_counter_a >> 40) & 0xFF;
        out[6]  += (tail_counter_a >> 48) & 0xFF;
        out[14] += (tail_counter_a >> 56) & 0xFF;
        out[1]  += tail_counter_b & 0xFF;
        out[9]  += (tail_counter_b >>  8) & 0xFF;
        out[3]  += (tail_counter_b >> 16) & 0xFF;
        out[11] += (tail_counter_b >> 24) & 0xFF;
        out[5]  += (tail_counter_b >> 32) & 0xFF;
        out[13] += (tail_counter_b >> 40) & 0xFF;
        out[7]  += (tail_counter_b >> 48) & 0xFF;
        out[15] += (tail_counter_b >> 56) & 0xFF;
    }

    return 0;
}
686 | #endif
687 |
688 | /*
689 | * This uses fewer arithmetic operations than any other known
690 | * implementation on machines with fast multiplication.
691 | * It uses 12 arithmetic operations, one of which is a multiply.
692 | * http://en.wikipedia.org/wiki/Hamming_weight#Efficient_implementation
693 | */
// Portable 64-bit popcount fallback (Hamming-weight "SWAR" algorithm):
// fold bit counts pairwise into 2-, 4- and 8-bit fields, then use one
// multiply to sum all bytes into the top byte.
STORM_FORCE_INLINE
uint64_t STORM_popcount64(uint64_t x)
{
    const uint64_t two_bit_mask  = UINT64_C(0x5555555555555555);
    const uint64_t four_bit_mask = UINT64_C(0x3333333333333333);
    const uint64_t byte_mask     = UINT64_C(0x0F0F0F0F0F0F0F0F);
    const uint64_t byte_ones     = UINT64_C(0x0101010101010101);

    x = x - ((x >> 1) & two_bit_mask);                     // 2-bit partial sums
    x = (x & four_bit_mask) + ((x >> 2) & four_bit_mask);  // 4-bit partial sums
    x = (x + (x >> 4)) & byte_mask;                        // per-byte sums

    return (x * byte_ones) >> 56;                          // total in top byte
}
708 |
709 |
// Per-byte popcount lookup table: STORM_popcnt_lookup8bit[v] is the
// number of set bits in the byte value v (table-driven fallback path).
static
const uint8_t STORM_popcnt_lookup8bit[256] = {
	/* 0 */ 0, /* 1 */ 1, /* 2 */ 1, /* 3 */ 2,
	/* 4 */ 1, /* 5 */ 2, /* 6 */ 2, /* 7 */ 3,
	/* 8 */ 1, /* 9 */ 2, /* a */ 2, /* b */ 3,
	/* c */ 2, /* d */ 3, /* e */ 3, /* f */ 4,
	/* 10 */ 1, /* 11 */ 2, /* 12 */ 2, /* 13 */ 3,
	/* 14 */ 2, /* 15 */ 3, /* 16 */ 3, /* 17 */ 4,
	/* 18 */ 2, /* 19 */ 3, /* 1a */ 3, /* 1b */ 4,
	/* 1c */ 3, /* 1d */ 4, /* 1e */ 4, /* 1f */ 5,
	/* 20 */ 1, /* 21 */ 2, /* 22 */ 2, /* 23 */ 3,
	/* 24 */ 2, /* 25 */ 3, /* 26 */ 3, /* 27 */ 4,
	/* 28 */ 2, /* 29 */ 3, /* 2a */ 3, /* 2b */ 4,
	/* 2c */ 3, /* 2d */ 4, /* 2e */ 4, /* 2f */ 5,
	/* 30 */ 2, /* 31 */ 3, /* 32 */ 3, /* 33 */ 4,
	/* 34 */ 3, /* 35 */ 4, /* 36 */ 4, /* 37 */ 5,
	/* 38 */ 3, /* 39 */ 4, /* 3a */ 4, /* 3b */ 5,
	/* 3c */ 4, /* 3d */ 5, /* 3e */ 5, /* 3f */ 6,
	/* 40 */ 1, /* 41 */ 2, /* 42 */ 2, /* 43 */ 3,
	/* 44 */ 2, /* 45 */ 3, /* 46 */ 3, /* 47 */ 4,
	/* 48 */ 2, /* 49 */ 3, /* 4a */ 3, /* 4b */ 4,
	/* 4c */ 3, /* 4d */ 4, /* 4e */ 4, /* 4f */ 5,
	/* 50 */ 2, /* 51 */ 3, /* 52 */ 3, /* 53 */ 4,
	/* 54 */ 3, /* 55 */ 4, /* 56 */ 4, /* 57 */ 5,
	/* 58 */ 3, /* 59 */ 4, /* 5a */ 4, /* 5b */ 5,
	/* 5c */ 4, /* 5d */ 5, /* 5e */ 5, /* 5f */ 6,
	/* 60 */ 2, /* 61 */ 3, /* 62 */ 3, /* 63 */ 4,
	/* 64 */ 3, /* 65 */ 4, /* 66 */ 4, /* 67 */ 5,
	/* 68 */ 3, /* 69 */ 4, /* 6a */ 4, /* 6b */ 5,
	/* 6c */ 4, /* 6d */ 5, /* 6e */ 5, /* 6f */ 6,
	/* 70 */ 3, /* 71 */ 4, /* 72 */ 4, /* 73 */ 5,
	/* 74 */ 4, /* 75 */ 5, /* 76 */ 5, /* 77 */ 6,
	/* 78 */ 4, /* 79 */ 5, /* 7a */ 5, /* 7b */ 6,
	/* 7c */ 5, /* 7d */ 6, /* 7e */ 6, /* 7f */ 7,
	/* 80 */ 1, /* 81 */ 2, /* 82 */ 2, /* 83 */ 3,
	/* 84 */ 2, /* 85 */ 3, /* 86 */ 3, /* 87 */ 4,
	/* 88 */ 2, /* 89 */ 3, /* 8a */ 3, /* 8b */ 4,
	/* 8c */ 3, /* 8d */ 4, /* 8e */ 4, /* 8f */ 5,
	/* 90 */ 2, /* 91 */ 3, /* 92 */ 3, /* 93 */ 4,
	/* 94 */ 3, /* 95 */ 4, /* 96 */ 4, /* 97 */ 5,
	/* 98 */ 3, /* 99 */ 4, /* 9a */ 4, /* 9b */ 5,
	/* 9c */ 4, /* 9d */ 5, /* 9e */ 5, /* 9f */ 6,
	/* a0 */ 2, /* a1 */ 3, /* a2 */ 3, /* a3 */ 4,
	/* a4 */ 3, /* a5 */ 4, /* a6 */ 4, /* a7 */ 5,
	/* a8 */ 3, /* a9 */ 4, /* aa */ 4, /* ab */ 5,
	/* ac */ 4, /* ad */ 5, /* ae */ 5, /* af */ 6,
	/* b0 */ 3, /* b1 */ 4, /* b2 */ 4, /* b3 */ 5,
	/* b4 */ 4, /* b5 */ 5, /* b6 */ 5, /* b7 */ 6,
	/* b8 */ 4, /* b9 */ 5, /* ba */ 5, /* bb */ 6,
	/* bc */ 5, /* bd */ 6, /* be */ 6, /* bf */ 7,
	/* c0 */ 2, /* c1 */ 3, /* c2 */ 3, /* c3 */ 4,
	/* c4 */ 3, /* c5 */ 4, /* c6 */ 4, /* c7 */ 5,
	/* c8 */ 3, /* c9 */ 4, /* ca */ 4, /* cb */ 5,
	/* cc */ 4, /* cd */ 5, /* ce */ 5, /* cf */ 6,
	/* d0 */ 3, /* d1 */ 4, /* d2 */ 4, /* d3 */ 5,
	/* d4 */ 4, /* d5 */ 5, /* d6 */ 5, /* d7 */ 6,
	/* d8 */ 4, /* d9 */ 5, /* da */ 5, /* db */ 6,
	/* dc */ 5, /* dd */ 6, /* de */ 6, /* df */ 7,
	/* e0 */ 3, /* e1 */ 4, /* e2 */ 4, /* e3 */ 5,
	/* e4 */ 4, /* e5 */ 5, /* e6 */ 5, /* e7 */ 6,
	/* e8 */ 4, /* e9 */ 5, /* ea */ 5, /* eb */ 6,
	/* ec */ 5, /* ed */ 6, /* ee */ 6, /* ef */ 7,
	/* f0 */ 4, /* f1 */ 5, /* f2 */ 5, /* f3 */ 6,
	/* f4 */ 5, /* f5 */ 6, /* f6 */ 6, /* f7 */ 7,
	/* f8 */ 5, /* f9 */ 6, /* fa */ 6, /* fb */ 7,
	/* fc */ 6, /* fd */ 7, /* fe */ 7, /* ff */ 8
};
777 |
778 | /****************************
779 | * SSE4.1 functions
780 | ****************************/
781 |
782 | #if defined(STORM_HAVE_SSE42)
783 |
#include <immintrin.h>  // header name restored; it was truncated
785 |
// Popcount of a 128-bit vector: extract and count each 64-bit half.
STORM_TARGET("sse4.2")
STORM_FORCE_INLINE
uint64_t STORM_POPCOUNT_SSE(const __m128i n) {
    const uint64_t lo = (uint64_t)_mm_cvtsi128_si64(n);
    const uint64_t hi = (uint64_t)_mm_cvtsi128_si64(_mm_unpackhi_epi64(n, n));
    return STORM_POPCOUNT(lo) + STORM_POPCOUNT(hi);
}
792 |
793 | STORM_TARGET("sse4.2")
794 | STORM_FORCE_INLINE
795 | void STORM_CSA128(__m128i* h, __m128i* l, __m128i a, __m128i b, __m128i c) {
796 | __m128i u = _mm_xor_si128(a, b);
797 | *h = _mm_or_si128(_mm_and_si128(a, b), _mm_and_si128(u, c));
798 | *l = _mm_xor_si128(u, c);
799 | }
800 |
801 | /**
802 | * Carry-save adder update step.
803 | * @see https://en.wikipedia.org/wiki/Carry-save_adder#Technical_details
804 | *
805 | * Steps:
806 | * 1) U = *L ⊕ B
807 | * 2) *H = (*L ^ B) | (U ^ C)
808 | * 3) *L = *L ⊕ B ⊕ C = U ⊕ C
809 | *
810 | * B and C are 16-bit staggered registers such that &C - &B = 1.
811 | *
812 | * Example usage:
813 | * pospopcnt_csa_sse(&twosA, &v1, _mm_loadu_si128(data + i + 0), _mm_loadu_si128(data + i + 1));
814 | *
815 | * @param h
816 | * @param l
817 | * @param b
818 | * @param c
819 | */
820 | STORM_TARGET("sse4.2")
821 | STORM_FORCE_INLINE
822 | void STORM_pospopcnt_csa_sse(__m128i* STORM_RESTRICT h,
823 | __m128i* STORM_RESTRICT l,
824 | const __m128i b,
825 | const __m128i c)
826 | {
827 | const __m128i u = _mm_xor_si128(*l, b);
828 | *h = _mm_or_si128(*l & b, u & c); // shift carry (sc_i).
829 | *l = _mm_xor_si128(u, c); // partial sum (ps).
830 | }
831 |
// By @aqrit (https://github.com/aqrit)
// @see: https://gist.github.com/aqrit/cb52b2ac5b7d0dfe9319c09d27237bf3
// Positional popcount via PSADBW: splits each 16-bit word into bytes,
// isolates two bit positions per byte with the 0x41/0x82 masks, and
// horizontally sums them with _mm_sad_epu8. Four vector accumulators
// hold the 16 per-position counters; a scalar multiply-based loop
// handles the tail (< 32 words).
STORM_TARGET("sse4.2")
static
int STORM_pospopcnt_u16_sse_sad(const uint16_t* data, size_t len, uint32_t* flag_counts) {
    const __m128i zero = _mm_setzero_si128();
    const __m128i mask_lo_byte = _mm_srli_epi16(_mm_cmpeq_epi8(zero, zero), 8);
    const __m128i mask_lo_cnt = _mm_srli_epi16(mask_lo_byte, 2);
    const __m128i mask_bits_a = _mm_set1_epi8(0x41); // 01000001
    const __m128i mask_bits_b = _mm_add_epi8(mask_bits_a, mask_bits_a);
    uint32_t buffer[16];

    __m128i counterA = zero;
    __m128i counterB = zero;
    __m128i counterC = zero;
    __m128i counterD = zero;

    // Main loop: 32 words (four vectors) per iteration.
    for (const uint16_t* end = &data[(len & ~31)]; data != end; data += 32) {
        __m128i r0 = _mm_loadu_si128((__m128i*)&data[0]);
        __m128i r1 = _mm_loadu_si128((__m128i*)&data[8]);
        __m128i r2 = _mm_loadu_si128((__m128i*)&data[16]);
        __m128i r3 = _mm_loadu_si128((__m128i*)&data[24]);
        __m128i r4, r5, r6, r7;

        // seperate LOBYTE and HIBYTE of each WORD
        // (emulate PSHUFB F,D,B,9,7,5,3,1, E,C,A,8,6,4,2,0)
        r4 = _mm_and_si128(mask_lo_byte, r0);
        r5 = _mm_and_si128(mask_lo_byte, r1);
        r6 = _mm_and_si128(mask_lo_byte, r2);
        r7 = _mm_and_si128(mask_lo_byte, r3);
        r0 = _mm_srli_epi16(r0, 8);
        r1 = _mm_srli_epi16(r1, 8);
        r2 = _mm_srli_epi16(r2, 8);
        r3 = _mm_srli_epi16(r3, 8);
        r0 = _mm_packus_epi16(r0, r4);
        r1 = _mm_packus_epi16(r1, r5);
        r2 = _mm_packus_epi16(r2, r6);
        r3 = _mm_packus_epi16(r3, r7);

        // isolate bits to count
        r4 = _mm_and_si128(mask_bits_a, r0);
        r5 = _mm_and_si128(mask_bits_a, r1);
        r6 = _mm_and_si128(mask_bits_a, r2);
        r7 = _mm_and_si128(mask_bits_a, r3);

        // horizontal sum of qwords
        r4 = _mm_sad_epu8(r4, zero);
        r5 = _mm_sad_epu8(r5, zero);
        r6 = _mm_sad_epu8(r6, zero);
        r7 = _mm_sad_epu8(r7, zero);

        // sum 6-bit counts
        r4 = _mm_add_epi16(r4,r5);
        r4 = _mm_add_epi16(r4,r6);
        r4 = _mm_add_epi16(r4,r7);

        // unpack 6-bit counts to 32-bits
        r5 = _mm_and_si128(mask_lo_cnt, r4);
        r4 = _mm_srli_epi16(r4, 6);
        r4 = _mm_packs_epi32(r4, r5);

        // accumulate
        counterA = _mm_add_epi32(counterA, r4);

        // do it again...
        r4 = _mm_and_si128(mask_bits_b, r0);
        r5 = _mm_and_si128(mask_bits_b, r1);
        r6 = _mm_and_si128(mask_bits_b, r2);
        r7 = _mm_and_si128(mask_bits_b, r3);

        r4 = _mm_sad_epu8(r4, zero);
        r5 = _mm_sad_epu8(r5, zero);
        r6 = _mm_sad_epu8(r6, zero);
        r7 = _mm_sad_epu8(r7, zero);

        r4 = _mm_add_epi16(r4,r5);
        r4 = _mm_add_epi16(r4,r6);
        r4 = _mm_add_epi16(r4,r7);

        r5 = _mm_avg_epu8(zero, r4); // shift right 1
        r5 = _mm_and_si128(r5, mask_lo_cnt);
        r4 = _mm_srli_epi16(r4, 7);
        r4 = _mm_packs_epi32(r4, r5);

        counterB = _mm_add_epi32(counterB, r4); // accumulate

        // rotate right 4
        r4 = _mm_slli_epi16(r0, 12);
        r5 = _mm_slli_epi16(r1, 12);
        r6 = _mm_slli_epi16(r2, 12);
        r7 = _mm_slli_epi16(r3, 12);
        r0 = _mm_srli_epi16(r0, 4);
        r1 = _mm_srli_epi16(r1, 4);
        r2 = _mm_srli_epi16(r2, 4);
        r3 = _mm_srli_epi16(r3, 4);
        r0 = _mm_or_si128(r0, r4);
        r1 = _mm_or_si128(r1, r5);
        r2 = _mm_or_si128(r2, r6);
        r3 = _mm_or_si128(r3, r7);

        // do it again...
        r4 = _mm_and_si128(mask_bits_a, r0);
        r5 = _mm_and_si128(mask_bits_a, r1);
        r6 = _mm_and_si128(mask_bits_a, r2);
        r7 = _mm_and_si128(mask_bits_a, r3);

        r4 = _mm_sad_epu8(r4, zero);
        r5 = _mm_sad_epu8(r5, zero);
        r6 = _mm_sad_epu8(r6, zero);
        r7 = _mm_sad_epu8(r7, zero);

        r4 = _mm_add_epi16(r4,r5);
        r4 = _mm_add_epi16(r4,r6);
        r4 = _mm_add_epi16(r4,r7);

        r5 = _mm_and_si128(mask_lo_cnt, r4);
        r4 = _mm_srli_epi16(r4, 6);
        r4 = _mm_packs_epi32(r4, r5);

        counterC = _mm_add_epi32(counterC, r4); // accumulate

        // do it again...
        r0 = _mm_and_si128(r0, mask_bits_b);
        r1 = _mm_and_si128(r1, mask_bits_b);
        r2 = _mm_and_si128(r2, mask_bits_b);
        r3 = _mm_and_si128(r3, mask_bits_b);

        r0 = _mm_sad_epu8(r0, zero);
        r1 = _mm_sad_epu8(r1, zero);
        r2 = _mm_sad_epu8(r2, zero);
        r3 = _mm_sad_epu8(r3, zero);

        r0 = _mm_add_epi16(r0,r1);
        r0 = _mm_add_epi16(r0,r2);
        r0 = _mm_add_epi16(r0,r3);

        r1 = _mm_avg_epu8(zero, r0);
        r1 = _mm_and_si128(r1, mask_lo_cnt);
        r0 = _mm_srli_epi16(r0, 7);
        r0 = _mm_packs_epi32(r0, r1);

        counterD = _mm_add_epi32(counterD, r0); // accumulate
    }

    // transpose then store counters
    __m128i counter_1098 = _mm_unpackhi_epi32(counterA, counterB);
    __m128i counter_76FE = _mm_unpacklo_epi32(counterA, counterB);
    __m128i counter_32BA = _mm_unpacklo_epi32(counterC, counterD);
    __m128i counter_54DC = _mm_unpackhi_epi32(counterC, counterD);
    __m128i counter_7654 = _mm_unpackhi_epi64(counter_54DC, counter_76FE);
    __m128i counter_FEDC = _mm_unpacklo_epi64(counter_54DC, counter_76FE);
    __m128i counter_3210 = _mm_unpackhi_epi64(counter_1098, counter_32BA);
    __m128i counter_BA98 = _mm_unpacklo_epi64(counter_1098, counter_32BA);


    _mm_storeu_si128((__m128i*)&buffer[0], counter_3210);
    _mm_storeu_si128((__m128i*)&buffer[4], counter_7654);
    _mm_storeu_si128((__m128i*)&buffer[8], counter_BA98);
    _mm_storeu_si128((__m128i*)&buffer[12], counter_FEDC);
    for (int i = 0; i < 16; ++i) flag_counts[i] += buffer[i];

    // scalar tail loop
    int tail = len & 31;
    if (tail != 0) {
        uint64_t countsA = 0;
        uint64_t countsB = 0;
        do {
            // zero-extend a bit to 8-bits then accumulate
            // (emulate pdep)
            const uint64_t mask_01 = UINT64_C(0x0101010101010101);// 100000001000000010000000100000001000000010000000100000001
            const uint64_t magic   = UINT64_C(0x0000040010004001);// 000000000000001000000000000010000000000000100000000000001
            // 1+(1<<14)+(1<<28)+(1<<42)
            uint64_t x = *data++;
            countsA += ((x & 0x5555) * magic) & mask_01; // 0101010101010101
            countsB += (((x >> 1) & 0x5555) * magic) & mask_01;
        } while (--tail);

        // transpose then store counters
        flag_counts[0]  += countsA & 0xFF;
        flag_counts[8]  += (countsA >>  8) & 0xFF;
        flag_counts[2]  += (countsA >> 16) & 0xFF;
        flag_counts[10] += (countsA >> 24) & 0xFF;
        flag_counts[4]  += (countsA >> 32) & 0xFF;
        flag_counts[12] += (countsA >> 40) & 0xFF;
        flag_counts[6]  += (countsA >> 48) & 0xFF;
        flag_counts[14] += (countsA >> 56) & 0xFF;
        flag_counts[1]  += countsB & 0xFF;
        flag_counts[9]  += (countsB >>  8) & 0xFF;
        flag_counts[3]  += (countsB >> 16) & 0xFF;
        flag_counts[11] += (countsB >> 24) & 0xFF;
        flag_counts[5]  += (countsB >> 32) & 0xFF;
        flag_counts[13] += (countsB >> 40) & 0xFF;
        flag_counts[7]  += (countsB >> 48) & 0xFF;
        flag_counts[15] += (countsB >> 56) & 0xFF;
    }

    return 0;
}
1030 |
// Positional popcount using the blend/movemask technique: byte-interleave
// pairs of vectors, then for each of 8 bit planes take the sign-bit
// movemask, popcount it, and shift the bytes left to expose the next
// plane. Unrolled 8x/4x/2x over 128-bit vectors; scalar tail for the
// remaining words.
STORM_TARGET("sse4.2")
static
int STORM_pospopcnt_u16_sse_blend_popcnt_unroll8(const uint16_t* array, size_t len, uint32_t* out) {
    const __m128i* data_vectors = (const __m128i*)(array);
    const uint32_t n_cycles = len / 8;

    size_t i = 0;
    for (/**/; i + 8 <= n_cycles; i += 8) {
#define L(p) __m128i v##p = _mm_loadu_si128(data_vectors+i+p);
        L(0) L(1) L(2) L(3)
        L(4) L(5) L(6) L(7)

// U(p,k): byte-swap-interleave vectors p and k so the low/high bytes of
// each word are grouped, making movemask pick up one bit plane at a time.
#define U0(p,k) __m128i input##p = _mm_or_si128(_mm_and_si128(v##p, _mm_set1_epi16(0x00FF)), _mm_slli_epi16(v##k, 8));
#define U1(p,k) __m128i input##k = _mm_or_si128(_mm_and_si128(v##p, _mm_set1_epi16(0xFF00)), _mm_srli_epi16(v##k, 8));
#define U(p, k)  U0(p,k) U1(p,k)

        U(0,1) U(2,3) U(4,5) U(6,7)

        // NOTE: inner `int i` deliberately shadows the outer vector index;
        // it is the bit-plane counter (MSB first, hence 7-i / 15-i).
        for (int i = 0; i < 8; ++i) {
#define A0(p) out[ 7 - i] += _mm_popcnt_u32(_mm_movemask_epi8(input##p));
#define A1(k) out[15 - i] += _mm_popcnt_u32(_mm_movemask_epi8(input##k));
#define A(p, k) A0(p) A1(k)
            A(0,1) A(2, 3) A(4,5) A(6, 7)

// P(p,k): shift every byte left by one to expose the next bit plane.
#define P0(p) input##p = _mm_add_epi8(input##p, input##p);
#define P(p, k) input##p = P0(p) P0(k)

            P(0,1) P(2, 3) P(4,5) P(6, 7)
        }
    }

    for (/**/; i + 4 <= n_cycles; i += 4) {
        L(0) L(1) L(2) L(3)
        U(0,1) U(2,3)

        for (int i = 0; i < 8; ++i) {
            A(0,1) A(2, 3)
            P(0,1) P(2, 3)
        }
    }

    for (/**/; i + 2 <= n_cycles; i += 2) {
        L(0) L(1)
        U(0,1)

        for (int i = 0; i < 8; ++i) {
            A(0,1)
            P(0,1)
        }
    }

    // Scalar tail: convert the vector index back to a word index.
    i *= 8;
    for (/**/; i < len; ++i) {
        for (int j = 0; j < 16; ++j) {
            out[j] += ((array[i] & (1 << j)) >> j);
        }
    }

#undef L
#undef U0
#undef U1
#undef U
#undef A0
#undef A1
#undef A
#undef P0
#undef P
    return 0;
}
1100 |
// Positional popcount using a Harley-Seal carry-save network: v1/v2/v4/v8
// hold bit-sliced running sums with weights 1/2/4/8, and every 16 input
// vectors the weight-16 overflow (v16) is drained into 16 per-position
// 16-bit counters. Counters are flushed to `out` often enough to avoid
// 16-bit overflow; leftover weights are added back at the end.
STORM_TARGET("sse4.2")
static
int STORM_pospopcnt_u16_sse_harvey_seal(const uint16_t* array, size_t len, uint32_t* out) {
    // Handle the tail (len modulo 128 words) with the scalar method first.
    for (uint32_t i = len - (len % (16 * 8)); i < len; ++i) {
        for (int j = 0; j < 16; ++j) {
            out[j] += ((array[i] & (1 << j)) >> j);
        }
    }

    const __m128i* data = (const __m128i*)array;
    size_t size = len / 8;
    __m128i v1  = _mm_setzero_si128();
    __m128i v2  = _mm_setzero_si128();
    __m128i v4  = _mm_setzero_si128();
    __m128i v8  = _mm_setzero_si128();
    __m128i v16 = _mm_setzero_si128();
    __m128i twosA, twosB, foursA, foursB, eightsA, eightsB;

    const uint64_t limit = size - size % 16;
    uint64_t i = 0;
    uint16_t buffer[8];
    __m128i counter[16];

    while (i < limit) {
        // NOTE: the `size_t i` loops below intentionally shadow the outer
        // vector index; they iterate over the 16 bit positions.
        for (size_t i = 0; i < 16; ++i) {
            counter[i] = _mm_setzero_si128();
        }

        // Bound each inner pass so the 16-bit lane counters cannot overflow.
        size_t thislimit = limit;
        if (thislimit - i >= (1 << 16))
            thislimit = i + (1 << 16) - 1;

        for (/**/; i < thislimit; i += 16) {
// U(pos): add the current bit of v16 to counter[pos], then shift v16 to
// expose the next bit position.
#define U(pos) {                     \
    counter[pos] = _mm_add_epi16(counter[pos], _mm_and_si128(v16, _mm_set1_epi16(1))); \
    v16 = _mm_srli_epi16(v16, 1); \
}
            STORM_pospopcnt_csa_sse(&twosA,  &v1, _mm_loadu_si128(data + i +  0), _mm_loadu_si128(data + i +  1));
            STORM_pospopcnt_csa_sse(&twosB,  &v1, _mm_loadu_si128(data + i +  2), _mm_loadu_si128(data + i +  3));
            STORM_pospopcnt_csa_sse(&foursA, &v2, twosA, twosB);
            STORM_pospopcnt_csa_sse(&twosA,  &v1, _mm_loadu_si128(data + i +  4), _mm_loadu_si128(data + i +  5));
            STORM_pospopcnt_csa_sse(&twosB,  &v1, _mm_loadu_si128(data + i +  6), _mm_loadu_si128(data + i +  7));
            STORM_pospopcnt_csa_sse(&foursB, &v2, twosA, twosB);
            STORM_pospopcnt_csa_sse(&eightsA,&v4, foursA, foursB);
            STORM_pospopcnt_csa_sse(&twosA,  &v1, _mm_loadu_si128(data + i +  8),  _mm_loadu_si128(data + i +  9));
            STORM_pospopcnt_csa_sse(&twosB,  &v1, _mm_loadu_si128(data + i + 10), _mm_loadu_si128(data + i + 11));
            STORM_pospopcnt_csa_sse(&foursA, &v2, twosA, twosB);
            STORM_pospopcnt_csa_sse(&twosA,  &v1, _mm_loadu_si128(data + i + 12), _mm_loadu_si128(data + i + 13));
            STORM_pospopcnt_csa_sse(&twosB,  &v1, _mm_loadu_si128(data + i + 14), _mm_loadu_si128(data + i + 15));
            STORM_pospopcnt_csa_sse(&foursB, &v2, twosA, twosB);
            STORM_pospopcnt_csa_sse(&eightsB,&v4, foursA, foursB);
            U(0) U(1) U(2) U(3) U(4) U(5) U(6) U(7) U(8) U(9) U(10) U(11) U(12) U(13) U(14) U(15) // Updates
            STORM_pospopcnt_csa_sse(&v16,    &v8, eightsA, eightsB);
#undef U
        }

        // update the counters after the last iteration
        for (size_t i = 0; i < 16; ++i) {
            counter[i] = _mm_add_epi16(counter[i], _mm_and_si128(v16, _mm_set1_epi16(1)));
            v16 = _mm_srli_epi16(v16, 1);
        }

        // Drain the weight-16 counters into the output totals.
        for (size_t i = 0; i < 16; ++i) {
            _mm_storeu_si128((__m128i*)buffer, counter[i]);
            for (size_t z = 0; z < 8; z++) {
                out[i] += 16 * (uint32_t)buffer[z];
            }
        }
    }

    // Add back the residual bit-sliced sums with their weights (1/2/4/8).
    _mm_storeu_si128((__m128i*)buffer, v1);
    for (size_t i = 0; i < 8; ++i) {
        for (int j = 0; j < 16; ++j) {
            out[j] += ((buffer[i] & (1 << j)) >> j);
        }
    }

    _mm_storeu_si128((__m128i*)buffer, v2);
    for (size_t i = 0; i < 8; ++i) {
        for (int j = 0; j < 16; ++j) {
            out[j] += 2 * ((buffer[i] & (1 << j)) >> j);
        }
    }
    _mm_storeu_si128((__m128i*)buffer, v4);
    for (size_t i = 0; i < 8; ++i) {
        for (int j = 0; j < 16; ++j) {
            out[j] += 4 * ((buffer[i] & (1 << j)) >> j);
        }
    }
    _mm_storeu_si128((__m128i*)buffer, v8);
    for (size_t i = 0; i < 8; ++i) {
        for (int j = 0; j < 16; ++j) {
            out[j] += 8 * ((buffer[i] & (1 << j)) >> j);
        }
    }
    return 0;
}
1198 |
// Popcount of the pairwise AND of two __m128i buffers (intersection
// cardinality) using a Harley-Seal carry-save adder network: 16 vectors
// per iteration, counting only the weight-16 overflow in the loop and
// adding the residual weights afterwards. `size` is in 128-bit words.
STORM_TARGET("sse4.2")
static
uint64_t STORM_intersect_count_csa_sse4(const __m128i* STORM_RESTRICT data1,
                                        const __m128i* STORM_RESTRICT data2,
                                        size_t size)
{
    __m128i ones     = _mm_setzero_si128();
    __m128i twos     = _mm_setzero_si128();
    __m128i fours    = _mm_setzero_si128();
    __m128i eights   = _mm_setzero_si128();
    __m128i sixteens = _mm_setzero_si128();
    __m128i twosA, twosB, foursA, foursB, eightsA, eightsB;

    uint64_t i = 0;
    uint64_t limit = size - size % 16;
    uint64_t cnt64 = 0;

// _mm_and_si128 instead of the GNU `&` vector-operator extension so this
// also compiles with MSVC.
#define LOAD(a) (_mm_and_si128(_mm_loadu_si128(&data1[i+a]), _mm_loadu_si128(&data2[i+a])))

    for (/**/; i < limit; i += 16) {
        STORM_CSA128(&twosA,   &ones,   ones,  LOAD(0), LOAD(1));
        STORM_CSA128(&twosB,   &ones,   ones,  LOAD(2), LOAD(3));
        STORM_CSA128(&foursA,  &twos,   twos,  twosA,   twosB);
        STORM_CSA128(&twosA,   &ones,   ones,  LOAD(4), LOAD(5));
        STORM_CSA128(&twosB,   &ones,   ones,  LOAD(6), LOAD(7));
        STORM_CSA128(&foursB,  &twos,   twos,  twosA,   twosB);
        STORM_CSA128(&eightsA, &fours,  fours, foursA,  foursB);
        STORM_CSA128(&twosA,   &ones,   ones,  LOAD(8), LOAD(9));
        STORM_CSA128(&twosB,   &ones,   ones,  LOAD(10),LOAD(11));
        STORM_CSA128(&foursA,  &twos,   twos,  twosA,   twosB);
        STORM_CSA128(&twosA,   &ones,   ones,  LOAD(12),LOAD(13));
        STORM_CSA128(&twosB,   &ones,   ones,  LOAD(14),LOAD(15));
        STORM_CSA128(&foursB,  &twos,   twos,  twosA,   twosB);
        STORM_CSA128(&eightsB, &fours,  fours, foursA,  foursB);
        STORM_CSA128(&sixteens,&eights, eights,eightsA, eightsB);

        cnt64 += STORM_POPCOUNT_SSE(sixteens);
    }
#undef LOAD

    // Weight the partial accumulators: total = 16*sixteens + 8*eights + ...
    cnt64 <<= 4;
    cnt64 += STORM_POPCOUNT_SSE(eights) << 3;
    cnt64 += STORM_POPCOUNT_SSE(fours)  << 2;
    cnt64 += STORM_POPCOUNT_SSE(twos)   << 1;
    cnt64 += STORM_POPCOUNT_SSE(ones)   << 0;

    // Scalar-style tail over the remaining vectors. Must ACCUMULATE:
    // the previous `cnt64 =` discarded the main-loop count whenever
    // size was not a multiple of 16.
    for (/**/; i < size; ++i)
        cnt64 += STORM_POPCOUNT_SSE(_mm_and_si128(_mm_loadu_si128(&data1[i]), _mm_loadu_si128(&data2[i])));

    return cnt64;
}
1250 |
// Popcount of the pairwise OR of two __m128i buffers (union cardinality)
// using a Harley-Seal carry-save adder network: 16 vectors per iteration,
// counting only the weight-16 overflow in the loop and adding the
// residual weights afterwards. `size` is in 128-bit words.
STORM_TARGET("sse4.2")
static
uint64_t STORM_union_count_csa_sse4(const __m128i* STORM_RESTRICT data1,
                                    const __m128i* STORM_RESTRICT data2,
                                    size_t size)
{
    __m128i ones     = _mm_setzero_si128();
    __m128i twos     = _mm_setzero_si128();
    __m128i fours    = _mm_setzero_si128();
    __m128i eights   = _mm_setzero_si128();
    __m128i sixteens = _mm_setzero_si128();
    __m128i twosA, twosB, foursA, foursB, eightsA, eightsB;

    uint64_t i = 0;
    uint64_t limit = size - size % 16;
    uint64_t cnt64 = 0;

// _mm_or_si128 instead of the GNU `|` vector-operator extension so this
// also compiles with MSVC.
#define LOAD(a) (_mm_or_si128(_mm_loadu_si128(&data1[i+a]), _mm_loadu_si128(&data2[i+a])))

    for (/**/; i < limit; i += 16) {
        STORM_CSA128(&twosA,   &ones,   ones,  LOAD(0), LOAD(1));
        STORM_CSA128(&twosB,   &ones,   ones,  LOAD(2), LOAD(3));
        STORM_CSA128(&foursA,  &twos,   twos,  twosA,   twosB);
        STORM_CSA128(&twosA,   &ones,   ones,  LOAD(4), LOAD(5));
        STORM_CSA128(&twosB,   &ones,   ones,  LOAD(6), LOAD(7));
        STORM_CSA128(&foursB,  &twos,   twos,  twosA,   twosB);
        STORM_CSA128(&eightsA, &fours,  fours, foursA,  foursB);
        STORM_CSA128(&twosA,   &ones,   ones,  LOAD(8), LOAD(9));
        STORM_CSA128(&twosB,   &ones,   ones,  LOAD(10),LOAD(11));
        STORM_CSA128(&foursA,  &twos,   twos,  twosA,   twosB);
        STORM_CSA128(&twosA,   &ones,   ones,  LOAD(12),LOAD(13));
        STORM_CSA128(&twosB,   &ones,   ones,  LOAD(14),LOAD(15));
        STORM_CSA128(&foursB,  &twos,   twos,  twosA,   twosB);
        STORM_CSA128(&eightsB, &fours,  fours, foursA,  foursB);
        STORM_CSA128(&sixteens,&eights, eights,eightsA, eightsB);

        cnt64 += STORM_POPCOUNT_SSE(sixteens);
    }
#undef LOAD

    // Weight the partial accumulators: total = 16*sixteens + 8*eights + ...
    cnt64 <<= 4;
    cnt64 += STORM_POPCOUNT_SSE(eights) << 3;
    cnt64 += STORM_POPCOUNT_SSE(fours)  << 2;
    cnt64 += STORM_POPCOUNT_SSE(twos)   << 1;
    cnt64 += STORM_POPCOUNT_SSE(ones)   << 0;

    // Scalar-style tail over the remaining vectors. Must ACCUMULATE:
    // the previous `cnt64 =` discarded the main-loop count whenever
    // size was not a multiple of 16.
    for (/**/; i < size; ++i)
        cnt64 += STORM_POPCOUNT_SSE(_mm_or_si128(_mm_loadu_si128(&data1[i]), _mm_loadu_si128(&data2[i])));

    return cnt64;
}
1302 |
1303 | STORM_TARGET("sse4.2")
1304 | static
1305 | uint64_t STORM_diff_count_csa_sse4(const __m128i* STORM_RESTRICT data1,
1306 | const __m128i* STORM_RESTRICT data2,
1307 | size_t size)
1308 | {
1309 | __m128i ones = _mm_setzero_si128();
1310 | __m128i twos = _mm_setzero_si128();
1311 | __m128i fours = _mm_setzero_si128();
1312 | __m128i eights = _mm_setzero_si128();
1313 | __m128i sixteens = _mm_setzero_si128();
1314 | __m128i twosA, twosB, foursA, foursB, eightsA, eightsB;
1315 |
1316 | uint64_t i = 0;
1317 | uint64_t limit = size - size % 16;
1318 | uint64_t cnt64 = 0;
1319 |
1320 | #define LOAD(a) (_mm_loadu_si128(&data1[i+a]) ^ _mm_loadu_si128(&data2[i+a]))
1321 |
1322 | for (/**/; i < limit; i += 16) {
1323 | STORM_CSA128(&twosA, &ones, ones, LOAD(0), LOAD(1));
1324 | STORM_CSA128(&twosB, &ones, ones, LOAD(2), LOAD(3));
1325 | STORM_CSA128(&foursA, &twos, twos, twosA, twosB);
1326 | STORM_CSA128(&twosA, &ones, ones, LOAD(4), LOAD(5));
1327 | STORM_CSA128(&twosB, &ones, ones, LOAD(6), LOAD(7));
1328 | STORM_CSA128(&foursB, &twos, twos, twosA, twosB);
1329 | STORM_CSA128(&eightsA, &fours, fours, foursA, foursB);
1330 | STORM_CSA128(&twosA, &ones, ones, LOAD(8), LOAD(9));
1331 | STORM_CSA128(&twosB, &ones, ones, LOAD(10), LOAD(11));
1332 | STORM_CSA128(&foursA, &twos, twos, twosA, twosB);
1333 | STORM_CSA128(&twosA, &ones, ones, LOAD(12), LOAD(13));
1334 | STORM_CSA128(&twosB, &ones, ones, LOAD(14), LOAD(15));
1335 | STORM_CSA128(&foursB, &twos, twos, twosA, twosB);
1336 | STORM_CSA128(&eightsB, &fours, fours, foursA, foursB);
1337 | STORM_CSA128(&sixteens,&eights, eights,eightsA,eightsB);
1338 |
1339 | cnt64 += STORM_POPCOUNT_SSE(sixteens);
1340 | }
1341 | #undef LOAD
1342 |
1343 | cnt64 <<= 4;
1344 | cnt64 += STORM_POPCOUNT_SSE(eights) << 3;
1345 | cnt64 += STORM_POPCOUNT_SSE(fours) << 2;
1346 | cnt64 += STORM_POPCOUNT_SSE(twos) << 1;
1347 | cnt64 += STORM_POPCOUNT_SSE(ones) << 0;
1348 |
1349 | for (/**/; i < size; ++i)
1350 | cnt64 = STORM_POPCOUNT_SSE(_mm_loadu_si128(&data1[i]) ^ _mm_loadu_si128(&data2[i]));
1351 |
1352 | return cnt64;
1353 | }
1354 |
1355 | STORM_TARGET("sse4.2")
1356 | static
1357 | uint64_t STORM_popcnt_csa_sse4(const __m128i* STORM_RESTRICT data,
1358 | size_t size)
1359 | {
1360 | __m128i ones = _mm_setzero_si128();
1361 | __m128i twos = _mm_setzero_si128();
1362 | __m128i fours = _mm_setzero_si128();
1363 | __m128i eights = _mm_setzero_si128();
1364 | __m128i sixteens = _mm_setzero_si128();
1365 | __m128i twosA, twosB, foursA, foursB, eightsA, eightsB;
1366 |
1367 | uint64_t i = 0;
1368 | uint64_t limit = size - size % 16;
1369 | uint64_t cnt64 = 0;
1370 |
1371 | #define LOAD(a) (_mm_loadu_si128(&data[i+a]))
1372 |
1373 | for (/**/; i < limit; i += 16) {
1374 | STORM_CSA128(&twosA, &ones, ones, LOAD(0), LOAD(1));
1375 | STORM_CSA128(&twosB, &ones, ones, LOAD(2), LOAD(3));
1376 | STORM_CSA128(&foursA, &twos, twos, twosA, twosB);
1377 | STORM_CSA128(&twosA, &ones, ones, LOAD(4), LOAD(5));
1378 | STORM_CSA128(&twosB, &ones, ones, LOAD(6), LOAD(7));
1379 | STORM_CSA128(&foursB, &twos, twos, twosA, twosB);
1380 | STORM_CSA128(&eightsA, &fours, fours, foursA, foursB);
1381 | STORM_CSA128(&twosA, &ones, ones, LOAD(8), LOAD(9));
1382 | STORM_CSA128(&twosB, &ones, ones, LOAD(10), LOAD(11));
1383 | STORM_CSA128(&foursA, &twos, twos, twosA, twosB);
1384 | STORM_CSA128(&twosA, &ones, ones, LOAD(12), LOAD(13));
1385 | STORM_CSA128(&twosB, &ones, ones, LOAD(14), LOAD(15));
1386 | STORM_CSA128(&foursB, &twos, twos, twosA, twosB);
1387 | STORM_CSA128(&eightsB, &fours, fours, foursA, foursB);
1388 | STORM_CSA128(&sixteens,&eights, eights,eightsA,eightsB);
1389 |
1390 | cnt64 += STORM_POPCOUNT_SSE(sixteens);
1391 | }
1392 | #undef LOAD
1393 |
1394 | cnt64 <<= 4;
1395 | cnt64 += STORM_POPCOUNT_SSE(eights) << 3;
1396 | cnt64 += STORM_POPCOUNT_SSE(fours) << 2;
1397 | cnt64 += STORM_POPCOUNT_SSE(twos) << 1;
1398 | cnt64 += STORM_POPCOUNT_SSE(ones) << 0;
1399 |
1400 | for (/**/; i < size; ++i)
1401 | cnt64 = STORM_POPCOUNT_SSE(_mm_loadu_si128(&data[i]));
1402 |
1403 | return cnt64;
1404 | }
1405 |
1406 | STORM_TARGET("sse4.2")
1407 | static
1408 | uint64_t STORM_intersect_count_sse4(const uint64_t* STORM_RESTRICT b1,
1409 | const uint64_t* STORM_RESTRICT b2,
1410 | const size_t n_ints)
1411 | {
1412 | uint64_t count = 0;
1413 | const __m128i* r1 = (__m128i*)b1;
1414 | const __m128i* r2 = (__m128i*)b2;
1415 | const uint32_t n_cycles = n_ints / 2;
1416 |
1417 | count += STORM_intersect_count_csa_sse4(r1, r2, n_cycles);
1418 |
1419 | for (int i = n_cycles*2; i < n_ints; ++i) {
1420 | count += STORM_POPCOUNT(b1[i] & b2[i]);
1421 | }
1422 |
1423 | return(count);
1424 | }
1425 |
1426 | STORM_TARGET("sse4.2")
1427 | static
1428 | uint64_t STORM_union_count_sse4(const uint64_t* STORM_RESTRICT b1,
1429 | const uint64_t* STORM_RESTRICT b2,
1430 | const size_t n_ints)
1431 | {
1432 | uint64_t count = 0;
1433 | const __m128i* r1 = (__m128i*)b1;
1434 | const __m128i* r2 = (__m128i*)b2;
1435 | const uint32_t n_cycles = n_ints / 2;
1436 |
1437 | count += STORM_union_count_csa_sse4(r1, r2, n_cycles);
1438 |
1439 | for (int i = n_cycles*2; i < n_ints; ++i) {
1440 | count += STORM_POPCOUNT(b1[i] | b2[i]);
1441 | }
1442 |
1443 | return(count);
1444 | }
1445 |
1446 | STORM_TARGET("sse4.2")
1447 | static
1448 | uint64_t STORM_diff_count_sse4(const uint64_t* STORM_RESTRICT b1,
1449 | const uint64_t* STORM_RESTRICT b2,
1450 | const size_t n_ints)
1451 | {
1452 | uint64_t count = 0;
1453 | const __m128i* r1 = (__m128i*)b1;
1454 | const __m128i* r2 = (__m128i*)b2;
1455 | const uint32_t n_cycles = n_ints / 2;
1456 |
1457 | count += STORM_diff_count_csa_sse4(r1, r2, n_cycles);
1458 |
1459 | for (int i = n_cycles*2; i < n_ints; ++i) {
1460 | count += STORM_POPCOUNT(b1[i] ^ b2[i]);
1461 | }
1462 |
1463 | return(count);
1464 | }
1465 |
1466 | STORM_TARGET("sse4.2")
1467 | static
1468 | uint64_t STORM_popcnt_sse4(const uint64_t* STORM_RESTRICT data,
1469 | const size_t n_ints)
1470 | {
1471 | uint64_t count = 0;
1472 | const __m128i* r1 = (__m128i*)data;
1473 | const uint32_t n_cycles = n_ints / 2;
1474 |
1475 | count += STORM_popcnt_csa_sse4(r1, n_cycles);
1476 |
1477 | for (int i = n_cycles*2; i < n_ints; ++i) {
1478 | count += STORM_POPCOUNT(data[i]);
1479 | }
1480 |
1481 | return(count);
1482 | }
1483 | #endif
1484 |
1485 | /****************************
1486 | * AVX256 functions
1487 | ****************************/
1488 |
1489 | #if defined(STORM_HAVE_AVX2)
1490 |
#include <immintrin.h>
1492 |
1493 | STORM_TARGET("avx2")
1494 | STORM_FORCE_INLINE
1495 | void STORM_CSA256(__m256i* h, __m256i* l, __m256i a, __m256i b, __m256i c) {
1496 | __m256i u = _mm256_xor_si256(a, b);
1497 | *h = _mm256_or_si256(_mm256_and_si256(a, b), _mm256_and_si256(u, c));
1498 | *l = _mm256_xor_si256(u, c);
1499 | }
1500 |
1501 | STORM_TARGET("avx2")
1502 | STORM_FORCE_INLINE
1503 | void STORM_pospopcnt_csa_avx2(__m256i* STORM_RESTRICT h,
1504 | __m256i* STORM_RESTRICT l,
1505 | const __m256i b,
1506 | const __m256i c)
1507 | {
1508 | const __m256i u = _mm256_xor_si256(*l, b);
1509 | *h = _mm256_or_si256(*l & b, u & c);
1510 | *l = _mm256_xor_si256(u, c);
1511 | }
1512 |
// Positional popcount for 16-bit words: out[j] is incremented by the number
// of values in `array` whose bit j is set, for j in [0,16). Scheme: load
// 8/4/2 vectors at a time, byte-blend pairs of vectors (U macros), then for
// each of 8 bit positions gather the top bit of every byte with movemask
// and popcount the 32-bit mask (A macros); the P macros double every byte
// (add-to-self) to expose the next bit position. Remaining words are
// counted bit-by-bit in scalar code. Always returns 0.
STORM_TARGET("avx2")
static
int STORM_pospopcnt_u16_avx2_blend_popcnt_unroll8(const uint16_t* array, size_t len, uint32_t* out) {
    const __m256i* data_vectors = (const __m256i*)(array);
    const uint32_t n_cycles = len / 16;  // 16 uint16 words per 256-bit vector

    size_t i = 0;
    // Main loop: 8 vectors (128 words) per iteration.
    for (/**/; i + 8 <= n_cycles; i += 8) {
#define L(p) __m256i v##p = _mm256_loadu_si256(data_vectors+i+p);
        L(0) L(1) L(2) L(3)
        L(4) L(5) L(6) L(7)

// U0/U1: interleave the low/high bytes of vector pair (p, k) so each
// resulting vector holds one byte per source word.
#define U0(p,k) __m256i input##p = _mm256_or_si256(_mm256_and_si256(v##p, _mm256_set1_epi16(0x00FF)), _mm256_slli_epi16(v##k, 8));
#define U1(p,k) __m256i input##k = _mm256_or_si256(_mm256_and_si256(v##p, _mm256_set1_epi16(0xFF00)), _mm256_srli_epi16(v##k, 8));
#define U(p, k) U0(p,k) U1(p,k)
        U(0,1) U(2, 3) U(4, 5) U(6, 7)

        // Walk the 8 bits of each byte lane. NB: this inner `i`
        // deliberately shadows the outer vector index.
        for (int i = 0; i < 8; ++i) {
// A0/A1: movemask collects the current MSB of every byte; its popcount is
// added to the counter for the corresponding bit position.
#define A0(p) out[ 7 - i] += _mm_popcnt_u32(_mm256_movemask_epi8(input##p));
#define A1(k) out[15 - i] += _mm_popcnt_u32(_mm256_movemask_epi8(input##k));
#define A(p, k) A0(p) A1(k)
            A(0,1) A(2, 3) A(4, 5) A(6, 7)

// P: input += input doubles every byte, shifting each byte left by one so
// the next bit position becomes the MSB on the following iteration.
#define P0(p) input##p = _mm256_add_epi8(input##p, input##p);
#define P(p, k) input##p = P0(p) P0(k)
            P(0,1) P(2, 3) P(4, 5) P(6, 7)
        }
    }

    // Residual vector loops: 4 vectors, then 2 vectors, same scheme.
    for (/**/; i + 4 <= n_cycles; i += 4) {
        L(0) L(1) L(2) L(3)
        U(0,1) U(2, 3)

        for (int i = 0; i < 8; ++i) {
            A(0,1) A( 2, 3)
            P(0,1) P( 2, 3)
        }
    }

    for (/**/; i + 2 <= n_cycles; i += 2) {
        L(0) L(1)
        U(0,1)

        for (int i = 0; i < 8; ++i) {
            A(0,1)
            P(0,1)
        }
    }

    // Convert the vector index back to a word index and finish the last
    // (< 32) words one bit at a time.
    i *= 16;
    for (/**/; i < len; ++i) {
        for (int j = 0; j < 16; ++j) {
            out[j] += ((array[i] & (1 << j)) >> j);
        }
    }

#undef L
#undef U0
#undef U1
#undef U
#undef A0
#undef A1
#undef A
#undef P0
#undef P

    return 0;
}
1581 |
// Positional popcount ("pospopcnt") via a Harley-Seal carry-save network:
// out[j] is incremented by the number of words in `array` with bit j set.
// v1/v2/v4/v8 accumulate bits of weight 1/2/4/8; the weight-16 vector is
// drained into 16 per-position epi16 counters which are flushed to `out`
// in bounded chunks so the 16-bit lanes cannot overflow. Always returns 0.
STORM_TARGET("avx2")
static
int STORM_pospopcnt_u16_avx2_harvey_seal(const uint16_t* array, size_t len, uint32_t* out) {
    // Scalar pre-pass over the tail the vector loop below will not cover
    // (everything past the last full block of 16*16 = 256 words).
    for (uint32_t i = len - (len % (16 * 16)); i < len; ++i) {
        for (int j = 0; j < 16; ++j) {
            out[j] += ((array[i] & (1 << j)) >> j);
        }
    }

    const __m256i* data = (const __m256i*)array;
    size_t size = len / 16;  // number of whole 256-bit vectors
    // Carry-save accumulators: vN carries bits of weight N.
    __m256i v1 = _mm256_setzero_si256();
    __m256i v2 = _mm256_setzero_si256();
    __m256i v4 = _mm256_setzero_si256();
    __m256i v8 = _mm256_setzero_si256();
    __m256i v16 = _mm256_setzero_si256();
    __m256i twosA, twosB, foursA, foursB, eightsA, eightsB;

    const uint64_t limit = size - size % 16;
    uint64_t i = 0;
    uint16_t buffer[16];
    __m256i counter[16];
    const __m256i one = _mm256_set1_epi16(1);

    while (i < limit) {
        // Reset the per-position counters for this chunk. NB: this `i`
        // deliberately shadows the outer loop counter.
        for (size_t i = 0; i < 16; ++i) {
            counter[i] = _mm256_setzero_si256();
        }

        // Bound the chunk length so the epi16 counter lanes cannot wrap.
        size_t thislimit = limit;
        if (thislimit - i >= (1 << 16))
            thislimit = i + (1 << 16) - 1;

        for (/**/; i < thislimit; i += 16) {
// U(pos): add the current low bit of every lane of v16 into counter[pos],
// then shift v16 right to line up the next bit position.
#define U(pos) { \
    counter[pos] = _mm256_add_epi16(counter[pos], _mm256_and_si256(v16, one)); \
    v16 = _mm256_srli_epi16(v16, 1); \
}
            STORM_pospopcnt_csa_avx2(&twosA, &v1, _mm256_loadu_si256(data + i + 0), _mm256_loadu_si256(data + i + 1));
            STORM_pospopcnt_csa_avx2(&twosB, &v1, _mm256_loadu_si256(data + i + 2), _mm256_loadu_si256(data + i + 3));
            STORM_pospopcnt_csa_avx2(&foursA, &v2, twosA, twosB);
            STORM_pospopcnt_csa_avx2(&twosA, &v1, _mm256_loadu_si256(data + i + 4), _mm256_loadu_si256(data + i + 5));
            STORM_pospopcnt_csa_avx2(&twosB, &v1, _mm256_loadu_si256(data + i + 6), _mm256_loadu_si256(data + i + 7));
            STORM_pospopcnt_csa_avx2(&foursB, &v2, twosA, twosB);
            STORM_pospopcnt_csa_avx2(&eightsA,&v4, foursA, foursB);
            STORM_pospopcnt_csa_avx2(&twosA, &v1, _mm256_loadu_si256(data + i + 8), _mm256_loadu_si256(data + i + 9));
            STORM_pospopcnt_csa_avx2(&twosB, &v1, _mm256_loadu_si256(data + i + 10), _mm256_loadu_si256(data + i + 11));
            STORM_pospopcnt_csa_avx2(&foursA, &v2, twosA, twosB);
            STORM_pospopcnt_csa_avx2(&twosA, &v1, _mm256_loadu_si256(data + i + 12), _mm256_loadu_si256(data + i + 13));
            STORM_pospopcnt_csa_avx2(&twosB, &v1, _mm256_loadu_si256(data + i + 14), _mm256_loadu_si256(data + i + 15));
            STORM_pospopcnt_csa_avx2(&foursB, &v2, twosA, twosB);
            STORM_pospopcnt_csa_avx2(&eightsB,&v4, foursA, foursB);
            U(0) U(1) U(2) U(3) U(4) U(5) U(6) U(7) U(8) U(9) U(10) U(11) U(12) U(13) U(14) U(15) // Updates
            STORM_pospopcnt_csa_avx2(&v16, &v8, eightsA, eightsB);
#undef U
        }

        // update the counters after the last iteration
        for (size_t i = 0; i < 16; ++i) {
            counter[i] = _mm256_add_epi16(counter[i], _mm256_and_si256(v16, one));
            v16 = _mm256_srli_epi16(v16, 1);
        }

        // Flush the epi16 counters: each count represents 16 input words
        // (the weight of v16).
        for (size_t i = 0; i < 16; ++i) {
            _mm256_storeu_si256((__m256i*)buffer, counter[i]);
            for (size_t z = 0; z < 16; z++) {
                out[i] += 16 * (uint32_t)buffer[z];
            }
        }
    }

    // Fold in the residual carry-save accumulators with weights 1/2/4/8.
    _mm256_storeu_si256((__m256i*)buffer, v1);
    for (size_t i = 0; i < 16; ++i) {
        for (int j = 0; j < 16; ++j) {
            out[j] += ((buffer[i] & (1 << j)) >> j);
        }
    }

    _mm256_storeu_si256((__m256i*)buffer, v2);
    for (size_t i = 0; i < 16; ++i) {
        for (int j = 0; j < 16; ++j) {
            out[j] += 2 * ((buffer[i] & (1 << j)) >> j);
        }
    }
    _mm256_storeu_si256((__m256i*)buffer, v4);
    for (size_t i = 0; i < 16; ++i) {
        for (int j = 0; j < 16; ++j) {
            out[j] += 4 * ((buffer[i] & (1 << j)) >> j);
        }
    }
    _mm256_storeu_si256((__m256i*)buffer, v8);
    for (size_t i = 0; i < 16; ++i) {
        for (int j = 0; j < 16; ++j) {
            out[j] += 8 * ((buffer[i] & (1 << j)) >> j);
        }
    }
    return 0;
}
1680 |
1681 |
1682 | STORM_TARGET("avx2")
1683 | static
1684 | __m256i STORM_popcnt256(__m256i v) {
1685 | __m256i lookup1 = _mm256_setr_epi8(
1686 | 4, 5, 5, 6, 5, 6, 6, 7,
1687 | 5, 6, 6, 7, 6, 7, 7, 8,
1688 | 4, 5, 5, 6, 5, 6, 6, 7,
1689 | 5, 6, 6, 7, 6, 7, 7, 8
1690 | );
1691 |
1692 | __m256i lookup2 = _mm256_setr_epi8(
1693 | 4, 3, 3, 2, 3, 2, 2, 1,
1694 | 3, 2, 2, 1, 2, 1, 1, 0,
1695 | 4, 3, 3, 2, 3, 2, 2, 1,
1696 | 3, 2, 2, 1, 2, 1, 1, 0
1697 | );
1698 |
1699 | __m256i low_mask = _mm256_set1_epi8(0x0f);
1700 | __m256i lo = _mm256_and_si256(v, low_mask);
1701 | __m256i hi = _mm256_and_si256(_mm256_srli_epi16(v, 4), low_mask);
1702 | __m256i popcnt1 = _mm256_shuffle_epi8(lookup1, lo);
1703 | __m256i popcnt2 = _mm256_shuffle_epi8(lookup2, hi);
1704 |
1705 | return _mm256_sad_epu8(popcnt1, popcnt2);
1706 | }
1707 |
// modified from https://github.com/WojciechMula/sse-popcount
// Popcount of the intersection (A & B) of two byte buffers of length n
// using the nibble-lookup (PSHUFB) method. Each 32-byte block adds per-byte
// counts into the 8-bit lanes of `local`; `local` is widened into the
// 64-bit lanes of `acc` via _mm256_sad_epu8 after at most 8 blocks so the
// byte lanes cannot overflow. The final < 32 bytes use a scalar 8-bit
// lookup table.
STORM_TARGET("avx2")
static
uint64_t STORM_intersect_count_lookup_avx2_func(const uint8_t* STORM_RESTRICT data1,
                                                const uint8_t* STORM_RESTRICT data2,
                                                const size_t n)
{

    size_t i = 0;

    // popcount of every 4-bit value, replicated across both 128-bit halves
    // (PSHUFB indexes within each half independently).
    const __m256i lookup = _mm256_setr_epi8(
        /* 0 */ 0, /* 1 */ 1, /* 2 */ 1, /* 3 */ 2,
        /* 4 */ 1, /* 5 */ 2, /* 6 */ 2, /* 7 */ 3,
        /* 8 */ 1, /* 9 */ 2, /* a */ 2, /* b */ 3,
        /* c */ 2, /* d */ 3, /* e */ 3, /* f */ 4,

        /* 0 */ 0, /* 1 */ 1, /* 2 */ 1, /* 3 */ 2,
        /* 4 */ 1, /* 5 */ 2, /* 6 */ 2, /* 7 */ 3,
        /* 8 */ 1, /* 9 */ 2, /* a */ 2, /* b */ 3,
        /* c */ 2, /* d */ 3, /* e */ 3, /* f */ 4
    );

    const __m256i low_mask = _mm256_set1_epi8(0x0f);

    __m256i acc = _mm256_setzero_si256();

// ITER: AND one 32-byte block from each input, split into low/high
// nibbles, look up per-nibble popcounts and accumulate them into the
// 8-bit lanes of `local`.
#define ITER { \
        const __m256i vec = _mm256_and_si256(_mm256_loadu_si256((const __m256i*)(data1 + i)), \
                                             _mm256_loadu_si256((const __m256i*)(data2 + i))); \
        const __m256i lo = _mm256_and_si256(vec, low_mask); \
        const __m256i hi = _mm256_and_si256(_mm256_srli_epi16(vec, 4), low_mask); \
        const __m256i popcnt1 = _mm256_shuffle_epi8(lookup, lo); \
        const __m256i popcnt2 = _mm256_shuffle_epi8(lookup, hi); \
        local = _mm256_add_epi8(local, popcnt1); \
        local = _mm256_add_epi8(local, popcnt2); \
        i += 32; \
    }

    // Unrolled main loop: 8 blocks per flush of `local` into `acc`.
    while (i + 8*32 <= n) {
        __m256i local = _mm256_setzero_si256();
        ITER ITER ITER ITER
        ITER ITER ITER ITER
        acc = _mm256_add_epi64(acc, _mm256_sad_epu8(local, _mm256_setzero_si256()));
    }

    // Remaining whole 32-byte blocks (< 8 of them).
    __m256i local = _mm256_setzero_si256();

    while (i + 32 <= n) {
        ITER;
    }

    acc = _mm256_add_epi64(acc, _mm256_sad_epu8(local, _mm256_setzero_si256()));

#undef ITER

    // Reduce the four 64-bit lanes of the accumulator.
    uint64_t result = 0;

    result += (uint64_t)(_mm256_extract_epi64(acc, 0));
    result += (uint64_t)(_mm256_extract_epi64(acc, 1));
    result += (uint64_t)(_mm256_extract_epi64(acc, 2));
    result += (uint64_t)(_mm256_extract_epi64(acc, 3));

    // Scalar tail: last n % 32 bytes.
    for (/**/; i < n; ++i) {
        result += STORM_popcnt_lookup8bit[data1[i] & data2[i]];
    }

    return result;
}
1776 |
// modified from https://github.com/WojciechMula/sse-popcount
// Popcount of the union (A | B) of two byte buffers of length n using the
// nibble-lookup (PSHUFB) method. Each 32-byte block adds per-byte counts
// into the 8-bit lanes of `local`; `local` is widened into the 64-bit
// lanes of `acc` via _mm256_sad_epu8 after at most 8 blocks so the byte
// lanes cannot overflow. The final < 32 bytes use a scalar 8-bit lookup
// table.
STORM_TARGET("avx2")
static
uint64_t STORM_union_count_lookup_avx2_func(const uint8_t* STORM_RESTRICT data1,
                                            const uint8_t* STORM_RESTRICT data2,
                                            const size_t n)
{

    size_t i = 0;

    // popcount of every 4-bit value, replicated across both 128-bit halves
    // (PSHUFB indexes within each half independently).
    const __m256i lookup = _mm256_setr_epi8(
        /* 0 */ 0, /* 1 */ 1, /* 2 */ 1, /* 3 */ 2,
        /* 4 */ 1, /* 5 */ 2, /* 6 */ 2, /* 7 */ 3,
        /* 8 */ 1, /* 9 */ 2, /* a */ 2, /* b */ 3,
        /* c */ 2, /* d */ 3, /* e */ 3, /* f */ 4,

        /* 0 */ 0, /* 1 */ 1, /* 2 */ 1, /* 3 */ 2,
        /* 4 */ 1, /* 5 */ 2, /* 6 */ 2, /* 7 */ 3,
        /* 8 */ 1, /* 9 */ 2, /* a */ 2, /* b */ 3,
        /* c */ 2, /* d */ 3, /* e */ 3, /* f */ 4
    );

    const __m256i low_mask = _mm256_set1_epi8(0x0f);

    __m256i acc = _mm256_setzero_si256();

// ITER: OR one 32-byte block from each input, split into low/high nibbles,
// look up per-nibble popcounts and accumulate them into the 8-bit lanes of
// `local`.
#define ITER { \
        const __m256i vec = _mm256_or_si256(_mm256_loadu_si256((const __m256i*)(data1 + i)), \
                                            _mm256_loadu_si256((const __m256i*)(data2 + i))); \
        const __m256i lo = _mm256_and_si256(vec, low_mask); \
        const __m256i hi = _mm256_and_si256(_mm256_srli_epi16(vec, 4), low_mask); \
        const __m256i popcnt1 = _mm256_shuffle_epi8(lookup, lo); \
        const __m256i popcnt2 = _mm256_shuffle_epi8(lookup, hi); \
        local = _mm256_add_epi8(local, popcnt1); \
        local = _mm256_add_epi8(local, popcnt2); \
        i += 32; \
    }

    // Unrolled main loop: 8 blocks per flush of `local` into `acc`.
    while (i + 8*32 <= n) {
        __m256i local = _mm256_setzero_si256();
        ITER ITER ITER ITER
        ITER ITER ITER ITER
        acc = _mm256_add_epi64(acc, _mm256_sad_epu8(local, _mm256_setzero_si256()));
    }

    // Remaining whole 32-byte blocks (< 8 of them).
    __m256i local = _mm256_setzero_si256();

    while (i + 32 <= n) {
        ITER;
    }

    acc = _mm256_add_epi64(acc, _mm256_sad_epu8(local, _mm256_setzero_si256()));

#undef ITER

    // Reduce the four 64-bit lanes of the accumulator.
    uint64_t result = 0;

    result += (uint64_t)(_mm256_extract_epi64(acc, 0));
    result += (uint64_t)(_mm256_extract_epi64(acc, 1));
    result += (uint64_t)(_mm256_extract_epi64(acc, 2));
    result += (uint64_t)(_mm256_extract_epi64(acc, 3));

    // Scalar tail: last n % 32 bytes.
    for (/**/; i < n; ++i) {
        result += STORM_popcnt_lookup8bit[data1[i] | data2[i]];
    }

    return result;
}
1845 |
// modified from https://github.com/WojciechMula/sse-popcount
// Popcount of the symmetric difference (A ^ B) of two byte buffers of
// length n using the nibble-lookup (PSHUFB) method. Each 32-byte block
// adds per-byte counts into the 8-bit lanes of `local`; `local` is widened
// into the 64-bit lanes of `acc` via _mm256_sad_epu8 after at most 8
// blocks so the byte lanes cannot overflow. The final < 32 bytes use a
// scalar 8-bit lookup table.
STORM_TARGET("avx2")
static
uint64_t STORM_diff_count_lookup_avx2_func(const uint8_t* STORM_RESTRICT data1,
                                           const uint8_t* STORM_RESTRICT data2,
                                           const size_t n)
{

    size_t i = 0;

    // popcount of every 4-bit value, replicated across both 128-bit halves
    // (PSHUFB indexes within each half independently).
    const __m256i lookup = _mm256_setr_epi8(
        /* 0 */ 0, /* 1 */ 1, /* 2 */ 1, /* 3 */ 2,
        /* 4 */ 1, /* 5 */ 2, /* 6 */ 2, /* 7 */ 3,
        /* 8 */ 1, /* 9 */ 2, /* a */ 2, /* b */ 3,
        /* c */ 2, /* d */ 3, /* e */ 3, /* f */ 4,

        /* 0 */ 0, /* 1 */ 1, /* 2 */ 1, /* 3 */ 2,
        /* 4 */ 1, /* 5 */ 2, /* 6 */ 2, /* 7 */ 3,
        /* 8 */ 1, /* 9 */ 2, /* a */ 2, /* b */ 3,
        /* c */ 2, /* d */ 3, /* e */ 3, /* f */ 4
    );

    const __m256i low_mask = _mm256_set1_epi8(0x0f);

    __m256i acc = _mm256_setzero_si256();

// ITER: XOR one 32-byte block from each input, split into low/high
// nibbles, look up per-nibble popcounts and accumulate them into the
// 8-bit lanes of `local`.
#define ITER { \
        const __m256i vec = _mm256_xor_si256(_mm256_loadu_si256((const __m256i*)(data1 + i)), \
                                             _mm256_loadu_si256((const __m256i*)(data2 + i))); \
        const __m256i lo = _mm256_and_si256(vec, low_mask); \
        const __m256i hi = _mm256_and_si256(_mm256_srli_epi16(vec, 4), low_mask); \
        const __m256i popcnt1 = _mm256_shuffle_epi8(lookup, lo); \
        const __m256i popcnt2 = _mm256_shuffle_epi8(lookup, hi); \
        local = _mm256_add_epi8(local, popcnt1); \
        local = _mm256_add_epi8(local, popcnt2); \
        i += 32; \
    }

    // Unrolled main loop: 8 blocks per flush of `local` into `acc`.
    while (i + 8*32 <= n) {
        __m256i local = _mm256_setzero_si256();
        ITER ITER ITER ITER
        ITER ITER ITER ITER
        acc = _mm256_add_epi64(acc, _mm256_sad_epu8(local, _mm256_setzero_si256()));
    }

    // Remaining whole 32-byte blocks (< 8 of them).
    __m256i local = _mm256_setzero_si256();

    while (i + 32 <= n) {
        ITER;
    }

    acc = _mm256_add_epi64(acc, _mm256_sad_epu8(local, _mm256_setzero_si256()));

#undef ITER

    // Reduce the four 64-bit lanes of the accumulator.
    uint64_t result = 0;

    result += (uint64_t)(_mm256_extract_epi64(acc, 0));
    result += (uint64_t)(_mm256_extract_epi64(acc, 1));
    result += (uint64_t)(_mm256_extract_epi64(acc, 2));
    result += (uint64_t)(_mm256_extract_epi64(acc, 3));

    // Scalar tail: last n % 32 bytes.
    for (/**/; i < n; ++i) {
        result += STORM_popcnt_lookup8bit[data1[i] ^ data2[i]];
    }

    return result;
}
1914 |
1915 | STORM_TARGET("avx2")
1916 | static
1917 | uint64_t STORM_popcnt_csa_avx2(const __m256i* data, uint64_t size)
1918 | {
1919 | __m256i cnt = _mm256_setzero_si256();
1920 | __m256i ones = _mm256_setzero_si256();
1921 | __m256i twos = _mm256_setzero_si256();
1922 | __m256i fours = _mm256_setzero_si256();
1923 | __m256i eights = _mm256_setzero_si256();
1924 | __m256i sixteens = _mm256_setzero_si256();
1925 | __m256i twosA, twosB, foursA, foursB, eightsA, eightsB;
1926 |
1927 | uint64_t i = 0;
1928 | uint64_t limit = size - size % 16;
1929 | uint64_t* cnt64;
1930 |
1931 | #define LOAD(a) (_mm256_loadu_si256(&data[i+a]))
1932 |
1933 | for (/**/; i < limit; i += 16) {
1934 | STORM_CSA256(&twosA, &ones, ones, LOAD(0), LOAD(1));
1935 | STORM_CSA256(&twosB, &ones, ones, LOAD(2), LOAD(3));
1936 | STORM_CSA256(&foursA, &twos, twos, twosA, twosB);
1937 | STORM_CSA256(&twosA, &ones, ones, LOAD(4), LOAD(5));
1938 | STORM_CSA256(&twosB, &ones, ones, LOAD(6), LOAD(7));
1939 | STORM_CSA256(&foursB, &twos, twos, twosA, twosB);
1940 | STORM_CSA256(&eightsA, &fours, fours, foursA, foursB);
1941 | STORM_CSA256(&twosA, &ones, ones, LOAD(8), LOAD(9));
1942 | STORM_CSA256(&twosB, &ones, ones, LOAD(10), LOAD(11));
1943 | STORM_CSA256(&foursA, &twos, twos, twosA, twosB);
1944 | STORM_CSA256(&twosA, &ones, ones, LOAD(12), LOAD(13));
1945 | STORM_CSA256(&twosB, &ones, ones, LOAD(14), LOAD(15));
1946 | STORM_CSA256(&foursB, &twos, twos, twosA, twosB);
1947 | STORM_CSA256(&eightsB, &fours, fours, foursA, foursB);
1948 | STORM_CSA256(&sixteens, &eights, eights, eightsA, eightsB);
1949 |
1950 | cnt = _mm256_add_epi64(cnt, STORM_popcnt256(sixteens));
1951 | }
1952 | #undef LOAD
1953 |
1954 | cnt = _mm256_slli_epi64(cnt, 4);
1955 | cnt = _mm256_add_epi64(cnt, _mm256_slli_epi64(STORM_popcnt256(eights), 3));
1956 | cnt = _mm256_add_epi64(cnt, _mm256_slli_epi64(STORM_popcnt256(fours), 2));
1957 | cnt = _mm256_add_epi64(cnt, _mm256_slli_epi64(STORM_popcnt256(twos), 1));
1958 | cnt = _mm256_add_epi64(cnt, STORM_popcnt256(ones));
1959 |
1960 | for (/**/; i < size; ++i)
1961 | cnt = _mm256_add_epi64(cnt, STORM_popcnt256(data[i]));
1962 |
1963 | cnt64 = (uint64_t*) &cnt;
1964 |
1965 | return cnt64[0] +
1966 | cnt64[1] +
1967 | cnt64[2] +
1968 | cnt64[3];
1969 | }
1970 |
1971 |
1972 | /*
1973 | * AVX2 Harley-Seal popcount (4th iteration).
1974 | * The algorithm is based on the paper "Faster Population Counts
1975 | * using AVX2 Instructions" by Daniel Lemire, Nathan Kurz and
1976 | * Wojciech Mula (23 Nov 2016).
1977 | * @see https://arxiv.org/abs/1611.07612
1978 | */
1979 | // In this version we perform the operation A&B as input into the CSA operator.
// Harley-Seal CSA popcount over the intersection (A & B) of two streams of
// 256-bit vectors. `size` counts __m256i words; unaligned loads are used.
// NOTE(review): the `&` in LOAD relies on GCC/Clang's vector-extension
// operators on __m256i.
STORM_TARGET("avx2")
static
uint64_t STORM_intersect_count_csa_avx2(const __m256i* STORM_RESTRICT data1,
                                        const __m256i* STORM_RESTRICT data2,
                                        size_t size)
{
    // cnt holds running per-64-bit-lane counts; ones..sixteens are the
    // carry-save accumulators with weights 1/2/4/8/16.
    __m256i cnt = _mm256_setzero_si256();
    __m256i ones = _mm256_setzero_si256();
    __m256i twos = _mm256_setzero_si256();
    __m256i fours = _mm256_setzero_si256();
    __m256i eights = _mm256_setzero_si256();
    __m256i sixteens = _mm256_setzero_si256();
    __m256i twosA, twosB, foursA, foursB, eightsA, eightsB;

    uint64_t i = 0;
    uint64_t limit = size - size % 16;
    uint64_t* cnt64;

#define LOAD(a) (_mm256_loadu_si256(&data1[i+a]) & _mm256_loadu_si256(&data2[i+a]))

    // 16 vector pairs per iteration; only the weight-16 output is
    // popcounted inside the loop.
    for (/**/; i < limit; i += 16) {
        STORM_CSA256(&twosA, &ones, ones, LOAD(0), LOAD(1));
        STORM_CSA256(&twosB, &ones, ones, LOAD(2), LOAD(3));
        STORM_CSA256(&foursA, &twos, twos, twosA, twosB);
        STORM_CSA256(&twosA, &ones, ones, LOAD(4), LOAD(5));
        STORM_CSA256(&twosB, &ones, ones, LOAD(6), LOAD(7));
        STORM_CSA256(&foursB, &twos, twos, twosA, twosB);
        STORM_CSA256(&eightsA, &fours, fours, foursA, foursB);
        STORM_CSA256(&twosA, &ones, ones, LOAD(8), LOAD(9));
        STORM_CSA256(&twosB, &ones, ones, LOAD(10), LOAD(11));
        STORM_CSA256(&foursA, &twos, twos, twosA, twosB);
        STORM_CSA256(&twosA, &ones, ones, LOAD(12), LOAD(13));
        STORM_CSA256(&twosB, &ones, ones, LOAD(14), LOAD(15));
        STORM_CSA256(&foursB, &twos, twos, twosA, twosB);
        STORM_CSA256(&eightsB, &fours, fours, foursA, foursB);
        STORM_CSA256(&sixteens,&eights, eights,eightsA,eightsB);

        cnt = _mm256_add_epi64(cnt, STORM_popcnt256(sixteens));
    }
#undef LOAD

    // Flush the remaining accumulators with their respective weights.
    cnt = _mm256_slli_epi64(cnt, 4);
    cnt = _mm256_add_epi64(cnt, _mm256_slli_epi64(STORM_popcnt256(eights), 3));
    cnt = _mm256_add_epi64(cnt, _mm256_slli_epi64(STORM_popcnt256(fours), 2));
    cnt = _mm256_add_epi64(cnt, _mm256_slli_epi64(STORM_popcnt256(twos), 1));
    cnt = _mm256_add_epi64(cnt, STORM_popcnt256(ones));

    // Tail: vectors past the last full block of 16.
    for (/**/; i < size; ++i)
        cnt = _mm256_add_epi64(cnt, STORM_popcnt256(_mm256_loadu_si256(&data1[i]) & _mm256_loadu_si256(&data2[i])));

    // Horizontal reduction of the four 64-bit lanes.
    cnt64 = (uint64_t*) &cnt;

    return cnt64[0] +
           cnt64[1] +
           cnt64[2] +
           cnt64[3];
}
2037 |
2038 | // In this version we perform the operation A|B as input into the CSA operator.
// Harley-Seal CSA popcount over the union (A | B) of two streams of
// 256-bit vectors. `size` counts __m256i words; unaligned loads are used.
// NOTE(review): the `|` in LOAD relies on GCC/Clang's vector-extension
// operators on __m256i.
STORM_TARGET("avx2")
static
uint64_t STORM_union_count_csa_avx2(const __m256i* STORM_RESTRICT data1,
                                    const __m256i* STORM_RESTRICT data2,
                                    size_t size)
{
    // cnt holds running per-64-bit-lane counts; ones..sixteens are the
    // carry-save accumulators with weights 1/2/4/8/16.
    __m256i cnt = _mm256_setzero_si256();
    __m256i ones = _mm256_setzero_si256();
    __m256i twos = _mm256_setzero_si256();
    __m256i fours = _mm256_setzero_si256();
    __m256i eights = _mm256_setzero_si256();
    __m256i sixteens = _mm256_setzero_si256();
    __m256i twosA, twosB, foursA, foursB, eightsA, eightsB;

    uint64_t i = 0;
    uint64_t limit = size - size % 16;
    uint64_t* cnt64;

#define LOAD(a) (_mm256_loadu_si256(&data1[i+a]) | _mm256_loadu_si256(&data2[i+a]))

    // 16 vector pairs per iteration; only the weight-16 output is
    // popcounted inside the loop.
    for (/**/; i < limit; i += 16) {
        STORM_CSA256(&twosA, &ones, ones, LOAD(0), LOAD(1));
        STORM_CSA256(&twosB, &ones, ones, LOAD(2), LOAD(3));
        STORM_CSA256(&foursA, &twos, twos, twosA, twosB);
        STORM_CSA256(&twosA, &ones, ones, LOAD(4), LOAD(5));
        STORM_CSA256(&twosB, &ones, ones, LOAD(6), LOAD(7));
        STORM_CSA256(&foursB, &twos, twos, twosA, twosB);
        STORM_CSA256(&eightsA, &fours, fours, foursA, foursB);
        STORM_CSA256(&twosA, &ones, ones, LOAD(8), LOAD(9));
        STORM_CSA256(&twosB, &ones, ones, LOAD(10), LOAD(11));
        STORM_CSA256(&foursA, &twos, twos, twosA, twosB);
        STORM_CSA256(&twosA, &ones, ones, LOAD(12), LOAD(13));
        STORM_CSA256(&twosB, &ones, ones, LOAD(14), LOAD(15));
        STORM_CSA256(&foursB, &twos, twos, twosA, twosB);
        STORM_CSA256(&eightsB, &fours, fours, foursA, foursB);
        STORM_CSA256(&sixteens,&eights, eights,eightsA,eightsB);

        cnt = _mm256_add_epi64(cnt, STORM_popcnt256(sixteens));
    }
#undef LOAD

    // Flush the remaining accumulators with their respective weights.
    cnt = _mm256_slli_epi64(cnt, 4);
    cnt = _mm256_add_epi64(cnt, _mm256_slli_epi64(STORM_popcnt256(eights), 3));
    cnt = _mm256_add_epi64(cnt, _mm256_slli_epi64(STORM_popcnt256(fours), 2));
    cnt = _mm256_add_epi64(cnt, _mm256_slli_epi64(STORM_popcnt256(twos), 1));
    cnt = _mm256_add_epi64(cnt, STORM_popcnt256(ones));

    // Tail: vectors past the last full block of 16.
    for (/**/; i < size; ++i)
        cnt = _mm256_add_epi64(cnt, STORM_popcnt256(_mm256_loadu_si256(&data1[i]) | _mm256_loadu_si256(&data2[i])));

    // Horizontal reduction of the four 64-bit lanes.
    cnt64 = (uint64_t*) &cnt;

    return cnt64[0] +
           cnt64[1] +
           cnt64[2] +
           cnt64[3];
}
2096 |
2097 | // In this version we perform the operation A^B as input into the CSA operator.
// Harley-Seal CSA popcount over the symmetric difference (A ^ B) of two
// streams of 256-bit vectors. `size` counts __m256i words; unaligned loads
// are used. NOTE(review): the `^` in LOAD relies on GCC/Clang's
// vector-extension operators on __m256i.
STORM_TARGET("avx2")
static
uint64_t STORM_diff_count_csa_avx2(const __m256i* STORM_RESTRICT data1,
                                   const __m256i* STORM_RESTRICT data2,
                                   size_t size)
{
    // cnt holds running per-64-bit-lane counts; ones..sixteens are the
    // carry-save accumulators with weights 1/2/4/8/16.
    __m256i cnt = _mm256_setzero_si256();
    __m256i ones = _mm256_setzero_si256();
    __m256i twos = _mm256_setzero_si256();
    __m256i fours = _mm256_setzero_si256();
    __m256i eights = _mm256_setzero_si256();
    __m256i sixteens = _mm256_setzero_si256();
    __m256i twosA, twosB, foursA, foursB, eightsA, eightsB;

    uint64_t i = 0;
    uint64_t limit = size - size % 16;
    uint64_t* cnt64;

#define LOAD(a) (_mm256_loadu_si256(&data1[i+a]) ^ _mm256_loadu_si256(&data2[i+a]))

    // 16 vector pairs per iteration; only the weight-16 output is
    // popcounted inside the loop.
    for (/**/; i < limit; i += 16) {
        STORM_CSA256(&twosA, &ones, ones, LOAD(0), LOAD(1));
        STORM_CSA256(&twosB, &ones, ones, LOAD(2), LOAD(3));
        STORM_CSA256(&foursA, &twos, twos, twosA, twosB);
        STORM_CSA256(&twosA, &ones, ones, LOAD(4), LOAD(5));
        STORM_CSA256(&twosB, &ones, ones, LOAD(6), LOAD(7));
        STORM_CSA256(&foursB, &twos, twos, twosA, twosB);
        STORM_CSA256(&eightsA, &fours, fours, foursA, foursB);
        STORM_CSA256(&twosA, &ones, ones, LOAD(8), LOAD(9));
        STORM_CSA256(&twosB, &ones, ones, LOAD(10), LOAD(11));
        STORM_CSA256(&foursA, &twos, twos, twosA, twosB);
        STORM_CSA256(&twosA, &ones, ones, LOAD(12), LOAD(13));
        STORM_CSA256(&twosB, &ones, ones, LOAD(14), LOAD(15));
        STORM_CSA256(&foursB, &twos, twos, twosA, twosB);
        STORM_CSA256(&eightsB, &fours, fours, foursA, foursB);
        STORM_CSA256(&sixteens,&eights, eights,eightsA,eightsB);

        cnt = _mm256_add_epi64(cnt, STORM_popcnt256(sixteens));
    }
#undef LOAD

    // Flush the remaining accumulators with their respective weights.
    cnt = _mm256_slli_epi64(cnt, 4);
    cnt = _mm256_add_epi64(cnt, _mm256_slli_epi64(STORM_popcnt256(eights), 3));
    cnt = _mm256_add_epi64(cnt, _mm256_slli_epi64(STORM_popcnt256(fours), 2));
    cnt = _mm256_add_epi64(cnt, _mm256_slli_epi64(STORM_popcnt256(twos), 1));
    cnt = _mm256_add_epi64(cnt, STORM_popcnt256(ones));

    // Tail: vectors past the last full block of 16.
    for (/**/; i < size; ++i)
        cnt = _mm256_add_epi64(cnt, STORM_popcnt256(_mm256_loadu_si256(&data1[i]) ^ _mm256_loadu_si256(&data2[i])));

    // Horizontal reduction of the four 64-bit lanes.
    cnt64 = (uint64_t*) &cnt;

    return cnt64[0] +
           cnt64[1] +
           cnt64[2] +
           cnt64[3];
}
2155 |
// Intersection (A AND B) cardinality of two 64-bit-word bitmaps using the
// AVX2 Harley-Seal kernel for the bulk and a scalar loop for the tail.
// @param b1,b2   bitmaps of n_ints 64-bit words each
// @param n_ints  number of 64-bit words per bitmap
// @return        popcount(b1 & b2)
STORM_TARGET("avx2")
static
uint64_t STORM_intersect_count_avx2(const uint64_t* STORM_RESTRICT b1,
                                    const uint64_t* STORM_RESTRICT b2,
                                    const size_t n_ints)
{
    uint64_t count = 0;
    const __m256i* r1 = (const __m256i*)b1;
    const __m256i* r2 = (const __m256i*)b2;
    const size_t n_cycles = n_ints / 4; // 4 x 64-bit words per 256-bit vector

    count += STORM_intersect_count_csa_avx2(r1, r2, n_cycles);

    // Scalar tail for the remaining (n_ints % 4) words. Use size_t:
    // the previous `int` counter mixed signedness with n_ints and
    // overflows for bitmaps longer than INT_MAX words.
    for (size_t i = n_cycles*4; i < n_ints; ++i) {
        count += STORM_POPCOUNT(b1[i] & b2[i]);
    }

    return(count);
}
2175 |
// Union (A OR B) cardinality of two 64-bit-word bitmaps using the AVX2
// Harley-Seal kernel for the bulk and a scalar loop for the tail.
// @param b1,b2   bitmaps of n_ints 64-bit words each
// @param n_ints  number of 64-bit words per bitmap
// @return        popcount(b1 | b2)
STORM_TARGET("avx2")
static
uint64_t STORM_union_count_avx2(const uint64_t* STORM_RESTRICT b1,
                                const uint64_t* STORM_RESTRICT b2,
                                const size_t n_ints)
{
    uint64_t count = 0;
    const __m256i* r1 = (const __m256i*)b1;
    const __m256i* r2 = (const __m256i*)b2;
    const size_t n_cycles = n_ints / 4; // 4 x 64-bit words per 256-bit vector

    count += STORM_union_count_csa_avx2(r1, r2, n_cycles);

    // Scalar tail for the remaining (n_ints % 4) words. size_t avoids the
    // signed/unsigned comparison and overflow of the previous `int` index.
    for (size_t i = n_cycles*4; i < n_ints; ++i) {
        count += STORM_POPCOUNT(b1[i] | b2[i]);
    }

    return(count);
}
2195 |
// Symmetric difference (A XOR B) cardinality of two 64-bit-word bitmaps
// using the AVX2 Harley-Seal kernel for the bulk and a scalar tail loop.
// @param b1,b2   bitmaps of n_ints 64-bit words each
// @param n_ints  number of 64-bit words per bitmap
// @return        popcount(b1 ^ b2)
STORM_TARGET("avx2")
static
uint64_t STORM_diff_count_avx2(const uint64_t* STORM_RESTRICT b1,
                               const uint64_t* STORM_RESTRICT b2,
                               const size_t n_ints)
{
    uint64_t count = 0;
    const __m256i* r1 = (const __m256i*)b1;
    const __m256i* r2 = (const __m256i*)b2;
    const size_t n_cycles = n_ints / 4; // 4 x 64-bit words per 256-bit vector

    count += STORM_diff_count_csa_avx2(r1, r2, n_cycles);

    // Scalar tail for the remaining (n_ints % 4) words. size_t avoids the
    // signed/unsigned comparison and overflow of the previous `int` index.
    for (size_t i = n_cycles*4; i < n_ints; ++i) {
        count += STORM_POPCOUNT(b1[i] ^ b2[i]);
    }

    return(count);
}
2215 |
// Intersection cardinality via the byte-LUT kernel: reinterpret the
// 64-bit words as raw bytes (8 per word) and defer to the shared
// AVX2 lookup implementation.
STORM_TARGET("avx2")
static
uint64_t STORM_intersect_count_lookup_avx2(const uint64_t* STORM_RESTRICT b1,
                                           const uint64_t* STORM_RESTRICT b2,
                                           const size_t n_ints)
{
    const size_t n_bytes = n_ints * 8;
    return STORM_intersect_count_lookup_avx2_func((uint8_t*)b1, (uint8_t*)b2, n_bytes);
}
2224 |
// Union cardinality via the byte-LUT kernel: reinterpret the 64-bit
// words as raw bytes (8 per word) and defer to the shared AVX2
// lookup implementation.
STORM_TARGET("avx2")
static
uint64_t STORM_union_count_lookup_avx2(const uint64_t* STORM_RESTRICT b1,
                                       const uint64_t* STORM_RESTRICT b2,
                                       const size_t n_ints)
{
    const size_t n_bytes = n_ints * 8;
    return STORM_union_count_lookup_avx2_func((uint8_t*)b1, (uint8_t*)b2, n_bytes);
}
2233 |
// Symmetric-difference cardinality via the byte-LUT kernel: reinterpret
// the 64-bit words as raw bytes (8 per word) and defer to the shared
// AVX2 lookup implementation.
STORM_TARGET("avx2")
static
uint64_t STORM_diff_count_lookup_avx2(const uint64_t* STORM_RESTRICT b1,
                                      const uint64_t* STORM_RESTRICT b2,
                                      const size_t n_ints)
{
    const size_t n_bytes = n_ints * 8;
    return STORM_diff_count_lookup_avx2_func((uint8_t*)b1, (uint8_t*)b2, n_bytes);
}
2242 |
// Population count of a 64-bit-word buffer: AVX2 Harley-Seal for the bulk,
// one SSE4 128-bit block for the next two words (if any), scalar for the
// final word (if any).
// @param data    buffer of n_ints 64-bit words
// @param n_ints  number of 64-bit words
// @return        total set-bit count
STORM_TARGET("avx2")
static
uint64_t STORM_popcnt_avx2(const uint64_t* data,
                           const size_t n_ints)
{
    uint64_t count = 0;
    const size_t n_cycles = n_ints / 4;           // 256-bit blocks
    const size_t n_cycles_sse = (n_ints % 4) / 2; // trailing 128-bit block (0 or 1)

    const __m256i* r1 = (const __m256i*)&data[0];
    // BUGFIX: the SSE region begins after the AVX2-processed words, i.e. at
    // data[n_cycles*4]. The previous code used &data[n_cycles_sse*4], which
    // re-counted words at the start of the buffer and never counted the
    // actual remainder.
    const __m128i* r2 = (const __m128i*)&data[n_cycles*4];

    count += STORM_popcnt_csa_avx2(r1, n_cycles);
    count += STORM_popcnt_csa_sse4(r2, n_cycles_sse);

    // Scalar tail: at most one remaining word. size_t index avoids the
    // signed/unsigned comparison of the previous `int` counter.
    for (size_t i = (4*n_cycles + 2*n_cycles_sse); i < n_ints; ++i) {
        count += STORM_POPCOUNT(data[i]);
    }

    return count;
}
2264 | #endif
2265 |
2266 | /****************************
2267 | * AVX512BW functions
2268 | ****************************/
2269 |
2270 | #if defined(STORM_HAVE_AVX512)
2271 |
2272 | #include
2273 |
2274 | STORM_TARGET("avx512bw")
2275 | STORM_FORCE_INLINE
2276 | __m512i STORM_popcnt512(__m512i v) {
2277 | __m512i m1 = _mm512_set1_epi8(0x55);
2278 | __m512i m2 = _mm512_set1_epi8(0x33);
2279 | __m512i m4 = _mm512_set1_epi8(0x0F);
2280 | __m512i t1 = _mm512_sub_epi8(v, (_mm512_srli_epi16(v, 1) & m1));
2281 | __m512i t2 = _mm512_add_epi8(t1 & m2, (_mm512_srli_epi16(t1, 2) & m2));
2282 | __m512i t3 = _mm512_add_epi8(t2, _mm512_srli_epi16(t2, 4)) & m4;
2283 |
2284 | return _mm512_sad_epu8(t3, _mm512_setzero_si512());
2285 | }
2286 |
// Carry-save adder over three 512-bit operands, each output a single
// vpternlogd: *l receives the bitwise sum (a ^ b ^ c, truth table 0x96)
// and *h the carry (majority(a,b,c), truth table 0xE8).
STORM_TARGET("avx512bw")
STORM_FORCE_INLINE
void STORM_CSA512(__m512i* h, __m512i* l, __m512i a, __m512i b, __m512i c) {
    *l = _mm512_ternarylogic_epi32(c, b, a, 0x96); // sum: a ^ b ^ c
    *h = _mm512_ternarylogic_epi32(c, b, a, 0xe8); // carry: majority(a, b, c)
}
2293 |
// By Wojciech Muła
// @see https://github.com/WojciechMula/sse-popcount/blob/master/popcnt-avx512-harley-seal.cpp#L3
// @see https://arxiv.org/abs/1611.07612
// Per-64-bit-lane population count via byte-wise bit slicing followed by
// _mm512_sad_epu8. NOTE(review): this duplicates STORM_popcnt512 above;
// consider consolidating.
STORM_TARGET("avx512bw")
STORM_FORCE_INLINE
__m512i STORM_avx512_popcount(const __m512i v) {
    const __m512i m1 = _mm512_set1_epi8(0x55); // 01010101
    const __m512i m2 = _mm512_set1_epi8(0x33); // 00110011
    const __m512i m4 = _mm512_set1_epi8(0x0F); // 00001111

    // t1: 2-bit field sums; t2: 4-bit field sums; t3: per-byte totals.
    const __m512i t1 = _mm512_sub_epi8(v, (_mm512_srli_epi16(v, 1) & m1));
    const __m512i t2 = _mm512_add_epi8(t1 & m2, (_mm512_srli_epi16(t1, 2) & m2));
    const __m512i t3 = _mm512_add_epi8(t2, _mm512_srli_epi16(t2, 4)) & m4;
    return _mm512_sad_epu8(t3, _mm512_setzero_si512());
}
2309 |
// 512i-version of carry-save adder subroutine.
// In-place CSA: accumulator *l is both input and output. The statement
// order is load-bearing -- *h (carry = majority(b, c, old *l)) must be
// computed before *l is overwritten with the sum (b ^ c ^ old *l).
STORM_TARGET("avx512bw")
STORM_FORCE_INLINE
void STORM_pospopcnt_csa_avx512(__m512i* STORM_RESTRICT h,
                                __m512i* STORM_RESTRICT l,
                                __m512i b, __m512i c)
{
    *h = _mm512_ternarylogic_epi32(c, b, *l, 0xE8); // 11101000
    *l = _mm512_ternarylogic_epi32(c, b, *l, 0x96); // 10010110
}
2320 |
// Harley-Seal carry-save popcount of a buffer of 512-bit vectors.
// Folds 16 vectors per iteration; ones/twos/fours/eights hold deferred
// partial sums (bit weights 1/2/4/8) merged back in after the loop.
STORM_TARGET("avx512bw")
static
uint64_t STORM_popcnt_csa_avx512bw(const __m512i* STORM_RESTRICT data, size_t size)
{
    __m512i cnt = _mm512_setzero_si512();
    __m512i ones = _mm512_setzero_si512();
    __m512i twos = _mm512_setzero_si512();
    __m512i fours = _mm512_setzero_si512();
    __m512i eights = _mm512_setzero_si512();
    __m512i sixteens = _mm512_setzero_si512();
    __m512i twosA, twosB, foursA, foursB, eightsA, eightsB;

    uint64_t i = 0;
    uint64_t limit = size - size % 16; // round down to a multiple of 16 vectors
    uint64_t* cnt64;

#define LOAD(a) (_mm512_loadu_si512(&data[i+a]))

    for (/**/; i < limit; i += 16) {
        STORM_CSA512(&twosA, &ones, ones, LOAD(0), LOAD(1));
        STORM_CSA512(&twosB, &ones, ones, LOAD(2), LOAD(3));
        STORM_CSA512(&foursA, &twos, twos, twosA, twosB);
        STORM_CSA512(&twosA, &ones, ones, LOAD(4), LOAD(5));
        STORM_CSA512(&twosB, &ones, ones, LOAD(6), LOAD(7));
        STORM_CSA512(&foursB, &twos, twos, twosA, twosB);
        STORM_CSA512(&eightsA, &fours, fours, foursA, foursB);
        STORM_CSA512(&twosA, &ones, ones, LOAD(8), LOAD(9));
        STORM_CSA512(&twosB, &ones, ones, LOAD(10), LOAD(11));
        STORM_CSA512(&foursA, &twos, twos, twosA, twosB);
        STORM_CSA512(&twosA, &ones, ones, LOAD(12), LOAD(13));
        STORM_CSA512(&twosB, &ones, ones, LOAD(14), LOAD(15));
        STORM_CSA512(&foursB, &twos, twos, twosA, twosB);
        STORM_CSA512(&eightsB, &fours, fours, foursA, foursB);
        STORM_CSA512(&sixteens,&eights, eights,eightsA,eightsB);

        // Only the highest-weight carry is popcounted inside the loop.
        cnt = _mm512_add_epi64(cnt, STORM_popcnt512(sixteens));
    }
#undef LOAD

    // Fold deferred partial sums back in with their bit weights
    // (cnt*16 + eights*8 + fours*4 + twos*2 + ones).
    cnt = _mm512_slli_epi64(cnt, 4);
    cnt = _mm512_add_epi64(cnt, _mm512_slli_epi64(STORM_popcnt512(eights), 3));
    cnt = _mm512_add_epi64(cnt, _mm512_slli_epi64(STORM_popcnt512(fours), 2));
    cnt = _mm512_add_epi64(cnt, _mm512_slli_epi64(STORM_popcnt512(twos), 1));
    cnt = _mm512_add_epi64(cnt, STORM_popcnt512(ones));

    // Remaining (size % 16) vectors, one popcount each.
    for (/**/; i < size; ++i)
        cnt = _mm512_add_epi64(cnt, STORM_popcnt512(_mm512_loadu_si512(&data[i])));

    // Horizontal sum of the eight 64-bit accumulator lanes.
    cnt64 = (uint64_t*)&cnt;

    return cnt64[0] +
           cnt64[1] +
           cnt64[2] +
           cnt64[3] +
           cnt64[4] +
           cnt64[5] +
           cnt64[6] +
           cnt64[7];
}
2380 |
// Positional popcount of 16-bit words (Harley-Seal variant): out[j] is
// incremented by the number of elements in `array` whose bit j is set.
// Accumulates in 16-bit vector counters, flushing to `out` before they
// can overflow.
STORM_TARGET("avx512bw")
static
int STORM_pospopcnt_u16_avx512bw_harvey_seal(const uint16_t* array, size_t len, uint32_t* out) {
    // Scalar pass over the tail that the vector loop (512 elements per
    // iteration: 32 per vector x 16 vectors) will not cover.
    for (uint32_t i = len - (len % (32 * 16)); i < len; ++i) {
        for (int j = 0; j < 16; ++j) {
            out[j] += ((array[i] & (1 << j)) >> j);
        }
    }

    const __m512i* data = (const __m512i*)array;
    __m512i v1 = _mm512_setzero_si512();
    __m512i v2 = _mm512_setzero_si512();
    __m512i v4 = _mm512_setzero_si512();
    __m512i v8 = _mm512_setzero_si512();
    __m512i v16 = _mm512_setzero_si512();
    __m512i twosA, twosB, foursA, foursB, eightsA, eightsB;
    __m512i one = _mm512_set1_epi16(1); // NOTE(review): unused; U() builds its own constant
    __m512i counter[16];

    const size_t size = len / 32;             // number of whole 512-bit vectors
    const uint64_t limit = size - size % 16;  // round down to a multiple of 16 vectors

    uint16_t buffer[32];

    uint64_t i = 0;
    while (i < limit) {
        // Reset the per-bit 16-bit lane counters for this chunk.
        // (Inner `i` deliberately shadows the outer vector index.)
        for (size_t i = 0; i < 16; ++i)
            counter[i] = _mm512_setzero_si512();

        // Bound the chunk so the 16-bit lane counters cannot overflow
        // before being flushed to `out`.
        size_t thislimit = limit;
        if (thislimit - i >= (1 << 16))
            thislimit = i + (1 << 16) - 1;

        for (/**/; i < thislimit; i += 16) {
// U(pos): harvest bit `pos` of the weight-16 carry into counter[pos],
// then shift v16 down so the next U sees the next bit.
#define U(pos) { \
    counter[pos] = _mm512_add_epi16(counter[pos], _mm512_and_si512(v16, _mm512_set1_epi16(1))); \
    v16 = _mm512_srli_epi16(v16, 1); \
}
            STORM_pospopcnt_csa_avx512(&twosA, &v1, _mm512_loadu_si512(data + i + 0), _mm512_loadu_si512(data + i + 1));
            STORM_pospopcnt_csa_avx512(&twosB, &v1, _mm512_loadu_si512(data + i + 2), _mm512_loadu_si512(data + i + 3));
            STORM_pospopcnt_csa_avx512(&foursA, &v2, twosA, twosB);
            STORM_pospopcnt_csa_avx512(&twosA, &v1, _mm512_loadu_si512(data + i + 4), _mm512_loadu_si512(data + i + 5));
            STORM_pospopcnt_csa_avx512(&twosB, &v1, _mm512_loadu_si512(data + i + 6), _mm512_loadu_si512(data + i + 7));
            STORM_pospopcnt_csa_avx512(&foursB, &v2, twosA, twosB);
            STORM_pospopcnt_csa_avx512(&eightsA, &v4, foursA, foursB);
            STORM_pospopcnt_csa_avx512(&twosA, &v1, _mm512_loadu_si512(data + i + 8), _mm512_loadu_si512(data + i + 9));
            STORM_pospopcnt_csa_avx512(&twosB, &v1, _mm512_loadu_si512(data + i + 10), _mm512_loadu_si512(data + i + 11));
            STORM_pospopcnt_csa_avx512(&foursA, &v2, twosA, twosB);
            STORM_pospopcnt_csa_avx512(&twosA, &v1, _mm512_loadu_si512(data + i + 12), _mm512_loadu_si512(data + i + 13));
            STORM_pospopcnt_csa_avx512(&twosB, &v1, _mm512_loadu_si512(data + i + 14), _mm512_loadu_si512(data + i + 15));
            STORM_pospopcnt_csa_avx512(&foursB, &v2, twosA, twosB);
            STORM_pospopcnt_csa_avx512(&eightsB, &v4, foursA, foursB);
            U(0) U(1) U(2) U(3) U(4) U(5) U(6) U(7) U(8) U(9) U(10) U(11) U(12) U(13) U(14) U(15) // Updates
            STORM_pospopcnt_csa_avx512(&v16, &v8, eightsA, eightsB);
        }
        // Update the counters after the last iteration.
        for (size_t i = 0; i < 16; ++i) U(i)
#undef U

        // Flush the 16-bit lane counters; each unit here represents 16
        // processed vectors, hence the weight of 16.
        for (size_t i = 0; i < 16; ++i) {
            _mm512_storeu_si512((__m512i*)buffer, counter[i]);
            for (size_t z = 0; z < 32; z++) {
                out[i] += 16 * (uint32_t)buffer[z];
            }
        }
    }

    // Residual CSA accumulators carry weights 1/2/4/8 per set bit.
    _mm512_storeu_si512((__m512i*)buffer, v1);
    for (size_t i = 0; i < 32; ++i) {
        for (int j = 0; j < 16; ++j) {
            out[j] += 1 * ((buffer[i] & (1 << j)) >> j);
        }
    }

    _mm512_storeu_si512((__m512i*)buffer, v2);
    for (size_t i = 0; i < 32; ++i) {
        for (int j = 0; j < 16; ++j) {
            out[j] += 2 * ((buffer[i] & (1 << j)) >> j);
        }
    }

    _mm512_storeu_si512((__m512i*)buffer, v4);
    for (size_t i = 0; i < 32; ++i) {
        for (int j = 0; j < 16; ++j) {
            out[j] += 4 * ((buffer[i] & (1 << j)) >> j);
        }
    }

    _mm512_storeu_si512((__m512i*)buffer, v8);
    for (size_t i = 0; i < 32; ++i) {
        for (int j = 0; j < 16; ++j) {
            out[j] += 8 * ((buffer[i] & (1 << j)) >> j);
        }
    }

    return 0;
}
2478 |
// Positional popcount of 16-bit words via byte blending + movemask:
// pairs of vectors are recombined into low-byte/high-byte planes with
// vpternlogd, then each bit plane is extracted 8 times with
// _mm512_movepi8_mask while doubling (left-shifting) the bytes.
STORM_TARGET("avx512bw")
static
int STORM_pospopcnt_u16_avx512bw_blend_popcnt_unroll8(const uint16_t* data, size_t len, uint32_t* out) {
#define AND_OR 0xea // ternary function: (a & b) | c
    const __m512i* data_vectors = (const __m512i*)(data);
    const uint32_t n_cycles = len / 32; // 32 x 16-bit elements per 512-bit vector

    size_t i = 0;
    // Main path: 8 vectors per iteration.
    for (/**/; i + 8 <= n_cycles; i += 8) {
#define L(p) __m512i v##p = _mm512_loadu_si512(data_vectors+i+p);
        L(0) L(1) L(2) L(3)
        L(4) L(5) L(6) L(7)

// U0/U1: blend the low bytes of v##p with the high bytes of v##k (and
// vice versa) so each `input` register holds one byte plane of a pair.
#define U0(p,k) __m512i input##p = _mm512_ternarylogic_epi32(v##p, _mm512_set1_epi16(0x00FF), _mm512_slli_epi16(v##k, 8), AND_OR);
#define U1(p,k) __m512i input##k = _mm512_ternarylogic_epi32(v##p, _mm512_set1_epi16(0xFF00), _mm512_srli_epi16(v##k, 8), AND_OR);
#define U(p, k) U0(p,k) U1(p,k)

        U(0,1) U( 2, 3) U( 4, 5) U( 6, 7)

        // Peel off the top bit of every byte 8 times; the loop index
        // deliberately shadows the outer `i`.
        for (int i = 0; i < 8; ++i) {
#define A0(p) out[ 7 - i] += _mm_popcnt_u64(_mm512_movepi8_mask(input##p));
#define A1(k) out[15 - i] += _mm_popcnt_u64(_mm512_movepi8_mask(input##k));
#define A(p, k) A0(p) A1(k)
            A(0,1) A(2, 3) A(4,5) A(6, 7)

// P0: shift every byte left by one (x + x) to expose the next bit.
#define P0(p) input##p = _mm512_add_epi8(input##p, input##p);
#define P(p, k) input##p = P0(p) P0(k)

            P(0,1) P(2, 3) P(4,5) P(6, 7)
        }
    }

    // Reduced unrolls for the remaining 4- and 2-vector groups.
    for (/**/; i + 4 <= n_cycles; i += 4) {
        L(0) L(1) L(2) L(3)
        U(0,1) U(2,3)

        for (int i = 0; i < 8; ++i) {
            A(0,1) A(2, 3)
            P(0,1) P(2, 3)
        }
    }

    for (/**/; i + 2 <= n_cycles; i += 2) {
        L(0) L(1)
        U(0,1)

        for (int i = 0; i < 8; ++i) {
            A(0,1)
            P(0,1)
        }
    }

    // Scalar tail: convert the vector index to an element index first.
    i *= 32;
    for (/**/; i < len; ++i) {
        for (int j = 0; j < 16; ++j) {
            out[j] += ((data[i] & (1 << j)) >> j);
        }
    }

#undef L
#undef U0
#undef U1
#undef U
#undef A0
#undef A1
#undef A
#undef P0
#undef P
#undef AND_OR

    return 0;
}
2551 |
// Positional popcount of 16-bit words via an "adder forest": 16 input
// vectors are pairwise reduced through 1/2/4/8-bit field additions, and
// the resulting 8-bit sums are accumulated into 16-bit lane counters,
// flushed every 2048 inner iterations to avoid overflow.
STORM_TARGET("avx512bw")
static
int STORM_pospopcnt_u16_avx512bw_adder_forest(const uint16_t* array, size_t len, uint32_t* out) {
    __m512i counters[16];

    for (size_t i = 0; i < 16; ++i) {
        counters[i] = _mm512_setzero_si512();
    }

    const __m512i mask1bit = _mm512_set1_epi16(0x5555); // 0101010101010101 Pattern: 01
    const __m512i mask2bit = _mm512_set1_epi16(0x3333); // 0011001100110011 Pattern: 0011
    const __m512i mask4bit = _mm512_set1_epi16(0x0F0F); // 0000111100001111 Pattern: 00001111
    const __m512i mask8bit = _mm512_set1_epi16(0x00FF); // 0000000011111111 Pattern: 0000000011111111

    // A "block" is 16 vectors = 16*32 elements; n_cycles counts groups of
    // 2048 blocks (flush granularity), n_total counts blocks overall.
    const uint32_t n_cycles = len / (2048 * (16*32));
    const uint32_t n_total = len / (16*32);
    uint16_t tmp[32];

/*------ Macros --------*/
// LE/LO: add the even/odd i-bit fields of input##p and input##k.
#define LE(i,p,k) const __m512i sum##p##k##_##i##bit_even = _mm512_add_epi8(input##p & mask##i##bit, input##k & mask##i##bit);
#define LO(i,p,k) const __m512i sum##p##k##_##i##bit_odd = _mm512_add_epi8(_mm512_srli_epi16(input##p, i) & mask##i##bit, _mm512_srli_epi16(input##k, i) & mask##i##bit);

#define LBLOCK(i) \
    LE(i,0,1) LO(i,0,1) \
    LE(i,2,3) LO(i,2,3) \
    LE(i,4,5) LO(i,4,5) \
    LE(i,6,7) LO(i,6,7) \
    LE(i,8,9) LO(i,8,9) \
    LE(i,10,11) LO(i,10,11) \
    LE(i,12,13) LO(i,12,13) \
    LE(i,14,15) LO(i,14,15) \

// EVEN/ODD: route the pairwise sums back into the input registers for
// the next (wider) reduction level.
#define EVEN(b,i,k,p) input##i = sum##k##p##_##b##bit_even;
#define ODD(b,i,k,p) input##i = sum##k##p##_##b##bit_odd;

#define UPDATE(i) \
    EVEN(i,0,0,1) EVEN(i,1,2,3) EVEN(i,2,4,5) EVEN(i,3,6,7) \
    EVEN(i,4,8,9) EVEN(i,5,10,11) EVEN(i,6,12,13) EVEN(i,7,14,15) \
    ODD(i,8,0,1) ODD(i,9,2,3) ODD(i,10,4,5) ODD(i,11,6,7) \
    ODD(i,12,8,9) ODD(i,13,10,11) ODD(i,14,12,13) ODD(i,15,14,15) \

// UE/UO: accumulate the final 8-bit sums into the 16-bit lane counters.
#define UE(i,p,k) counters[i] = _mm512_add_epi16(counters[i], sum##p##k##_8bit_even);
#define UO(i,p,k) counters[i] = _mm512_add_epi16(counters[i], sum##p##k##_8bit_odd);

/*------ Start --------*/
#define L(p) __m512i input##p = _mm512_loadu_si512((__m512i*)(array + i*2048*512 + j*512 + p*32));
    size_t i = 0;
    for (/**/; i < n_cycles; ++i) {
        for (int j = 0; j < 2048; ++j) {
            // Load 16 registers.
            L(0) L(1) L(2) L(3)
            L(4) L(5) L(6) L(7)
            L(8) L(9) L(10) L(11)
            L(12) L(13) L(14) L(15)

            // Perform updates for bits {1,2,4,8}.
            LBLOCK(1) UPDATE(1)
            LBLOCK(2) UPDATE(2)
            LBLOCK(4) UPDATE(4)
            LBLOCK(8) UPDATE(8)

            // Update accumulators.
            UE( 0,0,1) UE( 1, 2, 3) UE( 2, 4, 5) UE( 3, 6, 7)
            UE( 4,8,9) UE( 5,10,11) UE( 6,12,13) UE( 7,14,15)
            UO( 8,0,1) UO( 9, 2, 3) UO(10, 4, 5) UO(11, 6, 7)
            UO(12,8,9) UO(13,10,11) UO(14,12,13) UO(15,14,15)
        }

        // Update.
        for (size_t i = 0; i < 16; ++i) {
            _mm512_storeu_si512((__m512i*)tmp, counters[i]);
            for (int j = 0; j < 32; ++j) out[i] += tmp[j];
        }
        // Reset.
        for (size_t i = 0; i < 16; ++i) {
            counters[i] = _mm512_setzero_si512();
        }
    }
#undef L
// Same load macro without the 2048-block stride, for leftover blocks.
#define L(p) __m512i input##p = _mm512_loadu_si512((__m512i*)(array + i*512 + p*32));
    i *= 2048; // convert group index to block index
    for (/**/; i < n_total; ++i) {
        // Load 16 registers.
        L(0) L(1) L(2) L(3)
        L(4) L(5) L(6) L(7)
        L(8) L(9) L(10) L(11)
        L(12) L(13) L(14) L(15)

        // Perform updates for bits {1,2,4,8}.
        LBLOCK(1) UPDATE(1)
        LBLOCK(2) UPDATE(2)
        LBLOCK(4) UPDATE(4)
        LBLOCK(8) UPDATE(8)

        // Update accumulators.
        UE( 0,0,1) UE( 1, 2, 3) UE( 2, 4, 5) UE( 3, 6, 7)
        UE( 4,8,9) UE( 5,10,11) UE( 6,12,13) UE( 7,14,15)
        UO( 8,0,1) UO( 9, 2, 3) UO(10, 4, 5) UO(11, 6, 7)
        UO(12,8,9) UO(13,10,11) UO(14,12,13) UO(15,14,15)
    }

    // Scalar tail over the remaining (< 512) elements.
    i *= 512; // convert block index to element index
    for (/**/; i < len; ++i) {
        for (int j = 0; j < 16; ++j) {
            out[j] += ((array[i] & (1 << j)) >> j);
        }
    }

#undef L
#undef UPDATE
#undef ODD
#undef EVEN
#undef LBLOCK
#undef LE
#undef LO
#undef UO
#undef UE

    // Final flush of the lane counters.
    for (size_t i = 0; i < 16; ++i) {
        _mm512_storeu_si512((__m512i*)tmp, counters[i]);
        for (int j = 0; j < 32; ++j) out[i] += tmp[j];
    }
    return 0;
}
2676 |
2677 | /*
2678 | * AVX512 Harley-Seal popcount (4th iteration).
2679 | * The algorithm is based on the paper "Faster Population Counts
2680 | * using AVX2 Instructions" by Daniel Lemire, Nathan Kurz and
2681 | * Wojciech Mula (23 Nov 2016).
2682 | * @see https://arxiv.org/abs/1611.07612
2683 | */
2684 | STORM_TARGET("avx512bw")
2685 | static
2686 | uint64_t STORM_intersect_count_csa_avx512(const __m512i* STORM_RESTRICT data1,
2687 | const __m512i* STORM_RESTRICT data2,
2688 | size_t size)
2689 | {
2690 | __m512i cnt = _mm512_setzero_si512();
2691 | __m512i ones = _mm512_setzero_si512();
2692 | __m512i twos = _mm512_setzero_si512();
2693 | __m512i fours = _mm512_setzero_si512();
2694 | __m512i eights = _mm512_setzero_si512();
2695 | __m512i sixteens = _mm512_setzero_si512();
2696 | __m512i twosA, twosB, foursA, foursB, eightsA, eightsB;
2697 |
2698 | uint64_t i = 0;
2699 | uint64_t limit = size - size % 16;
2700 | uint64_t* cnt64;
2701 |
2702 | #define LOAD(a) (_mm512_loadu_si512(&data1[i+a]) & _mm512_loadu_si512(&data2[i+a]))
2703 |
2704 | for (/**/; i < limit; i += 16) {
2705 | STORM_CSA512(&twosA, &ones, ones, LOAD(0), LOAD(1));
2706 | STORM_CSA512(&twosB, &ones, ones, LOAD(2), LOAD(3));
2707 | STORM_CSA512(&foursA, &twos, twos, twosA, twosB);
2708 | STORM_CSA512(&twosA, &ones, ones, LOAD(4), LOAD(5));
2709 | STORM_CSA512(&twosB, &ones, ones, LOAD(6), LOAD(7));
2710 | STORM_CSA512(&foursB, &twos, twos, twosA, twosB);
2711 | STORM_CSA512(&eightsA, &fours, fours, foursA, foursB);
2712 | STORM_CSA512(&twosA, &ones, ones, LOAD(8), LOAD(9));
2713 | STORM_CSA512(&twosB, &ones, ones, LOAD(10), LOAD(11));
2714 | STORM_CSA512(&foursA, &twos, twos, twosA, twosB);
2715 | STORM_CSA512(&twosA, &ones, ones, LOAD(12), LOAD(13));
2716 | STORM_CSA512(&twosB, &ones, ones, LOAD(14), LOAD(15));
2717 | STORM_CSA512(&foursB, &twos, twos, twosA, twosB);
2718 | STORM_CSA512(&eightsB, &fours, fours, foursA, foursB);
2719 | STORM_CSA512(&sixteens,&eights, eights,eightsA,eightsB);
2720 |
2721 | cnt = _mm512_add_epi64(cnt, STORM_popcnt512(sixteens));
2722 | }
2723 | #undef LOAD
2724 |
2725 | cnt = _mm512_slli_epi64(cnt, 4);
2726 | cnt = _mm512_add_epi64(cnt, _mm512_slli_epi64(STORM_popcnt512(eights), 3));
2727 | cnt = _mm512_add_epi64(cnt, _mm512_slli_epi64(STORM_popcnt512(fours), 2));
2728 | cnt = _mm512_add_epi64(cnt, _mm512_slli_epi64(STORM_popcnt512(twos), 1));
2729 | cnt = _mm512_add_epi64(cnt, STORM_popcnt512(ones));
2730 |
2731 | for (/**/; i < size; ++i)
2732 | cnt = _mm512_add_epi64(cnt, STORM_popcnt512(_mm512_loadu_si512(&data1[i]) & _mm512_loadu_si512(&data2[i])));
2733 |
2734 | cnt64 = (uint64_t*)&cnt;
2735 |
2736 | return cnt64[0] +
2737 | cnt64[1] +
2738 | cnt64[2] +
2739 | cnt64[3] +
2740 | cnt64[4] +
2741 | cnt64[5] +
2742 | cnt64[6] +
2743 | cnt64[7];
2744 | }
2745 |
// Carry-save popcount of A OR B over 512-bit vectors; 16 vectors folded
// per iteration, deferred partial sums weighted back in afterwards.
STORM_TARGET("avx512bw")
static
uint64_t STORM_union_count_csa_avx512(const __m512i* STORM_RESTRICT data1,
                                      const __m512i* STORM_RESTRICT data2,
                                      size_t size)
{
    __m512i cnt = _mm512_setzero_si512();
    __m512i ones = _mm512_setzero_si512();
    __m512i twos = _mm512_setzero_si512();
    __m512i fours = _mm512_setzero_si512();
    __m512i eights = _mm512_setzero_si512();
    __m512i sixteens = _mm512_setzero_si512();
    __m512i twosA, twosB, foursA, foursB, eightsA, eightsB;

    uint64_t i = 0;
    uint64_t limit = size - size % 16; // round down to a multiple of 16 vectors
    uint64_t* cnt64;

// Each CSA input is one 512-bit block of A OR B.
#define LOAD(a) (_mm512_loadu_si512(&data1[i+a]) | _mm512_loadu_si512(&data2[i+a]))

    for (/**/; i < limit; i += 16) {
        STORM_CSA512(&twosA, &ones, ones, LOAD(0), LOAD(1));
        STORM_CSA512(&twosB, &ones, ones, LOAD(2), LOAD(3));
        STORM_CSA512(&foursA, &twos, twos, twosA, twosB);
        STORM_CSA512(&twosA, &ones, ones, LOAD(4), LOAD(5));
        STORM_CSA512(&twosB, &ones, ones, LOAD(6), LOAD(7));
        STORM_CSA512(&foursB, &twos, twos, twosA, twosB);
        STORM_CSA512(&eightsA, &fours, fours, foursA, foursB);
        STORM_CSA512(&twosA, &ones, ones, LOAD(8), LOAD(9));
        STORM_CSA512(&twosB, &ones, ones, LOAD(10), LOAD(11));
        STORM_CSA512(&foursA, &twos, twos, twosA, twosB);
        STORM_CSA512(&twosA, &ones, ones, LOAD(12), LOAD(13));
        STORM_CSA512(&twosB, &ones, ones, LOAD(14), LOAD(15));
        STORM_CSA512(&foursB, &twos, twos, twosA, twosB);
        STORM_CSA512(&eightsB, &fours, fours, foursA, foursB);
        STORM_CSA512(&sixteens,&eights, eights,eightsA,eightsB);

        cnt = _mm512_add_epi64(cnt, STORM_popcnt512(sixteens));
    }
#undef LOAD

    // Fold deferred partial sums back in with bit weights 16/8/4/2/1.
    cnt = _mm512_slli_epi64(cnt, 4);
    cnt = _mm512_add_epi64(cnt, _mm512_slli_epi64(STORM_popcnt512(eights), 3));
    cnt = _mm512_add_epi64(cnt, _mm512_slli_epi64(STORM_popcnt512(fours), 2));
    cnt = _mm512_add_epi64(cnt, _mm512_slli_epi64(STORM_popcnt512(twos), 1));
    cnt = _mm512_add_epi64(cnt, STORM_popcnt512(ones));

    // Remaining (size % 16) vectors.
    for (/**/; i < size; ++i)
        cnt = _mm512_add_epi64(cnt, STORM_popcnt512(_mm512_loadu_si512(&data1[i]) | _mm512_loadu_si512(&data2[i])));

    // Horizontal sum of the eight 64-bit lanes.
    cnt64 = (uint64_t*)&cnt;

    return cnt64[0] +
           cnt64[1] +
           cnt64[2] +
           cnt64[3] +
           cnt64[4] +
           cnt64[5] +
           cnt64[6] +
           cnt64[7];
}
2807 |
// Carry-save popcount of A XOR B over 512-bit vectors; 16 vectors folded
// per iteration, deferred partial sums weighted back in afterwards.
STORM_TARGET("avx512bw")
static
uint64_t STORM_diff_count_csa_avx512(const __m512i* STORM_RESTRICT data1,
                                     const __m512i* STORM_RESTRICT data2,
                                     size_t size)
{
    __m512i cnt = _mm512_setzero_si512();
    __m512i ones = _mm512_setzero_si512();
    __m512i twos = _mm512_setzero_si512();
    __m512i fours = _mm512_setzero_si512();
    __m512i eights = _mm512_setzero_si512();
    __m512i sixteens = _mm512_setzero_si512();
    __m512i twosA, twosB, foursA, foursB, eightsA, eightsB;

    uint64_t i = 0;
    uint64_t limit = size - size % 16; // round down to a multiple of 16 vectors
    uint64_t* cnt64;

// Each CSA input is one 512-bit block of A XOR B.
#define LOAD(a) (_mm512_loadu_si512(&data1[i+a]) ^ _mm512_loadu_si512(&data2[i+a]))

    for (/**/; i < limit; i += 16) {
        STORM_CSA512(&twosA, &ones, ones, LOAD(0), LOAD(1));
        STORM_CSA512(&twosB, &ones, ones, LOAD(2), LOAD(3));
        STORM_CSA512(&foursA, &twos, twos, twosA, twosB);
        STORM_CSA512(&twosA, &ones, ones, LOAD(4), LOAD(5));
        STORM_CSA512(&twosB, &ones, ones, LOAD(6), LOAD(7));
        STORM_CSA512(&foursB, &twos, twos, twosA, twosB);
        STORM_CSA512(&eightsA, &fours, fours, foursA, foursB);
        STORM_CSA512(&twosA, &ones, ones, LOAD(8), LOAD(9));
        STORM_CSA512(&twosB, &ones, ones, LOAD(10), LOAD(11));
        STORM_CSA512(&foursA, &twos, twos, twosA, twosB);
        STORM_CSA512(&twosA, &ones, ones, LOAD(12), LOAD(13));
        STORM_CSA512(&twosB, &ones, ones, LOAD(14), LOAD(15));
        STORM_CSA512(&foursB, &twos, twos, twosA, twosB);
        STORM_CSA512(&eightsB, &fours, fours, foursA, foursB);
        STORM_CSA512(&sixteens,&eights, eights,eightsA,eightsB);

        cnt = _mm512_add_epi64(cnt, STORM_popcnt512(sixteens));
    }
#undef LOAD

    // Fold deferred partial sums back in with bit weights 16/8/4/2/1.
    cnt = _mm512_slli_epi64(cnt, 4);
    cnt = _mm512_add_epi64(cnt, _mm512_slli_epi64(STORM_popcnt512(eights), 3));
    cnt = _mm512_add_epi64(cnt, _mm512_slli_epi64(STORM_popcnt512(fours), 2));
    cnt = _mm512_add_epi64(cnt, _mm512_slli_epi64(STORM_popcnt512(twos), 1));
    cnt = _mm512_add_epi64(cnt, STORM_popcnt512(ones));

    // Remaining (size % 16) vectors.
    for (/**/; i < size; ++i)
        cnt = _mm512_add_epi64(cnt, STORM_popcnt512(_mm512_loadu_si512(&data1[i]) ^ _mm512_loadu_si512(&data2[i])));

    // Horizontal sum of the eight 64-bit lanes.
    cnt64 = (uint64_t*)&cnt;

    return cnt64[0] +
           cnt64[1] +
           cnt64[2] +
           cnt64[3] +
           cnt64[4] +
           cnt64[5] +
           cnt64[6] +
           cnt64[7];
}
2869 |
2870 | // Functions
2871 | // AVX512
// Intersection (A AND B) cardinality using the AVX-512BW Harley-Seal
// kernel for the bulk and a scalar loop for the tail.
// @param b1,b2   bitmaps of n_ints 64-bit words each
// @param n_ints  number of 64-bit words per bitmap
// @return        popcount(b1 & b2)
STORM_TARGET("avx512bw")
static
uint64_t STORM_intersect_count_avx512(const uint64_t* STORM_RESTRICT b1,
                                      const uint64_t* STORM_RESTRICT b2,
                                      const size_t n_ints)
{
    uint64_t count = 0;
    const __m512i* r1 = (const __m512i*)(b1);
    const __m512i* r2 = (const __m512i*)(b2);
    const size_t n_cycles = n_ints / 8; // 8 x 64-bit words per 512-bit vector

    count += STORM_intersect_count_csa_avx512(r1, r2, n_cycles);

    // Scalar tail for the remaining (n_ints % 8) words. Use size_t: the
    // previous `int` counter mixed signedness with n_ints and overflows
    // for bitmaps longer than INT_MAX words.
    for (size_t i = n_cycles*8; i < n_ints; ++i) {
        count += STORM_POPCOUNT(b1[i] & b2[i]);
    }

    return(count);
}
2891 |
// Union (A OR B) cardinality using the AVX-512BW Harley-Seal kernel for
// the bulk and a scalar loop for the tail.
// @param b1,b2   bitmaps of n_ints 64-bit words each
// @param n_ints  number of 64-bit words per bitmap
// @return        popcount(b1 | b2)
STORM_TARGET("avx512bw")
static
uint64_t STORM_union_count_avx512(const uint64_t* STORM_RESTRICT b1,
                                  const uint64_t* STORM_RESTRICT b2,
                                  const size_t n_ints)
{
    uint64_t count = 0;
    const __m512i* r1 = (const __m512i*)(b1);
    const __m512i* r2 = (const __m512i*)(b2);
    const size_t n_cycles = n_ints / 8; // 8 x 64-bit words per 512-bit vector

    count += STORM_union_count_csa_avx512(r1, r2, n_cycles);

    // Scalar tail for the remaining (n_ints % 8) words. size_t avoids the
    // signed/unsigned comparison and overflow of the previous `int` index.
    for (size_t i = n_cycles*8; i < n_ints; ++i) {
        count += STORM_POPCOUNT(b1[i] | b2[i]);
    }

    return(count);
}
2911 |
// Symmetric difference (A XOR B) cardinality using the AVX-512BW
// Harley-Seal kernel for the bulk and a scalar loop for the tail.
// @param b1,b2   bitmaps of n_ints 64-bit words each
// @param n_ints  number of 64-bit words per bitmap
// @return        popcount(b1 ^ b2)
STORM_TARGET("avx512bw")
static
uint64_t STORM_diff_count_avx512(const uint64_t* STORM_RESTRICT b1,
                                 const uint64_t* STORM_RESTRICT b2,
                                 const size_t n_ints)
{
    uint64_t count = 0;
    const __m512i* r1 = (const __m512i*)(b1);
    const __m512i* r2 = (const __m512i*)(b2);
    const size_t n_cycles = n_ints / 8; // 8 x 64-bit words per 512-bit vector

    count += STORM_diff_count_csa_avx512(r1, r2, n_cycles);

    // Scalar tail for the remaining (n_ints % 8) words. size_t avoids the
    // signed/unsigned comparison and overflow of the previous `int` index.
    for (size_t i = n_cycles*8; i < n_ints; ++i) {
        count += STORM_POPCOUNT(b1[i] ^ b2[i]);
    }

    return(count);
}
2931 |
// Population count of a 64-bit-word buffer: AVX-512BW Harley-Seal for the
// bulk, then at most one AVX2 block, one SSE4 block, and a scalar word.
// BUGFIX: the target attribute was "avx2", but the body calls the
// AVX-512BW kernel; without the avx512bw target the function cannot be
// compiled for the ISA it uses.
STORM_TARGET("avx512bw")
static
uint64_t STORM_popcnt_avx512(const uint64_t* data,
                             const size_t n_ints)
{
    uint64_t count = 0;
    const size_t n_cycles = n_ints / 8;                 // 512-bit blocks
    const size_t n_cycles_avx2 = (n_ints % 8) / 4;      // trailing 256-bit block (0 or 1)
    const size_t n_cycles_sse = ((n_ints % 8) % 4) / 2; // trailing 128-bit block (0 or 1)

    const __m512i* r1 = (const __m512i*)&data[0];
    const __m256i* r2 = (const __m256i*)&data[n_cycles*8];
    const __m128i* r3 = (const __m128i*)&data[n_cycles*8+n_cycles_avx2*4];

    count += STORM_popcnt_csa_avx512bw(r1, n_cycles);
    count += STORM_popcnt_csa_avx2(r2, n_cycles_avx2);
    count += STORM_popcnt_csa_sse4(r3, n_cycles_sse);

    // Scalar tail: at most one remaining word.
    // BUGFIX: the start offset previously read 4*n_cycles instead of
    // 4*n_cycles_avx2, so the final unprocessed word was skipped.
    // size_t index also fixes the signed/unsigned comparison.
    for (size_t i = (8*n_cycles + 4*n_cycles_avx2 + 2*n_cycles_sse); i < n_ints; ++i) {
        count += STORM_POPCOUNT(data[i]);
    }

    return count;
}
2956 | #endif
2957 |
2958 | /****************************
2959 | * Popcount
2960 | ****************************/
2961 |
// Population count over an array of 64-bit words, manually unrolled
// four-wide; the remaining (size % 4) words are handled one at a time.
STORM_FORCE_INLINE
uint64_t STORM_popcount64_unrolled(const uint64_t* data, size_t size) {
    uint64_t total = 0;
    const uint64_t unrolled_end = size - size % 4; // largest multiple of 4 <= size
    uint64_t idx = 0;

    while (idx < unrolled_end) {
        total += STORM_popcount64(data[idx]);
        total += STORM_popcount64(data[idx + 1]);
        total += STORM_popcount64(data[idx + 2]);
        total += STORM_popcount64(data[idx + 3]);
        idx += 4;
    }

    while (idx < size) {
        total += STORM_popcount64(data[idx]);
        ++idx;
    }

    return total;
}
2980 |
2981 | /****************************
2982 | * Scalar functions
2983 | ****************************/
2984 |
// Scalar fallback: intersection (A AND B) cardinality via the unrolled
// 64-bit kernel.
STORM_FORCE_INLINE
uint64_t STORM_intersect_count_scalar(const uint64_t* STORM_RESTRICT b1,
                                      const uint64_t* STORM_RESTRICT b2,
                                      const size_t n_ints)
{
    const uint64_t total = STORM_intersect_count_unrolled(b1, b2, n_ints);
    return total;
}
2992 |
// Scalar fallback: union (A OR B) cardinality via the unrolled 64-bit
// kernel.
STORM_FORCE_INLINE
uint64_t STORM_union_count_scalar(const uint64_t* STORM_RESTRICT b1,
                                  const uint64_t* STORM_RESTRICT b2,
                                  const size_t n_ints)
{
    const uint64_t total = STORM_union_count_unrolled(b1, b2, n_ints);
    return total;
}
3000 |
// Scalar fallback: symmetric difference (A XOR B) cardinality via the
// unrolled 64-bit kernel.
STORM_FORCE_INLINE
uint64_t STORM_diff_count_scalar(const uint64_t* STORM_RESTRICT b1,
                                 const uint64_t* STORM_RESTRICT b2,
                                 const size_t n_ints)
{
    const uint64_t total = STORM_diff_count_unrolled(b1, b2, n_ints);
    return total;
}
3008 |
// Intersection cardinality when one operand is available as a sorted or
// unsorted position list: iterate the shorter list and test the
// corresponding bit in the other operand's bitmap.
// @param b1,b2  bitmaps (64-bit words); positions in l2 index into b1,
//               positions in l1 index into b2
// @param l1,l2  lists of set-bit positions for b1 and b2 respectively
// @param n1,n2  lengths of l1 and l2
// @return       number of listed positions set in the opposing bitmap
static
uint64_t STORM_intersect_count_scalar_list(const uint64_t* STORM_RESTRICT b1,
                                           const uint64_t* STORM_RESTRICT b2,
                                           const uint32_t* STORM_RESTRICT l1,
                                           const uint32_t* STORM_RESTRICT l2,
                                           const size_t n1,
                                           const size_t n2)
{
    uint64_t count = 0;

    // Bit offset within a 64-bit word.
    // BUGFIX: the previous macro (((x) * 64) >> 6) does NOT reduce x
    // modulo 64 -- it is the identity (modulo 2^26 after uint32 wrap), so
    // for positions >= 64 the shift count exceeded 63 (undefined
    // behavior). Also use a 64-bit unsigned literal: `1L` is only 32 bits
    // on LLP64 targets such as Windows, truncating the mask.
#define MOD(x) ((x) & 63)
    if (n1 < n2) {
        for (size_t i = 0; i < n1; ++i)
            count += ((b2[l1[i] >> 6] & (UINT64_C(1) << MOD(l1[i]))) != 0);
    } else {
        for (size_t i = 0; i < n2; ++i)
            count += ((b1[l2[i] >> 6] & (UINT64_C(1) << MOD(l2[i]))) != 0);
    }
#undef MOD
    return(count);
}
3030 |
3031 |
3032 | /* *************************************
3033 | * Function pointer definitions.
3034 | ***************************************/
// Common signature of the pairwise set-algebra counters (intersect/union/
// diff count): two bitmaps and their shared length in 64-bit words.
typedef uint64_t (*STORM_compute_func)(const uint64_t*, const uint64_t*, const size_t);
// Positional popcount over 16-bit words filling a uint32_t[16] histogram.
// NOTE(review): unlike STORM_compute_func, this is a function TYPE rather
// than a pointer type — declare pointers as STORM_pposcnt_func*. Presumably
// intentional; confirm against call sites.
typedef int (STORM_pposcnt_func)(const uint16_t*, size_t, uint32_t*);
// Plain popcount over a byte buffer (also a function type, see note above).
typedef uint64_t (STORM_popcnt_func)(const uint8_t*, size_t);
3038 |
3039 | /* *************************************
3040 | * Alignment
3041 | ***************************************/
// Return the best buffer alignment (in bytes) for the instruction set
// available at run-time, probing from the widest SIMD tier down. Falls
// back to 8 bytes (one uint64_t) when no SIMD tier is usable.
// NOTE(review): the branches below read `cpuid`; this presumes
// STORM_HAVE_CPUID is defined whenever any STORM_HAVE_* SIMD macro is —
// confirm in the feature-detection section of this header.
static
uint32_t STORM_get_alignment() {

#if defined(STORM_HAVE_CPUID)
#if defined(__cplusplus)
    /* C++11 thread-safe singleton */
    static const int cpuid = STORM_get_cpuid();
#else
    /* C path: detect once, then publish the result with an atomic
       compare-and-swap so racing first calls remain benign. */
    static int cpuid_ = -1;
    int cpuid = cpuid_;
    if (cpuid == -1) {
        cpuid = STORM_get_cpuid();

#if defined(_MSC_VER)
        _InterlockedCompareExchange(&cpuid_, cpuid, -1);
#else
        __sync_val_compare_and_swap(&cpuid_, -1, cpuid);
#endif
    }
#endif
#endif

    uint32_t alignment = 0;
    /* The `alignment == 0` guards keep the first (widest) match. */
#if defined(STORM_HAVE_AVX512)
    if ((cpuid & STORM_CPUID_runtime_bit_AVX512BW)) { // AVX-512BW tier
        alignment = STORM_AVX512_ALIGNMENT;
    }
#endif

#if defined(STORM_HAVE_AVX2)
    if ((cpuid & STORM_CPUID_runtime_bit_AVX2) && alignment == 0) { // AVX2 tier
        alignment = STORM_AVX2_ALIGNMENT;
    }
#endif

#if defined(STORM_HAVE_SSE42)
    if ((cpuid & STORM_CPUID_runtime_bit_SSE41) && alignment == 0) { // SSE tier
        alignment = STORM_SSE_ALIGNMENT;
    }
#endif

    /* No SIMD: natural alignment of uint64_t. */
    if (alignment == 0) alignment = 8;
    return alignment;
}
3088 |
3089 | /* *************************************
3090 | * Set algebra functions
3091 | ***************************************/
// Return the optimal intersection-count function for inputs of
// n_bitmaps_vector 64-bit words, given the instruction set available at
// run-time. Wider kernels are only selected above a minimum word count
// (thresholds annotated per branch); otherwise the scalar kernel is used.
static
STORM_compute_func STORM_get_intersect_count_func(const size_t n_bitmaps_vector) {

#if defined(STORM_HAVE_CPUID)
#if defined(__cplusplus)
    /* C++11 thread-safe singleton */
    static const int cpuid = STORM_get_cpuid();
#else
    /* C path: detect once, publish via compare-and-swap. */
    static int cpuid_ = -1;
    int cpuid = cpuid_;
    if (cpuid == -1) {
        cpuid = STORM_get_cpuid();

#if defined(_MSC_VER)
        _InterlockedCompareExchange(&cpuid_, cpuid, -1);
#else
        __sync_val_compare_and_swap(&cpuid_, -1, cpuid);
#endif
    }
#endif
#endif


#if defined(STORM_HAVE_AVX512)
    if ((cpuid & STORM_CPUID_runtime_bit_AVX512BW) && n_bitmaps_vector >= 128) { // 128 words = 16 x 512-bit
        return &STORM_intersect_count_avx512;
    }
#endif

#if defined(STORM_HAVE_AVX2)
    if ((cpuid & STORM_CPUID_runtime_bit_AVX2) && n_bitmaps_vector >= 64) { // 64 words = 16 x 256-bit
        return &STORM_intersect_count_avx2;
    }

    /* Shorter AVX2-capable inputs use the lookup-table variant. */
    if ((cpuid & STORM_CPUID_runtime_bit_AVX2) && n_bitmaps_vector >= 4) {
        return &STORM_intersect_count_lookup_avx2;
    }
#endif

#if defined(STORM_HAVE_SSE42)
    /* NOTE(review): compiled under the SSE4.2 flag but gated on the SSE4.1
       runtime bit — presumably the kernel needs only SSE4.1; confirm. */
    if ((cpuid & STORM_CPUID_runtime_bit_SSE41) && n_bitmaps_vector >= 32) { // 32 words = 16 x 128-bit
        return &STORM_intersect_count_sse4;
    }
#endif

    /* Portable scalar fallback. */
    return &STORM_intersect_count_scalar;
}
3141 |
// Return the optimal union-count function for inputs of n_bitmaps_vector
// 64-bit words, given the instruction set available at run-time. Same
// tiering and thresholds as STORM_get_intersect_count_func.
static
STORM_compute_func STORM_get_union_count_func(const size_t n_bitmaps_vector) {

#if defined(STORM_HAVE_CPUID)
#if defined(__cplusplus)
    /* C++11 thread-safe singleton */
    static const int cpuid = STORM_get_cpuid();
#else
    /* C path: detect once, publish via compare-and-swap. */
    static int cpuid_ = -1;
    int cpuid = cpuid_;
    if (cpuid == -1) {
        cpuid = STORM_get_cpuid();

#if defined(_MSC_VER)
        _InterlockedCompareExchange(&cpuid_, cpuid, -1);
#else
        __sync_val_compare_and_swap(&cpuid_, -1, cpuid);
#endif
    }
#endif
#endif


#if defined(STORM_HAVE_AVX512)
    if ((cpuid & STORM_CPUID_runtime_bit_AVX512BW) && n_bitmaps_vector >= 128) { // 128 words = 16 x 512-bit
        return &STORM_union_count_avx512;
    }
#endif

#if defined(STORM_HAVE_AVX2)
    if ((cpuid & STORM_CPUID_runtime_bit_AVX2) && n_bitmaps_vector >= 64) { // 64 words = 16 x 256-bit
        return &STORM_union_count_avx2;
    }

    /* Shorter AVX2-capable inputs use the lookup-table variant. */
    if ((cpuid & STORM_CPUID_runtime_bit_AVX2) && n_bitmaps_vector >= 4) {
        return &STORM_union_count_lookup_avx2;
    }
#endif

#if defined(STORM_HAVE_SSE42)
    /* NOTE(review): SSE4.2 compile flag but SSE4.1 runtime bit — presumably
       intentional (kernel needs only SSE4.1); confirm. */
    if ((cpuid & STORM_CPUID_runtime_bit_SSE41) && n_bitmaps_vector >= 32) { // 32 words = 16 x 128-bit
        return &STORM_union_count_sse4;
    }
#endif

    /* Portable scalar fallback. */
    return &STORM_union_count_scalar;
}
3189 |
// Return the optimal difference-count function for inputs of
// n_bitmaps_vector 64-bit words, given the instruction set available at
// run-time. Same tiering and thresholds as STORM_get_intersect_count_func.
static
STORM_compute_func STORM_get_diff_count_func(const size_t n_bitmaps_vector) {

#if defined(STORM_HAVE_CPUID)
#if defined(__cplusplus)
    /* C++11 thread-safe singleton */
    static const int cpuid = STORM_get_cpuid();
#else
    /* C path: detect once, publish via compare-and-swap. */
    static int cpuid_ = -1;
    int cpuid = cpuid_;
    if (cpuid == -1) {
        cpuid = STORM_get_cpuid();

#if defined(_MSC_VER)
        _InterlockedCompareExchange(&cpuid_, cpuid, -1);
#else
        __sync_val_compare_and_swap(&cpuid_, -1, cpuid);
#endif
    }
#endif
#endif


#if defined(STORM_HAVE_AVX512)
    if ((cpuid & STORM_CPUID_runtime_bit_AVX512BW) && n_bitmaps_vector >= 128) { // 128 words = 16 x 512-bit
        return &STORM_diff_count_avx512;
    }
#endif

#if defined(STORM_HAVE_AVX2)
    if ((cpuid & STORM_CPUID_runtime_bit_AVX2) && n_bitmaps_vector >= 64) { // 64 words = 16 x 256-bit
        return &STORM_diff_count_avx2;
    }

    /* Shorter AVX2-capable inputs use the lookup-table variant. */
    if ((cpuid & STORM_CPUID_runtime_bit_AVX2) && n_bitmaps_vector >= 4) {
        return &STORM_diff_count_lookup_avx2;
    }
#endif

#if defined(STORM_HAVE_SSE42)
    /* NOTE(review): SSE4.2 compile flag but SSE4.1 runtime bit — presumably
       intentional (kernel needs only SSE4.1); confirm. */
    if ((cpuid & STORM_CPUID_runtime_bit_SSE41) && n_bitmaps_vector >= 32) { // 32 words = 16 x 128-bit
        return &STORM_diff_count_sse4;
    }
#endif

    /* Portable scalar fallback. */
    return &STORM_diff_count_scalar;
}
3237 |
// Compute the intersection count of two bitmaps of n_len 64-bit words,
// dispatching directly (no function pointer) to the widest SIMD kernel
// the CPU supports for this input length. Thresholds mirror
// STORM_get_intersect_count_func.
static
uint64_t STORM_intersect_count(const uint64_t* STORM_RESTRICT data1,
                               const uint64_t* STORM_RESTRICT data2,
                               const size_t n_len)
{

#if defined(STORM_HAVE_CPUID)
#if defined(__cplusplus)
    /* C++11 thread-safe singleton */
    static const int cpuid = STORM_get_cpuid();
#else
    /* C path: detect once, publish via compare-and-swap. */
    static int cpuid_ = -1;
    int cpuid = cpuid_;
    if (cpuid == -1) {
        cpuid = STORM_get_cpuid();

#if defined(_MSC_VER)
        _InterlockedCompareExchange(&cpuid_, cpuid, -1);
#else
        __sync_val_compare_and_swap(&cpuid_, -1, cpuid);
#endif
    }
#endif
#endif


#if defined(STORM_HAVE_AVX512)
    if ((cpuid & STORM_CPUID_runtime_bit_AVX512BW) && n_len >= 128) { // 128 words = 16 x 512-bit
        return STORM_intersect_count_avx512(data1, data2, n_len);
    }
#endif

#if defined(STORM_HAVE_AVX2)
    if ((cpuid & STORM_CPUID_runtime_bit_AVX2) && n_len >= 64) { // 64 words = 16 x 256-bit
        return STORM_intersect_count_avx2(data1, data2, n_len);
    }

    /* Shorter AVX2-capable inputs use the lookup-table variant. */
    if ((cpuid & STORM_CPUID_runtime_bit_AVX2) && n_len >= 4) {
        return STORM_intersect_count_lookup_avx2(data1, data2, n_len);
    }
#endif

#if defined(STORM_HAVE_SSE42)
    if ((cpuid & STORM_CPUID_runtime_bit_SSE41) && n_len >= 32) { // 32 words = 16 x 128-bit
        return STORM_intersect_count_sse4(data1, data2, n_len);
    }
#endif

    /* Portable scalar fallback. */
    return STORM_intersect_count_scalar(data1, data2, n_len);
}
3291 |
// Compute the union count of two bitmaps of n_len 64-bit words, dispatching
// directly to the widest SIMD kernel the CPU supports for this input
// length. Thresholds mirror STORM_get_union_count_func.
static
uint64_t STORM_union_count(const uint64_t* STORM_RESTRICT data1,
                           const uint64_t* STORM_RESTRICT data2,
                           const size_t n_len)
{

#if defined(STORM_HAVE_CPUID)
#if defined(__cplusplus)
    /* C++11 thread-safe singleton */
    static const int cpuid = STORM_get_cpuid();
#else
    /* C path: detect once, publish via compare-and-swap. */
    static int cpuid_ = -1;
    int cpuid = cpuid_;
    if (cpuid == -1) {
        cpuid = STORM_get_cpuid();

#if defined(_MSC_VER)
        _InterlockedCompareExchange(&cpuid_, cpuid, -1);
#else
        __sync_val_compare_and_swap(&cpuid_, -1, cpuid);
#endif
    }
#endif
#endif


#if defined(STORM_HAVE_AVX512)
    if ((cpuid & STORM_CPUID_runtime_bit_AVX512BW) && n_len >= 128) { // 128 words = 16 x 512-bit
        return STORM_union_count_avx512(data1, data2, n_len);
    }
#endif

#if defined(STORM_HAVE_AVX2)
    if ((cpuid & STORM_CPUID_runtime_bit_AVX2) && n_len >= 64) { // 64 words = 16 x 256-bit
        return STORM_union_count_avx2(data1, data2, n_len);
    }

    /* Shorter AVX2-capable inputs use the lookup-table variant. */
    if ((cpuid & STORM_CPUID_runtime_bit_AVX2) && n_len >= 4) {
        return STORM_union_count_lookup_avx2(data1, data2, n_len);
    }
#endif

#if defined(STORM_HAVE_SSE42)
    if ((cpuid & STORM_CPUID_runtime_bit_SSE41) && n_len >= 32) { // 32 words = 16 x 128-bit
        return STORM_union_count_sse4(data1, data2, n_len);
    }
#endif

    /* Portable scalar fallback. */
    return STORM_union_count_scalar(data1, data2, n_len);
}
3342 |
// Compute the difference count of two bitmaps of n_len 64-bit words,
// dispatching directly to the widest SIMD kernel the CPU supports for this
// input length. Thresholds mirror STORM_get_diff_count_func.
static
uint64_t STORM_diff_count(const uint64_t* STORM_RESTRICT data1,
                          const uint64_t* STORM_RESTRICT data2,
                          const size_t n_len)
{

#if defined(STORM_HAVE_CPUID)
#if defined(__cplusplus)
    /* C++11 thread-safe singleton */
    static const int cpuid = STORM_get_cpuid();
#else
    /* C path: detect once, publish via compare-and-swap. */
    static int cpuid_ = -1;
    int cpuid = cpuid_;
    if (cpuid == -1) {
        cpuid = STORM_get_cpuid();

#if defined(_MSC_VER)
        _InterlockedCompareExchange(&cpuid_, cpuid, -1);
#else
        __sync_val_compare_and_swap(&cpuid_, -1, cpuid);
#endif
    }
#endif
#endif


#if defined(STORM_HAVE_AVX512)
    if ((cpuid & STORM_CPUID_runtime_bit_AVX512BW) && n_len >= 128) { // 128 words = 16 x 512-bit
        return STORM_diff_count_avx512(data1, data2, n_len);
    }
#endif

#if defined(STORM_HAVE_AVX2)
    if ((cpuid & STORM_CPUID_runtime_bit_AVX2) && n_len >= 64) { // 64 words = 16 x 256-bit
        return STORM_diff_count_avx2(data1, data2, n_len);
    }

    /* Shorter AVX2-capable inputs use the lookup-table variant. */
    if ((cpuid & STORM_CPUID_runtime_bit_AVX2) && n_len >= 4) {
        return STORM_diff_count_lookup_avx2(data1, data2, n_len);
    }
#endif

#if defined(STORM_HAVE_SSE42)
    if ((cpuid & STORM_CPUID_runtime_bit_SSE41) && n_len >= 32) { // 32 words = 16 x 128-bit
        return STORM_diff_count_sse4(data1, data2, n_len);
    }
#endif

    /* Portable scalar fallback. */
    return STORM_diff_count_scalar(data1, data2, n_len);
}
3393 |
3394 | /* *************************************
3395 | * POPCNT and POSPOPCNT functions.
3396 | ***************************************/
// Count the set bits in a byte buffer of `size` bytes. Each SIMD tier
// consumes all whole 64-bit words (size/8) and leaves the sub-word tail
// (size % 8 bytes) for the scalar loops at the end.
// NOTE(review): the (uint64_t*) casts assume `data` is suitably aligned
// for 64-bit loads — presumably callers allocate with
// STORM_get_alignment(); confirm.
static
uint64_t STORM_popcnt(const uint8_t* data, size_t size) {
    uint64_t cnt = 0;
    uint64_t i;

#if defined(STORM_HAVE_CPUID)
#if defined(__cplusplus)
    /* C++11 thread-safe singleton */
    static const int cpuid = STORM_get_cpuid();
#else
    /* C path: detect once, publish via compare-and-swap. */
    static int cpuid_ = -1;
    int cpuid = cpuid_;
    if (cpuid == -1) {
        cpuid = STORM_get_cpuid();

#if defined(_MSC_VER)
        _InterlockedCompareExchange(&cpuid_, cpuid, -1);
#else
        __sync_val_compare_and_swap(&cpuid_, -1, cpuid);
#endif
    }
#endif
#endif

#if defined(STORM_HAVE_AVX512)

    /* AVX512 path for arrays >= 1024 bytes */
    if ((cpuid & STORM_CPUID_runtime_bit_AVX512BW) &&
        size >= 1024)
    {
        /* Consume whole words; keep only the tail bytes. */
        cnt += STORM_popcnt_avx512((uint64_t*)data, size/8);
        data += size - size % 8;
        size = size % 8;
    }

#endif

#if defined(STORM_HAVE_AVX2)

    /* AVX2 path for arrays >= 512 bytes */
    if ((cpuid & STORM_CPUID_runtime_bit_AVX2) &&
        size >= 512)
    {
        cnt += STORM_popcnt_avx2((uint64_t*)data, size/8);
        data += size - size % 8;
        size = size % 8;
    }

#endif

#if defined(STORM_HAVE_SSE42)

    /* SSE4.2 path for arrays >= 256 bytes (comment previously mislabeled
       this branch as AVX2). */
    if ((cpuid & STORM_CPUID_runtime_bit_SSE42) &&
        size >= 256)
    {
        cnt += STORM_popcnt_sse4((uint64_t*)data, size/8);
        data += size - size % 8;
        size = size % 8;
    }

#endif

#if defined(STORM_HAVE_POPCNT)

    /* Hardware POPCNT: unrolled word loop, then byte-wise tail. */
    if (cpuid & STORM_CPUID_runtime_bit_POPCNT) {
        cnt += STORM_popcount64_unrolled((const uint64_t*)data, size / 8);
        data += size - size % 8;
        size = size % 8;
        for (i = 0; i < size; ++i)
            cnt += STORM_popcount64(data[i]);

        return cnt;
    }

#endif

    /* pure integer popcount algorithm */
    if (size >= 8) {
        cnt += STORM_popcount64_unrolled((const uint64_t*)data, size / 8);
        data += size - size % 8;
        size = size % 8;
    }

    /* Tail bytes: each uint8_t zero-extends into the 64-bit popcount. */
    for (i = 0; i < size; ++i)
        cnt += STORM_popcount64(data[i]);

    return cnt;
}
3495 |
// Positional popcount: for each of the 16 bit positions, count how many of
// the `len` 16-bit words in `data` have that bit set, writing the counts to
// out[0..15] (out is zeroed here first). The kernel is chosen by run-time
// CPU features and input length; the length breakpoints are presumably
// empirically tuned — confirm against the benchmark. Returns the selected
// kernel's int result.
static
int STORM_pospopcnt_u16(const uint16_t* data, size_t len, uint32_t* out) {
    /* Caller must provide room for 16 counters. */
    memset(out, 0, sizeof(uint32_t)*16);

#if defined(STORM_HAVE_CPUID)
#if defined(__cplusplus)
    /* C++11 thread-safe singleton */
    static const int cpuid = STORM_get_cpuid();
#else
    /* C path: detect once, publish via compare-and-swap. */
    static int cpuid_ = -1;
    int cpuid = cpuid_;
    if (cpuid == -1) {
        cpuid = STORM_get_cpuid();

#if defined(_MSC_VER)
        _InterlockedCompareExchange(&cpuid_, cpuid, -1);
#else
        __sync_val_compare_and_swap(&cpuid_, -1, cpuid);
#endif
    }
#endif
#endif

#if defined(STORM_HAVE_AVX512)
    if ((cpuid & STORM_CPUID_runtime_bit_AVX512BW))
    {
        /* Tier by input length; tiny inputs still use SSE kernels. */
        if (len < 32) return(STORM_pospopcnt_u16_sse_sad(data, len, out)); // small
        else if (len < 256) return(STORM_pospopcnt_u16_sse_blend_popcnt_unroll8(data, len, out)); // small
        else if (len < 512) return(STORM_pospopcnt_u16_avx512bw_blend_popcnt_unroll8(data, len, out)); // medium
        else if (len < 4096) return(STORM_pospopcnt_u16_avx512bw_adder_forest(data, len, out)); // medium-large
        else return(STORM_pospopcnt_u16_avx512bw_harvey_seal(data, len, out)); // large
    }
#endif

#if defined(STORM_HAVE_AVX2)
    if ((cpuid & STORM_CPUID_runtime_bit_AVX2))
    {
        if (len < 128) return(STORM_pospopcnt_u16_sse_sad(data, len, out)); // small
        else if (len < 1024) return(STORM_pospopcnt_u16_avx2_blend_popcnt_unroll8(data, len, out)); // medium
        else return(STORM_pospopcnt_u16_avx2_harvey_seal(data, len, out)); // large
    }
#endif

#if defined(STORM_HAVE_SSE42)
    if ((cpuid & STORM_CPUID_runtime_bit_SSE42))
    {
        return(STORM_pospopcnt_u16_sse_harvey_seal(data, len, out));
    }
#endif

    /* Scalar fallbacks. NOTE(review): the umul128 variant is skipped under
       MSVC — presumably it relies on a non-MSVC intrinsic/__int128; confirm. */
#ifndef _MSC_VER
    return(STORM_pospopcnt_u16_scalar_umul128_unroll2(data, len, out)); // fallback scalar
#else
    return(STORM_pospopcnt_u16_scalar_naive(data, len, out));
#endif
}
3552 |
3553 | #ifdef __cplusplus
3554 | } /* extern "C" */
3555 | #endif
3556 |
3557 | #endif /* LIBALGEBRA_H_8723467365934 */
3558 |
--------------------------------------------------------------------------------