├── .gitignore ├── .travis.yml ├── CMakeLists.txt ├── LICENSE ├── README.md ├── appveyor.yml ├── benchmark.cpp └── libalgebra.h /.gitignore: -------------------------------------------------------------------------------- 1 | # Prerequisites 2 | *.d 3 | 4 | # Object files 5 | *.o 6 | *.ko 7 | *.obj 8 | *.elf 9 | 10 | # Linker output 11 | *.ilk 12 | *.map 13 | *.exp 14 | 15 | # Precompiled Headers 16 | *.gch 17 | *.pch 18 | 19 | # Libraries 20 | *.lib 21 | *.a 22 | *.la 23 | *.lo 24 | 25 | # Shared objects (inc. Windows DLLs) 26 | *.dll 27 | *.so 28 | *.so.* 29 | *.dylib 30 | 31 | # Executables 32 | *.exe 33 | *.out 34 | *.app 35 | *.i*86 36 | *.x86_64 37 | *.hex 38 | 39 | # Debug files 40 | *.dSYM/ 41 | *.su 42 | *.idb 43 | *.pdb 44 | 45 | # Kernel Module Compile Results 46 | *.mod* 47 | *.cmd 48 | .tmp_versions/ 49 | modules.order 50 | Module.symvers 51 | Mkfile.old 52 | dkms.conf 53 | 54 | # Executable 55 | benchmark 56 | # VSCode 57 | settings.json 58 | # CMake 59 | CMakeFiles 60 | cmake_* 61 | Makefile 62 | CMakeCache.txt 63 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | # Test configuration for Travis CI. See https://docs.travis-ci.com/. 2 | language: cpp 3 | 4 | matrix: 5 | fast_finish: true 6 | 7 | include: 8 | - name: Xenial (g++-4.9) 9 | os: linux 10 | dist: xenial 11 | addons: 12 | apt: 13 | packages: 14 | - g++-4.9 15 | sources: &sources 16 | - ubuntu-toolchain-r-test 17 | env: 18 | - COMPILERS="CC=gcc-4.9 && CXX=g++-4.9" 19 | 20 | - name: Xenial (g++-5) 21 | os: linux 22 | dist: xenial 23 | addons: 24 | apt: 25 | packages: 26 | - g++-5 27 | sources: *sources 28 | env: 29 | - COMPILERS="CC=gcc-5 && CXX=g++-5" 30 | 31 | - name: Xenial (g++-6) 32 | os: linux 33 | dist: xenial 34 | addons: 35 | apt: 36 | packages: 37 | - g++-6 38 | sources: *sources 39 | env: 40 | - COMPILERS="CC=gcc-6 && CXX=g++-6" 41 | 42 | - name: Xenial (g++-7) 43 | os: linux 44 | dist: xenial 45 | addons: 46 | apt: 47 | packages: 48 | - g++-7 49 | - cmake 50 | sources: *sources 51 | env: 52 | - COMPILERS="CC=gcc-7 && CXX=g++-7" 53 | 54 | - name: Xenial (g++-8) 55 | os: linux 56 | dist: xenial 57 | addons: 58 | apt: 59 | packages: 60 | - g++-8 61 | sources: *sources 62 | env: 63 | - COMPILERS="CC=gcc-8 && CXX=g++-8" 64 | 65 | - name: Xenial (clang++-3.7) 66 | os: linux 67 | dist: xenial 68 | addons: 69 | apt: 70 | sources: 71 | - ubuntu-toolchain-r-test 72 | - llvm-toolchain-xenial-3.7 73 | packages: 74 | - clang-3.7 75 | env: 76 | - COMPILERS="CC=clang-3.7 && CXX=clang++-3.7" 77 | compiler: clang 78 | 79 | - name: Xenial (clang++-3.8) 80 | os: linux 81 | dist: xenial 82 | addons: 83 | apt: 84 | sources: 85 | - ubuntu-toolchain-r-test 86 | - llvm-toolchain-xenial-3.8 87 | packages: 88 | - clang-3.8 89 | env: 90 | - COMPILERS="CC=clang-3.8 && CXX=clang++-3.8" 91 | compiler: clang 92 | 93 | - name: Xenial (clang++-3.9) 94 | os: linux 95 | dist: xenial 96 | addons: 97 | apt: 98 | sources: 99 | - llvm-toolchain-xenial-3.9 100 | packages: 101 | - clang-3.9 102 | env: 103 | - COMPILERS="CC=clang-3.9 && CXX=clang++-3.9" 104 | compiler: clang 105 | 106 | - name: Xenial (clang++-4.0) 107 | os: linux 108 | dist: xenial 109 | addons: 110 | apt: 111 | sources: 112 | - llvm-toolchain-xenial-4.0 113 | packages: 114 | - clang-4.0 115 | env: 116 | - COMPILERS="CC=clang-4.0 && CXX=clang++-4.0" 117 | compiler: clang 118 | 119 | - name: Xenial (clang++-5.0) 120 | os: linux 121 | dist: xenial 122 | addons: 123 | apt: 124 | sources: 125 | - llvm-toolchain-xenial-5.0 126 | - ubuntu-toolchain-r-test 127 | packages: 128 | - clang-5.0 129 | env: 130 | - COMPILERS="CC=clang-5.0 && CXX=clang++-5.0" 131 | compiler: clang 132 | 133 | - name: Xenial (clang++-6.0) 134 | os: linux 135 | dist: xenial 136 | addons: 137 | apt: 138 | sources: 139 | - ubuntu-toolchain-r-test 140 | - llvm-toolchain-xenial-6.0 141 | packages: 142 | - clang-6.0 143 | - libstdc++-6-dev 144 | env: 145 | - COMPILERS="CC=clang-6.0 && CXX=clang++-6.0" 146 | compiler: clang 147 | 148 | - name: Xenial (clang++-7.0) 149 | os: linux 150 | dist: xenial 151 | addons: 152 | apt: 153 | sources: 154 | - ubuntu-toolchain-r-test 155 | - llvm-toolchain-xenial-7 156 | packages: 157 | - clang-7 158 | - libstdc++-7-dev 159 | env: 160 | - COMPILERS="CC=clang-7 && CXX=clang++-7" 161 | compiler: clang 162 | 163 | # OSX begin 164 | - os: osx 165 | compiler: clang 166 | 167 | install: 168 | ############################################################################ 169 | # Install a recent CMake (unless already installed on OS X) 170 | ############################################################################ 171 | - CMAKE_VERSION=3.7.2 172 | - | 173 | if [[ "${TRAVIS_OS_NAME}" == "linux" ]]; then 174 | CMAKE_URL="https://cmake.org/files/v${CMAKE_VERSION%.[0-9]}/cmake-${CMAKE_VERSION}-Linux-x86_64.tar.gz" 175 | mkdir cmake && travis_retry wget --no-check-certificate -O - ${CMAKE_URL} | tar --strip-components=1 -xz -C cmake 176 | export PATH=${PWD}/cmake/bin:${PATH} 177 | else 178 | brew install cmake || brew upgrade cmake 179 | fi 180 | - cmake --version 181 | 182 | before_script: 183 | - cd "${TRAVIS_BUILD_DIR}" 184 | # Overwrite default CC and CXX 185 | - eval "${COMPILERS}" 186 | 187 | script: 188 | - cmake . 189 | - make 190 | - | 191 | if [[ "${TRAVIS_OS_NAME}" == "linux" ]]; then 192 | sudo ./benchmark -r 10 193 | else 194 | ./benchmark -r 10 195 | fi -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 2.8 FATAL_ERROR) 2 | project(libalgebra C CXX) 3 | set (CMAKE_BUILD_TYPE Release) 4 | set (CMAKE_CXX_FLAGS "-std=c++11") 5 | 6 | include_directories(.) 7 | 8 | add_executable(benchmark benchmark.cpp) 9 | 10 | install(FILES libalgebra.h DESTINATION ${CMAKE_INSTALL_PREFIX}/include) 11 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity.
For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 
134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 
193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![Build Status](https://travis-ci.com/mklarqvist/libalgebra.svg)](https://travis-ci.com/mklarqvist/libalgebra) 2 | [![Build Status](https://ci.appveyor.com/api/projects/status/github/mklarqvist/libalgebra?branch=master&svg=true)](https://ci.appveyor.com/project/mklarqvist/libalgebra) 3 | [![Github Releases](https://img.shields.io/github/release/mklarqvist/libalgebra.svg)](https://github.com/mklarqvist/libalgebra/releases) 4 | [![License](https://img.shields.io/badge/Apache-2.0-blue.svg)](LICENSE) 5 | 6 | # libalgebra 7 | 8 | ```libalgebra.h``` is a header-only C/C++ library for: 9 | * counting the number of set bits ("population count", `popcnt`) in an array 10 | * counting the number of set bits at each position ("positional population count", `pospopcnt`) in an array 11 | * performing set-algebraic operations on bitmaps, including union, intersection, and diff cardinalities 12 | 13 | using specialized CPU instructions, e.g. 14 | [POPCNT](https://en.wikipedia.org/wiki/SSE4#POPCNT_and_LZCNT), 15 | [SSE4.2](https://en.wikipedia.org/wiki/SSE4#SSE4.2), 16 | [AVX2](https://en.wikipedia.org/wiki/Advanced_Vector_Extensions), 17 | [AVX512BW](https://en.wikipedia.org/wiki/Advanced_Vector_Extensions), 18 | [NEON](https://en.wikipedia.org/wiki/ARM_architecture#Advanced_SIMD_.28NEON.29). ```libalgebra.h``` has been tested successfully using the GCC, 19 | Clang, and MSVC compilers. 20 | 21 | The core algorithms are described in the papers: 22 | 23 | * [Faster Population Counts using AVX2 Instructions](https://arxiv.org/abs/1611.07612) by Daniel Lemire, Nathan Kurz, 24 | and Wojciech Muła (23 Nov 2016). 25 | * [Efficient Computation of Positional Population Counts Using SIMD Instructions](https://arxiv.org/abs/1911.02696) by Marcus D. R. Klarqvist, Wojciech Muła, and Daniel Lemire 26 | * [Consistently faster and smaller compressed bitmaps with Roaring](https://arxiv.org/abs/1603.06549) by D. Lemire, G. Ssi-Yan-Kai, 27 | and O. Kaser (21 Mar 2016). 28 | 29 | ### Speedup 30 | 31 | Sample performance metrics (a practical upper limit) on an AVX512BW machine. We simulate a single data array, or pairs of data arrays, in an aligned memory location and compute the same statistics many times using the command `benchmark -p -r 10000` (requires the Linux `perf` subsystem). This reflects the fastest possible throughput if you never have to leave the destination cache level. 32 | The host architecture used is a 10 nm Cannon Lake [Core i3-8121U](https://ark.intel.com/content/www/us/en/ark/products/136863/intel-core-i3-8121u-processor-4m-cache-up-to-3-20-ghz.html) with gcc (GCC) 8.2.1 20180905 (Red Hat 8.2.1-3). 33 | 34 | ### POSPOPCNT 35 | 36 | This benchmark shows the speedup of the four `pospopcnt` algorithms used on x86 37 | CPUs compared to a naive unvectorized solution 38 | (`pospopcnt_u16_scalar_naive_nosimd`) for different array sizes (in number of 39 | 2-byte values).
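For reference, a positional population count maintains sixteen counters, one per bit position of the 16-bit input words. A minimal scalar sketch of these semantics (this mirrors what the naive baseline computes; it is not the library's tuned code):

```C
#include <stdint.h>

/* flags[j] accumulates how many of the n input words have bit j set. */
static void pospopcnt_u16_reference(const uint16_t* data, uint32_t n, uint32_t* flags) {
    for (uint32_t i = 0; i < n; ++i) {
        for (int j = 0; j < 16; ++j)
            flags[j] += (data[i] >> j) & 1;
    }
}
```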
40 | 41 | | Algorithm | 128 | 256 | 512 | 1024 | 2048 | 4096 | 8192 | 65536 | 42 | |-----------------------------------|------|-------|-------|-------|-------|-------|-------|--------| 43 | | pospopcnt_u16_sse_blend_popcnt_unroll8 | **8.28** | 9.84 | 10.55 | 11 | 11.58 | 11.93 | 12.13 | 12.28 | 44 | | pospopcnt_u16_avx512_blend_popcnt_unroll8 | 7.07 | **11.25** | **16.21** | 21 | 25.49 | 27.91 | 29.73 | 31.55 | 45 | | pospopcnt_u16_avx512_adder_forest | 3.05 | 2.82 | 14.53 | **23.13** | **34.37** | 44.91 | 52.78 | 61.68 | 46 | | pospopcnt_u16_avx512_harvey_seal | 2.07 | 2.3 | 8.21 | 15.41 | 28.17 | **49.14** | **76.11** | **138.71** | 47 | 48 | ### POPCNT 49 | 50 | Fold speedup compared to a naive unvectorized algorithm 51 | (`popcount_scalar_naive_nosimd`) for different array sizes, reported as (CPU cycles/64-bit word, instructions/64-bit word): 52 | 53 | | Words | libalgebra.h | Scalar | Speedup | 54 | |---------|--------------|---------------|---------| 55 | | 4 | 27.75 (37) | 26.75 (33.5) | 1 | 56 | | 8 | 16.38 (25.5) | 17.38 (30.25) | 1.1 | 57 | | 16 | 10.5 (19.94) | 12.75 (28.63) | 1.2 | 58 | | 32 | 7.72 (17.16) | 10.69 (27.81) | 1.4 | 59 | | 64 | 3.09 (4.36) | 9.61 (27.41) | 3.1 | 60 | | 128 | 2.53 (2.73) | 8.84 (27.2) | 3.5 | 61 | | 256 | 1.35 (1.7) | 8.5 (27.1) | 6.3 | 62 | | 512 | 0.67 (1.18) | 8.33 (27.05) | 12.4 | 63 | | 1024 | 0.5 (0.92) | 8.25 (27.03) | 16.4 | 64 | | 2048 | 0.41 (0.79) | 8.15 (27.01) | 20.1 | 65 | | 4096 | 0.46 (0.72) | 8.12 (27.01) | 17.8 | 66 | | 8192 | 0.39 (0.69) | 8.11 (27) | 21 | 67 | | 16384 | 0.39 (0.67) | 8.1 (27) | 20.6 | 68 | | 32768 | 0.89 (0.66) | 8.1 (27) | 9.1 | 69 | | 65536 | 0.84 (0.66) | 8.1 (27) | 9.6 | 70 | | 131072 | 0.68 (0.66) | 8.09 (27) | 11.9 | 71 | | 262144 | 1.11 (0.66) | 8.09 (27) | 7.3 | 72 | | 524288 | 1.84 (0.66) | 8.12 (27) | 4.4 | 73 | | 1048576 | 1.95 (0.66) | 8.15 (27) | 4.2 | 74 | 75 | ### Set algebra 76 | 77 | Fold speedup compared to a naive unvectorized solution (`*_scalar_naive_nosimd`) 78 | for different array sizes (in number of _pairs_ of 64-bit words, with results reported per _single_ 64-bit word). These 79 | functions are identical except for the bitwise operator used (AND, 80 | OR, or XOR), all of which have identical latency and throughput (CPI).
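Since the kernels differ only in that operator, a scalar sketch of the intersection case is representative (illustrative code using the GCC/Clang `__builtin_popcountll` intrinsic, not the library's dispatched kernels):

```C
#include <stdint.h>
#include <stddef.h>

/* Intersection cardinality; swap '&' for '|' (union) or '^' (diff). */
static uint64_t intersect_count_ref(const uint64_t* a, const uint64_t* b, size_t n) {
    uint64_t total = 0;
    for (size_t i = 0; i < n; ++i)
        total += (uint64_t)__builtin_popcountll(a[i] & b[i]);
    return total;
}
```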
81 | 82 | | Words | libalgebra.h | Scalar | Speedup | 83 | |---------|--------------|---------------|---------| 84 | | 4 | 17.63 (8.63) | 14.63 (22.75) | 0.8 | 85 | | 8 | 8.13 (5.44) | 10 (20.88) | 1.2 | 86 | | 16 | 4.69 (3.84) | 7.91 (19.94) | 1.7 | 87 | | 32 | 2.38 (2.56) | 6.59 (19.47) | 2.8 | 88 | | 64 | 1.82 (2.06) | 5.87 (19.23) | 3.2 | 89 | | 128 | 0.88 (0.89) | 5.43 (19.12) | 6.2 | 90 | | 256 | 0.57 (0.64) | 5.18 (19.06) | 9.2 | 91 | | 512 | 0.41 (0.51) | 5.11 (19.03) | 12.4 | 92 | | 1024 | 0.33 (0.45) | 5.06 (19.02) | 15.3 | 93 | | 2048 | 0.39 (0.41) | 5.03 (19.01) | 13.1 | 94 | | 4096 | 0.36 (0.4) | 5.02 (19) | 13.9 | 95 | | 8192 | 0.37 (0.39) | 5.01 (19) | 13.7 | 96 | | 16384 | 0.55 (0.39) | 5.01 (19) | 9.1 | 97 | | 32768 | 0.55 (0.39) | 5 (19) | 9.2 | 98 | | 65536 | 0.52 (0.38) | 5 (19) | 9.7 | 99 | | 131072 | 0.56 (0.38) | 5.01 (19) | 9 | 100 | | 262144 | 1.25 (0.38) | 5.02 (19) | 4 | 101 | | 524288 | 1.76 (0.38) | 5.03 (19) | 2.9 | 102 | | 1048576 | 1.81 (0.38) | 5.07 (19) | 2.8 | 103 | 104 | ## C/C++ API 105 | 106 | ```C 107 | #include "libalgebra.h" 108 | 109 | /* 110 | * Count the number of 1 bits in the data array 111 | * @data: An array 112 | * @size: Size of data in bytes 113 | */ 114 | uint64_t STORM_popcnt(const void* data, uint64_t size); 115 | ``` 116 | 117 | ```C 118 | #include "libalgebra.h" 119 | 120 | /* 121 | * Count the number of 1 bits for each position in the data array 122 | * @data: A 16-bit array 123 | * @size: Size of data in bytes 124 | * @flags: Output vector[16] 125 | */ 126 | uint32_t flags[16]; 127 | int STORM_pospopcnt_u16(const uint16_t* data, uint32_t size, uint32_t* flags); 128 | ``` 129 | 130 | ```C 131 | #include "libalgebra.h" 132 | 133 | /* 134 | * Compute the intersection, union, or diff cardinality between pairs of bitmaps 135 | * @data1: A 64-bit array 136 | * @data2: A 64-bit array 137 | * @size: Size of data in 64-bit words 138 | */ 139 | // Intersect cardinality 140 | uint64_t STORM_intersect_count(const uint64_t* data1, const uint64_t* data2, const uint32_t size); 141 | // Union cardinality 142 | uint64_t STORM_union_count(const uint64_t* data1, const uint64_t* data2, const uint32_t size); 143 | // Diff cardinality 144 | uint64_t STORM_diff_count(const uint64_t* data1, const uint64_t* data2, const uint32_t size); 145 | ``` 146 | 147 | ### Advanced use 148 | 149 | Retrieve a function pointer to the optimal function for a given target length. 150 | 151 | ```C 152 | STORM_compute_func STORM_get_intersection_count_func(const size_t n_bitmaps_vector); 153 | STORM_compute_func STORM_get_union_count_func(const size_t n_bitmaps_vector); 154 | STORM_compute_func STORM_get_diff_count_func(const size_t n_bitmaps_vector); 155 | ``` 156 | 157 | Portable memory alignment. 158 | 159 | ```C 160 | #include "libalgebra.h" 161 | 162 | void* STORM_aligned_malloc(size_t alignment, size_t size); 163 | void STORM_aligned_free(void* memblock); 164 | ``` 165 | 166 | ## How it works 167 | 168 | On x86 CPUs ```libalgebra.h``` uses a combination of algorithms depending on the input vector size and what instruction set your CPU supports. These checks are performed at **run-time**.
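As a quick start, here is a hedged end-to-end sketch (not shipped with the repository): it uses only the calls documented above plus `STORM_get_alignment()`, which the bundled `benchmark.cpp` uses to pick the allocation alignment, and it follows the `@size`-in-bytes convention stated in the comments above for `STORM_popcnt` and `STORM_pospopcnt_u16`.

```C
#include <stdio.h>
#include <string.h>
#include "libalgebra.h"

int main(void) {
    const uint32_t n_words = 1024;                     /* 64-bit words per bitmap */
    const size_t   n_bytes = n_words * sizeof(uint64_t);
    uint64_t* a = (uint64_t*)STORM_aligned_malloc(STORM_get_alignment(), n_bytes);
    uint64_t* b = (uint64_t*)STORM_aligned_malloc(STORM_get_alignment(), n_bytes);
    memset(a, 0xAA, n_bytes); /* 10101010 per byte */
    memset(b, 0x55, n_bytes); /* 01010101 per byte */

    /* Population count over raw bytes. */
    printf("popcnt(a) = %llu\n", (unsigned long long)STORM_popcnt(a, n_bytes));

    /* Pairwise set-algebra cardinalities; size is in 64-bit words. */
    printf("|a AND b| = %llu\n", (unsigned long long)STORM_intersect_count(a, b, n_words));
    printf("|a OR b|  = %llu\n", (unsigned long long)STORM_union_count(a, b, n_words));
    printf("|a XOR b| = %llu\n", (unsigned long long)STORM_diff_count(a, b, n_words));

    /* Positional population count over the same memory viewed as 16-bit words. */
    uint32_t flags[16] = {0};
    STORM_pospopcnt_u16((const uint16_t*)a, (uint32_t)n_bytes, flags);
    printf("bit 1 set in %u of the 16-bit words\n", flags[1]);

    STORM_aligned_free(a);
    STORM_aligned_free(b);
    return 0;
}
```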
-------------------------------------------------------------------------------- /appveyor.yml: -------------------------------------------------------------------------------- 1 | # Automated Windows (MSVC++) testing using appveyor.com 2 | # https://ci.appveyor.com/projects 3 | 4 | version: 1.0.{build} 5 | 6 | branches: 7 | except: 8 | - gh-pages 9 | 10 | os: Visual Studio 2017 11 | 12 | platform: 13 | - x86 14 | - x64 15 | 16 | build_script: 17 | - if "%platform%" == "x86" cmake -G "Visual Studio 15 2017" . 18 | - if "%platform%" == "x64" cmake -G "Visual Studio 15 2017 Win64" . 19 | - cmake --build . --config Release 20 | 21 | test_script: 22 | - ps: C:\projects\libalgebra\Release\benchmark.exe -------------------------------------------------------------------------------- /benchmark.cpp: -------------------------------------------------------------------------------- 1 | #include "libalgebra.h" 2 | 3 | #include <cstdio> 4 | #include <iostream> 5 | #include <random> 6 | #include <chrono> 7 | #include <vector> 8 | #if !defined(_MSC_VER) 9 | #include "getopt.h" 10 | #endif 11 | 12 | uint64_t* generate_random_data(uint32_t n_bitmaps) { 13 | // Clear data 14 | // uint32_t n_bitmaps = ceil(n / 64.0); 15 | // memset(data, 0, sizeof(uint64_t)*n_bitmaps); 16 | uint64_t* mem = (uint64_t*)STORM_aligned_malloc(STORM_get_alignment(), n_bitmaps*sizeof(uint64_t)); 17 | 18 | // PRNG 19 | std::uniform_int_distribution<uint32_t> distr(0, std::numeric_limits<uint32_t>::max()-1); // right inclusive 20 | std::random_device rd; // obtain a random number from hardware 21 | std::mt19937 eng(rd()); // seed the generator 22 | 23 | // Generate some random data. 24 | uint32_t n_unique = 0; 25 | // while (n_unique < n) { 26 | for (uint32_t i = 0; i < n_bitmaps; ++i) { 27 | uint32_t val1 = distr(eng); 28 | uint32_t val2 = distr(eng); 29 | uint64_t x = ((uint64_t)val1 << 32) | val2; 30 | mem[i] = x; 31 | } 32 | 33 | return mem; 34 | } 35 | 36 | #if !defined(__clang__) && !defined(_MSC_VER) 37 | __attribute__((optimize("no-tree-vectorize"))) 38 | #endif 39 | uint64_t popcount_scalar_naive_nosimd(const uint8_t* data, size_t len) { 40 | uint64_t total = 0; 41 | // for (int i = 0; i < len; ++i) { 42 | // total += STORM_popcount64(data1[i] & data2[i]); 43 | // } 44 | // assert(len % 8 == 0); 45 | 46 | for (size_t j = 0; j < len; j += 8) { 47 | // total += STORM_popcount64(data[i]); 48 | // diff = data1[i] & data2[i]; 49 | total += STORM_popcnt_lookup8bit[data[j+0]]; 50 | total += STORM_popcnt_lookup8bit[data[j+1]]; 51 | total += STORM_popcnt_lookup8bit[data[j+2]]; 52 | total += STORM_popcnt_lookup8bit[data[j+3]]; 53 | total += STORM_popcnt_lookup8bit[data[j+4]]; 54 | total += STORM_popcnt_lookup8bit[data[j+5]]; 55 | total += STORM_popcnt_lookup8bit[data[j+6]]; 56 | total += STORM_popcnt_lookup8bit[data[j+7]]; 57 | } 58 | 59 | return total; 60 | } 61 | 62 | #ifdef __linux__ 63 | 64 | #include <asm/unistd.h> // for __NR_perf_event_open 65 | #include <linux/perf_event.h> // for perf event constants 66 | #include <sys/ioctl.h> // for ioctl 67 | #include <unistd.h> // for syscall 68 | #include <iostream> 69 | #include <cerrno> // for errno 70 | #include <cstring> // for memset 71 | #include <stdexcept> 72 | 73 | #include <vector> 74 | 75 | template <int TYPE = PERF_TYPE_HARDWARE> 76 | class LinuxEvents { 77 | int fd; 78 | bool working; 79 | perf_event_attr attribs; 80 | int num_events; 81 | std::vector<uint64_t> temp_result_vec; 82 | std::vector<uint64_t> ids; 83 | 84 | public: 85 | explicit LinuxEvents(std::vector<int> config_vec) : fd(0), working(true) { 86 | memset(&attribs, 0, sizeof(attribs)); 87 | attribs.type = TYPE; 88 | attribs.size = sizeof(attribs); 89 | attribs.disabled = 1; 90 | attribs.exclude_kernel = 1; 91 | attribs.exclude_hv = 1; 92 | 93 | attribs.sample_period = 0; 94 | attribs.read_format = PERF_FORMAT_GROUP | PERF_FORMAT_ID; 95 | const int pid = 0; // the current process 96 | const int cpu = -1; // all CPUs 97 | const unsigned long flags = 0; 98 | 99 | int group = -1; // no group 100 | num_events = config_vec.size(); ids.resize(num_events); // one id slot per event, written via ioctl below 101 | uint32_t i = 0; 102 | for (auto config : config_vec) { 103 | attribs.config = config; 104 | fd = syscall(__NR_perf_event_open, &attribs, pid, cpu, group, flags); 105 | if (fd == -1) { 106 | report_error("perf_event_open"); 107 | } 108 | ioctl(fd, PERF_EVENT_IOC_ID, &ids[i++]); 109 | if (group == -1) { 110 | group = fd; 111 | } 112 | } 113 | 114 | temp_result_vec.resize(num_events * 2 + 1); 115 | } 116 | 117 | ~LinuxEvents() { close(fd); } 118 | 119 | inline void start() { 120 | if (ioctl(fd, PERF_EVENT_IOC_RESET, PERF_IOC_FLAG_GROUP) == -1) { 121 | report_error("ioctl(PERF_EVENT_IOC_RESET)"); 122 | } 123 | 124 | if (ioctl(fd, PERF_EVENT_IOC_ENABLE, PERF_IOC_FLAG_GROUP) == -1) { 125 | report_error("ioctl(PERF_EVENT_IOC_ENABLE)"); 126 | } 127 | } 128 | 129 | inline void end(std::vector<unsigned long long> &results) { 130 | if (ioctl(fd, PERF_EVENT_IOC_DISABLE, PERF_IOC_FLAG_GROUP) == -1) { 131 | report_error("ioctl(PERF_EVENT_IOC_DISABLE)"); 132 | } 133 | 134 | if (read(fd, &temp_result_vec[0], temp_result_vec.size() * 8) == -1) { 135 | report_error("read"); 136 | } 137 | // our actual results are in slots 1,3,5, ... of this structure 138 | // we really should be checking our ids obtained earlier to be safe 139 | for (uint32_t i = 1; i < temp_result_vec.size(); i += 2) { 140 | results[i / 2] = temp_result_vec[i]; 141 | } 142 | } 143 | 144 | private: 145 | void report_error(const std::string &context) { 146 | if (working) 147 | std::cerr << (context + ": " + std::string(strerror(errno))) << std::endl; 148 | working = false; 149 | } 150 | }; 151 | 152 | std::vector<unsigned long long> 153 | compute_mins(std::vector< std::vector<unsigned long long> > allresults) { 154 | if (allresults.size() == 0) 155 | return std::vector<unsigned long long>(); 156 | 157 | std::vector<unsigned long long> answer = allresults[0]; 158 | 159 | for (size_t k = 1; k < allresults.size(); k++) { 160 | assert(allresults[k].size() == answer.size()); 161 | for (size_t z = 0; z < answer.size(); z++) { 162 | if (allresults[k][z] < answer[z]) 163 | answer[z] = allresults[k][z]; 164 | } 165 | } 166 | return answer; 167 | } 168 | 169 | std::vector<double> 170 | compute_averages(std::vector< std::vector<unsigned long long> > allresults) { 171 | if (allresults.size() == 0) 172 | return std::vector<double>(); 173 | 174 | std::vector<double> answer(allresults[0].size()); 175 | 176 | for (size_t k = 0; k < allresults.size(); k++) { 177 | assert(allresults[k].size() == answer.size()); 178 | for (size_t z = 0; z < answer.size(); z++) { 179 | answer[z] += allresults[k][z]; 180 | } 181 | } 182 | 183 | for (size_t z = 0; z < answer.size(); z++) { 184 | answer[z] /= allresults.size(); 185 | } 186 | return answer; 187 | } 188 | 189 | int linux_set_algebra_wrapper(std::string name, 190 | STORM_compute_func f, 191 | int iterations, 192 | uint32_t range, 193 | uint32_t n_values, 194 | uint32_t n_bitmaps, 195 | bool verbose) 196 | { 197 | std::vector<int> evts; 198 | evts.push_back(PERF_COUNT_HW_CPU_CYCLES); 199 | evts.push_back(PERF_COUNT_HW_INSTRUCTIONS); 200 | evts.push_back(PERF_COUNT_HW_BRANCH_MISSES); 201 | evts.push_back(PERF_COUNT_HW_CACHE_REFERENCES); 202 | evts.push_back(PERF_COUNT_HW_CACHE_MISSES); 203 | evts.push_back(PERF_COUNT_HW_REF_CPU_CYCLES); 204 | LinuxEvents<PERF_TYPE_HARDWARE> unified(evts); 205 | std::vector<unsigned long long> results; // tmp buffer 206 | std::vector< std::vector<unsigned long long> > allresults; 207 | results.resize(evts.size()); 208 | 209 | std::random_device rd; 210 | std::mt19937 gen(rd()); 211 | std::uniform_int_distribution<> dis(0, 0xFFFF); 212 | 213 | volatile uint64_t total = 0; // volatile to prevent the compiler from optimizing the work away 214 | for (int i = 0; i < iterations; i++) { 215 | uint64_t* mem1 = generate_random_data(n_values); 216 | uint64_t* mem2 = generate_random_data(n_values); 217 | 218 | unified.start(); 219 | // Call argument subroutine pointer. 220 | total += (*f)(mem1, mem2, n_bitmaps); 221 | unified.end(results); 222 | allresults.push_back(results); 223 | 224 | STORM_aligned_free(mem1); 225 | STORM_aligned_free(mem2); 226 | } 227 | 228 | std::vector<unsigned long long> mins = compute_mins(allresults); 229 | std::vector<double> avg = compute_averages(allresults); 230 | 231 | if (verbose) { 232 | printf("%s\t%u\t%.2f\t%.3f\t%.3f\t%llu\t%llu\t%llu\t%llu\t%llu\t%.1f\t%.1f\t%.1f\t%.1f\t%.1f\n", 233 | name.c_str(), 234 | n_bitmaps, 235 | double(mins[1]) / mins[0], 236 | double(mins[0]) / (2*n_bitmaps), 237 | double(mins[1]) / (2*n_bitmaps), 238 | mins[0], 239 | mins[1], 240 | mins[2], 241 | mins[3], 242 | mins[4], 243 | avg[0], 244 | avg[1], 245 | avg[2], 246 | avg[3], 247 | avg[4]); 248 | } else { 249 | printf("%s-%u:\n",name.c_str(),n_bitmaps); 250 | printf("instructions per cycle %4.2f, cycles per 64-bit word: %4.3f, " 251 | "instructions per 64-bit word %4.3f \n", 252 | double(mins[1]) / mins[0], double(mins[0]) / (2*n_bitmaps), double(mins[1]) / (2*n_bitmaps)); 253 | // first we display mins 254 | printf("min: %8llu cycles, %8llu instructions, \t%8llu branch mis., %8llu " 255 | "cache ref., %8llu cache mis.\n", 256 | mins[0], mins[1], mins[2], mins[3], mins[4]); 257 | printf("avg: %8.1f cycles, %8.1f instructions, \t%8.1f branch mis., %8.1f " 258 | "cache ref., %8.1f cache mis.\n", 259 | avg[0], avg[1], avg[2], avg[3], avg[4]); 260 | } 261 | 262 | return 1; 263 | } 264 | 265 | int linux_popcount_wrapper(std::string name, 266 | STORM_popcnt_func f, 267 | int iterations, 268 | uint32_t range, 269 | uint32_t n_values, 270 | uint32_t n_bitmaps, 271 | bool verbose) 272 | { 273 | std::vector<int> evts; 274 | evts.push_back(PERF_COUNT_HW_CPU_CYCLES); 275 | evts.push_back(PERF_COUNT_HW_INSTRUCTIONS); 276 | evts.push_back(PERF_COUNT_HW_BRANCH_MISSES); 277 | evts.push_back(PERF_COUNT_HW_CACHE_REFERENCES); 278 | evts.push_back(PERF_COUNT_HW_CACHE_MISSES); 279 | evts.push_back(PERF_COUNT_HW_REF_CPU_CYCLES); 280 | LinuxEvents<PERF_TYPE_HARDWARE> unified(evts); 281 | std::vector<unsigned long long> results; // tmp buffer 282 | std::vector< std::vector<unsigned long long> > allresults; 283 | results.resize(evts.size()); 284 | 285 | std::random_device rd; 286 | std::mt19937 gen(rd()); 287 | std::uniform_int_distribution<> dis(0, 0xFFFF); 288 | 289 | volatile uint64_t total = 0; // volatile to prevent the compiler from optimizing the work away 290 | for (int i = 0; i < iterations; i++) { 291 | uint64_t* mem1 = generate_random_data(n_values); 292 | 293 | unified.start(); 294 | // Call argument subroutine pointer. 295 | uint64_t a = (*f)((uint8_t*)mem1, n_bitmaps*8); 296 | unified.end(results); 297 | allresults.push_back(results); 298 | 299 | uint64_t b = popcount_scalar_naive_nosimd((uint8_t*)mem1, n_bitmaps*8); 300 | assert(a == b); 301 | total += a; 302 | 303 | STORM_aligned_free(mem1); 304 | } 305 | 306 | std::vector<unsigned long long> mins = compute_mins(allresults); 307 | std::vector<double> avg = compute_averages(allresults); 308 | 309 | if (verbose) { 310 | printf("%s\t%u\t%.2f\t%.3f\t%.3f\t%llu\t%llu\t%llu\t%llu\t%llu\t%.1f\t%.1f\t%.1f\t%.1f\t%.1f\n", 311 | name.c_str(), 312 | n_bitmaps, 313 | double(mins[1]) / mins[0], 314 | double(mins[0]) / (n_bitmaps), 315 | double(mins[1]) / (n_bitmaps), 316 | mins[0], 317 | mins[1], 318 | mins[2], 319 | mins[3], 320 | mins[4], 321 | avg[0], 322 | avg[1], 323 | avg[2], 324 | avg[3], 325 | avg[4]); 326 | } else { 327 | printf("%s-%u:\n",name.c_str(),n_bitmaps); 328 | printf("instructions per cycle %4.2f, cycles per 64-bit word: %4.3f, " 329 | "instructions per 64-bit word %4.3f \n", 330 | double(mins[1]) / mins[0], double(mins[0]) / (n_bitmaps), double(mins[1]) / (n_bitmaps)); 331 | // first we display mins 332 | printf("min: %8llu cycles, %8llu instructions, \t%8llu branch mis., %8llu " 333 | "cache ref., %8llu cache mis.\n", 334 | mins[0], mins[1], mins[2], mins[3], mins[4]); 335 | printf("avg: %8.1f cycles, %8.1f instructions, \t%8.1f branch mis., %8.1f " 336 | "cache ref., %8.1f cache mis.\n", 337 | avg[0], avg[1], avg[2], avg[3], avg[4]); 338 | } 339 | 340 | return 1; 341 | } 342 | #endif // end __linux__ 343 | 344 | struct bench_unit { 345 | bench_unit() : valid(false), cycles(0), cycles_local(0), times(0), times_local(0){} 346 | 347 | bool valid; 348 | float cycles; 349 | float cycles_local; 350 | uint64_t times; 351 | uint64_t times_local; 352 | }; 353 | 354 | uint64_t get_cpu_cycles() { 355 | uint64_t result; 356 | #ifndef _MSC_VER 357 | __asm__ volatile(".byte 15;.byte 49;shlq $32,%%rdx;orq %%rdx,%%rax":"=a" 358 | (result)::"%rdx"); 359 | #else 360 | result = __rdtsc(); 361 | #endif 362 | return result; 363 | } 364 | 365 | #if !defined(__clang__) && !defined(_MSC_VER) 366 | __attribute__((optimize("no-tree-vectorize"))) 367 | #endif 368 | uint64_t intersect_scalar_naive_nosimd(const uint64_t* STORM_RESTRICT data1, const uint64_t* STORM_RESTRICT data2, size_t len) { 369 | uint64_t total = 0; 370 | // for (int i = 0; i < len; ++i) { 371 | // total += STORM_popcount64(data1[i] & data2[i]); 372 | // } 373 | 374 | uint64_t diff; 375 | uint8_t* b8 = (uint8_t*)&diff; 376 | for (size_t i = 0; i < len; ++i) { 377 | // total += STORM_popcount64(data1[i] & data2[i]); 378 | diff = data1[i] & data2[i]; 379 | total += STORM_popcnt_lookup8bit[b8[0]]; 380 | total += STORM_popcnt_lookup8bit[b8[1]]; 381 | total += STORM_popcnt_lookup8bit[b8[2]]; 382 | total += STORM_popcnt_lookup8bit[b8[3]]; 383 | total += STORM_popcnt_lookup8bit[b8[4]]; 384 | total += STORM_popcnt_lookup8bit[b8[5]]; 385 | total += STORM_popcnt_lookup8bit[b8[6]]; 386 | total += STORM_popcnt_lookup8bit[b8[7]]; 387 | } 388 | 389 | return total; 390 | } 391 | 392 | #if !defined(__clang__) && !defined(_MSC_VER) 393 | __attribute__((optimize("no-tree-vectorize"))) 394 | #endif 395 | uint64_t union_scalar_naive_nosimd(const uint64_t* STORM_RESTRICT data1, const uint64_t* STORM_RESTRICT data2, size_t len) { 396 | uint64_t total = 0; 397 | // for (int i = 0; i < len; ++i) { 398 | // total += STORM_popcount64(data1[i] | data2[i]); 399 | // } 400 | 401 | uint64_t diff; 402 | uint8_t* b8 = (uint8_t*)&diff; 403 | for (size_t i = 0; i < len; ++i) { 404 | // total += STORM_popcount64(data1[i] | data2[i]); 405 | diff = data1[i] | data2[i]; 406 | total += STORM_popcnt_lookup8bit[b8[0]]; 407 | total += STORM_popcnt_lookup8bit[b8[1]]; 408 | total += STORM_popcnt_lookup8bit[b8[2]]; 409 | total += STORM_popcnt_lookup8bit[b8[3]]; 410 | total += STORM_popcnt_lookup8bit[b8[4]]; 411 | total += STORM_popcnt_lookup8bit[b8[5]]; 412 | total += STORM_popcnt_lookup8bit[b8[6]]; 413 | total += STORM_popcnt_lookup8bit[b8[7]]; 414 | } 415 | 416 | return total; 417 | } 418 | 419 | #if !defined(__clang__) && !defined(_MSC_VER) 420 | __attribute__((optimize("no-tree-vectorize"))) 421 | #endif 422 | uint64_t diff_scalar_naive_nosimd(const uint64_t* STORM_RESTRICT data1, const uint64_t* STORM_RESTRICT data2, size_t len) { 423 | uint64_t total = 0; 424 | // for (int i = 0; i < len; ++i) { 425 | // total += STORM_popcount64(data1[i] ^ data2[i]); 426 | // } 427 | 428 | uint64_t diff; 429 | uint8_t* b8 = (uint8_t*)&diff; 430 | for (size_t i = 0; i < len; ++i) { 431 | // total += STORM_popcount64(data1[i] ^ data2[i]); 432 | diff = data1[i] ^ data2[i]; 433 | total += STORM_popcnt_lookup8bit[b8[0]]; 434 | total += STORM_popcnt_lookup8bit[b8[1]]; 435 | total += STORM_popcnt_lookup8bit[b8[2]]; 436 | total += STORM_popcnt_lookup8bit[b8[3]]; 437 | total += STORM_popcnt_lookup8bit[b8[4]]; 438 | total += STORM_popcnt_lookup8bit[b8[5]]; 439 | total += STORM_popcnt_lookup8bit[b8[6]]; 440 | total += STORM_popcnt_lookup8bit[b8[7]]; 441 | } 442 | 443 | return total; 444 | } 445 | 446 | // Definition for the high-resolution timer. 447 | typedef std::chrono::high_resolution_clock::time_point clockdef; 448 | 449 | int set_algebra_wrapper(std::string name, 450 | STORM_compute_func f, 451 | int iterations, 452 | uint32_t range, 453 | uint32_t n_values, 454 | size_t n_bitmaps, 455 | bench_unit& unit) 456 | { 457 | uint32_t cycles_low = 0, cycles_high = 0; 458 | uint32_t cycles_low1 = 0, cycles_high1 = 0; 459 | // Start timer. 460 | 461 | std::vector<uint64_t> clocks; 462 | std::vector<uint32_t> times; 463 | 464 | #ifndef _MSC_VER 465 | // Warm up the serialized RDTSC/RDTSCP sequence, per the Intel guide: 466 | // @see: https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/ia-32-ia-64-benchmark-code-execution-paper.pdf 467 | asm volatile ("CPUID\n\t" 468 | "RDTSC\n\t" 469 | "mov %%edx, %0\n\t" 470 | "mov %%eax, %1\n\t": "=r" (cycles_high), "=r" (cycles_low):: "%rax", "%rbx", "%rcx", "%rdx"); 471 | asm volatile("RDTSCP\n\t" 472 | "mov %%edx, %0\n\t" 473 | "mov %%eax, %1\n\t" 474 | "CPUID\n\t": "=r" (cycles_high1), "=r" (cycles_low1):: "%rax", "%rbx", "%rcx", "%rdx"); 475 | asm volatile ("CPUID\n\t" 476 | "RDTSC\n\t" 477 | "mov %%edx, %0\n\t" 478 | "mov %%eax, %1\n\t": "=r" (cycles_high), "=r" (cycles_low):: "%rax", "%rbx", "%rcx", "%rdx"); 479 | asm volatile("RDTSCP\n\t" 480 | "mov %%edx, %0\n\t" 481 | "mov %%eax, %1\n\t" 482 | "CPUID\n\t": "=r" (cycles_high1), "=r" (cycles_low1):: "%rax", "%rbx", "%rcx", "%rdx"); 483 | #endif 484 | uint64_t* mem1 = generate_random_data(n_values); 485 | uint64_t* mem2 = generate_random_data(n_values); 486 | 487 | volatile uint64_t total = 0; // volatile to prevent the compiler from optimizing the work away 488 | clockdef t1 = std::chrono::high_resolution_clock::now(); 489 | 490 | #ifdef __linux__ 491 | // unsigned long flags; 492 | // preempt_disable(); /*we disable preemption on our CPU*/ 493 | // raw_local_irq_save(flags); /*we disable hard interrupts on our CPU*/ 494 | /*at this stage we exclusively own the CPU*/ 495 | #endif 496 | 497 | #ifndef _MSC_VER 498 | asm volatile ("CPUID\n\t" 499 | "RDTSC\n\t" 500 | "mov %%edx, %0\n\t" 501 | "mov %%eax, %1\n\t": "=r" (cycles_high), "=r" (cycles_low):: "%rax", "%rbx", "%rcx", "%rdx"); 502 | #endif 503 | 504 | for (int i = 0; i < iterations; ++i) { 505 | // Call argument subroutine pointer. 506 | total += (*f)(mem1, mem2, n_bitmaps); 507 | } 508 | 509 | #ifndef _MSC_VER 510 | asm volatile("RDTSCP\n\t" 511 | "mov %%edx, %0\n\t" 512 | "mov %%eax, %1\n\t" 513 | "CPUID\n\t": "=r" (cycles_high1), "=r" (cycles_low1):: "%rax", "%rbx", "%rcx", "%rdx"); 514 | #endif 515 | #ifdef __linux__ 516 | // raw_local_irq_restore(flags);/*we enable hard interrupts on our CPU*/ 517 | // preempt_enable();/*we enable preemption*/ 518 | #endif 519 | 520 | clockdef t2 = std::chrono::high_resolution_clock::now(); 521 | auto time_span = std::chrono::duration_cast<std::chrono::nanoseconds>(t2 - t1); 522 | 523 | STORM_aligned_free(mem1); 524 | STORM_aligned_free(mem2); 525 | 526 | uint64_t start = ( ((uint64_t)cycles_high << 32) | cycles_low ); 527 | uint64_t end = ( ((uint64_t)cycles_high1 << 32) | cycles_low1 ); 528 | 529 | double mean_cycles = (end - start) / (double)iterations; 530 | uint32_t mean_time = time_span.count() / (double)iterations; 531 | 532 | std::cout << name << "\t" << n_bitmaps << "\t" << total << "\t" << 533 | mean_cycles << "\t" << 534 | mean_time << "\t" << 535 | mean_cycles / n_bitmaps << "\t" << 536 | ((n_bitmaps*2*sizeof(uint64_t)) / (1024*1024.0)) / (mean_time / 1000000000.0) << std::endl; 537 | 538 | unit.times += mean_time; 539 | unit.times_local = mean_time; 540 | unit.cycles += mean_cycles; 541 | unit.cycles_local = mean_cycles; 542 | unit.valid = 1; 543 | 544 | return 0; 545 | } 546 | 547 | int popcount_wrapper(std::string name, 548 | STORM_popcnt_func f, 549 | int iterations, 550 | uint32_t range, 551 | uint32_t n_values, 552 | uint32_t n_bitmaps, 553 | bench_unit& unit) 554 | { 555 | uint32_t cycles_low = 0, cycles_high = 0; 556 | uint32_t cycles_low1 = 0, cycles_high1 = 0; 557 | // Start timer. 558 | 559 | std::vector<uint64_t> clocks; 560 | std::vector<uint32_t> times; 561 | 562 | #ifndef _MSC_VER 563 | // Warm up the serialized RDTSC/RDTSCP sequence, per the Intel guide: 564 | // @see: https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/ia-32-ia-64-benchmark-code-execution-paper.pdf 565 | asm volatile ("CPUID\n\t" 566 | "RDTSC\n\t" 567 | "mov %%edx, %0\n\t" 568 | "mov %%eax, %1\n\t": "=r" (cycles_high), "=r" (cycles_low):: "%rax", "%rbx", "%rcx", "%rdx"); 569 | asm volatile("RDTSCP\n\t" 570 | "mov %%edx, %0\n\t" 571 | "mov %%eax, %1\n\t" 572 | "CPUID\n\t": "=r" (cycles_high1), "=r" (cycles_low1):: "%rax", "%rbx", "%rcx", "%rdx"); 573 | asm volatile ("CPUID\n\t" 574 | "RDTSC\n\t" 575 | "mov %%edx, %0\n\t" 576 | "mov %%eax, %1\n\t": "=r" (cycles_high), "=r" (cycles_low):: "%rax", "%rbx", "%rcx", "%rdx"); 577 | asm volatile("RDTSCP\n\t" 578 | "mov %%edx, %0\n\t" 579 | "mov %%eax, %1\n\t" 580 | "CPUID\n\t": "=r" (cycles_high1), "=r" (cycles_low1):: "%rax", "%rbx", "%rcx", "%rdx"); 581 | #endif 582 | uint64_t* mem = generate_random_data(n_values); 583 | 584 | volatile uint64_t total = 0; // volatile to prevent the compiler from optimizing the work away 585 | clockdef t1 = std::chrono::high_resolution_clock::now(); 586 | 587 | #ifdef __linux__ 588 | // unsigned long flags; 589 | // preempt_disable(); /*we disable preemption on our CPU*/ 590 | // raw_local_irq_save(flags); /*we disable hard interrupts on our CPU*/ 591 | /*at this stage we exclusively own the CPU*/ 592 | #endif 593 | 594 | #ifndef _MSC_VER 595 | asm volatile ("CPUID\n\t" 596 | "RDTSC\n\t" 597 | "mov %%edx, %0\n\t" 598 | "mov %%eax, %1\n\t": "=r" (cycles_high), "=r" (cycles_low):: "%rax", "%rbx", "%rcx", "%rdx"); 599 | #endif 600 | 601 | size_t n_b = n_bitmaps*8; 602 | for (int i = 0; i < iterations; ++i) { 603 | // Call argument subroutine pointer. 604 | total += (*f)((uint8_t*)mem, n_b); 605 | } 606 | 607 | #ifndef _MSC_VER 608 | asm volatile("RDTSCP\n\t" 609 | "mov %%edx, %0\n\t" 610 | "mov %%eax, %1\n\t" 611 | "CPUID\n\t": "=r" (cycles_high1), "=r" (cycles_low1):: "%rax", "%rbx", "%rcx", "%rdx"); 612 | #endif 613 | #ifdef __linux__ 614 | // raw_local_irq_restore(flags);/*we enable hard interrupts on our CPU*/ 615 | // preempt_enable();/*we enable preemption*/ 616 | #endif 617 | 618 | clockdef t2 = std::chrono::high_resolution_clock::now(); 619 | auto time_span = std::chrono::duration_cast<std::chrono::nanoseconds>(t2 - t1); 620 | 621 | STORM_aligned_free(mem); 622 | 623 | uint64_t start = ( ((uint64_t)cycles_high << 32) | cycles_low ); 624 | uint64_t end = ( ((uint64_t)cycles_high1 << 32) | cycles_low1 ); 625 | 626 | double mean_cycles = (end - start) / (double)iterations; 627 | uint32_t mean_time = time_span.count() / (double)iterations; 628 | 629 | std::cout << name << "\t" << n_bitmaps << "\t" << total << "\t" << 630 | mean_cycles << "\t" << 631 | mean_time << "\t" << 632 | mean_cycles / n_bitmaps << "\t" << 633 | ((n_bitmaps*sizeof(uint64_t)) / (1024*1024.0)) / (mean_time / 1000000000.0) << std::endl; // single input array 634 | 635 | unit.times += mean_time; 636 | unit.times_local = mean_time; 637 | unit.cycles += mean_cycles; 638 | unit.cycles_local = mean_cycles; 639 | unit.valid = 1; 640 | 641 | return 0; 642 | } 643 | 644 | int benchmark(int n_repetitions, bool use_perf = false) { 645 | // Align some bitmaps. 646 | uint64_t* bitmaps = (uint64_t*)STORM_aligned_malloc(STORM_get_alignment(), 1048576*sizeof(uint64_t)); 647 | uint64_t* bitmaps2 = (uint64_t*)STORM_aligned_malloc(STORM_get_alignment(), 1048576*sizeof(uint64_t)); 648 | 649 | std::vector<uint32_t> ranges = {4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768,65536,131072,262144,524288,1048576}; 650 | std::vector<int> reps; 651 | if (n_repetitions <= 0) { 652 | reps = {5000,5000,5000,5000,5000,2500,2500,2500,2500,2500,150,150,150,150,150,150,150,100,100}; // one entry per range 653 | } else { 654 | reps = std::vector<int>(ranges.size(), n_repetitions); 655 | } 656 | 657 | if (use_perf) { 658 | #ifndef __linux__ 659 | std::cerr << "perf counters are only available on Linux systems!" << std::endl; 660 | exit(EXIT_FAILURE); 661 | #endif 662 | printf("Algorithm\tWords\tInstructions/cycle\tCycles/word\tInstructions/word\tMinCycles\tMinInstructions\tMinBranchMiss\tMinCacheRef\tMinCacheMiss\tAvgCycles\tAvgInstructions\tAvgBranchMiss\tAvgCacheRef\tAvgCacheMiss\n"); 663 | } 664 | 665 | 666 | for (size_t i = 0; i < ranges.size(); ++i) { 667 | bench_unit unit_intsec, unit_union, unit_diff; 668 | 669 | if (use_perf) { 670 | #ifdef __linux__ 671 | linux_popcount_wrapper("popcount-naive",&popcount_scalar_naive_nosimd, reps[i], ranges[i], ranges[i], ranges[i], true); 672 | linux_popcount_wrapper("popcount",&STORM_popcnt, reps[i], ranges[i], ranges[i], ranges[i], true); 673 | linux_set_algebra_wrapper("intersect-naive",&intersect_scalar_naive_nosimd, reps[i], ranges[i], ranges[i], ranges[i], true); 674 | linux_set_algebra_wrapper("intersect",STORM_get_intersect_count_func(ranges[i]), reps[i], ranges[i], ranges[i], ranges[i], true); 675 | linux_set_algebra_wrapper("union-naive",&union_scalar_naive_nosimd, reps[i], ranges[i], ranges[i], ranges[i], true); 676 | linux_set_algebra_wrapper("union",STORM_get_union_count_func(ranges[i]), reps[i], ranges[i], ranges[i], ranges[i], true); 677 | linux_set_algebra_wrapper("diff-naive",&diff_scalar_naive_nosimd, reps[i], ranges[i], ranges[i], ranges[i], true); 678 | linux_set_algebra_wrapper("diff",STORM_get_diff_count_func(ranges[i]), reps[i], ranges[i], ranges[i], ranges[i], true); 679 | #else 680 | std::cerr << "perf counters are only available on Linux systems!" << std::endl; 681 | exit(EXIT_FAILURE); 682 | #endif 683 | } else { 684 | popcount_wrapper("popcount-naive",&popcount_scalar_naive_nosimd, reps[i], ranges[i], ranges[i], ranges[i], unit_intsec); 685 | popcount_wrapper("popcount",&STORM_popcnt, reps[i], ranges[i], ranges[i], ranges[i], unit_intsec); 686 | set_algebra_wrapper("intersect-naive",&intersect_scalar_naive_nosimd, reps[i], ranges[i], ranges[i], ranges[i], unit_intsec); 687 | set_algebra_wrapper("intersect",STORM_get_intersect_count_func(ranges[i]), reps[i], ranges[i], ranges[i], ranges[i], unit_intsec); 688 | set_algebra_wrapper("union-naive",&union_scalar_naive_nosimd, reps[i], ranges[i], ranges[i], ranges[i], unit_intsec); 689 | set_algebra_wrapper("union",STORM_get_union_count_func(ranges[i]), reps[i], ranges[i], ranges[i], ranges[i], unit_union); 690 | set_algebra_wrapper("diff-naive",&diff_scalar_naive_nosimd, reps[i], ranges[i], ranges[i], ranges[i], unit_intsec); 691 | set_algebra_wrapper("diff",STORM_get_diff_count_func(ranges[i]), reps[i], ranges[i], ranges[i], ranges[i], unit_diff); 692 | } 693 | } 694 | 695 | // Clean up.
696 | STORM_aligned_free(bitmaps); 697 | STORM_aligned_free(bitmaps2); 698 | 699 | return 1; 700 | } 701 | 702 | int main(int argc, char **argv) { 703 | #if !defined(_MSC_VER) 704 | bool verbose = false; 705 | bool perf_subsystem = false; 706 | int c; 707 | int n_repetitions = -1; 708 | 709 | while ((c = getopt(argc, argv, "vpr:")) != -1) { 710 | switch (c) { 711 | case 'r': 712 | n_repetitions = atoi(optarg); 713 | break; 714 | case 'v': 715 | verbose = true; 716 | break; 717 | case 'p': 718 | perf_subsystem = true; 719 | break; 720 | default: 721 | abort(); 722 | } 723 | } 724 | 725 | benchmark(n_repetitions, perf_subsystem); 726 | #else 727 | int n_repetitions = -1; 728 | if (argc > 1) { 729 | n_repetitions = std::atoi(argv[1]); 730 | } 731 | benchmark(n_repetitions, false); 732 | #endif 733 | 734 | return EXIT_SUCCESS; 735 | } -------------------------------------------------------------------------------- /libalgebra.h: -------------------------------------------------------------------------------- 1 | // License for libalgebra.h 2 | /* 3 | * Copyright (c) 2019 Marcus D. R. Klarqvist 4 | * Author(s): Marcus D. R. Klarqvist 5 | * 6 | * Licensed under the Apache License, Version 2.0 (the "License"); 7 | * you may not use this file except in compliance with the License. 8 | * You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, 13 | * software distributed under the License is distributed on an 14 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | * KIND, either express or implied. See the License for the 16 | * specific language governing permissions and limitations 17 | * under the License. 18 | */ 19 | // License for pospopcnt.h 20 | /* 21 | * Copyright (c) 2019 22 | * Author(s): Marcus D. R. Klarqvist, Wojciech Muła, and Daniel Lemire 23 | * 24 | * Licensed under the Apache License, Version 2.0 (the "License"); 25 | * you may not use this file except in compliance with the License. 26 | * You may obtain a copy of the License at 27 | * 28 | * http://www.apache.org/licenses/LICENSE-2.0 29 | * 30 | * Unless required by applicable law or agreed to in writing, 31 | * software distributed under the License is distributed on an 32 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 33 | * KIND, either express or implied. See the License for the 34 | * specific language governing permissions and limitations 35 | * under the License. 36 | */ 37 | // License for libpopcnt.h 38 | /* 39 | * libpopcnt.h - C/C++ library for counting the number of 1 bits (bit 40 | * population count) in an array as quickly as possible using 41 | * specialized CPU instructions i.e. POPCNT, AVX2, AVX512, NEON. 42 | * 43 | * Copyright (c) 2016 - 2018, Kim Walisch 44 | * Copyright (c) 2016 - 2018, Wojciech Muła 45 | * 46 | * All rights reserved. 47 | * 48 | * Redistribution and use in source and binary forms, with or without 49 | * modification, are permitted provided that the following conditions are met: 50 | * 51 | * 1. Redistributions of source code must retain the above copyright notice, this 52 | * list of conditions and the following disclaimer. 53 | * 2. Redistributions in binary form must reproduce the above copyright notice, 54 | * this list of conditions and the following disclaimer in the documentation 55 | * and/or other materials provided with the distribution.
56 | * 57 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 58 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 59 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 60 | * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 61 | * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 62 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 63 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 64 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 65 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 66 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 67 | */ 68 | #ifndef LIBALGEBRA_H_8723467365934 69 | #define LIBALGEBRA_H_8723467365934 70 | 71 | /* ************************************* 72 | * Includes 73 | ***************************************/ 74 | #include 75 | #include 76 | #include 77 | #include 78 | #include 79 | 80 | /* ************************************* 81 | * Safety 82 | ***************************************/ 83 | 84 | #if !(defined(__APPLE__)) && !(defined(__FreeBSD__)) 85 | #include // this should never be needed but there are some reports that it is needed. 86 | #endif 87 | 88 | #if defined(__SIZEOF_LONG_LONG__) && __SIZEOF_LONG_LONG__ != 8 89 | #error This code assumes 64-bit long longs (by use of the GCC intrinsics). Your system is not currently supported. 90 | #endif 91 | 92 | /**************************** 93 | * Memory management 94 | * 95 | * The subroutines aligned_malloc and aligned_free had to be renamed to 96 | * STORM_aligned_malloc and STORM_aligned_free to prevent clashing with the 97 | * same subroutines in Roaring. These subroutines are included here 98 | * since there is no hard dependency on using Roaring bitmaps. 99 | * 100 | * These subroutines and definitions are taken from the CRoaring repo 101 | * by Daniel Lemire et al. available under the Apache 2.0 License 102 | * (same as libalgebra.h): 103 | * https://github.com/RoaringBitmap/CRoaring/ 104 | ****************************/ 105 | // portable version of posix_memalign 106 | #ifndef _MSC_VER 107 | #include 108 | #endif 109 | 110 | #ifndef STORM_aligned_malloc 111 | static 112 | void* STORM_aligned_malloc(size_t alignment, size_t size) { 113 | void *p; 114 | #ifdef _MSC_VER 115 | p = _aligned_malloc(size, alignment); 116 | #elif defined(__MINGW32__) || defined(__MINGW64__) 117 | p = __mingw_aligned_malloc(size, alignment); 118 | #else 119 | // somehow, if this is used before including "x86intrin.h", it creates an 120 | // implicit defined warning. 
121 |     if (posix_memalign(&p, alignment, size) != 0)
122 |         return NULL;
123 | #endif
124 |     return p;
125 | }
126 | #endif
127 | 
128 | #ifndef STORM_aligned_free
129 | static
130 | void STORM_aligned_free(void* memblock) {
131 | #ifdef _MSC_VER
132 |     _aligned_free(memblock);
133 | #elif defined(__MINGW32__) || defined(__MINGW64__)
134 |     __mingw_aligned_free(memblock);
135 | #else
136 |     free(memblock);
137 | #endif
138 | }
139 | #endif
140 | 
141 | // portable alignment
142 | #if defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L)   /* C11+ */
143 | #  include <stdalign.h>
144 | #  define STORM_ALIGN(n)      alignas(n)
145 | #elif defined(__GNUC__)
146 | #  define STORM_ALIGN(n)      __attribute__ ((aligned(n)))
147 | #elif defined(_MSC_VER)
148 | #  define STORM_ALIGN(n)      __declspec(align(n))
149 | #else
150 | #  define STORM_ALIGN(n)   /* disabled */
151 | #endif
152 | 
153 | /* *************************************
154 | *  Compiler Specific Options
155 | ***************************************/
156 | // Taken from XXHASH
157 | #ifdef _MSC_VER    /* Visual Studio */
158 | #  pragma warning(disable : 4127) /* disable: C4127: conditional expression is constant */
159 | #  define STORM_FORCE_INLINE static __forceinline
160 | #  define STORM_NO_INLINE static __declspec(noinline)
161 | #else
162 | #  if defined (__cplusplus) || defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L   /* C99 */
163 | #    ifdef __GNUC__
164 | #      define STORM_FORCE_INLINE static inline __attribute__((always_inline))
165 | #      define STORM_NO_INLINE static __attribute__((noinline))
166 | #    else
167 | #      define STORM_FORCE_INLINE static inline
168 | #      define STORM_NO_INLINE static
169 | #    endif
170 | #  else
171 | #    define STORM_FORCE_INLINE static
172 | #    define STORM_NO_INLINE static
173 | #  endif /* __STDC_VERSION__ */
174 | #endif
175 | 
176 | /****************************
177 | *  General checks
178 | ****************************/
179 | 
180 | #ifndef __has_builtin
181 | #define STORM_HAS_BUILTIN(x) 0
182 | #else
183 | #define STORM_HAS_BUILTIN(x) __has_builtin(x)
184 | #endif
185 | 
186 | #ifndef __has_attribute
187 | #define STORM_HAS_ATTRIBUTE(x) 0
188 | #else
189 | #define STORM_HAS_ATTRIBUTE(x) __has_attribute(x)
190 | #endif
191 | 
192 | // disable noise
193 | #ifdef __GNUC__
194 | #define STORM_WARN_UNUSED __attribute__((warn_unused_result))
195 | #else
196 | #define STORM_WARN_UNUSED
197 | #endif
198 | 
199 | #if defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L   /* >= C99 */
200 | #  define STORM_RESTRICT   restrict
201 | #else
202 | /* note : it might be useful to define __restrict or STORM_RESTRICT for some C++ compilers */
203 | #  define STORM_RESTRICT   /* disable */
204 | #endif
205 | 
206 | #ifdef __GNUC__
207 |   #define GNUC_PREREQ(x, y) \
208 |       (__GNUC__ > x || (__GNUC__ == x && __GNUC_MINOR__ >= y))
209 | #else
210 |   #define GNUC_PREREQ(x, y) 0
211 | #endif
212 | 
213 | #ifdef __clang__
214 |   #define CLANG_PREREQ(x, y) \
215 |       (__clang_major__ > x || (__clang_major__ == x && __clang_minor__ >= y))
216 | #else
217 |   #define CLANG_PREREQ(x, y) 0
218 | #endif
219 | 
220 | #if (defined(__i386__) || \
221 |      defined(__x86_64__) || \
222 |      defined(_M_IX86) || \
223 |      defined(_M_X64))
224 |   #define X86_OR_X64
225 | #endif
226 | 
227 | #if defined(X86_OR_X64) && \
228 |    (defined(__cplusplus) || \
229 |     defined(_MSC_VER) || \
230 |    (GNUC_PREREQ(4, 2) || \
231 |     STORM_HAS_BUILTIN(__sync_val_compare_and_swap)))
232 |   #define STORM_HAVE_CPUID
233 | #endif
234 | 
235 | #if GNUC_PREREQ(4, 2) || \
236 |     STORM_HAS_BUILTIN(__builtin_popcount)
237 |   #define STORM_HAVE_BUILTIN_POPCOUNT
238 | #endif
239 | 
240 | #if GNUC_PREREQ(4, 2) || \
241 |     CLANG_PREREQ(3, 0)
242 |   #define STORM_HAVE_ASM_POPCNT
243 | #endif
244 | 
245 | #if defined(STORM_HAVE_CPUID) && \
246 |    (defined(STORM_HAVE_ASM_POPCNT) || \
247 |     defined(_MSC_VER))
248 |   #define STORM_HAVE_POPCNT
249 | #endif
250 | 
251 | #if defined(STORM_HAVE_CPUID) && \
252 |     GNUC_PREREQ(4, 9)
253 |   #define STORM_HAVE_SSE42
254 |   #define STORM_HAVE_AVX2
255 | #endif
256 | 
257 | #if defined(STORM_HAVE_CPUID) && \
258 |     GNUC_PREREQ(5, 0)
259 |   #define STORM_HAVE_AVX512
260 | #endif
261 | 
262 | #if defined(STORM_HAVE_CPUID) && \
263 |     defined(_MSC_VER) && \
264 |     defined(__AVX2__)
265 |   #define STORM_HAVE_SSE42
266 |   #define STORM_HAVE_AVX2
267 | #endif
268 | 
269 | #if defined(STORM_HAVE_CPUID) && \
270 |     defined(_MSC_VER) && \
271 |     defined(__AVX512__)
272 |   #define STORM_HAVE_AVX512
273 | #endif
274 | 
275 | #if defined(STORM_HAVE_CPUID) && \
276 |     CLANG_PREREQ(3, 8) && \
277 |     STORM_HAS_ATTRIBUTE(target) && \
278 |    (!defined(_MSC_VER) || defined(__AVX2__)) && \
279 |    (!defined(__apple_build_version__) || __apple_build_version__ >= 8000000)
280 |   #define STORM_HAVE_SSE42
281 |   #define STORM_HAVE_AVX2
282 |   #define STORM_HAVE_AVX512
283 | #endif
284 | 
285 | // Target attribute
286 | #if !defined(_MSC_VER)
287 | #define STORM_TARGET(x) __attribute__ ((target (x)))
288 | #else
289 | #define STORM_TARGET(x) /* expands to nothing: MSVC has no per-function target attribute */
290 | #endif
291 | 
292 | 
293 | /****************************
294 | *  CPUID and SIMD
295 | ****************************/
296 | 
297 | #define STORM_SSE_ALIGNMENT    16
298 | #define STORM_AVX2_ALIGNMENT   32
299 | #define STORM_AVX512_ALIGNMENT 64
300 | 
301 | #ifdef __cplusplus
302 | extern "C" {
303 | #endif
304 | 
305 | #if defined(STORM_HAVE_CPUID)
306 | 
307 | #if defined(_MSC_VER)
308 |   #include <intrin.h>
309 |   #include <immintrin.h>
310 | #endif
311 | 
312 | // CPUID flags. See https://en.wikipedia.org/wiki/CPUID for more info.
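// Minimal usage sketch (illustrative only, not part of the library): probe the
// CPU once at startup and branch on the flag bits defined below, e.g.
//
//     int cpu = STORM_get_cpuid();
//     if (cpu & STORM_CPUID_runtime_bit_AVX2)       { /* AVX2 kernels are safe */ }
//     else if (cpu & STORM_CPUID_runtime_bit_SSE42) { /* fall back to SSE4.2  */ }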
313 | /* %ecx bit flags */
314 | #define STORM_CPUID_runtime_bit_POPCNT   (1 << 23) // POPCNT instruction
315 | #define STORM_CPUID_runtime_bit_SSE41    (1 << 19) // CPUID.01H:ECX.SSE41[Bit 19]
316 | #define STORM_CPUID_runtime_bit_SSE42    (1 << 20) // CPUID.01H:ECX.SSE42[Bit 20]
317 | 
318 | /* %ebx bit flags */
319 | #define STORM_CPUID_runtime_bit_AVX2     (1 << 5)  // CPUID.(EAX=07H, ECX=0H):EBX.AVX2[bit 5]
320 | #define STORM_CPUID_runtime_bit_AVX512BW (1 << 30) // AVX-512 Byte and Word Instructions
321 | 
322 | /* xgetbv bit flags */
323 | #define STORM_XSTATE_SSE (1 << 1)
324 | #define STORM_XSTATE_YMM (1 << 2)
325 | #define STORM_XSTATE_ZMM (7 << 5)
326 | 
327 | static
328 | void STORM_run_cpuid(int eax, int ecx, int* abcd) {
329 | #if defined(_MSC_VER)
330 |     __cpuidex(abcd, eax, ecx);
331 | #else
332 |     int ebx = 0;
333 |     int edx = 0;
334 | 
335 | #if defined(__i386__) && \
336 |     defined(__PIC__)
337 |     /* in case of PIC under 32-bit EBX cannot be clobbered */
338 |     __asm__ ("movl %%ebx, %%edi;"
339 |              "cpuid;"
340 |              "xchgl %%ebx, %%edi;"
341 |              : "=D" (ebx),
342 |                "+a" (eax),
343 |                "+c" (ecx),
344 |                "=d" (edx));
345 | #else
346 |     __asm__ ("cpuid;"
347 |              : "+b" (ebx),
348 |                "+a" (eax),
349 |                "+c" (ecx),
350 |                "=d" (edx));
351 | #endif
352 | 
353 |     abcd[0] = eax;
354 |     abcd[1] = ebx;
355 |     abcd[2] = ecx;
356 |     abcd[3] = edx;
357 | #endif
358 | }
359 | 
360 | #if defined(STORM_HAVE_AVX2) || \
361 |     defined(STORM_HAVE_AVX512)
362 | 
363 | static
364 | int STORM_get_xcr0() {
365 |     int xcr0;
366 | 
367 | #if defined(_MSC_VER)
368 |     xcr0 = (int) _xgetbv(0);
369 | #else
370 |     __asm__ ("xgetbv" : "=a" (xcr0) : "c" (0) : "%edx" );
371 | #endif
372 | 
373 |     return xcr0;
374 | }
375 | 
376 | #endif
377 | 
378 | static
379 | int STORM_get_cpuid() {
380 |     int flags = 0;
381 |     int abcd[4];
382 | 
383 |     STORM_run_cpuid(1, 0, abcd);
384 | 
385 |     // Check for POPCNT instruction
386 |     if ((abcd[2] & STORM_CPUID_runtime_bit_POPCNT) == STORM_CPUID_runtime_bit_POPCNT)
387 |         flags |= STORM_CPUID_runtime_bit_POPCNT;
388 | 
389 |     // Check for SSE4.1 instruction set
390 |     if ((abcd[2] & STORM_CPUID_runtime_bit_SSE41) == STORM_CPUID_runtime_bit_SSE41)
391 |         flags |= STORM_CPUID_runtime_bit_SSE41;
392 | 
393 |     // Check for SSE4.2 instruction set
394 |     if ((abcd[2] & STORM_CPUID_runtime_bit_SSE42) == STORM_CPUID_runtime_bit_SSE42)
395 |         flags |= STORM_CPUID_runtime_bit_SSE42;
396 | 
397 | #if defined(STORM_HAVE_AVX2) || \
398 |     defined(STORM_HAVE_AVX512)
399 | 
400 |     int osxsave_mask = (1 << 27);
401 | 
402 |     /* ensure OS supports extended processor state management */
403 |     if ((abcd[2] & osxsave_mask) != osxsave_mask)
404 |         return 0;
405 | 
406 |     int ymm_mask = STORM_XSTATE_SSE | STORM_XSTATE_YMM;
407 |     int zmm_mask = STORM_XSTATE_SSE | STORM_XSTATE_YMM | STORM_XSTATE_ZMM;
408 | 
409 |     int xcr0 = STORM_get_xcr0();
410 | 
411 |     if ((xcr0 & ymm_mask) == ymm_mask) {
412 |         STORM_run_cpuid(7, 0, abcd);
413 | 
414 |         if ((abcd[1] & STORM_CPUID_runtime_bit_AVX2) == STORM_CPUID_runtime_bit_AVX2)
415 |             flags |= STORM_CPUID_runtime_bit_AVX2;
416 | 
417 |         if ((xcr0 & zmm_mask) == zmm_mask) {
418 |             if ((abcd[1] & STORM_CPUID_runtime_bit_AVX512BW) == STORM_CPUID_runtime_bit_AVX512BW)
419 |                 flags |= STORM_CPUID_runtime_bit_AVX512BW;
420 |         }
421 |     }
422 | 
423 | #endif
424 | 
425 |     return flags;
426 | }
427 | #endif // defined(STORM_HAVE_CPUID)
428 | 
429 | /// Taken from libpopcnt.h
430 | #if defined(STORM_HAVE_ASM_POPCNT) && \
431 |     defined(__x86_64__)
432 | 
433 | STORM_FORCE_INLINE
434 | uint64_t STORM_POPCOUNT(uint64_t x)
435 | {
436 |     __asm__ ("popcnt %1, %0" : "=r" (x) : "0" (x));
%0" : "=r" (x) : "0" (x)); 437 | return x; 438 | } 439 | 440 | #elif defined(STORM_HAVE_ASM_POPCNT) && \ 441 | defined(__i386__) 442 | 443 | STORM_FORCE_INLINE 444 | uint32_t STORM_popcnt32(uint32_t x) 445 | { 446 | __asm__ ("popcnt %1, %0" : "=r" (x) : "0" (x)); 447 | return x; 448 | } 449 | 450 | STORM_FORCE_INLINE 451 | uint64_t STORM_POPCOUNT(uint64_t x) 452 | { 453 | return STORM_popcnt32((uint32_t) x) + 454 | STORM_popcnt32((uint32_t)(x >> 32)); 455 | } 456 | 457 | #elif defined(_MSC_VER) && \ 458 | defined(_M_X64) 459 | 460 | #include 461 | 462 | STORM_FORCE_INLINE 463 | uint64_t STORM_POPCOUNT(uint64_t x) { 464 | return _mm_popcnt_u64(x); 465 | } 466 | 467 | #elif defined(_MSC_VER) && \ 468 | defined(_M_IX86) 469 | 470 | #include 471 | 472 | STORM_FORCE_INLINE 473 | uint64_t STORM_POPCOUNT(uint64_t x) 474 | { 475 | return _mm_popcnt_u32((uint32_t) x) + 476 | _mm_popcnt_u32((uint32_t)(x >> 32)); 477 | } 478 | 479 | /* non x86 CPUs */ 480 | #elif defined(STORM_HAVE_BUILTIN_POPCOUNT) 481 | 482 | STORM_FORCE_INLINE 483 | uint64_t STORM_POPCOUNT(uint64_t x) { 484 | return __builtin_popcountll(x); 485 | } 486 | 487 | /* no hardware POPCNT, 488 | * use pure integer algorithm */ 489 | #else 490 | 491 | STORM_FORCE_INLINE 492 | uint64_t STORM_POPCOUNT(uint64_t x) { 493 | return STORM_popcount64(x); 494 | } 495 | 496 | #endif 497 | 498 | 499 | static 500 | uint64_t STORM_intersect_count_unrolled(const uint64_t* STORM_RESTRICT data1, 501 | const uint64_t* STORM_RESTRICT data2, 502 | size_t size) 503 | { 504 | const uint64_t limit = size - size % 4; 505 | uint64_t cnt = 0; 506 | uint64_t i = 0; 507 | 508 | for (/**/; i < limit; i += 4) { 509 | cnt += STORM_POPCOUNT(data1[i+0] & data2[i+0]); 510 | cnt += STORM_POPCOUNT(data1[i+1] & data2[i+1]); 511 | cnt += STORM_POPCOUNT(data1[i+2] & data2[i+2]); 512 | cnt += STORM_POPCOUNT(data1[i+3] & data2[i+3]); 513 | } 514 | 515 | for (/**/; i < size; ++i) 516 | cnt += STORM_POPCOUNT(data1[i] & data2[i]); 517 | 518 | return cnt; 519 | } 520 | 521 | static 522 | uint64_t STORM_union_count_unrolled(const uint64_t* STORM_RESTRICT data1, 523 | const uint64_t* STORM_RESTRICT data2, 524 | size_t size) 525 | { 526 | const uint64_t limit = size - size % 4; 527 | uint64_t cnt = 0; 528 | uint64_t i = 0; 529 | 530 | for (/**/; i < limit; i += 4) { 531 | cnt += STORM_POPCOUNT(data1[i+0] | data2[i+0]); 532 | cnt += STORM_POPCOUNT(data1[i+1] | data2[i+1]); 533 | cnt += STORM_POPCOUNT(data1[i+2] | data2[i+2]); 534 | cnt += STORM_POPCOUNT(data1[i+3] | data2[i+3]); 535 | } 536 | 537 | for (/**/; i < size; ++i) 538 | cnt += STORM_POPCOUNT(data1[i] | data2[i]); 539 | 540 | return cnt; 541 | } 542 | 543 | static 544 | uint64_t STORM_diff_count_unrolled(const uint64_t* STORM_RESTRICT data1, 545 | const uint64_t* STORM_RESTRICT data2, 546 | size_t size) 547 | { 548 | const uint64_t limit = size - size % 4; 549 | uint64_t cnt = 0; 550 | uint64_t i = 0; 551 | 552 | for (/**/; i < limit; i += 4) { 553 | cnt += STORM_POPCOUNT(data1[i+0] ^ data2[i+0]); 554 | cnt += STORM_POPCOUNT(data1[i+1] ^ data2[i+1]); 555 | cnt += STORM_POPCOUNT(data1[i+2] ^ data2[i+2]); 556 | cnt += STORM_POPCOUNT(data1[i+3] ^ data2[i+3]); 557 | } 558 | 559 | for (/**/; i < size; ++i) 560 | cnt += STORM_POPCOUNT(data1[i] ^ data2[i]); 561 | 562 | return cnt; 563 | } 564 | 565 | static 566 | int STORM_pospopcnt_u16_scalar_naive(const uint16_t* data, size_t len, uint32_t* out) { 567 | for (int i = 0; i < len; ++i) { 568 | for (int j = 0; j < 16; ++j) { 569 | out[j] += ((data[i] & (1 << j)) >> j); 570 | } 571 | } 
572 | 573 | return 0; 574 | } 575 | 576 | #ifndef _MSC_VER 577 | 578 | STORM_FORCE_INLINE 579 | uint64_t STORM_pospopcnt_umul128(uint64_t a, uint64_t b, uint64_t* hi) { 580 | unsigned __int128 x = (unsigned __int128)a * (unsigned __int128)b; 581 | *hi = (uint64_t)(x >> 64); 582 | return (uint64_t)x; 583 | } 584 | 585 | STORM_FORCE_INLINE 586 | uint64_t STORM_pospopcnt_loadu_u64(const void* ptr) { 587 | uint64_t data; 588 | memcpy(&data, ptr, sizeof(data)); 589 | return data; 590 | } 591 | 592 | // By @aqrit (https://github.com/aqrit) 593 | // @see: https://gist.github.com/aqrit/c729815b0165c139d0bac642ab7ee104 594 | static 595 | int STORM_pospopcnt_u16_scalar_umul128_unroll2(const uint16_t* in, size_t n, uint32_t* out) { 596 | while (n >= 8) { 597 | uint64_t counter_a = 0; // 4 packed 12-bit counters 598 | uint64_t counter_b = 0; 599 | uint64_t counter_c = 0; 600 | uint64_t counter_d = 0; 601 | 602 | // end before overflowing the counters 603 | uint32_t len = ((n < 0x0FFF) ? n : 0x0FFF) & ~7; 604 | n -= len; 605 | for (const uint16_t* end = &in[len]; in != end; in += 8) { 606 | const uint64_t mask_a = UINT64_C(0x1111111111111111); 607 | const uint64_t mask_b = mask_a + mask_a; 608 | const uint64_t mask_c = mask_b + mask_b; 609 | const uint64_t mask_0001 = UINT64_C(0x0001000100010001); 610 | const uint64_t mask_cnts = UINT64_C(0x000000F00F00F00F); 611 | 612 | uint64_t v0 = STORM_pospopcnt_loadu_u64(&in[0]); 613 | uint64_t v1 = STORM_pospopcnt_loadu_u64(&in[4]); 614 | 615 | uint64_t a = (v0 & mask_a) + (v1 & mask_a); 616 | uint64_t b = ((v0 & mask_b) + (v1 & mask_b)) >> 1; 617 | uint64_t c = ((v0 & mask_c) + (v1 & mask_c)) >> 2; 618 | uint64_t d = ((v0 >> 3) & mask_a) + ((v1 >> 3) & mask_a); 619 | 620 | uint64_t hi; 621 | a = STORM_pospopcnt_umul128(a, mask_0001, &hi); 622 | a += hi; // broadcast 4-bit counts 623 | b = STORM_pospopcnt_umul128(b, mask_0001, &hi); 624 | b += hi; 625 | c = STORM_pospopcnt_umul128(c, mask_0001, &hi); 626 | c += hi; 627 | d = STORM_pospopcnt_umul128(d, mask_0001, &hi); 628 | d += hi; 629 | 630 | counter_a += a & mask_cnts; 631 | counter_b += b & mask_cnts; 632 | counter_c += c & mask_cnts; 633 | counter_d += d & mask_cnts; 634 | } 635 | 636 | out[0] += counter_a & 0x0FFF; 637 | out[1] += counter_b & 0x0FFF; 638 | out[2] += counter_c & 0x0FFF; 639 | out[3] += counter_d & 0x0FFF; 640 | out[4] += (counter_a >> 36); 641 | out[5] += (counter_b >> 36); 642 | out[6] += (counter_c >> 36); 643 | out[7] += (counter_d >> 36); 644 | out[8] += (counter_a >> 24) & 0x0FFF; 645 | out[9] += (counter_b >> 24) & 0x0FFF; 646 | out[10] += (counter_c >> 24) & 0x0FFF; 647 | out[11] += (counter_d >> 24) & 0x0FFF; 648 | out[12] += (counter_a >> 12) & 0x0FFF; 649 | out[13] += (counter_b >> 12) & 0x0FFF; 650 | out[14] += (counter_c >> 12) & 0x0FFF; 651 | out[15] += (counter_d >> 12) & 0x0FFF; 652 | } 653 | 654 | // assert(n < 8) 655 | if (n != 0) { 656 | uint64_t tail_counter_a = 0; 657 | uint64_t tail_counter_b = 0; 658 | do { // zero-extend a bit to 8-bits (emulate pdep) then accumulate 659 | const uint64_t mask_01 = UINT64_C(0x0101010101010101); 660 | const uint64_t magic = UINT64_C(0x0000040010004001); // 1+(1<<14)+(1<<28)+(1<<42) 661 | uint64_t x = *in++; 662 | tail_counter_a += ((x & 0x5555) * magic) & mask_01; // 0101010101010101 663 | tail_counter_b += (((x >> 1) & 0x5555) * magic) & mask_01; 664 | } while (--n); 665 | 666 | out[0] += tail_counter_a & 0xFF; 667 | out[8] += (tail_counter_a >> 8) & 0xFF; 668 | out[2] += (tail_counter_a >> 16) & 0xFF; 669 | out[10] += (tail_counter_a 
>> 24) & 0xFF; 670 | out[4] += (tail_counter_a >> 32) & 0xFF; 671 | out[12] += (tail_counter_a >> 40) & 0xFF; 672 | out[6] += (tail_counter_a >> 48) & 0xFF; 673 | out[14] += (tail_counter_a >> 56) & 0xFF; 674 | out[1] += tail_counter_b & 0xFF; 675 | out[9] += (tail_counter_b >> 8) & 0xFF; 676 | out[3] += (tail_counter_b >> 16) & 0xFF; 677 | out[11] += (tail_counter_b >> 24) & 0xFF; 678 | out[5] += (tail_counter_b >> 32) & 0xFF; 679 | out[13] += (tail_counter_b >> 40) & 0xFF; 680 | out[7] += (tail_counter_b >> 48) & 0xFF; 681 | out[15] += (tail_counter_b >> 56) & 0xFF; 682 | } 683 | 684 | return 0; 685 | } 686 | #endif 687 | 688 | /* 689 | * This uses fewer arithmetic operations than any other known 690 | * implementation on machines with fast multiplication. 691 | * It uses 12 arithmetic operations, one of which is a multiply. 692 | * http://en.wikipedia.org/wiki/Hamming_weight#Efficient_implementation 693 | */ 694 | STORM_FORCE_INLINE 695 | uint64_t STORM_popcount64(uint64_t x) 696 | { 697 | uint64_t m1 = UINT64_C(0x5555555555555555); 698 | uint64_t m2 = UINT64_C(0x3333333333333333); 699 | uint64_t m4 = UINT64_C(0x0F0F0F0F0F0F0F0F); 700 | uint64_t h01 = UINT64_C(0x0101010101010101); 701 | 702 | x -= (x >> 1) & m1; 703 | x = (x & m2) + ((x >> 2) & m2); 704 | x = (x + (x >> 4)) & m4; 705 | 706 | return (x * h01) >> 56; 707 | } 708 | 709 | 710 | static 711 | const uint8_t STORM_popcnt_lookup8bit[256] = { 712 | /* 0 */ 0, /* 1 */ 1, /* 2 */ 1, /* 3 */ 2, 713 | /* 4 */ 1, /* 5 */ 2, /* 6 */ 2, /* 7 */ 3, 714 | /* 8 */ 1, /* 9 */ 2, /* a */ 2, /* b */ 3, 715 | /* c */ 2, /* d */ 3, /* e */ 3, /* f */ 4, 716 | /* 10 */ 1, /* 11 */ 2, /* 12 */ 2, /* 13 */ 3, 717 | /* 14 */ 2, /* 15 */ 3, /* 16 */ 3, /* 17 */ 4, 718 | /* 18 */ 2, /* 19 */ 3, /* 1a */ 3, /* 1b */ 4, 719 | /* 1c */ 3, /* 1d */ 4, /* 1e */ 4, /* 1f */ 5, 720 | /* 20 */ 1, /* 21 */ 2, /* 22 */ 2, /* 23 */ 3, 721 | /* 24 */ 2, /* 25 */ 3, /* 26 */ 3, /* 27 */ 4, 722 | /* 28 */ 2, /* 29 */ 3, /* 2a */ 3, /* 2b */ 4, 723 | /* 2c */ 3, /* 2d */ 4, /* 2e */ 4, /* 2f */ 5, 724 | /* 30 */ 2, /* 31 */ 3, /* 32 */ 3, /* 33 */ 4, 725 | /* 34 */ 3, /* 35 */ 4, /* 36 */ 4, /* 37 */ 5, 726 | /* 38 */ 3, /* 39 */ 4, /* 3a */ 4, /* 3b */ 5, 727 | /* 3c */ 4, /* 3d */ 5, /* 3e */ 5, /* 3f */ 6, 728 | /* 40 */ 1, /* 41 */ 2, /* 42 */ 2, /* 43 */ 3, 729 | /* 44 */ 2, /* 45 */ 3, /* 46 */ 3, /* 47 */ 4, 730 | /* 48 */ 2, /* 49 */ 3, /* 4a */ 3, /* 4b */ 4, 731 | /* 4c */ 3, /* 4d */ 4, /* 4e */ 4, /* 4f */ 5, 732 | /* 50 */ 2, /* 51 */ 3, /* 52 */ 3, /* 53 */ 4, 733 | /* 54 */ 3, /* 55 */ 4, /* 56 */ 4, /* 57 */ 5, 734 | /* 58 */ 3, /* 59 */ 4, /* 5a */ 4, /* 5b */ 5, 735 | /* 5c */ 4, /* 5d */ 5, /* 5e */ 5, /* 5f */ 6, 736 | /* 60 */ 2, /* 61 */ 3, /* 62 */ 3, /* 63 */ 4, 737 | /* 64 */ 3, /* 65 */ 4, /* 66 */ 4, /* 67 */ 5, 738 | /* 68 */ 3, /* 69 */ 4, /* 6a */ 4, /* 6b */ 5, 739 | /* 6c */ 4, /* 6d */ 5, /* 6e */ 5, /* 6f */ 6, 740 | /* 70 */ 3, /* 71 */ 4, /* 72 */ 4, /* 73 */ 5, 741 | /* 74 */ 4, /* 75 */ 5, /* 76 */ 5, /* 77 */ 6, 742 | /* 78 */ 4, /* 79 */ 5, /* 7a */ 5, /* 7b */ 6, 743 | /* 7c */ 5, /* 7d */ 6, /* 7e */ 6, /* 7f */ 7, 744 | /* 80 */ 1, /* 81 */ 2, /* 82 */ 2, /* 83 */ 3, 745 | /* 84 */ 2, /* 85 */ 3, /* 86 */ 3, /* 87 */ 4, 746 | /* 88 */ 2, /* 89 */ 3, /* 8a */ 3, /* 8b */ 4, 747 | /* 8c */ 3, /* 8d */ 4, /* 8e */ 4, /* 8f */ 5, 748 | /* 90 */ 2, /* 91 */ 3, /* 92 */ 3, /* 93 */ 4, 749 | /* 94 */ 3, /* 95 */ 4, /* 96 */ 4, /* 97 */ 5, 750 | /* 98 */ 3, /* 99 */ 4, /* 9a */ 4, /* 9b */ 5, 751 | /* 9c */ 4, /* 9d */ 5, /* 9e */ 
5, /* 9f */ 6,
752 |     /* a0 */ 2, /* a1 */ 3, /* a2 */ 3, /* a3 */ 4,
753 |     /* a4 */ 3, /* a5 */ 4, /* a6 */ 4, /* a7 */ 5,
754 |     /* a8 */ 3, /* a9 */ 4, /* aa */ 4, /* ab */ 5,
755 |     /* ac */ 4, /* ad */ 5, /* ae */ 5, /* af */ 6,
756 |     /* b0 */ 3, /* b1 */ 4, /* b2 */ 4, /* b3 */ 5,
757 |     /* b4 */ 4, /* b5 */ 5, /* b6 */ 5, /* b7 */ 6,
758 |     /* b8 */ 4, /* b9 */ 5, /* ba */ 5, /* bb */ 6,
759 |     /* bc */ 5, /* bd */ 6, /* be */ 6, /* bf */ 7,
760 |     /* c0 */ 2, /* c1 */ 3, /* c2 */ 3, /* c3 */ 4,
761 |     /* c4 */ 3, /* c5 */ 4, /* c6 */ 4, /* c7 */ 5,
762 |     /* c8 */ 3, /* c9 */ 4, /* ca */ 4, /* cb */ 5,
763 |     /* cc */ 4, /* cd */ 5, /* ce */ 5, /* cf */ 6,
764 |     /* d0 */ 3, /* d1 */ 4, /* d2 */ 4, /* d3 */ 5,
765 |     /* d4 */ 4, /* d5 */ 5, /* d6 */ 5, /* d7 */ 6,
766 |     /* d8 */ 4, /* d9 */ 5, /* da */ 5, /* db */ 6,
767 |     /* dc */ 5, /* dd */ 6, /* de */ 6, /* df */ 7,
768 |     /* e0 */ 3, /* e1 */ 4, /* e2 */ 4, /* e3 */ 5,
769 |     /* e4 */ 4, /* e5 */ 5, /* e6 */ 5, /* e7 */ 6,
770 |     /* e8 */ 4, /* e9 */ 5, /* ea */ 5, /* eb */ 6,
771 |     /* ec */ 5, /* ed */ 6, /* ee */ 6, /* ef */ 7,
772 |     /* f0 */ 4, /* f1 */ 5, /* f2 */ 5, /* f3 */ 6,
773 |     /* f4 */ 5, /* f5 */ 6, /* f6 */ 6, /* f7 */ 7,
774 |     /* f8 */ 5, /* f9 */ 6, /* fa */ 6, /* fb */ 7,
775 |     /* fc */ 6, /* fd */ 7, /* fe */ 7, /* ff */ 8
776 | };
777 | 
778 | /****************************
779 | *  SSE4.2 functions
780 | ****************************/
781 | 
782 | #if defined(STORM_HAVE_SSE42)
783 | 
784 | #include <immintrin.h>
785 | 
786 | STORM_TARGET("sse4.2")
787 | STORM_FORCE_INLINE
788 | uint64_t STORM_POPCOUNT_SSE(const __m128i n) {
789 |     return(STORM_POPCOUNT(_mm_cvtsi128_si64(n)) +
790 |            STORM_POPCOUNT(_mm_cvtsi128_si64(_mm_unpackhi_epi64(n, n))));
791 | }
792 | 
793 | STORM_TARGET("sse4.2")
794 | STORM_FORCE_INLINE
795 | void STORM_CSA128(__m128i* h, __m128i* l, __m128i a, __m128i b, __m128i c) {
796 |     __m128i u = _mm_xor_si128(a, b);
797 |     *h = _mm_or_si128(_mm_and_si128(a, b), _mm_and_si128(u, c));
798 |     *l = _mm_xor_si128(u, c);
799 | }
800 | 
801 | /**
802 |  * Carry-save adder update step.
803 |  * @see https://en.wikipedia.org/wiki/Carry-save_adder#Technical_details
804 |  *
805 |  * Steps:
806 |  * 1)  U = *L ⊕ B
807 |  * 2) *H = (*L & B) | (U & C)
808 |  * 3) *L = *L ⊕ B ⊕ C = U ⊕ C
809 |  *
810 |  * B and C are 16-bit staggered registers such that &C - &B = 1.
811 |  *
812 |  * Example usage:
813 |  * pospopcnt_csa_sse(&twosA, &v1, _mm_loadu_si128(data + i + 0), _mm_loadu_si128(data + i + 1));
814 |  *
815 |  * @param h
816 |  * @param l
817 |  * @param b
818 |  * @param c
819 |  */
820 | STORM_TARGET("sse4.2")
821 | STORM_FORCE_INLINE
822 | void STORM_pospopcnt_csa_sse(__m128i* STORM_RESTRICT h,
823 |                              __m128i* STORM_RESTRICT l,
824 |                              const __m128i b,
825 |                              const __m128i c)
826 | {
827 |     const __m128i u = _mm_xor_si128(*l, b);
828 |     *h = _mm_or_si128(*l & b, u & c); // shift carry (sc_i).
829 |     *l = _mm_xor_si128(u, c);         // partial sum (ps).
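    // Net effect per bit-lane: 2*(*h) + (*l) == old *l + b + c, i.e. *l keeps
    // the partial sum and *h the carries of one carry-save adder step.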
830 | }
831 | 
832 | // By @aqrit (https://github.com/aqrit)
833 | // @see: https://gist.github.com/aqrit/cb52b2ac5b7d0dfe9319c09d27237bf3
834 | STORM_TARGET("sse4.2")
835 | static
836 | int STORM_pospopcnt_u16_sse_sad(const uint16_t* data, size_t len, uint32_t* flag_counts) {
837 |     const __m128i zero = _mm_setzero_si128();
838 |     const __m128i mask_lo_byte = _mm_srli_epi16(_mm_cmpeq_epi8(zero, zero), 8);
839 |     const __m128i mask_lo_cnt  = _mm_srli_epi16(mask_lo_byte, 2);
840 |     const __m128i mask_bits_a  = _mm_set1_epi8(0x41); // 01000001
841 |     const __m128i mask_bits_b  = _mm_add_epi8(mask_bits_a, mask_bits_a);
842 |     uint32_t buffer[16];
843 | 
844 |     __m128i counterA = zero;
845 |     __m128i counterB = zero;
846 |     __m128i counterC = zero;
847 |     __m128i counterD = zero;
848 | 
849 |     for (const uint16_t* end = &data[(len & ~31)]; data != end; data += 32) {
850 |         __m128i r0 = _mm_loadu_si128((__m128i*)&data[0]);
851 |         __m128i r1 = _mm_loadu_si128((__m128i*)&data[8]);
852 |         __m128i r2 = _mm_loadu_si128((__m128i*)&data[16]);
853 |         __m128i r3 = _mm_loadu_si128((__m128i*)&data[24]);
854 |         __m128i r4, r5, r6, r7;
855 | 
856 |         // separate LOBYTE and HIBYTE of each WORD
857 |         // (emulate PSHUFB F,D,B,9,7,5,3,1, E,C,A,8,6,4,2,0)
858 |         r4 = _mm_and_si128(mask_lo_byte, r0);
859 |         r5 = _mm_and_si128(mask_lo_byte, r1);
860 |         r6 = _mm_and_si128(mask_lo_byte, r2);
861 |         r7 = _mm_and_si128(mask_lo_byte, r3);
862 |         r0 = _mm_srli_epi16(r0, 8);
863 |         r1 = _mm_srli_epi16(r1, 8);
864 |         r2 = _mm_srli_epi16(r2, 8);
865 |         r3 = _mm_srli_epi16(r3, 8);
866 |         r0 = _mm_packus_epi16(r0, r4);
867 |         r1 = _mm_packus_epi16(r1, r5);
868 |         r2 = _mm_packus_epi16(r2, r6);
869 |         r3 = _mm_packus_epi16(r3, r7);
870 | 
871 |         // isolate bits to count
872 |         r4 = _mm_and_si128(mask_bits_a, r0);
873 |         r5 = _mm_and_si128(mask_bits_a, r1);
874 |         r6 = _mm_and_si128(mask_bits_a, r2);
875 |         r7 = _mm_and_si128(mask_bits_a, r3);
876 | 
877 |         // horizontal sum of qwords
878 |         r4 = _mm_sad_epu8(r4, zero);
879 |         r5 = _mm_sad_epu8(r5, zero);
880 |         r6 = _mm_sad_epu8(r6, zero);
881 |         r7 = _mm_sad_epu8(r7, zero);
882 | 
883 |         // sum 6-bit counts
884 |         r4 = _mm_add_epi16(r4,r5);
885 |         r4 = _mm_add_epi16(r4,r6);
886 |         r4 = _mm_add_epi16(r4,r7);
887 | 
888 |         // unpack 6-bit counts to 32-bits
889 |         r5 = _mm_and_si128(mask_lo_cnt, r4);
890 |         r4 = _mm_srli_epi16(r4, 6);
891 |         r4 = _mm_packs_epi32(r4, r5);
892 | 
893 |         // accumulate
894 |         counterA = _mm_add_epi32(counterA, r4);
895 | 
896 |         // do it again...
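        // (repeat the mask/SAD/unpack/accumulate sequence above, this time for
        //  the bit positions selected by mask_bits_b)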
897 | r4 = _mm_and_si128(mask_bits_b, r0); 898 | r5 = _mm_and_si128(mask_bits_b, r1); 899 | r6 = _mm_and_si128(mask_bits_b, r2); 900 | r7 = _mm_and_si128(mask_bits_b, r3); 901 | 902 | r4 = _mm_sad_epu8(r4, zero); 903 | r5 = _mm_sad_epu8(r5, zero); 904 | r6 = _mm_sad_epu8(r6, zero); 905 | r7 = _mm_sad_epu8(r7, zero); 906 | 907 | r4 = _mm_add_epi16(r4,r5); 908 | r4 = _mm_add_epi16(r4,r6); 909 | r4 = _mm_add_epi16(r4,r7); 910 | 911 | r5 = _mm_avg_epu8(zero, r4); // shift right 1 912 | r5 = _mm_and_si128(r5, mask_lo_cnt); 913 | r4 = _mm_srli_epi16(r4, 7); 914 | r4 = _mm_packs_epi32(r4, r5); 915 | 916 | counterB = _mm_add_epi32(counterB, r4); // accumulate 917 | 918 | // rotate right 4 919 | r4 = _mm_slli_epi16(r0, 12); 920 | r5 = _mm_slli_epi16(r1, 12); 921 | r6 = _mm_slli_epi16(r2, 12); 922 | r7 = _mm_slli_epi16(r3, 12); 923 | r0 = _mm_srli_epi16(r0, 4); 924 | r1 = _mm_srli_epi16(r1, 4); 925 | r2 = _mm_srli_epi16(r2, 4); 926 | r3 = _mm_srli_epi16(r3, 4); 927 | r0 = _mm_or_si128(r0, r4); 928 | r1 = _mm_or_si128(r1, r5); 929 | r2 = _mm_or_si128(r2, r6); 930 | r3 = _mm_or_si128(r3, r7); 931 | 932 | // do it again... 933 | r4 = _mm_and_si128(mask_bits_a, r0); 934 | r5 = _mm_and_si128(mask_bits_a, r1); 935 | r6 = _mm_and_si128(mask_bits_a, r2); 936 | r7 = _mm_and_si128(mask_bits_a, r3); 937 | 938 | r4 = _mm_sad_epu8(r4, zero); 939 | r5 = _mm_sad_epu8(r5, zero); 940 | r6 = _mm_sad_epu8(r6, zero); 941 | r7 = _mm_sad_epu8(r7, zero); 942 | 943 | r4 = _mm_add_epi16(r4,r5); 944 | r4 = _mm_add_epi16(r4,r6); 945 | r4 = _mm_add_epi16(r4,r7); 946 | 947 | r5 = _mm_and_si128(mask_lo_cnt, r4); 948 | r4 = _mm_srli_epi16(r4, 6); 949 | r4 = _mm_packs_epi32(r4, r5); 950 | 951 | counterC = _mm_add_epi32(counterC, r4); // accumulate 952 | 953 | // do it again... 954 | r0 = _mm_and_si128(r0, mask_bits_b); 955 | r1 = _mm_and_si128(r1, mask_bits_b); 956 | r2 = _mm_and_si128(r2, mask_bits_b); 957 | r3 = _mm_and_si128(r3, mask_bits_b); 958 | 959 | r0 = _mm_sad_epu8(r0, zero); 960 | r1 = _mm_sad_epu8(r1, zero); 961 | r2 = _mm_sad_epu8(r2, zero); 962 | r3 = _mm_sad_epu8(r3, zero); 963 | 964 | r0 = _mm_add_epi16(r0,r1); 965 | r0 = _mm_add_epi16(r0,r2); 966 | r0 = _mm_add_epi16(r0,r3); 967 | 968 | r1 = _mm_avg_epu8(zero, r0); 969 | r1 = _mm_and_si128(r1, mask_lo_cnt); 970 | r0 = _mm_srli_epi16(r0, 7); 971 | r0 = _mm_packs_epi32(r0, r1); 972 | 973 | counterD = _mm_add_epi32(counterD, r0); // accumulate 974 | } 975 | 976 | // transpose then store counters 977 | __m128i counter_1098 = _mm_unpackhi_epi32(counterA, counterB); 978 | __m128i counter_76FE = _mm_unpacklo_epi32(counterA, counterB); 979 | __m128i counter_32BA = _mm_unpacklo_epi32(counterC, counterD); 980 | __m128i counter_54DC = _mm_unpackhi_epi32(counterC, counterD); 981 | __m128i counter_7654 = _mm_unpackhi_epi64(counter_54DC, counter_76FE); 982 | __m128i counter_FEDC = _mm_unpacklo_epi64(counter_54DC, counter_76FE); 983 | __m128i counter_3210 = _mm_unpackhi_epi64(counter_1098, counter_32BA); 984 | __m128i counter_BA98 = _mm_unpacklo_epi64(counter_1098, counter_32BA); 985 | 986 | 987 | _mm_storeu_si128((__m128i*)&buffer[0], counter_3210); 988 | _mm_storeu_si128((__m128i*)&buffer[4], counter_7654); 989 | _mm_storeu_si128((__m128i*)&buffer[8], counter_BA98); 990 | _mm_storeu_si128((__m128i*)&buffer[12], counter_FEDC); 991 | for (int i = 0; i < 16; ++i) flag_counts[i] += buffer[i]; 992 | 993 | // scalar tail loop 994 | int tail = len & 31; 995 | if (tail != 0) { 996 | uint64_t countsA = 0; 997 | uint64_t countsB = 0; 998 | do { 999 | // zero-extend a bit to 8-bits then 
accumulate 1000 | // (emulate pdep) 1001 | const uint64_t mask_01 = UINT64_C(0x0101010101010101);// 100000001000000010000000100000001000000010000000100000001 1002 | const uint64_t magic = UINT64_C(0x0000040010004001);// 000000000000001000000000000010000000000000100000000000001 1003 | // 1+(1<<14)+(1<<28)+(1<<42) 1004 | uint64_t x = *data++; 1005 | countsA += ((x & 0x5555) * magic) & mask_01; // 0101010101010101 1006 | countsB += (((x >> 1) & 0x5555) * magic) & mask_01; 1007 | } while (--tail); 1008 | 1009 | // transpose then store counters 1010 | flag_counts[0] += countsA & 0xFF; 1011 | flag_counts[8] += (countsA >> 8) & 0xFF; 1012 | flag_counts[2] += (countsA >> 16) & 0xFF; 1013 | flag_counts[10] += (countsA >> 24) & 0xFF; 1014 | flag_counts[4] += (countsA >> 32) & 0xFF; 1015 | flag_counts[12] += (countsA >> 40) & 0xFF; 1016 | flag_counts[6] += (countsA >> 48) & 0xFF; 1017 | flag_counts[14] += (countsA >> 56) & 0xFF; 1018 | flag_counts[1] += countsB & 0xFF; 1019 | flag_counts[9] += (countsB >> 8) & 0xFF; 1020 | flag_counts[3] += (countsB >> 16) & 0xFF; 1021 | flag_counts[11] += (countsB >> 24) & 0xFF; 1022 | flag_counts[5] += (countsB >> 32) & 0xFF; 1023 | flag_counts[13] += (countsB >> 40) & 0xFF; 1024 | flag_counts[7] += (countsB >> 48) & 0xFF; 1025 | flag_counts[15] += (countsB >> 56) & 0xFF; 1026 | } 1027 | 1028 | return 0; 1029 | } 1030 | 1031 | STORM_TARGET("sse4.2") 1032 | static 1033 | int STORM_pospopcnt_u16_sse_blend_popcnt_unroll8(const uint16_t* array, size_t len, uint32_t* out) { 1034 | const __m128i* data_vectors = (const __m128i*)(array); 1035 | const uint32_t n_cycles = len / 8; 1036 | 1037 | size_t i = 0; 1038 | for (/**/; i + 8 <= n_cycles; i += 8) { 1039 | #define L(p) __m128i v##p = _mm_loadu_si128(data_vectors+i+p); 1040 | L(0) L(1) L(2) L(3) 1041 | L(4) L(5) L(6) L(7) 1042 | 1043 | #define U0(p,k) __m128i input##p = _mm_or_si128(_mm_and_si128(v##p, _mm_set1_epi16(0x00FF)), _mm_slli_epi16(v##k, 8)); 1044 | #define U1(p,k) __m128i input##k = _mm_or_si128(_mm_and_si128(v##p, _mm_set1_epi16(0xFF00)), _mm_srli_epi16(v##k, 8)); 1045 | #define U(p, k) U0(p,k) U1(p,k) 1046 | 1047 | U(0,1) U(2,3) U(4,5) U(6,7) 1048 | 1049 | for (int i = 0; i < 8; ++i) { 1050 | #define A0(p) out[ 7 - i] += _mm_popcnt_u32(_mm_movemask_epi8(input##p)); 1051 | #define A1(k) out[15 - i] += _mm_popcnt_u32(_mm_movemask_epi8(input##k)); 1052 | #define A(p, k) A0(p) A1(k) 1053 | A(0,1) A(2, 3) A(4,5) A(6, 7) 1054 | 1055 | #define P0(p) input##p = _mm_add_epi8(input##p, input##p); 1056 | #define P(p, k) input##p = P0(p) P0(k) 1057 | 1058 | P(0,1) P(2, 3) P(4,5) P(6, 7) 1059 | } 1060 | } 1061 | 1062 | for (/**/; i + 4 <= n_cycles; i += 4) { 1063 | L(0) L(1) L(2) L(3) 1064 | U(0,1) U(2,3) 1065 | 1066 | for (int i = 0; i < 8; ++i) { 1067 | A(0,1) A(2, 3) 1068 | P(0,1) P(2, 3) 1069 | } 1070 | } 1071 | 1072 | for (/**/; i + 2 <= n_cycles; i += 2) { 1073 | L(0) L(1) 1074 | U(0,1) 1075 | 1076 | for (int i = 0; i < 8; ++i) { 1077 | A(0,1) 1078 | P(0,1) 1079 | } 1080 | } 1081 | 1082 | i *= 8; 1083 | for (/**/; i < len; ++i) { 1084 | for (int j = 0; j < 16; ++j) { 1085 | out[j] += ((array[i] & (1 << j)) >> j); 1086 | } 1087 | } 1088 | 1089 | #undef L 1090 | #undef U0 1091 | #undef U1 1092 | #undef U 1093 | #undef A0 1094 | #undef A1 1095 | #undef A 1096 | #undef P0 1097 | #undef P 1098 | return 0; 1099 | } 1100 | 1101 | STORM_TARGET("sse4.2") 1102 | static 1103 | int STORM_pospopcnt_u16_sse_harvey_seal(const uint16_t* array, size_t len, uint32_t* out) { 1104 | for (uint32_t i = len - (len % (16 * 8)); i < len; 
++i) { 1105 | for (int j = 0; j < 16; ++j) { 1106 | out[j] += ((array[i] & (1 << j)) >> j); 1107 | } 1108 | } 1109 | 1110 | const __m128i* data = (const __m128i*)array; 1111 | size_t size = len / 8; 1112 | __m128i v1 = _mm_setzero_si128(); 1113 | __m128i v2 = _mm_setzero_si128(); 1114 | __m128i v4 = _mm_setzero_si128(); 1115 | __m128i v8 = _mm_setzero_si128(); 1116 | __m128i v16 = _mm_setzero_si128(); 1117 | __m128i twosA, twosB, foursA, foursB, eightsA, eightsB; 1118 | 1119 | const uint64_t limit = size - size % 16; 1120 | uint64_t i = 0; 1121 | uint16_t buffer[8]; 1122 | __m128i counter[16]; 1123 | 1124 | while (i < limit) { 1125 | for (size_t i = 0; i < 16; ++i) { 1126 | counter[i] = _mm_setzero_si128(); 1127 | } 1128 | 1129 | size_t thislimit = limit; 1130 | if (thislimit - i >= (1 << 16)) 1131 | thislimit = i + (1 << 16) - 1; 1132 | 1133 | for (/**/; i < thislimit; i += 16) { 1134 | #define U(pos) { \ 1135 | counter[pos] = _mm_add_epi16(counter[pos], _mm_and_si128(v16, _mm_set1_epi16(1))); \ 1136 | v16 = _mm_srli_epi16(v16, 1); \ 1137 | } 1138 | STORM_pospopcnt_csa_sse(&twosA, &v1, _mm_loadu_si128(data + i + 0), _mm_loadu_si128(data + i + 1)); 1139 | STORM_pospopcnt_csa_sse(&twosB, &v1, _mm_loadu_si128(data + i + 2), _mm_loadu_si128(data + i + 3)); 1140 | STORM_pospopcnt_csa_sse(&foursA, &v2, twosA, twosB); 1141 | STORM_pospopcnt_csa_sse(&twosA, &v1, _mm_loadu_si128(data + i + 4), _mm_loadu_si128(data + i + 5)); 1142 | STORM_pospopcnt_csa_sse(&twosB, &v1, _mm_loadu_si128(data + i + 6), _mm_loadu_si128(data + i + 7)); 1143 | STORM_pospopcnt_csa_sse(&foursB, &v2, twosA, twosB); 1144 | STORM_pospopcnt_csa_sse(&eightsA,&v4, foursA, foursB); 1145 | STORM_pospopcnt_csa_sse(&twosA, &v1, _mm_loadu_si128(data + i + 8), _mm_loadu_si128(data + i + 9)); 1146 | STORM_pospopcnt_csa_sse(&twosB, &v1, _mm_loadu_si128(data + i + 10), _mm_loadu_si128(data + i + 11)); 1147 | STORM_pospopcnt_csa_sse(&foursA, &v2, twosA, twosB); 1148 | STORM_pospopcnt_csa_sse(&twosA, &v1, _mm_loadu_si128(data + i + 12), _mm_loadu_si128(data + i + 13)); 1149 | STORM_pospopcnt_csa_sse(&twosB, &v1, _mm_loadu_si128(data + i + 14), _mm_loadu_si128(data + i + 15)); 1150 | STORM_pospopcnt_csa_sse(&foursB, &v2, twosA, twosB); 1151 | STORM_pospopcnt_csa_sse(&eightsB,&v4, foursA, foursB); 1152 | U(0) U(1) U(2) U(3) U(4) U(5) U(6) U(7) U(8) U(9) U(10) U(11) U(12) U(13) U(14) U(15) // Updates 1153 | STORM_pospopcnt_csa_sse(&v16, &v8, eightsA, eightsB); 1154 | #undef U 1155 | } 1156 | 1157 | // update the counters after the last iteration 1158 | for (size_t i = 0; i < 16; ++i) { 1159 | counter[i] = _mm_add_epi16(counter[i], _mm_and_si128(v16, _mm_set1_epi16(1))); 1160 | v16 = _mm_srli_epi16(v16, 1); 1161 | } 1162 | 1163 | for (size_t i = 0; i < 16; ++i) { 1164 | _mm_storeu_si128((__m128i*)buffer, counter[i]); 1165 | for (size_t z = 0; z < 8; z++) { 1166 | out[i] += 16 * (uint32_t)buffer[z]; 1167 | } 1168 | } 1169 | } 1170 | 1171 | _mm_storeu_si128((__m128i*)buffer, v1); 1172 | for (size_t i = 0; i < 8; ++i) { 1173 | for (int j = 0; j < 16; ++j) { 1174 | out[j] += ((buffer[i] & (1 << j)) >> j); 1175 | } 1176 | } 1177 | 1178 | _mm_storeu_si128((__m128i*)buffer, v2); 1179 | for (size_t i = 0; i < 8; ++i) { 1180 | for (int j = 0; j < 16; ++j) { 1181 | out[j] += 2 * ((buffer[i] & (1 << j)) >> j); 1182 | } 1183 | } 1184 | _mm_storeu_si128((__m128i*)buffer, v4); 1185 | for (size_t i = 0; i < 8; ++i) { 1186 | for (int j = 0; j < 16; ++j) { 1187 | out[j] += 4 * ((buffer[i] & (1 << j)) >> j); 1188 | } 1189 | } 1190 | 
_mm_storeu_si128((__m128i*)buffer, v8);
1191 |     for (size_t i = 0; i < 8; ++i) {
1192 |         for (int j = 0; j < 16; ++j) {
1193 |             out[j] += 8 * ((buffer[i] & (1 << j)) >> j);
1194 |         }
1195 |     }
1196 |     return 0;
1197 | }
1198 | 
1199 | STORM_TARGET("sse4.2")
1200 | static
1201 | uint64_t STORM_intersect_count_csa_sse4(const __m128i* STORM_RESTRICT data1,
1202 |                                         const __m128i* STORM_RESTRICT data2,
1203 |                                         size_t size)
1204 | {
1205 |     __m128i ones     = _mm_setzero_si128();
1206 |     __m128i twos     = _mm_setzero_si128();
1207 |     __m128i fours    = _mm_setzero_si128();
1208 |     __m128i eights   = _mm_setzero_si128();
1209 |     __m128i sixteens = _mm_setzero_si128();
1210 |     __m128i twosA, twosB, foursA, foursB, eightsA, eightsB;
1211 | 
1212 |     uint64_t i = 0;
1213 |     uint64_t limit = size - size % 16;
1214 |     uint64_t cnt64 = 0;
1215 | 
1216 | #define LOAD(a) (_mm_loadu_si128(&data1[i+a]) & _mm_loadu_si128(&data2[i+a]))
1217 | 
1218 |     for (/**/; i < limit; i += 16) {
1219 |         STORM_CSA128(&twosA,   &ones,   ones,  LOAD(0), LOAD(1));
1220 |         STORM_CSA128(&twosB,   &ones,   ones,  LOAD(2), LOAD(3));
1221 |         STORM_CSA128(&foursA,  &twos,   twos,  twosA,  twosB);
1222 |         STORM_CSA128(&twosA,   &ones,   ones,  LOAD(4), LOAD(5));
1223 |         STORM_CSA128(&twosB,   &ones,   ones,  LOAD(6), LOAD(7));
1224 |         STORM_CSA128(&foursB,  &twos,   twos,  twosA,  twosB);
1225 |         STORM_CSA128(&eightsA, &fours,  fours, foursA, foursB);
1226 |         STORM_CSA128(&twosA,   &ones,   ones,  LOAD(8), LOAD(9));
1227 |         STORM_CSA128(&twosB,   &ones,   ones,  LOAD(10), LOAD(11));
1228 |         STORM_CSA128(&foursA,  &twos,   twos,  twosA,  twosB);
1229 |         STORM_CSA128(&twosA,   &ones,   ones,  LOAD(12), LOAD(13));
1230 |         STORM_CSA128(&twosB,   &ones,   ones,  LOAD(14), LOAD(15));
1231 |         STORM_CSA128(&foursB,  &twos,   twos,  twosA,  twosB);
1232 |         STORM_CSA128(&eightsB, &fours,  fours, foursA, foursB);
1233 |         STORM_CSA128(&sixteens,&eights, eights,eightsA,eightsB);
1234 | 
1235 |         cnt64 += STORM_POPCOUNT_SSE(sixteens);
1236 |     }
1237 | #undef LOAD
1238 | 
1239 |     cnt64 <<= 4;
1240 |     cnt64 += STORM_POPCOUNT_SSE(eights) << 3;
1241 |     cnt64 += STORM_POPCOUNT_SSE(fours)  << 2;
1242 |     cnt64 += STORM_POPCOUNT_SSE(twos)   << 1;
1243 |     cnt64 += STORM_POPCOUNT_SSE(ones)   << 0;
1244 | 
1245 |     for (/**/; i < size; ++i)
1246 |         cnt64 += STORM_POPCOUNT_SSE(_mm_loadu_si128(&data1[i]) & _mm_loadu_si128(&data2[i]));
1247 | 
1248 |     return cnt64;
1249 | }
1250 | 
1251 | STORM_TARGET("sse4.2")
1252 | static
1253 | uint64_t STORM_union_count_csa_sse4(const __m128i* STORM_RESTRICT data1,
1254 |                                     const __m128i* STORM_RESTRICT data2,
1255 |                                     size_t size)
1256 | {
1257 |     __m128i ones     = _mm_setzero_si128();
1258 |     __m128i twos     = _mm_setzero_si128();
1259 |     __m128i fours    = _mm_setzero_si128();
1260 |     __m128i eights   = _mm_setzero_si128();
1261 |     __m128i sixteens = _mm_setzero_si128();
1262 |     __m128i twosA, twosB, foursA, foursB, eightsA, eightsB;
1263 | 
1264 |     uint64_t i = 0;
1265 |     uint64_t limit = size - size % 16;
1266 |     uint64_t cnt64 = 0;
1267 | 
1268 | #define LOAD(a) (_mm_loadu_si128(&data1[i+a]) | _mm_loadu_si128(&data2[i+a]))
1269 | 
1270 |     for (/**/; i < limit; i += 16) {
1271 |         STORM_CSA128(&twosA,   &ones,   ones,  LOAD(0), LOAD(1));
1272 |         STORM_CSA128(&twosB,   &ones,   ones,  LOAD(2), LOAD(3));
1273 |         STORM_CSA128(&foursA,  &twos,   twos,  twosA,  twosB);
1274 |         STORM_CSA128(&twosA,   &ones,   ones,  LOAD(4), LOAD(5));
1275 |         STORM_CSA128(&twosB,   &ones,   ones,  LOAD(6), LOAD(7));
1276 |         STORM_CSA128(&foursB,  &twos,   twos,  twosA,  twosB);
1277 |         STORM_CSA128(&eightsA, &fours,  fours, foursA, foursB);
1278 |         STORM_CSA128(&twosA,   &ones,   ones,  LOAD(8), LOAD(9));
1279 |         STORM_CSA128(&twosB,   &ones,   ones,  LOAD(10), LOAD(11));
1280 |         STORM_CSA128(&foursA,  &twos,   twos,  twosA,  twosB);
1281 |         STORM_CSA128(&twosA,   &ones,   ones,  LOAD(12), LOAD(13));
1282 |         STORM_CSA128(&twosB,   &ones,   ones,  LOAD(14), LOAD(15));
1283 |         STORM_CSA128(&foursB,  &twos,   twos,  twosA,  twosB);
1284 |         STORM_CSA128(&eightsB, &fours,  fours, foursA, foursB);
1285 |         STORM_CSA128(&sixteens,&eights, eights,eightsA,eightsB);
1286 | 
1287 |         cnt64 += STORM_POPCOUNT_SSE(sixteens);
1288 |     }
1289 | #undef LOAD
1290 | 
1291 |     cnt64 <<= 4;
1292 |     cnt64 += STORM_POPCOUNT_SSE(eights) << 3;
1293 |     cnt64 += STORM_POPCOUNT_SSE(fours)  << 2;
1294 |     cnt64 += STORM_POPCOUNT_SSE(twos)   << 1;
1295 |     cnt64 += STORM_POPCOUNT_SSE(ones)   << 0;
1296 | 
1297 |     for (/**/; i < size; ++i)
1298 |         cnt64 += STORM_POPCOUNT_SSE(_mm_loadu_si128(&data1[i]) | _mm_loadu_si128(&data2[i]));
1299 | 
1300 |     return cnt64;
1301 | }
1302 | 
1303 | STORM_TARGET("sse4.2")
1304 | static
1305 | uint64_t STORM_diff_count_csa_sse4(const __m128i* STORM_RESTRICT data1,
1306 |                                    const __m128i* STORM_RESTRICT data2,
1307 |                                    size_t size)
1308 | {
1309 |     __m128i ones     = _mm_setzero_si128();
1310 |     __m128i twos     = _mm_setzero_si128();
1311 |     __m128i fours    = _mm_setzero_si128();
1312 |     __m128i eights   = _mm_setzero_si128();
1313 |     __m128i sixteens = _mm_setzero_si128();
1314 |     __m128i twosA, twosB, foursA, foursB, eightsA, eightsB;
1315 | 
1316 |     uint64_t i = 0;
1317 |     uint64_t limit = size - size % 16;
1318 |     uint64_t cnt64 = 0;
1319 | 
1320 | #define LOAD(a) (_mm_loadu_si128(&data1[i+a]) ^ _mm_loadu_si128(&data2[i+a]))
1321 | 
1322 |     for (/**/; i < limit; i += 16) {
1323 |         STORM_CSA128(&twosA,   &ones,   ones,  LOAD(0), LOAD(1));
1324 |         STORM_CSA128(&twosB,   &ones,   ones,  LOAD(2), LOAD(3));
1325 |         STORM_CSA128(&foursA,  &twos,   twos,  twosA,  twosB);
1326 |         STORM_CSA128(&twosA,   &ones,   ones,  LOAD(4), LOAD(5));
1327 |         STORM_CSA128(&twosB,   &ones,   ones,  LOAD(6), LOAD(7));
1328 |         STORM_CSA128(&foursB,  &twos,   twos,  twosA,  twosB);
1329 |         STORM_CSA128(&eightsA, &fours,  fours, foursA, foursB);
1330 |         STORM_CSA128(&twosA,   &ones,   ones,  LOAD(8), LOAD(9));
1331 |         STORM_CSA128(&twosB,   &ones,   ones,  LOAD(10), LOAD(11));
1332 |         STORM_CSA128(&foursA,  &twos,   twos,  twosA,  twosB);
1333 |         STORM_CSA128(&twosA,   &ones,   ones,  LOAD(12), LOAD(13));
1334 |         STORM_CSA128(&twosB,   &ones,   ones,  LOAD(14), LOAD(15));
1335 |         STORM_CSA128(&foursB,  &twos,   twos,  twosA,  twosB);
1336 |         STORM_CSA128(&eightsB, &fours,  fours, foursA, foursB);
1337 |         STORM_CSA128(&sixteens,&eights, eights,eightsA,eightsB);
1338 | 
1339 |         cnt64 += STORM_POPCOUNT_SSE(sixteens);
1340 |     }
1341 | #undef LOAD
1342 | 
1343 |     cnt64 <<= 4;
1344 |     cnt64 += STORM_POPCOUNT_SSE(eights) << 3;
1345 |     cnt64 += STORM_POPCOUNT_SSE(fours)  << 2;
1346 |     cnt64 += STORM_POPCOUNT_SSE(twos)   << 1;
1347 |     cnt64 += STORM_POPCOUNT_SSE(ones)   << 0;
1348 | 
1349 |     for (/**/; i < size; ++i)
1350 |         cnt64 += STORM_POPCOUNT_SSE(_mm_loadu_si128(&data1[i]) ^ _mm_loadu_si128(&data2[i]));
1351 | 
1352 |     return cnt64;
1353 | }
1354 | 
1355 | STORM_TARGET("sse4.2")
1356 | static
1357 | uint64_t STORM_popcnt_csa_sse4(const __m128i* STORM_RESTRICT data,
1358 |                                size_t size)
1359 | {
1360 |     __m128i ones     = _mm_setzero_si128();
1361 |     __m128i twos     = _mm_setzero_si128();
1362 |     __m128i fours    = _mm_setzero_si128();
1363 |     __m128i eights   = _mm_setzero_si128();
1364 |     __m128i sixteens = _mm_setzero_si128();
1365 |     __m128i twosA, twosB, foursA, foursB, eightsA, eightsB;
1366 | 
1367 |     uint64_t i = 0;
1368 |     uint64_t limit = size - size % 16;
1369 |     uint64_t cnt64 = 0;
1370 | 
1371 | #define LOAD(a) (_mm_loadu_si128(&data[i+a]))
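    // Harley-Seal accumulation: 'ones'..'sixteens' act as a bit-sliced adder;
    // each CSA128 call folds two 128-bit inputs into the tree, so after the
    // main loop every input bit is counted exactly once with weight 16, 8, 4,
    // 2 or 1 (the weights are combined after the loop).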
1372 | 
1373 |     for (/**/; i < limit; i += 16) {
1374 |         STORM_CSA128(&twosA,   &ones,   ones,  LOAD(0), LOAD(1));
1375 |         STORM_CSA128(&twosB,   &ones,   ones,  LOAD(2), LOAD(3));
1376 |         STORM_CSA128(&foursA,  &twos,   twos,  twosA,  twosB);
1377 |         STORM_CSA128(&twosA,   &ones,   ones,  LOAD(4), LOAD(5));
1378 |         STORM_CSA128(&twosB,   &ones,   ones,  LOAD(6), LOAD(7));
1379 |         STORM_CSA128(&foursB,  &twos,   twos,  twosA,  twosB);
1380 |         STORM_CSA128(&eightsA, &fours,  fours, foursA, foursB);
1381 |         STORM_CSA128(&twosA,   &ones,   ones,  LOAD(8), LOAD(9));
1382 |         STORM_CSA128(&twosB,   &ones,   ones,  LOAD(10), LOAD(11));
1383 |         STORM_CSA128(&foursA,  &twos,   twos,  twosA,  twosB);
1384 |         STORM_CSA128(&twosA,   &ones,   ones,  LOAD(12), LOAD(13));
1385 |         STORM_CSA128(&twosB,   &ones,   ones,  LOAD(14), LOAD(15));
1386 |         STORM_CSA128(&foursB,  &twos,   twos,  twosA,  twosB);
1387 |         STORM_CSA128(&eightsB, &fours,  fours, foursA, foursB);
1388 |         STORM_CSA128(&sixteens,&eights, eights,eightsA,eightsB);
1389 | 
1390 |         cnt64 += STORM_POPCOUNT_SSE(sixteens);
1391 |     }
1392 | #undef LOAD
1393 | 
1394 |     cnt64 <<= 4;
1395 |     cnt64 += STORM_POPCOUNT_SSE(eights) << 3;
1396 |     cnt64 += STORM_POPCOUNT_SSE(fours)  << 2;
1397 |     cnt64 += STORM_POPCOUNT_SSE(twos)   << 1;
1398 |     cnt64 += STORM_POPCOUNT_SSE(ones)   << 0;
1399 | 
1400 |     for (/**/; i < size; ++i)
1401 |         cnt64 += STORM_POPCOUNT_SSE(_mm_loadu_si128(&data[i]));
1402 | 
1403 |     return cnt64;
1404 | }
1405 | 
1406 | STORM_TARGET("sse4.2")
1407 | static
1408 | uint64_t STORM_intersect_count_sse4(const uint64_t* STORM_RESTRICT b1,
1409 |                                     const uint64_t* STORM_RESTRICT b2,
1410 |                                     const size_t n_ints)
1411 | {
1412 |     uint64_t count = 0;
1413 |     const __m128i* r1 = (__m128i*)b1;
1414 |     const __m128i* r2 = (__m128i*)b2;
1415 |     const uint32_t n_cycles = n_ints / 2;
1416 | 
1417 |     count += STORM_intersect_count_csa_sse4(r1, r2, n_cycles);
1418 | 
1419 |     for (size_t i = n_cycles*2; i < n_ints; ++i) {
1420 |         count += STORM_POPCOUNT(b1[i] & b2[i]);
1421 |     }
1422 | 
1423 |     return(count);
1424 | }
1425 | 
1426 | STORM_TARGET("sse4.2")
1427 | static
1428 | uint64_t STORM_union_count_sse4(const uint64_t* STORM_RESTRICT b1,
1429 |                                 const uint64_t* STORM_RESTRICT b2,
1430 |                                 const size_t n_ints)
1431 | {
1432 |     uint64_t count = 0;
1433 |     const __m128i* r1 = (__m128i*)b1;
1434 |     const __m128i* r2 = (__m128i*)b2;
1435 |     const uint32_t n_cycles = n_ints / 2;
1436 | 
1437 |     count += STORM_union_count_csa_sse4(r1, r2, n_cycles);
1438 | 
1439 |     for (size_t i = n_cycles*2; i < n_ints; ++i) {
1440 |         count += STORM_POPCOUNT(b1[i] | b2[i]);
1441 |     }
1442 | 
1443 |     return(count);
1444 | }
1445 | 
1446 | STORM_TARGET("sse4.2")
1447 | static
1448 | uint64_t STORM_diff_count_sse4(const uint64_t* STORM_RESTRICT b1,
1449 |                                const uint64_t* STORM_RESTRICT b2,
1450 |                                const size_t n_ints)
1451 | {
1452 |     uint64_t count = 0;
1453 |     const __m128i* r1 = (__m128i*)b1;
1454 |     const __m128i* r2 = (__m128i*)b2;
1455 |     const uint32_t n_cycles = n_ints / 2;
1456 | 
1457 |     count += STORM_diff_count_csa_sse4(r1, r2, n_cycles);
1458 | 
1459 |     for (size_t i = n_cycles*2; i < n_ints; ++i) {
1460 |         count += STORM_POPCOUNT(b1[i] ^ b2[i]);
1461 |     }
1462 | 
1463 |     return(count);
1464 | }
1465 | 
1466 | STORM_TARGET("sse4.2")
1467 | static
1468 | uint64_t STORM_popcnt_sse4(const uint64_t* STORM_RESTRICT data,
1469 |                            const size_t n_ints)
1470 | {
1471 |     uint64_t count = 0;
1472 |     const __m128i* r1 = (__m128i*)data;
1473 |     const uint32_t n_cycles = n_ints / 2;
1474 | 
1475 |     count += STORM_popcnt_csa_sse4(r1, n_cycles);
1476 | 
1477 |     for (size_t i = n_cycles*2; i < n_ints; ++i) {
1478 |         count += STORM_POPCOUNT(data[i]);
1479 |     }
1480 | 
1481 |     return(count);
1482 | }
1483 | #endif
1484 | 
1485 | /****************************
1486 | *  AVX256 functions
1487 | ****************************/
1488 | 
1489 | #if defined(STORM_HAVE_AVX2)
1490 | 
1491 | #include <immintrin.h>
1492 | 
1493 | STORM_TARGET("avx2")
1494 | STORM_FORCE_INLINE
1495 | void STORM_CSA256(__m256i* h, __m256i* l, __m256i a, __m256i b, __m256i c) {
1496 |     __m256i u = _mm256_xor_si256(a, b);
1497 |     *h = _mm256_or_si256(_mm256_and_si256(a, b), _mm256_and_si256(u, c));
1498 |     *l = _mm256_xor_si256(u, c);
1499 | }
1500 | 
1501 | STORM_TARGET("avx2")
1502 | STORM_FORCE_INLINE
1503 | void STORM_pospopcnt_csa_avx2(__m256i* STORM_RESTRICT h,
1504 |                               __m256i* STORM_RESTRICT l,
1505 |                               const __m256i b,
1506 |                               const __m256i c)
1507 | {
1508 |     const __m256i u = _mm256_xor_si256(*l, b);
1509 |     *h = _mm256_or_si256(*l & b, u & c);
1510 |     *l = _mm256_xor_si256(u, c);
1511 | }
1512 | 
1513 | STORM_TARGET("avx2")
1514 | static
1515 | int STORM_pospopcnt_u16_avx2_blend_popcnt_unroll8(const uint16_t* array, size_t len, uint32_t* out) {
1516 |     const __m256i* data_vectors = (const __m256i*)(array);
1517 |     const uint32_t n_cycles = len / 16;
1518 | 
1519 |     size_t i = 0;
1520 |     for (/**/; i + 8 <= n_cycles; i += 8) {
1521 | #define L(p) __m256i v##p = _mm256_loadu_si256(data_vectors+i+p);
1522 |         L(0) L(1) L(2) L(3)
1523 |         L(4) L(5) L(6) L(7)
1524 | 
1525 | #define U0(p,k) __m256i input##p = _mm256_or_si256(_mm256_and_si256(v##p, _mm256_set1_epi16(0x00FF)), _mm256_slli_epi16(v##k, 8));
1526 | #define U1(p,k) __m256i input##k = _mm256_or_si256(_mm256_and_si256(v##p, _mm256_set1_epi16(0xFF00)), _mm256_srli_epi16(v##k, 8));
1527 | #define U(p, k)  U0(p,k) U1(p,k)
1528 |         U(0,1) U(2, 3) U(4, 5) U(6, 7)
1529 | 
1530 |         for (int i = 0; i < 8; ++i) {
1531 | #define A0(p) out[ 7 - i] += _mm_popcnt_u32(_mm256_movemask_epi8(input##p));
1532 | #define A1(k) out[15 - i] += _mm_popcnt_u32(_mm256_movemask_epi8(input##k));
1533 | #define A(p, k) A0(p) A1(k)
1534 |             A(0,1) A(2, 3) A(4, 5) A(6, 7)
1535 | 
1536 | #define P0(p) input##p = _mm256_add_epi8(input##p, input##p);
1537 | #define P(p, k) input##p = P0(p) P0(k)
1538 |             P(0,1) P(2, 3) P(4, 5) P(6, 7)
1539 |         }
1540 |     }
1541 | 
1542 |     for (/**/; i + 4 <= n_cycles; i += 4) {
1543 |         L(0) L(1) L(2) L(3)
1544 |         U(0,1) U(2, 3)
1545 | 
1546 |         for (int i = 0; i < 8; ++i) {
1547 |             A(0,1) A( 2, 3)
1548 |             P(0,1) P( 2, 3)
1549 |         }
1550 |     }
1551 | 
1552 |     for (/**/; i + 2 <= n_cycles; i += 2) {
1553 |         L(0) L(1)
1554 |         U(0,1)
1555 | 
1556 |         for (int i = 0; i < 8; ++i) {
1557 |             A(0,1)
1558 |             P(0,1)
1559 |         }
1560 |     }
1561 | 
1562 |     i *= 16;
1563 |     for (/**/; i < len; ++i) {
1564 |         for (int j = 0; j < 16; ++j) {
1565 |             out[j] += ((array[i] & (1 << j)) >> j);
1566 |         }
1567 |     }
1568 | 
1569 | #undef L
1570 | #undef U0
1571 | #undef U1
1572 | #undef U
1573 | #undef A0
1574 | #undef A1
1575 | #undef A
1576 | #undef P0
1577 | #undef P
1578 | 
1579 |     return 0;
1580 | }
1581 | 
1582 | STORM_TARGET("avx2")
1583 | static
1584 | int STORM_pospopcnt_u16_avx2_harvey_seal(const uint16_t* array, size_t len, uint32_t* out) {
1585 |     for (uint32_t i = len - (len % (16 * 16)); i < len; ++i) {
1586 |         for (int j = 0; j < 16; ++j) {
1587 |             out[j] += ((array[i] & (1 << j)) >> j);
1588 |         }
1589 |     }
1590 | 
1591 |     const __m256i* data = (const __m256i*)array;
1592 |     size_t size = len / 16;
1593 |     __m256i v1  = _mm256_setzero_si256();
1594 |     __m256i v2  = _mm256_setzero_si256();
1595 |     __m256i v4  = _mm256_setzero_si256();
1596 |     __m256i v8  = _mm256_setzero_si256();
1597 |     __m256i v16 =
_mm256_setzero_si256(); 1598 | __m256i twosA, twosB, foursA, foursB, eightsA, eightsB; 1599 | 1600 | const uint64_t limit = size - size % 16; 1601 | uint64_t i = 0; 1602 | uint16_t buffer[16]; 1603 | __m256i counter[16]; 1604 | const __m256i one = _mm256_set1_epi16(1); 1605 | 1606 | while (i < limit) { 1607 | for (size_t i = 0; i < 16; ++i) { 1608 | counter[i] = _mm256_setzero_si256(); 1609 | } 1610 | 1611 | size_t thislimit = limit; 1612 | if (thislimit - i >= (1 << 16)) 1613 | thislimit = i + (1 << 16) - 1; 1614 | 1615 | for (/**/; i < thislimit; i += 16) { 1616 | #define U(pos) { \ 1617 | counter[pos] = _mm256_add_epi16(counter[pos], _mm256_and_si256(v16, one)); \ 1618 | v16 = _mm256_srli_epi16(v16, 1); \ 1619 | } 1620 | STORM_pospopcnt_csa_avx2(&twosA, &v1, _mm256_loadu_si256(data + i + 0), _mm256_loadu_si256(data + i + 1)); 1621 | STORM_pospopcnt_csa_avx2(&twosB, &v1, _mm256_loadu_si256(data + i + 2), _mm256_loadu_si256(data + i + 3)); 1622 | STORM_pospopcnt_csa_avx2(&foursA, &v2, twosA, twosB); 1623 | STORM_pospopcnt_csa_avx2(&twosA, &v1, _mm256_loadu_si256(data + i + 4), _mm256_loadu_si256(data + i + 5)); 1624 | STORM_pospopcnt_csa_avx2(&twosB, &v1, _mm256_loadu_si256(data + i + 6), _mm256_loadu_si256(data + i + 7)); 1625 | STORM_pospopcnt_csa_avx2(&foursB, &v2, twosA, twosB); 1626 | STORM_pospopcnt_csa_avx2(&eightsA,&v4, foursA, foursB); 1627 | STORM_pospopcnt_csa_avx2(&twosA, &v1, _mm256_loadu_si256(data + i + 8), _mm256_loadu_si256(data + i + 9)); 1628 | STORM_pospopcnt_csa_avx2(&twosB, &v1, _mm256_loadu_si256(data + i + 10), _mm256_loadu_si256(data + i + 11)); 1629 | STORM_pospopcnt_csa_avx2(&foursA, &v2, twosA, twosB); 1630 | STORM_pospopcnt_csa_avx2(&twosA, &v1, _mm256_loadu_si256(data + i + 12), _mm256_loadu_si256(data + i + 13)); 1631 | STORM_pospopcnt_csa_avx2(&twosB, &v1, _mm256_loadu_si256(data + i + 14), _mm256_loadu_si256(data + i + 15)); 1632 | STORM_pospopcnt_csa_avx2(&foursB, &v2, twosA, twosB); 1633 | STORM_pospopcnt_csa_avx2(&eightsB,&v4, foursA, foursB); 1634 | U(0) U(1) U(2) U(3) U(4) U(5) U(6) U(7) U(8) U(9) U(10) U(11) U(12) U(13) U(14) U(15) // Updates 1635 | STORM_pospopcnt_csa_avx2(&v16, &v8, eightsA, eightsB); 1636 | #undef U 1637 | } 1638 | 1639 | // update the counters after the last iteration 1640 | for (size_t i = 0; i < 16; ++i) { 1641 | counter[i] = _mm256_add_epi16(counter[i], _mm256_and_si256(v16, one)); 1642 | v16 = _mm256_srli_epi16(v16, 1); 1643 | } 1644 | 1645 | for (size_t i = 0; i < 16; ++i) { 1646 | _mm256_storeu_si256((__m256i*)buffer, counter[i]); 1647 | for (size_t z = 0; z < 16; z++) { 1648 | out[i] += 16 * (uint32_t)buffer[z]; 1649 | } 1650 | } 1651 | } 1652 | 1653 | _mm256_storeu_si256((__m256i*)buffer, v1); 1654 | for (size_t i = 0; i < 16; ++i) { 1655 | for (int j = 0; j < 16; ++j) { 1656 | out[j] += ((buffer[i] & (1 << j)) >> j); 1657 | } 1658 | } 1659 | 1660 | _mm256_storeu_si256((__m256i*)buffer, v2); 1661 | for (size_t i = 0; i < 16; ++i) { 1662 | for (int j = 0; j < 16; ++j) { 1663 | out[j] += 2 * ((buffer[i] & (1 << j)) >> j); 1664 | } 1665 | } 1666 | _mm256_storeu_si256((__m256i*)buffer, v4); 1667 | for (size_t i = 0; i < 16; ++i) { 1668 | for (int j = 0; j < 16; ++j) { 1669 | out[j] += 4 * ((buffer[i] & (1 << j)) >> j); 1670 | } 1671 | } 1672 | _mm256_storeu_si256((__m256i*)buffer, v8); 1673 | for (size_t i = 0; i < 16; ++i) { 1674 | for (int j = 0; j < 16; ++j) { 1675 | out[j] += 8 * ((buffer[i] & (1 << j)) >> j); 1676 | } 1677 | } 1678 | return 0; 1679 | } 1680 | 1681 | 1682 | STORM_TARGET("avx2") 1683 | static 1684 | 
__m256i STORM_popcnt256(__m256i v) { 1685 | __m256i lookup1 = _mm256_setr_epi8( 1686 | 4, 5, 5, 6, 5, 6, 6, 7, 1687 | 5, 6, 6, 7, 6, 7, 7, 8, 1688 | 4, 5, 5, 6, 5, 6, 6, 7, 1689 | 5, 6, 6, 7, 6, 7, 7, 8 1690 | ); 1691 | 1692 | __m256i lookup2 = _mm256_setr_epi8( 1693 | 4, 3, 3, 2, 3, 2, 2, 1, 1694 | 3, 2, 2, 1, 2, 1, 1, 0, 1695 | 4, 3, 3, 2, 3, 2, 2, 1, 1696 | 3, 2, 2, 1, 2, 1, 1, 0 1697 | ); 1698 | 1699 | __m256i low_mask = _mm256_set1_epi8(0x0f); 1700 | __m256i lo = _mm256_and_si256(v, low_mask); 1701 | __m256i hi = _mm256_and_si256(_mm256_srli_epi16(v, 4), low_mask); 1702 | __m256i popcnt1 = _mm256_shuffle_epi8(lookup1, lo); 1703 | __m256i popcnt2 = _mm256_shuffle_epi8(lookup2, hi); 1704 | 1705 | return _mm256_sad_epu8(popcnt1, popcnt2); 1706 | } 1707 | 1708 | // modified from https://github.com/WojciechMula/sse-popcount 1709 | STORM_TARGET("avx2") 1710 | static 1711 | uint64_t STORM_intersect_count_lookup_avx2_func(const uint8_t* STORM_RESTRICT data1, 1712 | const uint8_t* STORM_RESTRICT data2, 1713 | const size_t n) 1714 | { 1715 | 1716 | size_t i = 0; 1717 | 1718 | const __m256i lookup = _mm256_setr_epi8( 1719 | /* 0 */ 0, /* 1 */ 1, /* 2 */ 1, /* 3 */ 2, 1720 | /* 4 */ 1, /* 5 */ 2, /* 6 */ 2, /* 7 */ 3, 1721 | /* 8 */ 1, /* 9 */ 2, /* a */ 2, /* b */ 3, 1722 | /* c */ 2, /* d */ 3, /* e */ 3, /* f */ 4, 1723 | 1724 | /* 0 */ 0, /* 1 */ 1, /* 2 */ 1, /* 3 */ 2, 1725 | /* 4 */ 1, /* 5 */ 2, /* 6 */ 2, /* 7 */ 3, 1726 | /* 8 */ 1, /* 9 */ 2, /* a */ 2, /* b */ 3, 1727 | /* c */ 2, /* d */ 3, /* e */ 3, /* f */ 4 1728 | ); 1729 | 1730 | const __m256i low_mask = _mm256_set1_epi8(0x0f); 1731 | 1732 | __m256i acc = _mm256_setzero_si256(); 1733 | 1734 | #define ITER { \ 1735 | const __m256i vec = _mm256_and_si256(_mm256_loadu_si256((const __m256i*)(data1 + i)), \ 1736 | _mm256_loadu_si256((const __m256i*)(data2 + i))); \ 1737 | const __m256i lo = _mm256_and_si256(vec, low_mask); \ 1738 | const __m256i hi = _mm256_and_si256(_mm256_srli_epi16(vec, 4), low_mask); \ 1739 | const __m256i popcnt1 = _mm256_shuffle_epi8(lookup, lo); \ 1740 | const __m256i popcnt2 = _mm256_shuffle_epi8(lookup, hi); \ 1741 | local = _mm256_add_epi8(local, popcnt1); \ 1742 | local = _mm256_add_epi8(local, popcnt2); \ 1743 | i += 32; \ 1744 | } 1745 | 1746 | while (i + 8*32 <= n) { 1747 | __m256i local = _mm256_setzero_si256(); 1748 | ITER ITER ITER ITER 1749 | ITER ITER ITER ITER 1750 | acc = _mm256_add_epi64(acc, _mm256_sad_epu8(local, _mm256_setzero_si256())); 1751 | } 1752 | 1753 | __m256i local = _mm256_setzero_si256(); 1754 | 1755 | while (i + 32 <= n) { 1756 | ITER; 1757 | } 1758 | 1759 | acc = _mm256_add_epi64(acc, _mm256_sad_epu8(local, _mm256_setzero_si256())); 1760 | 1761 | #undef ITER 1762 | 1763 | uint64_t result = 0; 1764 | 1765 | result += (uint64_t)(_mm256_extract_epi64(acc, 0)); 1766 | result += (uint64_t)(_mm256_extract_epi64(acc, 1)); 1767 | result += (uint64_t)(_mm256_extract_epi64(acc, 2)); 1768 | result += (uint64_t)(_mm256_extract_epi64(acc, 3)); 1769 | 1770 | for (/**/; i < n; ++i) { 1771 | result += STORM_popcnt_lookup8bit[data1[i] & data2[i]]; 1772 | } 1773 | 1774 | return result; 1775 | } 1776 | 1777 | // modified from https://github.com/WojciechMula/sse-popcount 1778 | STORM_TARGET("avx2") 1779 | static 1780 | uint64_t STORM_union_count_lookup_avx2_func(const uint8_t* STORM_RESTRICT data1, 1781 | const uint8_t* STORM_RESTRICT data2, 1782 | const size_t n) 1783 | { 1784 | 1785 | size_t i = 0; 1786 | 1787 | const __m256i lookup = _mm256_setr_epi8( 1788 | /* 0 */ 0, /* 1 */ 1, /* 2 */ 1, /* 3 
*/ 2, 1789 | /* 4 */ 1, /* 5 */ 2, /* 6 */ 2, /* 7 */ 3, 1790 | /* 8 */ 1, /* 9 */ 2, /* a */ 2, /* b */ 3, 1791 | /* c */ 2, /* d */ 3, /* e */ 3, /* f */ 4, 1792 | 1793 | /* 0 */ 0, /* 1 */ 1, /* 2 */ 1, /* 3 */ 2, 1794 | /* 4 */ 1, /* 5 */ 2, /* 6 */ 2, /* 7 */ 3, 1795 | /* 8 */ 1, /* 9 */ 2, /* a */ 2, /* b */ 3, 1796 | /* c */ 2, /* d */ 3, /* e */ 3, /* f */ 4 1797 | ); 1798 | 1799 | const __m256i low_mask = _mm256_set1_epi8(0x0f); 1800 | 1801 | __m256i acc = _mm256_setzero_si256(); 1802 | 1803 | #define ITER { \ 1804 | const __m256i vec = _mm256_or_si256(_mm256_loadu_si256((const __m256i*)(data1 + i)), \ 1805 | _mm256_loadu_si256((const __m256i*)(data2 + i))); \ 1806 | const __m256i lo = _mm256_and_si256(vec, low_mask); \ 1807 | const __m256i hi = _mm256_and_si256(_mm256_srli_epi16(vec, 4), low_mask); \ 1808 | const __m256i popcnt1 = _mm256_shuffle_epi8(lookup, lo); \ 1809 | const __m256i popcnt2 = _mm256_shuffle_epi8(lookup, hi); \ 1810 | local = _mm256_add_epi8(local, popcnt1); \ 1811 | local = _mm256_add_epi8(local, popcnt2); \ 1812 | i += 32; \ 1813 | } 1814 | 1815 | while (i + 8*32 <= n) { 1816 | __m256i local = _mm256_setzero_si256(); 1817 | ITER ITER ITER ITER 1818 | ITER ITER ITER ITER 1819 | acc = _mm256_add_epi64(acc, _mm256_sad_epu8(local, _mm256_setzero_si256())); 1820 | } 1821 | 1822 | __m256i local = _mm256_setzero_si256(); 1823 | 1824 | while (i + 32 <= n) { 1825 | ITER; 1826 | } 1827 | 1828 | acc = _mm256_add_epi64(acc, _mm256_sad_epu8(local, _mm256_setzero_si256())); 1829 | 1830 | #undef ITER 1831 | 1832 | uint64_t result = 0; 1833 | 1834 | result += (uint64_t)(_mm256_extract_epi64(acc, 0)); 1835 | result += (uint64_t)(_mm256_extract_epi64(acc, 1)); 1836 | result += (uint64_t)(_mm256_extract_epi64(acc, 2)); 1837 | result += (uint64_t)(_mm256_extract_epi64(acc, 3)); 1838 | 1839 | for (/**/; i < n; ++i) { 1840 | result += STORM_popcnt_lookup8bit[data1[i] | data2[i]]; 1841 | } 1842 | 1843 | return result; 1844 | } 1845 | 1846 | // modified from https://github.com/WojciechMula/sse-popcount 1847 | STORM_TARGET("avx2") 1848 | static 1849 | uint64_t STORM_diff_count_lookup_avx2_func(const uint8_t* STORM_RESTRICT data1, 1850 | const uint8_t* STORM_RESTRICT data2, 1851 | const size_t n) 1852 | { 1853 | 1854 | size_t i = 0; 1855 | 1856 | const __m256i lookup = _mm256_setr_epi8( 1857 | /* 0 */ 0, /* 1 */ 1, /* 2 */ 1, /* 3 */ 2, 1858 | /* 4 */ 1, /* 5 */ 2, /* 6 */ 2, /* 7 */ 3, 1859 | /* 8 */ 1, /* 9 */ 2, /* a */ 2, /* b */ 3, 1860 | /* c */ 2, /* d */ 3, /* e */ 3, /* f */ 4, 1861 | 1862 | /* 0 */ 0, /* 1 */ 1, /* 2 */ 1, /* 3 */ 2, 1863 | /* 4 */ 1, /* 5 */ 2, /* 6 */ 2, /* 7 */ 3, 1864 | /* 8 */ 1, /* 9 */ 2, /* a */ 2, /* b */ 3, 1865 | /* c */ 2, /* d */ 3, /* e */ 3, /* f */ 4 1866 | ); 1867 | 1868 | const __m256i low_mask = _mm256_set1_epi8(0x0f); 1869 | 1870 | __m256i acc = _mm256_setzero_si256(); 1871 | 1872 | #define ITER { \ 1873 | const __m256i vec = _mm256_xor_si256(_mm256_loadu_si256((const __m256i*)(data1 + i)), \ 1874 | _mm256_loadu_si256((const __m256i*)(data2 + i))); \ 1875 | const __m256i lo = _mm256_and_si256(vec, low_mask); \ 1876 | const __m256i hi = _mm256_and_si256(_mm256_srli_epi16(vec, 4), low_mask); \ 1877 | const __m256i popcnt1 = _mm256_shuffle_epi8(lookup, lo); \ 1878 | const __m256i popcnt2 = _mm256_shuffle_epi8(lookup, hi); \ 1879 | local = _mm256_add_epi8(local, popcnt1); \ 1880 | local = _mm256_add_epi8(local, popcnt2); \ 1881 | i += 32; \ 1882 | } 1883 | 1884 | while (i + 8*32 <= n) { 1885 | __m256i local = _mm256_setzero_si256(); 1886 | 
ITER ITER ITER ITER 1887 | ITER ITER ITER ITER 1888 | acc = _mm256_add_epi64(acc, _mm256_sad_epu8(local, _mm256_setzero_si256())); 1889 | } 1890 | 1891 | __m256i local = _mm256_setzero_si256(); 1892 | 1893 | while (i + 32 <= n) { 1894 | ITER; 1895 | } 1896 | 1897 | acc = _mm256_add_epi64(acc, _mm256_sad_epu8(local, _mm256_setzero_si256())); 1898 | 1899 | #undef ITER 1900 | 1901 | uint64_t result = 0; 1902 | 1903 | result += (uint64_t)(_mm256_extract_epi64(acc, 0)); 1904 | result += (uint64_t)(_mm256_extract_epi64(acc, 1)); 1905 | result += (uint64_t)(_mm256_extract_epi64(acc, 2)); 1906 | result += (uint64_t)(_mm256_extract_epi64(acc, 3)); 1907 | 1908 | for (/**/; i < n; ++i) { 1909 | result += STORM_popcnt_lookup8bit[data1[i] ^ data2[i]]; 1910 | } 1911 | 1912 | return result; 1913 | } 1914 | 1915 | STORM_TARGET("avx2") 1916 | static 1917 | uint64_t STORM_popcnt_csa_avx2(const __m256i* data, uint64_t size) 1918 | { 1919 | __m256i cnt = _mm256_setzero_si256(); 1920 | __m256i ones = _mm256_setzero_si256(); 1921 | __m256i twos = _mm256_setzero_si256(); 1922 | __m256i fours = _mm256_setzero_si256(); 1923 | __m256i eights = _mm256_setzero_si256(); 1924 | __m256i sixteens = _mm256_setzero_si256(); 1925 | __m256i twosA, twosB, foursA, foursB, eightsA, eightsB; 1926 | 1927 | uint64_t i = 0; 1928 | uint64_t limit = size - size % 16; 1929 | uint64_t* cnt64; 1930 | 1931 | #define LOAD(a) (_mm256_loadu_si256(&data[i+a])) 1932 | 1933 | for (/**/; i < limit; i += 16) { 1934 | STORM_CSA256(&twosA, &ones, ones, LOAD(0), LOAD(1)); 1935 | STORM_CSA256(&twosB, &ones, ones, LOAD(2), LOAD(3)); 1936 | STORM_CSA256(&foursA, &twos, twos, twosA, twosB); 1937 | STORM_CSA256(&twosA, &ones, ones, LOAD(4), LOAD(5)); 1938 | STORM_CSA256(&twosB, &ones, ones, LOAD(6), LOAD(7)); 1939 | STORM_CSA256(&foursB, &twos, twos, twosA, twosB); 1940 | STORM_CSA256(&eightsA, &fours, fours, foursA, foursB); 1941 | STORM_CSA256(&twosA, &ones, ones, LOAD(8), LOAD(9)); 1942 | STORM_CSA256(&twosB, &ones, ones, LOAD(10), LOAD(11)); 1943 | STORM_CSA256(&foursA, &twos, twos, twosA, twosB); 1944 | STORM_CSA256(&twosA, &ones, ones, LOAD(12), LOAD(13)); 1945 | STORM_CSA256(&twosB, &ones, ones, LOAD(14), LOAD(15)); 1946 | STORM_CSA256(&foursB, &twos, twos, twosA, twosB); 1947 | STORM_CSA256(&eightsB, &fours, fours, foursA, foursB); 1948 | STORM_CSA256(&sixteens, &eights, eights, eightsA, eightsB); 1949 | 1950 | cnt = _mm256_add_epi64(cnt, STORM_popcnt256(sixteens)); 1951 | } 1952 | #undef LOAD 1953 | 1954 | cnt = _mm256_slli_epi64(cnt, 4); 1955 | cnt = _mm256_add_epi64(cnt, _mm256_slli_epi64(STORM_popcnt256(eights), 3)); 1956 | cnt = _mm256_add_epi64(cnt, _mm256_slli_epi64(STORM_popcnt256(fours), 2)); 1957 | cnt = _mm256_add_epi64(cnt, _mm256_slli_epi64(STORM_popcnt256(twos), 1)); 1958 | cnt = _mm256_add_epi64(cnt, STORM_popcnt256(ones)); 1959 | 1960 | for (/**/; i < size; ++i) 1961 | cnt = _mm256_add_epi64(cnt, STORM_popcnt256(data[i])); 1962 | 1963 | cnt64 = (uint64_t*) &cnt; 1964 | 1965 | return cnt64[0] + 1966 | cnt64[1] + 1967 | cnt64[2] + 1968 | cnt64[3]; 1969 | } 1970 | 1971 | 1972 | /* 1973 | * AVX2 Harley-Seal popcount (4th iteration). 1974 | * The algorithm is based on the paper "Faster Population Counts 1975 | * using AVX2 Instructions" by Daniel Lemire, Nathan Kurz and 1976 | * Wojciech Mula (23 Nov 2016). 1977 | * @see https://arxiv.org/abs/1611.07612 1978 | */ 1979 | // In this version we perform the operation A&B as input into the CSA operator. 
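// For reference: STORM_CSA256 (and STORM_CSA512 below) implement a bitwise
// full adder (carry-save adder). For bit-vectors a, b, c it yields, per lane,
//     sum   = a ^ b ^ c                 (bits of weight 1)
//     carry = (a & b) | ((a ^ b) & c)   (bits of weight 2; the majority function)
// so ones/twos/fours/eights/sixteens hold bits of increasing weight, and the
// epilogue reconstructs the total as
//     16*(accumulated popcnt(sixteens)) + 8*popcnt(eights) + 4*popcnt(fours)
//       + 2*popcnt(twos) + popcnt(ones).
// A minimal scalar model of one CSA level (an illustrative sketch only; it is
// not referenced by any dispatch code in this header):
STORM_FORCE_INLINE
uint64_t STORM_popcnt_csa_u64_sketch(const uint64_t* data, size_t size) {
    uint64_t ones = 0, cnt = 0;
    size_t i = 0;
    for (/**/; i + 2 <= size; i += 2) {
        const uint64_t u     = ones ^ data[i];
        const uint64_t carry = (ones & data[i]) | (u & data[i+1]); // weight-2 bits
        ones = u ^ data[i+1];                                      // weight-1 bits
        cnt += 2 * STORM_popcount64(carry);
    }
    for (/**/; i < size; ++i)
        cnt += STORM_popcount64(data[i]);
    return cnt + STORM_popcount64(ones);
}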
1980 | STORM_TARGET("avx2") 1981 | static 1982 | uint64_t STORM_intersect_count_csa_avx2(const __m256i* STORM_RESTRICT data1, 1983 | const __m256i* STORM_RESTRICT data2, 1984 | size_t size) 1985 | { 1986 | __m256i cnt = _mm256_setzero_si256(); 1987 | __m256i ones = _mm256_setzero_si256(); 1988 | __m256i twos = _mm256_setzero_si256(); 1989 | __m256i fours = _mm256_setzero_si256(); 1990 | __m256i eights = _mm256_setzero_si256(); 1991 | __m256i sixteens = _mm256_setzero_si256(); 1992 | __m256i twosA, twosB, foursA, foursB, eightsA, eightsB; 1993 | 1994 | uint64_t i = 0; 1995 | uint64_t limit = size - size % 16; 1996 | uint64_t* cnt64; 1997 | 1998 | #define LOAD(a) (_mm256_loadu_si256(&data1[i+a]) & _mm256_loadu_si256(&data2[i+a])) 1999 | 2000 | for (/**/; i < limit; i += 16) { 2001 | STORM_CSA256(&twosA, &ones, ones, LOAD(0), LOAD(1)); 2002 | STORM_CSA256(&twosB, &ones, ones, LOAD(2), LOAD(3)); 2003 | STORM_CSA256(&foursA, &twos, twos, twosA, twosB); 2004 | STORM_CSA256(&twosA, &ones, ones, LOAD(4), LOAD(5)); 2005 | STORM_CSA256(&twosB, &ones, ones, LOAD(6), LOAD(7)); 2006 | STORM_CSA256(&foursB, &twos, twos, twosA, twosB); 2007 | STORM_CSA256(&eightsA, &fours, fours, foursA, foursB); 2008 | STORM_CSA256(&twosA, &ones, ones, LOAD(8), LOAD(9)); 2009 | STORM_CSA256(&twosB, &ones, ones, LOAD(10), LOAD(11)); 2010 | STORM_CSA256(&foursA, &twos, twos, twosA, twosB); 2011 | STORM_CSA256(&twosA, &ones, ones, LOAD(12), LOAD(13)); 2012 | STORM_CSA256(&twosB, &ones, ones, LOAD(14), LOAD(15)); 2013 | STORM_CSA256(&foursB, &twos, twos, twosA, twosB); 2014 | STORM_CSA256(&eightsB, &fours, fours, foursA, foursB); 2015 | STORM_CSA256(&sixteens,&eights, eights,eightsA,eightsB); 2016 | 2017 | cnt = _mm256_add_epi64(cnt, STORM_popcnt256(sixteens)); 2018 | } 2019 | #undef LOAD 2020 | 2021 | cnt = _mm256_slli_epi64(cnt, 4); 2022 | cnt = _mm256_add_epi64(cnt, _mm256_slli_epi64(STORM_popcnt256(eights), 3)); 2023 | cnt = _mm256_add_epi64(cnt, _mm256_slli_epi64(STORM_popcnt256(fours), 2)); 2024 | cnt = _mm256_add_epi64(cnt, _mm256_slli_epi64(STORM_popcnt256(twos), 1)); 2025 | cnt = _mm256_add_epi64(cnt, STORM_popcnt256(ones)); 2026 | 2027 | for (/**/; i < size; ++i) 2028 | cnt = _mm256_add_epi64(cnt, STORM_popcnt256(_mm256_loadu_si256(&data1[i]) & _mm256_loadu_si256(&data2[i]))); 2029 | 2030 | cnt64 = (uint64_t*) &cnt; 2031 | 2032 | return cnt64[0] + 2033 | cnt64[1] + 2034 | cnt64[2] + 2035 | cnt64[3]; 2036 | } 2037 | 2038 | // In this version we perform the operation A|B as input into the CSA operator. 
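// Fusing the set operation into LOAD means every pair of 256-bit blocks is
// read exactly once and the combined vector never touches memory again;
// materializing A op B into a temporary buffer and popcounting it in a second
// pass would roughly double the memory traffic of these bandwidth-bound loops.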
2039 | STORM_TARGET("avx2") 2040 | static 2041 | uint64_t STORM_union_count_csa_avx2(const __m256i* STORM_RESTRICT data1, 2042 | const __m256i* STORM_RESTRICT data2, 2043 | size_t size) 2044 | { 2045 | __m256i cnt = _mm256_setzero_si256(); 2046 | __m256i ones = _mm256_setzero_si256(); 2047 | __m256i twos = _mm256_setzero_si256(); 2048 | __m256i fours = _mm256_setzero_si256(); 2049 | __m256i eights = _mm256_setzero_si256(); 2050 | __m256i sixteens = _mm256_setzero_si256(); 2051 | __m256i twosA, twosB, foursA, foursB, eightsA, eightsB; 2052 | 2053 | uint64_t i = 0; 2054 | uint64_t limit = size - size % 16; 2055 | uint64_t* cnt64; 2056 | 2057 | #define LOAD(a) (_mm256_loadu_si256(&data1[i+a]) | _mm256_loadu_si256(&data2[i+a])) 2058 | 2059 | for (/**/; i < limit; i += 16) { 2060 | STORM_CSA256(&twosA, &ones, ones, LOAD(0), LOAD(1)); 2061 | STORM_CSA256(&twosB, &ones, ones, LOAD(2), LOAD(3)); 2062 | STORM_CSA256(&foursA, &twos, twos, twosA, twosB); 2063 | STORM_CSA256(&twosA, &ones, ones, LOAD(4), LOAD(5)); 2064 | STORM_CSA256(&twosB, &ones, ones, LOAD(6), LOAD(7)); 2065 | STORM_CSA256(&foursB, &twos, twos, twosA, twosB); 2066 | STORM_CSA256(&eightsA, &fours, fours, foursA, foursB); 2067 | STORM_CSA256(&twosA, &ones, ones, LOAD(8), LOAD(9)); 2068 | STORM_CSA256(&twosB, &ones, ones, LOAD(10), LOAD(11)); 2069 | STORM_CSA256(&foursA, &twos, twos, twosA, twosB); 2070 | STORM_CSA256(&twosA, &ones, ones, LOAD(12), LOAD(13)); 2071 | STORM_CSA256(&twosB, &ones, ones, LOAD(14), LOAD(15)); 2072 | STORM_CSA256(&foursB, &twos, twos, twosA, twosB); 2073 | STORM_CSA256(&eightsB, &fours, fours, foursA, foursB); 2074 | STORM_CSA256(&sixteens,&eights, eights,eightsA,eightsB); 2075 | 2076 | cnt = _mm256_add_epi64(cnt, STORM_popcnt256(sixteens)); 2077 | } 2078 | #undef LOAD 2079 | 2080 | cnt = _mm256_slli_epi64(cnt, 4); 2081 | cnt = _mm256_add_epi64(cnt, _mm256_slli_epi64(STORM_popcnt256(eights), 3)); 2082 | cnt = _mm256_add_epi64(cnt, _mm256_slli_epi64(STORM_popcnt256(fours), 2)); 2083 | cnt = _mm256_add_epi64(cnt, _mm256_slli_epi64(STORM_popcnt256(twos), 1)); 2084 | cnt = _mm256_add_epi64(cnt, STORM_popcnt256(ones)); 2085 | 2086 | for (/**/; i < size; ++i) 2087 | cnt = _mm256_add_epi64(cnt, STORM_popcnt256(_mm256_loadu_si256(&data1[i]) | _mm256_loadu_si256(&data2[i]))); 2088 | 2089 | cnt64 = (uint64_t*) &cnt; 2090 | 2091 | return cnt64[0] + 2092 | cnt64[1] + 2093 | cnt64[2] + 2094 | cnt64[3]; 2095 | } 2096 | 2097 | // In this version we perform the operation A^B as input into the CSA operator. 
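// popcnt(A ^ B) is the Hamming distance between the two bitmaps. When several
// of these counts are needed for the same pair, the identities
//     popcnt(A | B) = popcnt(A) + popcnt(B) - popcnt(A & B)
//     popcnt(A ^ B) = popcnt(A) + popcnt(B) - 2*popcnt(A & B)
// let one fused pass plus the per-bitmap popcounts stand in for extra passes.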
2098 | STORM_TARGET("avx2") 2099 | static 2100 | uint64_t STORM_diff_count_csa_avx2(const __m256i* STORM_RESTRICT data1, 2101 | const __m256i* STORM_RESTRICT data2, 2102 | size_t size) 2103 | { 2104 | __m256i cnt = _mm256_setzero_si256(); 2105 | __m256i ones = _mm256_setzero_si256(); 2106 | __m256i twos = _mm256_setzero_si256(); 2107 | __m256i fours = _mm256_setzero_si256(); 2108 | __m256i eights = _mm256_setzero_si256(); 2109 | __m256i sixteens = _mm256_setzero_si256(); 2110 | __m256i twosA, twosB, foursA, foursB, eightsA, eightsB; 2111 | 2112 | uint64_t i = 0; 2113 | uint64_t limit = size - size % 16; 2114 | uint64_t* cnt64; 2115 | 2116 | #define LOAD(a) (_mm256_loadu_si256(&data1[i+a]) ^ _mm256_loadu_si256(&data2[i+a])) 2117 | 2118 | for (/**/; i < limit; i += 16) { 2119 | STORM_CSA256(&twosA, &ones, ones, LOAD(0), LOAD(1)); 2120 | STORM_CSA256(&twosB, &ones, ones, LOAD(2), LOAD(3)); 2121 | STORM_CSA256(&foursA, &twos, twos, twosA, twosB); 2122 | STORM_CSA256(&twosA, &ones, ones, LOAD(4), LOAD(5)); 2123 | STORM_CSA256(&twosB, &ones, ones, LOAD(6), LOAD(7)); 2124 | STORM_CSA256(&foursB, &twos, twos, twosA, twosB); 2125 | STORM_CSA256(&eightsA, &fours, fours, foursA, foursB); 2126 | STORM_CSA256(&twosA, &ones, ones, LOAD(8), LOAD(9)); 2127 | STORM_CSA256(&twosB, &ones, ones, LOAD(10), LOAD(11)); 2128 | STORM_CSA256(&foursA, &twos, twos, twosA, twosB); 2129 | STORM_CSA256(&twosA, &ones, ones, LOAD(12), LOAD(13)); 2130 | STORM_CSA256(&twosB, &ones, ones, LOAD(14), LOAD(15)); 2131 | STORM_CSA256(&foursB, &twos, twos, twosA, twosB); 2132 | STORM_CSA256(&eightsB, &fours, fours, foursA, foursB); 2133 | STORM_CSA256(&sixteens,&eights, eights,eightsA,eightsB); 2134 | 2135 | cnt = _mm256_add_epi64(cnt, STORM_popcnt256(sixteens)); 2136 | } 2137 | #undef LOAD 2138 | 2139 | cnt = _mm256_slli_epi64(cnt, 4); 2140 | cnt = _mm256_add_epi64(cnt, _mm256_slli_epi64(STORM_popcnt256(eights), 3)); 2141 | cnt = _mm256_add_epi64(cnt, _mm256_slli_epi64(STORM_popcnt256(fours), 2)); 2142 | cnt = _mm256_add_epi64(cnt, _mm256_slli_epi64(STORM_popcnt256(twos), 1)); 2143 | cnt = _mm256_add_epi64(cnt, STORM_popcnt256(ones)); 2144 | 2145 | for (/**/; i < size; ++i) 2146 | cnt = _mm256_add_epi64(cnt, STORM_popcnt256(_mm256_loadu_si256(&data1[i]) ^ _mm256_loadu_si256(&data2[i]))); 2147 | 2148 | cnt64 = (uint64_t*) &cnt; 2149 | 2150 | return cnt64[0] + 2151 | cnt64[1] + 2152 | cnt64[2] + 2153 | cnt64[3]; 2154 | } 2155 | 2156 | STORM_TARGET("avx2") 2157 | static 2158 | uint64_t STORM_intersect_count_avx2(const uint64_t* STORM_RESTRICT b1, 2159 | const uint64_t* STORM_RESTRICT b2, 2160 | const size_t n_ints) 2161 | { 2162 | uint64_t count = 0; 2163 | const __m256i* r1 = (__m256i*)b1; 2164 | const __m256i* r2 = (__m256i*)b2; 2165 | const uint32_t n_cycles = n_ints / 4; 2166 | 2167 | count += STORM_intersect_count_csa_avx2(r1, r2, n_cycles); 2168 | 2169 | for (int i = n_cycles*4; i < n_ints; ++i) { 2170 | count += STORM_POPCOUNT(b1[i] & b2[i]); 2171 | } 2172 | 2173 | return(count); 2174 | } 2175 | 2176 | STORM_TARGET("avx2") 2177 | static 2178 | uint64_t STORM_union_count_avx2(const uint64_t* STORM_RESTRICT b1, 2179 | const uint64_t* STORM_RESTRICT b2, 2180 | const size_t n_ints) 2181 | { 2182 | uint64_t count = 0; 2183 | const __m256i* r1 = (__m256i*)b1; 2184 | const __m256i* r2 = (__m256i*)b2; 2185 | const uint32_t n_cycles = n_ints / 4; 2186 | 2187 | count += STORM_union_count_csa_avx2(r1, r2, n_cycles); 2188 | 2189 | for (int i = n_cycles*4; i < n_ints; ++i) { 2190 | count += STORM_POPCOUNT(b1[i] | b2[i]); 2191 | } 2192 | 
2193 | return(count);
2194 | }
2195 | 
2196 | STORM_TARGET("avx2")
2197 | static
2198 | uint64_t STORM_diff_count_avx2(const uint64_t* STORM_RESTRICT b1,
2199 | const uint64_t* STORM_RESTRICT b2,
2200 | const size_t n_ints)
2201 | {
2202 | uint64_t count = 0;
2203 | const __m256i* r1 = (__m256i*)b1;
2204 | const __m256i* r2 = (__m256i*)b2;
2205 | const uint32_t n_cycles = n_ints / 4;
2206 | 
2207 | count += STORM_diff_count_csa_avx2(r1, r2, n_cycles);
2208 | 
2209 | for (int i = n_cycles*4; i < n_ints; ++i) {
2210 | count += STORM_POPCOUNT(b1[i] ^ b2[i]);
2211 | }
2212 | 
2213 | return(count);
2214 | }
2215 | 
2216 | STORM_TARGET("avx2")
2217 | static
2218 | uint64_t STORM_intersect_count_lookup_avx2(const uint64_t* STORM_RESTRICT b1,
2219 | const uint64_t* STORM_RESTRICT b2,
2220 | const size_t n_ints)
2221 | {
2222 | return STORM_intersect_count_lookup_avx2_func((uint8_t*)b1, (uint8_t*)b2, n_ints*8);
2223 | }
2224 | 
2225 | STORM_TARGET("avx2")
2226 | static
2227 | uint64_t STORM_union_count_lookup_avx2(const uint64_t* STORM_RESTRICT b1,
2228 | const uint64_t* STORM_RESTRICT b2,
2229 | const size_t n_ints)
2230 | {
2231 | return STORM_union_count_lookup_avx2_func((uint8_t*)b1, (uint8_t*)b2, n_ints*8);
2232 | }
2233 | 
2234 | STORM_TARGET("avx2")
2235 | static
2236 | uint64_t STORM_diff_count_lookup_avx2(const uint64_t* STORM_RESTRICT b1,
2237 | const uint64_t* STORM_RESTRICT b2,
2238 | const size_t n_ints)
2239 | {
2240 | return STORM_diff_count_lookup_avx2_func((uint8_t*)b1, (uint8_t*)b2, n_ints*8);
2241 | }
2242 | 
2243 | STORM_TARGET("avx2")
2244 | static
2245 | uint64_t STORM_popcnt_avx2(const uint64_t* data,
2246 | const size_t n_ints)
2247 | {
2248 | uint64_t count = 0;
2249 | const uint32_t n_cycles = n_ints / 4;
2250 | const uint32_t n_cycles_sse = (n_ints % 4) / 2;
2251 | 
2252 | const __m256i* r1 = (__m256i*)&data[0];
2253 | const __m128i* r2 = (__m128i*)&data[n_cycles*4];
2254 | 
2255 | count += STORM_popcnt_csa_avx2(r1, n_cycles);
2256 | count += STORM_popcnt_csa_sse4(r2, n_cycles_sse);
2257 | 
2258 | for (int i = (4*n_cycles + 2*n_cycles_sse); i < n_ints; ++i) {
2259 | count += STORM_POPCOUNT(data[i]);
2260 | }
2261 | 
2262 | return count;
2263 | }
2264 | #endif
2265 | 
2266 | /****************************
2267 |  * AVX512BW functions
2268 |  ****************************/
2269 | 
2270 | #if defined(STORM_HAVE_AVX512)
2271 | 
2272 | #include <immintrin.h>
2273 | 
2274 | STORM_TARGET("avx512bw")
2275 | STORM_FORCE_INLINE
2276 | __m512i STORM_popcnt512(__m512i v) {
2277 | __m512i m1 = _mm512_set1_epi8(0x55);
2278 | __m512i m2 = _mm512_set1_epi8(0x33);
2279 | __m512i m4 = _mm512_set1_epi8(0x0F);
2280 | __m512i t1 = _mm512_sub_epi8(v, (_mm512_srli_epi16(v, 1) & m1));
2281 | __m512i t2 = _mm512_add_epi8(t1 & m2, (_mm512_srli_epi16(t1, 2) & m2));
2282 | __m512i t3 = _mm512_add_epi8(t2, _mm512_srli_epi16(t2, 4)) & m4;
2283 | 
2284 | return _mm512_sad_epu8(t3, _mm512_setzero_si512());
2285 | }
2286 | 
2287 | STORM_TARGET("avx512bw")
2288 | STORM_FORCE_INLINE
2289 | void STORM_CSA512(__m512i* h, __m512i* l, __m512i a, __m512i b, __m512i c) {
2290 | *l = _mm512_ternarylogic_epi32(c, b, a, 0x96);
2291 | *h = _mm512_ternarylogic_epi32(c, b, a, 0xe8);
2292 | }
2293 | 
2294 | // By Wojciech Muła
2295 | // @see https://github.com/WojciechMula/sse-popcount/blob/master/popcnt-avx512-harley-seal.cpp#L3
2296 | // @see https://arxiv.org/abs/1611.07612
2297 | STORM_TARGET("avx512bw")
2298 | STORM_FORCE_INLINE
2299 | __m512i STORM_avx512_popcount(const __m512i v) {
2300 | const __m512i m1 = _mm512_set1_epi8(0x55); // 01010101
2301 | const __m512i m2 = _mm512_set1_epi8(0x33); // 00110011 2302 | const __m512i m4 = _mm512_set1_epi8(0x0F); // 00001111 2303 | 2304 | const __m512i t1 = _mm512_sub_epi8(v, (_mm512_srli_epi16(v, 1) & m1)); 2305 | const __m512i t2 = _mm512_add_epi8(t1 & m2, (_mm512_srli_epi16(t1, 2) & m2)); 2306 | const __m512i t3 = _mm512_add_epi8(t2, _mm512_srli_epi16(t2, 4)) & m4; 2307 | return _mm512_sad_epu8(t3, _mm512_setzero_si512()); 2308 | } 2309 | 2310 | // 512i-version of carry-save adder subroutine. 2311 | STORM_TARGET("avx512bw") 2312 | STORM_FORCE_INLINE 2313 | void STORM_pospopcnt_csa_avx512(__m512i* STORM_RESTRICT h, 2314 | __m512i* STORM_RESTRICT l, 2315 | __m512i b, __m512i c) 2316 | { 2317 | *h = _mm512_ternarylogic_epi32(c, b, *l, 0xE8); // 11101000 2318 | *l = _mm512_ternarylogic_epi32(c, b, *l, 0x96); // 10010110 2319 | } 2320 | 2321 | STORM_TARGET("avx512bw") 2322 | static 2323 | uint64_t STORM_popcnt_csa_avx512bw(const __m512i* STORM_RESTRICT data, size_t size) 2324 | { 2325 | __m512i cnt = _mm512_setzero_si512(); 2326 | __m512i ones = _mm512_setzero_si512(); 2327 | __m512i twos = _mm512_setzero_si512(); 2328 | __m512i fours = _mm512_setzero_si512(); 2329 | __m512i eights = _mm512_setzero_si512(); 2330 | __m512i sixteens = _mm512_setzero_si512(); 2331 | __m512i twosA, twosB, foursA, foursB, eightsA, eightsB; 2332 | 2333 | uint64_t i = 0; 2334 | uint64_t limit = size - size % 16; 2335 | uint64_t* cnt64; 2336 | 2337 | #define LOAD(a) (_mm512_loadu_si512(&data[i+a])) 2338 | 2339 | for (/**/; i < limit; i += 16) { 2340 | STORM_CSA512(&twosA, &ones, ones, LOAD(0), LOAD(1)); 2341 | STORM_CSA512(&twosB, &ones, ones, LOAD(2), LOAD(3)); 2342 | STORM_CSA512(&foursA, &twos, twos, twosA, twosB); 2343 | STORM_CSA512(&twosA, &ones, ones, LOAD(4), LOAD(5)); 2344 | STORM_CSA512(&twosB, &ones, ones, LOAD(6), LOAD(7)); 2345 | STORM_CSA512(&foursB, &twos, twos, twosA, twosB); 2346 | STORM_CSA512(&eightsA, &fours, fours, foursA, foursB); 2347 | STORM_CSA512(&twosA, &ones, ones, LOAD(8), LOAD(9)); 2348 | STORM_CSA512(&twosB, &ones, ones, LOAD(10), LOAD(11)); 2349 | STORM_CSA512(&foursA, &twos, twos, twosA, twosB); 2350 | STORM_CSA512(&twosA, &ones, ones, LOAD(12), LOAD(13)); 2351 | STORM_CSA512(&twosB, &ones, ones, LOAD(14), LOAD(15)); 2352 | STORM_CSA512(&foursB, &twos, twos, twosA, twosB); 2353 | STORM_CSA512(&eightsB, &fours, fours, foursA, foursB); 2354 | STORM_CSA512(&sixteens,&eights, eights,eightsA,eightsB); 2355 | 2356 | cnt = _mm512_add_epi64(cnt, STORM_popcnt512(sixteens)); 2357 | } 2358 | #undef LOAD 2359 | 2360 | cnt = _mm512_slli_epi64(cnt, 4); 2361 | cnt = _mm512_add_epi64(cnt, _mm512_slli_epi64(STORM_popcnt512(eights), 3)); 2362 | cnt = _mm512_add_epi64(cnt, _mm512_slli_epi64(STORM_popcnt512(fours), 2)); 2363 | cnt = _mm512_add_epi64(cnt, _mm512_slli_epi64(STORM_popcnt512(twos), 1)); 2364 | cnt = _mm512_add_epi64(cnt, STORM_popcnt512(ones)); 2365 | 2366 | for (/**/; i < size; ++i) 2367 | cnt = _mm512_add_epi64(cnt, STORM_popcnt512(_mm512_loadu_si512(&data[i]))); 2368 | 2369 | cnt64 = (uint64_t*)&cnt; 2370 | 2371 | return cnt64[0] + 2372 | cnt64[1] + 2373 | cnt64[2] + 2374 | cnt64[3] + 2375 | cnt64[4] + 2376 | cnt64[5] + 2377 | cnt64[6] + 2378 | cnt64[7]; 2379 | } 2380 | 2381 | STORM_TARGET("avx512bw") 2382 | static 2383 | int STORM_pospopcnt_u16_avx512bw_harvey_seal(const uint16_t* array, size_t len, uint32_t* out) { 2384 | for (uint32_t i = len - (len % (32 * 16)); i < len; ++i) { 2385 | for (int j = 0; j < 16; ++j) { 2386 | out[j] += ((array[i] & (1 << j)) >> j); 2387 | } 2388 | } 
2389 | 2390 | const __m512i* data = (const __m512i*)array; 2391 | __m512i v1 = _mm512_setzero_si512(); 2392 | __m512i v2 = _mm512_setzero_si512(); 2393 | __m512i v4 = _mm512_setzero_si512(); 2394 | __m512i v8 = _mm512_setzero_si512(); 2395 | __m512i v16 = _mm512_setzero_si512(); 2396 | __m512i twosA, twosB, foursA, foursB, eightsA, eightsB; 2397 | __m512i one = _mm512_set1_epi16(1); 2398 | __m512i counter[16]; 2399 | 2400 | const size_t size = len / 32; 2401 | const uint64_t limit = size - size % 16; 2402 | 2403 | uint16_t buffer[32]; 2404 | 2405 | uint64_t i = 0; 2406 | while (i < limit) { 2407 | for (size_t i = 0; i < 16; ++i) 2408 | counter[i] = _mm512_setzero_si512(); 2409 | 2410 | size_t thislimit = limit; 2411 | if (thislimit - i >= (1 << 16)) 2412 | thislimit = i + (1 << 16) - 1; 2413 | 2414 | for (/**/; i < thislimit; i += 16) { 2415 | #define U(pos) { \ 2416 | counter[pos] = _mm512_add_epi16(counter[pos], _mm512_and_si512(v16, _mm512_set1_epi16(1))); \ 2417 | v16 = _mm512_srli_epi16(v16, 1); \ 2418 | } 2419 | STORM_pospopcnt_csa_avx512(&twosA, &v1, _mm512_loadu_si512(data + i + 0), _mm512_loadu_si512(data + i + 1)); 2420 | STORM_pospopcnt_csa_avx512(&twosB, &v1, _mm512_loadu_si512(data + i + 2), _mm512_loadu_si512(data + i + 3)); 2421 | STORM_pospopcnt_csa_avx512(&foursA, &v2, twosA, twosB); 2422 | STORM_pospopcnt_csa_avx512(&twosA, &v1, _mm512_loadu_si512(data + i + 4), _mm512_loadu_si512(data + i + 5)); 2423 | STORM_pospopcnt_csa_avx512(&twosB, &v1, _mm512_loadu_si512(data + i + 6), _mm512_loadu_si512(data + i + 7)); 2424 | STORM_pospopcnt_csa_avx512(&foursB, &v2, twosA, twosB); 2425 | STORM_pospopcnt_csa_avx512(&eightsA, &v4, foursA, foursB); 2426 | STORM_pospopcnt_csa_avx512(&twosA, &v1, _mm512_loadu_si512(data + i + 8), _mm512_loadu_si512(data + i + 9)); 2427 | STORM_pospopcnt_csa_avx512(&twosB, &v1, _mm512_loadu_si512(data + i + 10), _mm512_loadu_si512(data + i + 11)); 2428 | STORM_pospopcnt_csa_avx512(&foursA, &v2, twosA, twosB); 2429 | STORM_pospopcnt_csa_avx512(&twosA, &v1, _mm512_loadu_si512(data + i + 12), _mm512_loadu_si512(data + i + 13)); 2430 | STORM_pospopcnt_csa_avx512(&twosB, &v1, _mm512_loadu_si512(data + i + 14), _mm512_loadu_si512(data + i + 15)); 2431 | STORM_pospopcnt_csa_avx512(&foursB, &v2, twosA, twosB); 2432 | STORM_pospopcnt_csa_avx512(&eightsB, &v4, foursA, foursB); 2433 | U(0) U(1) U(2) U(3) U(4) U(5) U(6) U(7) U(8) U(9) U(10) U(11) U(12) U(13) U(14) U(15) // Updates 2434 | STORM_pospopcnt_csa_avx512(&v16, &v8, eightsA, eightsB); 2435 | } 2436 | // Update the counters after the last iteration. 
2437 | for (size_t i = 0; i < 16; ++i) U(i) 2438 | #undef U 2439 | 2440 | for (size_t i = 0; i < 16; ++i) { 2441 | _mm512_storeu_si512((__m512i*)buffer, counter[i]); 2442 | for (size_t z = 0; z < 32; z++) { 2443 | out[i] += 16 * (uint32_t)buffer[z]; 2444 | } 2445 | } 2446 | } 2447 | 2448 | _mm512_storeu_si512((__m512i*)buffer, v1); 2449 | for (size_t i = 0; i < 32; ++i) { 2450 | for (int j = 0; j < 16; ++j) { 2451 | out[j] += 1 * ((buffer[i] & (1 << j)) >> j); 2452 | } 2453 | } 2454 | 2455 | _mm512_storeu_si512((__m512i*)buffer, v2); 2456 | for (size_t i = 0; i < 32; ++i) { 2457 | for (int j = 0; j < 16; ++j) { 2458 | out[j] += 2 * ((buffer[i] & (1 << j)) >> j); 2459 | } 2460 | } 2461 | 2462 | _mm512_storeu_si512((__m512i*)buffer, v4); 2463 | for (size_t i = 0; i < 32; ++i) { 2464 | for (int j = 0; j < 16; ++j) { 2465 | out[j] += 4 * ((buffer[i] & (1 << j)) >> j); 2466 | } 2467 | } 2468 | 2469 | _mm512_storeu_si512((__m512i*)buffer, v8); 2470 | for (size_t i = 0; i < 32; ++i) { 2471 | for (int j = 0; j < 16; ++j) { 2472 | out[j] += 8 * ((buffer[i] & (1 << j)) >> j); 2473 | } 2474 | } 2475 | 2476 | return 0; 2477 | } 2478 | 2479 | STORM_TARGET("avx512bw") 2480 | static 2481 | int STORM_pospopcnt_u16_avx512bw_blend_popcnt_unroll8(const uint16_t* data, size_t len, uint32_t* out) { 2482 | #define AND_OR 0xea // ternary function: (a & b) | c 2483 | const __m512i* data_vectors = (const __m512i*)(data); 2484 | const uint32_t n_cycles = len / 32; 2485 | 2486 | size_t i = 0; 2487 | for (/**/; i + 8 <= n_cycles; i += 8) { 2488 | #define L(p) __m512i v##p = _mm512_loadu_si512(data_vectors+i+p); 2489 | L(0) L(1) L(2) L(3) 2490 | L(4) L(5) L(6) L(7) 2491 | 2492 | #define U0(p,k) __m512i input##p = _mm512_ternarylogic_epi32(v##p, _mm512_set1_epi16(0x00FF), _mm512_slli_epi16(v##k, 8), AND_OR); 2493 | #define U1(p,k) __m512i input##k = _mm512_ternarylogic_epi32(v##p, _mm512_set1_epi16(0xFF00), _mm512_srli_epi16(v##k, 8), AND_OR); 2494 | #define U(p, k) U0(p,k) U1(p,k) 2495 | 2496 | U(0,1) U( 2, 3) U( 4, 5) U( 6, 7) 2497 | 2498 | for (int i = 0; i < 8; ++i) { 2499 | #define A0(p) out[ 7 - i] += _mm_popcnt_u64(_mm512_movepi8_mask(input##p)); 2500 | #define A1(k) out[15 - i] += _mm_popcnt_u64(_mm512_movepi8_mask(input##k)); 2501 | #define A(p, k) A0(p) A1(k) 2502 | A(0,1) A(2, 3) A(4,5) A(6, 7) 2503 | 2504 | #define P0(p) input##p = _mm512_add_epi8(input##p, input##p); 2505 | #define P(p, k) input##p = P0(p) P0(k) 2506 | 2507 | P(0,1) P(2, 3) P(4,5) P(6, 7) 2508 | } 2509 | } 2510 | 2511 | for (/**/; i + 4 <= n_cycles; i += 4) { 2512 | L(0) L(1) L(2) L(3) 2513 | U(0,1) U(2,3) 2514 | 2515 | for (int i = 0; i < 8; ++i) { 2516 | A(0,1) A(2, 3) 2517 | P(0,1) P(2, 3) 2518 | } 2519 | } 2520 | 2521 | for (/**/; i + 2 <= n_cycles; i += 2) { 2522 | L(0) L(1) 2523 | U(0,1) 2524 | 2525 | for (int i = 0; i < 8; ++i) { 2526 | A(0,1) 2527 | P(0,1) 2528 | } 2529 | } 2530 | 2531 | i *= 32; 2532 | for (/**/; i < len; ++i) { 2533 | for (int j = 0; j < 16; ++j) { 2534 | out[j] += ((data[i] & (1 << j)) >> j); 2535 | } 2536 | } 2537 | 2538 | #undef L 2539 | #undef U0 2540 | #undef U1 2541 | #undef U 2542 | #undef A0 2543 | #undef A1 2544 | #undef A 2545 | #undef P0 2546 | #undef P 2547 | #undef AND_OR 2548 | 2549 | return 0; 2550 | } 2551 | 2552 | STORM_TARGET("avx512bw") 2553 | static 2554 | int STORM_pospopcnt_u16_avx512bw_adder_forest(const uint16_t* array, size_t len, uint32_t* out) { 2555 | __m512i counters[16]; 2556 | 2557 | for (size_t i = 0; i < 16; ++i) { 2558 | counters[i] = _mm512_setzero_si512(); 2559 | } 2560 | 2561 
| const __m512i mask1bit = _mm512_set1_epi16(0x5555); // 0101010101010101 Pattern: 01 2562 | const __m512i mask2bit = _mm512_set1_epi16(0x3333); // 0011001100110011 Pattern: 0011 2563 | const __m512i mask4bit = _mm512_set1_epi16(0x0F0F); // 0000111100001111 Pattern: 00001111 2564 | const __m512i mask8bit = _mm512_set1_epi16(0x00FF); // 0000000011111111 Pattern: 0000000011111111 2565 | 2566 | const uint32_t n_cycles = len / (2048 * (16*32)); 2567 | const uint32_t n_total = len / (16*32); 2568 | uint16_t tmp[32]; 2569 | 2570 | /*------ Macros --------*/ 2571 | #define LE(i,p,k) const __m512i sum##p##k##_##i##bit_even = _mm512_add_epi8(input##p & mask##i##bit, input##k & mask##i##bit); 2572 | #define LO(i,p,k) const __m512i sum##p##k##_##i##bit_odd = _mm512_add_epi8(_mm512_srli_epi16(input##p, i) & mask##i##bit, _mm512_srli_epi16(input##k, i) & mask##i##bit); 2573 | 2574 | #define LBLOCK(i) \ 2575 | LE(i,0,1) LO(i,0,1) \ 2576 | LE(i,2,3) LO(i,2,3) \ 2577 | LE(i,4,5) LO(i,4,5) \ 2578 | LE(i,6,7) LO(i,6,7) \ 2579 | LE(i,8,9) LO(i,8,9) \ 2580 | LE(i,10,11) LO(i,10,11) \ 2581 | LE(i,12,13) LO(i,12,13) \ 2582 | LE(i,14,15) LO(i,14,15) \ 2583 | 2584 | #define EVEN(b,i,k,p) input##i = sum##k##p##_##b##bit_even; 2585 | #define ODD(b,i,k,p) input##i = sum##k##p##_##b##bit_odd; 2586 | 2587 | #define UPDATE(i) \ 2588 | EVEN(i,0,0,1) EVEN(i,1,2,3) EVEN(i,2,4,5) EVEN(i,3,6,7) \ 2589 | EVEN(i,4,8,9) EVEN(i,5,10,11) EVEN(i,6,12,13) EVEN(i,7,14,15) \ 2590 | ODD(i,8,0,1) ODD(i,9,2,3) ODD(i,10,4,5) ODD(i,11,6,7) \ 2591 | ODD(i,12,8,9) ODD(i,13,10,11) ODD(i,14,12,13) ODD(i,15,14,15) \ 2592 | 2593 | #define UE(i,p,k) counters[i] = _mm512_add_epi16(counters[i], sum##p##k##_8bit_even); 2594 | #define UO(i,p,k) counters[i] = _mm512_add_epi16(counters[i], sum##p##k##_8bit_odd); 2595 | 2596 | /*------ Start --------*/ 2597 | #define L(p) __m512i input##p = _mm512_loadu_si512((__m512i*)(array + i*2048*512 + j*512 + p*32)); 2598 | size_t i = 0; 2599 | for (/**/; i < n_cycles; ++i) { 2600 | for (int j = 0; j < 2048; ++j) { 2601 | // Load 16 registers. 2602 | L(0) L(1) L(2) L(3) 2603 | L(4) L(5) L(6) L(7) 2604 | L(8) L(9) L(10) L(11) 2605 | L(12) L(13) L(14) L(15) 2606 | 2607 | // Perform updates for bits {1,2,4,8}. 2608 | LBLOCK(1) UPDATE(1) 2609 | LBLOCK(2) UPDATE(2) 2610 | LBLOCK(4) UPDATE(4) 2611 | LBLOCK(8) UPDATE(8) 2612 | 2613 | // Update accumulators. 2614 | UE( 0,0,1) UE( 1, 2, 3) UE( 2, 4, 5) UE( 3, 6, 7) 2615 | UE( 4,8,9) UE( 5,10,11) UE( 6,12,13) UE( 7,14,15) 2616 | UO( 8,0,1) UO( 9, 2, 3) UO(10, 4, 5) UO(11, 6, 7) 2617 | UO(12,8,9) UO(13,10,11) UO(14,12,13) UO(15,14,15) 2618 | } 2619 | 2620 | // Update. 2621 | for (size_t i = 0; i < 16; ++i) { 2622 | _mm512_storeu_si512((__m512i*)tmp, counters[i]); 2623 | for (int j = 0; j < 32; ++j) out[i] += tmp[j]; 2624 | } 2625 | // Reset. 2626 | for (size_t i = 0; i < 16; ++i) { 2627 | counters[i] = _mm512_setzero_si512(); 2628 | } 2629 | } 2630 | #undef L 2631 | #define L(p) __m512i input##p = _mm512_loadu_si512((__m512i*)(array + i*512 + p*32)); 2632 | i *= 2048; 2633 | for (/**/; i < n_total; ++i) { 2634 | // Load 16 registers. 2635 | L(0) L(1) L(2) L(3) 2636 | L(4) L(5) L(6) L(7) 2637 | L(8) L(9) L(10) L(11) 2638 | L(12) L(13) L(14) L(15) 2639 | 2640 | // Perform updates for bits {1,2,4,8}. 2641 | LBLOCK(1) UPDATE(1) 2642 | LBLOCK(2) UPDATE(2) 2643 | LBLOCK(4) UPDATE(4) 2644 | LBLOCK(8) UPDATE(8) 2645 | 2646 | // Update accumulators. 
2647 | UE( 0,0,1) UE( 1, 2, 3) UE( 2, 4, 5) UE( 3, 6, 7) 2648 | UE( 4,8,9) UE( 5,10,11) UE( 6,12,13) UE( 7,14,15) 2649 | UO( 8,0,1) UO( 9, 2, 3) UO(10, 4, 5) UO(11, 6, 7) 2650 | UO(12,8,9) UO(13,10,11) UO(14,12,13) UO(15,14,15) 2651 | } 2652 | 2653 | i *= 512; 2654 | for (/**/; i < len; ++i) { 2655 | for (int j = 0; j < 16; ++j) { 2656 | out[j] += ((array[i] & (1 << j)) >> j); 2657 | } 2658 | } 2659 | 2660 | #undef L 2661 | #undef UPDATE 2662 | #undef ODD 2663 | #undef EVEN 2664 | #undef LBLOCK 2665 | #undef LE 2666 | #undef LO 2667 | #undef UO 2668 | #undef UE 2669 | 2670 | for (size_t i = 0; i < 16; ++i) { 2671 | _mm512_storeu_si512((__m512i*)tmp, counters[i]); 2672 | for (int j = 0; j < 32; ++j) out[i] += tmp[j]; 2673 | } 2674 | return 0; 2675 | } 2676 | 2677 | /* 2678 | * AVX512 Harley-Seal popcount (4th iteration). 2679 | * The algorithm is based on the paper "Faster Population Counts 2680 | * using AVX2 Instructions" by Daniel Lemire, Nathan Kurz and 2681 | * Wojciech Mula (23 Nov 2016). 2682 | * @see https://arxiv.org/abs/1611.07612 2683 | */ 2684 | STORM_TARGET("avx512bw") 2685 | static 2686 | uint64_t STORM_intersect_count_csa_avx512(const __m512i* STORM_RESTRICT data1, 2687 | const __m512i* STORM_RESTRICT data2, 2688 | size_t size) 2689 | { 2690 | __m512i cnt = _mm512_setzero_si512(); 2691 | __m512i ones = _mm512_setzero_si512(); 2692 | __m512i twos = _mm512_setzero_si512(); 2693 | __m512i fours = _mm512_setzero_si512(); 2694 | __m512i eights = _mm512_setzero_si512(); 2695 | __m512i sixteens = _mm512_setzero_si512(); 2696 | __m512i twosA, twosB, foursA, foursB, eightsA, eightsB; 2697 | 2698 | uint64_t i = 0; 2699 | uint64_t limit = size - size % 16; 2700 | uint64_t* cnt64; 2701 | 2702 | #define LOAD(a) (_mm512_loadu_si512(&data1[i+a]) & _mm512_loadu_si512(&data2[i+a])) 2703 | 2704 | for (/**/; i < limit; i += 16) { 2705 | STORM_CSA512(&twosA, &ones, ones, LOAD(0), LOAD(1)); 2706 | STORM_CSA512(&twosB, &ones, ones, LOAD(2), LOAD(3)); 2707 | STORM_CSA512(&foursA, &twos, twos, twosA, twosB); 2708 | STORM_CSA512(&twosA, &ones, ones, LOAD(4), LOAD(5)); 2709 | STORM_CSA512(&twosB, &ones, ones, LOAD(6), LOAD(7)); 2710 | STORM_CSA512(&foursB, &twos, twos, twosA, twosB); 2711 | STORM_CSA512(&eightsA, &fours, fours, foursA, foursB); 2712 | STORM_CSA512(&twosA, &ones, ones, LOAD(8), LOAD(9)); 2713 | STORM_CSA512(&twosB, &ones, ones, LOAD(10), LOAD(11)); 2714 | STORM_CSA512(&foursA, &twos, twos, twosA, twosB); 2715 | STORM_CSA512(&twosA, &ones, ones, LOAD(12), LOAD(13)); 2716 | STORM_CSA512(&twosB, &ones, ones, LOAD(14), LOAD(15)); 2717 | STORM_CSA512(&foursB, &twos, twos, twosA, twosB); 2718 | STORM_CSA512(&eightsB, &fours, fours, foursA, foursB); 2719 | STORM_CSA512(&sixteens,&eights, eights,eightsA,eightsB); 2720 | 2721 | cnt = _mm512_add_epi64(cnt, STORM_popcnt512(sixteens)); 2722 | } 2723 | #undef LOAD 2724 | 2725 | cnt = _mm512_slli_epi64(cnt, 4); 2726 | cnt = _mm512_add_epi64(cnt, _mm512_slli_epi64(STORM_popcnt512(eights), 3)); 2727 | cnt = _mm512_add_epi64(cnt, _mm512_slli_epi64(STORM_popcnt512(fours), 2)); 2728 | cnt = _mm512_add_epi64(cnt, _mm512_slli_epi64(STORM_popcnt512(twos), 1)); 2729 | cnt = _mm512_add_epi64(cnt, STORM_popcnt512(ones)); 2730 | 2731 | for (/**/; i < size; ++i) 2732 | cnt = _mm512_add_epi64(cnt, STORM_popcnt512(_mm512_loadu_si512(&data1[i]) & _mm512_loadu_si512(&data2[i]))); 2733 | 2734 | cnt64 = (uint64_t*)&cnt; 2735 | 2736 | return cnt64[0] + 2737 | cnt64[1] + 2738 | cnt64[2] + 2739 | cnt64[3] + 2740 | cnt64[4] + 2741 | cnt64[5] + 2742 | cnt64[6] + 2743 | 
cnt64[7]; 2744 | } 2745 | 2746 | STORM_TARGET("avx512bw") 2747 | static 2748 | uint64_t STORM_union_count_csa_avx512(const __m512i* STORM_RESTRICT data1, 2749 | const __m512i* STORM_RESTRICT data2, 2750 | size_t size) 2751 | { 2752 | __m512i cnt = _mm512_setzero_si512(); 2753 | __m512i ones = _mm512_setzero_si512(); 2754 | __m512i twos = _mm512_setzero_si512(); 2755 | __m512i fours = _mm512_setzero_si512(); 2756 | __m512i eights = _mm512_setzero_si512(); 2757 | __m512i sixteens = _mm512_setzero_si512(); 2758 | __m512i twosA, twosB, foursA, foursB, eightsA, eightsB; 2759 | 2760 | uint64_t i = 0; 2761 | uint64_t limit = size - size % 16; 2762 | uint64_t* cnt64; 2763 | 2764 | #define LOAD(a) (_mm512_loadu_si512(&data1[i+a]) | _mm512_loadu_si512(&data2[i+a])) 2765 | 2766 | for (/**/; i < limit; i += 16) { 2767 | STORM_CSA512(&twosA, &ones, ones, LOAD(0), LOAD(1)); 2768 | STORM_CSA512(&twosB, &ones, ones, LOAD(2), LOAD(3)); 2769 | STORM_CSA512(&foursA, &twos, twos, twosA, twosB); 2770 | STORM_CSA512(&twosA, &ones, ones, LOAD(4), LOAD(5)); 2771 | STORM_CSA512(&twosB, &ones, ones, LOAD(6), LOAD(7)); 2772 | STORM_CSA512(&foursB, &twos, twos, twosA, twosB); 2773 | STORM_CSA512(&eightsA, &fours, fours, foursA, foursB); 2774 | STORM_CSA512(&twosA, &ones, ones, LOAD(8), LOAD(9)); 2775 | STORM_CSA512(&twosB, &ones, ones, LOAD(10), LOAD(11)); 2776 | STORM_CSA512(&foursA, &twos, twos, twosA, twosB); 2777 | STORM_CSA512(&twosA, &ones, ones, LOAD(12), LOAD(13)); 2778 | STORM_CSA512(&twosB, &ones, ones, LOAD(14), LOAD(15)); 2779 | STORM_CSA512(&foursB, &twos, twos, twosA, twosB); 2780 | STORM_CSA512(&eightsB, &fours, fours, foursA, foursB); 2781 | STORM_CSA512(&sixteens,&eights, eights,eightsA,eightsB); 2782 | 2783 | cnt = _mm512_add_epi64(cnt, STORM_popcnt512(sixteens)); 2784 | } 2785 | #undef LOAD 2786 | 2787 | cnt = _mm512_slli_epi64(cnt, 4); 2788 | cnt = _mm512_add_epi64(cnt, _mm512_slli_epi64(STORM_popcnt512(eights), 3)); 2789 | cnt = _mm512_add_epi64(cnt, _mm512_slli_epi64(STORM_popcnt512(fours), 2)); 2790 | cnt = _mm512_add_epi64(cnt, _mm512_slli_epi64(STORM_popcnt512(twos), 1)); 2791 | cnt = _mm512_add_epi64(cnt, STORM_popcnt512(ones)); 2792 | 2793 | for (/**/; i < size; ++i) 2794 | cnt = _mm512_add_epi64(cnt, STORM_popcnt512(_mm512_loadu_si512(&data1[i]) | _mm512_loadu_si512(&data2[i]))); 2795 | 2796 | cnt64 = (uint64_t*)&cnt; 2797 | 2798 | return cnt64[0] + 2799 | cnt64[1] + 2800 | cnt64[2] + 2801 | cnt64[3] + 2802 | cnt64[4] + 2803 | cnt64[5] + 2804 | cnt64[6] + 2805 | cnt64[7]; 2806 | } 2807 | 2808 | STORM_TARGET("avx512bw") 2809 | static 2810 | uint64_t STORM_diff_count_csa_avx512(const __m512i* STORM_RESTRICT data1, 2811 | const __m512i* STORM_RESTRICT data2, 2812 | size_t size) 2813 | { 2814 | __m512i cnt = _mm512_setzero_si512(); 2815 | __m512i ones = _mm512_setzero_si512(); 2816 | __m512i twos = _mm512_setzero_si512(); 2817 | __m512i fours = _mm512_setzero_si512(); 2818 | __m512i eights = _mm512_setzero_si512(); 2819 | __m512i sixteens = _mm512_setzero_si512(); 2820 | __m512i twosA, twosB, foursA, foursB, eightsA, eightsB; 2821 | 2822 | uint64_t i = 0; 2823 | uint64_t limit = size - size % 16; 2824 | uint64_t* cnt64; 2825 | 2826 | #define LOAD(a) (_mm512_loadu_si512(&data1[i+a]) ^ _mm512_loadu_si512(&data2[i+a])) 2827 | 2828 | for (/**/; i < limit; i += 16) { 2829 | STORM_CSA512(&twosA, &ones, ones, LOAD(0), LOAD(1)); 2830 | STORM_CSA512(&twosB, &ones, ones, LOAD(2), LOAD(3)); 2831 | STORM_CSA512(&foursA, &twos, twos, twosA, twosB); 2832 | STORM_CSA512(&twosA, &ones, ones, LOAD(4), 
LOAD(5));
2833 | STORM_CSA512(&twosB, &ones, ones, LOAD(6), LOAD(7));
2834 | STORM_CSA512(&foursB, &twos, twos, twosA, twosB);
2835 | STORM_CSA512(&eightsA, &fours, fours, foursA, foursB);
2836 | STORM_CSA512(&twosA, &ones, ones, LOAD(8), LOAD(9));
2837 | STORM_CSA512(&twosB, &ones, ones, LOAD(10), LOAD(11));
2838 | STORM_CSA512(&foursA, &twos, twos, twosA, twosB);
2839 | STORM_CSA512(&twosA, &ones, ones, LOAD(12), LOAD(13));
2840 | STORM_CSA512(&twosB, &ones, ones, LOAD(14), LOAD(15));
2841 | STORM_CSA512(&foursB, &twos, twos, twosA, twosB);
2842 | STORM_CSA512(&eightsB, &fours, fours, foursA, foursB);
2843 | STORM_CSA512(&sixteens,&eights, eights,eightsA,eightsB);
2844 | 
2845 | cnt = _mm512_add_epi64(cnt, STORM_popcnt512(sixteens));
2846 | }
2847 | #undef LOAD
2848 | 
2849 | cnt = _mm512_slli_epi64(cnt, 4);
2850 | cnt = _mm512_add_epi64(cnt, _mm512_slli_epi64(STORM_popcnt512(eights), 3));
2851 | cnt = _mm512_add_epi64(cnt, _mm512_slli_epi64(STORM_popcnt512(fours), 2));
2852 | cnt = _mm512_add_epi64(cnt, _mm512_slli_epi64(STORM_popcnt512(twos), 1));
2853 | cnt = _mm512_add_epi64(cnt, STORM_popcnt512(ones));
2854 | 
2855 | for (/**/; i < size; ++i)
2856 | cnt = _mm512_add_epi64(cnt, STORM_popcnt512(_mm512_loadu_si512(&data1[i]) ^ _mm512_loadu_si512(&data2[i])));
2857 | 
2858 | cnt64 = (uint64_t*)&cnt;
2859 | 
2860 | return cnt64[0] +
2861 | cnt64[1] +
2862 | cnt64[2] +
2863 | cnt64[3] +
2864 | cnt64[4] +
2865 | cnt64[5] +
2866 | cnt64[6] +
2867 | cnt64[7];
2868 | }
2869 | 
2870 | // Functions
2871 | // AVX512
2872 | STORM_TARGET("avx512bw")
2873 | static
2874 | uint64_t STORM_intersect_count_avx512(const uint64_t* STORM_RESTRICT b1,
2875 | const uint64_t* STORM_RESTRICT b2,
2876 | const size_t n_ints)
2877 | {
2878 | uint64_t count = 0;
2879 | const __m512i* r1 = (const __m512i*)(b1);
2880 | const __m512i* r2 = (const __m512i*)(b2);
2881 | const uint32_t n_cycles = n_ints / 8;
2882 | 
2883 | count += STORM_intersect_count_csa_avx512(r1, r2, n_cycles);
2884 | 
2885 | for (int i = n_cycles*8; i < n_ints; ++i) {
2886 | count += STORM_POPCOUNT(b1[i] & b2[i]);
2887 | }
2888 | 
2889 | return(count);
2890 | }
2891 | 
2892 | STORM_TARGET("avx512bw")
2893 | static
2894 | uint64_t STORM_union_count_avx512(const uint64_t* STORM_RESTRICT b1,
2895 | const uint64_t* STORM_RESTRICT b2,
2896 | const size_t n_ints)
2897 | {
2898 | uint64_t count = 0;
2899 | const __m512i* r1 = (const __m512i*)(b1);
2900 | const __m512i* r2 = (const __m512i*)(b2);
2901 | const uint32_t n_cycles = n_ints / 8;
2902 | 
2903 | count += STORM_union_count_csa_avx512(r1, r2, n_cycles);
2904 | 
2905 | for (int i = n_cycles*8; i < n_ints; ++i) {
2906 | count += STORM_POPCOUNT(b1[i] | b2[i]);
2907 | }
2908 | 
2909 | return(count);
2910 | }
2911 | 
2912 | STORM_TARGET("avx512bw")
2913 | static
2914 | uint64_t STORM_diff_count_avx512(const uint64_t* STORM_RESTRICT b1,
2915 | const uint64_t* STORM_RESTRICT b2,
2916 | const size_t n_ints)
2917 | {
2918 | uint64_t count = 0;
2919 | const __m512i* r1 = (const __m512i*)(b1);
2920 | const __m512i* r2 = (const __m512i*)(b2);
2921 | const uint32_t n_cycles = n_ints / 8;
2922 | 
2923 | count += STORM_diff_count_csa_avx512(r1, r2, n_cycles);
2924 | 
2925 | for (int i = n_cycles*8; i < n_ints; ++i) {
2926 | count += STORM_POPCOUNT(b1[i] ^ b2[i]);
2927 | }
2928 | 
2929 | return(count);
2930 | }
2931 | 
2932 | STORM_TARGET("avx512bw")
2933 | static
2934 | uint64_t STORM_popcnt_avx512(const uint64_t* data,
2935 | const size_t n_ints)
2936 | {
2937 | uint64_t count = 0;
2938 | const uint32_t n_cycles = n_ints / 8;
2939 | const uint32_t n_cycles_avx2 = (n_ints % 8) / 4;
2940 | const uint32_t n_cycles_sse = ((n_ints % 8) % 4) / 2;
2941 | 
2942 | const __m512i* r1 = (__m512i*)&data[0];
2943 | const __m256i* r2 = (__m256i*)&data[n_cycles*8];
2944 | const __m128i* r3 = (__m128i*)&data[n_cycles*8+n_cycles_avx2*4];
2945 | 
2946 | count += STORM_popcnt_csa_avx512bw(r1, n_cycles);
2947 | count += STORM_popcnt_csa_avx2(r2, n_cycles_avx2);
2948 | count += STORM_popcnt_csa_sse4(r3, n_cycles_sse);
2949 | 
2950 | for (int i = (8*n_cycles + 4*n_cycles_avx2 + 2*n_cycles_sse); i < n_ints; ++i) {
2951 | count += STORM_POPCOUNT(data[i]);
2952 | }
2953 | 
2954 | return count;
2955 | }
2956 | #endif
2957 | 
2958 | /****************************
2959 |  * Popcount
2960 |  ****************************/
2961 | 
2962 | STORM_FORCE_INLINE
2963 | uint64_t STORM_popcount64_unrolled(const uint64_t* data, size_t size) {
2964 | uint64_t i = 0;
2965 | uint64_t limit = size - size % 4;
2966 | uint64_t cnt = 0;
2967 | 
2968 | for (/**/; i < limit; i += 4) {
2969 | cnt += STORM_popcount64(data[i+0]);
2970 | cnt += STORM_popcount64(data[i+1]);
2971 | cnt += STORM_popcount64(data[i+2]);
2972 | cnt += STORM_popcount64(data[i+3]);
2973 | }
2974 | 
2975 | for (/**/; i < size; ++i)
2976 | cnt += STORM_popcount64(data[i]);
2977 | 
2978 | return cnt;
2979 | }
2980 | 
2981 | /****************************
2982 |  * Scalar functions
2983 |  ****************************/
2984 | 
2985 | STORM_FORCE_INLINE
2986 | uint64_t STORM_intersect_count_scalar(const uint64_t* STORM_RESTRICT b1,
2987 | const uint64_t* STORM_RESTRICT b2,
2988 | const size_t n_ints)
2989 | {
2990 | return STORM_intersect_count_unrolled(b1, b2, n_ints);
2991 | }
2992 | 
2993 | STORM_FORCE_INLINE
2994 | uint64_t STORM_union_count_scalar(const uint64_t* STORM_RESTRICT b1,
2995 | const uint64_t* STORM_RESTRICT b2,
2996 | const size_t n_ints)
2997 | {
2998 | return STORM_union_count_unrolled(b1, b2, n_ints);
2999 | }
3000 | 
3001 | STORM_FORCE_INLINE
3002 | uint64_t STORM_diff_count_scalar(const uint64_t* STORM_RESTRICT b1,
3003 | const uint64_t* STORM_RESTRICT b2,
3004 | const size_t n_ints)
3005 | {
3006 | return STORM_diff_count_unrolled(b1, b2, n_ints);
3007 | }
3008 | 
3009 | static
3010 | uint64_t STORM_intersect_count_scalar_list(const uint64_t* STORM_RESTRICT b1,
3011 | const uint64_t* STORM_RESTRICT b2,
3012 | const uint32_t* STORM_RESTRICT l1,
3013 | const uint32_t* STORM_RESTRICT l2,
3014 | const size_t n1,
3015 | const size_t n2)
3016 | {
3017 | uint64_t count = 0;
3018 | 
3019 | #define MOD(x) ((x) & 63)
3020 | if (n1 < n2) {
3021 | for (int i = 0; i < n1; ++i)
3022 | count += ((b2[l1[i] >> 6] & (1ULL << MOD(l1[i]))) != 0);
3023 | } else {
3024 | for (int i = 0; i < n2; ++i)
3025 | count += ((b1[l2[i] >> 6] & (1ULL << MOD(l2[i]))) != 0);
3026 | }
3027 | #undef MOD
3028 | return(count);
3029 | }
3030 | 
3031 | 
3032 | /* *************************************
3033 |  * Function pointer definitions.
3034 |  ***************************************/
3035 | typedef uint64_t (*STORM_compute_func)(const uint64_t*, const uint64_t*, const size_t);
3036 | typedef int (STORM_pposcnt_func)(const uint16_t*, size_t, uint32_t*);
3037 | typedef uint64_t (STORM_popcnt_func)(const uint8_t*, size_t);
3038 | 
3039 | /* *************************************
3040 |  * Alignment
3041 |  ***************************************/
3042 | // Return the best alignment given the available instruction set at
3043 | // run-time.
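// Example (an illustrative sketch; assumes a C11 libc providing aligned_alloc
// and a byte count already rounded up to a multiple of the returned alignment):
//
//     uint32_t  align  = STORM_get_alignment();
//     uint64_t* bitmap = (uint64_t*)aligned_alloc(align, n_bytes);
//
// Buffers allocated this way let the widest available kernels run their hot
// loops without straddling vector-width boundaries.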
3044 | static 3045 | uint32_t STORM_get_alignment() { 3046 | 3047 | #if defined(STORM_HAVE_CPUID) 3048 | #if defined(__cplusplus) 3049 | /* C++11 thread-safe singleton */ 3050 | static const int cpuid = STORM_get_cpuid(); 3051 | #else 3052 | static int cpuid_ = -1; 3053 | int cpuid = cpuid_; 3054 | if (cpuid == -1) { 3055 | cpuid = STORM_get_cpuid(); 3056 | 3057 | #if defined(_MSC_VER) 3058 | _InterlockedCompareExchange(&cpuid_, cpuid, -1); 3059 | #else 3060 | __sync_val_compare_and_swap(&cpuid_, -1, cpuid); 3061 | #endif 3062 | } 3063 | #endif 3064 | #endif 3065 | 3066 | uint32_t alignment = 0; 3067 | #if defined(STORM_HAVE_AVX512) 3068 | if ((cpuid & STORM_CPUID_runtime_bit_AVX512BW)) { // 16*512 3069 | alignment = STORM_AVX512_ALIGNMENT; 3070 | } 3071 | #endif 3072 | 3073 | #if defined(STORM_HAVE_AVX2) 3074 | if ((cpuid & STORM_CPUID_runtime_bit_AVX2) && alignment == 0) { // 16*256 3075 | alignment = STORM_AVX2_ALIGNMENT; 3076 | } 3077 | #endif 3078 | 3079 | #if defined(STORM_HAVE_SSE42) 3080 | if ((cpuid & STORM_CPUID_runtime_bit_SSE41) && alignment == 0) { // 16*128 3081 | alignment = STORM_SSE_ALIGNMENT; 3082 | } 3083 | #endif 3084 | 3085 | if (alignment == 0) alignment = 8; 3086 | return alignment; 3087 | } 3088 | 3089 | /* ************************************* 3090 | * Set algebra functions 3091 | ***************************************/ 3092 | // Return the optimal intersection function given the range [0, n_bitmaps_vector) 3093 | // and the available instruction set at run-time. 3094 | static 3095 | STORM_compute_func STORM_get_intersect_count_func(const size_t n_bitmaps_vector) { 3096 | 3097 | #if defined(STORM_HAVE_CPUID) 3098 | #if defined(__cplusplus) 3099 | /* C++11 thread-safe singleton */ 3100 | static const int cpuid = STORM_get_cpuid(); 3101 | #else 3102 | static int cpuid_ = -1; 3103 | int cpuid = cpuid_; 3104 | if (cpuid == -1) { 3105 | cpuid = STORM_get_cpuid(); 3106 | 3107 | #if defined(_MSC_VER) 3108 | _InterlockedCompareExchange(&cpuid_, cpuid, -1); 3109 | #else 3110 | __sync_val_compare_and_swap(&cpuid_, -1, cpuid); 3111 | #endif 3112 | } 3113 | #endif 3114 | #endif 3115 | 3116 | 3117 | #if defined(STORM_HAVE_AVX512) 3118 | if ((cpuid & STORM_CPUID_runtime_bit_AVX512BW) && n_bitmaps_vector >= 128) { // 16*512 3119 | return &STORM_intersect_count_avx512; 3120 | } 3121 | #endif 3122 | 3123 | #if defined(STORM_HAVE_AVX2) 3124 | if ((cpuid & STORM_CPUID_runtime_bit_AVX2) && n_bitmaps_vector >= 64) { // 16*256 3125 | return &STORM_intersect_count_avx2; 3126 | } 3127 | 3128 | if ((cpuid & STORM_CPUID_runtime_bit_AVX2) && n_bitmaps_vector >= 4) { 3129 | return &STORM_intersect_count_lookup_avx2; 3130 | } 3131 | #endif 3132 | 3133 | #if defined(STORM_HAVE_SSE42) 3134 | if ((cpuid & STORM_CPUID_runtime_bit_SSE41) && n_bitmaps_vector >= 32) { // 16*128 3135 | return &STORM_intersect_count_sse4; 3136 | } 3137 | #endif 3138 | 3139 | return &STORM_intersect_count_scalar; 3140 | } 3141 | 3142 | static 3143 | STORM_compute_func STORM_get_union_count_func(const size_t n_bitmaps_vector) { 3144 | 3145 | #if defined(STORM_HAVE_CPUID) 3146 | #if defined(__cplusplus) 3147 | /* C++11 thread-safe singleton */ 3148 | static const int cpuid = STORM_get_cpuid(); 3149 | #else 3150 | static int cpuid_ = -1; 3151 | int cpuid = cpuid_; 3152 | if (cpuid == -1) { 3153 | cpuid = STORM_get_cpuid(); 3154 | 3155 | #if defined(_MSC_VER) 3156 | _InterlockedCompareExchange(&cpuid_, cpuid, -1); 3157 | #else 3158 | __sync_val_compare_and_swap(&cpuid_, -1, cpuid); 3159 | #endif 3160 | } 3161 | #endif 3162 
| #endif
3163 | 
3164 | 
3165 | #if defined(STORM_HAVE_AVX512)
3166 | if ((cpuid & STORM_CPUID_runtime_bit_AVX512BW) && n_bitmaps_vector >= 128) { // 16*512
3167 | return &STORM_union_count_avx512;
3168 | }
3169 | #endif
3170 | 
3171 | #if defined(STORM_HAVE_AVX2)
3172 | if ((cpuid & STORM_CPUID_runtime_bit_AVX2) && n_bitmaps_vector >= 64) { // 16*256
3173 | return &STORM_union_count_avx2;
3174 | }
3175 | 
3176 | if ((cpuid & STORM_CPUID_runtime_bit_AVX2) && n_bitmaps_vector >= 4) {
3177 | return &STORM_union_count_lookup_avx2;
3178 | }
3179 | #endif
3180 | 
3181 | #if defined(STORM_HAVE_SSE42)
3182 | if ((cpuid & STORM_CPUID_runtime_bit_SSE41) && n_bitmaps_vector >= 32) { // 16*128
3183 | return &STORM_union_count_sse4;
3184 | }
3185 | #endif
3186 | 
3187 | return &STORM_union_count_scalar;
3188 | }
3189 | 
3190 | static
3191 | STORM_compute_func STORM_get_diff_count_func(const size_t n_bitmaps_vector) {
3192 | 
3193 | #if defined(STORM_HAVE_CPUID)
3194 | #if defined(__cplusplus)
3195 | /* C++11 thread-safe singleton */
3196 | static const int cpuid = STORM_get_cpuid();
3197 | #else
3198 | static int cpuid_ = -1;
3199 | int cpuid = cpuid_;
3200 | if (cpuid == -1) {
3201 | cpuid = STORM_get_cpuid();
3202 | 
3203 | #if defined(_MSC_VER)
3204 | _InterlockedCompareExchange(&cpuid_, cpuid, -1);
3205 | #else
3206 | __sync_val_compare_and_swap(&cpuid_, -1, cpuid);
3207 | #endif
3208 | }
3209 | #endif
3210 | #endif
3211 | 
3212 | 
3213 | #if defined(STORM_HAVE_AVX512)
3214 | if ((cpuid & STORM_CPUID_runtime_bit_AVX512BW) && n_bitmaps_vector >= 128) { // 16*512
3215 | return &STORM_diff_count_avx512;
3216 | }
3217 | #endif
3218 | 
3219 | #if defined(STORM_HAVE_AVX2)
3220 | if ((cpuid & STORM_CPUID_runtime_bit_AVX2) && n_bitmaps_vector >= 64) { // 16*256
3221 | return &STORM_diff_count_avx2;
3222 | }
3223 | 
3224 | if ((cpuid & STORM_CPUID_runtime_bit_AVX2) && n_bitmaps_vector >= 4) {
3225 | return &STORM_diff_count_lookup_avx2;
3226 | }
3227 | #endif
3228 | 
3229 | #if defined(STORM_HAVE_SSE42)
3230 | if ((cpuid & STORM_CPUID_runtime_bit_SSE41) && n_bitmaps_vector >= 32) { // 16*128
3231 | return &STORM_diff_count_sse4;
3232 | }
3233 | #endif
3234 | 
3235 | return &STORM_diff_count_scalar;
3236 | }
3237 | 
3238 | // Compute the intersection count directly, selecting the fastest kernel for
3239 | // the given input length and the instruction set detected at run-time (the
3240 | // selection is re-evaluated on every call).
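// When many equally-sized bitmap pairs are processed, resolving the kernel
// once and reusing the pointer avoids repeating these checks per call.
// Illustrative sketch (caller-side code, not part of this header):
//
//     STORM_compute_func isect = STORM_get_intersect_count_func(n_ints);
//     uint64_t total = 0;
//     for (size_t p = 0; p < n_pairs; ++p)
//         total += isect(bitmaps1[p], bitmaps2[p], n_ints);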
3241 | static 3242 | uint64_t STORM_intersect_count(const uint64_t* STORM_RESTRICT data1, 3243 | const uint64_t* STORM_RESTRICT data2, 3244 | const size_t n_len) 3245 | { 3246 | 3247 | #if defined(STORM_HAVE_CPUID) 3248 | #if defined(__cplusplus) 3249 | /* C++11 thread-safe singleton */ 3250 | static const int cpuid = STORM_get_cpuid(); 3251 | #else 3252 | static int cpuid_ = -1; 3253 | int cpuid = cpuid_; 3254 | if (cpuid == -1) { 3255 | cpuid = STORM_get_cpuid(); 3256 | 3257 | #if defined(_MSC_VER) 3258 | _InterlockedCompareExchange(&cpuid_, cpuid, -1); 3259 | #else 3260 | __sync_val_compare_and_swap(&cpuid_, -1, cpuid); 3261 | #endif 3262 | } 3263 | #endif 3264 | #endif 3265 | 3266 | 3267 | #if defined(STORM_HAVE_AVX512) 3268 | if ((cpuid & STORM_CPUID_runtime_bit_AVX512BW) && n_len >= 128) { // 16*512 3269 | return STORM_intersect_count_avx512(data1, data2, n_len); 3270 | } 3271 | #endif 3272 | 3273 | #if defined(STORM_HAVE_AVX2) 3274 | if ((cpuid & STORM_CPUID_runtime_bit_AVX2) && n_len >= 64) { // 16*256 3275 | return STORM_intersect_count_avx2(data1, data2, n_len); 3276 | } 3277 | 3278 | if ((cpuid & STORM_CPUID_runtime_bit_AVX2) && n_len >= 4) { 3279 | return STORM_intersect_count_lookup_avx2(data1, data2, n_len); 3280 | } 3281 | #endif 3282 | 3283 | #if defined(STORM_HAVE_SSE42) 3284 | if ((cpuid & STORM_CPUID_runtime_bit_SSE41) && n_len >= 32) { // 16*128 3285 | return STORM_intersect_count_sse4(data1, data2, n_len); 3286 | } 3287 | #endif 3288 | 3289 | return STORM_intersect_count_scalar(data1, data2, n_len); 3290 | } 3291 | 3292 | static 3293 | uint64_t STORM_union_count(const uint64_t* STORM_RESTRICT data1, 3294 | const uint64_t* STORM_RESTRICT data2, 3295 | const size_t n_len) 3296 | { 3297 | 3298 | #if defined(STORM_HAVE_CPUID) 3299 | #if defined(__cplusplus) 3300 | /* C++11 thread-safe singleton */ 3301 | static const int cpuid = STORM_get_cpuid(); 3302 | #else 3303 | static int cpuid_ = -1; 3304 | int cpuid = cpuid_; 3305 | if (cpuid == -1) { 3306 | cpuid = STORM_get_cpuid(); 3307 | 3308 | #if defined(_MSC_VER) 3309 | _InterlockedCompareExchange(&cpuid_, cpuid, -1); 3310 | #else 3311 | __sync_val_compare_and_swap(&cpuid_, -1, cpuid); 3312 | #endif 3313 | } 3314 | #endif 3315 | #endif 3316 | 3317 | 3318 | #if defined(STORM_HAVE_AVX512) 3319 | if ((cpuid & STORM_CPUID_runtime_bit_AVX512BW) && n_len >= 128) { // 16*512 3320 | return STORM_union_count_avx512(data1, data2, n_len); 3321 | } 3322 | #endif 3323 | 3324 | #if defined(STORM_HAVE_AVX2) 3325 | if ((cpuid & STORM_CPUID_runtime_bit_AVX2) && n_len >= 64) { // 16*256 3326 | return STORM_union_count_avx2(data1, data2, n_len); 3327 | } 3328 | 3329 | if ((cpuid & STORM_CPUID_runtime_bit_AVX2) && n_len >= 4) { 3330 | return STORM_union_count_lookup_avx2(data1, data2, n_len); 3331 | } 3332 | #endif 3333 | 3334 | #if defined(STORM_HAVE_SSE42) 3335 | if ((cpuid & STORM_CPUID_runtime_bit_SSE41) && n_len >= 32) { // 16*128 3336 | return STORM_union_count_sse4(data1, data2, n_len); 3337 | } 3338 | #endif 3339 | 3340 | return STORM_union_count_scalar(data1, data2, n_len); 3341 | } 3342 | 3343 | static 3344 | uint64_t STORM_diff_count(const uint64_t* STORM_RESTRICT data1, 3345 | const uint64_t* STORM_RESTRICT data2, 3346 | const size_t n_len) 3347 | { 3348 | 3349 | #if defined(STORM_HAVE_CPUID) 3350 | #if defined(__cplusplus) 3351 | /* C++11 thread-safe singleton */ 3352 | static const int cpuid = STORM_get_cpuid(); 3353 | #else 3354 | static int cpuid_ = -1; 3355 | int cpuid = cpuid_; 3356 | if (cpuid == -1) { 3357 | cpuid = 
STORM_get_cpuid();
3358 | 
3359 | #if defined(_MSC_VER)
3360 | _InterlockedCompareExchange(&cpuid_, cpuid, -1);
3361 | #else
3362 | __sync_val_compare_and_swap(&cpuid_, -1, cpuid);
3363 | #endif
3364 | }
3365 | #endif
3366 | #endif
3367 | 
3368 | 
3369 | #if defined(STORM_HAVE_AVX512)
3370 | if ((cpuid & STORM_CPUID_runtime_bit_AVX512BW) && n_len >= 128) { // 16*512
3371 | return STORM_diff_count_avx512(data1, data2, n_len);
3372 | }
3373 | #endif
3374 | 
3375 | #if defined(STORM_HAVE_AVX2)
3376 | if ((cpuid & STORM_CPUID_runtime_bit_AVX2) && n_len >= 64) { // 16*256
3377 | return STORM_diff_count_avx2(data1, data2, n_len);
3378 | }
3379 | 
3380 | if ((cpuid & STORM_CPUID_runtime_bit_AVX2) && n_len >= 4) {
3381 | return STORM_diff_count_lookup_avx2(data1, data2, n_len);
3382 | }
3383 | #endif
3384 | 
3385 | #if defined(STORM_HAVE_SSE42)
3386 | if ((cpuid & STORM_CPUID_runtime_bit_SSE41) && n_len >= 32) { // 16*128
3387 | return STORM_diff_count_sse4(data1, data2, n_len);
3388 | }
3389 | #endif
3390 | 
3391 | return STORM_diff_count_scalar(data1, data2, n_len);
3392 | }
3393 | 
3394 | /* *************************************
3395 |  * POPCNT and POSPOPCNT functions.
3396 |  ***************************************/
3397 | static
3398 | uint64_t STORM_popcnt(const uint8_t* data, size_t size) {
3399 | uint64_t cnt = 0;
3400 | uint64_t i;
3401 | // size /= 8;
3402 | 
3403 | #if defined(STORM_HAVE_CPUID)
3404 | #if defined(__cplusplus)
3405 | /* C++11 thread-safe singleton */
3406 | static const int cpuid = STORM_get_cpuid();
3407 | #else
3408 | static int cpuid_ = -1;
3409 | int cpuid = cpuid_;
3410 | if (cpuid == -1) {
3411 | cpuid = STORM_get_cpuid();
3412 | 
3413 | #if defined(_MSC_VER)
3414 | _InterlockedCompareExchange(&cpuid_, cpuid, -1);
3415 | #else
3416 | __sync_val_compare_and_swap(&cpuid_, -1, cpuid);
3417 | #endif
3418 | }
3419 | #endif
3420 | #endif
3421 | 
3422 | #if defined(STORM_HAVE_AVX512)
3423 | 
3424 | /* AVX512 requires arrays >= 1024 bytes */
3425 | if ((cpuid & STORM_CPUID_runtime_bit_AVX512BW) &&
3426 | size >= 1024)
3427 | {
3428 | // cnt += STORM_popcnt_avx512((const __m512i*)data, size / 64);
3429 | // data += size - size % 64;
3430 | // size = size % 64;
3431 | cnt += STORM_popcnt_avx512((uint64_t*)data, size/8);
3432 | data += size - size % 8;
3433 | size = size % 8;
3434 | }
3435 | 
3436 | #endif
3437 | 
3438 | #if defined(STORM_HAVE_AVX2)
3439 | 
3440 | /* AVX2 requires arrays >= 512 bytes */
3441 | if ((cpuid & STORM_CPUID_runtime_bit_AVX2) &&
3442 | size >= 512)
3443 | {
3444 | cnt += STORM_popcnt_avx2((uint64_t*)data, size/8);
3445 | data += size - size % 8;
3446 | size = size % 8;
3447 | // data += size - size % 32;
3448 | // size = size % 32;
3449 | }
3450 | 
3451 | #endif
3452 | 
3453 | #if defined(STORM_HAVE_SSE42)
3454 | 
3455 | /* SSE4.2 requires arrays >= 256 bytes */
3456 | if ((cpuid & STORM_CPUID_runtime_bit_SSE42) &&
3457 | size >= 256)
3458 | {
3459 | cnt += STORM_popcnt_sse4((uint64_t*)data, size/8);
3460 | data += size - size % 8;
3461 | size = size % 8;
3462 | // data += size - size % 32;
3463 | // size = size % 32;
3464 | }
3465 | 
3466 | #endif
3467 | 
3468 | #if defined(STORM_HAVE_POPCNT)
3469 | 
3470 | if (cpuid & STORM_CPUID_runtime_bit_POPCNT) {
3471 | cnt += STORM_popcount64_unrolled((const uint64_t*)data, size / 8);
3472 | data += size - size % 8;
3473 | size = size % 8;
3474 | for (i = 0; i < size; ++i)
3475 | cnt += STORM_popcount64(data[i]);
3476 | 
3477 | return cnt;
3478 | }
3479 | 
3480 | #endif
3481 | 
3482 | /* pure integer popcount algorithm */
3483 | if (size >= 8) {
3484 | cnt += STORM_popcount64_unrolled((const uint64_t*)data, size / 8);
3485 | data += size - size % 8;
3486 | size = size % 8;
3487 | }
3488 | 
3489 | /* residual bytes */
3490 | for (i = 0; i < size; ++i)
3491 | cnt += STORM_popcount64(data[i]);
3492 | 
3493 | return cnt;
3494 | }
3495 | 
3496 | static
3497 | int STORM_pospopcnt_u16(const uint16_t* data, size_t len, uint32_t* out) {
3498 | memset(out, 0, sizeof(uint32_t)*16);
3499 | 
3500 | #if defined(STORM_HAVE_CPUID)
3501 | #if defined(__cplusplus)
3502 | /* C++11 thread-safe singleton */
3503 | static const int cpuid = STORM_get_cpuid();
3504 | #else
3505 | static int cpuid_ = -1;
3506 | int cpuid = cpuid_;
3507 | if (cpuid == -1) {
3508 | cpuid = STORM_get_cpuid();
3509 | 
3510 | #if defined(_MSC_VER)
3511 | _InterlockedCompareExchange(&cpuid_, cpuid, -1);
3512 | #else
3513 | __sync_val_compare_and_swap(&cpuid_, -1, cpuid);
3514 | #endif
3515 | }
3516 | #endif
3517 | #endif
3518 | 
3519 | #if defined(STORM_HAVE_AVX512)
3520 | if ((cpuid & STORM_CPUID_runtime_bit_AVX512BW))
3521 | {
3522 | if (len < 32) return(STORM_pospopcnt_u16_sse_sad(data, len, out)); // small
3523 | else if (len < 256) return(STORM_pospopcnt_u16_sse_blend_popcnt_unroll8(data, len, out)); // small
3524 | else if (len < 512) return(STORM_pospopcnt_u16_avx512bw_blend_popcnt_unroll8(data, len, out)); // medium
3525 | else if (len < 4096) return(STORM_pospopcnt_u16_avx512bw_adder_forest(data, len, out)); // medium
3526 | else return(STORM_pospopcnt_u16_avx512bw_harvey_seal(data, len, out)); // large
3527 | }
3528 | #endif
3529 | 
3530 | #if defined(STORM_HAVE_AVX2)
3531 | if ((cpuid & STORM_CPUID_runtime_bit_AVX2))
3532 | {
3533 | if (len < 128) return(STORM_pospopcnt_u16_sse_sad(data, len, out)); // small
3534 | else if (len < 1024) return(STORM_pospopcnt_u16_avx2_blend_popcnt_unroll8(data, len, out)); // medium
3535 | else return(STORM_pospopcnt_u16_avx2_harvey_seal(data, len, out)); // large
3536 | }
3537 | #endif
3538 | 
3539 | #if defined(STORM_HAVE_SSE42)
3540 | if ((cpuid & STORM_CPUID_runtime_bit_SSE42))
3541 | {
3542 | return(STORM_pospopcnt_u16_sse_harvey_seal(data, len, out));
3543 | }
3544 | #endif
3545 | 
3546 | #ifndef _MSC_VER
3547 | return(STORM_pospopcnt_u16_scalar_umul128_unroll2(data, len, out)); // fallback scalar
3548 | #else
3549 | return(STORM_pospopcnt_u16_scalar_naive(data, len, out));
3550 | #endif
3551 | }
3552 | 
3553 | #ifdef __cplusplus
3554 | } /* extern "C" */
3555 | #endif
3556 | 
3557 | #endif /* LIBALGEBRA_H_8723467365934 */
3558 | --------------------------------------------------------------------------------
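For orientation, a minimal caller of the public entry points above (an illustrative sketch, not a file from this repository; it assumes libalgebra.h is on the include path):

#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include "libalgebra.h"

int main(void) {
    /* Two 8192-bit bitmaps (128 x 64-bit words). */
    uint64_t a[128], b[128];
    memset(a, 0xAA, sizeof(a));
    memset(b, 0xCC, sizeof(b));

    /* Fused set-algebra cardinalities; dispatch picks the best kernel. */
    printf("intersect=%llu union=%llu diff=%llu\n",
           (unsigned long long)STORM_intersect_count(a, b, 128),
           (unsigned long long)STORM_union_count(a, b, 128),
           (unsigned long long)STORM_diff_count(a, b, 128));

    /* Positional popcount: flags[j] = number of values with bit j set. */
    uint16_t v[1024];
    for (int k = 0; k < 1024; ++k) v[k] = (uint16_t)k;
    uint32_t flags[16];
    STORM_pospopcnt_u16(v, 1024, flags);
    printf("bit 0 is set in %u of 1024 values\n", flags[0]);
    return 0;
}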