├── .gitignore ├── .travis.yml ├── CMakeLists.txt ├── LICENSE ├── README.md ├── appveyor.yml ├── benchmark.cpp └── libalgebra.h /.gitignore: -------------------------------------------------------------------------------- 1 | # Prerequisites 2 | *.d 3 | 4 | # Object files 5 | *.o 6 | *.ko 7 | *.obj 8 | *.elf 9 | 10 | # Linker output 11 | *.ilk 12 | *.map 13 | *.exp 14 | 15 | # Precompiled Headers 16 | *.gch 17 | *.pch 18 | 19 | # Libraries 20 | *.lib 21 | *.a 22 | *.la 23 | *.lo 24 | 25 | # Shared objects (inc. Windows DLLs) 26 | *.dll 27 | *.so 28 | *.so.* 29 | *.dylib 30 | 31 | # Executables 32 | *.exe 33 | *.out 34 | *.app 35 | *.i*86 36 | *.x86_64 37 | *.hex 38 | 39 | # Debug files 40 | *.dSYM/ 41 | *.su 42 | *.idb 43 | *.pdb 44 | 45 | # Kernel Module Compile Results 46 | *.mod* 47 | *.cmd 48 | .tmp_versions/ 49 | modules.order 50 | Module.symvers 51 | Mkfile.old 52 | dkms.conf 53 | 54 | # Executable 55 | benchmark 56 | # VSCode 57 | settings.json 58 | # CMake 59 | CMakeFiles 60 | cmake_* 61 | Makefile 62 | CMakeCache.txt 63 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | # Test configuration for Travis CI. See https://docs.travis-ci.com/. 2 | language: cpp 3 | 4 | matrix: 5 | fast_finish: true 6 | 7 | include: 8 | - name: Xenial (g++-4.9) 9 | os: linux 10 | dist: xenial 11 | addons: 12 | apt: 13 | packages: 14 | - g++-4.9 15 | sources: &sources 16 | - ubuntu-toolchain-r-test 17 | env: 18 | - COMPILERS="CC=gcc-4.9 && CXX=g++-4.9" 19 | 20 | - name: Xenial (g++-5) 21 | os: linux 22 | dist: xenial 23 | addons: 24 | apt: 25 | packages: 26 | - g++-5 27 | sources: *sources 28 | env: 29 | - COMPILERS="CC=gcc-5 && CXX=g++-5" 30 | 31 | - name: Xenial (g++-6) 32 | os: linux 33 | dist: xenial 34 | addons: 35 | apt: 36 | packages: 37 | - g++-6 38 | sources: *sources 39 | env: 40 | - COMPILERS="CC=gcc-6 && CXX=g++-6" 41 | 42 | - name: Xenial (g++-7) 43 | os: linux 44 | dist: xenial 45 | addons: 46 | apt: 47 | packages: 48 | - g++-7 49 | - cmake 50 | sources: *sources 51 | env: 52 | - COMPILERS="CC=gcc-7 && CXX=g++-7" 53 | 54 | - name: Xenial (g++-8) 55 | os: linux 56 | dist: xenial 57 | addons: 58 | apt: 59 | packages: 60 | - g++-8 61 | sources: *sources 62 | env: 63 | - COMPILERS="CC=gcc-8 && CXX=g++-8" 64 | 65 | - name: Xenial (clang++-3.7) 66 | os: linux 67 | dist: xenial 68 | addons: 69 | apt: 70 | sources: 71 | - ubuntu-toolchain-r-test 72 | - llvm-toolchain-xenial-3.7 73 | packages: 74 | - clang-3.7 75 | env: 76 | - COMPILERS="CC=clang-3.7 && CXX=clang++-3.7" 77 | compiler: clang 78 | 79 | - name: Xenial (clang++-3.8) 80 | os: linux 81 | dist: xenial 82 | addons: 83 | apt: 84 | sources: 85 | - ubuntu-toolchain-r-test 86 | - llvm-toolchain-xenial-3.8 87 | packages: 88 | - clang-3.8 89 | env: 90 | - COMPILERS="CC=clang-3.8 && CXX=clang++-3.8" 91 | compiler: clang 92 | 93 | - name: Xenial (clang++-3.9) 94 | os: linux 95 | dist: xenial 96 | addons: 97 | apt: 98 | sources: 99 | - llvm-toolchain-xenial-3.9 100 | packages: 101 | - clang-3.9 102 | env: 103 | - COMPILERS="CC=clang-3.9 && CXX=clang++-3.9" 104 | compiler: clang 105 | 106 | - name: Xenial (clang++-4.0) 107 | os: linux 108 | dist: xenial 109 | addons: 110 | apt: 111 | sources: 112 | - llvm-toolchain-xenial-4.0 113 | packages: 114 | - clang-4.0 115 | env: 116 | - COMPILERS="CC=clang-4.0 && CXX=clang++-4.0" 117 | compiler: clang 118 | 119 | - name: Xenial (clang++-5.0) 120 | os: linux 121 | dist: xenial 122 | addons: 123 | apt: 124 | sources: 125 | - llvm-toolchain-xenial-5.0 126 | - ubuntu-toolchain-r-test 127 | packages: 128 | - clang-5.0 129 | env: 130 | - COMPILERS="CC=clang-5.0 && CXX=clang++-5.0" 131 | compiler: clang 132 | 133 | - name: Xenial (clang++-6.0) 134 | os: linux 135 | dist: xenial 136 | addons: 137 | apt: 138 | sources: 139 | - ubuntu-toolchain-r-test 140 | - llvm-toolchain-xenial-6.0 141 | packages: 142 | - clang-6.0 143 | - libstdc++-6-dev 144 | env: 145 | - COMPILERS="CC=clang-6.0 && CXX=clang++-6.0" 146 | compiler: clang 147 | 148 | - name: Xenial (clang++-7.0) 149 | os: linux 150 | dist: xenial 151 | addons: 152 | apt: 153 | sources: 154 | - ubuntu-toolchain-r-test 155 | - llvm-toolchain-xenial-7 156 | packages: 157 | - clang-7 158 | - libstdc++-7-dev 159 | env: 160 | - COMPILERS="CC=clang-7 && CXX=clang++-7" 161 | compiler: clang 162 | 163 | # OSX begin 164 | - os: osx 165 | compiler: clang 166 | 167 | install: 168 | ############################################################################ 169 | # Install a recent CMake (unless already installed on OS X) 170 | ############################################################################ 171 | - CMAKE_VERSION=3.7.2 172 | - | 173 | if [[ "${TRAVIS_OS_NAME}" == "linux" ]]; then 174 | CMAKE_URL="https://cmake.org/files/v${CMAKE_VERSION%.[0-9]}/cmake-${CMAKE_VERSION}-Linux-x86_64.tar.gz" 175 | mkdir cmake && travis_retry wget --no-check-certificate -O - ${CMAKE_URL} | tar --strip-components=1 -xz -C cmake 176 | export PATH=${PWD}/cmake/bin:${PATH} 177 | else 178 | brew install cmake || brew upgrade cmake 179 | fi 180 | - cmake --version 181 | 182 | before_script: 183 | - cd "${TRAVIS_BUILD_DIR}" 184 | # Overwrite default CC and CXX 185 | - eval "${COMPILERS}" 186 | 187 | script: 188 | - cmake . 189 | - make 190 | - | 191 | if [[ "${TRAVIS_OS_NAME}" == "linux" ]]; then 192 | sudo ./benchmark -r 10 193 | else 194 | ./benchmark -r 10 195 | fi -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 2.8 FATAL_ERROR) 2 | project(libalgebra C CXX) 3 | set (CMAKE_BUILD_TYPE Release) 4 | set (CMAKE_CXX_FLAGS "-std=c++11") 5 | 6 | include_directories(.) 7 | 8 | add_executable(benchmark benchmark.cpp) 9 | 10 | install(FILES libalgebra.h DESTINATION ${CMAKE_INSTALL_PREFIX}/include) 11 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity.
For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 
134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 
193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![Build Status](https://travis-ci.com/mklarqvist/libalgebra.svg)](https://travis-ci.com/mklarqvist/libalgebra) 2 | [![Build Status](https://ci.appveyor.com/api/projects/status/github/mklarqvist/libalgebra?branch=master&svg=true)](https://ci.appveyor.com/project/mklarqvist/libalgebra) 3 | [![Github Releases](https://img.shields.io/github/release/mklarqvist/libalgebra.svg)](https://github.com/mklarqvist/libalgebra/releases) 4 | [![License](https://img.shields.io/badge/Apache-2.0-blue.svg)](LICENSE) 5 | 6 | # libalgebra 7 | 8 | ```libalgebra.h``` is a header-only C/C++ library for: 9 | * counting the number of set bits ("population count", `popcnt`) in an array 10 | * counting the number of set bits at each position ("positional population count", `pospopcnt`) in an array 11 | * performing set-algebraic operations on bitmaps, including union, intersection, and diff cardinalities 12 | 13 | using specialized CPU instructions, e.g. 14 | [POPCNT](https://en.wikipedia.org/wiki/SSE4#POPCNT_and_LZCNT), 15 | [SSE4.2](https://en.wikipedia.org/wiki/SSE4#SSE4.2), 16 | [AVX2](https://en.wikipedia.org/wiki/Advanced_Vector_Extensions), 17 | [AVX512BW](https://en.wikipedia.org/wiki/Advanced_Vector_Extensions), 18 | [NEON](https://en.wikipedia.org/wiki/ARM_architecture#Advanced_SIMD_.28NEON.29). ```libalgebra.h``` has been tested successfully using the GCC, 19 | Clang, and MSVC compilers. 20 | 21 | The core algorithms are described in the papers: 22 | 23 | * [Faster Population Counts using AVX2 Instructions](https://arxiv.org/abs/1611.07612) by Daniel Lemire, Nathan Kurz, 24 | and Wojciech Muła (23 Nov 2016). 25 | * [Efficient Computation of Positional Population Counts Using SIMD Instructions](https://arxiv.org/abs/1911.02696) by Marcus D. R. Klarqvist, Wojciech Muła, and Daniel Lemire 26 | * [Consistently faster and smaller compressed bitmaps with Roaring](https://arxiv.org/abs/1603.06549) by D. Lemire, G. Ssi-Yan-Kai, 27 | and O. Kaser (21 Mar 2016). 28 | 29 | ### Speedup 30 | 31 | Sample performance metrics (a practical upper limit) on an AVX512BW machine. We simulate a single data array, or pairs of data arrays, in an aligned memory location and compute the same statistics many times using the command `benchmark -p -r 10000` (requires the Linux `perf` subsystem). This reflects the fastest possible throughput if you never have to leave the destination cache level. 32 | The host architecture used is a 10 nm Cannon Lake [Core i3-8121U](https://ark.intel.com/content/www/us/en/ark/products/136863/intel-core-i3-8121u-processor-4m-cache-up-to-3-20-ghz.html) with gcc (GCC) 8.2.1 20180905 (Red Hat 8.2.1-3). 33 | 34 | ### POSPOPCNT 35 | 36 | This benchmark shows the speedup of the four `pospopcnt` algorithms used on x86 37 | CPUs compared to a naive unvectorized solution 38 | (`pospopcnt_u16_scalar_naive_nosimd`) for different array sizes (in number of 39 | 2-byte values).
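For reference, a positional population count maintains sixteen counters, one per bit position of the 16-bit input words. A minimal scalar sketch of these semantics (this mirrors what the naive baseline computes; it is not the library's tuned code):

```C
#include <stdint.h>

/* flags[j] accumulates how many of the n input words have bit j set. */
static void pospopcnt_u16_reference(const uint16_t* data, uint32_t n, uint32_t* flags) {
    for (uint32_t i = 0; i < n; ++i) {
        for (int j = 0; j < 16; ++j)
            flags[j] += (data[i] >> j) & 1;
    }
}
```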
40 | 41 | | Algorithm | 128 | 256 | 512 | 1024 | 2048 | 4096 | 8192 | 65536 | 42 | |-----------------------------------|------|-------|-------|-------|-------|-------|-------|--------| 43 | | pospopcnt_u16_sse_blend_popcnt_unroll8 | **8.28** | 9.84 | 10.55 | 11 | 11.58 | 11.93 | 12.13 | 12.28 | 44 | | pospopcnt_u16_avx512_blend_popcnt_unroll8 | 7.07 | **11.25** | **16.21** | 21 | 25.49 | 27.91 | 29.73 | 31.55 | 45 | | pospopcnt_u16_avx512_adder_forest | 3.05 | 2.82 | 14.53 | **23.13** | **34.37** | 44.91 | 52.78 | 61.68 | 46 | | pospopcnt_u16_avx512_harvey_seal | 2.07 | 2.3 | 8.21 | 15.41 | 28.17 | **49.14** | **76.11** | **138.71** | 47 | 48 | ### POPCNT 49 | 50 | Fold speedup compared to a naive unvectorized algorithm 51 | (`popcount_scalar_naive_nosimd`) for different array sizes, reported as (CPU cycles/64-bit word, instructions/64-bit word): 52 | 53 | | Words | libalgebra.h | Scalar | Speedup | 54 | |---------|--------------|---------------|---------| 55 | | 4 | 27.75 (37) | 26.75 (33.5) | 1 | 56 | | 8 | 16.38 (25.5) | 17.38 (30.25) | 1.1 | 57 | | 16 | 10.5 (19.94) | 12.75 (28.63) | 1.2 | 58 | | 32 | 7.72 (17.16) | 10.69 (27.81) | 1.4 | 59 | | 64 | 3.09 (4.36) | 9.61 (27.41) | 3.1 | 60 | | 128 | 2.53 (2.73) | 8.84 (27.2) | 3.5 | 61 | | 256 | 1.35 (1.7) | 8.5 (27.1) | 6.3 | 62 | | 512 | 0.67 (1.18) | 8.33 (27.05) | 12.4 | 63 | | 1024 | 0.5 (0.92) | 8.25 (27.03) | 16.4 | 64 | | 2048 | 0.41 (0.79) | 8.15 (27.01) | 20.1 | 65 | | 4096 | 0.46 (0.72) | 8.12 (27.01) | 17.8 | 66 | | 8192 | 0.39 (0.69) | 8.11 (27) | 21 | 67 | | 16384 | 0.39 (0.67) | 8.1 (27) | 20.6 | 68 | | 32768 | 0.89 (0.66) | 8.1 (27) | 9.1 | 69 | | 65536 | 0.84 (0.66) | 8.1 (27) | 9.6 | 70 | | 131072 | 0.68 (0.66) | 8.09 (27) | 11.9 | 71 | | 262144 | 1.11 (0.66) | 8.09 (27) | 7.3 | 72 | | 524288 | 1.84 (0.66) | 8.12 (27) | 4.4 | 73 | | 1048576 | 1.95 (0.66) | 8.15 (27) | 4.2 | 74 | 75 | ### Set algebra 76 | 77 | Fold speedup compared to a naive unvectorized solution (`*_scalar_naive_nosimd`) 78 | for different array sizes (in number of _pairs_ of 64-bit words, with results reported per _single_ 64-bit word). These 79 | functions are identical except for the bitwise operator used (AND, 80 | OR, or XOR), all of which have identical latency and throughput (CPI).
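Since the kernels differ only in that operator, a scalar sketch of the intersection case is representative (illustrative code using the GCC/Clang `__builtin_popcountll` intrinsic, not the library's dispatched kernels):

```C
#include <stdint.h>
#include <stddef.h>

/* Intersection cardinality; swap '&' for '|' (union) or '^' (diff). */
static uint64_t intersect_count_ref(const uint64_t* a, const uint64_t* b, size_t n) {
    uint64_t total = 0;
    for (size_t i = 0; i < n; ++i)
        total += (uint64_t)__builtin_popcountll(a[i] & b[i]);
    return total;
}
```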
81 | 82 | | Words | libalgebra.h | Scalar | Speedup | 83 | |---------|--------------|---------------|---------| 84 | | 4 | 17.63 (8.63) | 14.63 (22.75) | 0.8 | 85 | | 8 | 8.13 (5.44) | 10 (20.88) | 1.2 | 86 | | 16 | 4.69 (3.84) | 7.91 (19.94) | 1.7 | 87 | | 32 | 2.38 (2.56) | 6.59 (19.47) | 2.8 | 88 | | 64 | 1.82 (2.06) | 5.87 (19.23) | 3.2 | 89 | | 128 | 0.88 (0.89) | 5.43 (19.12) | 6.2 | 90 | | 256 | 0.57 (0.64) | 5.18 (19.06) | 9.2 | 91 | | 512 | 0.41 (0.51) | 5.11 (19.03) | 12.4 | 92 | | 1024 | 0.33 (0.45) | 5.06 (19.02) | 15.3 | 93 | | 2048 | 0.39 (0.41) | 5.03 (19.01) | 13.1 | 94 | | 4096 | 0.36 (0.4) | 5.02 (19) | 13.9 | 95 | | 8192 | 0.37 (0.39) | 5.01 (19) | 13.7 | 96 | | 16384 | 0.55 (0.39) | 5.01 (19) | 9.1 | 97 | | 32768 | 0.55 (0.39) | 5 (19) | 9.2 | 98 | | 65536 | 0.52 (0.38) | 5 (19) | 9.7 | 99 | | 131072 | 0.56 (0.38) | 5.01 (19) | 9 | 100 | | 262144 | 1.25 (0.38) | 5.02 (19) | 4 | 101 | | 524288 | 1.76 (0.38) | 5.03 (19) | 2.9 | 102 | | 1048576 | 1.81 (0.38) | 5.07 (19) | 2.8 | 103 | 104 | ## C/C++ API 105 | 106 | ```C 107 | #include "libalgebra.h" 108 | 109 | /* 110 | * Count the number of 1 bits in the data array 111 | * @data: An array 112 | * @size: Size of data in bytes 113 | */ 114 | uint64_t STORM_popcnt(const void* data, uint64_t size); 115 | ``` 116 | 117 | ```C 118 | #include "libalgebra.h" 119 | 120 | /* 121 | * Count the number of 1 bits for each position in the data array 122 | * @data: A 16-bit array 123 | * @size: Size of data in bytes 124 | * @flags: Output vector[16] 125 | */ 126 | uint32_t flags[16]; 127 | int STORM_pospopcnt_u16(const uint16_t* data, uint32_t size, uint32_t* flags); 128 | ``` 129 | 130 | ```C 131 | #include "libalgebra.h" 132 | 133 | /* 134 | * Compute the intersection, union, or diff cardinality between pairs of bitmaps 135 | * @data1: A 64-bit array 136 | * @data2: A 64-bit array 137 | * @size: Size of data in 64-bit words 138 | */ 139 | // Intersect cardinality 140 | uint64_t STORM_intersect_count(const uint64_t* data1, const uint64_t* data2, const uint32_t size); 141 | // Union cardinality 142 | uint64_t STORM_union_count(const uint64_t* data1, const uint64_t* data2, const uint32_t size); 143 | // Diff cardinality 144 | uint64_t STORM_diff_count(const uint64_t* data1, const uint64_t* data2, const uint32_t size); 145 | ``` 146 | 147 | ### Advanced use 148 | 149 | Retrieve a function pointer to the optimal function for a given target length. 150 | 151 | ```C 152 | STORM_compute_func STORM_get_intersection_count_func(const size_t n_bitmaps_vector); 153 | STORM_compute_func STORM_get_union_count_func(const size_t n_bitmaps_vector); 154 | STORM_compute_func STORM_get_diff_count_func(const size_t n_bitmaps_vector); 155 | ``` 156 | 157 | Portable memory alignment. 158 | 159 | ```C 160 | #include "libalgebra.h" 161 | 162 | void* STORM_aligned_malloc(size_t alignment, size_t size); 163 | void STORM_aligned_free(void* memblock); 164 | ``` 165 | 166 | ## How it works 167 | 168 | On x86 CPUs ```libalgebra.h``` uses a combination of algorithms depending on the input vector size and what instruction set your CPU supports. These checks are performed at **run-time**.
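As a quick start, here is a hedged end-to-end sketch (not shipped with the repository): it uses only the calls documented above plus `STORM_get_alignment()`, which the bundled `benchmark.cpp` uses to pick the allocation alignment, and it follows the `@size`-in-bytes convention stated in the comments above for `STORM_popcnt` and `STORM_pospopcnt_u16`.

```C
#include <stdio.h>
#include <string.h>
#include "libalgebra.h"

int main(void) {
    const uint32_t n_words = 1024;                     /* 64-bit words per bitmap */
    const size_t   n_bytes = n_words * sizeof(uint64_t);
    uint64_t* a = (uint64_t*)STORM_aligned_malloc(STORM_get_alignment(), n_bytes);
    uint64_t* b = (uint64_t*)STORM_aligned_malloc(STORM_get_alignment(), n_bytes);
    memset(a, 0xAA, n_bytes); /* 10101010 per byte */
    memset(b, 0x55, n_bytes); /* 01010101 per byte */

    /* Population count over raw bytes. */
    printf("popcnt(a) = %llu\n", (unsigned long long)STORM_popcnt(a, n_bytes));

    /* Pairwise set-algebra cardinalities; size is in 64-bit words. */
    printf("|a AND b| = %llu\n", (unsigned long long)STORM_intersect_count(a, b, n_words));
    printf("|a OR b|  = %llu\n", (unsigned long long)STORM_union_count(a, b, n_words));
    printf("|a XOR b| = %llu\n", (unsigned long long)STORM_diff_count(a, b, n_words));

    /* Positional population count over the same memory viewed as 16-bit words. */
    uint32_t flags[16] = {0};
    STORM_pospopcnt_u16((const uint16_t*)a, (uint32_t)n_bytes, flags);
    printf("bit 1 set in %u of the 16-bit words\n", flags[1]);

    STORM_aligned_free(a);
    STORM_aligned_free(b);
    return 0;
}
```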
-------------------------------------------------------------------------------- /appveyor.yml: -------------------------------------------------------------------------------- 1 | # Automated Windows (MSVC++) testing using appveyor.com 2 | # https://ci.appveyor.com/projects 3 | 4 | version: 1.0.{build} 5 | 6 | branches: 7 | except: 8 | - gh-pages 9 | 10 | os: Visual Studio 2017 11 | 12 | platform: 13 | - x86 14 | - x64 15 | 16 | build_script: 17 | - if "%platform%" == "x86" cmake -G "Visual Studio 15 2017" . 18 | - if "%platform%" == "x64" cmake -G "Visual Studio 15 2017 Win64" . 19 | - cmake --build . --config Release 20 | 21 | test_script: 22 | - ps: C:\projects\libalgebra\Release\benchmark.exe -------------------------------------------------------------------------------- /benchmark.cpp: -------------------------------------------------------------------------------- 1 | #include "libalgebra.h" 2 | 3 | #include <cstdio> 4 | #include <iostream> 5 | #include <random> 6 | #include <chrono> 7 | #include <vector> 8 | #if !defined(_MSC_VER) 9 | #include "getopt.h" 10 | #endif 11 | 12 | uint64_t* generate_random_data(uint32_t n_bitmaps) { 13 | // Clear data 14 | // uint32_t n_bitmaps = ceil(n / 64.0); 15 | // memset(data, 0, sizeof(uint64_t)*n_bitmaps); 16 | uint64_t* mem = (uint64_t*)STORM_aligned_malloc(STORM_get_alignment(), n_bitmaps*sizeof(uint64_t)); 17 | 18 | // PRNG 19 | std::uniform_int_distribution<uint32_t> distr(0, std::numeric_limits<uint32_t>::max()-1); // right inclusive 20 | std::random_device rd; // obtain a random number from hardware 21 | std::mt19937 eng(rd()); // seed the generator 22 | 23 | // Generate some random data. 24 | uint32_t n_unique = 0; 25 | // while (n_unique < n) { 26 | for (uint32_t i = 0; i < n_bitmaps; ++i) { 27 | uint32_t val1 = distr(eng); 28 | uint32_t val2 = distr(eng); 29 | uint64_t x = ((uint64_t)val1 << 32) | val2; 30 | mem[i] = x; 31 | } 32 | 33 | return mem; 34 | } 35 | 36 | #if !defined(__clang__) && !defined(_MSC_VER) 37 | __attribute__((optimize("no-tree-vectorize"))) 38 | #endif 39 | uint64_t popcount_scalar_naive_nosimd(const uint8_t* data, size_t len) { 40 | uint64_t total = 0; 41 | // for (int i = 0; i < len; ++i) { 42 | // total += STORM_popcount64(data1[i] & data2[i]); 43 | // } 44 | // assert(len % 8 == 0); 45 | 46 | for (size_t j = 0; j < len; j += 8) { 47 | // total += STORM_popcount64(data[i]); 48 | // diff = data1[i] & data2[i]; 49 | total += STORM_popcnt_lookup8bit[data[j+0]]; 50 | total += STORM_popcnt_lookup8bit[data[j+1]]; 51 | total += STORM_popcnt_lookup8bit[data[j+2]]; 52 | total += STORM_popcnt_lookup8bit[data[j+3]]; 53 | total += STORM_popcnt_lookup8bit[data[j+4]]; 54 | total += STORM_popcnt_lookup8bit[data[j+5]]; 55 | total += STORM_popcnt_lookup8bit[data[j+6]]; 56 | total += STORM_popcnt_lookup8bit[data[j+7]]; 57 | } 58 | 59 | return total; 60 | } 61 | 62 | #ifdef __linux__ 63 | 64 | #include <asm/unistd.h> // for __NR_perf_event_open 65 | #include <linux/perf_event.h> // for perf event constants 66 | #include <sys/ioctl.h> // for ioctl 67 | #include <unistd.h> // for syscall 68 | #include <iostream> 69 | #include <cerrno> // for errno 70 | #include <cstring> // for memset 71 | #include <stdexcept> 72 | 73 | #include <vector> 74 | 75 | template <int TYPE = PERF_TYPE_HARDWARE> 76 | class LinuxEvents { 77 | int fd; 78 | bool working; 79 | perf_event_attr attribs; 80 | int num_events; 81 | std::vector<uint64_t> temp_result_vec; 82 | std::vector<uint64_t> ids; 83 | 84 | public: 85 | explicit LinuxEvents(std::vector<int> config_vec) : fd(0), working(true) { 86 | memset(&attribs, 0, sizeof(attribs)); 87 | attribs.type = TYPE; 88 | attribs.size = sizeof(attribs); 89 | attribs.disabled = 1; 90 | attribs.exclude_kernel = 1; 91 | attribs.exclude_hv = 1; 92 | 93 | attribs.sample_period = 0; 94 | attribs.read_format = PERF_FORMAT_GROUP | PERF_FORMAT_ID; 95 | const int pid = 0; // the current process 96 | const int cpu = -1; // all CPUs 97 | const unsigned long flags = 0; 98 | 99 | int group = -1; // no group 100 | num_events = config_vec.size(); ids.resize(num_events); // one id slot per event, written via ioctl below 101 | uint32_t i = 0; 102 | for (auto config : config_vec) { 103 | attribs.config = config; 104 | fd = syscall(__NR_perf_event_open, &attribs, pid, cpu, group, flags); 105 | if (fd == -1) { 106 | report_error("perf_event_open"); 107 | } 108 | ioctl(fd, PERF_EVENT_IOC_ID, &ids[i++]); 109 | if (group == -1) { 110 | group = fd; 111 | } 112 | } 113 | 114 | temp_result_vec.resize(num_events * 2 + 1); 115 | } 116 | 117 | ~LinuxEvents() { close(fd); } 118 | 119 | inline void start() { 120 | if (ioctl(fd, PERF_EVENT_IOC_RESET, PERF_IOC_FLAG_GROUP) == -1) { 121 | report_error("ioctl(PERF_EVENT_IOC_RESET)"); 122 | } 123 | 124 | if (ioctl(fd, PERF_EVENT_IOC_ENABLE, PERF_IOC_FLAG_GROUP) == -1) { 125 | report_error("ioctl(PERF_EVENT_IOC_ENABLE)"); 126 | } 127 | } 128 | 129 | inline void end(std::vector<unsigned long long> &results) { 130 | if (ioctl(fd, PERF_EVENT_IOC_DISABLE, PERF_IOC_FLAG_GROUP) == -1) { 131 | report_error("ioctl(PERF_EVENT_IOC_DISABLE)"); 132 | } 133 | 134 | if (read(fd, &temp_result_vec[0], temp_result_vec.size() * 8) == -1) { 135 | report_error("read"); 136 | } 137 | // our actual results are in slots 1,3,5, ... of this structure 138 | // we really should be checking our ids obtained earlier to be safe 139 | for (uint32_t i = 1; i < temp_result_vec.size(); i += 2) { 140 | results[i / 2] = temp_result_vec[i]; 141 | } 142 | } 143 | 144 | private: 145 | void report_error(const std::string &context) { 146 | if (working) 147 | std::cerr << (context + ": " + std::string(strerror(errno))) << std::endl; 148 | working = false; 149 | } 150 | }; 151 | 152 | std::vector<unsigned long long> 153 | compute_mins(std::vector< std::vector<unsigned long long> > allresults) { 154 | if (allresults.size() == 0) 155 | return std::vector<unsigned long long>(); 156 | 157 | std::vector<unsigned long long> answer = allresults[0]; 158 | 159 | for (size_t k = 1; k < allresults.size(); k++) { 160 | assert(allresults[k].size() == answer.size()); 161 | for (size_t z = 0; z < answer.size(); z++) { 162 | if (allresults[k][z] < answer[z]) 163 | answer[z] = allresults[k][z]; 164 | } 165 | } 166 | return answer; 167 | } 168 | 169 | std::vector<double> 170 | compute_averages(std::vector< std::vector<unsigned long long> > allresults) { 171 | if (allresults.size() == 0) 172 | return std::vector<double>(); 173 | 174 | std::vector<double> answer(allresults[0].size()); 175 | 176 | for (size_t k = 0; k < allresults.size(); k++) { 177 | assert(allresults[k].size() == answer.size()); 178 | for (size_t z = 0; z < answer.size(); z++) { 179 | answer[z] += allresults[k][z]; 180 | } 181 | } 182 | 183 | for (size_t z = 0; z < answer.size(); z++) { 184 | answer[z] /= allresults.size(); 185 | } 186 | return answer; 187 | } 188 | 189 | int linux_set_algebra_wrapper(std::string name, 190 | STORM_compute_func f, 191 | int iterations, 192 | uint32_t range, 193 | uint32_t n_values, 194 | uint32_t n_bitmaps, 195 | bool verbose) 196 | { 197 | std::vector<int> evts; 198 | evts.push_back(PERF_COUNT_HW_CPU_CYCLES); 199 | evts.push_back(PERF_COUNT_HW_INSTRUCTIONS); 200 | evts.push_back(PERF_COUNT_HW_BRANCH_MISSES); 201 | evts.push_back(PERF_COUNT_HW_CACHE_REFERENCES); 202 | evts.push_back(PERF_COUNT_HW_CACHE_MISSES); 203 | evts.push_back(PERF_COUNT_HW_REF_CPU_CYCLES); 204 | LinuxEvents<PERF_TYPE_HARDWARE> unified(evts); 205 | std::vector<unsigned long long> results; // tmp buffer 206 | std::vector< std::vector<unsigned long long> > allresults; 207 | results.resize(evts.size()); 208 | 209 | std::random_device rd; 210 | std::mt19937 gen(rd()); 211 | std::uniform_int_distribution<> dis(0, 0xFFFF); 212 | 213 | volatile uint64_t total = 0; // volatile to prevent the compiler from optimizing the work away 214 | for (int i = 0; i < iterations; i++) { 215 | uint64_t* mem1 = generate_random_data(n_values); 216 | uint64_t* mem2 = generate_random_data(n_values); 217 | 218 | unified.start(); 219 | // Call argument subroutine pointer. 220 | total += (*f)(mem1, mem2, n_bitmaps); 221 | unified.end(results); 222 | allresults.push_back(results); 223 | 224 | STORM_aligned_free(mem1); 225 | STORM_aligned_free(mem2); 226 | } 227 | 228 | std::vector<unsigned long long> mins = compute_mins(allresults); 229 | std::vector<double> avg = compute_averages(allresults); 230 | 231 | if (verbose) { 232 | printf("%s\t%u\t%.2f\t%.3f\t%.3f\t%llu\t%llu\t%llu\t%llu\t%llu\t%.1f\t%.1f\t%.1f\t%.1f\t%.1f\n", 233 | name.c_str(), 234 | n_bitmaps, 235 | double(mins[1]) / mins[0], 236 | double(mins[0]) / (2*n_bitmaps), 237 | double(mins[1]) / (2*n_bitmaps), 238 | mins[0], 239 | mins[1], 240 | mins[2], 241 | mins[3], 242 | mins[4], 243 | avg[0], 244 | avg[1], 245 | avg[2], 246 | avg[3], 247 | avg[4]); 248 | } else { 249 | printf("%s-%u:\n",name.c_str(),n_bitmaps); 250 | printf("instructions per cycle %4.2f, cycles per 64-bit word: %4.3f, " 251 | "instructions per 64-bit word %4.3f \n", 252 | double(mins[1]) / mins[0], double(mins[0]) / (2*n_bitmaps), double(mins[1]) / (2*n_bitmaps)); 253 | // first we display mins 254 | printf("min: %8llu cycles, %8llu instructions, \t%8llu branch mis., %8llu " 255 | "cache ref., %8llu cache mis.\n", 256 | mins[0], mins[1], mins[2], mins[3], mins[4]); 257 | printf("avg: %8.1f cycles, %8.1f instructions, \t%8.1f branch mis., %8.1f " 258 | "cache ref., %8.1f cache mis.\n", 259 | avg[0], avg[1], avg[2], avg[3], avg[4]); 260 | } 261 | 262 | return 1; 263 | } 264 | 265 | int linux_popcount_wrapper(std::string name, 266 | STORM_popcnt_func f, 267 | int iterations, 268 | uint32_t range, 269 | uint32_t n_values, 270 | uint32_t n_bitmaps, 271 | bool verbose) 272 | { 273 | std::vector<int> evts; 274 | evts.push_back(PERF_COUNT_HW_CPU_CYCLES); 275 | evts.push_back(PERF_COUNT_HW_INSTRUCTIONS); 276 | evts.push_back(PERF_COUNT_HW_BRANCH_MISSES); 277 | evts.push_back(PERF_COUNT_HW_CACHE_REFERENCES); 278 | evts.push_back(PERF_COUNT_HW_CACHE_MISSES); 279 | evts.push_back(PERF_COUNT_HW_REF_CPU_CYCLES); 280 | LinuxEvents<PERF_TYPE_HARDWARE> unified(evts); 281 | std::vector<unsigned long long> results; // tmp buffer 282 | std::vector< std::vector<unsigned long long> > allresults; 283 | results.resize(evts.size()); 284 | 285 | std::random_device rd; 286 | std::mt19937 gen(rd()); 287 | std::uniform_int_distribution<> dis(0, 0xFFFF); 288 | 289 | volatile uint64_t total = 0; // volatile to prevent the compiler from optimizing the work away 290 | for (int i = 0; i < iterations; i++) { 291 | uint64_t* mem1 = generate_random_data(n_values); 292 | 293 | unified.start(); 294 | // Call argument subroutine pointer. 295 | uint64_t a = (*f)((uint8_t*)mem1, n_bitmaps*8); 296 | unified.end(results); 297 | allresults.push_back(results); 298 | 299 | uint64_t b = popcount_scalar_naive_nosimd((uint8_t*)mem1, n_bitmaps*8); 300 | assert(a == b); 301 | total += a; 302 | 303 | STORM_aligned_free(mem1); 304 | } 305 | 306 | std::vector<unsigned long long> mins = compute_mins(allresults); 307 | std::vector<double> avg = compute_averages(allresults); 308 | 309 | if (verbose) { 310 | printf("%s\t%u\t%.2f\t%.3f\t%.3f\t%llu\t%llu\t%llu\t%llu\t%llu\t%.1f\t%.1f\t%.1f\t%.1f\t%.1f\n", 311 | name.c_str(), 312 | n_bitmaps, 313 | double(mins[1]) / mins[0], 314 | double(mins[0]) / (n_bitmaps), 315 | double(mins[1]) / (n_bitmaps), 316 | mins[0], 317 | mins[1], 318 | mins[2], 319 | mins[3], 320 | mins[4], 321 | avg[0], 322 | avg[1], 323 | avg[2], 324 | avg[3], 325 | avg[4]); 326 | } else { 327 | printf("%s-%u:\n",name.c_str(),n_bitmaps); 328 | printf("instructions per cycle %4.2f, cycles per 64-bit word: %4.3f, " 329 | "instructions per 64-bit word %4.3f \n", 330 | double(mins[1]) / mins[0], double(mins[0]) / (n_bitmaps), double(mins[1]) / (n_bitmaps)); 331 | // first we display mins 332 | printf("min: %8llu cycles, %8llu instructions, \t%8llu branch mis., %8llu " 333 | "cache ref., %8llu cache mis.\n", 334 | mins[0], mins[1], mins[2], mins[3], mins[4]); 335 | printf("avg: %8.1f cycles, %8.1f instructions, \t%8.1f branch mis., %8.1f " 336 | "cache ref., %8.1f cache mis.\n", 337 | avg[0], avg[1], avg[2], avg[3], avg[4]); 338 | } 339 | 340 | return 1; 341 | } 342 | #endif // end __linux__ 343 | 344 | struct bench_unit { 345 | bench_unit() : valid(false), cycles(0), cycles_local(0), times(0), times_local(0){} 346 | 347 | bool valid; 348 | float cycles; 349 | float cycles_local; 350 | uint64_t times; 351 | uint64_t times_local; 352 | }; 353 | 354 | uint64_t get_cpu_cycles() { 355 | uint64_t result; 356 | #ifndef _MSC_VER 357 | __asm__ volatile(".byte 15;.byte 49;shlq $32,%%rdx;orq %%rdx,%%rax":"=a" 358 | (result)::"%rdx"); 359 | #else 360 | result = __rdtsc(); 361 | #endif 362 | return result; 363 | } 364 | 365 | #if !defined(__clang__) && !defined(_MSC_VER) 366 | __attribute__((optimize("no-tree-vectorize"))) 367 | #endif 368 | uint64_t intersect_scalar_naive_nosimd(const uint64_t* STORM_RESTRICT data1, const uint64_t* STORM_RESTRICT data2, size_t len) { 369 | uint64_t total = 0; 370 | // for (int i = 0; i < len; ++i) { 371 | // total += STORM_popcount64(data1[i] & data2[i]); 372 | // } 373 | 374 | uint64_t diff; 375 | uint8_t* b8 = (uint8_t*)&diff; 376 | for (size_t i = 0; i < len; ++i) { 377 | // total += STORM_popcount64(data1[i] & data2[i]); 378 | diff = data1[i] & data2[i]; 379 | total += STORM_popcnt_lookup8bit[b8[0]]; 380 | total += STORM_popcnt_lookup8bit[b8[1]]; 381 | total += STORM_popcnt_lookup8bit[b8[2]]; 382 | total += STORM_popcnt_lookup8bit[b8[3]]; 383 | total += STORM_popcnt_lookup8bit[b8[4]]; 384 | total += STORM_popcnt_lookup8bit[b8[5]]; 385 | total += STORM_popcnt_lookup8bit[b8[6]]; 386 | total += STORM_popcnt_lookup8bit[b8[7]]; 387 | } 388 | 389 | return total; 390 | } 391 | 392 | #if !defined(__clang__) && !defined(_MSC_VER) 393 | __attribute__((optimize("no-tree-vectorize"))) 394 | #endif 395 | uint64_t union_scalar_naive_nosimd(const uint64_t* STORM_RESTRICT data1, const uint64_t* STORM_RESTRICT data2, size_t len) { 396 | uint64_t total = 0; 397 | // for (int i = 0; i < len; ++i) { 398 | // total += STORM_popcount64(data1[i] | data2[i]); 399 | // } 400 | 401 | uint64_t diff; 402 | uint8_t* b8 = (uint8_t*)&diff; 403 | for (size_t i = 0; i < len; ++i) { 404 | // total += STORM_popcount64(data1[i] | data2[i]); 405 | diff = data1[i] | data2[i]; 406 | total += STORM_popcnt_lookup8bit[b8[0]]; 407 | total += STORM_popcnt_lookup8bit[b8[1]]; 408 | total += STORM_popcnt_lookup8bit[b8[2]]; 409 | total += STORM_popcnt_lookup8bit[b8[3]]; 410 | total += STORM_popcnt_lookup8bit[b8[4]]; 411 | total += STORM_popcnt_lookup8bit[b8[5]]; 412 | total += STORM_popcnt_lookup8bit[b8[6]]; 413 | total += STORM_popcnt_lookup8bit[b8[7]]; 414 | } 415 | 416 | return total; 417 | } 418 | 419 | #if !defined(__clang__) && !defined(_MSC_VER) 420 | __attribute__((optimize("no-tree-vectorize"))) 421 | #endif 422 | uint64_t diff_scalar_naive_nosimd(const uint64_t* STORM_RESTRICT data1, const uint64_t* STORM_RESTRICT data2, size_t len) { 423 | uint64_t total = 0; 424 | // for (int i = 0; i < len; ++i) { 425 | // total += STORM_popcount64(data1[i] ^ data2[i]); 426 | // } 427 | 428 | uint64_t diff; 429 | uint8_t* b8 = (uint8_t*)&diff; 430 | for (size_t i = 0; i < len; ++i) { 431 | // total += STORM_popcount64(data1[i] ^ data2[i]); 432 | diff = data1[i] ^ data2[i]; 433 | total += STORM_popcnt_lookup8bit[b8[0]]; 434 | total += STORM_popcnt_lookup8bit[b8[1]]; 435 | total += STORM_popcnt_lookup8bit[b8[2]]; 436 | total += STORM_popcnt_lookup8bit[b8[3]]; 437 | total += STORM_popcnt_lookup8bit[b8[4]]; 438 | total += STORM_popcnt_lookup8bit[b8[5]]; 439 | total += STORM_popcnt_lookup8bit[b8[6]]; 440 | total += STORM_popcnt_lookup8bit[b8[7]]; 441 | } 442 | 443 | return total; 444 | } 445 | 446 | // Definition for the high-resolution timer. 447 | typedef std::chrono::high_resolution_clock::time_point clockdef; 448 | 449 | int set_algebra_wrapper(std::string name, 450 | STORM_compute_func f, 451 | int iterations, 452 | uint32_t range, 453 | uint32_t n_values, 454 | size_t n_bitmaps, 455 | bench_unit& unit) 456 | { 457 | uint32_t cycles_low = 0, cycles_high = 0; 458 | uint32_t cycles_low1 = 0, cycles_high1 = 0; 459 | // Start timer. 460 | 461 | std::vector<uint64_t> clocks; 462 | std::vector<uint32_t> times; 463 | 464 | #ifndef _MSC_VER 465 | // Warm up the serialized RDTSC/RDTSCP sequence, per the Intel guide: 466 | // @see: https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/ia-32-ia-64-benchmark-code-execution-paper.pdf 467 | asm volatile ("CPUID\n\t" 468 | "RDTSC\n\t" 469 | "mov %%edx, %0\n\t" 470 | "mov %%eax, %1\n\t": "=r" (cycles_high), "=r" (cycles_low):: "%rax", "%rbx", "%rcx", "%rdx"); 471 | asm volatile("RDTSCP\n\t" 472 | "mov %%edx, %0\n\t" 473 | "mov %%eax, %1\n\t" 474 | "CPUID\n\t": "=r" (cycles_high1), "=r" (cycles_low1):: "%rax", "%rbx", "%rcx", "%rdx"); 475 | asm volatile ("CPUID\n\t" 476 | "RDTSC\n\t" 477 | "mov %%edx, %0\n\t" 478 | "mov %%eax, %1\n\t": "=r" (cycles_high), "=r" (cycles_low):: "%rax", "%rbx", "%rcx", "%rdx"); 479 | asm volatile("RDTSCP\n\t" 480 | "mov %%edx, %0\n\t" 481 | "mov %%eax, %1\n\t" 482 | "CPUID\n\t": "=r" (cycles_high1), "=r" (cycles_low1):: "%rax", "%rbx", "%rcx", "%rdx"); 483 | #endif 484 | uint64_t* mem1 = generate_random_data(n_values); 485 | uint64_t* mem2 = generate_random_data(n_values); 486 | 487 | volatile uint64_t total = 0; // volatile to prevent the compiler from optimizing the work away 488 | clockdef t1 = std::chrono::high_resolution_clock::now(); 489 | 490 | #ifdef __linux__ 491 | // unsigned long flags; 492 | // preempt_disable(); /*we disable preemption on our CPU*/ 493 | // raw_local_irq_save(flags); /*we disable hard interrupts on our CPU*/ 494 | /*at this stage we exclusively own the CPU*/ 495 | #endif 496 | 497 | #ifndef _MSC_VER 498 | asm volatile ("CPUID\n\t" 499 | "RDTSC\n\t" 500 | "mov %%edx, %0\n\t" 501 | "mov %%eax, %1\n\t": "=r" (cycles_high), "=r" (cycles_low):: "%rax", "%rbx", "%rcx", "%rdx"); 502 | #endif 503 | 504 | for (int i = 0; i < iterations; ++i) { 505 | // Call argument subroutine pointer. 506 | total += (*f)(mem1, mem2, n_bitmaps); 507 | } 508 | 509 | #ifndef _MSC_VER 510 | asm volatile("RDTSCP\n\t" 511 | "mov %%edx, %0\n\t" 512 | "mov %%eax, %1\n\t" 513 | "CPUID\n\t": "=r" (cycles_high1), "=r" (cycles_low1):: "%rax", "%rbx", "%rcx", "%rdx"); 514 | #endif 515 | #ifdef __linux__ 516 | // raw_local_irq_restore(flags);/*we enable hard interrupts on our CPU*/ 517 | // preempt_enable();/*we enable preemption*/ 518 | #endif 519 | 520 | clockdef t2 = std::chrono::high_resolution_clock::now(); 521 | auto time_span = std::chrono::duration_cast<std::chrono::nanoseconds>(t2 - t1); 522 | 523 | STORM_aligned_free(mem1); 524 | STORM_aligned_free(mem2); 525 | 526 | uint64_t start = ( ((uint64_t)cycles_high << 32) | cycles_low ); 527 | uint64_t end = ( ((uint64_t)cycles_high1 << 32) | cycles_low1 ); 528 | 529 | double mean_cycles = (end - start) / (double)iterations; 530 | uint32_t mean_time = time_span.count() / (double)iterations; 531 | 532 | std::cout << name << "\t" << n_bitmaps << "\t" << total << "\t" << 533 | mean_cycles << "\t" << 534 | mean_time << "\t" << 535 | mean_cycles / n_bitmaps << "\t" << 536 | ((n_bitmaps*2*sizeof(uint64_t)) / (1024*1024.0)) / (mean_time / 1000000000.0) << std::endl; 537 | 538 | unit.times += mean_time; 539 | unit.times_local = mean_time; 540 | unit.cycles += mean_cycles; 541 | unit.cycles_local = mean_cycles; 542 | unit.valid = 1; 543 | 544 | return 0; 545 | } 546 | 547 | int popcount_wrapper(std::string name, 548 | STORM_popcnt_func f, 549 | int iterations, 550 | uint32_t range, 551 | uint32_t n_values, 552 | uint32_t n_bitmaps, 553 | bench_unit& unit) 554 | { 555 | uint32_t cycles_low = 0, cycles_high = 0; 556 | uint32_t cycles_low1 = 0, cycles_high1 = 0; 557 | // Start timer. 558 | 559 | std::vector<uint64_t> clocks; 560 | std::vector<uint32_t> times; 561 | 562 | #ifndef _MSC_VER 563 | // Warm up the serialized RDTSC/RDTSCP sequence, per the Intel guide: 564 | // @see: https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/ia-32-ia-64-benchmark-code-execution-paper.pdf 565 | asm volatile ("CPUID\n\t" 566 | "RDTSC\n\t" 567 | "mov %%edx, %0\n\t" 568 | "mov %%eax, %1\n\t": "=r" (cycles_high), "=r" (cycles_low):: "%rax", "%rbx", "%rcx", "%rdx"); 569 | asm volatile("RDTSCP\n\t" 570 | "mov %%edx, %0\n\t" 571 | "mov %%eax, %1\n\t" 572 | "CPUID\n\t": "=r" (cycles_high1), "=r" (cycles_low1):: "%rax", "%rbx", "%rcx", "%rdx"); 573 | asm volatile ("CPUID\n\t" 574 | "RDTSC\n\t" 575 | "mov %%edx, %0\n\t" 576 | "mov %%eax, %1\n\t": "=r" (cycles_high), "=r" (cycles_low):: "%rax", "%rbx", "%rcx", "%rdx"); 577 | asm volatile("RDTSCP\n\t" 578 | "mov %%edx, %0\n\t" 579 | "mov %%eax, %1\n\t" 580 | "CPUID\n\t": "=r" (cycles_high1), "=r" (cycles_low1):: "%rax", "%rbx", "%rcx", "%rdx"); 581 | #endif 582 | uint64_t* mem = generate_random_data(n_values); 583 | 584 | volatile uint64_t total = 0; // volatile to prevent the compiler from optimizing the work away 585 | clockdef t1 = std::chrono::high_resolution_clock::now(); 586 | 587 | #ifdef __linux__ 588 | // unsigned long flags; 589 | // preempt_disable(); /*we disable preemption on our CPU*/ 590 | // raw_local_irq_save(flags); /*we disable hard interrupts on our CPU*/ 591 | /*at this stage we exclusively own the CPU*/ 592 | #endif 593 | 594 | #ifndef _MSC_VER 595 | asm volatile ("CPUID\n\t" 596 | "RDTSC\n\t" 597 | "mov %%edx, %0\n\t" 598 | "mov %%eax, %1\n\t": "=r" (cycles_high), "=r" (cycles_low):: "%rax", "%rbx", "%rcx", "%rdx"); 599 | #endif 600 | 601 | size_t n_b = n_bitmaps*8; 602 | for (int i = 0; i < iterations; ++i) { 603 | // Call argument subroutine pointer. 604 | total += (*f)((uint8_t*)mem, n_b); 605 | } 606 | 607 | #ifndef _MSC_VER 608 | asm volatile("RDTSCP\n\t" 609 | "mov %%edx, %0\n\t" 610 | "mov %%eax, %1\n\t" 611 | "CPUID\n\t": "=r" (cycles_high1), "=r" (cycles_low1):: "%rax", "%rbx", "%rcx", "%rdx"); 612 | #endif 613 | #ifdef __linux__ 614 | // raw_local_irq_restore(flags);/*we enable hard interrupts on our CPU*/ 615 | // preempt_enable();/*we enable preemption*/ 616 | #endif 617 | 618 | clockdef t2 = std::chrono::high_resolution_clock::now(); 619 | auto time_span = std::chrono::duration_cast<std::chrono::nanoseconds>(t2 - t1); 620 | 621 | STORM_aligned_free(mem); 622 | 623 | uint64_t start = ( ((uint64_t)cycles_high << 32) | cycles_low ); 624 | uint64_t end = ( ((uint64_t)cycles_high1 << 32) | cycles_low1 ); 625 | 626 | double mean_cycles = (end - start) / (double)iterations; 627 | uint32_t mean_time = time_span.count() / (double)iterations; 628 | 629 | std::cout << name << "\t" << n_bitmaps << "\t" << total << "\t" << 630 | mean_cycles << "\t" << 631 | mean_time << "\t" << 632 | mean_cycles / n_bitmaps << "\t" << 633 | ((n_bitmaps*sizeof(uint64_t)) / (1024*1024.0)) / (mean_time / 1000000000.0) << std::endl; // single input array 634 | 635 | unit.times += mean_time; 636 | unit.times_local = mean_time; 637 | unit.cycles += mean_cycles; 638 | unit.cycles_local = mean_cycles; 639 | unit.valid = 1; 640 | 641 | return 0; 642 | } 643 | 644 | int benchmark(int n_repetitions, bool use_perf = false) { 645 | // Align some bitmaps. 646 | uint64_t* bitmaps = (uint64_t*)STORM_aligned_malloc(STORM_get_alignment(), 1048576*sizeof(uint64_t)); 647 | uint64_t* bitmaps2 = (uint64_t*)STORM_aligned_malloc(STORM_get_alignment(), 1048576*sizeof(uint64_t)); 648 | 649 | std::vector<uint32_t> ranges = {4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768,65536,131072,262144,524288,1048576}; 650 | std::vector<int> reps; 651 | if (n_repetitions <= 0) { 652 | reps = {5000,5000,5000,5000,5000,2500,2500,2500,2500,2500,150,150,150,150,150,150,150,100,100}; // one entry per range 653 | } else { 654 | reps = std::vector<int>(ranges.size(), n_repetitions); 655 | } 656 | 657 | if (use_perf) { 658 | #ifndef __linux__ 659 | std::cerr << "perf counters are only available on Linux systems!" << std::endl; 660 | exit(EXIT_FAILURE); 661 | #endif 662 | printf("Algorithm\tWords\tInstructions/cycle\tCycles/word\tInstructions/word\tMinCycles\tMinInstructions\tMinBranchMiss\tMinCacheRef\tMinCacheMiss\tAvgCycles\tAvgInstructions\tAvgBranchMiss\tAvgCacheRef\tAvgCacheMiss\n"); 663 | } 664 | 665 | 666 | for (size_t i = 0; i < ranges.size(); ++i) { 667 | bench_unit unit_intsec, unit_union, unit_diff; 668 | 669 | if (use_perf) { 670 | #ifdef __linux__ 671 | linux_popcount_wrapper("popcount-naive",&popcount_scalar_naive_nosimd, reps[i], ranges[i], ranges[i], ranges[i], true); 672 | linux_popcount_wrapper("popcount",&STORM_popcnt, reps[i], ranges[i], ranges[i], ranges[i], true); 673 | linux_set_algebra_wrapper("intersect-naive",&intersect_scalar_naive_nosimd, reps[i], ranges[i], ranges[i], ranges[i], true); 674 | linux_set_algebra_wrapper("intersect",STORM_get_intersect_count_func(ranges[i]), reps[i], ranges[i], ranges[i], ranges[i], true); 675 | linux_set_algebra_wrapper("union-naive",&union_scalar_naive_nosimd, reps[i], ranges[i], ranges[i], ranges[i], true); 676 | linux_set_algebra_wrapper("union",STORM_get_union_count_func(ranges[i]), reps[i], ranges[i], ranges[i], ranges[i], true); 677 | linux_set_algebra_wrapper("diff-naive",&diff_scalar_naive_nosimd, reps[i], ranges[i], ranges[i], ranges[i], true); 678 | linux_set_algebra_wrapper("diff",STORM_get_diff_count_func(ranges[i]), reps[i], ranges[i], ranges[i], ranges[i], true); 679 | #else 680 | std::cerr << "perf counters are only available on Linux systems!" << std::endl; 681 | exit(EXIT_FAILURE); 682 | #endif 683 | } else { 684 | popcount_wrapper("popcount-naive",&popcount_scalar_naive_nosimd, reps[i], ranges[i], ranges[i], ranges[i], unit_intsec); 685 | popcount_wrapper("popcount",&STORM_popcnt, reps[i], ranges[i], ranges[i], ranges[i], unit_intsec); 686 | set_algebra_wrapper("intersect-naive",&intersect_scalar_naive_nosimd, reps[i], ranges[i], ranges[i], ranges[i], unit_intsec); 687 | set_algebra_wrapper("intersect",STORM_get_intersect_count_func(ranges[i]), reps[i], ranges[i], ranges[i], ranges[i], unit_intsec); 688 | set_algebra_wrapper("union-naive",&union_scalar_naive_nosimd, reps[i], ranges[i], ranges[i], ranges[i], unit_intsec); 689 | set_algebra_wrapper("union",STORM_get_union_count_func(ranges[i]), reps[i], ranges[i], ranges[i], ranges[i], unit_union); 690 | set_algebra_wrapper("diff-naive",&diff_scalar_naive_nosimd, reps[i], ranges[i], ranges[i], ranges[i], unit_intsec); 691 | set_algebra_wrapper("diff",STORM_get_diff_count_func(ranges[i]), reps[i], ranges[i], ranges[i], ranges[i], unit_diff); 692 | } 693 | } 694 | 695 | // Clean up.
696 | STORM_aligned_free(bitmaps); 697 | STORM_aligned_free(bitmaps2); 698 | 699 | return 1; 700 | } 701 | 702 | int main(int argc, char **argv) { 703 | #if !defined(_MSC_VER) 704 | bool verbose = false; 705 | bool perf_subsystem = false; 706 | int c; 707 | int n_repetitions = -1; 708 | 709 | while ((c = getopt(argc, argv, "vpr:")) != -1) { 710 | switch (c) { 711 | case 'r': 712 | n_repetitions = atoi(optarg); 713 | break; 714 | case 'v': 715 | verbose = true; 716 | break; 717 | case 'p': 718 | perf_subsystem = true; 719 | break; 720 | default: 721 | abort(); 722 | } 723 | } 724 | 725 | benchmark(n_repetitions, perf_subsystem); 726 | #else 727 | int n_repetitions = -1; 728 | if (argc > 1) { 729 | n_repetitions = std::atoi(argv[1]); 730 | } 731 | benchmark(n_repetitions, false); 732 | #endif 733 | 734 | return EXIT_SUCCESS; 735 | } -------------------------------------------------------------------------------- /libalgebra.h: -------------------------------------------------------------------------------- 1 | // License for libalgebra.h 2 | /* 3 | * Copyright (c) 2019 Marcus D. R. Klarqvist 4 | * Author(s): Marcus D. R. Klarqvist 5 | * 6 | * Licensed under the Apache License, Version 2.0 (the "License"); 7 | * you may not use this file except in compliance with the License. 8 | * You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, 13 | * software distributed under the License is distributed on an 14 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | * KIND, either express or implied. See the License for the 16 | * specific language governing permissions and limitations 17 | * under the License. 18 | */ 19 | // License for pospopcnt.h 20 | /* 21 | * Copyright (c) 2019 22 | * Author(s): Marcus D. R. Klarqvist, Wojciech Muła, and Daniel Lemire 23 | * 24 | * Licensed under the Apache License, Version 2.0 (the "License"); 25 | * you may not use this file except in compliance with the License. 26 | * You may obtain a copy of the License at 27 | * 28 | * http://www.apache.org/licenses/LICENSE-2.0 29 | * 30 | * Unless required by applicable law or agreed to in writing, 31 | * software distributed under the License is distributed on an 32 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 33 | * KIND, either express or implied. See the License for the 34 | * specific language governing permissions and limitations 35 | * under the License. 36 | */ 37 | // License for libpopcnt.h 38 | /* 39 | * libpopcnt.h - C/C++ library for counting the number of 1 bits (bit 40 | * population count) in an array as quickly as possible using 41 | * specialized CPU instructions i.e. POPCNT, AVX2, AVX512, NEON. 42 | * 43 | * Copyright (c) 2016 - 2018, Kim Walisch 44 | * Copyright (c) 2016 - 2018, Wojciech Muła 45 | * 46 | * All rights reserved. 47 | * 48 | * Redistribution and use in source and binary forms, with or without 49 | * modification, are permitted provided that the following conditions are met: 50 | * 51 | * 1. Redistributions of source code must retain the above copyright notice, this 52 | * list of conditions and the following disclaimer. 53 | * 2. Redistributions in binary form must reproduce the above copyright notice, 54 | * this list of conditions and the following disclaimer in the documentation 55 | * and/or other materials provided with the distribution.
56 | * 57 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 58 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 59 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 60 | * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 61 | * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 62 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 63 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 64 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 65 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 66 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 67 | */ 68 | #ifndef LIBALGEBRA_H_8723467365934 69 | #define LIBALGEBRA_H_8723467365934 70 | 71 | /* ************************************* 72 | * Includes 73 | ***************************************/ 74 | #include 75 | #include 76 | #include 77 | #include 78 | #include 79 | 80 | /* ************************************* 81 | * Safety 82 | ***************************************/ 83 | 84 | #if !(defined(__APPLE__)) && !(defined(__FreeBSD__)) 85 | #include // this should never be needed but there are some reports that it is needed. 86 | #endif 87 | 88 | #if defined(__SIZEOF_LONG_LONG__) && __SIZEOF_LONG_LONG__ != 8 89 | #error This code assumes 64-bit long longs (by use of the GCC intrinsics). Your system is not currently supported. 90 | #endif 91 | 92 | /**************************** 93 | * Memory management 94 | * 95 | * The subroutines aligned_malloc and aligned_free had to be renamed to 96 | * STORM_aligned_malloc and STORM_aligned_free to prevent clashing with the 97 | * same subroutines in Roaring. These subroutines are included here 98 | * since there is no hard dependency on using Roaring bitmaps. 99 | * 100 | * These subroutines and definitions are taken from the CRoaring repo 101 | * by Daniel Lemire et al. available under the Apache 2.0 License 102 | * (same as libalgebra.h): 103 | * https://github.com/RoaringBitmap/CRoaring/ 104 | ****************************/ 105 | // portable version of posix_memalign 106 | #ifndef _MSC_VER 107 | #include 108 | #endif 109 | 110 | #ifndef STORM_aligned_malloc 111 | static 112 | void* STORM_aligned_malloc(size_t alignment, size_t size) { 113 | void *p; 114 | #ifdef _MSC_VER 115 | p = _aligned_malloc(size, alignment); 116 | #elif defined(__MINGW32__) || defined(__MINGW64__) 117 | p = __mingw_aligned_malloc(size, alignment); 118 | #else 119 | // somehow, if this is used before including "x86intrin.h", it creates an 120 | // implicit defined warning. 
121 |     if (posix_memalign(&p, alignment, size) != 0)
122 |         return NULL;
123 | #endif
124 |     return p;
125 | }
126 | #endif
127 | 
128 | #ifndef STORM_aligned_free
129 | static
130 | void STORM_aligned_free(void* memblock) {
131 | #ifdef _MSC_VER
132 |     _aligned_free(memblock);
133 | #elif defined(__MINGW32__) || defined(__MINGW64__)
134 |     __mingw_aligned_free(memblock);
135 | #else
136 |     free(memblock);
137 | #endif
138 | }
139 | #endif
140 | 
141 | // portable alignment
142 | #if defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L)   /* C11+ */
143 | #  include <stdalign.h>
144 | #  define STORM_ALIGN(n)      alignas(n)
145 | #elif defined(__GNUC__)
146 | #  define STORM_ALIGN(n)      __attribute__ ((aligned(n)))
147 | #elif defined(_MSC_VER)
148 | #  define STORM_ALIGN(n)      __declspec(align(n))
149 | #else
150 | #  define STORM_ALIGN(n)   /* disabled */
151 | #endif
152 | 
153 | /* *************************************
154 | *  Compiler Specific Options
155 | ***************************************/
156 | // Taken from XXHASH
157 | #ifdef _MSC_VER    /* Visual Studio */
158 | #  pragma warning(disable : 4127) /* disable: C4127: conditional expression is constant */
159 | #  define STORM_FORCE_INLINE static __forceinline
160 | #  define STORM_NO_INLINE static __declspec(noinline)
161 | #else
162 | #  if defined (__cplusplus) || defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L   /* C99 */
163 | #    ifdef __GNUC__
164 | #      define STORM_FORCE_INLINE static inline __attribute__((always_inline))
165 | #      define STORM_NO_INLINE static __attribute__((noinline))
166 | #    else
167 | #      define STORM_FORCE_INLINE static inline
168 | #      define STORM_NO_INLINE static
169 | #    endif
170 | #  else
171 | #    define STORM_FORCE_INLINE static
172 | #    define STORM_NO_INLINE static
173 | #  endif /* __STDC_VERSION__ */
174 | #endif
175 | 
176 | /****************************
177 | *  General checks
178 | ****************************/
179 | 
180 | #ifndef __has_builtin
181 | #define STORM_HAS_BUILTIN(x) 0
182 | #else
183 | #define STORM_HAS_BUILTIN(x) __has_builtin(x)
184 | #endif
185 | 
186 | #ifndef __has_attribute
187 | #define STORM_HAS_ATTRIBUTE(x) 0
188 | #else
189 | #define STORM_HAS_ATTRIBUTE(x) __has_attribute(x)
190 | #endif
191 | 
192 | // disable noise
193 | #ifdef __GNUC__
194 | #define STORM_WARN_UNUSED __attribute__((warn_unused_result))
195 | #else
196 | #define STORM_WARN_UNUSED
197 | #endif
198 | 
199 | #if defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L   /* >= C99 */
200 | #  define STORM_RESTRICT   restrict
201 | #else
202 | /* note : it might be useful to define __restrict or STORM_RESTRICT for some C++ compilers */
203 | #  define STORM_RESTRICT   /* disable */
204 | #endif
205 | 
206 | #ifdef __GNUC__
207 |   #define GNUC_PREREQ(x, y) \
208 |       (__GNUC__ > x || (__GNUC__ == x && __GNUC_MINOR__ >= y))
209 | #else
210 |   #define GNUC_PREREQ(x, y) 0
211 | #endif
212 | 
213 | #ifdef __clang__
214 |   #define CLANG_PREREQ(x, y) \
215 |       (__clang_major__ > x || (__clang_major__ == x && __clang_minor__ >= y))
216 | #else
217 |   #define CLANG_PREREQ(x, y) 0
218 | #endif
219 | 
220 | #if (defined(__i386__) || \
221 |      defined(__x86_64__) || \
222 |      defined(_M_IX86) || \
223 |      defined(_M_X64))
224 |   #define X86_OR_X64
225 | #endif
226 | 
227 | #if defined(X86_OR_X64) && \
228 |    (defined(__cplusplus) || \
229 |     defined(_MSC_VER) || \
230 |    (GNUC_PREREQ(4, 2) || \
231 |     STORM_HAS_BUILTIN(__sync_val_compare_and_swap)))
232 |   #define STORM_HAVE_CPUID
233 | #endif
234 | 
235 | #if GNUC_PREREQ(4, 2) || \
236 |     STORM_HAS_BUILTIN(__builtin_popcount)
237 |   #define STORM_HAVE_BUILTIN_POPCOUNT
238 | #endif
239 | 
240 | #if GNUC_PREREQ(4, 2) || \
241 |     CLANG_PREREQ(3, 0)
242 |   #define STORM_HAVE_ASM_POPCNT
243 | #endif
244 | 
245 | #if defined(STORM_HAVE_CPUID) && \
246 |    (defined(STORM_HAVE_ASM_POPCNT) || \
247 |     defined(_MSC_VER))
248 |   #define STORM_HAVE_POPCNT
249 | #endif
250 | 
251 | #if defined(STORM_HAVE_CPUID) && \
252 |     GNUC_PREREQ(4, 9)
253 |   #define STORM_HAVE_SSE42
254 |   #define STORM_HAVE_AVX2
255 | #endif
256 | 
257 | #if defined(STORM_HAVE_CPUID) && \
258 |     GNUC_PREREQ(5, 0)
259 |   #define STORM_HAVE_AVX512
260 | #endif
261 | 
262 | #if defined(STORM_HAVE_CPUID) && \
263 |     defined(_MSC_VER) && \
264 |     defined(__AVX2__)
265 |   #define STORM_HAVE_SSE42
266 |   #define STORM_HAVE_AVX2
267 | #endif
268 | 
269 | #if defined(STORM_HAVE_CPUID) && \
270 |     defined(_MSC_VER) && \
271 |     defined(__AVX512__)
272 |   #define STORM_HAVE_AVX512
273 | #endif
274 | 
275 | #if defined(STORM_HAVE_CPUID) && \
276 |     CLANG_PREREQ(3, 8) && \
277 |     STORM_HAS_ATTRIBUTE(target) && \
278 |    (!defined(_MSC_VER) || defined(__AVX2__)) && \
279 |    (!defined(__apple_build_version__) || __apple_build_version__ >= 8000000)
280 |   #define STORM_HAVE_SSE42
281 |   #define STORM_HAVE_AVX2
282 |   #define STORM_HAVE_AVX512
283 | #endif
284 | 
285 | // Target attribute
286 | #if !defined(_MSC_VER)
287 | #define STORM_TARGET(x) __attribute__ ((target (x)))
288 | #else
289 | #define STORM_TARGET(x) /* expands to nothing: MSVC has no per-function target attribute */
290 | #endif
291 | 
292 | 
293 | /****************************
294 | *  CPUID and SIMD
295 | ****************************/
296 | 
297 | #define STORM_SSE_ALIGNMENT    16
298 | #define STORM_AVX2_ALIGNMENT   32
299 | #define STORM_AVX512_ALIGNMENT 64
300 | 
301 | #ifdef __cplusplus
302 | extern "C" {
303 | #endif
304 | 
305 | #if defined(STORM_HAVE_CPUID)
306 | 
307 | #if defined(_MSC_VER)
308 |   #include <intrin.h>
309 |   #include <immintrin.h>
310 | #endif
311 | 
312 | // CPUID flags. See https://en.wikipedia.org/wiki/CPUID for more info.
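// Minimal usage sketch (illustrative only, not part of the library): probe the
// CPU once at startup and branch on the flag bits defined below, e.g.
//
//     int cpu = STORM_get_cpuid();
//     if (cpu & STORM_CPUID_runtime_bit_AVX2)       { /* AVX2 kernels are safe */ }
//     else if (cpu & STORM_CPUID_runtime_bit_SSE42) { /* fall back to SSE4.2  */ }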
313 | /* %ecx bit flags */
314 | #define STORM_CPUID_runtime_bit_POPCNT   (1 << 23) // POPCNT instruction
315 | #define STORM_CPUID_runtime_bit_SSE41    (1 << 19) // CPUID.01H:ECX.SSE41[Bit 19]
316 | #define STORM_CPUID_runtime_bit_SSE42    (1 << 20) // CPUID.01H:ECX.SSE42[Bit 20]
317 | 
318 | /* %ebx bit flags */
319 | #define STORM_CPUID_runtime_bit_AVX2     (1 << 5)  // CPUID.(EAX=07H, ECX=0H):EBX.AVX2[bit 5]
320 | #define STORM_CPUID_runtime_bit_AVX512BW (1 << 30) // AVX-512 Byte and Word Instructions
321 | 
322 | /* xgetbv bit flags */
323 | #define STORM_XSTATE_SSE (1 << 1)
324 | #define STORM_XSTATE_YMM (1 << 2)
325 | #define STORM_XSTATE_ZMM (7 << 5)
326 | 
327 | static
328 | void STORM_run_cpuid(int eax, int ecx, int* abcd) {
329 | #if defined(_MSC_VER)
330 |     __cpuidex(abcd, eax, ecx);
331 | #else
332 |     int ebx = 0;
333 |     int edx = 0;
334 | 
335 | #if defined(__i386__) && \
336 |     defined(__PIC__)
337 |     /* in case of PIC under 32-bit EBX cannot be clobbered */
338 |     __asm__ ("movl %%ebx, %%edi;"
339 |              "cpuid;"
340 |              "xchgl %%ebx, %%edi;"
341 |              : "=D" (ebx),
342 |                "+a" (eax),
343 |                "+c" (ecx),
344 |                "=d" (edx));
345 | #else
346 |     __asm__ ("cpuid;"
347 |              : "+b" (ebx),
348 |                "+a" (eax),
349 |                "+c" (ecx),
350 |                "=d" (edx));
351 | #endif
352 | 
353 |     abcd[0] = eax;
354 |     abcd[1] = ebx;
355 |     abcd[2] = ecx;
356 |     abcd[3] = edx;
357 | #endif
358 | }
359 | 
360 | #if defined(STORM_HAVE_AVX2) || \
361 |     defined(STORM_HAVE_AVX512)
362 | 
363 | static
364 | int STORM_get_xcr0() {
365 |     int xcr0;
366 | 
367 | #if defined(_MSC_VER)
368 |     xcr0 = (int) _xgetbv(0);
369 | #else
370 |     __asm__ ("xgetbv" : "=a" (xcr0) : "c" (0) : "%edx" );
371 | #endif
372 | 
373 |     return xcr0;
374 | }
375 | 
376 | #endif
377 | 
378 | static
379 | int STORM_get_cpuid() {
380 |     int flags = 0;
381 |     int abcd[4];
382 | 
383 |     STORM_run_cpuid(1, 0, abcd);
384 | 
385 |     // Check for POPCNT instruction
386 |     if ((abcd[2] & STORM_CPUID_runtime_bit_POPCNT) == STORM_CPUID_runtime_bit_POPCNT)
387 |         flags |= STORM_CPUID_runtime_bit_POPCNT;
388 | 
389 |     // Check for SSE4.1 instruction set
390 |     if ((abcd[2] & STORM_CPUID_runtime_bit_SSE41) == STORM_CPUID_runtime_bit_SSE41)
391 |         flags |= STORM_CPUID_runtime_bit_SSE41;
392 | 
393 |     // Check for SSE4.2 instruction set
394 |     if ((abcd[2] & STORM_CPUID_runtime_bit_SSE42) == STORM_CPUID_runtime_bit_SSE42)
395 |         flags |= STORM_CPUID_runtime_bit_SSE42;
396 | 
397 | #if defined(STORM_HAVE_AVX2) || \
398 |     defined(STORM_HAVE_AVX512)
399 | 
400 |     int osxsave_mask = (1 << 27);
401 | 
402 |     /* ensure OS supports extended processor state management */
403 |     if ((abcd[2] & osxsave_mask) != osxsave_mask)
404 |         return 0;
405 | 
406 |     int ymm_mask = STORM_XSTATE_SSE | STORM_XSTATE_YMM;
407 |     int zmm_mask = STORM_XSTATE_SSE | STORM_XSTATE_YMM | STORM_XSTATE_ZMM;
408 | 
409 |     int xcr0 = STORM_get_xcr0();
410 | 
411 |     if ((xcr0 & ymm_mask) == ymm_mask) {
412 |         STORM_run_cpuid(7, 0, abcd);
413 | 
414 |         if ((abcd[1] & STORM_CPUID_runtime_bit_AVX2) == STORM_CPUID_runtime_bit_AVX2)
415 |             flags |= STORM_CPUID_runtime_bit_AVX2;
416 | 
417 |         if ((xcr0 & zmm_mask) == zmm_mask) {
418 |             if ((abcd[1] & STORM_CPUID_runtime_bit_AVX512BW) == STORM_CPUID_runtime_bit_AVX512BW)
419 |                 flags |= STORM_CPUID_runtime_bit_AVX512BW;
420 |         }
421 |     }
422 | 
423 | #endif
424 | 
425 |     return flags;
426 | }
427 | #endif // defined(STORM_HAVE_CPUID)
428 | 
429 | /// Taken from libpopcnt.h
430 | #if defined(STORM_HAVE_ASM_POPCNT) && \
431 |     defined(__x86_64__)
432 | 
433 | STORM_FORCE_INLINE
434 | uint64_t STORM_POPCOUNT(uint64_t x)
435 | {
436 |     __asm__ ("popcnt %1, %0" : "=r" (x) : "0" (x));
%0" : "=r" (x) : "0" (x)); 437 | return x; 438 | } 439 | 440 | #elif defined(STORM_HAVE_ASM_POPCNT) && \ 441 | defined(__i386__) 442 | 443 | STORM_FORCE_INLINE 444 | uint32_t STORM_popcnt32(uint32_t x) 445 | { 446 | __asm__ ("popcnt %1, %0" : "=r" (x) : "0" (x)); 447 | return x; 448 | } 449 | 450 | STORM_FORCE_INLINE 451 | uint64_t STORM_POPCOUNT(uint64_t x) 452 | { 453 | return STORM_popcnt32((uint32_t) x) + 454 | STORM_popcnt32((uint32_t)(x >> 32)); 455 | } 456 | 457 | #elif defined(_MSC_VER) && \ 458 | defined(_M_X64) 459 | 460 | #include 461 | 462 | STORM_FORCE_INLINE 463 | uint64_t STORM_POPCOUNT(uint64_t x) { 464 | return _mm_popcnt_u64(x); 465 | } 466 | 467 | #elif defined(_MSC_VER) && \ 468 | defined(_M_IX86) 469 | 470 | #include 471 | 472 | STORM_FORCE_INLINE 473 | uint64_t STORM_POPCOUNT(uint64_t x) 474 | { 475 | return _mm_popcnt_u32((uint32_t) x) + 476 | _mm_popcnt_u32((uint32_t)(x >> 32)); 477 | } 478 | 479 | /* non x86 CPUs */ 480 | #elif defined(STORM_HAVE_BUILTIN_POPCOUNT) 481 | 482 | STORM_FORCE_INLINE 483 | uint64_t STORM_POPCOUNT(uint64_t x) { 484 | return __builtin_popcountll(x); 485 | } 486 | 487 | /* no hardware POPCNT, 488 | * use pure integer algorithm */ 489 | #else 490 | 491 | STORM_FORCE_INLINE 492 | uint64_t STORM_POPCOUNT(uint64_t x) { 493 | return STORM_popcount64(x); 494 | } 495 | 496 | #endif 497 | 498 | 499 | static 500 | uint64_t STORM_intersect_count_unrolled(const uint64_t* STORM_RESTRICT data1, 501 | const uint64_t* STORM_RESTRICT data2, 502 | size_t size) 503 | { 504 | const uint64_t limit = size - size % 4; 505 | uint64_t cnt = 0; 506 | uint64_t i = 0; 507 | 508 | for (/**/; i < limit; i += 4) { 509 | cnt += STORM_POPCOUNT(data1[i+0] & data2[i+0]); 510 | cnt += STORM_POPCOUNT(data1[i+1] & data2[i+1]); 511 | cnt += STORM_POPCOUNT(data1[i+2] & data2[i+2]); 512 | cnt += STORM_POPCOUNT(data1[i+3] & data2[i+3]); 513 | } 514 | 515 | for (/**/; i < size; ++i) 516 | cnt += STORM_POPCOUNT(data1[i] & data2[i]); 517 | 518 | return cnt; 519 | } 520 | 521 | static 522 | uint64_t STORM_union_count_unrolled(const uint64_t* STORM_RESTRICT data1, 523 | const uint64_t* STORM_RESTRICT data2, 524 | size_t size) 525 | { 526 | const uint64_t limit = size - size % 4; 527 | uint64_t cnt = 0; 528 | uint64_t i = 0; 529 | 530 | for (/**/; i < limit; i += 4) { 531 | cnt += STORM_POPCOUNT(data1[i+0] | data2[i+0]); 532 | cnt += STORM_POPCOUNT(data1[i+1] | data2[i+1]); 533 | cnt += STORM_POPCOUNT(data1[i+2] | data2[i+2]); 534 | cnt += STORM_POPCOUNT(data1[i+3] | data2[i+3]); 535 | } 536 | 537 | for (/**/; i < size; ++i) 538 | cnt += STORM_POPCOUNT(data1[i] | data2[i]); 539 | 540 | return cnt; 541 | } 542 | 543 | static 544 | uint64_t STORM_diff_count_unrolled(const uint64_t* STORM_RESTRICT data1, 545 | const uint64_t* STORM_RESTRICT data2, 546 | size_t size) 547 | { 548 | const uint64_t limit = size - size % 4; 549 | uint64_t cnt = 0; 550 | uint64_t i = 0; 551 | 552 | for (/**/; i < limit; i += 4) { 553 | cnt += STORM_POPCOUNT(data1[i+0] ^ data2[i+0]); 554 | cnt += STORM_POPCOUNT(data1[i+1] ^ data2[i+1]); 555 | cnt += STORM_POPCOUNT(data1[i+2] ^ data2[i+2]); 556 | cnt += STORM_POPCOUNT(data1[i+3] ^ data2[i+3]); 557 | } 558 | 559 | for (/**/; i < size; ++i) 560 | cnt += STORM_POPCOUNT(data1[i] ^ data2[i]); 561 | 562 | return cnt; 563 | } 564 | 565 | static 566 | int STORM_pospopcnt_u16_scalar_naive(const uint16_t* data, size_t len, uint32_t* out) { 567 | for (int i = 0; i < len; ++i) { 568 | for (int j = 0; j < 16; ++j) { 569 | out[j] += ((data[i] & (1 << j)) >> j); 570 | } 571 | } 
572 | 573 | return 0; 574 | } 575 | 576 | #ifndef _MSC_VER 577 | 578 | STORM_FORCE_INLINE 579 | uint64_t STORM_pospopcnt_umul128(uint64_t a, uint64_t b, uint64_t* hi) { 580 | unsigned __int128 x = (unsigned __int128)a * (unsigned __int128)b; 581 | *hi = (uint64_t)(x >> 64); 582 | return (uint64_t)x; 583 | } 584 | 585 | STORM_FORCE_INLINE 586 | uint64_t STORM_pospopcnt_loadu_u64(const void* ptr) { 587 | uint64_t data; 588 | memcpy(&data, ptr, sizeof(data)); 589 | return data; 590 | } 591 | 592 | // By @aqrit (https://github.com/aqrit) 593 | // @see: https://gist.github.com/aqrit/c729815b0165c139d0bac642ab7ee104 594 | static 595 | int STORM_pospopcnt_u16_scalar_umul128_unroll2(const uint16_t* in, size_t n, uint32_t* out) { 596 | while (n >= 8) { 597 | uint64_t counter_a = 0; // 4 packed 12-bit counters 598 | uint64_t counter_b = 0; 599 | uint64_t counter_c = 0; 600 | uint64_t counter_d = 0; 601 | 602 | // end before overflowing the counters 603 | uint32_t len = ((n < 0x0FFF) ? n : 0x0FFF) & ~7; 604 | n -= len; 605 | for (const uint16_t* end = &in[len]; in != end; in += 8) { 606 | const uint64_t mask_a = UINT64_C(0x1111111111111111); 607 | const uint64_t mask_b = mask_a + mask_a; 608 | const uint64_t mask_c = mask_b + mask_b; 609 | const uint64_t mask_0001 = UINT64_C(0x0001000100010001); 610 | const uint64_t mask_cnts = UINT64_C(0x000000F00F00F00F); 611 | 612 | uint64_t v0 = STORM_pospopcnt_loadu_u64(&in[0]); 613 | uint64_t v1 = STORM_pospopcnt_loadu_u64(&in[4]); 614 | 615 | uint64_t a = (v0 & mask_a) + (v1 & mask_a); 616 | uint64_t b = ((v0 & mask_b) + (v1 & mask_b)) >> 1; 617 | uint64_t c = ((v0 & mask_c) + (v1 & mask_c)) >> 2; 618 | uint64_t d = ((v0 >> 3) & mask_a) + ((v1 >> 3) & mask_a); 619 | 620 | uint64_t hi; 621 | a = STORM_pospopcnt_umul128(a, mask_0001, &hi); 622 | a += hi; // broadcast 4-bit counts 623 | b = STORM_pospopcnt_umul128(b, mask_0001, &hi); 624 | b += hi; 625 | c = STORM_pospopcnt_umul128(c, mask_0001, &hi); 626 | c += hi; 627 | d = STORM_pospopcnt_umul128(d, mask_0001, &hi); 628 | d += hi; 629 | 630 | counter_a += a & mask_cnts; 631 | counter_b += b & mask_cnts; 632 | counter_c += c & mask_cnts; 633 | counter_d += d & mask_cnts; 634 | } 635 | 636 | out[0] += counter_a & 0x0FFF; 637 | out[1] += counter_b & 0x0FFF; 638 | out[2] += counter_c & 0x0FFF; 639 | out[3] += counter_d & 0x0FFF; 640 | out[4] += (counter_a >> 36); 641 | out[5] += (counter_b >> 36); 642 | out[6] += (counter_c >> 36); 643 | out[7] += (counter_d >> 36); 644 | out[8] += (counter_a >> 24) & 0x0FFF; 645 | out[9] += (counter_b >> 24) & 0x0FFF; 646 | out[10] += (counter_c >> 24) & 0x0FFF; 647 | out[11] += (counter_d >> 24) & 0x0FFF; 648 | out[12] += (counter_a >> 12) & 0x0FFF; 649 | out[13] += (counter_b >> 12) & 0x0FFF; 650 | out[14] += (counter_c >> 12) & 0x0FFF; 651 | out[15] += (counter_d >> 12) & 0x0FFF; 652 | } 653 | 654 | // assert(n < 8) 655 | if (n != 0) { 656 | uint64_t tail_counter_a = 0; 657 | uint64_t tail_counter_b = 0; 658 | do { // zero-extend a bit to 8-bits (emulate pdep) then accumulate 659 | const uint64_t mask_01 = UINT64_C(0x0101010101010101); 660 | const uint64_t magic = UINT64_C(0x0000040010004001); // 1+(1<<14)+(1<<28)+(1<<42) 661 | uint64_t x = *in++; 662 | tail_counter_a += ((x & 0x5555) * magic) & mask_01; // 0101010101010101 663 | tail_counter_b += (((x >> 1) & 0x5555) * magic) & mask_01; 664 | } while (--n); 665 | 666 | out[0] += tail_counter_a & 0xFF; 667 | out[8] += (tail_counter_a >> 8) & 0xFF; 668 | out[2] += (tail_counter_a >> 16) & 0xFF; 669 | out[10] += (tail_counter_a 
>> 24) & 0xFF; 670 | out[4] += (tail_counter_a >> 32) & 0xFF; 671 | out[12] += (tail_counter_a >> 40) & 0xFF; 672 | out[6] += (tail_counter_a >> 48) & 0xFF; 673 | out[14] += (tail_counter_a >> 56) & 0xFF; 674 | out[1] += tail_counter_b & 0xFF; 675 | out[9] += (tail_counter_b >> 8) & 0xFF; 676 | out[3] += (tail_counter_b >> 16) & 0xFF; 677 | out[11] += (tail_counter_b >> 24) & 0xFF; 678 | out[5] += (tail_counter_b >> 32) & 0xFF; 679 | out[13] += (tail_counter_b >> 40) & 0xFF; 680 | out[7] += (tail_counter_b >> 48) & 0xFF; 681 | out[15] += (tail_counter_b >> 56) & 0xFF; 682 | } 683 | 684 | return 0; 685 | } 686 | #endif 687 | 688 | /* 689 | * This uses fewer arithmetic operations than any other known 690 | * implementation on machines with fast multiplication. 691 | * It uses 12 arithmetic operations, one of which is a multiply. 692 | * http://en.wikipedia.org/wiki/Hamming_weight#Efficient_implementation 693 | */ 694 | STORM_FORCE_INLINE 695 | uint64_t STORM_popcount64(uint64_t x) 696 | { 697 | uint64_t m1 = UINT64_C(0x5555555555555555); 698 | uint64_t m2 = UINT64_C(0x3333333333333333); 699 | uint64_t m4 = UINT64_C(0x0F0F0F0F0F0F0F0F); 700 | uint64_t h01 = UINT64_C(0x0101010101010101); 701 | 702 | x -= (x >> 1) & m1; 703 | x = (x & m2) + ((x >> 2) & m2); 704 | x = (x + (x >> 4)) & m4; 705 | 706 | return (x * h01) >> 56; 707 | } 708 | 709 | 710 | static 711 | const uint8_t STORM_popcnt_lookup8bit[256] = { 712 | /* 0 */ 0, /* 1 */ 1, /* 2 */ 1, /* 3 */ 2, 713 | /* 4 */ 1, /* 5 */ 2, /* 6 */ 2, /* 7 */ 3, 714 | /* 8 */ 1, /* 9 */ 2, /* a */ 2, /* b */ 3, 715 | /* c */ 2, /* d */ 3, /* e */ 3, /* f */ 4, 716 | /* 10 */ 1, /* 11 */ 2, /* 12 */ 2, /* 13 */ 3, 717 | /* 14 */ 2, /* 15 */ 3, /* 16 */ 3, /* 17 */ 4, 718 | /* 18 */ 2, /* 19 */ 3, /* 1a */ 3, /* 1b */ 4, 719 | /* 1c */ 3, /* 1d */ 4, /* 1e */ 4, /* 1f */ 5, 720 | /* 20 */ 1, /* 21 */ 2, /* 22 */ 2, /* 23 */ 3, 721 | /* 24 */ 2, /* 25 */ 3, /* 26 */ 3, /* 27 */ 4, 722 | /* 28 */ 2, /* 29 */ 3, /* 2a */ 3, /* 2b */ 4, 723 | /* 2c */ 3, /* 2d */ 4, /* 2e */ 4, /* 2f */ 5, 724 | /* 30 */ 2, /* 31 */ 3, /* 32 */ 3, /* 33 */ 4, 725 | /* 34 */ 3, /* 35 */ 4, /* 36 */ 4, /* 37 */ 5, 726 | /* 38 */ 3, /* 39 */ 4, /* 3a */ 4, /* 3b */ 5, 727 | /* 3c */ 4, /* 3d */ 5, /* 3e */ 5, /* 3f */ 6, 728 | /* 40 */ 1, /* 41 */ 2, /* 42 */ 2, /* 43 */ 3, 729 | /* 44 */ 2, /* 45 */ 3, /* 46 */ 3, /* 47 */ 4, 730 | /* 48 */ 2, /* 49 */ 3, /* 4a */ 3, /* 4b */ 4, 731 | /* 4c */ 3, /* 4d */ 4, /* 4e */ 4, /* 4f */ 5, 732 | /* 50 */ 2, /* 51 */ 3, /* 52 */ 3, /* 53 */ 4, 733 | /* 54 */ 3, /* 55 */ 4, /* 56 */ 4, /* 57 */ 5, 734 | /* 58 */ 3, /* 59 */ 4, /* 5a */ 4, /* 5b */ 5, 735 | /* 5c */ 4, /* 5d */ 5, /* 5e */ 5, /* 5f */ 6, 736 | /* 60 */ 2, /* 61 */ 3, /* 62 */ 3, /* 63 */ 4, 737 | /* 64 */ 3, /* 65 */ 4, /* 66 */ 4, /* 67 */ 5, 738 | /* 68 */ 3, /* 69 */ 4, /* 6a */ 4, /* 6b */ 5, 739 | /* 6c */ 4, /* 6d */ 5, /* 6e */ 5, /* 6f */ 6, 740 | /* 70 */ 3, /* 71 */ 4, /* 72 */ 4, /* 73 */ 5, 741 | /* 74 */ 4, /* 75 */ 5, /* 76 */ 5, /* 77 */ 6, 742 | /* 78 */ 4, /* 79 */ 5, /* 7a */ 5, /* 7b */ 6, 743 | /* 7c */ 5, /* 7d */ 6, /* 7e */ 6, /* 7f */ 7, 744 | /* 80 */ 1, /* 81 */ 2, /* 82 */ 2, /* 83 */ 3, 745 | /* 84 */ 2, /* 85 */ 3, /* 86 */ 3, /* 87 */ 4, 746 | /* 88 */ 2, /* 89 */ 3, /* 8a */ 3, /* 8b */ 4, 747 | /* 8c */ 3, /* 8d */ 4, /* 8e */ 4, /* 8f */ 5, 748 | /* 90 */ 2, /* 91 */ 3, /* 92 */ 3, /* 93 */ 4, 749 | /* 94 */ 3, /* 95 */ 4, /* 96 */ 4, /* 97 */ 5, 750 | /* 98 */ 3, /* 99 */ 4, /* 9a */ 4, /* 9b */ 5, 751 | /* 9c */ 4, /* 9d */ 5, /* 9e */ 
5, /* 9f */ 6,
752 |     /* a0 */ 2, /* a1 */ 3, /* a2 */ 3, /* a3 */ 4,
753 |     /* a4 */ 3, /* a5 */ 4, /* a6 */ 4, /* a7 */ 5,
754 |     /* a8 */ 3, /* a9 */ 4, /* aa */ 4, /* ab */ 5,
755 |     /* ac */ 4, /* ad */ 5, /* ae */ 5, /* af */ 6,
756 |     /* b0 */ 3, /* b1 */ 4, /* b2 */ 4, /* b3 */ 5,
757 |     /* b4 */ 4, /* b5 */ 5, /* b6 */ 5, /* b7 */ 6,
758 |     /* b8 */ 4, /* b9 */ 5, /* ba */ 5, /* bb */ 6,
759 |     /* bc */ 5, /* bd */ 6, /* be */ 6, /* bf */ 7,
760 |     /* c0 */ 2, /* c1 */ 3, /* c2 */ 3, /* c3 */ 4,
761 |     /* c4 */ 3, /* c5 */ 4, /* c6 */ 4, /* c7 */ 5,
762 |     /* c8 */ 3, /* c9 */ 4, /* ca */ 4, /* cb */ 5,
763 |     /* cc */ 4, /* cd */ 5, /* ce */ 5, /* cf */ 6,
764 |     /* d0 */ 3, /* d1 */ 4, /* d2 */ 4, /* d3 */ 5,
765 |     /* d4 */ 4, /* d5 */ 5, /* d6 */ 5, /* d7 */ 6,
766 |     /* d8 */ 4, /* d9 */ 5, /* da */ 5, /* db */ 6,
767 |     /* dc */ 5, /* dd */ 6, /* de */ 6, /* df */ 7,
768 |     /* e0 */ 3, /* e1 */ 4, /* e2 */ 4, /* e3 */ 5,
769 |     /* e4 */ 4, /* e5 */ 5, /* e6 */ 5, /* e7 */ 6,
770 |     /* e8 */ 4, /* e9 */ 5, /* ea */ 5, /* eb */ 6,
771 |     /* ec */ 5, /* ed */ 6, /* ee */ 6, /* ef */ 7,
772 |     /* f0 */ 4, /* f1 */ 5, /* f2 */ 5, /* f3 */ 6,
773 |     /* f4 */ 5, /* f5 */ 6, /* f6 */ 6, /* f7 */ 7,
774 |     /* f8 */ 5, /* f9 */ 6, /* fa */ 6, /* fb */ 7,
775 |     /* fc */ 6, /* fd */ 7, /* fe */ 7, /* ff */ 8
776 | };
777 | 
778 | /****************************
779 | *  SSE4.2 functions
780 | ****************************/
781 | 
782 | #if defined(STORM_HAVE_SSE42)
783 | 
784 | #include <immintrin.h>
785 | 
786 | STORM_TARGET("sse4.2")
787 | STORM_FORCE_INLINE
788 | uint64_t STORM_POPCOUNT_SSE(const __m128i n) {
789 |     return(STORM_POPCOUNT(_mm_cvtsi128_si64(n)) +
790 |            STORM_POPCOUNT(_mm_cvtsi128_si64(_mm_unpackhi_epi64(n, n))));
791 | }
792 | 
793 | STORM_TARGET("sse4.2")
794 | STORM_FORCE_INLINE
795 | void STORM_CSA128(__m128i* h, __m128i* l, __m128i a, __m128i b, __m128i c) {
796 |     __m128i u = _mm_xor_si128(a, b);
797 |     *h = _mm_or_si128(_mm_and_si128(a, b), _mm_and_si128(u, c));
798 |     *l = _mm_xor_si128(u, c);
799 | }
800 | 
801 | /**
802 |  * Carry-save adder update step.
803 |  * @see https://en.wikipedia.org/wiki/Carry-save_adder#Technical_details
804 |  *
805 |  * Steps:
806 |  * 1)  U = *L ⊕ B
807 |  * 2) *H = (*L & B) | (U & C)
808 |  * 3) *L = *L ⊕ B ⊕ C = U ⊕ C
809 |  *
810 |  * B and C are 16-bit staggered registers such that &C - &B = 1.
811 |  *
812 |  * Example usage:
813 |  * pospopcnt_csa_sse(&twosA, &v1, _mm_loadu_si128(data + i + 0), _mm_loadu_si128(data + i + 1));
814 |  *
815 |  * @param h
816 |  * @param l
817 |  * @param b
818 |  * @param c
819 |  */
820 | STORM_TARGET("sse4.2")
821 | STORM_FORCE_INLINE
822 | void STORM_pospopcnt_csa_sse(__m128i* STORM_RESTRICT h,
823 |                              __m128i* STORM_RESTRICT l,
824 |                              const __m128i b,
825 |                              const __m128i c)
826 | {
827 |     const __m128i u = _mm_xor_si128(*l, b);
828 |     *h = _mm_or_si128(*l & b, u & c); // shift carry (sc_i).
829 |     *l = _mm_xor_si128(u, c);         // partial sum (ps).
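    // Net effect per bit-lane: 2*(*h) + (*l) == old *l + b + c, i.e. *l keeps
    // the partial sum and *h the carries of one carry-save adder step.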
830 | }
831 | 
832 | // By @aqrit (https://github.com/aqrit)
833 | // @see: https://gist.github.com/aqrit/cb52b2ac5b7d0dfe9319c09d27237bf3
834 | STORM_TARGET("sse4.2")
835 | static
836 | int STORM_pospopcnt_u16_sse_sad(const uint16_t* data, size_t len, uint32_t* flag_counts) {
837 |     const __m128i zero = _mm_setzero_si128();
838 |     const __m128i mask_lo_byte = _mm_srli_epi16(_mm_cmpeq_epi8(zero, zero), 8);
839 |     const __m128i mask_lo_cnt  = _mm_srli_epi16(mask_lo_byte, 2);
840 |     const __m128i mask_bits_a  = _mm_set1_epi8(0x41); // 01000001
841 |     const __m128i mask_bits_b  = _mm_add_epi8(mask_bits_a, mask_bits_a);
842 |     uint32_t buffer[16];
843 | 
844 |     __m128i counterA = zero;
845 |     __m128i counterB = zero;
846 |     __m128i counterC = zero;
847 |     __m128i counterD = zero;
848 | 
849 |     for (const uint16_t* end = &data[(len & ~31)]; data != end; data += 32) {
850 |         __m128i r0 = _mm_loadu_si128((__m128i*)&data[0]);
851 |         __m128i r1 = _mm_loadu_si128((__m128i*)&data[8]);
852 |         __m128i r2 = _mm_loadu_si128((__m128i*)&data[16]);
853 |         __m128i r3 = _mm_loadu_si128((__m128i*)&data[24]);
854 |         __m128i r4, r5, r6, r7;
855 | 
856 |         // separate LOBYTE and HIBYTE of each WORD
857 |         // (emulate PSHUFB F,D,B,9,7,5,3,1, E,C,A,8,6,4,2,0)
858 |         r4 = _mm_and_si128(mask_lo_byte, r0);
859 |         r5 = _mm_and_si128(mask_lo_byte, r1);
860 |         r6 = _mm_and_si128(mask_lo_byte, r2);
861 |         r7 = _mm_and_si128(mask_lo_byte, r3);
862 |         r0 = _mm_srli_epi16(r0, 8);
863 |         r1 = _mm_srli_epi16(r1, 8);
864 |         r2 = _mm_srli_epi16(r2, 8);
865 |         r3 = _mm_srli_epi16(r3, 8);
866 |         r0 = _mm_packus_epi16(r0, r4);
867 |         r1 = _mm_packus_epi16(r1, r5);
868 |         r2 = _mm_packus_epi16(r2, r6);
869 |         r3 = _mm_packus_epi16(r3, r7);
870 | 
871 |         // isolate bits to count
872 |         r4 = _mm_and_si128(mask_bits_a, r0);
873 |         r5 = _mm_and_si128(mask_bits_a, r1);
874 |         r6 = _mm_and_si128(mask_bits_a, r2);
875 |         r7 = _mm_and_si128(mask_bits_a, r3);
876 | 
877 |         // horizontal sum of qwords
878 |         r4 = _mm_sad_epu8(r4, zero);
879 |         r5 = _mm_sad_epu8(r5, zero);
880 |         r6 = _mm_sad_epu8(r6, zero);
881 |         r7 = _mm_sad_epu8(r7, zero);
882 | 
883 |         // sum 6-bit counts
884 |         r4 = _mm_add_epi16(r4,r5);
885 |         r4 = _mm_add_epi16(r4,r6);
886 |         r4 = _mm_add_epi16(r4,r7);
887 | 
888 |         // unpack 6-bit counts to 32-bits
889 |         r5 = _mm_and_si128(mask_lo_cnt, r4);
890 |         r4 = _mm_srli_epi16(r4, 6);
891 |         r4 = _mm_packs_epi32(r4, r5);
892 | 
893 |         // accumulate
894 |         counterA = _mm_add_epi32(counterA, r4);
895 | 
896 |         // do it again...
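        // (repeat the mask/SAD/unpack/accumulate sequence above, this time for
        //  the bit positions selected by mask_bits_b)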
897 | r4 = _mm_and_si128(mask_bits_b, r0); 898 | r5 = _mm_and_si128(mask_bits_b, r1); 899 | r6 = _mm_and_si128(mask_bits_b, r2); 900 | r7 = _mm_and_si128(mask_bits_b, r3); 901 | 902 | r4 = _mm_sad_epu8(r4, zero); 903 | r5 = _mm_sad_epu8(r5, zero); 904 | r6 = _mm_sad_epu8(r6, zero); 905 | r7 = _mm_sad_epu8(r7, zero); 906 | 907 | r4 = _mm_add_epi16(r4,r5); 908 | r4 = _mm_add_epi16(r4,r6); 909 | r4 = _mm_add_epi16(r4,r7); 910 | 911 | r5 = _mm_avg_epu8(zero, r4); // shift right 1 912 | r5 = _mm_and_si128(r5, mask_lo_cnt); 913 | r4 = _mm_srli_epi16(r4, 7); 914 | r4 = _mm_packs_epi32(r4, r5); 915 | 916 | counterB = _mm_add_epi32(counterB, r4); // accumulate 917 | 918 | // rotate right 4 919 | r4 = _mm_slli_epi16(r0, 12); 920 | r5 = _mm_slli_epi16(r1, 12); 921 | r6 = _mm_slli_epi16(r2, 12); 922 | r7 = _mm_slli_epi16(r3, 12); 923 | r0 = _mm_srli_epi16(r0, 4); 924 | r1 = _mm_srli_epi16(r1, 4); 925 | r2 = _mm_srli_epi16(r2, 4); 926 | r3 = _mm_srli_epi16(r3, 4); 927 | r0 = _mm_or_si128(r0, r4); 928 | r1 = _mm_or_si128(r1, r5); 929 | r2 = _mm_or_si128(r2, r6); 930 | r3 = _mm_or_si128(r3, r7); 931 | 932 | // do it again... 933 | r4 = _mm_and_si128(mask_bits_a, r0); 934 | r5 = _mm_and_si128(mask_bits_a, r1); 935 | r6 = _mm_and_si128(mask_bits_a, r2); 936 | r7 = _mm_and_si128(mask_bits_a, r3); 937 | 938 | r4 = _mm_sad_epu8(r4, zero); 939 | r5 = _mm_sad_epu8(r5, zero); 940 | r6 = _mm_sad_epu8(r6, zero); 941 | r7 = _mm_sad_epu8(r7, zero); 942 | 943 | r4 = _mm_add_epi16(r4,r5); 944 | r4 = _mm_add_epi16(r4,r6); 945 | r4 = _mm_add_epi16(r4,r7); 946 | 947 | r5 = _mm_and_si128(mask_lo_cnt, r4); 948 | r4 = _mm_srli_epi16(r4, 6); 949 | r4 = _mm_packs_epi32(r4, r5); 950 | 951 | counterC = _mm_add_epi32(counterC, r4); // accumulate 952 | 953 | // do it again... 954 | r0 = _mm_and_si128(r0, mask_bits_b); 955 | r1 = _mm_and_si128(r1, mask_bits_b); 956 | r2 = _mm_and_si128(r2, mask_bits_b); 957 | r3 = _mm_and_si128(r3, mask_bits_b); 958 | 959 | r0 = _mm_sad_epu8(r0, zero); 960 | r1 = _mm_sad_epu8(r1, zero); 961 | r2 = _mm_sad_epu8(r2, zero); 962 | r3 = _mm_sad_epu8(r3, zero); 963 | 964 | r0 = _mm_add_epi16(r0,r1); 965 | r0 = _mm_add_epi16(r0,r2); 966 | r0 = _mm_add_epi16(r0,r3); 967 | 968 | r1 = _mm_avg_epu8(zero, r0); 969 | r1 = _mm_and_si128(r1, mask_lo_cnt); 970 | r0 = _mm_srli_epi16(r0, 7); 971 | r0 = _mm_packs_epi32(r0, r1); 972 | 973 | counterD = _mm_add_epi32(counterD, r0); // accumulate 974 | } 975 | 976 | // transpose then store counters 977 | __m128i counter_1098 = _mm_unpackhi_epi32(counterA, counterB); 978 | __m128i counter_76FE = _mm_unpacklo_epi32(counterA, counterB); 979 | __m128i counter_32BA = _mm_unpacklo_epi32(counterC, counterD); 980 | __m128i counter_54DC = _mm_unpackhi_epi32(counterC, counterD); 981 | __m128i counter_7654 = _mm_unpackhi_epi64(counter_54DC, counter_76FE); 982 | __m128i counter_FEDC = _mm_unpacklo_epi64(counter_54DC, counter_76FE); 983 | __m128i counter_3210 = _mm_unpackhi_epi64(counter_1098, counter_32BA); 984 | __m128i counter_BA98 = _mm_unpacklo_epi64(counter_1098, counter_32BA); 985 | 986 | 987 | _mm_storeu_si128((__m128i*)&buffer[0], counter_3210); 988 | _mm_storeu_si128((__m128i*)&buffer[4], counter_7654); 989 | _mm_storeu_si128((__m128i*)&buffer[8], counter_BA98); 990 | _mm_storeu_si128((__m128i*)&buffer[12], counter_FEDC); 991 | for (int i = 0; i < 16; ++i) flag_counts[i] += buffer[i]; 992 | 993 | // scalar tail loop 994 | int tail = len & 31; 995 | if (tail != 0) { 996 | uint64_t countsA = 0; 997 | uint64_t countsB = 0; 998 | do { 999 | // zero-extend a bit to 8-bits then 
accumulate 1000 | // (emulate pdep) 1001 | const uint64_t mask_01 = UINT64_C(0x0101010101010101);// 100000001000000010000000100000001000000010000000100000001 1002 | const uint64_t magic = UINT64_C(0x0000040010004001);// 000000000000001000000000000010000000000000100000000000001 1003 | // 1+(1<<14)+(1<<28)+(1<<42) 1004 | uint64_t x = *data++; 1005 | countsA += ((x & 0x5555) * magic) & mask_01; // 0101010101010101 1006 | countsB += (((x >> 1) & 0x5555) * magic) & mask_01; 1007 | } while (--tail); 1008 | 1009 | // transpose then store counters 1010 | flag_counts[0] += countsA & 0xFF; 1011 | flag_counts[8] += (countsA >> 8) & 0xFF; 1012 | flag_counts[2] += (countsA >> 16) & 0xFF; 1013 | flag_counts[10] += (countsA >> 24) & 0xFF; 1014 | flag_counts[4] += (countsA >> 32) & 0xFF; 1015 | flag_counts[12] += (countsA >> 40) & 0xFF; 1016 | flag_counts[6] += (countsA >> 48) & 0xFF; 1017 | flag_counts[14] += (countsA >> 56) & 0xFF; 1018 | flag_counts[1] += countsB & 0xFF; 1019 | flag_counts[9] += (countsB >> 8) & 0xFF; 1020 | flag_counts[3] += (countsB >> 16) & 0xFF; 1021 | flag_counts[11] += (countsB >> 24) & 0xFF; 1022 | flag_counts[5] += (countsB >> 32) & 0xFF; 1023 | flag_counts[13] += (countsB >> 40) & 0xFF; 1024 | flag_counts[7] += (countsB >> 48) & 0xFF; 1025 | flag_counts[15] += (countsB >> 56) & 0xFF; 1026 | } 1027 | 1028 | return 0; 1029 | } 1030 | 1031 | STORM_TARGET("sse4.2") 1032 | static 1033 | int STORM_pospopcnt_u16_sse_blend_popcnt_unroll8(const uint16_t* array, size_t len, uint32_t* out) { 1034 | const __m128i* data_vectors = (const __m128i*)(array); 1035 | const uint32_t n_cycles = len / 8; 1036 | 1037 | size_t i = 0; 1038 | for (/**/; i + 8 <= n_cycles; i += 8) { 1039 | #define L(p) __m128i v##p = _mm_loadu_si128(data_vectors+i+p); 1040 | L(0) L(1) L(2) L(3) 1041 | L(4) L(5) L(6) L(7) 1042 | 1043 | #define U0(p,k) __m128i input##p = _mm_or_si128(_mm_and_si128(v##p, _mm_set1_epi16(0x00FF)), _mm_slli_epi16(v##k, 8)); 1044 | #define U1(p,k) __m128i input##k = _mm_or_si128(_mm_and_si128(v##p, _mm_set1_epi16(0xFF00)), _mm_srli_epi16(v##k, 8)); 1045 | #define U(p, k) U0(p,k) U1(p,k) 1046 | 1047 | U(0,1) U(2,3) U(4,5) U(6,7) 1048 | 1049 | for (int i = 0; i < 8; ++i) { 1050 | #define A0(p) out[ 7 - i] += _mm_popcnt_u32(_mm_movemask_epi8(input##p)); 1051 | #define A1(k) out[15 - i] += _mm_popcnt_u32(_mm_movemask_epi8(input##k)); 1052 | #define A(p, k) A0(p) A1(k) 1053 | A(0,1) A(2, 3) A(4,5) A(6, 7) 1054 | 1055 | #define P0(p) input##p = _mm_add_epi8(input##p, input##p); 1056 | #define P(p, k) input##p = P0(p) P0(k) 1057 | 1058 | P(0,1) P(2, 3) P(4,5) P(6, 7) 1059 | } 1060 | } 1061 | 1062 | for (/**/; i + 4 <= n_cycles; i += 4) { 1063 | L(0) L(1) L(2) L(3) 1064 | U(0,1) U(2,3) 1065 | 1066 | for (int i = 0; i < 8; ++i) { 1067 | A(0,1) A(2, 3) 1068 | P(0,1) P(2, 3) 1069 | } 1070 | } 1071 | 1072 | for (/**/; i + 2 <= n_cycles; i += 2) { 1073 | L(0) L(1) 1074 | U(0,1) 1075 | 1076 | for (int i = 0; i < 8; ++i) { 1077 | A(0,1) 1078 | P(0,1) 1079 | } 1080 | } 1081 | 1082 | i *= 8; 1083 | for (/**/; i < len; ++i) { 1084 | for (int j = 0; j < 16; ++j) { 1085 | out[j] += ((array[i] & (1 << j)) >> j); 1086 | } 1087 | } 1088 | 1089 | #undef L 1090 | #undef U0 1091 | #undef U1 1092 | #undef U 1093 | #undef A0 1094 | #undef A1 1095 | #undef A 1096 | #undef P0 1097 | #undef P 1098 | return 0; 1099 | } 1100 | 1101 | STORM_TARGET("sse4.2") 1102 | static 1103 | int STORM_pospopcnt_u16_sse_harvey_seal(const uint16_t* array, size_t len, uint32_t* out) { 1104 | for (uint32_t i = len - (len % (16 * 8)); i < len; 
++i) { 1105 | for (int j = 0; j < 16; ++j) { 1106 | out[j] += ((array[i] & (1 << j)) >> j); 1107 | } 1108 | } 1109 | 1110 | const __m128i* data = (const __m128i*)array; 1111 | size_t size = len / 8; 1112 | __m128i v1 = _mm_setzero_si128(); 1113 | __m128i v2 = _mm_setzero_si128(); 1114 | __m128i v4 = _mm_setzero_si128(); 1115 | __m128i v8 = _mm_setzero_si128(); 1116 | __m128i v16 = _mm_setzero_si128(); 1117 | __m128i twosA, twosB, foursA, foursB, eightsA, eightsB; 1118 | 1119 | const uint64_t limit = size - size % 16; 1120 | uint64_t i = 0; 1121 | uint16_t buffer[8]; 1122 | __m128i counter[16]; 1123 | 1124 | while (i < limit) { 1125 | for (size_t i = 0; i < 16; ++i) { 1126 | counter[i] = _mm_setzero_si128(); 1127 | } 1128 | 1129 | size_t thislimit = limit; 1130 | if (thislimit - i >= (1 << 16)) 1131 | thislimit = i + (1 << 16) - 1; 1132 | 1133 | for (/**/; i < thislimit; i += 16) { 1134 | #define U(pos) { \ 1135 | counter[pos] = _mm_add_epi16(counter[pos], _mm_and_si128(v16, _mm_set1_epi16(1))); \ 1136 | v16 = _mm_srli_epi16(v16, 1); \ 1137 | } 1138 | STORM_pospopcnt_csa_sse(&twosA, &v1, _mm_loadu_si128(data + i + 0), _mm_loadu_si128(data + i + 1)); 1139 | STORM_pospopcnt_csa_sse(&twosB, &v1, _mm_loadu_si128(data + i + 2), _mm_loadu_si128(data + i + 3)); 1140 | STORM_pospopcnt_csa_sse(&foursA, &v2, twosA, twosB); 1141 | STORM_pospopcnt_csa_sse(&twosA, &v1, _mm_loadu_si128(data + i + 4), _mm_loadu_si128(data + i + 5)); 1142 | STORM_pospopcnt_csa_sse(&twosB, &v1, _mm_loadu_si128(data + i + 6), _mm_loadu_si128(data + i + 7)); 1143 | STORM_pospopcnt_csa_sse(&foursB, &v2, twosA, twosB); 1144 | STORM_pospopcnt_csa_sse(&eightsA,&v4, foursA, foursB); 1145 | STORM_pospopcnt_csa_sse(&twosA, &v1, _mm_loadu_si128(data + i + 8), _mm_loadu_si128(data + i + 9)); 1146 | STORM_pospopcnt_csa_sse(&twosB, &v1, _mm_loadu_si128(data + i + 10), _mm_loadu_si128(data + i + 11)); 1147 | STORM_pospopcnt_csa_sse(&foursA, &v2, twosA, twosB); 1148 | STORM_pospopcnt_csa_sse(&twosA, &v1, _mm_loadu_si128(data + i + 12), _mm_loadu_si128(data + i + 13)); 1149 | STORM_pospopcnt_csa_sse(&twosB, &v1, _mm_loadu_si128(data + i + 14), _mm_loadu_si128(data + i + 15)); 1150 | STORM_pospopcnt_csa_sse(&foursB, &v2, twosA, twosB); 1151 | STORM_pospopcnt_csa_sse(&eightsB,&v4, foursA, foursB); 1152 | U(0) U(1) U(2) U(3) U(4) U(5) U(6) U(7) U(8) U(9) U(10) U(11) U(12) U(13) U(14) U(15) // Updates 1153 | STORM_pospopcnt_csa_sse(&v16, &v8, eightsA, eightsB); 1154 | #undef U 1155 | } 1156 | 1157 | // update the counters after the last iteration 1158 | for (size_t i = 0; i < 16; ++i) { 1159 | counter[i] = _mm_add_epi16(counter[i], _mm_and_si128(v16, _mm_set1_epi16(1))); 1160 | v16 = _mm_srli_epi16(v16, 1); 1161 | } 1162 | 1163 | for (size_t i = 0; i < 16; ++i) { 1164 | _mm_storeu_si128((__m128i*)buffer, counter[i]); 1165 | for (size_t z = 0; z < 8; z++) { 1166 | out[i] += 16 * (uint32_t)buffer[z]; 1167 | } 1168 | } 1169 | } 1170 | 1171 | _mm_storeu_si128((__m128i*)buffer, v1); 1172 | for (size_t i = 0; i < 8; ++i) { 1173 | for (int j = 0; j < 16; ++j) { 1174 | out[j] += ((buffer[i] & (1 << j)) >> j); 1175 | } 1176 | } 1177 | 1178 | _mm_storeu_si128((__m128i*)buffer, v2); 1179 | for (size_t i = 0; i < 8; ++i) { 1180 | for (int j = 0; j < 16; ++j) { 1181 | out[j] += 2 * ((buffer[i] & (1 << j)) >> j); 1182 | } 1183 | } 1184 | _mm_storeu_si128((__m128i*)buffer, v4); 1185 | for (size_t i = 0; i < 8; ++i) { 1186 | for (int j = 0; j < 16; ++j) { 1187 | out[j] += 4 * ((buffer[i] & (1 << j)) >> j); 1188 | } 1189 | } 1190 | 
_mm_storeu_si128((__m128i*)buffer, v8);
1191 |     for (size_t i = 0; i < 8; ++i) {
1192 |         for (int j = 0; j < 16; ++j) {
1193 |             out[j] += 8 * ((buffer[i] & (1 << j)) >> j);
1194 |         }
1195 |     }
1196 |     return 0;
1197 | }
1198 | 
1199 | STORM_TARGET("sse4.2")
1200 | static
1201 | uint64_t STORM_intersect_count_csa_sse4(const __m128i* STORM_RESTRICT data1,
1202 |                                         const __m128i* STORM_RESTRICT data2,
1203 |                                         size_t size)
1204 | {
1205 |     __m128i ones     = _mm_setzero_si128();
1206 |     __m128i twos     = _mm_setzero_si128();
1207 |     __m128i fours    = _mm_setzero_si128();
1208 |     __m128i eights   = _mm_setzero_si128();
1209 |     __m128i sixteens = _mm_setzero_si128();
1210 |     __m128i twosA, twosB, foursA, foursB, eightsA, eightsB;
1211 | 
1212 |     uint64_t i = 0;
1213 |     uint64_t limit = size - size % 16;
1214 |     uint64_t cnt64 = 0;
1215 | 
1216 | #define LOAD(a) (_mm_loadu_si128(&data1[i+a]) & _mm_loadu_si128(&data2[i+a]))
1217 | 
1218 |     for (/**/; i < limit; i += 16) {
1219 |         STORM_CSA128(&twosA,   &ones,   ones,  LOAD(0), LOAD(1));
1220 |         STORM_CSA128(&twosB,   &ones,   ones,  LOAD(2), LOAD(3));
1221 |         STORM_CSA128(&foursA,  &twos,   twos,  twosA,  twosB);
1222 |         STORM_CSA128(&twosA,   &ones,   ones,  LOAD(4), LOAD(5));
1223 |         STORM_CSA128(&twosB,   &ones,   ones,  LOAD(6), LOAD(7));
1224 |         STORM_CSA128(&foursB,  &twos,   twos,  twosA,  twosB);
1225 |         STORM_CSA128(&eightsA, &fours,  fours, foursA, foursB);
1226 |         STORM_CSA128(&twosA,   &ones,   ones,  LOAD(8), LOAD(9));
1227 |         STORM_CSA128(&twosB,   &ones,   ones,  LOAD(10), LOAD(11));
1228 |         STORM_CSA128(&foursA,  &twos,   twos,  twosA,  twosB);
1229 |         STORM_CSA128(&twosA,   &ones,   ones,  LOAD(12), LOAD(13));
1230 |         STORM_CSA128(&twosB,   &ones,   ones,  LOAD(14), LOAD(15));
1231 |         STORM_CSA128(&foursB,  &twos,   twos,  twosA,  twosB);
1232 |         STORM_CSA128(&eightsB, &fours,  fours, foursA, foursB);
1233 |         STORM_CSA128(&sixteens,&eights, eights,eightsA,eightsB);
1234 | 
1235 |         cnt64 += STORM_POPCOUNT_SSE(sixteens);
1236 |     }
1237 | #undef LOAD
1238 | 
1239 |     cnt64 <<= 4;
1240 |     cnt64 += STORM_POPCOUNT_SSE(eights) << 3;
1241 |     cnt64 += STORM_POPCOUNT_SSE(fours)  << 2;
1242 |     cnt64 += STORM_POPCOUNT_SSE(twos)   << 1;
1243 |     cnt64 += STORM_POPCOUNT_SSE(ones)   << 0;
1244 | 
1245 |     for (/**/; i < size; ++i)
1246 |         cnt64 += STORM_POPCOUNT_SSE(_mm_loadu_si128(&data1[i]) & _mm_loadu_si128(&data2[i]));
1247 | 
1248 |     return cnt64;
1249 | }
1250 | 
1251 | STORM_TARGET("sse4.2")
1252 | static
1253 | uint64_t STORM_union_count_csa_sse4(const __m128i* STORM_RESTRICT data1,
1254 |                                     const __m128i* STORM_RESTRICT data2,
1255 |                                     size_t size)
1256 | {
1257 |     __m128i ones     = _mm_setzero_si128();
1258 |     __m128i twos     = _mm_setzero_si128();
1259 |     __m128i fours    = _mm_setzero_si128();
1260 |     __m128i eights   = _mm_setzero_si128();
1261 |     __m128i sixteens = _mm_setzero_si128();
1262 |     __m128i twosA, twosB, foursA, foursB, eightsA, eightsB;
1263 | 
1264 |     uint64_t i = 0;
1265 |     uint64_t limit = size - size % 16;
1266 |     uint64_t cnt64 = 0;
1267 | 
1268 | #define LOAD(a) (_mm_loadu_si128(&data1[i+a]) | _mm_loadu_si128(&data2[i+a]))
1269 | 
1270 |     for (/**/; i < limit; i += 16) {
1271 |         STORM_CSA128(&twosA,   &ones,   ones,  LOAD(0), LOAD(1));
1272 |         STORM_CSA128(&twosB,   &ones,   ones,  LOAD(2), LOAD(3));
1273 |         STORM_CSA128(&foursA,  &twos,   twos,  twosA,  twosB);
1274 |         STORM_CSA128(&twosA,   &ones,   ones,  LOAD(4), LOAD(5));
1275 |         STORM_CSA128(&twosB,   &ones,   ones,  LOAD(6), LOAD(7));
1276 |         STORM_CSA128(&foursB,  &twos,   twos,  twosA,  twosB);
1277 |         STORM_CSA128(&eightsA, &fours,  fours, foursA, foursB);
1278 |         STORM_CSA128(&twosA,   &ones,   ones,  LOAD(8), LOAD(9));
1279 |         STORM_CSA128(&twosB,   &ones,   ones,  LOAD(10), LOAD(11));
1280 |         STORM_CSA128(&foursA,  &twos,   twos,  twosA,  twosB);
1281 |         STORM_CSA128(&twosA,   &ones,   ones,  LOAD(12), LOAD(13));
1282 |         STORM_CSA128(&twosB,   &ones,   ones,  LOAD(14), LOAD(15));
1283 |         STORM_CSA128(&foursB,  &twos,   twos,  twosA,  twosB);
1284 |         STORM_CSA128(&eightsB, &fours,  fours, foursA, foursB);
1285 |         STORM_CSA128(&sixteens,&eights, eights,eightsA,eightsB);
1286 | 
1287 |         cnt64 += STORM_POPCOUNT_SSE(sixteens);
1288 |     }
1289 | #undef LOAD
1290 | 
1291 |     cnt64 <<= 4;
1292 |     cnt64 += STORM_POPCOUNT_SSE(eights) << 3;
1293 |     cnt64 += STORM_POPCOUNT_SSE(fours)  << 2;
1294 |     cnt64 += STORM_POPCOUNT_SSE(twos)   << 1;
1295 |     cnt64 += STORM_POPCOUNT_SSE(ones)   << 0;
1296 | 
1297 |     for (/**/; i < size; ++i)
1298 |         cnt64 += STORM_POPCOUNT_SSE(_mm_loadu_si128(&data1[i]) | _mm_loadu_si128(&data2[i]));
1299 | 
1300 |     return cnt64;
1301 | }
1302 | 
1303 | STORM_TARGET("sse4.2")
1304 | static
1305 | uint64_t STORM_diff_count_csa_sse4(const __m128i* STORM_RESTRICT data1,
1306 |                                    const __m128i* STORM_RESTRICT data2,
1307 |                                    size_t size)
1308 | {
1309 |     __m128i ones     = _mm_setzero_si128();
1310 |     __m128i twos     = _mm_setzero_si128();
1311 |     __m128i fours    = _mm_setzero_si128();
1312 |     __m128i eights   = _mm_setzero_si128();
1313 |     __m128i sixteens = _mm_setzero_si128();
1314 |     __m128i twosA, twosB, foursA, foursB, eightsA, eightsB;
1315 | 
1316 |     uint64_t i = 0;
1317 |     uint64_t limit = size - size % 16;
1318 |     uint64_t cnt64 = 0;
1319 | 
1320 | #define LOAD(a) (_mm_loadu_si128(&data1[i+a]) ^ _mm_loadu_si128(&data2[i+a]))
1321 | 
1322 |     for (/**/; i < limit; i += 16) {
1323 |         STORM_CSA128(&twosA,   &ones,   ones,  LOAD(0), LOAD(1));
1324 |         STORM_CSA128(&twosB,   &ones,   ones,  LOAD(2), LOAD(3));
1325 |         STORM_CSA128(&foursA,  &twos,   twos,  twosA,  twosB);
1326 |         STORM_CSA128(&twosA,   &ones,   ones,  LOAD(4), LOAD(5));
1327 |         STORM_CSA128(&twosB,   &ones,   ones,  LOAD(6), LOAD(7));
1328 |         STORM_CSA128(&foursB,  &twos,   twos,  twosA,  twosB);
1329 |         STORM_CSA128(&eightsA, &fours,  fours, foursA, foursB);
1330 |         STORM_CSA128(&twosA,   &ones,   ones,  LOAD(8), LOAD(9));
1331 |         STORM_CSA128(&twosB,   &ones,   ones,  LOAD(10), LOAD(11));
1332 |         STORM_CSA128(&foursA,  &twos,   twos,  twosA,  twosB);
1333 |         STORM_CSA128(&twosA,   &ones,   ones,  LOAD(12), LOAD(13));
1334 |         STORM_CSA128(&twosB,   &ones,   ones,  LOAD(14), LOAD(15));
1335 |         STORM_CSA128(&foursB,  &twos,   twos,  twosA,  twosB);
1336 |         STORM_CSA128(&eightsB, &fours,  fours, foursA, foursB);
1337 |         STORM_CSA128(&sixteens,&eights, eights,eightsA,eightsB);
1338 | 
1339 |         cnt64 += STORM_POPCOUNT_SSE(sixteens);
1340 |     }
1341 | #undef LOAD
1342 | 
1343 |     cnt64 <<= 4;
1344 |     cnt64 += STORM_POPCOUNT_SSE(eights) << 3;
1345 |     cnt64 += STORM_POPCOUNT_SSE(fours)  << 2;
1346 |     cnt64 += STORM_POPCOUNT_SSE(twos)   << 1;
1347 |     cnt64 += STORM_POPCOUNT_SSE(ones)   << 0;
1348 | 
1349 |     for (/**/; i < size; ++i)
1350 |         cnt64 += STORM_POPCOUNT_SSE(_mm_loadu_si128(&data1[i]) ^ _mm_loadu_si128(&data2[i]));
1351 | 
1352 |     return cnt64;
1353 | }
1354 | 
1355 | STORM_TARGET("sse4.2")
1356 | static
1357 | uint64_t STORM_popcnt_csa_sse4(const __m128i* STORM_RESTRICT data,
1358 |                                size_t size)
1359 | {
1360 |     __m128i ones     = _mm_setzero_si128();
1361 |     __m128i twos     = _mm_setzero_si128();
1362 |     __m128i fours    = _mm_setzero_si128();
1363 |     __m128i eights   = _mm_setzero_si128();
1364 |     __m128i sixteens = _mm_setzero_si128();
1365 |     __m128i twosA, twosB, foursA, foursB, eightsA, eightsB;
1366 | 
1367 |     uint64_t i = 0;
1368 |     uint64_t limit = size - size % 16;
1369 |     uint64_t cnt64 = 0;
1370 | 
1371 | #define LOAD(a) (_mm_loadu_si128(&data[i+a]))
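    // Harley-Seal accumulation: 'ones'..'sixteens' act as a bit-sliced adder;
    // each CSA128 call folds two 128-bit inputs into the tree, so after the
    // main loop every input bit is counted exactly once with weight 16, 8, 4,
    // 2 or 1 (the weights are combined after the loop).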
1372 | 
1373 |     for (/**/; i < limit; i += 16) {
1374 |         STORM_CSA128(&twosA,   &ones,   ones,  LOAD(0), LOAD(1));
1375 |         STORM_CSA128(&twosB,   &ones,   ones,  LOAD(2), LOAD(3));
1376 |         STORM_CSA128(&foursA,  &twos,   twos,  twosA,  twosB);
1377 |         STORM_CSA128(&twosA,   &ones,   ones,  LOAD(4), LOAD(5));
1378 |         STORM_CSA128(&twosB,   &ones,   ones,  LOAD(6), LOAD(7));
1379 |         STORM_CSA128(&foursB,  &twos,   twos,  twosA,  twosB);
1380 |         STORM_CSA128(&eightsA, &fours,  fours, foursA, foursB);
1381 |         STORM_CSA128(&twosA,   &ones,   ones,  LOAD(8), LOAD(9));
1382 |         STORM_CSA128(&twosB,   &ones,   ones,  LOAD(10), LOAD(11));
1383 |         STORM_CSA128(&foursA,  &twos,   twos,  twosA,  twosB);
1384 |         STORM_CSA128(&twosA,   &ones,   ones,  LOAD(12), LOAD(13));
1385 |         STORM_CSA128(&twosB,   &ones,   ones,  LOAD(14), LOAD(15));
1386 |         STORM_CSA128(&foursB,  &twos,   twos,  twosA,  twosB);
1387 |         STORM_CSA128(&eightsB, &fours,  fours, foursA, foursB);
1388 |         STORM_CSA128(&sixteens,&eights, eights,eightsA,eightsB);
1389 | 
1390 |         cnt64 += STORM_POPCOUNT_SSE(sixteens);
1391 |     }
1392 | #undef LOAD
1393 | 
1394 |     cnt64 <<= 4;
1395 |     cnt64 += STORM_POPCOUNT_SSE(eights) << 3;
1396 |     cnt64 += STORM_POPCOUNT_SSE(fours)  << 2;
1397 |     cnt64 += STORM_POPCOUNT_SSE(twos)   << 1;
1398 |     cnt64 += STORM_POPCOUNT_SSE(ones)   << 0;
1399 | 
1400 |     for (/**/; i < size; ++i)
1401 |         cnt64 += STORM_POPCOUNT_SSE(_mm_loadu_si128(&data[i]));
1402 | 
1403 |     return cnt64;
1404 | }
1405 | 
1406 | STORM_TARGET("sse4.2")
1407 | static
1408 | uint64_t STORM_intersect_count_sse4(const uint64_t* STORM_RESTRICT b1,
1409 |                                     const uint64_t* STORM_RESTRICT b2,
1410 |                                     const size_t n_ints)
1411 | {
1412 |     uint64_t count = 0;
1413 |     const __m128i* r1 = (__m128i*)b1;
1414 |     const __m128i* r2 = (__m128i*)b2;
1415 |     const uint32_t n_cycles = n_ints / 2;
1416 | 
1417 |     count += STORM_intersect_count_csa_sse4(r1, r2, n_cycles);
1418 | 
1419 |     for (size_t i = n_cycles*2; i < n_ints; ++i) {
1420 |         count += STORM_POPCOUNT(b1[i] & b2[i]);
1421 |     }
1422 | 
1423 |     return(count);
1424 | }
1425 | 
1426 | STORM_TARGET("sse4.2")
1427 | static
1428 | uint64_t STORM_union_count_sse4(const uint64_t* STORM_RESTRICT b1,
1429 |                                 const uint64_t* STORM_RESTRICT b2,
1430 |                                 const size_t n_ints)
1431 | {
1432 |     uint64_t count = 0;
1433 |     const __m128i* r1 = (__m128i*)b1;
1434 |     const __m128i* r2 = (__m128i*)b2;
1435 |     const uint32_t n_cycles = n_ints / 2;
1436 | 
1437 |     count += STORM_union_count_csa_sse4(r1, r2, n_cycles);
1438 | 
1439 |     for (size_t i = n_cycles*2; i < n_ints; ++i) {
1440 |         count += STORM_POPCOUNT(b1[i] | b2[i]);
1441 |     }
1442 | 
1443 |     return(count);
1444 | }
1445 | 
1446 | STORM_TARGET("sse4.2")
1447 | static
1448 | uint64_t STORM_diff_count_sse4(const uint64_t* STORM_RESTRICT b1,
1449 |                                const uint64_t* STORM_RESTRICT b2,
1450 |                                const size_t n_ints)
1451 | {
1452 |     uint64_t count = 0;
1453 |     const __m128i* r1 = (__m128i*)b1;
1454 |     const __m128i* r2 = (__m128i*)b2;
1455 |     const uint32_t n_cycles = n_ints / 2;
1456 | 
1457 |     count += STORM_diff_count_csa_sse4(r1, r2, n_cycles);
1458 | 
1459 |     for (size_t i = n_cycles*2; i < n_ints; ++i) {
1460 |         count += STORM_POPCOUNT(b1[i] ^ b2[i]);
1461 |     }
1462 | 
1463 |     return(count);
1464 | }
1465 | 
1466 | STORM_TARGET("sse4.2")
1467 | static
1468 | uint64_t STORM_popcnt_sse4(const uint64_t* STORM_RESTRICT data,
1469 |                            const size_t n_ints)
1470 | {
1471 |     uint64_t count = 0;
1472 |     const __m128i* r1 = (__m128i*)data;
1473 |     const uint32_t n_cycles = n_ints / 2;
1474 | 
1475 |     count += STORM_popcnt_csa_sse4(r1, n_cycles);
1476 | 
1477 |     for (size_t i = n_cycles*2; i < n_ints; ++i) {
1478 |         count += STORM_POPCOUNT(data[i]);
1479 |     }
1480 | 
1481 |     return(count);
1482 | }
1483 | #endif
1484 | 
1485 | /****************************
1486 | *  AVX256 functions
1487 | ****************************/
1488 | 
1489 | #if defined(STORM_HAVE_AVX2)
1490 | 
1491 | #include <immintrin.h>
1492 | 
1493 | STORM_TARGET("avx2")
1494 | STORM_FORCE_INLINE
1495 | void STORM_CSA256(__m256i* h, __m256i* l, __m256i a, __m256i b, __m256i c) {
1496 |     __m256i u = _mm256_xor_si256(a, b);
1497 |     *h = _mm256_or_si256(_mm256_and_si256(a, b), _mm256_and_si256(u, c));
1498 |     *l = _mm256_xor_si256(u, c);
1499 | }
1500 | 
1501 | STORM_TARGET("avx2")
1502 | STORM_FORCE_INLINE
1503 | void STORM_pospopcnt_csa_avx2(__m256i* STORM_RESTRICT h,
1504 |                               __m256i* STORM_RESTRICT l,
1505 |                               const __m256i b,
1506 |                               const __m256i c)
1507 | {
1508 |     const __m256i u = _mm256_xor_si256(*l, b);
1509 |     *h = _mm256_or_si256(*l & b, u & c);
1510 |     *l = _mm256_xor_si256(u, c);
1511 | }
1512 | 
1513 | STORM_TARGET("avx2")
1514 | static
1515 | int STORM_pospopcnt_u16_avx2_blend_popcnt_unroll8(const uint16_t* array, size_t len, uint32_t* out) {
1516 |     const __m256i* data_vectors = (const __m256i*)(array);
1517 |     const uint32_t n_cycles = len / 16;
1518 | 
1519 |     size_t i = 0;
1520 |     for (/**/; i + 8 <= n_cycles; i += 8) {
1521 | #define L(p) __m256i v##p = _mm256_loadu_si256(data_vectors+i+p);
1522 |         L(0) L(1) L(2) L(3)
1523 |         L(4) L(5) L(6) L(7)
1524 | 
1525 | #define U0(p,k) __m256i input##p = _mm256_or_si256(_mm256_and_si256(v##p, _mm256_set1_epi16(0x00FF)), _mm256_slli_epi16(v##k, 8));
1526 | #define U1(p,k) __m256i input##k = _mm256_or_si256(_mm256_and_si256(v##p, _mm256_set1_epi16(0xFF00)), _mm256_srli_epi16(v##k, 8));
1527 | #define U(p, k)  U0(p,k) U1(p,k)
1528 |         U(0,1) U(2, 3) U(4, 5) U(6, 7)
1529 | 
1530 |         for (int i = 0; i < 8; ++i) {
1531 | #define A0(p) out[ 7 - i] += _mm_popcnt_u32(_mm256_movemask_epi8(input##p));
1532 | #define A1(k) out[15 - i] += _mm_popcnt_u32(_mm256_movemask_epi8(input##k));
1533 | #define A(p, k) A0(p) A1(k)
1534 |             A(0,1) A(2, 3) A(4, 5) A(6, 7)
1535 | 
1536 | #define P0(p) input##p = _mm256_add_epi8(input##p, input##p);
1537 | #define P(p, k) input##p = P0(p) P0(k)
1538 |             P(0,1) P(2, 3) P(4, 5) P(6, 7)
1539 |         }
1540 |     }
1541 | 
1542 |     for (/**/; i + 4 <= n_cycles; i += 4) {
1543 |         L(0) L(1) L(2) L(3)
1544 |         U(0,1) U(2, 3)
1545 | 
1546 |         for (int i = 0; i < 8; ++i) {
1547 |             A(0,1) A( 2, 3)
1548 |             P(0,1) P( 2, 3)
1549 |         }
1550 |     }
1551 | 
1552 |     for (/**/; i + 2 <= n_cycles; i += 2) {
1553 |         L(0) L(1)
1554 |         U(0,1)
1555 | 
1556 |         for (int i = 0; i < 8; ++i) {
1557 |             A(0,1)
1558 |             P(0,1)
1559 |         }
1560 |     }
1561 | 
1562 |     i *= 16;
1563 |     for (/**/; i < len; ++i) {
1564 |         for (int j = 0; j < 16; ++j) {
1565 |             out[j] += ((array[i] & (1 << j)) >> j);
1566 |         }
1567 |     }
1568 | 
1569 | #undef L
1570 | #undef U0
1571 | #undef U1
1572 | #undef U
1573 | #undef A0
1574 | #undef A1
1575 | #undef A
1576 | #undef P0
1577 | #undef P
1578 | 
1579 |     return 0;
1580 | }
1581 | 
1582 | STORM_TARGET("avx2")
1583 | static
1584 | int STORM_pospopcnt_u16_avx2_harvey_seal(const uint16_t* array, size_t len, uint32_t* out) {
1585 |     for (uint32_t i = len - (len % (16 * 16)); i < len; ++i) {
1586 |         for (int j = 0; j < 16; ++j) {
1587 |             out[j] += ((array[i] & (1 << j)) >> j);
1588 |         }
1589 |     }
1590 | 
1591 |     const __m256i* data = (const __m256i*)array;
1592 |     size_t size = len / 16;
1593 |     __m256i v1  = _mm256_setzero_si256();
1594 |     __m256i v2  = _mm256_setzero_si256();
1595 |     __m256i v4  = _mm256_setzero_si256();
1596 |     __m256i v8  = _mm256_setzero_si256();
1597 |     __m256i v16 =
_mm256_setzero_si256(); 1598 | __m256i twosA, twosB, foursA, foursB, eightsA, eightsB; 1599 | 1600 | const uint64_t limit = size - size % 16; 1601 | uint64_t i = 0; 1602 | uint16_t buffer[16]; 1603 | __m256i counter[16]; 1604 | const __m256i one = _mm256_set1_epi16(1); 1605 | 1606 | while (i < limit) { 1607 | for (size_t i = 0; i < 16; ++i) { 1608 | counter[i] = _mm256_setzero_si256(); 1609 | } 1610 | 1611 | size_t thislimit = limit; 1612 | if (thislimit - i >= (1 << 16)) 1613 | thislimit = i + (1 << 16) - 1; 1614 | 1615 | for (/**/; i < thislimit; i += 16) { 1616 | #define U(pos) { \ 1617 | counter[pos] = _mm256_add_epi16(counter[pos], _mm256_and_si256(v16, one)); \ 1618 | v16 = _mm256_srli_epi16(v16, 1); \ 1619 | } 1620 | STORM_pospopcnt_csa_avx2(&twosA, &v1, _mm256_loadu_si256(data + i + 0), _mm256_loadu_si256(data + i + 1)); 1621 | STORM_pospopcnt_csa_avx2(&twosB, &v1, _mm256_loadu_si256(data + i + 2), _mm256_loadu_si256(data + i + 3)); 1622 | STORM_pospopcnt_csa_avx2(&foursA, &v2, twosA, twosB); 1623 | STORM_pospopcnt_csa_avx2(&twosA, &v1, _mm256_loadu_si256(data + i + 4), _mm256_loadu_si256(data + i + 5)); 1624 | STORM_pospopcnt_csa_avx2(&twosB, &v1, _mm256_loadu_si256(data + i + 6), _mm256_loadu_si256(data + i + 7)); 1625 | STORM_pospopcnt_csa_avx2(&foursB, &v2, twosA, twosB); 1626 | STORM_pospopcnt_csa_avx2(&eightsA,&v4, foursA, foursB); 1627 | STORM_pospopcnt_csa_avx2(&twosA, &v1, _mm256_loadu_si256(data + i + 8), _mm256_loadu_si256(data + i + 9)); 1628 | STORM_pospopcnt_csa_avx2(&twosB, &v1, _mm256_loadu_si256(data + i + 10), _mm256_loadu_si256(data + i + 11)); 1629 | STORM_pospopcnt_csa_avx2(&foursA, &v2, twosA, twosB); 1630 | STORM_pospopcnt_csa_avx2(&twosA, &v1, _mm256_loadu_si256(data + i + 12), _mm256_loadu_si256(data + i + 13)); 1631 | STORM_pospopcnt_csa_avx2(&twosB, &v1, _mm256_loadu_si256(data + i + 14), _mm256_loadu_si256(data + i + 15)); 1632 | STORM_pospopcnt_csa_avx2(&foursB, &v2, twosA, twosB); 1633 | STORM_pospopcnt_csa_avx2(&eightsB,&v4, foursA, foursB); 1634 | U(0) U(1) U(2) U(3) U(4) U(5) U(6) U(7) U(8) U(9) U(10) U(11) U(12) U(13) U(14) U(15) // Updates 1635 | STORM_pospopcnt_csa_avx2(&v16, &v8, eightsA, eightsB); 1636 | #undef U 1637 | } 1638 | 1639 | // update the counters after the last iteration 1640 | for (size_t i = 0; i < 16; ++i) { 1641 | counter[i] = _mm256_add_epi16(counter[i], _mm256_and_si256(v16, one)); 1642 | v16 = _mm256_srli_epi16(v16, 1); 1643 | } 1644 | 1645 | for (size_t i = 0; i < 16; ++i) { 1646 | _mm256_storeu_si256((__m256i*)buffer, counter[i]); 1647 | for (size_t z = 0; z < 16; z++) { 1648 | out[i] += 16 * (uint32_t)buffer[z]; 1649 | } 1650 | } 1651 | } 1652 | 1653 | _mm256_storeu_si256((__m256i*)buffer, v1); 1654 | for (size_t i = 0; i < 16; ++i) { 1655 | for (int j = 0; j < 16; ++j) { 1656 | out[j] += ((buffer[i] & (1 << j)) >> j); 1657 | } 1658 | } 1659 | 1660 | _mm256_storeu_si256((__m256i*)buffer, v2); 1661 | for (size_t i = 0; i < 16; ++i) { 1662 | for (int j = 0; j < 16; ++j) { 1663 | out[j] += 2 * ((buffer[i] & (1 << j)) >> j); 1664 | } 1665 | } 1666 | _mm256_storeu_si256((__m256i*)buffer, v4); 1667 | for (size_t i = 0; i < 16; ++i) { 1668 | for (int j = 0; j < 16; ++j) { 1669 | out[j] += 4 * ((buffer[i] & (1 << j)) >> j); 1670 | } 1671 | } 1672 | _mm256_storeu_si256((__m256i*)buffer, v8); 1673 | for (size_t i = 0; i < 16; ++i) { 1674 | for (int j = 0; j < 16; ++j) { 1675 | out[j] += 8 * ((buffer[i] & (1 << j)) >> j); 1676 | } 1677 | } 1678 | return 0; 1679 | } 1680 | 1681 | 1682 | STORM_TARGET("avx2") 1683 | static 1684 | 
__m256i STORM_popcnt256(__m256i v) { 1685 | __m256i lookup1 = _mm256_setr_epi8( 1686 | 4, 5, 5, 6, 5, 6, 6, 7, 1687 | 5, 6, 6, 7, 6, 7, 7, 8, 1688 | 4, 5, 5, 6, 5, 6, 6, 7, 1689 | 5, 6, 6, 7, 6, 7, 7, 8 1690 | ); 1691 | 1692 | __m256i lookup2 = _mm256_setr_epi8( 1693 | 4, 3, 3, 2, 3, 2, 2, 1, 1694 | 3, 2, 2, 1, 2, 1, 1, 0, 1695 | 4, 3, 3, 2, 3, 2, 2, 1, 1696 | 3, 2, 2, 1, 2, 1, 1, 0 1697 | ); 1698 | 1699 | __m256i low_mask = _mm256_set1_epi8(0x0f); 1700 | __m256i lo = _mm256_and_si256(v, low_mask); 1701 | __m256i hi = _mm256_and_si256(_mm256_srli_epi16(v, 4), low_mask); 1702 | __m256i popcnt1 = _mm256_shuffle_epi8(lookup1, lo); 1703 | __m256i popcnt2 = _mm256_shuffle_epi8(lookup2, hi); 1704 | 1705 | return _mm256_sad_epu8(popcnt1, popcnt2); 1706 | } 1707 | 1708 | // modified from https://github.com/WojciechMula/sse-popcount 1709 | STORM_TARGET("avx2") 1710 | static 1711 | uint64_t STORM_intersect_count_lookup_avx2_func(const uint8_t* STORM_RESTRICT data1, 1712 | const uint8_t* STORM_RESTRICT data2, 1713 | const size_t n) 1714 | { 1715 | 1716 | size_t i = 0; 1717 | 1718 | const __m256i lookup = _mm256_setr_epi8( 1719 | /* 0 */ 0, /* 1 */ 1, /* 2 */ 1, /* 3 */ 2, 1720 | /* 4 */ 1, /* 5 */ 2, /* 6 */ 2, /* 7 */ 3, 1721 | /* 8 */ 1, /* 9 */ 2, /* a */ 2, /* b */ 3, 1722 | /* c */ 2, /* d */ 3, /* e */ 3, /* f */ 4, 1723 | 1724 | /* 0 */ 0, /* 1 */ 1, /* 2 */ 1, /* 3 */ 2, 1725 | /* 4 */ 1, /* 5 */ 2, /* 6 */ 2, /* 7 */ 3, 1726 | /* 8 */ 1, /* 9 */ 2, /* a */ 2, /* b */ 3, 1727 | /* c */ 2, /* d */ 3, /* e */ 3, /* f */ 4 1728 | ); 1729 | 1730 | const __m256i low_mask = _mm256_set1_epi8(0x0f); 1731 | 1732 | __m256i acc = _mm256_setzero_si256(); 1733 | 1734 | #define ITER { \ 1735 | const __m256i vec = _mm256_and_si256(_mm256_loadu_si256((const __m256i*)(data1 + i)), \ 1736 | _mm256_loadu_si256((const __m256i*)(data2 + i))); \ 1737 | const __m256i lo = _mm256_and_si256(vec, low_mask); \ 1738 | const __m256i hi = _mm256_and_si256(_mm256_srli_epi16(vec, 4), low_mask); \ 1739 | const __m256i popcnt1 = _mm256_shuffle_epi8(lookup, lo); \ 1740 | const __m256i popcnt2 = _mm256_shuffle_epi8(lookup, hi); \ 1741 | local = _mm256_add_epi8(local, popcnt1); \ 1742 | local = _mm256_add_epi8(local, popcnt2); \ 1743 | i += 32; \ 1744 | } 1745 | 1746 | while (i + 8*32 <= n) { 1747 | __m256i local = _mm256_setzero_si256(); 1748 | ITER ITER ITER ITER 1749 | ITER ITER ITER ITER 1750 | acc = _mm256_add_epi64(acc, _mm256_sad_epu8(local, _mm256_setzero_si256())); 1751 | } 1752 | 1753 | __m256i local = _mm256_setzero_si256(); 1754 | 1755 | while (i + 32 <= n) { 1756 | ITER; 1757 | } 1758 | 1759 | acc = _mm256_add_epi64(acc, _mm256_sad_epu8(local, _mm256_setzero_si256())); 1760 | 1761 | #undef ITER 1762 | 1763 | uint64_t result = 0; 1764 | 1765 | result += (uint64_t)(_mm256_extract_epi64(acc, 0)); 1766 | result += (uint64_t)(_mm256_extract_epi64(acc, 1)); 1767 | result += (uint64_t)(_mm256_extract_epi64(acc, 2)); 1768 | result += (uint64_t)(_mm256_extract_epi64(acc, 3)); 1769 | 1770 | for (/**/; i < n; ++i) { 1771 | result += STORM_popcnt_lookup8bit[data1[i] & data2[i]]; 1772 | } 1773 | 1774 | return result; 1775 | } 1776 | 1777 | // modified from https://github.com/WojciechMula/sse-popcount 1778 | STORM_TARGET("avx2") 1779 | static 1780 | uint64_t STORM_union_count_lookup_avx2_func(const uint8_t* STORM_RESTRICT data1, 1781 | const uint8_t* STORM_RESTRICT data2, 1782 | const size_t n) 1783 | { 1784 | 1785 | size_t i = 0; 1786 | 1787 | const __m256i lookup = _mm256_setr_epi8( 1788 | /* 0 */ 0, /* 1 */ 1, /* 2 */ 1, /* 3 
*/ 2, 1789 | /* 4 */ 1, /* 5 */ 2, /* 6 */ 2, /* 7 */ 3, 1790 | /* 8 */ 1, /* 9 */ 2, /* a */ 2, /* b */ 3, 1791 | /* c */ 2, /* d */ 3, /* e */ 3, /* f */ 4, 1792 | 1793 | /* 0 */ 0, /* 1 */ 1, /* 2 */ 1, /* 3 */ 2, 1794 | /* 4 */ 1, /* 5 */ 2, /* 6 */ 2, /* 7 */ 3, 1795 | /* 8 */ 1, /* 9 */ 2, /* a */ 2, /* b */ 3, 1796 | /* c */ 2, /* d */ 3, /* e */ 3, /* f */ 4 1797 | ); 1798 | 1799 | const __m256i low_mask = _mm256_set1_epi8(0x0f); 1800 | 1801 | __m256i acc = _mm256_setzero_si256(); 1802 | 1803 | #define ITER { \ 1804 | const __m256i vec = _mm256_or_si256(_mm256_loadu_si256((const __m256i*)(data1 + i)), \ 1805 | _mm256_loadu_si256((const __m256i*)(data2 + i))); \ 1806 | const __m256i lo = _mm256_and_si256(vec, low_mask); \ 1807 | const __m256i hi = _mm256_and_si256(_mm256_srli_epi16(vec, 4), low_mask); \ 1808 | const __m256i popcnt1 = _mm256_shuffle_epi8(lookup, lo); \ 1809 | const __m256i popcnt2 = _mm256_shuffle_epi8(lookup, hi); \ 1810 | local = _mm256_add_epi8(local, popcnt1); \ 1811 | local = _mm256_add_epi8(local, popcnt2); \ 1812 | i += 32; \ 1813 | } 1814 | 1815 | while (i + 8*32 <= n) { 1816 | __m256i local = _mm256_setzero_si256(); 1817 | ITER ITER ITER ITER 1818 | ITER ITER ITER ITER 1819 | acc = _mm256_add_epi64(acc, _mm256_sad_epu8(local, _mm256_setzero_si256())); 1820 | } 1821 | 1822 | __m256i local = _mm256_setzero_si256(); 1823 | 1824 | while (i + 32 <= n) { 1825 | ITER; 1826 | } 1827 | 1828 | acc = _mm256_add_epi64(acc, _mm256_sad_epu8(local, _mm256_setzero_si256())); 1829 | 1830 | #undef ITER 1831 | 1832 | uint64_t result = 0; 1833 | 1834 | result += (uint64_t)(_mm256_extract_epi64(acc, 0)); 1835 | result += (uint64_t)(_mm256_extract_epi64(acc, 1)); 1836 | result += (uint64_t)(_mm256_extract_epi64(acc, 2)); 1837 | result += (uint64_t)(_mm256_extract_epi64(acc, 3)); 1838 | 1839 | for (/**/; i < n; ++i) { 1840 | result += STORM_popcnt_lookup8bit[data1[i] | data2[i]]; 1841 | } 1842 | 1843 | return result; 1844 | } 1845 | 1846 | // modified from https://github.com/WojciechMula/sse-popcount 1847 | STORM_TARGET("avx2") 1848 | static 1849 | uint64_t STORM_diff_count_lookup_avx2_func(const uint8_t* STORM_RESTRICT data1, 1850 | const uint8_t* STORM_RESTRICT data2, 1851 | const size_t n) 1852 | { 1853 | 1854 | size_t i = 0; 1855 | 1856 | const __m256i lookup = _mm256_setr_epi8( 1857 | /* 0 */ 0, /* 1 */ 1, /* 2 */ 1, /* 3 */ 2, 1858 | /* 4 */ 1, /* 5 */ 2, /* 6 */ 2, /* 7 */ 3, 1859 | /* 8 */ 1, /* 9 */ 2, /* a */ 2, /* b */ 3, 1860 | /* c */ 2, /* d */ 3, /* e */ 3, /* f */ 4, 1861 | 1862 | /* 0 */ 0, /* 1 */ 1, /* 2 */ 1, /* 3 */ 2, 1863 | /* 4 */ 1, /* 5 */ 2, /* 6 */ 2, /* 7 */ 3, 1864 | /* 8 */ 1, /* 9 */ 2, /* a */ 2, /* b */ 3, 1865 | /* c */ 2, /* d */ 3, /* e */ 3, /* f */ 4 1866 | ); 1867 | 1868 | const __m256i low_mask = _mm256_set1_epi8(0x0f); 1869 | 1870 | __m256i acc = _mm256_setzero_si256(); 1871 | 1872 | #define ITER { \ 1873 | const __m256i vec = _mm256_xor_si256(_mm256_loadu_si256((const __m256i*)(data1 + i)), \ 1874 | _mm256_loadu_si256((const __m256i*)(data2 + i))); \ 1875 | const __m256i lo = _mm256_and_si256(vec, low_mask); \ 1876 | const __m256i hi = _mm256_and_si256(_mm256_srli_epi16(vec, 4), low_mask); \ 1877 | const __m256i popcnt1 = _mm256_shuffle_epi8(lookup, lo); \ 1878 | const __m256i popcnt2 = _mm256_shuffle_epi8(lookup, hi); \ 1879 | local = _mm256_add_epi8(local, popcnt1); \ 1880 | local = _mm256_add_epi8(local, popcnt2); \ 1881 | i += 32; \ 1882 | } 1883 | 1884 | while (i + 8*32 <= n) { 1885 | __m256i local = _mm256_setzero_si256(); 1886 | 
ITER ITER ITER ITER 1887 | ITER ITER ITER ITER 1888 | acc = _mm256_add_epi64(acc, _mm256_sad_epu8(local, _mm256_setzero_si256())); 1889 | } 1890 | 1891 | __m256i local = _mm256_setzero_si256(); 1892 | 1893 | while (i + 32 <= n) { 1894 | ITER; 1895 | } 1896 | 1897 | acc = _mm256_add_epi64(acc, _mm256_sad_epu8(local, _mm256_setzero_si256())); 1898 | 1899 | #undef ITER 1900 | 1901 | uint64_t result = 0; 1902 | 1903 | result += (uint64_t)(_mm256_extract_epi64(acc, 0)); 1904 | result += (uint64_t)(_mm256_extract_epi64(acc, 1)); 1905 | result += (uint64_t)(_mm256_extract_epi64(acc, 2)); 1906 | result += (uint64_t)(_mm256_extract_epi64(acc, 3)); 1907 | 1908 | for (/**/; i < n; ++i) { 1909 | result += STORM_popcnt_lookup8bit[data1[i] ^ data2[i]]; 1910 | } 1911 | 1912 | return result; 1913 | } 1914 | 1915 | STORM_TARGET("avx2") 1916 | static 1917 | uint64_t STORM_popcnt_csa_avx2(const __m256i* data, uint64_t size) 1918 | { 1919 | __m256i cnt = _mm256_setzero_si256(); 1920 | __m256i ones = _mm256_setzero_si256(); 1921 | __m256i twos = _mm256_setzero_si256(); 1922 | __m256i fours = _mm256_setzero_si256(); 1923 | __m256i eights = _mm256_setzero_si256(); 1924 | __m256i sixteens = _mm256_setzero_si256(); 1925 | __m256i twosA, twosB, foursA, foursB, eightsA, eightsB; 1926 | 1927 | uint64_t i = 0; 1928 | uint64_t limit = size - size % 16; 1929 | uint64_t* cnt64; 1930 | 1931 | #define LOAD(a) (_mm256_loadu_si256(&data[i+a])) 1932 | 1933 | for (/**/; i < limit; i += 16) { 1934 | STORM_CSA256(&twosA, &ones, ones, LOAD(0), LOAD(1)); 1935 | STORM_CSA256(&twosB, &ones, ones, LOAD(2), LOAD(3)); 1936 | STORM_CSA256(&foursA, &twos, twos, twosA, twosB); 1937 | STORM_CSA256(&twosA, &ones, ones, LOAD(4), LOAD(5)); 1938 | STORM_CSA256(&twosB, &ones, ones, LOAD(6), LOAD(7)); 1939 | STORM_CSA256(&foursB, &twos, twos, twosA, twosB); 1940 | STORM_CSA256(&eightsA, &fours, fours, foursA, foursB); 1941 | STORM_CSA256(&twosA, &ones, ones, LOAD(8), LOAD(9)); 1942 | STORM_CSA256(&twosB, &ones, ones, LOAD(10), LOAD(11)); 1943 | STORM_CSA256(&foursA, &twos, twos, twosA, twosB); 1944 | STORM_CSA256(&twosA, &ones, ones, LOAD(12), LOAD(13)); 1945 | STORM_CSA256(&twosB, &ones, ones, LOAD(14), LOAD(15)); 1946 | STORM_CSA256(&foursB, &twos, twos, twosA, twosB); 1947 | STORM_CSA256(&eightsB, &fours, fours, foursA, foursB); 1948 | STORM_CSA256(&sixteens, &eights, eights, eightsA, eightsB); 1949 | 1950 | cnt = _mm256_add_epi64(cnt, STORM_popcnt256(sixteens)); 1951 | } 1952 | #undef LOAD 1953 | 1954 | cnt = _mm256_slli_epi64(cnt, 4); 1955 | cnt = _mm256_add_epi64(cnt, _mm256_slli_epi64(STORM_popcnt256(eights), 3)); 1956 | cnt = _mm256_add_epi64(cnt, _mm256_slli_epi64(STORM_popcnt256(fours), 2)); 1957 | cnt = _mm256_add_epi64(cnt, _mm256_slli_epi64(STORM_popcnt256(twos), 1)); 1958 | cnt = _mm256_add_epi64(cnt, STORM_popcnt256(ones)); 1959 | 1960 | for (/**/; i < size; ++i) 1961 | cnt = _mm256_add_epi64(cnt, STORM_popcnt256(data[i])); 1962 | 1963 | cnt64 = (uint64_t*) &cnt; 1964 | 1965 | return cnt64[0] + 1966 | cnt64[1] + 1967 | cnt64[2] + 1968 | cnt64[3]; 1969 | } 1970 | 1971 | 1972 | /* 1973 | * AVX2 Harley-Seal popcount (4th iteration). 1974 | * The algorithm is based on the paper "Faster Population Counts 1975 | * using AVX2 Instructions" by Daniel Lemire, Nathan Kurz and 1976 | * Wojciech Mula (23 Nov 2016). 1977 | * @see https://arxiv.org/abs/1611.07612 1978 | */ 1979 | // In this version we perform the operation A&B as input into the CSA operator. 
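// For reference: STORM_CSA256 (and STORM_CSA512 below) implement a bitwise
// full adder (carry-save adder). For bit-vectors a, b, c it yields, per lane,
//     sum   = a ^ b ^ c                 (bits of weight 1)
//     carry = (a & b) | ((a ^ b) & c)   (bits of weight 2; the majority function)
// so ones/twos/fours/eights/sixteens hold bits of increasing weight, and the
// epilogue reconstructs the total as
//     16*(accumulated popcnt(sixteens)) + 8*popcnt(eights) + 4*popcnt(fours)
//       + 2*popcnt(twos) + popcnt(ones).
// A minimal scalar model of one CSA level (an illustrative sketch only; it is
// not referenced by any dispatch code in this header):
STORM_FORCE_INLINE
uint64_t STORM_popcnt_csa_u64_sketch(const uint64_t* data, size_t size) {
    uint64_t ones = 0, cnt = 0;
    size_t i = 0;
    for (/**/; i + 2 <= size; i += 2) {
        const uint64_t u     = ones ^ data[i];
        const uint64_t carry = (ones & data[i]) | (u & data[i+1]); // weight-2 bits
        ones = u ^ data[i+1];                                      // weight-1 bits
        cnt += 2 * STORM_popcount64(carry);
    }
    for (/**/; i < size; ++i)
        cnt += STORM_popcount64(data[i]);
    return cnt + STORM_popcount64(ones);
}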
1980 | STORM_TARGET("avx2") 1981 | static 1982 | uint64_t STORM_intersect_count_csa_avx2(const __m256i* STORM_RESTRICT data1, 1983 | const __m256i* STORM_RESTRICT data2, 1984 | size_t size) 1985 | { 1986 | __m256i cnt = _mm256_setzero_si256(); 1987 | __m256i ones = _mm256_setzero_si256(); 1988 | __m256i twos = _mm256_setzero_si256(); 1989 | __m256i fours = _mm256_setzero_si256(); 1990 | __m256i eights = _mm256_setzero_si256(); 1991 | __m256i sixteens = _mm256_setzero_si256(); 1992 | __m256i twosA, twosB, foursA, foursB, eightsA, eightsB; 1993 | 1994 | uint64_t i = 0; 1995 | uint64_t limit = size - size % 16; 1996 | uint64_t* cnt64; 1997 | 1998 | #define LOAD(a) (_mm256_loadu_si256(&data1[i+a]) & _mm256_loadu_si256(&data2[i+a])) 1999 | 2000 | for (/**/; i < limit; i += 16) { 2001 | STORM_CSA256(&twosA, &ones, ones, LOAD(0), LOAD(1)); 2002 | STORM_CSA256(&twosB, &ones, ones, LOAD(2), LOAD(3)); 2003 | STORM_CSA256(&foursA, &twos, twos, twosA, twosB); 2004 | STORM_CSA256(&twosA, &ones, ones, LOAD(4), LOAD(5)); 2005 | STORM_CSA256(&twosB, &ones, ones, LOAD(6), LOAD(7)); 2006 | STORM_CSA256(&foursB, &twos, twos, twosA, twosB); 2007 | STORM_CSA256(&eightsA, &fours, fours, foursA, foursB); 2008 | STORM_CSA256(&twosA, &ones, ones, LOAD(8), LOAD(9)); 2009 | STORM_CSA256(&twosB, &ones, ones, LOAD(10), LOAD(11)); 2010 | STORM_CSA256(&foursA, &twos, twos, twosA, twosB); 2011 | STORM_CSA256(&twosA, &ones, ones, LOAD(12), LOAD(13)); 2012 | STORM_CSA256(&twosB, &ones, ones, LOAD(14), LOAD(15)); 2013 | STORM_CSA256(&foursB, &twos, twos, twosA, twosB); 2014 | STORM_CSA256(&eightsB, &fours, fours, foursA, foursB); 2015 | STORM_CSA256(&sixteens,&eights, eights,eightsA,eightsB); 2016 | 2017 | cnt = _mm256_add_epi64(cnt, STORM_popcnt256(sixteens)); 2018 | } 2019 | #undef LOAD 2020 | 2021 | cnt = _mm256_slli_epi64(cnt, 4); 2022 | cnt = _mm256_add_epi64(cnt, _mm256_slli_epi64(STORM_popcnt256(eights), 3)); 2023 | cnt = _mm256_add_epi64(cnt, _mm256_slli_epi64(STORM_popcnt256(fours), 2)); 2024 | cnt = _mm256_add_epi64(cnt, _mm256_slli_epi64(STORM_popcnt256(twos), 1)); 2025 | cnt = _mm256_add_epi64(cnt, STORM_popcnt256(ones)); 2026 | 2027 | for (/**/; i < size; ++i) 2028 | cnt = _mm256_add_epi64(cnt, STORM_popcnt256(_mm256_loadu_si256(&data1[i]) & _mm256_loadu_si256(&data2[i]))); 2029 | 2030 | cnt64 = (uint64_t*) &cnt; 2031 | 2032 | return cnt64[0] + 2033 | cnt64[1] + 2034 | cnt64[2] + 2035 | cnt64[3]; 2036 | } 2037 | 2038 | // In this version we perform the operation A|B as input into the CSA operator. 
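// Fusing the set operation into LOAD means every pair of 256-bit blocks is
// read exactly once and the combined vector never touches memory again;
// materializing A op B into a temporary buffer and popcounting it in a second
// pass would roughly double the memory traffic of these bandwidth-bound loops.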
2039 | STORM_TARGET("avx2") 2040 | static 2041 | uint64_t STORM_union_count_csa_avx2(const __m256i* STORM_RESTRICT data1, 2042 | const __m256i* STORM_RESTRICT data2, 2043 | size_t size) 2044 | { 2045 | __m256i cnt = _mm256_setzero_si256(); 2046 | __m256i ones = _mm256_setzero_si256(); 2047 | __m256i twos = _mm256_setzero_si256(); 2048 | __m256i fours = _mm256_setzero_si256(); 2049 | __m256i eights = _mm256_setzero_si256(); 2050 | __m256i sixteens = _mm256_setzero_si256(); 2051 | __m256i twosA, twosB, foursA, foursB, eightsA, eightsB; 2052 | 2053 | uint64_t i = 0; 2054 | uint64_t limit = size - size % 16; 2055 | uint64_t* cnt64; 2056 | 2057 | #define LOAD(a) (_mm256_loadu_si256(&data1[i+a]) | _mm256_loadu_si256(&data2[i+a])) 2058 | 2059 | for (/**/; i < limit; i += 16) { 2060 | STORM_CSA256(&twosA, &ones, ones, LOAD(0), LOAD(1)); 2061 | STORM_CSA256(&twosB, &ones, ones, LOAD(2), LOAD(3)); 2062 | STORM_CSA256(&foursA, &twos, twos, twosA, twosB); 2063 | STORM_CSA256(&twosA, &ones, ones, LOAD(4), LOAD(5)); 2064 | STORM_CSA256(&twosB, &ones, ones, LOAD(6), LOAD(7)); 2065 | STORM_CSA256(&foursB, &twos, twos, twosA, twosB); 2066 | STORM_CSA256(&eightsA, &fours, fours, foursA, foursB); 2067 | STORM_CSA256(&twosA, &ones, ones, LOAD(8), LOAD(9)); 2068 | STORM_CSA256(&twosB, &ones, ones, LOAD(10), LOAD(11)); 2069 | STORM_CSA256(&foursA, &twos, twos, twosA, twosB); 2070 | STORM_CSA256(&twosA, &ones, ones, LOAD(12), LOAD(13)); 2071 | STORM_CSA256(&twosB, &ones, ones, LOAD(14), LOAD(15)); 2072 | STORM_CSA256(&foursB, &twos, twos, twosA, twosB); 2073 | STORM_CSA256(&eightsB, &fours, fours, foursA, foursB); 2074 | STORM_CSA256(&sixteens,&eights, eights,eightsA,eightsB); 2075 | 2076 | cnt = _mm256_add_epi64(cnt, STORM_popcnt256(sixteens)); 2077 | } 2078 | #undef LOAD 2079 | 2080 | cnt = _mm256_slli_epi64(cnt, 4); 2081 | cnt = _mm256_add_epi64(cnt, _mm256_slli_epi64(STORM_popcnt256(eights), 3)); 2082 | cnt = _mm256_add_epi64(cnt, _mm256_slli_epi64(STORM_popcnt256(fours), 2)); 2083 | cnt = _mm256_add_epi64(cnt, _mm256_slli_epi64(STORM_popcnt256(twos), 1)); 2084 | cnt = _mm256_add_epi64(cnt, STORM_popcnt256(ones)); 2085 | 2086 | for (/**/; i < size; ++i) 2087 | cnt = _mm256_add_epi64(cnt, STORM_popcnt256(_mm256_loadu_si256(&data1[i]) | _mm256_loadu_si256(&data2[i]))); 2088 | 2089 | cnt64 = (uint64_t*) &cnt; 2090 | 2091 | return cnt64[0] + 2092 | cnt64[1] + 2093 | cnt64[2] + 2094 | cnt64[3]; 2095 | } 2096 | 2097 | // In this version we perform the operation A^B as input into the CSA operator. 
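// popcnt(A ^ B) is the Hamming distance between the two bitmaps. When several
// of these counts are needed for the same pair, the identities
//     popcnt(A | B) = popcnt(A) + popcnt(B) - popcnt(A & B)
//     popcnt(A ^ B) = popcnt(A) + popcnt(B) - 2*popcnt(A & B)
// let one fused pass plus the per-bitmap popcounts stand in for extra passes.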
2098 | STORM_TARGET("avx2") 2099 | static 2100 | uint64_t STORM_diff_count_csa_avx2(const __m256i* STORM_RESTRICT data1, 2101 | const __m256i* STORM_RESTRICT data2, 2102 | size_t size) 2103 | { 2104 | __m256i cnt = _mm256_setzero_si256(); 2105 | __m256i ones = _mm256_setzero_si256(); 2106 | __m256i twos = _mm256_setzero_si256(); 2107 | __m256i fours = _mm256_setzero_si256(); 2108 | __m256i eights = _mm256_setzero_si256(); 2109 | __m256i sixteens = _mm256_setzero_si256(); 2110 | __m256i twosA, twosB, foursA, foursB, eightsA, eightsB; 2111 | 2112 | uint64_t i = 0; 2113 | uint64_t limit = size - size % 16; 2114 | uint64_t* cnt64; 2115 | 2116 | #define LOAD(a) (_mm256_loadu_si256(&data1[i+a]) ^ _mm256_loadu_si256(&data2[i+a])) 2117 | 2118 | for (/**/; i < limit; i += 16) { 2119 | STORM_CSA256(&twosA, &ones, ones, LOAD(0), LOAD(1)); 2120 | STORM_CSA256(&twosB, &ones, ones, LOAD(2), LOAD(3)); 2121 | STORM_CSA256(&foursA, &twos, twos, twosA, twosB); 2122 | STORM_CSA256(&twosA, &ones, ones, LOAD(4), LOAD(5)); 2123 | STORM_CSA256(&twosB, &ones, ones, LOAD(6), LOAD(7)); 2124 | STORM_CSA256(&foursB, &twos, twos, twosA, twosB); 2125 | STORM_CSA256(&eightsA, &fours, fours, foursA, foursB); 2126 | STORM_CSA256(&twosA, &ones, ones, LOAD(8), LOAD(9)); 2127 | STORM_CSA256(&twosB, &ones, ones, LOAD(10), LOAD(11)); 2128 | STORM_CSA256(&foursA, &twos, twos, twosA, twosB); 2129 | STORM_CSA256(&twosA, &ones, ones, LOAD(12), LOAD(13)); 2130 | STORM_CSA256(&twosB, &ones, ones, LOAD(14), LOAD(15)); 2131 | STORM_CSA256(&foursB, &twos, twos, twosA, twosB); 2132 | STORM_CSA256(&eightsB, &fours, fours, foursA, foursB); 2133 | STORM_CSA256(&sixteens,&eights, eights,eightsA,eightsB); 2134 | 2135 | cnt = _mm256_add_epi64(cnt, STORM_popcnt256(sixteens)); 2136 | } 2137 | #undef LOAD 2138 | 2139 | cnt = _mm256_slli_epi64(cnt, 4); 2140 | cnt = _mm256_add_epi64(cnt, _mm256_slli_epi64(STORM_popcnt256(eights), 3)); 2141 | cnt = _mm256_add_epi64(cnt, _mm256_slli_epi64(STORM_popcnt256(fours), 2)); 2142 | cnt = _mm256_add_epi64(cnt, _mm256_slli_epi64(STORM_popcnt256(twos), 1)); 2143 | cnt = _mm256_add_epi64(cnt, STORM_popcnt256(ones)); 2144 | 2145 | for (/**/; i < size; ++i) 2146 | cnt = _mm256_add_epi64(cnt, STORM_popcnt256(_mm256_loadu_si256(&data1[i]) ^ _mm256_loadu_si256(&data2[i]))); 2147 | 2148 | cnt64 = (uint64_t*) &cnt; 2149 | 2150 | return cnt64[0] + 2151 | cnt64[1] + 2152 | cnt64[2] + 2153 | cnt64[3]; 2154 | } 2155 | 2156 | STORM_TARGET("avx2") 2157 | static 2158 | uint64_t STORM_intersect_count_avx2(const uint64_t* STORM_RESTRICT b1, 2159 | const uint64_t* STORM_RESTRICT b2, 2160 | const size_t n_ints) 2161 | { 2162 | uint64_t count = 0; 2163 | const __m256i* r1 = (__m256i*)b1; 2164 | const __m256i* r2 = (__m256i*)b2; 2165 | const uint32_t n_cycles = n_ints / 4; 2166 | 2167 | count += STORM_intersect_count_csa_avx2(r1, r2, n_cycles); 2168 | 2169 | for (int i = n_cycles*4; i < n_ints; ++i) { 2170 | count += STORM_POPCOUNT(b1[i] & b2[i]); 2171 | } 2172 | 2173 | return(count); 2174 | } 2175 | 2176 | STORM_TARGET("avx2") 2177 | static 2178 | uint64_t STORM_union_count_avx2(const uint64_t* STORM_RESTRICT b1, 2179 | const uint64_t* STORM_RESTRICT b2, 2180 | const size_t n_ints) 2181 | { 2182 | uint64_t count = 0; 2183 | const __m256i* r1 = (__m256i*)b1; 2184 | const __m256i* r2 = (__m256i*)b2; 2185 | const uint32_t n_cycles = n_ints / 4; 2186 | 2187 | count += STORM_union_count_csa_avx2(r1, r2, n_cycles); 2188 | 2189 | for (int i = n_cycles*4; i < n_ints; ++i) { 2190 | count += STORM_POPCOUNT(b1[i] | b2[i]); 2191 | } 2192 | 
2193 | return(count);
2194 | }
2195 | 
2196 | STORM_TARGET("avx2")
2197 | static
2198 | uint64_t STORM_diff_count_avx2(const uint64_t* STORM_RESTRICT b1,
2199 | const uint64_t* STORM_RESTRICT b2,
2200 | const size_t n_ints)
2201 | {
2202 | uint64_t count = 0;
2203 | const __m256i* r1 = (__m256i*)b1;
2204 | const __m256i* r2 = (__m256i*)b2;
2205 | const uint32_t n_cycles = n_ints / 4;
2206 | 
2207 | count += STORM_diff_count_csa_avx2(r1, r2, n_cycles);
2208 | 
2209 | for (int i = n_cycles*4; i < n_ints; ++i) {
2210 | count += STORM_POPCOUNT(b1[i] ^ b2[i]);
2211 | }
2212 | 
2213 | return(count);
2214 | }
2215 | 
2216 | STORM_TARGET("avx2")
2217 | static
2218 | uint64_t STORM_intersect_count_lookup_avx2(const uint64_t* STORM_RESTRICT b1,
2219 | const uint64_t* STORM_RESTRICT b2,
2220 | const size_t n_ints)
2221 | {
2222 | return STORM_intersect_count_lookup_avx2_func((uint8_t*)b1, (uint8_t*)b2, n_ints*8);
2223 | }
2224 | 
2225 | STORM_TARGET("avx2")
2226 | static
2227 | uint64_t STORM_union_count_lookup_avx2(const uint64_t* STORM_RESTRICT b1,
2228 | const uint64_t* STORM_RESTRICT b2,
2229 | const size_t n_ints)
2230 | {
2231 | return STORM_union_count_lookup_avx2_func((uint8_t*)b1, (uint8_t*)b2, n_ints*8);
2232 | }
2233 | 
2234 | STORM_TARGET("avx2")
2235 | static
2236 | uint64_t STORM_diff_count_lookup_avx2(const uint64_t* STORM_RESTRICT b1,
2237 | const uint64_t* STORM_RESTRICT b2,
2238 | const size_t n_ints)
2239 | {
2240 | return STORM_diff_count_lookup_avx2_func((uint8_t*)b1, (uint8_t*)b2, n_ints*8);
2241 | }
2242 | 
2243 | STORM_TARGET("avx2")
2244 | static
2245 | uint64_t STORM_popcnt_avx2(const uint64_t* data,
2246 | const size_t n_ints)
2247 | {
2248 | uint64_t count = 0;
2249 | const uint32_t n_cycles = n_ints / 4;
2250 | const uint32_t n_cycles_sse = (n_ints % 4) / 2;
2251 | 
2252 | const __m256i* r1 = (__m256i*)&data[0];
2253 | const __m128i* r2 = (__m128i*)&data[n_cycles*4];
2254 | 
2255 | count += STORM_popcnt_csa_avx2(r1, n_cycles);
2256 | count += STORM_popcnt_csa_sse4(r2, n_cycles_sse);
2257 | 
2258 | for (int i = (4*n_cycles + 2*n_cycles_sse); i < n_ints; ++i) {
2259 | count += STORM_POPCOUNT(data[i]);
2260 | }
2261 | 
2262 | return count;
2263 | }
2264 | #endif
2265 | 
2266 | /****************************
2267 |  * AVX512BW functions
2268 |  ****************************/
2269 | 
2270 | #if defined(STORM_HAVE_AVX512)
2271 | 
2272 | #include <immintrin.h>
2273 | 
2274 | STORM_TARGET("avx512bw")
2275 | STORM_FORCE_INLINE
2276 | __m512i STORM_popcnt512(__m512i v) {
2277 | __m512i m1 = _mm512_set1_epi8(0x55);
2278 | __m512i m2 = _mm512_set1_epi8(0x33);
2279 | __m512i m4 = _mm512_set1_epi8(0x0F);
2280 | __m512i t1 = _mm512_sub_epi8(v, (_mm512_srli_epi16(v, 1) & m1));
2281 | __m512i t2 = _mm512_add_epi8(t1 & m2, (_mm512_srli_epi16(t1, 2) & m2));
2282 | __m512i t3 = _mm512_add_epi8(t2, _mm512_srli_epi16(t2, 4)) & m4;
2283 | 
2284 | return _mm512_sad_epu8(t3, _mm512_setzero_si512());
2285 | }
2286 | 
2287 | STORM_TARGET("avx512bw")
2288 | STORM_FORCE_INLINE
2289 | void STORM_CSA512(__m512i* h, __m512i* l, __m512i a, __m512i b, __m512i c) {
2290 | *l = _mm512_ternarylogic_epi32(c, b, a, 0x96);
2291 | *h = _mm512_ternarylogic_epi32(c, b, a, 0xe8);
2292 | }
2293 | 
2294 | // By Wojciech Muła
2295 | // @see https://github.com/WojciechMula/sse-popcount/blob/master/popcnt-avx512-harley-seal.cpp#L3
2296 | // @see https://arxiv.org/abs/1611.07612
2297 | STORM_TARGET("avx512bw")
2298 | STORM_FORCE_INLINE
2299 | __m512i STORM_avx512_popcount(const __m512i v) {
2300 | const __m512i m1 = _mm512_set1_epi8(0x55); // 01010101
2301 | const __m512i m2 = _mm512_set1_epi8(0x33); // 00110011 2302 | const __m512i m4 = _mm512_set1_epi8(0x0F); // 00001111 2303 | 2304 | const __m512i t1 = _mm512_sub_epi8(v, (_mm512_srli_epi16(v, 1) & m1)); 2305 | const __m512i t2 = _mm512_add_epi8(t1 & m2, (_mm512_srli_epi16(t1, 2) & m2)); 2306 | const __m512i t3 = _mm512_add_epi8(t2, _mm512_srli_epi16(t2, 4)) & m4; 2307 | return _mm512_sad_epu8(t3, _mm512_setzero_si512()); 2308 | } 2309 | 2310 | // 512i-version of carry-save adder subroutine. 2311 | STORM_TARGET("avx512bw") 2312 | STORM_FORCE_INLINE 2313 | void STORM_pospopcnt_csa_avx512(__m512i* STORM_RESTRICT h, 2314 | __m512i* STORM_RESTRICT l, 2315 | __m512i b, __m512i c) 2316 | { 2317 | *h = _mm512_ternarylogic_epi32(c, b, *l, 0xE8); // 11101000 2318 | *l = _mm512_ternarylogic_epi32(c, b, *l, 0x96); // 10010110 2319 | } 2320 | 2321 | STORM_TARGET("avx512bw") 2322 | static 2323 | uint64_t STORM_popcnt_csa_avx512bw(const __m512i* STORM_RESTRICT data, size_t size) 2324 | { 2325 | __m512i cnt = _mm512_setzero_si512(); 2326 | __m512i ones = _mm512_setzero_si512(); 2327 | __m512i twos = _mm512_setzero_si512(); 2328 | __m512i fours = _mm512_setzero_si512(); 2329 | __m512i eights = _mm512_setzero_si512(); 2330 | __m512i sixteens = _mm512_setzero_si512(); 2331 | __m512i twosA, twosB, foursA, foursB, eightsA, eightsB; 2332 | 2333 | uint64_t i = 0; 2334 | uint64_t limit = size - size % 16; 2335 | uint64_t* cnt64; 2336 | 2337 | #define LOAD(a) (_mm512_loadu_si512(&data[i+a])) 2338 | 2339 | for (/**/; i < limit; i += 16) { 2340 | STORM_CSA512(&twosA, &ones, ones, LOAD(0), LOAD(1)); 2341 | STORM_CSA512(&twosB, &ones, ones, LOAD(2), LOAD(3)); 2342 | STORM_CSA512(&foursA, &twos, twos, twosA, twosB); 2343 | STORM_CSA512(&twosA, &ones, ones, LOAD(4), LOAD(5)); 2344 | STORM_CSA512(&twosB, &ones, ones, LOAD(6), LOAD(7)); 2345 | STORM_CSA512(&foursB, &twos, twos, twosA, twosB); 2346 | STORM_CSA512(&eightsA, &fours, fours, foursA, foursB); 2347 | STORM_CSA512(&twosA, &ones, ones, LOAD(8), LOAD(9)); 2348 | STORM_CSA512(&twosB, &ones, ones, LOAD(10), LOAD(11)); 2349 | STORM_CSA512(&foursA, &twos, twos, twosA, twosB); 2350 | STORM_CSA512(&twosA, &ones, ones, LOAD(12), LOAD(13)); 2351 | STORM_CSA512(&twosB, &ones, ones, LOAD(14), LOAD(15)); 2352 | STORM_CSA512(&foursB, &twos, twos, twosA, twosB); 2353 | STORM_CSA512(&eightsB, &fours, fours, foursA, foursB); 2354 | STORM_CSA512(&sixteens,&eights, eights,eightsA,eightsB); 2355 | 2356 | cnt = _mm512_add_epi64(cnt, STORM_popcnt512(sixteens)); 2357 | } 2358 | #undef LOAD 2359 | 2360 | cnt = _mm512_slli_epi64(cnt, 4); 2361 | cnt = _mm512_add_epi64(cnt, _mm512_slli_epi64(STORM_popcnt512(eights), 3)); 2362 | cnt = _mm512_add_epi64(cnt, _mm512_slli_epi64(STORM_popcnt512(fours), 2)); 2363 | cnt = _mm512_add_epi64(cnt, _mm512_slli_epi64(STORM_popcnt512(twos), 1)); 2364 | cnt = _mm512_add_epi64(cnt, STORM_popcnt512(ones)); 2365 | 2366 | for (/**/; i < size; ++i) 2367 | cnt = _mm512_add_epi64(cnt, STORM_popcnt512(_mm512_loadu_si512(&data[i]))); 2368 | 2369 | cnt64 = (uint64_t*)&cnt; 2370 | 2371 | return cnt64[0] + 2372 | cnt64[1] + 2373 | cnt64[2] + 2374 | cnt64[3] + 2375 | cnt64[4] + 2376 | cnt64[5] + 2377 | cnt64[6] + 2378 | cnt64[7]; 2379 | } 2380 | 2381 | STORM_TARGET("avx512bw") 2382 | static 2383 | int STORM_pospopcnt_u16_avx512bw_harvey_seal(const uint16_t* array, size_t len, uint32_t* out) { 2384 | for (uint32_t i = len - (len % (32 * 16)); i < len; ++i) { 2385 | for (int j = 0; j < 16; ++j) { 2386 | out[j] += ((array[i] & (1 << j)) >> j); 2387 | } 2388 | } 
2389 | 2390 | const __m512i* data = (const __m512i*)array; 2391 | __m512i v1 = _mm512_setzero_si512(); 2392 | __m512i v2 = _mm512_setzero_si512(); 2393 | __m512i v4 = _mm512_setzero_si512(); 2394 | __m512i v8 = _mm512_setzero_si512(); 2395 | __m512i v16 = _mm512_setzero_si512(); 2396 | __m512i twosA, twosB, foursA, foursB, eightsA, eightsB; 2397 | __m512i one = _mm512_set1_epi16(1); 2398 | __m512i counter[16]; 2399 | 2400 | const size_t size = len / 32; 2401 | const uint64_t limit = size - size % 16; 2402 | 2403 | uint16_t buffer[32]; 2404 | 2405 | uint64_t i = 0; 2406 | while (i < limit) { 2407 | for (size_t i = 0; i < 16; ++i) 2408 | counter[i] = _mm512_setzero_si512(); 2409 | 2410 | size_t thislimit = limit; 2411 | if (thislimit - i >= (1 << 16)) 2412 | thislimit = i + (1 << 16) - 1; 2413 | 2414 | for (/**/; i < thislimit; i += 16) { 2415 | #define U(pos) { \ 2416 | counter[pos] = _mm512_add_epi16(counter[pos], _mm512_and_si512(v16, _mm512_set1_epi16(1))); \ 2417 | v16 = _mm512_srli_epi16(v16, 1); \ 2418 | } 2419 | STORM_pospopcnt_csa_avx512(&twosA, &v1, _mm512_loadu_si512(data + i + 0), _mm512_loadu_si512(data + i + 1)); 2420 | STORM_pospopcnt_csa_avx512(&twosB, &v1, _mm512_loadu_si512(data + i + 2), _mm512_loadu_si512(data + i + 3)); 2421 | STORM_pospopcnt_csa_avx512(&foursA, &v2, twosA, twosB); 2422 | STORM_pospopcnt_csa_avx512(&twosA, &v1, _mm512_loadu_si512(data + i + 4), _mm512_loadu_si512(data + i + 5)); 2423 | STORM_pospopcnt_csa_avx512(&twosB, &v1, _mm512_loadu_si512(data + i + 6), _mm512_loadu_si512(data + i + 7)); 2424 | STORM_pospopcnt_csa_avx512(&foursB, &v2, twosA, twosB); 2425 | STORM_pospopcnt_csa_avx512(&eightsA, &v4, foursA, foursB); 2426 | STORM_pospopcnt_csa_avx512(&twosA, &v1, _mm512_loadu_si512(data + i + 8), _mm512_loadu_si512(data + i + 9)); 2427 | STORM_pospopcnt_csa_avx512(&twosB, &v1, _mm512_loadu_si512(data + i + 10), _mm512_loadu_si512(data + i + 11)); 2428 | STORM_pospopcnt_csa_avx512(&foursA, &v2, twosA, twosB); 2429 | STORM_pospopcnt_csa_avx512(&twosA, &v1, _mm512_loadu_si512(data + i + 12), _mm512_loadu_si512(data + i + 13)); 2430 | STORM_pospopcnt_csa_avx512(&twosB, &v1, _mm512_loadu_si512(data + i + 14), _mm512_loadu_si512(data + i + 15)); 2431 | STORM_pospopcnt_csa_avx512(&foursB, &v2, twosA, twosB); 2432 | STORM_pospopcnt_csa_avx512(&eightsB, &v4, foursA, foursB); 2433 | U(0) U(1) U(2) U(3) U(4) U(5) U(6) U(7) U(8) U(9) U(10) U(11) U(12) U(13) U(14) U(15) // Updates 2434 | STORM_pospopcnt_csa_avx512(&v16, &v8, eightsA, eightsB); 2435 | } 2436 | // Update the counters after the last iteration. 
2437 | for (size_t i = 0; i < 16; ++i) U(i) 2438 | #undef U 2439 | 2440 | for (size_t i = 0; i < 16; ++i) { 2441 | _mm512_storeu_si512((__m512i*)buffer, counter[i]); 2442 | for (size_t z = 0; z < 32; z++) { 2443 | out[i] += 16 * (uint32_t)buffer[z]; 2444 | } 2445 | } 2446 | } 2447 | 2448 | _mm512_storeu_si512((__m512i*)buffer, v1); 2449 | for (size_t i = 0; i < 32; ++i) { 2450 | for (int j = 0; j < 16; ++j) { 2451 | out[j] += 1 * ((buffer[i] & (1 << j)) >> j); 2452 | } 2453 | } 2454 | 2455 | _mm512_storeu_si512((__m512i*)buffer, v2); 2456 | for (size_t i = 0; i < 32; ++i) { 2457 | for (int j = 0; j < 16; ++j) { 2458 | out[j] += 2 * ((buffer[i] & (1 << j)) >> j); 2459 | } 2460 | } 2461 | 2462 | _mm512_storeu_si512((__m512i*)buffer, v4); 2463 | for (size_t i = 0; i < 32; ++i) { 2464 | for (int j = 0; j < 16; ++j) { 2465 | out[j] += 4 * ((buffer[i] & (1 << j)) >> j); 2466 | } 2467 | } 2468 | 2469 | _mm512_storeu_si512((__m512i*)buffer, v8); 2470 | for (size_t i = 0; i < 32; ++i) { 2471 | for (int j = 0; j < 16; ++j) { 2472 | out[j] += 8 * ((buffer[i] & (1 << j)) >> j); 2473 | } 2474 | } 2475 | 2476 | return 0; 2477 | } 2478 | 2479 | STORM_TARGET("avx512bw") 2480 | static 2481 | int STORM_pospopcnt_u16_avx512bw_blend_popcnt_unroll8(const uint16_t* data, size_t len, uint32_t* out) { 2482 | #define AND_OR 0xea // ternary function: (a & b) | c 2483 | const __m512i* data_vectors = (const __m512i*)(data); 2484 | const uint32_t n_cycles = len / 32; 2485 | 2486 | size_t i = 0; 2487 | for (/**/; i + 8 <= n_cycles; i += 8) { 2488 | #define L(p) __m512i v##p = _mm512_loadu_si512(data_vectors+i+p); 2489 | L(0) L(1) L(2) L(3) 2490 | L(4) L(5) L(6) L(7) 2491 | 2492 | #define U0(p,k) __m512i input##p = _mm512_ternarylogic_epi32(v##p, _mm512_set1_epi16(0x00FF), _mm512_slli_epi16(v##k, 8), AND_OR); 2493 | #define U1(p,k) __m512i input##k = _mm512_ternarylogic_epi32(v##p, _mm512_set1_epi16(0xFF00), _mm512_srli_epi16(v##k, 8), AND_OR); 2494 | #define U(p, k) U0(p,k) U1(p,k) 2495 | 2496 | U(0,1) U( 2, 3) U( 4, 5) U( 6, 7) 2497 | 2498 | for (int i = 0; i < 8; ++i) { 2499 | #define A0(p) out[ 7 - i] += _mm_popcnt_u64(_mm512_movepi8_mask(input##p)); 2500 | #define A1(k) out[15 - i] += _mm_popcnt_u64(_mm512_movepi8_mask(input##k)); 2501 | #define A(p, k) A0(p) A1(k) 2502 | A(0,1) A(2, 3) A(4,5) A(6, 7) 2503 | 2504 | #define P0(p) input##p = _mm512_add_epi8(input##p, input##p); 2505 | #define P(p, k) input##p = P0(p) P0(k) 2506 | 2507 | P(0,1) P(2, 3) P(4,5) P(6, 7) 2508 | } 2509 | } 2510 | 2511 | for (/**/; i + 4 <= n_cycles; i += 4) { 2512 | L(0) L(1) L(2) L(3) 2513 | U(0,1) U(2,3) 2514 | 2515 | for (int i = 0; i < 8; ++i) { 2516 | A(0,1) A(2, 3) 2517 | P(0,1) P(2, 3) 2518 | } 2519 | } 2520 | 2521 | for (/**/; i + 2 <= n_cycles; i += 2) { 2522 | L(0) L(1) 2523 | U(0,1) 2524 | 2525 | for (int i = 0; i < 8; ++i) { 2526 | A(0,1) 2527 | P(0,1) 2528 | } 2529 | } 2530 | 2531 | i *= 32; 2532 | for (/**/; i < len; ++i) { 2533 | for (int j = 0; j < 16; ++j) { 2534 | out[j] += ((data[i] & (1 << j)) >> j); 2535 | } 2536 | } 2537 | 2538 | #undef L 2539 | #undef U0 2540 | #undef U1 2541 | #undef U 2542 | #undef A0 2543 | #undef A1 2544 | #undef A 2545 | #undef P0 2546 | #undef P 2547 | #undef AND_OR 2548 | 2549 | return 0; 2550 | } 2551 | 2552 | STORM_TARGET("avx512bw") 2553 | static 2554 | int STORM_pospopcnt_u16_avx512bw_adder_forest(const uint16_t* array, size_t len, uint32_t* out) { 2555 | __m512i counters[16]; 2556 | 2557 | for (size_t i = 0; i < 16; ++i) { 2558 | counters[i] = _mm512_setzero_si512(); 2559 | } 2560 | 2561 
| const __m512i mask1bit = _mm512_set1_epi16(0x5555); // 0101010101010101 Pattern: 01 2562 | const __m512i mask2bit = _mm512_set1_epi16(0x3333); // 0011001100110011 Pattern: 0011 2563 | const __m512i mask4bit = _mm512_set1_epi16(0x0F0F); // 0000111100001111 Pattern: 00001111 2564 | const __m512i mask8bit = _mm512_set1_epi16(0x00FF); // 0000000011111111 Pattern: 0000000011111111 2565 | 2566 | const uint32_t n_cycles = len / (2048 * (16*32)); 2567 | const uint32_t n_total = len / (16*32); 2568 | uint16_t tmp[32]; 2569 | 2570 | /*------ Macros --------*/ 2571 | #define LE(i,p,k) const __m512i sum##p##k##_##i##bit_even = _mm512_add_epi8(input##p & mask##i##bit, input##k & mask##i##bit); 2572 | #define LO(i,p,k) const __m512i sum##p##k##_##i##bit_odd = _mm512_add_epi8(_mm512_srli_epi16(input##p, i) & mask##i##bit, _mm512_srli_epi16(input##k, i) & mask##i##bit); 2573 | 2574 | #define LBLOCK(i) \ 2575 | LE(i,0,1) LO(i,0,1) \ 2576 | LE(i,2,3) LO(i,2,3) \ 2577 | LE(i,4,5) LO(i,4,5) \ 2578 | LE(i,6,7) LO(i,6,7) \ 2579 | LE(i,8,9) LO(i,8,9) \ 2580 | LE(i,10,11) LO(i,10,11) \ 2581 | LE(i,12,13) LO(i,12,13) \ 2582 | LE(i,14,15) LO(i,14,15) \ 2583 | 2584 | #define EVEN(b,i,k,p) input##i = sum##k##p##_##b##bit_even; 2585 | #define ODD(b,i,k,p) input##i = sum##k##p##_##b##bit_odd; 2586 | 2587 | #define UPDATE(i) \ 2588 | EVEN(i,0,0,1) EVEN(i,1,2,3) EVEN(i,2,4,5) EVEN(i,3,6,7) \ 2589 | EVEN(i,4,8,9) EVEN(i,5,10,11) EVEN(i,6,12,13) EVEN(i,7,14,15) \ 2590 | ODD(i,8,0,1) ODD(i,9,2,3) ODD(i,10,4,5) ODD(i,11,6,7) \ 2591 | ODD(i,12,8,9) ODD(i,13,10,11) ODD(i,14,12,13) ODD(i,15,14,15) \ 2592 | 2593 | #define UE(i,p,k) counters[i] = _mm512_add_epi16(counters[i], sum##p##k##_8bit_even); 2594 | #define UO(i,p,k) counters[i] = _mm512_add_epi16(counters[i], sum##p##k##_8bit_odd); 2595 | 2596 | /*------ Start --------*/ 2597 | #define L(p) __m512i input##p = _mm512_loadu_si512((__m512i*)(array + i*2048*512 + j*512 + p*32)); 2598 | size_t i = 0; 2599 | for (/**/; i < n_cycles; ++i) { 2600 | for (int j = 0; j < 2048; ++j) { 2601 | // Load 16 registers. 2602 | L(0) L(1) L(2) L(3) 2603 | L(4) L(5) L(6) L(7) 2604 | L(8) L(9) L(10) L(11) 2605 | L(12) L(13) L(14) L(15) 2606 | 2607 | // Perform updates for bits {1,2,4,8}. 2608 | LBLOCK(1) UPDATE(1) 2609 | LBLOCK(2) UPDATE(2) 2610 | LBLOCK(4) UPDATE(4) 2611 | LBLOCK(8) UPDATE(8) 2612 | 2613 | // Update accumulators. 2614 | UE( 0,0,1) UE( 1, 2, 3) UE( 2, 4, 5) UE( 3, 6, 7) 2615 | UE( 4,8,9) UE( 5,10,11) UE( 6,12,13) UE( 7,14,15) 2616 | UO( 8,0,1) UO( 9, 2, 3) UO(10, 4, 5) UO(11, 6, 7) 2617 | UO(12,8,9) UO(13,10,11) UO(14,12,13) UO(15,14,15) 2618 | } 2619 | 2620 | // Update. 2621 | for (size_t i = 0; i < 16; ++i) { 2622 | _mm512_storeu_si512((__m512i*)tmp, counters[i]); 2623 | for (int j = 0; j < 32; ++j) out[i] += tmp[j]; 2624 | } 2625 | // Reset. 2626 | for (size_t i = 0; i < 16; ++i) { 2627 | counters[i] = _mm512_setzero_si512(); 2628 | } 2629 | } 2630 | #undef L 2631 | #define L(p) __m512i input##p = _mm512_loadu_si512((__m512i*)(array + i*512 + p*32)); 2632 | i *= 2048; 2633 | for (/**/; i < n_total; ++i) { 2634 | // Load 16 registers. 2635 | L(0) L(1) L(2) L(3) 2636 | L(4) L(5) L(6) L(7) 2637 | L(8) L(9) L(10) L(11) 2638 | L(12) L(13) L(14) L(15) 2639 | 2640 | // Perform updates for bits {1,2,4,8}. 2641 | LBLOCK(1) UPDATE(1) 2642 | LBLOCK(2) UPDATE(2) 2643 | LBLOCK(4) UPDATE(4) 2644 | LBLOCK(8) UPDATE(8) 2645 | 2646 | // Update accumulators. 
2647 | UE( 0,0,1) UE( 1, 2, 3) UE( 2, 4, 5) UE( 3, 6, 7) 2648 | UE( 4,8,9) UE( 5,10,11) UE( 6,12,13) UE( 7,14,15) 2649 | UO( 8,0,1) UO( 9, 2, 3) UO(10, 4, 5) UO(11, 6, 7) 2650 | UO(12,8,9) UO(13,10,11) UO(14,12,13) UO(15,14,15) 2651 | } 2652 | 2653 | i *= 512; 2654 | for (/**/; i < len; ++i) { 2655 | for (int j = 0; j < 16; ++j) { 2656 | out[j] += ((array[i] & (1 << j)) >> j); 2657 | } 2658 | } 2659 | 2660 | #undef L 2661 | #undef UPDATE 2662 | #undef ODD 2663 | #undef EVEN 2664 | #undef LBLOCK 2665 | #undef LE 2666 | #undef LO 2667 | #undef UO 2668 | #undef UE 2669 | 2670 | for (size_t i = 0; i < 16; ++i) { 2671 | _mm512_storeu_si512((__m512i*)tmp, counters[i]); 2672 | for (int j = 0; j < 32; ++j) out[i] += tmp[j]; 2673 | } 2674 | return 0; 2675 | } 2676 | 2677 | /* 2678 | * AVX512 Harley-Seal popcount (4th iteration). 2679 | * The algorithm is based on the paper "Faster Population Counts 2680 | * using AVX2 Instructions" by Daniel Lemire, Nathan Kurz and 2681 | * Wojciech Mula (23 Nov 2016). 2682 | * @see https://arxiv.org/abs/1611.07612 2683 | */ 2684 | STORM_TARGET("avx512bw") 2685 | static 2686 | uint64_t STORM_intersect_count_csa_avx512(const __m512i* STORM_RESTRICT data1, 2687 | const __m512i* STORM_RESTRICT data2, 2688 | size_t size) 2689 | { 2690 | __m512i cnt = _mm512_setzero_si512(); 2691 | __m512i ones = _mm512_setzero_si512(); 2692 | __m512i twos = _mm512_setzero_si512(); 2693 | __m512i fours = _mm512_setzero_si512(); 2694 | __m512i eights = _mm512_setzero_si512(); 2695 | __m512i sixteens = _mm512_setzero_si512(); 2696 | __m512i twosA, twosB, foursA, foursB, eightsA, eightsB; 2697 | 2698 | uint64_t i = 0; 2699 | uint64_t limit = size - size % 16; 2700 | uint64_t* cnt64; 2701 | 2702 | #define LOAD(a) (_mm512_loadu_si512(&data1[i+a]) & _mm512_loadu_si512(&data2[i+a])) 2703 | 2704 | for (/**/; i < limit; i += 16) { 2705 | STORM_CSA512(&twosA, &ones, ones, LOAD(0), LOAD(1)); 2706 | STORM_CSA512(&twosB, &ones, ones, LOAD(2), LOAD(3)); 2707 | STORM_CSA512(&foursA, &twos, twos, twosA, twosB); 2708 | STORM_CSA512(&twosA, &ones, ones, LOAD(4), LOAD(5)); 2709 | STORM_CSA512(&twosB, &ones, ones, LOAD(6), LOAD(7)); 2710 | STORM_CSA512(&foursB, &twos, twos, twosA, twosB); 2711 | STORM_CSA512(&eightsA, &fours, fours, foursA, foursB); 2712 | STORM_CSA512(&twosA, &ones, ones, LOAD(8), LOAD(9)); 2713 | STORM_CSA512(&twosB, &ones, ones, LOAD(10), LOAD(11)); 2714 | STORM_CSA512(&foursA, &twos, twos, twosA, twosB); 2715 | STORM_CSA512(&twosA, &ones, ones, LOAD(12), LOAD(13)); 2716 | STORM_CSA512(&twosB, &ones, ones, LOAD(14), LOAD(15)); 2717 | STORM_CSA512(&foursB, &twos, twos, twosA, twosB); 2718 | STORM_CSA512(&eightsB, &fours, fours, foursA, foursB); 2719 | STORM_CSA512(&sixteens,&eights, eights,eightsA,eightsB); 2720 | 2721 | cnt = _mm512_add_epi64(cnt, STORM_popcnt512(sixteens)); 2722 | } 2723 | #undef LOAD 2724 | 2725 | cnt = _mm512_slli_epi64(cnt, 4); 2726 | cnt = _mm512_add_epi64(cnt, _mm512_slli_epi64(STORM_popcnt512(eights), 3)); 2727 | cnt = _mm512_add_epi64(cnt, _mm512_slli_epi64(STORM_popcnt512(fours), 2)); 2728 | cnt = _mm512_add_epi64(cnt, _mm512_slli_epi64(STORM_popcnt512(twos), 1)); 2729 | cnt = _mm512_add_epi64(cnt, STORM_popcnt512(ones)); 2730 | 2731 | for (/**/; i < size; ++i) 2732 | cnt = _mm512_add_epi64(cnt, STORM_popcnt512(_mm512_loadu_si512(&data1[i]) & _mm512_loadu_si512(&data2[i]))); 2733 | 2734 | cnt64 = (uint64_t*)&cnt; 2735 | 2736 | return cnt64[0] + 2737 | cnt64[1] + 2738 | cnt64[2] + 2739 | cnt64[3] + 2740 | cnt64[4] + 2741 | cnt64[5] + 2742 | cnt64[6] + 2743 | 
cnt64[7]; 2744 | } 2745 | 2746 | STORM_TARGET("avx512bw") 2747 | static 2748 | uint64_t STORM_union_count_csa_avx512(const __m512i* STORM_RESTRICT data1, 2749 | const __m512i* STORM_RESTRICT data2, 2750 | size_t size) 2751 | { 2752 | __m512i cnt = _mm512_setzero_si512(); 2753 | __m512i ones = _mm512_setzero_si512(); 2754 | __m512i twos = _mm512_setzero_si512(); 2755 | __m512i fours = _mm512_setzero_si512(); 2756 | __m512i eights = _mm512_setzero_si512(); 2757 | __m512i sixteens = _mm512_setzero_si512(); 2758 | __m512i twosA, twosB, foursA, foursB, eightsA, eightsB; 2759 | 2760 | uint64_t i = 0; 2761 | uint64_t limit = size - size % 16; 2762 | uint64_t* cnt64; 2763 | 2764 | #define LOAD(a) (_mm512_loadu_si512(&data1[i+a]) | _mm512_loadu_si512(&data2[i+a])) 2765 | 2766 | for (/**/; i < limit; i += 16) { 2767 | STORM_CSA512(&twosA, &ones, ones, LOAD(0), LOAD(1)); 2768 | STORM_CSA512(&twosB, &ones, ones, LOAD(2), LOAD(3)); 2769 | STORM_CSA512(&foursA, &twos, twos, twosA, twosB); 2770 | STORM_CSA512(&twosA, &ones, ones, LOAD(4), LOAD(5)); 2771 | STORM_CSA512(&twosB, &ones, ones, LOAD(6), LOAD(7)); 2772 | STORM_CSA512(&foursB, &twos, twos, twosA, twosB); 2773 | STORM_CSA512(&eightsA, &fours, fours, foursA, foursB); 2774 | STORM_CSA512(&twosA, &ones, ones, LOAD(8), LOAD(9)); 2775 | STORM_CSA512(&twosB, &ones, ones, LOAD(10), LOAD(11)); 2776 | STORM_CSA512(&foursA, &twos, twos, twosA, twosB); 2777 | STORM_CSA512(&twosA, &ones, ones, LOAD(12), LOAD(13)); 2778 | STORM_CSA512(&twosB, &ones, ones, LOAD(14), LOAD(15)); 2779 | STORM_CSA512(&foursB, &twos, twos, twosA, twosB); 2780 | STORM_CSA512(&eightsB, &fours, fours, foursA, foursB); 2781 | STORM_CSA512(&sixteens,&eights, eights,eightsA,eightsB); 2782 | 2783 | cnt = _mm512_add_epi64(cnt, STORM_popcnt512(sixteens)); 2784 | } 2785 | #undef LOAD 2786 | 2787 | cnt = _mm512_slli_epi64(cnt, 4); 2788 | cnt = _mm512_add_epi64(cnt, _mm512_slli_epi64(STORM_popcnt512(eights), 3)); 2789 | cnt = _mm512_add_epi64(cnt, _mm512_slli_epi64(STORM_popcnt512(fours), 2)); 2790 | cnt = _mm512_add_epi64(cnt, _mm512_slli_epi64(STORM_popcnt512(twos), 1)); 2791 | cnt = _mm512_add_epi64(cnt, STORM_popcnt512(ones)); 2792 | 2793 | for (/**/; i < size; ++i) 2794 | cnt = _mm512_add_epi64(cnt, STORM_popcnt512(_mm512_loadu_si512(&data1[i]) | _mm512_loadu_si512(&data2[i]))); 2795 | 2796 | cnt64 = (uint64_t*)&cnt; 2797 | 2798 | return cnt64[0] + 2799 | cnt64[1] + 2800 | cnt64[2] + 2801 | cnt64[3] + 2802 | cnt64[4] + 2803 | cnt64[5] + 2804 | cnt64[6] + 2805 | cnt64[7]; 2806 | } 2807 | 2808 | STORM_TARGET("avx512bw") 2809 | static 2810 | uint64_t STORM_diff_count_csa_avx512(const __m512i* STORM_RESTRICT data1, 2811 | const __m512i* STORM_RESTRICT data2, 2812 | size_t size) 2813 | { 2814 | __m512i cnt = _mm512_setzero_si512(); 2815 | __m512i ones = _mm512_setzero_si512(); 2816 | __m512i twos = _mm512_setzero_si512(); 2817 | __m512i fours = _mm512_setzero_si512(); 2818 | __m512i eights = _mm512_setzero_si512(); 2819 | __m512i sixteens = _mm512_setzero_si512(); 2820 | __m512i twosA, twosB, foursA, foursB, eightsA, eightsB; 2821 | 2822 | uint64_t i = 0; 2823 | uint64_t limit = size - size % 16; 2824 | uint64_t* cnt64; 2825 | 2826 | #define LOAD(a) (_mm512_loadu_si512(&data1[i+a]) ^ _mm512_loadu_si512(&data2[i+a])) 2827 | 2828 | for (/**/; i < limit; i += 16) { 2829 | STORM_CSA512(&twosA, &ones, ones, LOAD(0), LOAD(1)); 2830 | STORM_CSA512(&twosB, &ones, ones, LOAD(2), LOAD(3)); 2831 | STORM_CSA512(&foursA, &twos, twos, twosA, twosB); 2832 | STORM_CSA512(&twosA, &ones, ones, LOAD(4), 
LOAD(5));
2833 | STORM_CSA512(&twosB, &ones, ones, LOAD(6), LOAD(7));
2834 | STORM_CSA512(&foursB, &twos, twos, twosA, twosB);
2835 | STORM_CSA512(&eightsA, &fours, fours, foursA, foursB);
2836 | STORM_CSA512(&twosA, &ones, ones, LOAD(8), LOAD(9));
2837 | STORM_CSA512(&twosB, &ones, ones, LOAD(10), LOAD(11));
2838 | STORM_CSA512(&foursA, &twos, twos, twosA, twosB);
2839 | STORM_CSA512(&twosA, &ones, ones, LOAD(12), LOAD(13));
2840 | STORM_CSA512(&twosB, &ones, ones, LOAD(14), LOAD(15));
2841 | STORM_CSA512(&foursB, &twos, twos, twosA, twosB);
2842 | STORM_CSA512(&eightsB, &fours, fours, foursA, foursB);
2843 | STORM_CSA512(&sixteens,&eights, eights,eightsA,eightsB);
2844 | 
2845 | cnt = _mm512_add_epi64(cnt, STORM_popcnt512(sixteens));
2846 | }
2847 | #undef LOAD
2848 | 
2849 | cnt = _mm512_slli_epi64(cnt, 4);
2850 | cnt = _mm512_add_epi64(cnt, _mm512_slli_epi64(STORM_popcnt512(eights), 3));
2851 | cnt = _mm512_add_epi64(cnt, _mm512_slli_epi64(STORM_popcnt512(fours), 2));
2852 | cnt = _mm512_add_epi64(cnt, _mm512_slli_epi64(STORM_popcnt512(twos), 1));
2853 | cnt = _mm512_add_epi64(cnt, STORM_popcnt512(ones));
2854 | 
2855 | for (/**/; i < size; ++i)
2856 | cnt = _mm512_add_epi64(cnt, STORM_popcnt512(_mm512_loadu_si512(&data1[i]) ^ _mm512_loadu_si512(&data2[i])));
2857 | 
2858 | cnt64 = (uint64_t*)&cnt;
2859 | 
2860 | return cnt64[0] +
2861 | cnt64[1] +
2862 | cnt64[2] +
2863 | cnt64[3] +
2864 | cnt64[4] +
2865 | cnt64[5] +
2866 | cnt64[6] +
2867 | cnt64[7];
2868 | }
2869 | 
2870 | // Functions
2871 | // AVX512
2872 | STORM_TARGET("avx512bw")
2873 | static
2874 | uint64_t STORM_intersect_count_avx512(const uint64_t* STORM_RESTRICT b1,
2875 | const uint64_t* STORM_RESTRICT b2,
2876 | const size_t n_ints)
2877 | {
2878 | uint64_t count = 0;
2879 | const __m512i* r1 = (const __m512i*)(b1);
2880 | const __m512i* r2 = (const __m512i*)(b2);
2881 | const uint32_t n_cycles = n_ints / 8;
2882 | 
2883 | count += STORM_intersect_count_csa_avx512(r1, r2, n_cycles);
2884 | 
2885 | for (int i = n_cycles*8; i < n_ints; ++i) {
2886 | count += STORM_POPCOUNT(b1[i] & b2[i]);
2887 | }
2888 | 
2889 | return(count);
2890 | }
2891 | 
2892 | STORM_TARGET("avx512bw")
2893 | static
2894 | uint64_t STORM_union_count_avx512(const uint64_t* STORM_RESTRICT b1,
2895 | const uint64_t* STORM_RESTRICT b2,
2896 | const size_t n_ints)
2897 | {
2898 | uint64_t count = 0;
2899 | const __m512i* r1 = (const __m512i*)(b1);
2900 | const __m512i* r2 = (const __m512i*)(b2);
2901 | const uint32_t n_cycles = n_ints / 8;
2902 | 
2903 | count += STORM_union_count_csa_avx512(r1, r2, n_cycles);
2904 | 
2905 | for (int i = n_cycles*8; i < n_ints; ++i) {
2906 | count += STORM_POPCOUNT(b1[i] | b2[i]);
2907 | }
2908 | 
2909 | return(count);
2910 | }
2911 | 
2912 | STORM_TARGET("avx512bw")
2913 | static
2914 | uint64_t STORM_diff_count_avx512(const uint64_t* STORM_RESTRICT b1,
2915 | const uint64_t* STORM_RESTRICT b2,
2916 | const size_t n_ints)
2917 | {
2918 | uint64_t count = 0;
2919 | const __m512i* r1 = (const __m512i*)(b1);
2920 | const __m512i* r2 = (const __m512i*)(b2);
2921 | const uint32_t n_cycles = n_ints / 8;
2922 | 
2923 | count += STORM_diff_count_csa_avx512(r1, r2, n_cycles);
2924 | 
2925 | for (int i = n_cycles*8; i < n_ints; ++i) {
2926 | count += STORM_POPCOUNT(b1[i] ^ b2[i]);
2927 | }
2928 | 
2929 | return(count);
2930 | }
2931 | 
2932 | STORM_TARGET("avx512bw")
2933 | static
2934 | uint64_t STORM_popcnt_avx512(const uint64_t* data,
2935 | const size_t n_ints)
2936 | {
2937 | uint64_t count = 0;
2938 | const uint32_t n_cycles = n_ints / 8;
2939 | const uint32_t n_cycles_avx2 = (n_ints % 8) / 4;
2940 | const uint32_t n_cycles_sse = ((n_ints % 8) % 4) / 2;
2941 | 
2942 | const __m512i* r1 = (__m512i*)&data[0];
2943 | const __m256i* r2 = (__m256i*)&data[n_cycles*8];
2944 | const __m128i* r3 = (__m128i*)&data[n_cycles*8+n_cycles_avx2*4];
2945 | 
2946 | count += STORM_popcnt_csa_avx512bw(r1, n_cycles);
2947 | count += STORM_popcnt_csa_avx2(r2, n_cycles_avx2);
2948 | count += STORM_popcnt_csa_sse4(r3, n_cycles_sse);
2949 | 
2950 | for (int i = (8*n_cycles + 4*n_cycles_avx2 + 2*n_cycles_sse); i < n_ints; ++i) {
2951 | count += STORM_POPCOUNT(data[i]);
2952 | }
2953 | 
2954 | return count;
2955 | }
2956 | #endif
2957 | 
2958 | /****************************
2959 |  * Popcount
2960 |  ****************************/
2961 | 
2962 | STORM_FORCE_INLINE
2963 | uint64_t STORM_popcount64_unrolled(const uint64_t* data, size_t size) {
2964 | uint64_t i = 0;
2965 | uint64_t limit = size - size % 4;
2966 | uint64_t cnt = 0;
2967 | 
2968 | for (/**/; i < limit; i += 4) {
2969 | cnt += STORM_popcount64(data[i+0]);
2970 | cnt += STORM_popcount64(data[i+1]);
2971 | cnt += STORM_popcount64(data[i+2]);
2972 | cnt += STORM_popcount64(data[i+3]);
2973 | }
2974 | 
2975 | for (/**/; i < size; ++i)
2976 | cnt += STORM_popcount64(data[i]);
2977 | 
2978 | return cnt;
2979 | }
2980 | 
2981 | /****************************
2982 |  * Scalar functions
2983 |  ****************************/
2984 | 
2985 | STORM_FORCE_INLINE
2986 | uint64_t STORM_intersect_count_scalar(const uint64_t* STORM_RESTRICT b1,
2987 | const uint64_t* STORM_RESTRICT b2,
2988 | const size_t n_ints)
2989 | {
2990 | return STORM_intersect_count_unrolled(b1, b2, n_ints);
2991 | }
2992 | 
2993 | STORM_FORCE_INLINE
2994 | uint64_t STORM_union_count_scalar(const uint64_t* STORM_RESTRICT b1,
2995 | const uint64_t* STORM_RESTRICT b2,
2996 | const size_t n_ints)
2997 | {
2998 | return STORM_union_count_unrolled(b1, b2, n_ints);
2999 | }
3000 | 
3001 | STORM_FORCE_INLINE
3002 | uint64_t STORM_diff_count_scalar(const uint64_t* STORM_RESTRICT b1,
3003 | const uint64_t* STORM_RESTRICT b2,
3004 | const size_t n_ints)
3005 | {
3006 | return STORM_diff_count_unrolled(b1, b2, n_ints);
3007 | }
3008 | 
3009 | static
3010 | uint64_t STORM_intersect_count_scalar_list(const uint64_t* STORM_RESTRICT b1,
3011 | const uint64_t* STORM_RESTRICT b2,
3012 | const uint32_t* STORM_RESTRICT l1,
3013 | const uint32_t* STORM_RESTRICT l2,
3014 | const size_t n1,
3015 | const size_t n2)
3016 | {
3017 | uint64_t count = 0;
3018 | 
3019 | #define MOD(x) ((x) & 63)
3020 | if (n1 < n2) {
3021 | for (int i = 0; i < n1; ++i)
3022 | count += ((b2[l1[i] >> 6] & (1ULL << MOD(l1[i]))) != 0);
3023 | } else {
3024 | for (int i = 0; i < n2; ++i)
3025 | count += ((b1[l2[i] >> 6] & (1ULL << MOD(l2[i]))) != 0);
3026 | }
3027 | #undef MOD
3028 | return(count);
3029 | }
3030 | 
3031 | 
3032 | /* *************************************
3033 |  * Function pointer definitions.
3034 |  ***************************************/
3035 | typedef uint64_t (*STORM_compute_func)(const uint64_t*, const uint64_t*, const size_t);
3036 | typedef int (STORM_pposcnt_func)(const uint16_t*, size_t, uint32_t*);
3037 | typedef uint64_t (STORM_popcnt_func)(const uint8_t*, size_t);
3038 | 
3039 | /* *************************************
3040 |  * Alignment
3041 |  ***************************************/
3042 | // Return the best alignment given the available instruction set at
3043 | // run-time.
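// Example (an illustrative sketch; assumes a C11 libc providing aligned_alloc
// and a byte count already rounded up to a multiple of the returned alignment):
//
//     uint32_t  align  = STORM_get_alignment();
//     uint64_t* bitmap = (uint64_t*)aligned_alloc(align, n_bytes);
//
// Buffers allocated this way let the widest available kernels run their hot
// loops without straddling vector-width boundaries.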
3044 | static 3045 | uint32_t STORM_get_alignment() { 3046 | 3047 | #if defined(STORM_HAVE_CPUID) 3048 | #if defined(__cplusplus) 3049 | /* C++11 thread-safe singleton */ 3050 | static const int cpuid = STORM_get_cpuid(); 3051 | #else 3052 | static int cpuid_ = -1; 3053 | int cpuid = cpuid_; 3054 | if (cpuid == -1) { 3055 | cpuid = STORM_get_cpuid(); 3056 | 3057 | #if defined(_MSC_VER) 3058 | _InterlockedCompareExchange(&cpuid_, cpuid, -1); 3059 | #else 3060 | __sync_val_compare_and_swap(&cpuid_, -1, cpuid); 3061 | #endif 3062 | } 3063 | #endif 3064 | #endif 3065 | 3066 | uint32_t alignment = 0; 3067 | #if defined(STORM_HAVE_AVX512) 3068 | if ((cpuid & STORM_CPUID_runtime_bit_AVX512BW)) { // 16*512 3069 | alignment = STORM_AVX512_ALIGNMENT; 3070 | } 3071 | #endif 3072 | 3073 | #if defined(STORM_HAVE_AVX2) 3074 | if ((cpuid & STORM_CPUID_runtime_bit_AVX2) && alignment == 0) { // 16*256 3075 | alignment = STORM_AVX2_ALIGNMENT; 3076 | } 3077 | #endif 3078 | 3079 | #if defined(STORM_HAVE_SSE42) 3080 | if ((cpuid & STORM_CPUID_runtime_bit_SSE41) && alignment == 0) { // 16*128 3081 | alignment = STORM_SSE_ALIGNMENT; 3082 | } 3083 | #endif 3084 | 3085 | if (alignment == 0) alignment = 8; 3086 | return alignment; 3087 | } 3088 | 3089 | /* ************************************* 3090 | * Set algebra functions 3091 | ***************************************/ 3092 | // Return the optimal intersection function given the range [0, n_bitmaps_vector) 3093 | // and the available instruction set at run-time. 3094 | static 3095 | STORM_compute_func STORM_get_intersect_count_func(const size_t n_bitmaps_vector) { 3096 | 3097 | #if defined(STORM_HAVE_CPUID) 3098 | #if defined(__cplusplus) 3099 | /* C++11 thread-safe singleton */ 3100 | static const int cpuid = STORM_get_cpuid(); 3101 | #else 3102 | static int cpuid_ = -1; 3103 | int cpuid = cpuid_; 3104 | if (cpuid == -1) { 3105 | cpuid = STORM_get_cpuid(); 3106 | 3107 | #if defined(_MSC_VER) 3108 | _InterlockedCompareExchange(&cpuid_, cpuid, -1); 3109 | #else 3110 | __sync_val_compare_and_swap(&cpuid_, -1, cpuid); 3111 | #endif 3112 | } 3113 | #endif 3114 | #endif 3115 | 3116 | 3117 | #if defined(STORM_HAVE_AVX512) 3118 | if ((cpuid & STORM_CPUID_runtime_bit_AVX512BW) && n_bitmaps_vector >= 128) { // 16*512 3119 | return &STORM_intersect_count_avx512; 3120 | } 3121 | #endif 3122 | 3123 | #if defined(STORM_HAVE_AVX2) 3124 | if ((cpuid & STORM_CPUID_runtime_bit_AVX2) && n_bitmaps_vector >= 64) { // 16*256 3125 | return &STORM_intersect_count_avx2; 3126 | } 3127 | 3128 | if ((cpuid & STORM_CPUID_runtime_bit_AVX2) && n_bitmaps_vector >= 4) { 3129 | return &STORM_intersect_count_lookup_avx2; 3130 | } 3131 | #endif 3132 | 3133 | #if defined(STORM_HAVE_SSE42) 3134 | if ((cpuid & STORM_CPUID_runtime_bit_SSE41) && n_bitmaps_vector >= 32) { // 16*128 3135 | return &STORM_intersect_count_sse4; 3136 | } 3137 | #endif 3138 | 3139 | return &STORM_intersect_count_scalar; 3140 | } 3141 | 3142 | static 3143 | STORM_compute_func STORM_get_union_count_func(const size_t n_bitmaps_vector) { 3144 | 3145 | #if defined(STORM_HAVE_CPUID) 3146 | #if defined(__cplusplus) 3147 | /* C++11 thread-safe singleton */ 3148 | static const int cpuid = STORM_get_cpuid(); 3149 | #else 3150 | static int cpuid_ = -1; 3151 | int cpuid = cpuid_; 3152 | if (cpuid == -1) { 3153 | cpuid = STORM_get_cpuid(); 3154 | 3155 | #if defined(_MSC_VER) 3156 | _InterlockedCompareExchange(&cpuid_, cpuid, -1); 3157 | #else 3158 | __sync_val_compare_and_swap(&cpuid_, -1, cpuid); 3159 | #endif 3160 | } 3161 | #endif 3162 
| #endif
3163 | 
3164 | 
3165 | #if defined(STORM_HAVE_AVX512)
3166 | if ((cpuid & STORM_CPUID_runtime_bit_AVX512BW) && n_bitmaps_vector >= 128) { // 16*512
3167 | return &STORM_union_count_avx512;
3168 | }
3169 | #endif
3170 | 
3171 | #if defined(STORM_HAVE_AVX2)
3172 | if ((cpuid & STORM_CPUID_runtime_bit_AVX2) && n_bitmaps_vector >= 64) { // 16*256
3173 | return &STORM_union_count_avx2;
3174 | }
3175 | 
3176 | if ((cpuid & STORM_CPUID_runtime_bit_AVX2) && n_bitmaps_vector >= 4) {
3177 | return &STORM_union_count_lookup_avx2;
3178 | }
3179 | #endif
3180 | 
3181 | #if defined(STORM_HAVE_SSE42)
3182 | if ((cpuid & STORM_CPUID_runtime_bit_SSE41) && n_bitmaps_vector >= 32) { // 16*128
3183 | return &STORM_union_count_sse4;
3184 | }
3185 | #endif
3186 | 
3187 | return &STORM_union_count_scalar;
3188 | }
3189 | 
3190 | static
3191 | STORM_compute_func STORM_get_diff_count_func(const size_t n_bitmaps_vector) {
3192 | 
3193 | #if defined(STORM_HAVE_CPUID)
3194 | #if defined(__cplusplus)
3195 | /* C++11 thread-safe singleton */
3196 | static const int cpuid = STORM_get_cpuid();
3197 | #else
3198 | static int cpuid_ = -1;
3199 | int cpuid = cpuid_;
3200 | if (cpuid == -1) {
3201 | cpuid = STORM_get_cpuid();
3202 | 
3203 | #if defined(_MSC_VER)
3204 | _InterlockedCompareExchange(&cpuid_, cpuid, -1);
3205 | #else
3206 | __sync_val_compare_and_swap(&cpuid_, -1, cpuid);
3207 | #endif
3208 | }
3209 | #endif
3210 | #endif
3211 | 
3212 | 
3213 | #if defined(STORM_HAVE_AVX512)
3214 | if ((cpuid & STORM_CPUID_runtime_bit_AVX512BW) && n_bitmaps_vector >= 128) { // 16*512
3215 | return &STORM_diff_count_avx512;
3216 | }
3217 | #endif
3218 | 
3219 | #if defined(STORM_HAVE_AVX2)
3220 | if ((cpuid & STORM_CPUID_runtime_bit_AVX2) && n_bitmaps_vector >= 64) { // 16*256
3221 | return &STORM_diff_count_avx2;
3222 | }
3223 | 
3224 | if ((cpuid & STORM_CPUID_runtime_bit_AVX2) && n_bitmaps_vector >= 4) {
3225 | return &STORM_diff_count_lookup_avx2;
3226 | }
3227 | #endif
3228 | 
3229 | #if defined(STORM_HAVE_SSE42)
3230 | if ((cpuid & STORM_CPUID_runtime_bit_SSE41) && n_bitmaps_vector >= 32) { // 16*128
3231 | return &STORM_diff_count_sse4;
3232 | }
3233 | #endif
3234 | 
3235 | return &STORM_diff_count_scalar;
3236 | }
3237 | 
3238 | // Compute the intersection count directly, selecting the fastest kernel for
3239 | // the given input length and the instruction set detected at run-time (the
3240 | // selection is re-evaluated on every call).
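// When many equally-sized bitmap pairs are processed, resolving the kernel
// once and reusing the pointer avoids repeating these checks per call.
// Illustrative sketch (caller-side code, not part of this header):
//
//     STORM_compute_func isect = STORM_get_intersect_count_func(n_ints);
//     uint64_t total = 0;
//     for (size_t p = 0; p < n_pairs; ++p)
//         total += isect(bitmaps1[p], bitmaps2[p], n_ints);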
3241 | static 3242 | uint64_t STORM_intersect_count(const uint64_t* STORM_RESTRICT data1, 3243 | const uint64_t* STORM_RESTRICT data2, 3244 | const size_t n_len) 3245 | { 3246 | 3247 | #if defined(STORM_HAVE_CPUID) 3248 | #if defined(__cplusplus) 3249 | /* C++11 thread-safe singleton */ 3250 | static const int cpuid = STORM_get_cpuid(); 3251 | #else 3252 | static int cpuid_ = -1; 3253 | int cpuid = cpuid_; 3254 | if (cpuid == -1) { 3255 | cpuid = STORM_get_cpuid(); 3256 | 3257 | #if defined(_MSC_VER) 3258 | _InterlockedCompareExchange(&cpuid_, cpuid, -1); 3259 | #else 3260 | __sync_val_compare_and_swap(&cpuid_, -1, cpuid); 3261 | #endif 3262 | } 3263 | #endif 3264 | #endif 3265 | 3266 | 3267 | #if defined(STORM_HAVE_AVX512) 3268 | if ((cpuid & STORM_CPUID_runtime_bit_AVX512BW) && n_len >= 128) { // 16*512 3269 | return STORM_intersect_count_avx512(data1, data2, n_len); 3270 | } 3271 | #endif 3272 | 3273 | #if defined(STORM_HAVE_AVX2) 3274 | if ((cpuid & STORM_CPUID_runtime_bit_AVX2) && n_len >= 64) { // 16*256 3275 | return STORM_intersect_count_avx2(data1, data2, n_len); 3276 | } 3277 | 3278 | if ((cpuid & STORM_CPUID_runtime_bit_AVX2) && n_len >= 4) { 3279 | return STORM_intersect_count_lookup_avx2(data1, data2, n_len); 3280 | } 3281 | #endif 3282 | 3283 | #if defined(STORM_HAVE_SSE42) 3284 | if ((cpuid & STORM_CPUID_runtime_bit_SSE41) && n_len >= 32) { // 16*128 3285 | return STORM_intersect_count_sse4(data1, data2, n_len); 3286 | } 3287 | #endif 3288 | 3289 | return STORM_intersect_count_scalar(data1, data2, n_len); 3290 | } 3291 | 3292 | static 3293 | uint64_t STORM_union_count(const uint64_t* STORM_RESTRICT data1, 3294 | const uint64_t* STORM_RESTRICT data2, 3295 | const size_t n_len) 3296 | { 3297 | 3298 | #if defined(STORM_HAVE_CPUID) 3299 | #if defined(__cplusplus) 3300 | /* C++11 thread-safe singleton */ 3301 | static const int cpuid = STORM_get_cpuid(); 3302 | #else 3303 | static int cpuid_ = -1; 3304 | int cpuid = cpuid_; 3305 | if (cpuid == -1) { 3306 | cpuid = STORM_get_cpuid(); 3307 | 3308 | #if defined(_MSC_VER) 3309 | _InterlockedCompareExchange(&cpuid_, cpuid, -1); 3310 | #else 3311 | __sync_val_compare_and_swap(&cpuid_, -1, cpuid); 3312 | #endif 3313 | } 3314 | #endif 3315 | #endif 3316 | 3317 | 3318 | #if defined(STORM_HAVE_AVX512) 3319 | if ((cpuid & STORM_CPUID_runtime_bit_AVX512BW) && n_len >= 128) { // 16*512 3320 | return STORM_union_count_avx512(data1, data2, n_len); 3321 | } 3322 | #endif 3323 | 3324 | #if defined(STORM_HAVE_AVX2) 3325 | if ((cpuid & STORM_CPUID_runtime_bit_AVX2) && n_len >= 64) { // 16*256 3326 | return STORM_union_count_avx2(data1, data2, n_len); 3327 | } 3328 | 3329 | if ((cpuid & STORM_CPUID_runtime_bit_AVX2) && n_len >= 4) { 3330 | return STORM_union_count_lookup_avx2(data1, data2, n_len); 3331 | } 3332 | #endif 3333 | 3334 | #if defined(STORM_HAVE_SSE42) 3335 | if ((cpuid & STORM_CPUID_runtime_bit_SSE41) && n_len >= 32) { // 16*128 3336 | return STORM_union_count_sse4(data1, data2, n_len); 3337 | } 3338 | #endif 3339 | 3340 | return STORM_union_count_scalar(data1, data2, n_len); 3341 | } 3342 | 3343 | static 3344 | uint64_t STORM_diff_count(const uint64_t* STORM_RESTRICT data1, 3345 | const uint64_t* STORM_RESTRICT data2, 3346 | const size_t n_len) 3347 | { 3348 | 3349 | #if defined(STORM_HAVE_CPUID) 3350 | #if defined(__cplusplus) 3351 | /* C++11 thread-safe singleton */ 3352 | static const int cpuid = STORM_get_cpuid(); 3353 | #else 3354 | static int cpuid_ = -1; 3355 | int cpuid = cpuid_; 3356 | if (cpuid == -1) { 3357 | cpuid = 
STORM_get_cpuid();
3358 | 
3359 | #if defined(_MSC_VER)
3360 | _InterlockedCompareExchange(&cpuid_, cpuid, -1);
3361 | #else
3362 | __sync_val_compare_and_swap(&cpuid_, -1, cpuid);
3363 | #endif
3364 | }
3365 | #endif
3366 | #endif
3367 | 
3368 | 
3369 | #if defined(STORM_HAVE_AVX512)
3370 | if ((cpuid & STORM_CPUID_runtime_bit_AVX512BW) && n_len >= 128) { // 16*512
3371 | return STORM_diff_count_avx512(data1, data2, n_len);
3372 | }
3373 | #endif
3374 | 
3375 | #if defined(STORM_HAVE_AVX2)
3376 | if ((cpuid & STORM_CPUID_runtime_bit_AVX2) && n_len >= 64) { // 16*256
3377 | return STORM_diff_count_avx2(data1, data2, n_len);
3378 | }
3379 | 
3380 | if ((cpuid & STORM_CPUID_runtime_bit_AVX2) && n_len >= 4) {
3381 | return STORM_diff_count_lookup_avx2(data1, data2, n_len);
3382 | }
3383 | #endif
3384 | 
3385 | #if defined(STORM_HAVE_SSE42)
3386 | if ((cpuid & STORM_CPUID_runtime_bit_SSE41) && n_len >= 32) { // 16*128
3387 | return STORM_diff_count_sse4(data1, data2, n_len);
3388 | }
3389 | #endif
3390 | 
3391 | return STORM_diff_count_scalar(data1, data2, n_len);
3392 | }
3393 | 
3394 | /* *************************************
3395 |  * POPCNT and POSPOPCNT functions.
3396 |  ***************************************/
3397 | static
3398 | uint64_t STORM_popcnt(const uint8_t* data, size_t size) {
3399 | uint64_t cnt = 0;
3400 | uint64_t i;
3401 | // size /= 8;
3402 | 
3403 | #if defined(STORM_HAVE_CPUID)
3404 | #if defined(__cplusplus)
3405 | /* C++11 thread-safe singleton */
3406 | static const int cpuid = STORM_get_cpuid();
3407 | #else
3408 | static int cpuid_ = -1;
3409 | int cpuid = cpuid_;
3410 | if (cpuid == -1) {
3411 | cpuid = STORM_get_cpuid();
3412 | 
3413 | #if defined(_MSC_VER)
3414 | _InterlockedCompareExchange(&cpuid_, cpuid, -1);
3415 | #else
3416 | __sync_val_compare_and_swap(&cpuid_, -1, cpuid);
3417 | #endif
3418 | }
3419 | #endif
3420 | #endif
3421 | 
3422 | #if defined(STORM_HAVE_AVX512)
3423 | 
3424 | /* AVX512 requires arrays >= 1024 bytes */
3425 | if ((cpuid & STORM_CPUID_runtime_bit_AVX512BW) &&
3426 | size >= 1024)
3427 | {
3428 | // cnt += STORM_popcnt_avx512((const __m512i*)data, size / 64);
3429 | // data += size - size % 64;
3430 | // size = size % 64;
3431 | cnt += STORM_popcnt_avx512((uint64_t*)data, size/8);
3432 | data += size - size % 8;
3433 | size = size % 8;
3434 | }
3435 | 
3436 | #endif
3437 | 
3438 | #if defined(STORM_HAVE_AVX2)
3439 | 
3440 | /* AVX2 requires arrays >= 512 bytes */
3441 | if ((cpuid & STORM_CPUID_runtime_bit_AVX2) &&
3442 | size >= 512)
3443 | {
3444 | cnt += STORM_popcnt_avx2((uint64_t*)data, size/8);
3445 | data += size - size % 8;
3446 | size = size % 8;
3447 | // data += size - size % 32;
3448 | // size = size % 32;
3449 | }
3450 | 
3451 | #endif
3452 | 
3453 | #if defined(STORM_HAVE_SSE42)
3454 | 
3455 | /* SSE4.2 requires arrays >= 256 bytes */
3456 | if ((cpuid & STORM_CPUID_runtime_bit_SSE42) &&
3457 | size >= 256)
3458 | {
3459 | cnt += STORM_popcnt_sse4((uint64_t*)data, size/8);
3460 | data += size - size % 8;
3461 | size = size % 8;
3462 | // data += size - size % 32;
3463 | // size = size % 32;
3464 | }
3465 | 
3466 | #endif
3467 | 
3468 | #if defined(STORM_HAVE_POPCNT)
3469 | 
3470 | if (cpuid & STORM_CPUID_runtime_bit_POPCNT) {
3471 | cnt += STORM_popcount64_unrolled((const uint64_t*)data, size / 8);
3472 | data += size - size % 8;
3473 | size = size % 8;
3474 | for (i = 0; i < size; ++i)
3475 | cnt += STORM_popcount64(data[i]);
3476 | 
3477 | return cnt;
3478 | }
3479 | 
3480 | #endif
3481 | 
3482 | /* pure integer popcount algorithm */
3483 | if (size >= 8) {
3484 | cnt += STORM_popcount64_unrolled((const uint64_t*)data, size / 8);
3485 | data += size - size % 8;
3486 | size = size % 8;
3487 | }
3488 | 
3489 | /* residual bytes */
3490 | for (i = 0; i < size; ++i)
3491 | cnt += STORM_popcount64(data[i]);
3492 | 
3493 | return cnt;
3494 | }
3495 | 
3496 | static
3497 | int STORM_pospopcnt_u16(const uint16_t* data, size_t len, uint32_t* out) {
3498 | memset(out, 0, sizeof(uint32_t)*16);
3499 | 
3500 | #if defined(STORM_HAVE_CPUID)
3501 | #if defined(__cplusplus)
3502 | /* C++11 thread-safe singleton */
3503 | static const int cpuid = STORM_get_cpuid();
3504 | #else
3505 | static int cpuid_ = -1;
3506 | int cpuid = cpuid_;
3507 | if (cpuid == -1) {
3508 | cpuid = STORM_get_cpuid();
3509 | 
3510 | #if defined(_MSC_VER)
3511 | _InterlockedCompareExchange(&cpuid_, cpuid, -1);
3512 | #else
3513 | __sync_val_compare_and_swap(&cpuid_, -1, cpuid);
3514 | #endif
3515 | }
3516 | #endif
3517 | #endif
3518 | 
3519 | #if defined(STORM_HAVE_AVX512)
3520 | if ((cpuid & STORM_CPUID_runtime_bit_AVX512BW))
3521 | {
3522 | if (len < 32) return(STORM_pospopcnt_u16_sse_sad(data, len, out)); // small
3523 | else if (len < 256) return(STORM_pospopcnt_u16_sse_blend_popcnt_unroll8(data, len, out)); // small
3524 | else if (len < 512) return(STORM_pospopcnt_u16_avx512bw_blend_popcnt_unroll8(data, len, out)); // medium
3525 | else if (len < 4096) return(STORM_pospopcnt_u16_avx512bw_adder_forest(data, len, out)); // medium
3526 | else return(STORM_pospopcnt_u16_avx512bw_harvey_seal(data, len, out)); // large
3527 | }
3528 | #endif
3529 | 
3530 | #if defined(STORM_HAVE_AVX2)
3531 | if ((cpuid & STORM_CPUID_runtime_bit_AVX2))
3532 | {
3533 | if (len < 128) return(STORM_pospopcnt_u16_sse_sad(data, len, out)); // small
3534 | else if (len < 1024) return(STORM_pospopcnt_u16_avx2_blend_popcnt_unroll8(data, len, out)); // medium
3535 | else return(STORM_pospopcnt_u16_avx2_harvey_seal(data, len, out)); // large
3536 | }
3537 | #endif
3538 | 
3539 | #if defined(STORM_HAVE_SSE42)
3540 | if ((cpuid & STORM_CPUID_runtime_bit_SSE42))
3541 | {
3542 | return(STORM_pospopcnt_u16_sse_harvey_seal(data, len, out));
3543 | }
3544 | #endif
3545 | 
3546 | #ifndef _MSC_VER
3547 | return(STORM_pospopcnt_u16_scalar_umul128_unroll2(data, len, out)); // fallback scalar
3548 | #else
3549 | return(STORM_pospopcnt_u16_scalar_naive(data, len, out));
3550 | #endif
3551 | }
3552 | 
3553 | #ifdef __cplusplus
3554 | } /* extern "C" */
3555 | #endif
3556 | 
3557 | #endif /* LIBALGEBRA_H_8723467365934 */
3558 | --------------------------------------------------------------------------------
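For orientation, a minimal caller of the public entry points above (an illustrative sketch, not a file from this repository; it assumes libalgebra.h is on the include path):

#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include "libalgebra.h"

int main(void) {
    /* Two 8192-bit bitmaps (128 x 64-bit words). */
    uint64_t a[128], b[128];
    memset(a, 0xAA, sizeof(a));
    memset(b, 0xCC, sizeof(b));

    /* Fused set-algebra cardinalities; dispatch picks the best kernel. */
    printf("intersect=%llu union=%llu diff=%llu\n",
           (unsigned long long)STORM_intersect_count(a, b, 128),
           (unsigned long long)STORM_union_count(a, b, 128),
           (unsigned long long)STORM_diff_count(a, b, 128));

    /* Positional popcount: flags[j] = number of values with bit j set. */
    uint16_t v[1024];
    for (int k = 0; k < 1024; ++k) v[k] = (uint16_t)k;
    uint32_t flags[16];
    STORM_pospopcnt_u16(v, 1024, flags);
    printf("bit 0 is set in %u of 1024 values\n", flags[0]);
    return 0;
}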