├── .github ├── FUNDING.yml └── workflows │ └── ci.yml ├── test ├── CMakeLists.txt ├── test2.c └── test1.cpp ├── CMakeLists.txt ├── .gitignore ├── LICENSE ├── ChangeLog ├── benchmark.cpp ├── README.md └── libpopcnt.h /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | github: kimwalisch 2 | -------------------------------------------------------------------------------- /test/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | file(GLOB files "*.cpp" "*.c") 2 | foreach(file ${files}) 3 | get_filename_component(binary_name ${file} NAME_WE) 4 | add_executable(${binary_name} ${file}) 5 | add_test(NAME ${binary_name} COMMAND ${binary_name}) 6 | endforeach() 7 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.4...3.19) 2 | project(libpopcnt C CXX) 3 | set(CMAKE_BUILD_TYPE Release) 4 | include_directories(.) 
5 | 6 | add_executable(benchmark benchmark.cpp) 7 | enable_testing() 8 | add_subdirectory(test) 9 | 10 | install(FILES libpopcnt.h DESTINATION ${CMAKE_INSTALL_PREFIX}/include) 11 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .* 2 | *~ 3 | !/.github 4 | !/.gitignore 5 | !/.travis.yml 6 | *.exe 7 | *.tar.gz 8 | *.zip 9 | *.obj 10 | *.lib 11 | *.o 12 | *.pc 13 | *.dSYM 14 | CMakeCache.txt 15 | CMakeFiles 16 | CMakeScripts 17 | Makefile 18 | cmake_install.cmake 19 | install_manifest.txt 20 | CTestTestfile.cmake 21 | /benchmark 22 | /test/test1 23 | /test/test2 24 | build 25 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 2-Clause License 2 | 3 | Copyright (c) 2016 - 2020, Kim Walisch 4 | Copyright (c) 2016 - 2019, Wojciech Muła 5 | 6 | All rights reserved. 7 | 8 | Redistribution and use in source and binary forms, with or without 9 | modification, are permitted provided that the following conditions are met: 10 | 11 | 1. Redistributions of source code must retain the above copyright notice, this 12 | list of conditions and the following disclaimer. 13 | 2. Redistributions in binary form must reproduce the above copyright notice, 14 | this list of conditions and the following disclaimer in the documentation 15 | and/or other materials provided with the distribution. 16 | 17 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 18 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 19 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 20 | DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 21 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 22 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 23 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 24 | ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 25 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 26 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 27 | -------------------------------------------------------------------------------- /test/test2.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Simple C test program for libpopcnt.h. 3 | * Generates an array with random data and computes the bit population 4 | * count using 2 different algorithms and checks that the 5 | * results match. 6 | * 7 | * Usage: ./test2 8 | * 9 | * Copyright (C) 2017 Kim Walisch, 10 | * 11 | * This file is distributed under the BSD License. See the LICENSE 12 | * file in the top level directory. 13 | */ 14 | 15 | #include 16 | 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | 23 | /* 24 | * Count 1 bits from &data[i] till &data[size]. 
25 | * @data: An array for testing 26 | * @size: size of data 27 | * @i: Array start index 28 | */ 29 | void test(uint8_t* data, 30 | size_t size, 31 | size_t i) 32 | { 33 | uint64_t bits = popcnt(&data[i], size - i); 34 | uint64_t bits_verify = 0; 35 | 36 | for (; i < size; i++) 37 | bits_verify += popcnt64_bitwise(data[i]); 38 | 39 | if (bits != bits_verify) 40 | { 41 | printf("\nlibpopcnt test failed!\n"); 42 | free(data); 43 | exit(1); 44 | } 45 | } 46 | 47 | int main(void) 48 | { 49 | size_t i; 50 | size_t size = 70000; 51 | 52 | uint8_t* data = (uint8_t*) malloc(size); 53 | 54 | if (!data) 55 | { 56 | printf("Failed to allocate memory!\n"); 57 | exit(1); 58 | } 59 | 60 | /* init array with only 1 bits */ 61 | memset(data, 0xff, size); 62 | test(data, size, 0); 63 | 64 | srand((unsigned) time(0)); 65 | 66 | /* generate array with random data */ 67 | for (i = 0; i < size; i++) 68 | data[i] = (uint8_t) rand(); 69 | 70 | for (i = 0; i < size; i++) 71 | test(data, size, i); 72 | 73 | free(data); 74 | 75 | printf("\rStatus: 100%%\n"); 76 | printf("libpopcnt tested successfully!\n"); 77 | 78 | return 0; 79 | } 80 | -------------------------------------------------------------------------------- /test/test1.cpp: -------------------------------------------------------------------------------- 1 | /// 2 | /// @file test1.cpp 3 | /// @brief Simple C++ test program for libpopcnt.h. 4 | /// Generates an array with random data and computes the bit 5 | /// population count using 2 different algorithms and checks 6 | /// that the results match. 7 | /// 8 | /// Usage: ./test1 [array bytes] 9 | /// 10 | /// Copyright (C) 2017 Kim Walisch, 11 | /// 12 | /// This file is distributed under the BSD License. See the LICENSE 13 | /// file in the top level directory. 14 | /// 15 | 16 | #include 17 | 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | 24 | using namespace std; 25 | 26 | /// Count 1 bits from &data[i] till &data[size]. 
27 | /// @data: An array for testing 28 | /// @i: Array start index 29 | /// 30 | void test(vector& data, size_t i) 31 | { 32 | size_t size = data.size(); 33 | 34 | uint64_t bits = popcnt(&data[i], size - i); 35 | uint64_t bits_verify = 0; 36 | 37 | for (; i < size; i++) 38 | bits_verify += popcnt64_bitwise(data[i]); 39 | 40 | if (bits != bits_verify) 41 | { 42 | cerr << endl; 43 | cerr << "libpopcnt test failed!" << endl; 44 | exit(1); 45 | } 46 | } 47 | 48 | int main(int argc, char* argv[]) 49 | { 50 | size_t size = 100000; 51 | 52 | if (argc > 1) 53 | size = atoi(argv[1]); 54 | 55 | // init array with only 1 bits 56 | vector data(size, 0xff); 57 | 58 | if (!data.empty()) 59 | test(data, 0); 60 | 61 | srand((unsigned) time(0)); 62 | 63 | // generate array with random data 64 | for (size_t i = 0; i < size; i++) 65 | data[i] = (uint8_t) rand(); 66 | 67 | for (size_t i = 0; i < size; i++) 68 | { 69 | test(data, i); 70 | double percent = (100.0 * i) / size; 71 | cout << "\rStatus: " << (int) percent << "%" << flush; 72 | } 73 | 74 | cout << "\rStatus: 100%" << endl; 75 | cout << "libpopcnt tested successfully!" << endl; 76 | 77 | return 0; 78 | } 79 | -------------------------------------------------------------------------------- /ChangeLog: -------------------------------------------------------------------------------- 1 | 2024-06-29 Kim Walisch 2 | 3 | Version 3.1 4 | 5 | * Improve AVX512 algorithm for trailing 64 bytes. 6 | * AVX512 algorithm does not require AVX512-BITALG extension anymore. 7 | 8 | 2024-06-27 Kim Walisch 9 | 10 | Version 3.0 11 | 12 | * Add ARM SVE algorithm. 13 | * Replace AVX512BW algorithm by faster AVX512 VPOPCNTDQ algorithm. 14 | * Add MSVC support for ARM NEON. 15 | * Improve preprocessor checks using __has_include() macro. 16 | * Port tests from AppVeyor to GitHub actions. 17 | * Get rid of unaligned uint64_t memory acceses, this fixes 18 | test failures when using GCC compiler sanitizers. 
19 | * Prefix all libpopcnt macros using LIBPOPCNT_ to avoid any naming collisions. 20 | 21 | 2020-08-10 Kim Walisch 22 | 23 | * Use unaligned memory accesses to improve performance. 24 | Aligning memory causes branch mispredictions which 25 | can significantly deteriorate performance especially 26 | for small array sizes. 27 | * On x86/x64 runtime checks are now removed if the user 28 | compiles his code with e.g. -mpopcnt, -mavx2, 29 | -march=native, ... 30 | 31 | 2020-07-19 Kim Walisch 32 | 33 | * Enable AVX2, AVX512 by default for MSVC. 34 | Thanks to KOConchobhair for the pull request #15. 35 | 36 | 2019-12-31 Kim Walisch 37 | 38 | Version 2.3 released. 39 | 40 | * Up to 10% speedup on ARM NEON. 41 | * Fix unaligned memory access on ARM. 42 | 43 | 2018-01-13 Kim Walisch 44 | 45 | Version 2.2 released. 46 | 47 | * Up to 6x faster on old x86 CPUs without POPCNT. 48 | 49 | 2017-11-10 Kim Walisch 50 | 51 | Version 2.1 released. 52 | 53 | * Up to 20% ARM NEON speedup. 54 | * test/test1.cpp: Add only 1 bits test case. 55 | * test/test2.c: Add only 1 bits test case. 56 | 57 | 2017-10-27 Kim Walisch 58 | 59 | Version 2.0 released. 60 | 61 | * Add AVX512 support. 62 | * Support AVX2 and AVX512 for the MSVC compiler. 63 | * Support CPUID for MSVC and the C programming language. 64 | 65 | 2017-09-09 Kim Walisch 66 | 67 | Version 1.9 released. 68 | 69 | * libpopcnt.h: Fix MSVC POPCNT detection. 70 | 71 | 2017-04-24 Kim Walisch 72 | 73 | Version 1.8 released. 74 | 75 | * Fixed "illegal instruction" runtime crash with GCC 4.4.7. 76 | * benchmark.cpp: Print statistics and algorithm name. 77 | * Reduce CMake minimum version to 2.8. 78 | 79 | 2017-04-08 Kim Walisch 80 | 81 | Version 1.7 released. 82 | 83 | * Refactor ARM NEON popcount algorithm. 84 | 85 | 2017-04-04 Kim Walisch 86 | 87 | Version 1.6 released. 88 | 89 | * Add ARM NEON popcount algorithm. 90 | * Fix x86 segmentation fault for CPUs without POPCNT. 91 | 92 | 2017-04-02 Kim Walisch 93 | 94 | Version 1.5 released. 
95 | 96 | * libpopcnt.h now supports C and C++! 97 | * Use CMake build system. 98 | * test2.c: Add C test. 99 | * Fix clang-cl bug on Windows. 100 | 101 | 2017-04-01 Kim Walisch 102 | 103 | Version 1.4 released. 104 | 105 | * libpopcnt.h: Fix compiler warning. 106 | 107 | 2017-03-30 Kim Walisch 108 | 109 | Version 1.3 released. 110 | 111 | libpopcnt.h is now C++ only (previously C/C++), the reason being 112 | that the cpuid check cannot be made thread-safe using plain C, 113 | whereas in C++ it's trivial (Meyers Singleton). 114 | 115 | * Add benchmark.cpp. 116 | * Use AVX2 for arrays >= 512 bytes (previously 1024). 117 | * Improve Makefile. 118 | 119 | 2017-03-26 Kim Walisch 120 | 121 | Version 1.2 released. 122 | 123 | * Add cpuid check for x86 CPUs. 124 | * Compiles without -mpopcnt, -mavx2 flags. 125 | * Successfully tested on IBM POWER8 (generates popcntd, GCC 5.4). 126 | * Successfully tested using clang-cl (Windows). 127 | * Add ChangeLog. 128 | * Update README.md. 129 | -------------------------------------------------------------------------------- /benchmark.cpp: -------------------------------------------------------------------------------- 1 | /// 2 | /// @file benchmark.cpp 3 | /// @brief Simple benchmark program for libpopcnt.h, repeatedly 4 | /// counts the 1 bits inside a vector. 5 | /// 6 | /// Usage: ./benchmark [array bytes] [iters] 7 | /// 8 | /// Copyright (C) 2019 Kim Walisch, 9 | /// 10 | /// This file is distributed under the BSD License. See the LICENSE 11 | /// file in the top level directory.
12 | /// 13 | 14 | #include 15 | 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | 24 | double get_seconds() 25 | { 26 | return (double) std::clock() / CLOCKS_PER_SEC; 27 | } 28 | 29 | // init vector with random data 30 | void init(std::vector& vect) 31 | { 32 | std::srand((unsigned) std::time(0)); 33 | 34 | for (size_t i = 0; i < vect.size(); i++) 35 | vect[i] = (uint8_t) std::rand(); 36 | } 37 | 38 | // count 1 bits inside vector 39 | uint64_t benchmark(const std::vector& vect, int iters) 40 | { 41 | uint64_t total = 0; 42 | int old = - 1; 43 | 44 | for (int i = 0; i < iters; i++) 45 | { 46 | int percent = (int)(100.0 * i / iters); 47 | if (percent > old) 48 | { 49 | std::cout << "\rStatus: " << percent << "%" << std::flush; 50 | old = percent; 51 | } 52 | total += popcnt(&vect[0], vect.size()); 53 | } 54 | 55 | return total; 56 | } 57 | 58 | void verify(uint64_t cnt, uint64_t total, int iters) 59 | { 60 | if (cnt != total / iters) 61 | { 62 | std::cerr << "libpopcnt verification failed!" 
<< std::endl; 63 | std::exit(1); 64 | } 65 | } 66 | 67 | int main(int argc, char* argv[]) 68 | { 69 | int bytes = (1 << 10) * 16; 70 | int iters = 10000000; 71 | 72 | if (argc > 1) 73 | bytes = std::atoi(argv[1]); 74 | if (argc > 2) 75 | iters = std::atoi(argv[2]); 76 | 77 | uint64_t cnt = 0; 78 | std::vector vect(bytes); 79 | std::string algo; 80 | init(vect); 81 | 82 | std::cout << "Iters: " << iters << std::endl; 83 | 84 | if (bytes < 1024) 85 | std::cout << "Array size: " << bytes << " bytes" << std::endl; 86 | else if (bytes < 1024 * 1024) 87 | std::cout << "Array size: " << std::fixed << std::setprecision(2) << bytes / 1024.0 << " KB" << std::endl; 88 | else 89 | std::cout << "Array size: " << std::fixed << std::setprecision(2) << bytes / (1024.0 * 1024.0) << " MB" << std::endl; 90 | 91 | #if defined(LIBPOPCNT_X86_OR_X64) 92 | 93 | #if defined(LIBPOPCNT_HAVE_CPUID) 94 | int cpuid = get_cpuid(); 95 | if ((cpuid & LIBPOPCNT_BIT_AVX512_VPOPCNTDQ) && bytes >= 40) 96 | algo = "AVX512"; 97 | else if ((cpuid & LIBPOPCNT_BIT_AVX2) && bytes >= 512) 98 | algo = "AVX2"; 99 | else if (cpuid & LIBPOPCNT_BIT_POPCNT) 100 | algo = "POPCNT"; 101 | #else 102 | #if defined(LIBPOPCNT_HAVE_AVX512) && (defined(__AVX512__) || \ 103 | (defined(__AVX512F__) && \ 104 | defined(__AVX512BW__) && \ 105 | defined(__AVX512VPOPCNTDQ__))) 106 | if (algo.empty() && bytes >= 40) 107 | algo = "AVX512"; 108 | #endif 109 | #if defined(LIBPOPCNT_HAVE_AVX2) && defined(__AVX2__) 110 | if (algo.empty() && bytes >= 512) 111 | algo = "AVX2"; 112 | #endif 113 | #if defined(LIBPOPCNT_HAVE_POPCNT) && defined(__POPCNT__) 114 | if (algo.empty()) 115 | algo = "POPCNT"; 116 | #endif 117 | #endif 118 | 119 | #elif defined(__ARM_FEATURE_SVE) && \ 120 | __has_include() 121 | algo = "ARM SVE"; 122 | #elif (defined(__ARM_NEON) || \ 123 | defined(__aarch64__)) && \ 124 | __has_include() 125 | algo = "ARM NEON"; 126 | #elif defined(__PPC64__) 127 | algo = "POPCNTD"; 128 | #endif 129 | 130 | if (algo.empty()) 131 | 
algo = "integer popcount"; 132 | 133 | std::cout << "Algorithm: " << algo << std::endl; 134 | 135 | for (size_t i = 0; i < vect.size(); i++) 136 | cnt += popcnt64_bitwise(vect[i]); 137 | 138 | double seconds = get_seconds(); 139 | uint64_t total = benchmark(vect, iters); 140 | seconds = get_seconds() - seconds; 141 | 142 | std::cout << "\rStatus: 100%" << std::endl; 143 | std::cout << "Seconds: " << std::fixed << std::setprecision(2) << seconds << std::endl; 144 | 145 | double total_bytes = (double) bytes * (double) iters; 146 | double GB = total_bytes / 1e9; 147 | double GBs = GB / seconds; 148 | 149 | std::cout << std::fixed << std::setprecision(1) << GBs << " GB/s" << std::endl; 150 | verify(cnt, total, iters); 151 | 152 | return 0; 153 | } 154 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # libpopcnt 2 | 3 | [![Build status](https://github.com/kimwalisch/libpopcnt/actions/workflows/ci.yml/badge.svg)](https://github.com/kimwalisch/libpopcnt/actions/workflows/ci.yml) 4 | [![Github Releases](https://img.shields.io/github/release/kimwalisch/libpopcnt.svg)](https://github.com/kimwalisch/libpopcnt/releases) 5 | 6 | ```libpopcnt.h``` is a header-only C/C++ library for counting the 7 | number of 1 bits (bit population count) in an array as quickly as 8 | possible using specialized CPU instructions i.e. 9 | [POPCNT](https://en.wikipedia.org/wiki/SSE4#POPCNT_and_LZCNT), 10 | [AVX2](https://en.wikipedia.org/wiki/Advanced_Vector_Extensions), 11 | [AVX512](https://en.wikipedia.org/wiki/Advanced_Vector_Extensions), 12 | [NEON](https://en.wikipedia.org/wiki/ARM_architecture_family#Advanced_SIMD_(Neon)), 13 | [SVE](https://en.wikipedia.org/wiki/AArch64#Scalable_Vector_Extension_(SVE)). 14 | ```libpopcnt.h``` has been tested successfully using the GCC, 15 | Clang and MSVC compilers. 
16 | 17 | ## C/C++ API 18 | 19 | ```C 20 | #include "libpopcnt.h" 21 | 22 | /* 23 | * Count the number of 1 bits in the data array 24 | * @data: An array 25 | * @size: Size of data in bytes 26 | */ 27 | uint64_t popcnt(const void* data, uint64_t size); 28 | ``` 29 | 30 | ## How to compile 31 | 32 | ```libpopcnt.h``` does not require any special compiler flags like ```-mavx2```! 33 | To get the best performance, all we recommend is compiling with 34 | optimizations enabled e.g. ```-O3``` or ```-O2```. 35 | 36 | ```bash 37 | cc -O3 program.c 38 | c++ -O3 program.cpp 39 | ``` 40 | 41 | ## CPU architectures 42 | 43 | ```libpopcnt.h``` has hardware-accelerated popcount algorithms for 44 | the following CPU architectures: 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 |
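<table>
<tr><td>x86</td><td>POPCNT, AVX2, AVX512</td></tr>
<tr><td>x86-64</td><td>POPCNT, AVX2, AVX512</td></tr>
<tr><td>ARM</td><td>NEON, SVE</td></tr>
<tr><td>PPC64</td><td>POPCNTD</td></tr>
</table>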
64 | 65 | For other CPU architectures a fast integer popcount algorithm is used. 66 | 67 | ## How it works 68 | 69 | On x86 CPUs, ```libpopcnt.h``` first queries your CPU's supported 70 | instruction sets using the ```CPUID``` instruction (this is done only once). 71 | Then ```libpopcnt.h``` chooses the fastest bit population count algorithm 72 | supported by your CPU: 73 | 74 | * If the CPU supports ```AVX512``` the ```AVX512 VPOPCNT``` algorithm is used. 75 | * Else if the CPU supports ```AVX2``` the ```AVX2 Harley Seal``` algorithm is used. 76 | * Else if the CPU supports ```POPCNT``` the ```POPCNT``` algorithm is used. 77 | * For CPUs without ```POPCNT``` instruction a portable integer algorithm is used. 78 | 79 | Note that ```libpopcnt.h``` works on all CPUs (x86, ARM, PPC, WebAssembly, ...). 80 | It is portable by default and hardware acceleration is only enabled if the CPU 81 | supports it. ```libpopcnt.h``` is also thread-safe. 82 | 83 | We take performance seriously: if you compile using e.g. ```-march=native``` 84 | on an x86 CPU with AVX512 support then all runtime ```CPUID``` checks are removed! 85 | 86 | ## ARM SVE (Scalable Vector Extension) 87 | 88 | ARM SVE is a new vector instruction set for ARM CPUs that was first released in 89 | 2020. ARM SVE supports a variable vector length from 128 to 2048 bits. Hence 90 | ARM SVE algorithms can be much faster than ARM NEON algorithms which are limited 91 | to 128 bits vector length. 92 | 93 | libpopcnt's new ARM SVE popcount algorithm is up to 3x faster than its ARM NEON 94 | popcount algorithm (on AWS Graviton3 CPUs). Unfortunately runtime dispatching to 95 | ARM SVE is not yet well supported by the GCC and Clang compilers and libc's. 96 | Therefore, by default only the (portable) ARM NEON popcount algorithm is enabled 97 | when using libpopcnt on ARM CPUs.
98 | 99 | To enable libpopcnt's ARM SVE popcount algorithm you need to compile your program 100 | using your compiler's ARM SVE option e.g.: 101 | 102 | ```bash 103 | gcc -O3 -march=armv8-a+sve program.c 104 | g++ -O3 -march=armv8-a+sve program.cpp 105 | ``` 106 | 107 | ## Development 108 | 109 | ```bash 110 | cmake . 111 | make -j 112 | make test 113 | ``` 114 | 115 | The above commands also build the ```benchmark``` program which is 116 | useful for benchmarking ```libpopcnt.h```. Below is a 117 | usage example run on an AMD EPYC 9R14 CPU from 2023: 118 | 119 | ```bash 120 | # Usage: ./benchmark [array bytes] [iters] 121 | ./benchmark 122 | Iters: 10000000 123 | Array size: 16.00 KB 124 | Algorithm: AVX512 125 | Status: 100% 126 | Seconds: 1.23 127 | 133.5 GB/s 128 | ``` 129 | 130 | ## Acknowledgments 131 | 132 | Some of the algorithms used in ```libpopcnt.h``` are described in the paper 133 | [Faster Population Counts using AVX2 Instructions](https://arxiv.org/abs/1611.07612) 134 | by Daniel Lemire, Nathan Kurz and Wojciech Mula (23 Nov 2016). The AVX2 Harley Seal 135 | popcount algorithm used in ```libpopcnt.h``` has been copied from Wojciech Muła's 136 | [sse-popcount](https://github.com/WojciechMula/sse-popcount) GitHub repo. 137 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: ci 2 | 3 | on: 4 | push: 5 | branches-ignore: 6 | - gh-pages 7 | pull_request: 8 | branches-ignore: 9 | - gh-pages 10 | 11 | jobs: 12 | gcc_linux: 13 | strategy: 14 | matrix: 15 | platform: ['ubuntu-latest'] 16 | config: ['Debug', 'Release'] 17 | runs-on: ${{ matrix.platform }} 18 | env: 19 | CC: gcc 20 | CXX: g++ 21 | steps: 22 | - uses: actions/checkout@v4 23 | - name: Build libpopcnt 24 | run: | 25 | cmake . 
-DCMAKE_BUILD_TYPE=${{matrix.config}} -DCMAKE_C_FLAGS="-Wall -Wextra -pedantic -Werror" -DCMAKE_CXX_FLAGS="-Wall -Wextra -pedantic -Werror" 26 | cmake --build . --parallel --verbose 27 | - name: CTest (unit tests) 28 | run: ctest -j2 29 | - name: benchmark 30 | run: ./benchmark 16389 100 31 | 32 | gcc_linux_march_native: 33 | strategy: 34 | matrix: 35 | platform: ['ubuntu-latest'] 36 | config: ['Debug', 'Release'] 37 | runs-on: ${{ matrix.platform }} 38 | env: 39 | CC: gcc 40 | CXX: g++ 41 | steps: 42 | - uses: actions/checkout@v4 43 | - name: Build libpopcnt 44 | run: | 45 | cmake . -DCMAKE_BUILD_TYPE=${{matrix.config}} -DCMAKE_C_FLAGS="-march=native -Wall -Wextra -pedantic -Werror" -DCMAKE_CXX_FLAGS="-march=native -Wall -Wextra -pedantic -Werror" 46 | cmake --build . --parallel --verbose 47 | - name: CTest (unit tests) 48 | run: ctest -j2 49 | - name: benchmark 50 | run: ./benchmark 16389 100 51 | 52 | gcc_linux_valgrind: 53 | runs-on: ubuntu-latest 54 | env: 55 | CC: gcc 56 | CXX: g++ 57 | steps: 58 | - uses: actions/checkout@v4 59 | - name: Install valgrind 60 | run: | 61 | sudo apt update 62 | sudo apt install valgrind 63 | - name: Build libpopcnt 64 | run: | 65 | cmake . -DCMAKE_C_FLAGS="-Wall -Wextra -pedantic -Werror -g" -DCMAKE_CXX_FLAGS="-Wall -Wextra -pedantic -Werror -g" 66 | cmake --build . --parallel --verbose 67 | - name: CTest (unit tests) 68 | run: ctest -j2 69 | - name: benchmark 70 | run: ./benchmark 16389 100 71 | 72 | gcc_linux_sanitizers: 73 | # This test fails on Ubuntu 22.04 likely due to an Ubuntu or compiler bug. 74 | # See discussion at: https://github.com/quantumlib/Stim/issues/717#issuecomment-2002623560 75 | runs-on: ubuntu-20.04 76 | env: 77 | CC: gcc 78 | CXX: g++ 79 | steps: 80 | - uses: actions/checkout@v4 81 | - name: Build libpopcnt 82 | run: | 83 | cmake . 
-DCMAKE_BUILD_TYPE=Debug -DCMAKE_C_FLAGS="-Wall -Wextra -pedantic -Werror -O1 -g -fsanitize=address -fsanitize=undefined -fno-sanitize-recover=all -fno-omit-frame-pointer" -DCMAKE_CXX_FLAGS="-Wall -Wextra -pedantic -Werror -O1 -g -fsanitize=address -fsanitize=undefined -fno-sanitize-recover=all -fno-omit-frame-pointer" 84 | cmake --build . --parallel --verbose 85 | - name: CTest (unit tests) 86 | run: ctest -j2 --output-on-failure 87 | - name: benchmark 88 | run: ./benchmark 16389 100 89 | 90 | clang_linux: 91 | strategy: 92 | matrix: 93 | platform: ['ubuntu-latest'] 94 | config: ['Debug', 'Release'] 95 | runs-on: ${{ matrix.platform }} 96 | env: 97 | CC: clang 98 | CXX: clang++ 99 | steps: 100 | - uses: actions/checkout@v4 101 | - name: Build libpopcnt 102 | run: | 103 | cmake . -DCMAKE_BUILD_TYPE=${{matrix.config}} -DCMAKE_C_FLAGS="-Wall -Wextra -pedantic -Werror" -DCMAKE_CXX_FLAGS="-Wall -Wextra -pedantic -Werror" 104 | cmake --build . --parallel --verbose 105 | - name: CTest (unit tests) 106 | run: ctest -j2 107 | - name: benchmark 108 | run: ./benchmark 16389 100 109 | 110 | clang_linux_march_native: 111 | strategy: 112 | matrix: 113 | platform: ['ubuntu-latest'] 114 | config: ['Debug', 'Release'] 115 | runs-on: ${{ matrix.platform }} 116 | env: 117 | CC: clang 118 | CXX: clang++ 119 | steps: 120 | - uses: actions/checkout@v4 121 | - name: Build libpopcnt 122 | run: | 123 | cmake . -DCMAKE_BUILD_TYPE=${{matrix.config}} -DCMAKE_C_FLAGS="-march=native -Wall -Wextra -pedantic -Werror" -DCMAKE_CXX_FLAGS="-march=native -Wall -Wextra -pedantic -Werror" 124 | cmake --build . 
--parallel --verbose 125 | - name: CTest (unit tests) 126 | run: ctest -j2 127 | - name: benchmark 128 | run: ./benchmark 16389 100 129 | 130 | clang_macos: 131 | strategy: 132 | matrix: 133 | platform: ['macos-latest'] 134 | config: ['Debug', 'Release'] 135 | runs-on: ${{ matrix.platform }} 136 | env: 137 | CC: clang 138 | CXX: clang++ 139 | steps: 140 | - uses: actions/checkout@v4 141 | - name: Build libpopcnt 142 | run: | 143 | cmake . -DCMAKE_BUILD_TYPE=${{matrix.config}} -DCMAKE_INSTALL_PREFIX=$(pwd) -DCMAKE_C_FLAGS="-Wall -Wextra -pedantic -Werror" -DCMAKE_CXX_FLAGS="-Wno-c++11-long-long -Wall -Wextra -pedantic -Werror" 144 | cmake --build . --parallel --verbose 145 | cmake --install . 146 | - name: CTest (unit tests) 147 | run: ctest -j2 148 | - name: benchmark 149 | run: ./benchmark 16389 100 150 | 151 | msvc_windows_vs2022: 152 | runs-on: windows-2022 153 | steps: 154 | - uses: actions/checkout@v4 155 | - name: Build libpopcnt 156 | run: | 157 | cmake . -G "Visual Studio 17 2022" -DCMAKE_CXX_FLAGS="/W3 /WX /MP /EHsc" 158 | cmake --build . --config Release --target install 159 | - name: CTest (unit tests) 160 | run: ctest -j2 -C Release --output-on-failure 161 | - name: benchmark 162 | run: Release\benchmark.exe 16389 100 163 | 164 | # See documentation: https://www.msys2.org/docs/ci/ 165 | msvc_windows_mingw64: 166 | strategy: 167 | matrix: 168 | platform: ['windows-latest'] 169 | config: ['Debug', 'Release'] 170 | runs-on: ${{ matrix.platform }} 171 | defaults: 172 | run: 173 | shell: msys2 {0} 174 | steps: 175 | - uses: actions/checkout@v4 176 | - uses: msys2/setup-msys2@v2 177 | with: 178 | update: true 179 | install: base-devel mingw-w64-x86_64-toolchain mingw-w64-x86_64-cmake 180 | - name: Build libpopcnt 181 | run: | 182 | cmake . -G "Unix Makefiles" -DCMAKE_BUILD_TYPE=${{matrix.config}} -DCMAKE_C_FLAGS="-Wall -Wextra -pedantic -Werror" -DCMAKE_CXX_FLAGS="-Wall -Wextra -pedantic -Werror" 183 | cmake --build . 
--parallel --verbose 184 | - name: CTest (unit tests) 185 | run: ctest -j2 186 | - name: benchmark 187 | run: ./benchmark 16389 100 188 | -------------------------------------------------------------------------------- /libpopcnt.h: -------------------------------------------------------------------------------- 1 | /* 2 | * libpopcnt.h - C/C++ library for counting the number of 1 bits (bit 3 | * population count) in an array as quickly as possible using 4 | * specialized CPU instructions i.e. POPCNT, AVX2, AVX512, NEON. 5 | * 6 | * Copyright (c) 2016 - 2024, Kim Walisch 7 | * Copyright (c) 2016 - 2018, Wojciech Muła 8 | * 9 | * All rights reserved. 10 | * 11 | * Redistribution and use in source and binary forms, with or without 12 | * modification, are permitted provided that the following conditions are met: 13 | * 14 | * 1. Redistributions of source code must retain the above copyright notice, this 15 | * list of conditions and the following disclaimer. 16 | * 2. Redistributions in binary form must reproduce the above copyright notice, 17 | * this list of conditions and the following disclaimer in the documentation 18 | * and/or other materials provided with the distribution. 19 | * 20 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 21 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 22 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 24 | * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 25 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 26 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 27 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 28 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 29 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 | */ 31 | 32 | #ifndef LIBPOPCNT_H 33 | #define LIBPOPCNT_H 34 | 35 | #include 36 | 37 | #ifndef __has_builtin 38 | #define __has_builtin(x) 0 39 | #endif 40 | 41 | #ifndef __has_attribute 42 | #define __has_attribute(x) 0 43 | #endif 44 | 45 | #ifndef __has_include 46 | #define __has_include(x) 0 47 | #endif 48 | 49 | #ifdef __GNUC__ 50 | #define LIBPOPCNT_GNUC_PREREQ(x, y) \ 51 | (__GNUC__ > x || (__GNUC__ == x && __GNUC_MINOR__ >= y)) 52 | #else 53 | #define LIBPOPCNT_GNUC_PREREQ(x, y) 0 54 | #endif 55 | 56 | #ifdef __clang__ 57 | #define LIBPOPCNT_CLANG_PREREQ(x, y) \ 58 | (__clang_major__ > x || (__clang_major__ == x && __clang_minor__ >= y)) 59 | #else 60 | #define LIBPOPCNT_CLANG_PREREQ(x, y) 0 61 | #endif 62 | 63 | #if (_MSC_VER < 1900) && \ 64 | !defined(__cplusplus) 65 | #define inline __inline 66 | #endif 67 | 68 | #if (defined(__i386__) || \ 69 | defined(__x86_64__) || \ 70 | defined(_M_IX86) || \ 71 | defined(_M_X64)) 72 | #define LIBPOPCNT_X86_OR_X64 73 | #endif 74 | 75 | #if LIBPOPCNT_GNUC_PREREQ(4, 2) || \ 76 | __has_builtin(__builtin_popcount) 77 | #define LIBPOPCNT_HAVE_BUILTIN_POPCOUNT 78 | #endif 79 | 80 | #if LIBPOPCNT_GNUC_PREREQ(4, 2) || \ 81 | LIBPOPCNT_CLANG_PREREQ(3, 0) 82 | #define LIBPOPCNT_HAVE_ASM_POPCNT 83 | #endif 84 | 85 | #if defined(LIBPOPCNT_X86_OR_X64) && \ 86 | (defined(LIBPOPCNT_HAVE_ASM_POPCNT) || \ 87 | defined(_MSC_VER)) 88 | #define LIBPOPCNT_HAVE_POPCNT 89 | #endif 90 | 91 
| /* GCC compiler */ 92 | #if defined(LIBPOPCNT_X86_OR_X64) && \ 93 | LIBPOPCNT_GNUC_PREREQ(5, 0) 94 | #define LIBPOPCNT_HAVE_AVX2 95 | #endif 96 | 97 | /* GCC compiler */ 98 | #if defined(LIBPOPCNT_X86_OR_X64) && \ 99 | LIBPOPCNT_GNUC_PREREQ(11, 0) 100 | #define LIBPOPCNT_HAVE_AVX512 101 | #endif 102 | 103 | /* Clang (Unix-like OSes) */ 104 | #if defined(LIBPOPCNT_X86_OR_X64) && !defined(_MSC_VER) 105 | #if LIBPOPCNT_CLANG_PREREQ(3, 8) && \ 106 | __has_attribute(target) && \ 107 | (!defined(__apple_build_version__) || __apple_build_version__ >= 8000000) 108 | #define LIBPOPCNT_HAVE_AVX2 109 | #endif 110 | #if LIBPOPCNT_CLANG_PREREQ(9, 0) && \ 111 | __has_attribute(target) && \ 112 | (!defined(__apple_build_version__) || __apple_build_version__ >= 8000000) 113 | #define LIBPOPCNT_HAVE_AVX512 114 | #endif 115 | #endif 116 | 117 | /* MSVC compatible compilers (Windows) */ 118 | #if defined(LIBPOPCNT_X86_OR_X64) && \ 119 | defined(_MSC_VER) 120 | /* 121 | * There is an LLVM/Clang bug on Windows where function targets 122 | * for AVX2 and AVX512 fail to compile unless the user compiles 123 | * using the options /arch:AVX2 and /arch:AVX512. 124 | * All Clang versions <= 18.0 (from 2024) are affected by this bug. 125 | * However, I expect this bug will be fixed in near future: 126 | * https://github.com/llvm/llvm-project/issues/53520 127 | */ 128 | #if defined(__clang__) 129 | #if defined(__AVX2__) 130 | #define LIBPOPCNT_HAVE_AVX2 131 | #endif 132 | #if defined(__AVX512__) 133 | #define LIBPOPCNT_HAVE_AVX2 134 | #define LIBPOPCNT_HAVE_AVX512 135 | #endif 136 | /* MSVC 2017 or later does not require 137 | * /arch:AVX2 or /arch:AVX512 */ 138 | #elif _MSC_VER >= 1910 139 | #define LIBPOPCNT_HAVE_AVX2 140 | #define LIBPOPCNT_HAVE_AVX512 141 | #endif 142 | #endif 143 | 144 | /* 145 | * Only enable CPUID runtime checks if this is really 146 | * needed. E.g. do not enable if user has compiled 147 | * using -march=native on a CPU that supports AVX512. 
148 | */ 149 | #if defined(LIBPOPCNT_X86_OR_X64) && \ 150 | (defined(__cplusplus) || \ 151 | defined(_MSC_VER) || \ 152 | (LIBPOPCNT_GNUC_PREREQ(4, 2) || \ 153 | __has_builtin(__sync_val_compare_and_swap))) && \ 154 | ((defined(LIBPOPCNT_HAVE_AVX512) && !(defined(__AVX512__) || \ 155 | (defined(__AVX512F__) && \ 156 | defined(__AVX512BW__) && \ 157 | defined(__AVX512VPOPCNTDQ__)))) || \ 158 | (defined(LIBPOPCNT_HAVE_AVX2) && !defined(__AVX2__)) || \ 159 | (defined(LIBPOPCNT_HAVE_POPCNT) && !defined(__POPCNT__))) 160 | #define LIBPOPCNT_HAVE_CPUID 161 | #endif 162 | 163 | #ifdef __cplusplus 164 | extern "C" { 165 | #endif 166 | 167 | /* 168 | * This uses fewer arithmetic operations than any other known 169 | * implementation on machines with fast multiplication. 170 | * It uses 12 arithmetic operations, one of which is a multiply. 171 | * http://en.wikipedia.org/wiki/Hamming_weight#Efficient_implementation 172 | */ 173 | static inline uint64_t popcnt64_bitwise(uint64_t x) 174 | { 175 | uint64_t m1 = 0x5555555555555555ull; 176 | uint64_t m2 = 0x3333333333333333ull; 177 | uint64_t m4 = 0x0F0F0F0F0F0F0F0Full; 178 | uint64_t h01 = 0x0101010101010101ull; 179 | 180 | x -= (x >> 1) & m1; 181 | x = (x & m2) + ((x >> 2) & m2); 182 | x = (x + (x >> 4)) & m4; 183 | 184 | return (x * h01) >> 56; 185 | } 186 | 187 | #if defined(LIBPOPCNT_HAVE_ASM_POPCNT) && \ 188 | defined(__x86_64__) 189 | 190 | static inline uint64_t popcnt64(uint64_t x) 191 | { 192 | __asm__ ("popcnt %1, %0" : "=r" (x) : "0" (x)); 193 | return x; 194 | } 195 | 196 | #elif defined(LIBPOPCNT_HAVE_ASM_POPCNT) && \ 197 | defined(__i386__) 198 | 199 | static inline uint32_t popcnt32(uint32_t x) 200 | { 201 | __asm__ ("popcnt %1, %0" : "=r" (x) : "0" (x)); 202 | return x; 203 | } 204 | 205 | static inline uint64_t popcnt64(uint64_t x) 206 | { 207 | return popcnt32((uint32_t) x) + 208 | popcnt32((uint32_t)(x >> 32)); 209 | } 210 | 211 | #elif defined(_MSC_VER) && \ 212 | defined(_M_X64) 213 | 214 | #include 215 | 
216 | static inline uint64_t popcnt64(uint64_t x) 217 | { 218 | return __popcnt64(x); 219 | } 220 | 221 | #elif defined(_MSC_VER) && \ 222 | defined(_M_IX86) 223 | 224 | #include 225 | 226 | static inline uint64_t popcnt64(uint64_t x) 227 | { 228 | return __popcnt((uint32_t) x) + 229 | __popcnt((uint32_t)(x >> 32)); 230 | } 231 | 232 | /* non x86 CPUs */ 233 | #elif defined(LIBPOPCNT_HAVE_BUILTIN_POPCOUNT) 234 | 235 | static inline uint64_t popcnt64(uint64_t x) 236 | { 237 | return __builtin_popcountll(x); 238 | } 239 | 240 | /* no hardware POPCNT, 241 | * use pure integer algorithm */ 242 | #else 243 | 244 | static inline uint64_t popcnt64(uint64_t x) 245 | { 246 | return popcnt64_bitwise(x); 247 | } 248 | 249 | #endif 250 | 251 | #if defined(LIBPOPCNT_HAVE_CPUID) 252 | 253 | #if defined(_MSC_VER) 254 | #include 255 | #include 256 | #endif 257 | 258 | /* CPUID bits documentation: */ 259 | /* https://en.wikipedia.org/wiki/CPUID */ 260 | 261 | /* %ebx bit flags */ 262 | #define LIBPOPCNT_BIT_AVX2 (1 << 5) 263 | #define LIBPOPCNT_BIT_AVX512F (1 << 16) 264 | #define LIBPOPCNT_BIT_AVX512BW (1 << 30) 265 | 266 | /* %ecx bit flags */ 267 | #define LIBPOPCNT_BIT_AVX512_VPOPCNTDQ (1 << 14) 268 | #define LIBPOPCNT_BIT_POPCNT (1 << 23) 269 | 270 | /* xgetbv bit flags */ 271 | #define LIBPOPCNT_XSTATE_SSE (1 << 1) 272 | #define LIBPOPCNT_XSTATE_YMM (1 << 2) 273 | #define LIBPOPCNT_XSTATE_ZMM (7 << 5) 274 | 275 | static inline void run_cpuid(int eax, int ecx, int* abcd) 276 | { 277 | #if defined(_MSC_VER) 278 | __cpuidex(abcd, eax, ecx); 279 | #else 280 | int ebx = 0; 281 | int edx = 0; 282 | 283 | #if defined(__i386__) && \ 284 | defined(__PIC__) 285 | /* In case of PIC under 32-bit EBX cannot be clobbered */ 286 | __asm__ __volatile__("movl %%ebx, %%edi;" 287 | "cpuid;" 288 | "xchgl %%ebx, %%edi;" 289 | : "+a" (eax), 290 | "=D" (ebx), 291 | "+c" (ecx), 292 | "=d" (edx)); 293 | #else 294 | __asm__ __volatile__("cpuid" 295 | : "+a" (eax), 296 | "+b" (ebx), 297 | "+c" (ecx), 
298 | "=d" (edx)); 299 | #endif 300 | 301 | abcd[0] = eax; 302 | abcd[1] = ebx; 303 | abcd[2] = ecx; 304 | abcd[3] = edx; 305 | #endif 306 | } 307 | 308 | #if defined(LIBPOPCNT_HAVE_AVX2) || \ 309 | defined(LIBPOPCNT_HAVE_AVX512) 310 | 311 | static inline uint64_t get_xcr0(void) 312 | { 313 | #if defined(_MSC_VER) 314 | return _xgetbv(0); 315 | #else 316 | uint32_t eax; 317 | uint32_t edx; 318 | 319 | __asm__ __volatile__("xgetbv" : "=a"(eax), "=d"(edx) : "c"(0)); 320 | return eax | (((uint64_t) edx) << 32); 321 | #endif 322 | } 323 | 324 | #endif 325 | 326 | static inline int get_cpuid(void) 327 | { 328 | int flags = 0; 329 | int abcd[4]; 330 | 331 | run_cpuid(1, 0, abcd); 332 | 333 | if ((abcd[2] & LIBPOPCNT_BIT_POPCNT) == LIBPOPCNT_BIT_POPCNT) 334 | flags |= LIBPOPCNT_BIT_POPCNT; 335 | 336 | #if defined(LIBPOPCNT_HAVE_AVX2) || \ 337 | defined(LIBPOPCNT_HAVE_AVX512) 338 | 339 | int osxsave_mask = (1 << 27); 340 | 341 | /* ensure OS supports extended processor state management */ 342 | if ((abcd[2] & osxsave_mask) != osxsave_mask) 343 | return 0; 344 | 345 | uint64_t ymm_mask = LIBPOPCNT_XSTATE_SSE | LIBPOPCNT_XSTATE_YMM; 346 | uint64_t zmm_mask = LIBPOPCNT_XSTATE_SSE | LIBPOPCNT_XSTATE_YMM | LIBPOPCNT_XSTATE_ZMM; 347 | uint64_t xcr0 = get_xcr0(); 348 | 349 | if ((xcr0 & ymm_mask) == ymm_mask) 350 | { 351 | run_cpuid(7, 0, abcd); 352 | 353 | if ((abcd[1] & LIBPOPCNT_BIT_AVX2) == LIBPOPCNT_BIT_AVX2) 354 | flags |= LIBPOPCNT_BIT_AVX2; 355 | 356 | if ((xcr0 & zmm_mask) == zmm_mask) 357 | { 358 | /* If all AVX512 features required by our popcnt_avx512() are supported */ 359 | /* then we add LIBPOPCNT_BIT_AVX512_VPOPCNTDQ to our CPUID flags. 
*/ 360 | if ((abcd[1] & LIBPOPCNT_BIT_AVX512F) == LIBPOPCNT_BIT_AVX512F && 361 | (abcd[1] & LIBPOPCNT_BIT_AVX512BW) == LIBPOPCNT_BIT_AVX512BW && 362 | (abcd[2] & LIBPOPCNT_BIT_AVX512_VPOPCNTDQ) == LIBPOPCNT_BIT_AVX512_VPOPCNTDQ) 363 | flags |= LIBPOPCNT_BIT_AVX512_VPOPCNTDQ; 364 | } 365 | } 366 | 367 | #endif 368 | 369 | return flags; 370 | } 371 | 372 | #endif /* cpuid */ 373 | 374 | #if defined(LIBPOPCNT_HAVE_AVX2) && \ 375 | __has_include() 376 | 377 | #include 378 | 379 | #if __has_attribute(target) 380 | __attribute__ ((target ("avx2"))) 381 | #endif 382 | static inline void CSA256(__m256i* h, __m256i* l, __m256i a, __m256i b, __m256i c) 383 | { 384 | __m256i u = _mm256_xor_si256(a, b); 385 | *h = _mm256_or_si256(_mm256_and_si256(a, b), _mm256_and_si256(u, c)); 386 | *l = _mm256_xor_si256(u, c); 387 | } 388 | 389 | #if __has_attribute(target) 390 | __attribute__ ((target ("avx2"))) 391 | #endif 392 | static inline __m256i popcnt256(__m256i v) 393 | { 394 | __m256i lookup1 = _mm256_setr_epi8( 395 | 4, 5, 5, 6, 5, 6, 6, 7, 396 | 5, 6, 6, 7, 6, 7, 7, 8, 397 | 4, 5, 5, 6, 5, 6, 6, 7, 398 | 5, 6, 6, 7, 6, 7, 7, 8 399 | ); 400 | 401 | __m256i lookup2 = _mm256_setr_epi8( 402 | 4, 3, 3, 2, 3, 2, 2, 1, 403 | 3, 2, 2, 1, 2, 1, 1, 0, 404 | 4, 3, 3, 2, 3, 2, 2, 1, 405 | 3, 2, 2, 1, 2, 1, 1, 0 406 | ); 407 | 408 | __m256i low_mask = _mm256_set1_epi8(0x0f); 409 | __m256i lo = _mm256_and_si256(v, low_mask); 410 | __m256i hi = _mm256_and_si256(_mm256_srli_epi16(v, 4), low_mask); 411 | __m256i popcnt1 = _mm256_shuffle_epi8(lookup1, lo); 412 | __m256i popcnt2 = _mm256_shuffle_epi8(lookup2, hi); 413 | 414 | return _mm256_sad_epu8(popcnt1, popcnt2); 415 | } 416 | 417 | /* 418 | * AVX2 Harley-Seal popcount (4th iteration). 419 | * The algorithm is based on the paper "Faster Population Counts 420 | * using AVX2 Instructions" by Daniel Lemire, Nathan Kurz and 421 | * Wojciech Mula (23 Nov 2016). 
422 | * @see https://arxiv.org/abs/1611.07612 423 | */ 424 | #if __has_attribute(target) 425 | __attribute__ ((target ("avx2"))) 426 | #endif 427 | static inline uint64_t popcnt_avx2(const __m256i* ptr, uint64_t size) 428 | { 429 | __m256i cnt = _mm256_setzero_si256(); 430 | __m256i ones = _mm256_setzero_si256(); 431 | __m256i twos = _mm256_setzero_si256(); 432 | __m256i fours = _mm256_setzero_si256(); 433 | __m256i eights = _mm256_setzero_si256(); 434 | __m256i sixteens = _mm256_setzero_si256(); 435 | __m256i twosA, twosB, foursA, foursB, eightsA, eightsB; 436 | 437 | uint64_t i = 0; 438 | uint64_t limit = size - size % 16; 439 | uint64_t* cnt64; 440 | 441 | for(; i < limit; i += 16) 442 | { 443 | CSA256(&twosA, &ones, ones, _mm256_loadu_si256(ptr + i + 0), _mm256_loadu_si256(ptr + i + 1)); 444 | CSA256(&twosB, &ones, ones, _mm256_loadu_si256(ptr + i + 2), _mm256_loadu_si256(ptr + i + 3)); 445 | CSA256(&foursA, &twos, twos, twosA, twosB); 446 | CSA256(&twosA, &ones, ones, _mm256_loadu_si256(ptr + i + 4), _mm256_loadu_si256(ptr + i + 5)); 447 | CSA256(&twosB, &ones, ones, _mm256_loadu_si256(ptr + i + 6), _mm256_loadu_si256(ptr + i + 7)); 448 | CSA256(&foursB, &twos, twos, twosA, twosB); 449 | CSA256(&eightsA, &fours, fours, foursA, foursB); 450 | CSA256(&twosA, &ones, ones, _mm256_loadu_si256(ptr + i + 8), _mm256_loadu_si256(ptr + i + 9)); 451 | CSA256(&twosB, &ones, ones, _mm256_loadu_si256(ptr + i + 10), _mm256_loadu_si256(ptr + i + 11)); 452 | CSA256(&foursA, &twos, twos, twosA, twosB); 453 | CSA256(&twosA, &ones, ones, _mm256_loadu_si256(ptr + i + 12), _mm256_loadu_si256(ptr + i + 13)); 454 | CSA256(&twosB, &ones, ones, _mm256_loadu_si256(ptr + i + 14), _mm256_loadu_si256(ptr + i + 15)); 455 | CSA256(&foursB, &twos, twos, twosA, twosB); 456 | CSA256(&eightsB, &fours, fours, foursA, foursB); 457 | CSA256(&sixteens, &eights, eights, eightsA, eightsB); 458 | 459 | cnt = _mm256_add_epi64(cnt, popcnt256(sixteens)); 460 | } 461 | 462 | cnt = _mm256_slli_epi64(cnt, 
4); 463 | cnt = _mm256_add_epi64(cnt, _mm256_slli_epi64(popcnt256(eights), 3)); 464 | cnt = _mm256_add_epi64(cnt, _mm256_slli_epi64(popcnt256(fours), 2)); 465 | cnt = _mm256_add_epi64(cnt, _mm256_slli_epi64(popcnt256(twos), 1)); 466 | cnt = _mm256_add_epi64(cnt, popcnt256(ones)); 467 | 468 | for(; i < size; i++) 469 | cnt = _mm256_add_epi64(cnt, popcnt256(_mm256_loadu_si256(ptr + i))); 470 | 471 | cnt64 = (uint64_t*) &cnt; 472 | 473 | return cnt64[0] + 474 | cnt64[1] + 475 | cnt64[2] + 476 | cnt64[3]; 477 | } 478 | 479 | #endif 480 | 481 | #if defined(LIBPOPCNT_HAVE_AVX512) && \ 482 | __has_include() 483 | 484 | #include 485 | 486 | #if __has_attribute(target) 487 | __attribute__ ((target ("avx512f,avx512bw,avx512vpopcntdq"))) 488 | #endif 489 | static inline uint64_t popcnt_avx512(const uint8_t* ptr8, uint64_t size) 490 | { 491 | __m512i cnt = _mm512_setzero_si512(); 492 | const uint64_t* ptr64 = (const uint64_t*) ptr8; 493 | uint64_t size64 = size / sizeof(uint64_t); 494 | uint64_t i = 0; 495 | 496 | for (; i + 32 <= size64; i += 32) 497 | { 498 | __m512i vec0 = _mm512_loadu_epi64(&ptr64[i + 0]); 499 | __m512i vec1 = _mm512_loadu_epi64(&ptr64[i + 8]); 500 | __m512i vec2 = _mm512_loadu_epi64(&ptr64[i + 16]); 501 | __m512i vec3 = _mm512_loadu_epi64(&ptr64[i + 24]); 502 | 503 | vec0 = _mm512_popcnt_epi64(vec0); 504 | vec1 = _mm512_popcnt_epi64(vec1); 505 | vec2 = _mm512_popcnt_epi64(vec2); 506 | vec3 = _mm512_popcnt_epi64(vec3); 507 | 508 | cnt = _mm512_add_epi64(cnt, vec0); 509 | cnt = _mm512_add_epi64(cnt, vec1); 510 | cnt = _mm512_add_epi64(cnt, vec2); 511 | cnt = _mm512_add_epi64(cnt, vec3); 512 | } 513 | 514 | for (; i + 8 <= size64; i += 8) 515 | { 516 | __m512i vec = _mm512_loadu_epi64(&ptr64[i]); 517 | vec = _mm512_popcnt_epi64(vec); 518 | cnt = _mm512_add_epi64(cnt, vec); 519 | } 520 | 521 | i *= sizeof(uint64_t); 522 | 523 | /* Process last 63 bytes */ 524 | if (i < size) 525 | { 526 | __mmask64 mask = (__mmask64) (0xffffffffffffffffull >> (i + 64 - 
size)); 527 | __m512i vec = _mm512_maskz_loadu_epi8(mask, &ptr8[i]); 528 | vec = _mm512_popcnt_epi64(vec); 529 | cnt = _mm512_add_epi64(cnt, vec); 530 | } 531 | 532 | return _mm512_reduce_add_epi64(cnt); 533 | } 534 | 535 | #endif 536 | 537 | /* x86 CPUs */ 538 | #if defined(LIBPOPCNT_X86_OR_X64) 539 | 540 | /* 541 | * Count the number of 1 bits in the data array 542 | * @data: An array 543 | * @size: Size of data in bytes 544 | */ 545 | static uint64_t popcnt(const void* data, uint64_t size) 546 | { 547 | /* 548 | * CPUID runtime checks are only enabled if this is needed. 549 | * E.g. CPUID is disabled when a user compiles his 550 | * code using -march=native on a CPU with AVX512. 551 | */ 552 | #if defined(LIBPOPCNT_HAVE_CPUID) 553 | #if defined(__cplusplus) 554 | /* C++11 thread-safe singleton */ 555 | static const int cpuid = get_cpuid(); 556 | #else 557 | static int cpuid_ = -1; 558 | int cpuid = cpuid_; 559 | if (cpuid == -1) 560 | { 561 | cpuid = get_cpuid(); 562 | 563 | #if defined(_MSC_VER) 564 | _InterlockedCompareExchange(&cpuid_, cpuid, -1); 565 | #else 566 | __sync_val_compare_and_swap(&cpuid_, -1, cpuid); 567 | #endif 568 | } 569 | #endif 570 | #endif 571 | 572 | const uint8_t* ptr = (const uint8_t*) data; 573 | uint64_t cnt = 0; 574 | uint64_t i = 0; 575 | 576 | #if defined(LIBPOPCNT_HAVE_AVX512) 577 | #if defined(__AVX512__) || \ 578 | (defined(__AVX512F__) && \ 579 | defined(__AVX512BW__) && \ 580 | defined(__AVX512VPOPCNTDQ__)) 581 | /* For tiny arrays AVX512 is not worth it */ 582 | if (i + 40 <= size) 583 | #else 584 | if ((cpuid & LIBPOPCNT_BIT_AVX512_VPOPCNTDQ) && 585 | i + 40 <= size) 586 | #endif 587 | return popcnt_avx512(ptr, size); 588 | #endif 589 | 590 | #if defined(LIBPOPCNT_HAVE_AVX2) 591 | #if defined(__AVX2__) 592 | /* AVX2 requires arrays >= 512 bytes */ 593 | if (i + 512 <= size) 594 | #else 595 | if ((cpuid & LIBPOPCNT_BIT_AVX2) && 596 | i + 512 <= size) 597 | #endif 598 | { 599 | const __m256i* ptr256 = (const __m256i*)(ptr + 
i); 600 | cnt += popcnt_avx2(ptr256, (size - i) / 32); 601 | i = size - size % 32; 602 | } 603 | #endif 604 | 605 | #if defined(LIBPOPCNT_HAVE_POPCNT) 606 | /* 607 | * The user has compiled without -mpopcnt. 608 | * Unfortunately the MSVC compiler does not have 609 | * a POPCNT macro so we cannot get rid of the 610 | * runtime check for MSVC. 611 | */ 612 | #if !defined(__POPCNT__) 613 | if (cpuid & LIBPOPCNT_BIT_POPCNT) 614 | #endif 615 | { 616 | if (i + 8 <= size) 617 | { 618 | uintptr_t rem = ((uintptr_t) &ptr[i]) % 8; 619 | 620 | /* Align &ptr[i] to an 8 byte boundary */ 621 | if (rem != 0) 622 | { 623 | uint64_t val = 0; 624 | uint64_t bytes = (uint64_t) (8 - rem % 8); 625 | bytes = (bytes <= 7) ? bytes : 7; 626 | for (uint64_t j = 0; j < bytes; j++) 627 | val |= ((uint64_t) ptr[i + j]) << (j * 8); 628 | cnt += popcnt64(val); 629 | i += bytes; 630 | } 631 | } 632 | 633 | for (; i + 8 <= size; i += 8) 634 | cnt += popcnt64(*(const uint64_t*)(ptr + i)); 635 | 636 | if (i < size) 637 | { 638 | uint64_t val = 0; 639 | uint64_t bytes = (uint64_t) (size - i); 640 | bytes = (bytes <= 7) ? bytes : 7; 641 | for (uint64_t j = 0; j < bytes; j++) 642 | val |= ((uint64_t) ptr[i + j]) << (j * 8); 643 | cnt += popcnt64(val); 644 | } 645 | 646 | return cnt; 647 | } 648 | #endif 649 | 650 | /* 651 | * This code is used for: 652 | * 1) Compiler does not support POPCNT. 653 | * 2) x86 CPU does not support POPCNT (cpuid != POPCNT). 654 | */ 655 | #if !defined(LIBPOPCNT_HAVE_POPCNT) || \ 656 | !defined(__POPCNT__) 657 | 658 | if (i + 8 <= size) 659 | { 660 | uintptr_t rem = ((uintptr_t) &ptr[i]) % 8; 661 | 662 | /* Align &ptr[i] to an 8 byte boundary */ 663 | if (rem != 0) 664 | { 665 | uint64_t val = 0; 666 | uint64_t bytes = (uint64_t) (8 - rem % 8); 667 | bytes = (bytes <= 7) ? 
bytes : 7; 668 | for (uint64_t j = 0; j < bytes; j++) 669 | val |= ((uint64_t) ptr[i + j]) << (j * 8); 670 | cnt += popcnt64_bitwise(val); 671 | i += bytes; 672 | } 673 | } 674 | 675 | for (; i + 8 <= size; i += 8) 676 | cnt += popcnt64_bitwise(*(const uint64_t*)(ptr + i)); 677 | 678 | if (i < size) 679 | { 680 | uint64_t val = 0; 681 | uint64_t bytes = (uint64_t) (size - i); 682 | bytes = (bytes <= 7) ? bytes : 7; 683 | for (uint64_t j = 0; j < bytes; j++) 684 | val |= ((uint64_t) ptr[i + j]) << (j * 8); 685 | cnt += popcnt64_bitwise(val); 686 | } 687 | 688 | return cnt; 689 | #endif 690 | } 691 | 692 | /* Compile with e.g. -march=armv8-a+sve to enable ARM SVE */ 693 | #elif defined(__ARM_FEATURE_SVE) && \ 694 | __has_include() 695 | 696 | #include 697 | 698 | /* 699 | * Count the number of 1 bits in the data array 700 | * @data: An array 701 | * @size: Size of data in bytes 702 | */ 703 | static inline uint64_t popcnt(const void* data, uint64_t size) 704 | { 705 | uint64_t i = 0; 706 | const uint64_t* ptr64 = (const uint64_t*) data; 707 | uint64_t size64 = size / sizeof(uint64_t); 708 | svuint64_t vcnt = svdup_u64(0); 709 | 710 | for (; i + svcntd() * 4 <= size64; i += svcntd() * 4) 711 | { 712 | svuint64_t vec0 = svld1_u64(svptrue_b64(), &ptr64[i + svcntd() * 0]); 713 | svuint64_t vec1 = svld1_u64(svptrue_b64(), &ptr64[i + svcntd() * 1]); 714 | svuint64_t vec2 = svld1_u64(svptrue_b64(), &ptr64[i + svcntd() * 2]); 715 | svuint64_t vec3 = svld1_u64(svptrue_b64(), &ptr64[i + svcntd() * 3]); 716 | 717 | vec0 = svcnt_u64_x(svptrue_b64(), vec0); 718 | vec1 = svcnt_u64_x(svptrue_b64(), vec1); 719 | vec2 = svcnt_u64_x(svptrue_b64(), vec2); 720 | vec3 = svcnt_u64_x(svptrue_b64(), vec3); 721 | 722 | vcnt = svadd_u64_x(svptrue_b64(), vcnt, vec0); 723 | vcnt = svadd_u64_x(svptrue_b64(), vcnt, vec1); 724 | vcnt = svadd_u64_x(svptrue_b64(), vcnt, vec2); 725 | vcnt = svadd_u64_x(svptrue_b64(), vcnt, vec3); 726 | } 727 | 728 | svbool_t pg = svwhilelt_b64(i, size64); 729 | 730 | 
while (svptest_any(svptrue_b64(), pg)) 731 | { 732 | svuint64_t vec = svld1_u64(pg, &ptr64[i]); 733 | vec = svcnt_u64_z(pg, vec); 734 | vcnt = svadd_u64_x(svptrue_b64(), vcnt, vec); 735 | i += svcntd(); 736 | pg = svwhilelt_b64(i, size64); 737 | } 738 | 739 | uint64_t cnt = svaddv_u64(svptrue_b64(), vcnt); 740 | uint64_t bytes = size % sizeof(uint64_t); 741 | 742 | if (bytes != 0) 743 | { 744 | i = size - bytes; 745 | const uint8_t* ptr8 = (const uint8_t*) data; 746 | svbool_t pg8 = svwhilelt_b8(i, size); 747 | svuint8_t vec = svld1_u8(pg8, &ptr8[i]); 748 | svuint8_t vcnt8 = svcnt_u8_z(pg8, vec); 749 | cnt += svaddv_u8(pg8, vcnt8); 750 | } 751 | 752 | return cnt; 753 | } 754 | 755 | #elif (defined(__ARM_NEON) || \ 756 | defined(__aarch64__) || \ 757 | defined(_M_ARM64)) && \ 758 | __has_include() 759 | 760 | #include 761 | 762 | static inline uint64x2_t vpadalq(uint64x2_t sum, uint8x16_t t) 763 | { 764 | return vpadalq_u32(sum, vpaddlq_u16(vpaddlq_u8(t))); 765 | } 766 | 767 | /* 768 | * Count the number of 1 bits in the data array 769 | * @data: An array 770 | * @size: Size of data in bytes 771 | */ 772 | static inline uint64_t popcnt(const void* data, uint64_t size) 773 | { 774 | uint64_t i = 0; 775 | uint64_t cnt = 0; 776 | uint64_t chunk_size = 64; 777 | const uint8_t* ptr = (const uint8_t*) data; 778 | 779 | if (size >= chunk_size) 780 | { 781 | uint64_t iters = size / chunk_size; 782 | uint64x2_t sum = vcombine_u64(vcreate_u64(0), vcreate_u64(0)); 783 | uint8x16_t zero = vcombine_u8(vcreate_u8(0), vcreate_u8(0)); 784 | 785 | do 786 | { 787 | uint8x16_t t0 = zero; 788 | uint8x16_t t1 = zero; 789 | uint8x16_t t2 = zero; 790 | uint8x16_t t3 = zero; 791 | 792 | /* 793 | * After every 31 iterations we need to add the 794 | * temporary sums (t0, t1, t2, t3) to the total sum. 795 | * We must ensure that the temporary sums <= 255 796 | * and 31 * 8 bits = 248 which is OK. 797 | */ 798 | uint64_t limit = (i + 31 < iters) ? 
i + 31 : iters; 799 | 800 | /* Each iteration processes 64 bytes */ 801 | for (; i < limit; i++) 802 | { 803 | uint8x16x4_t input = vld4q_u8(ptr); 804 | ptr += chunk_size; 805 | 806 | t0 = vaddq_u8(t0, vcntq_u8(input.val[0])); 807 | t1 = vaddq_u8(t1, vcntq_u8(input.val[1])); 808 | t2 = vaddq_u8(t2, vcntq_u8(input.val[2])); 809 | t3 = vaddq_u8(t3, vcntq_u8(input.val[3])); 810 | } 811 | 812 | sum = vpadalq(sum, t0); 813 | sum = vpadalq(sum, t1); 814 | sum = vpadalq(sum, t2); 815 | sum = vpadalq(sum, t3); 816 | } 817 | while (i < iters); 818 | 819 | i = 0; 820 | size %= chunk_size; 821 | 822 | uint64_t tmp[2]; 823 | vst1q_u64(tmp, sum); 824 | cnt += tmp[0]; 825 | cnt += tmp[1]; 826 | } 827 | 828 | if (i + 8 <= size) 829 | { 830 | uintptr_t rem = ((uintptr_t) &ptr[i]) % 8; 831 | 832 | /* Align &ptr[i] to an 8 byte boundary */ 833 | if (rem != 0) 834 | { 835 | uint64_t val = 0; 836 | uint64_t bytes = (uint64_t) (8 - rem % 8); 837 | bytes = (bytes <= 7) ? bytes : 7; 838 | for (uint64_t j = 0; j < bytes; j++) 839 | val |= ((uint64_t) ptr[i + j]) << (j * 8); 840 | cnt += popcnt64(val); 841 | i += bytes; 842 | } 843 | } 844 | 845 | for (; i + 8 <= size; i += 8) 846 | cnt += popcnt64(*(const uint64_t*)(ptr + i)); 847 | 848 | if (i < size) 849 | { 850 | uint64_t val = 0; 851 | uint64_t bytes = (uint64_t) (size - i); 852 | bytes = (bytes <= 7) ? 
bytes : 7; 853 | for (uint64_t j = 0; j < bytes; j++) 854 | val |= ((uint64_t) ptr[i + j]) << (j * 8); 855 | cnt += popcnt64(val); 856 | } 857 | 858 | return cnt; 859 | } 860 | 861 | /* all other CPUs */ 862 | #else 863 | 864 | /* 865 | * Count the number of 1 bits in the data array 866 | * @data: An array 867 | * @size: Size of data in bytes 868 | */ 869 | static inline uint64_t popcnt(const void* data, uint64_t size) 870 | { 871 | uint64_t i = 0; 872 | uint64_t cnt = 0; 873 | const uint8_t* ptr = (const uint8_t*) data; 874 | 875 | if (i + 8 <= size) 876 | { 877 | uintptr_t rem = ((uintptr_t) &ptr[i]) % 8; 878 | 879 | /* Align &ptr[i] to an 8 byte boundary */ 880 | if (rem != 0) 881 | { 882 | uint64_t val = 0; 883 | uint64_t bytes = (uint64_t) (8 - rem % 8); 884 | bytes = (bytes <= 7) ? bytes : 7; 885 | for (uint64_t j = 0; j < bytes; j++) 886 | val |= ((uint64_t) ptr[i + j]) << (j * 8); 887 | cnt += popcnt64(val); 888 | i += bytes; 889 | } 890 | } 891 | 892 | for (; i + 8 <= size; i += 8) 893 | cnt += popcnt64(*(const uint64_t*)(ptr + i)); 894 | 895 | if (i < size) 896 | { 897 | uint64_t val = 0; 898 | uint64_t bytes = (uint64_t) (size - i); 899 | bytes = (bytes <= 7) ? bytes : 7; 900 | for (uint64_t j = 0; j < bytes; j++) 901 | val |= ((uint64_t) ptr[i + j]) << (j * 8); 902 | cnt += popcnt64(val); 903 | } 904 | 905 | return cnt; 906 | } 907 | 908 | #endif 909 | 910 | #ifdef __cplusplus 911 | } /* extern "C" */ 912 | #endif 913 | 914 | #endif /* LIBPOPCNT_H */ 915 | --------------------------------------------------------------------------------