├── .github
    ├── FUNDING.yml
    └── workflows
    │   └── ci.yml
├── test
    ├── CMakeLists.txt
    ├── test2.c
    └── test1.cpp
├── CMakeLists.txt
├── .gitignore
├── LICENSE
├── ChangeLog
├── benchmark.cpp
├── README.md
└── libpopcnt.h


/.github/FUNDING.yml:
--------------------------------------------------------------------------------
1 | github: kimwalisch
2 | 


--------------------------------------------------------------------------------
/test/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | # Build every C/C++ source in this directory as its own executable and
2 | # register it with CTest under the source file's name without extension.
3 | file(GLOB test_sources "*.cpp" "*.c")
4 | foreach(test_source IN LISTS test_sources)
5 |     get_filename_component(test_name "${test_source}" NAME_WE)
6 |     add_executable("${test_name}" "${test_source}")
7 |     add_test(NAME "${test_name}" COMMAND "${test_name}")
8 | endforeach()
9 | 


--------------------------------------------------------------------------------
/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | cmake_minimum_required(VERSION 3.4...3.19)
 2 | project(libpopcnt C CXX)
 3 | 
 4 | # Default to an optimized build, but only for single-config generators and
 5 | # only when the user did not pick a build type (e.g. -DCMAKE_BUILD_TYPE=Debug).
 6 | # An unconditional set() would shadow the user's cache value.
 7 | if(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES)
 8 |     set(CMAKE_BUILD_TYPE Release CACHE STRING "Build type" FORCE)
 9 | endif()
10 | 
11 | # libpopcnt.h lives in the project root; the benchmark and the programs in
12 | # test/ include it as <libpopcnt.h>.
13 | include_directories("${PROJECT_SOURCE_DIR}")
14 | 
15 | add_executable(benchmark benchmark.cpp)
16 | enable_testing()
17 | add_subdirectory(test)
18 | 
19 | # A relative DESTINATION is resolved against the install prefix at install
20 | # time, so overriding the prefix when installing (and CPack) keeps working.
21 | include(GNUInstallDirs)
22 | install(FILES libpopcnt.h DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}")
23 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | .*
 2 | *~
 3 | !/.github
 4 | !/.gitignore
 5 | !/.travis.yml
 6 | *.exe
 7 | *.tar.gz
 8 | *.zip
 9 | *.obj
10 | *.lib
11 | *.o
12 | *.pc
13 | *.dSYM
14 | CMakeCache.txt
15 | CMakeFiles
16 | CMakeScripts
17 | Makefile
18 | cmake_install.cmake
19 | install_manifest.txt
20 | CTestTestfile.cmake
21 | /benchmark
22 | /test/test1
23 | /test/test2
24 | build
25 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | BSD 2-Clause License
 2 | 
 3 | Copyright (c) 2016 - 2020, Kim Walisch
 4 | Copyright (c) 2016 - 2019, Wojciech Muła
 5 | 
 6 | All rights reserved.
 7 | 
 8 | Redistribution and use in source and binary forms, with or without
 9 | modification, are permitted provided that the following conditions are met:
10 | 
11 | 1. Redistributions of source code must retain the above copyright notice, this
12 |    list of conditions and the following disclaimer.
13 | 2. Redistributions in binary form must reproduce the above copyright notice,
14 |    this list of conditions and the following disclaimer in the documentation
15 |    and/or other materials provided with the distribution.
16 | 
17 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
18 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
19 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
20 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
21 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
22 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
23 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
24 | ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
26 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 | 


--------------------------------------------------------------------------------
/test/test2.c:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Simple C test program for libpopcnt.h.
 3 |  * Generates an array with random data and computes the bit population
 4 |  * count using 2 different algorithms and checks that the
 5 |  * results match.
 6 |  *
 7 |  * Usage: ./test2
 8 |  *
 9 |  * Copyright (C) 2017 Kim Walisch, <kim.walisch@gmail.com>
10 |  *
11 |  * This file is distributed under the BSD License. See the LICENSE
12 |  * file in the top level directory.
13 |  */
14 | 
15 | #include <libpopcnt.h>
16 | 
17 | #include <stdlib.h>
18 | #include <stdio.h>
19 | #include <time.h>
20 | #include <stdint.h>
21 | #include <string.h>
22 | 
23 | /*
24 |  * Count 1 bits from &data[i] till &data[size].
25 |  * @data: An array for testing
26 |  * @size: size of data
27 |  * @i: Array start index
28 |  */
29 | void test(uint8_t* data, size_t size, size_t i)
30 | {
31 |   /* popcnt() result under test vs. byte-wise reference count */
32 |   uint64_t actual = popcnt(data + i, size - i);
33 |   uint64_t expected = 0;
34 | 
35 |   for (; i < size; ++i)
36 |     expected += popcnt64_bitwise(data[i]);
37 | 
38 |   if (actual == expected)
39 |     return;
40 | 
41 |   /* mismatch: report, release the buffer and abort the test run */
42 |   printf("\nlibpopcnt test failed!\n");
43 |   free(data);
44 |   exit(1);
45 | }
46 | 
47 | int main(void)
48 | {
49 |   const size_t size = 70000;
50 |   size_t offset;
51 | 
52 |   uint8_t* data = (uint8_t*) malloc(size);
53 | 
54 |   if (data == NULL)
55 |   {
56 |     printf("Failed to allocate memory!\n");
57 |     exit(1);
58 |   }
59 | 
60 |   /* first pass: every bit set */
61 |   memset(data, 0xff, size);
62 |   test(data, size, 0);
63 | 
64 |   /* second pass: pseudo-random bytes, checked from every start offset */
65 |   srand((unsigned) time(0));
66 | 
67 |   for (offset = 0; offset < size; offset++)
68 |     data[offset] = (uint8_t) rand();
69 | 
70 |   for (offset = 0; offset < size; offset++)
71 |     test(data, size, offset);
72 | 
73 |   free(data);
74 | 
75 |   printf("\rStatus: 100%%\n");
76 |   printf("libpopcnt tested successfully!\n");
77 | 
78 |   return 0;
79 | }
80 | 


--------------------------------------------------------------------------------
/test/test1.cpp:
--------------------------------------------------------------------------------
 1 | ///
 2 | /// @file  test1.cpp
 3 | /// @brief Simple C++ test program for libpopcnt.h.
 4 | ///        Generates an array with random data and computes the bit
 5 | ///        population count using 2 different algorithms and checks
 6 | ///        that the results match.
 7 | ///
 8 | /// Usage: ./test1 [array bytes]
 9 | ///
10 | /// Copyright (C) 2017 Kim Walisch, <kim.walisch@gmail.com>
11 | ///
12 | /// This file is distributed under the BSD License. See the LICENSE
13 | /// file in the top level directory.
14 | ///
15 | 
16 | #include <libpopcnt.h>
17 | 
18 | #include <iostream>
19 | #include <vector>
20 | #include <ctime>
21 | #include <cstdlib>
22 | #include <stdint.h>
23 | 
24 | using namespace std;
25 | 
26 | /// Cross-check popcnt() against the bitwise reference count
27 | /// for the bytes data[i] .. data[size - 1].
28 | /// @data: An array for testing
29 | /// @i: Array start index
30 | void test(vector<uint8_t>& data, size_t i)
31 | {
32 |   const size_t size = data.size();
33 |   const uint64_t actual = popcnt(&data[i], size - i);
34 |   uint64_t expected = 0;
35 | 
36 |   for (size_t pos = i; pos < size; pos++)
37 |     expected += popcnt64_bitwise(data[pos]);
38 | 
39 |   if (actual == expected)
40 |     return;
41 | 
42 |   // mismatch: report on stderr and exit with a failure code
43 |   cerr << endl;
44 |   cerr << "libpopcnt test failed!" << endl;
45 |   exit(1);
46 | }
47 | 
48 | int main(int argc, char* argv[])
49 | {
50 |   size_t size = 100000;
51 | 
52 |   if (argc > 1)
53 |   {
54 |     // atoi() returns a negative int for input like "-1"; converting
55 |     // that to size_t would request a gigantic array.
56 |     int bytes = atoi(argv[1]);
57 |     if (bytes < 0)
58 |     {
59 |       cerr << "Invalid array size: " << argv[1] << endl;
60 |       return 1;
61 |     }
62 |     size = (size_t) bytes;
63 |   }
64 | 
65 |   // init array with only 1 bits
66 |   vector<uint8_t> data(size, 0xff);
67 | 
68 |   if (!data.empty())
69 |     test(data, 0);
70 | 
71 |   srand((unsigned) time(0));
72 | 
73 |   // generate array with random data
74 |   for (size_t i = 0; i < size; i++)
75 |     data[i] = (uint8_t) rand();
76 | 
77 |   // percentage last written to stdout, -1 = nothing printed yet
78 |   int old = -1;
79 | 
80 |   for (size_t i = 0; i < size; i++)
81 |   {
82 |     test(data, i);
83 |     int percent = (int) ((100.0 * i) / size);
84 | 
85 |     // only redraw (and flush) the status line when the value changes
86 |     if (percent > old)
87 |     {
88 |       cout << "\rStatus: " << percent << "%" << flush;
89 |       old = percent;
90 |     }
91 |   }
92 | 
93 |   cout << "\rStatus: 100%" << endl;
94 |   cout << "libpopcnt tested successfully!" << endl;
95 | 
96 |   return 0;
97 | }
98 | 


--------------------------------------------------------------------------------
/ChangeLog:
--------------------------------------------------------------------------------
  1 | 2024-06-29 Kim Walisch  <kim.walisch@gmail.com>
  2 | 
  3 |   Version 3.1
  4 | 
  5 |   * Improve AVX512 algorithm for trailing 64 bytes.
  6 |   * AVX512 algorithm does not require AVX512-BITALG extension anymore.
  7 | 
  8 | 2024-06-27 Kim Walisch  <kim.walisch@gmail.com>
  9 | 
 10 |   Version 3.0
 11 | 
 12 |   * Add ARM SVE algorithm.
 13 |   * Replace AVX512BW algorithm by faster AVX512 VPOPCNTDQ algorithm.
 14 |   * Add MSVC support for ARM NEON.
 15 |   * Improve preprocessor checks using __has_include() macro.
 16 |   * Port tests from AppVeyor to GitHub actions.
 17 |   * Get rid of unaligned uint64_t memory accesses, this fixes
 18 |     test failures when using GCC compiler sanitizers.
 19 |   * Prefix all libpopcnt macros using LIBPOPCNT_ to avoid any naming collisions.
 20 | 
 21 | 2020-08-10 Kim Walisch  <kim.walisch@gmail.com>
 22 | 
 23 |   * Use unaligned memory accesses to improve performance.
 24 |     Aligning memory causes branch mispredictions which
 25 |     can significantly deteriorate performance especially
 26 |     for small array sizes.
 27 |   * On x86/x64 runtime checks are now removed if the user
 28 |     compiles his code with e.g. -mpopcnt, -mavx2,
 29 |     -march=native, ...
 30 | 
 31 | 2020-07-19 Kim Walisch  <kim.walisch@gmail.com>
 32 | 
 33 |   * Enable AVX2, AVX512 by default for MSVC.
 34 |     Thanks to KOConchobhair for the pull request #15.
 35 | 
 36 | 2019-12-31 Kim Walisch  <kim.walisch@gmail.com>
 37 | 
 38 |   Version 2.3 released.
 39 | 
 40 |   * Up to 10% speedup on ARM NEON.
 41 |   * Fix unaligned memory access on ARM.
 42 | 
 43 | 2018-01-13 Kim Walisch  <kim.walisch@gmail.com>
 44 | 
 45 |   Version 2.2 released.
 46 | 
 47 |   * Up to 6x faster on old x86 CPUs without POPCNT.
 48 | 
 49 | 2017-11-10 Kim Walisch  <kim.walisch@gmail.com>
 50 | 
 51 |   Version 2.1 released.
 52 | 
 53 |   * Up to 20% ARM NEON speedup.
 54 |   * test/test1.cpp: Add only 1 bits test case.
 55 |   * test/test2.c: Add only 1 bits test case.
 56 | 
 57 | 2017-10-27 Kim Walisch  <kim.walisch@gmail.com>
 58 | 
 59 |   Version 2.0 released.
 60 | 
 61 |   * Add AVX512 support.
 62 |   * Support AVX2 and AVX512 for the MSVC compiler.
 63 |   * Support CPUID for MSVC and the C programming language.
 64 | 
 65 | 2017-09-09 Kim Walisch  <kim.walisch@gmail.com>
 66 | 
 67 |   Version 1.9 released.
 68 | 
 69 |   * libpopcnt.h: Fix MSVC POPCNT detection.
 70 | 
 71 | 2017-04-24 Kim Walisch  <kim.walisch@gmail.com>
 72 | 
 73 |   Version 1.8 released.
 74 | 
 75 |   * Fixed "illegal instruction" runtime crash with GCC 4.4.7.
 76 |   * benchmark.cpp: Print statistics and algorithm name.
 77 |   * Reduce CMake minimum version to 2.8.
 78 | 
 79 | 2017-04-08 Kim Walisch  <kim.walisch@gmail.com>
 80 | 
 81 |   Version 1.7 released.
 82 | 
 83 |   * Refactor ARM NEON popcount algorithm.
 84 | 
 85 | 2017-04-04 Kim Walisch  <kim.walisch@gmail.com>
 86 | 
 87 |   Version 1.6 released.
 88 | 
 89 |   * Add ARM NEON popcount algorithm.
 90 |   * Fix x86 segmentation fault for CPUs without POPCNT.
 91 | 
 92 | 2017-04-02 Kim Walisch  <kim.walisch@gmail.com>
 93 | 
 94 |   Version 1.5 released.
 95 | 
 96 |   * libpopcnt.h now supports C and C++!
 97 |   * Use CMake build system.
 98 |   * test2.c: Add C test.
 99 |   * Fix clang-cl bug on Windows.
100 | 
101 | 2017-04-01 Kim Walisch  <kim.walisch@gmail.com>
102 | 
103 |   Version 1.4 released.
104 | 
105 |   * libpopcnt.h: Fix compiler warning.
106 | 
107 | 2017-03-30 Kim Walisch  <kim.walisch@gmail.com>
108 | 
109 |   Version 1.3 released.
110 | 
111 |   libpopcnt.h is now C++ only (previously C/C++), the reason being
112 |   that the cpuid check cannot be made thread-safe using plain C,
113 |   whereas in C++ it's trivial (Meyers Singleton).
114 | 
115 |   * Add benchmark.cpp.
116 |   * Use AVX2 for arrays >= 512 bytes (previously 1024).
117 |   * Improve Makefile.
118 | 
119 | 2017-03-26 Kim Walisch  <kim.walisch@gmail.com>
120 | 
121 |   Version 1.2 released.
122 | 
123 |   * Add cpuid check for x86 CPUs.
124 |   * Compiles without -mpopcnt, -mavx2 flags.
125 |   * Successfully tested on IBM POWER8 (generates popcntd, GCC 5.4).
126 |   * Successfully tested using clang-cl (Windows).
127 |   * Add ChangeLog.
128 |   * Update README.md.
129 | 


--------------------------------------------------------------------------------
/benchmark.cpp:
--------------------------------------------------------------------------------
  1 | ///
  2 | /// @file  benchmark.cpp
  3 | /// @brief Simple benchmark program for libpopcnt.h, repeatedly
  4 | ///        counts the 1 bits inside a vector.
  5 | ///
  6 | /// Usage: ./benchmark [array bytes] [iters]
  7 | ///
  8 | /// Copyright (C) 2019 Kim Walisch, <kim.walisch@gmail.com>
  9 | ///
 10 | /// This file is distributed under the BSD License. See the LICENSE
 11 | /// file in the top level directory.
 12 | ///
 13 | 
 14 | #include <libpopcnt.h>
 15 | 
 16 | #include <iostream>
 17 | #include <iomanip>
 18 | #include <vector>
 19 | #include <cstdlib>
 20 | #include <ctime>
 21 | #include <stdint.h>
 22 | #include <string>
 23 | 
 24 | double get_seconds() // processor time used so far, in seconds
 25 | {
 26 |   return static_cast<double>(std::clock()) / static_cast<double>(CLOCKS_PER_SEC);
 27 | }
 28 | 
 29 | // Overwrite every byte of vect with pseudo-random data (seeded from the clock)
 30 | void init(std::vector<uint8_t>& vect)
 31 | {
 32 |   std::srand(static_cast<unsigned>(std::time(0)));
 33 |   typedef std::vector<uint8_t>::iterator iter_t;
 34 |   for (iter_t it = vect.begin(); it != vect.end(); ++it)
 35 |     *it = static_cast<uint8_t>(std::rand());
 36 | }
 37 | 
 38 | // Run popcnt() over the whole vector iters times and return the summed
 39 | // bit counts; the status line is redrawn whenever the percentage grows.
 40 | uint64_t benchmark(const std::vector<uint8_t>& vect, int iters)
 41 | {
 42 |   uint64_t sum = 0;
 43 |   int shown = -1;
 44 | 
 45 |   for (int run = 0; run < iters; ++run)
 46 |   {
 47 |     const int progress = static_cast<int>(100.0 * run / iters);
 48 |     if (shown < progress)
 49 |     {
 50 |       shown = progress;
 51 |       std::cout << "\rStatus: " << shown << "%" << std::flush;
 52 |     }
 53 |     sum += popcnt(&vect[0], vect.size());
 54 |   }
 55 |   return sum;
 56 | }
 57 | 
 58 | void verify(uint64_t cnt, uint64_t total, int iters)
 59 | {
 60 |   // Multiply rather than divide: iters <= 0 must not divide by zero (0 runs => total 0)
 61 |   if (total == cnt * (uint64_t) (iters > 0 ? iters : 0))
 62 |     return;
 63 |   std::cerr << "libpopcnt verification failed!" << std::endl;
 64 |   std::exit(1);
 65 | }
 66 | 
 67 | // Usage: ./benchmark [array bytes] [iters]
 68 | // Fills an array with random bytes, times iters popcnt() passes over it,
 69 | // prints the throughput and cross-checks the result against popcnt64_bitwise().
 70 | int main(int argc, char* argv[])
 71 | {
 72 |   int bytes = (1 << 10) * 16;
 73 |   int iters = 10000000;
 74 | 
 75 |   if (argc > 1)
 76 |     bytes = std::atoi(argv[1]);
 77 |   if (argc > 2)
 78 |     iters = std::atoi(argv[2]);
 79 | 
 80 |   // atoi() yields 0 for non-numeric input and may yield negative values.
 81 |   // A negative size would be converted to a huge vector length, an empty
 82 |   // vector makes &vect[0] invalid and a non-positive iters leaves nothing to measure.
 83 |   if (bytes <= 0 || iters <= 0)
 84 |   {
 85 |     std::cerr << "Usage: ./benchmark [array bytes] [iters]" << std::endl;
 86 |     std::cerr << "Both values must be positive integers." << std::endl;
 87 |     return 1;
 88 |   }
 89 | 
 90 |   uint64_t cnt = 0;
 91 |   std::vector<uint8_t> vect(bytes);
 92 |   std::string algo;
 93 |   init(vect);
 94 | 
 95 |   std::cout << "Iters: " << iters << std::endl;
 96 | 
 97 |   if (bytes < 1024)
 98 |     std::cout << "Array size: " << bytes << " bytes" << std::endl;
 99 |   else if (bytes < 1024 * 1024)
100 |     std::cout << "Array size: " << std::fixed << std::setprecision(2) << bytes / 1024.0 << " KB" << std::endl;
101 |   else
102 |     std::cout << "Array size: " << std::fixed << std::setprecision(2) << bytes / (1024.0 * 1024.0) << " MB" << std::endl;
103 | 
104 |   // Work out which algorithm name to report for this CPU / build
105 | #if defined(LIBPOPCNT_X86_OR_X64)
106 | 
107 |   #if defined(LIBPOPCNT_HAVE_CPUID)
108 |     int cpuid = get_cpuid();
109 |     if ((cpuid & LIBPOPCNT_BIT_AVX512_VPOPCNTDQ) && bytes >= 40)
110 |       algo = "AVX512";
111 |     else if ((cpuid & LIBPOPCNT_BIT_AVX2) && bytes >= 512)
112 |       algo = "AVX2";
113 |     else if (cpuid & LIBPOPCNT_BIT_POPCNT)
114 |       algo = "POPCNT";
115 |   #else
116 |     #if defined(LIBPOPCNT_HAVE_AVX512) && (defined(__AVX512__) || \
117 |                                           (defined(__AVX512F__) && \
118 |                                            defined(__AVX512BW__) && \
119 |                                            defined(__AVX512VPOPCNTDQ__)))
120 |       if (algo.empty() && bytes >= 40)
121 |         algo = "AVX512";
122 |     #endif
123 |     #if defined(LIBPOPCNT_HAVE_AVX2) && defined(__AVX2__)
124 |       if (algo.empty() && bytes >= 512)
125 |         algo = "AVX2";
126 |     #endif
127 |     #if defined(LIBPOPCNT_HAVE_POPCNT) && defined(__POPCNT__)
128 |       if (algo.empty())
129 |         algo = "POPCNT";
130 |     #endif
131 |   #endif
132 | 
133 | #elif defined(__ARM_FEATURE_SVE) && \
134 |       __has_include(<arm_sve.h>)
135 |   algo = "ARM SVE";
136 | #elif (defined(__ARM_NEON) || \
137 |        defined(__aarch64__)) && \
138 |       __has_include(<arm_neon.h>)
139 |   algo = "ARM NEON";
140 | #elif defined(__PPC64__)
141 |   algo = "POPCNTD";
142 | #endif
143 | 
144 |   if (algo.empty())
145 |     algo = "integer popcount";
146 | 
147 |   std::cout << "Algorithm: " << algo << std::endl;
148 | 
149 |   // reference count for a single pass, used by verify() below
150 |   for (size_t i = 0; i < vect.size(); i++)
151 |     cnt += popcnt64_bitwise(vect[i]);
152 | 
153 |   double seconds = get_seconds();
154 |   uint64_t total = benchmark(vect, iters);
155 |   seconds = get_seconds() - seconds;
156 | 
157 |   std::cout << "\rStatus: 100%" << std::endl;
158 |   std::cout << "Seconds: " << std::fixed << std::setprecision(2) << seconds << std::endl;
159 | 
160 |   double total_bytes = (double) bytes * (double) iters;
161 |   double GB = total_bytes / 1e9;
162 |   double GBs = GB / seconds;
163 | 
164 |   std::cout << std::fixed << std::setprecision(1) << GBs << " GB/s" << std::endl;
165 |   verify(cnt, total, iters);
166 | 
167 |   return 0;
168 | }
169 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # libpopcnt
  2 | 
  3 | [![Build status](https://github.com/kimwalisch/libpopcnt/actions/workflows/ci.yml/badge.svg)](https://github.com/kimwalisch/libpopcnt/actions/workflows/ci.yml)
  4 | [![Github Releases](https://img.shields.io/github/release/kimwalisch/libpopcnt.svg)](https://github.com/kimwalisch/libpopcnt/releases)
  5 | 
  6 | ```libpopcnt.h``` is a header-only C/C++ library for counting the
  7 | number of 1 bits (bit population count) in an array as quickly as
  8 | possible using specialized CPU instructions i.e.
  9 | [POPCNT](https://en.wikipedia.org/wiki/SSE4#POPCNT_and_LZCNT),
 10 | [AVX2](https://en.wikipedia.org/wiki/Advanced_Vector_Extensions),
 11 | [AVX512](https://en.wikipedia.org/wiki/Advanced_Vector_Extensions),
 12 | [NEON](https://en.wikipedia.org/wiki/ARM_architecture_family#Advanced_SIMD_(Neon)),
 13 | [SVE](https://en.wikipedia.org/wiki/AArch64#Scalable_Vector_Extension_(SVE)).
 14 | ```libpopcnt.h``` has been tested successfully using the GCC,
 15 | Clang and MSVC compilers.
 16 | 
 17 | ## C/C++ API
 18 | 
 19 | ```C
 20 | #include "libpopcnt.h"
 21 | 
 22 | /*
 23 |  * Count the number of 1 bits in the data array
 24 |  * @data: An array
 25 |  * @size: Size of data in bytes
 26 |  */
 27 | uint64_t popcnt(const void* data, uint64_t size);
 28 | ```
 29 | 
 30 | ## How to compile
 31 | 
 32 | ```libpopcnt.h``` does not require any special compiler flags like ```-mavx2```!
 33 | To get the best performance we only recommend compiling with
 34 | optimizations enabled e.g. ```-O3``` or ```-O2```.
 35 | 
 36 | ```bash
 37 | cc  -O3 program.c
 38 | c++ -O3 program.cpp
 39 | ```
 40 | 
 41 | ## CPU architectures
 42 | 
 43 | ```libpopcnt.h``` has hardware accelerated popcount algorithms for
 44 | the following CPU architectures:
 45 | 
 46 | <table>
 47 |   <tr>
 48 |     <td><b>x86</b></td>
 49 |     <td><code>POPCNT</code>, <code>AVX2</code>, <code>AVX512</code></td> 
 50 |   </tr>
 51 |   <tr>
 52 |     <td><b>x86-64</b></td>
 53 |     <td><code>POPCNT</code>, <code>AVX2</code>, <code>AVX512</code></td>
 54 |   </tr>
 55 |   <tr>
 56 |     <td><b>ARM</b></td>
 57 |     <td><code>NEON</code>, <code>SVE</code></td> 
 58 |   </tr>
 59 |   <tr>
 60 |     <td><b>PPC64</b></td>
 61 |     <td><code>POPCNTD</code></td>
 62 |   </tr>
 63 | </table>
 64 | 
 65 | For other CPU architectures a fast integer popcount algorithm is used.
 66 | 
 67 | ## How it works
 68 | 
 69 | On x86 CPUs, ```libpopcnt.h``` first queries your CPU's supported
 70 | instruction sets using the ```CPUID``` instruction (this is done only once).
 71 | Then ```libpopcnt.h``` chooses the fastest bit population count algorithm
 72 | supported by your CPU:
 73 | 
 74 | * If the CPU supports ```AVX512``` the ```AVX512 VPOPCNT``` algorithm is used.
 75 | * Else if the CPU supports ```AVX2``` the ```AVX2 Harley Seal``` algorithm is used.
 76 | * Else if the CPU supports ```POPCNT``` the ```POPCNT``` algorithm is used.
 77 | * For CPUs without ```POPCNT``` instruction a portable integer algorithm is used.
 78 | 
 79 | Note that ```libpopcnt.h``` works on all CPUs (x86, ARM, PPC, WebAssembly, ...).
 80 | It is portable by default and hardware acceleration is only enabled if the CPU
 81 | supports it. ```libpopcnt.h``` is also thread-safe.
 82 | 
 83 | We take performance seriously: if you compile using e.g. ```-march=native```
 84 | on an x86 CPU with AVX512 support then all runtime ```CPUID``` checks are removed!
 85 | 
 86 | ## ARM SVE (Scalable Vector Extension)
 87 | 
 88 | ARM SVE is a new vector instruction set for ARM CPUs that was first released in
 89 | 2020. ARM SVE supports a variable vector length from 128 to 2048 bits. Hence
 90 | ARM SVE algorithms can be much faster than ARM NEON algorithms which are limited
 91 | to 128 bits vector length.
 92 | 
 93 | libpopcnt's new ARM SVE popcount algorithm is up to 3x faster than its ARM NEON
 94 | popcount algorithm (on AWS Graviton3 CPUs). Unfortunately runtime dispatching to
 95 | ARM SVE is not yet well supported by the GCC and Clang compilers and libc's.
 96 | Therefore, by default only the (portable) ARM NEON popcount algorithm is enabled
 97 | when using libpopcnt on ARM CPUs.
 98 | 
 99 | To enable libpopcnt's ARM SVE popcount algorithm you need to compile your program
100 | using your compiler's ARM SVE option e.g.:
101 | 
102 | ```bash
103 | gcc -O3 -march=armv8-a+sve program.c
104 | g++ -O3 -march=armv8-a+sve program.cpp
105 | ```
106 | 
107 | ## Development
108 | 
109 | ```bash
110 | cmake .
111 | make -j
112 | make test
113 | ```
114 | 
115 | The above commands also build the ```benchmark``` program which is
116 | useful for benchmarking ```libpopcnt.h```. Below is a
117 | usage example run on an AMD EPYC 9R14 CPU from 2023:
118 | 
119 | ```bash
120 | # Usage: ./benchmark [array bytes] [iters]
121 | ./benchmark
122 | Iters: 10000000
123 | Array size: 16.00 KB
124 | Algorithm: AVX512
125 | Status: 100%
126 | Seconds: 1.23
127 | 133.5 GB/s
128 | ```
129 | 
130 | ## Acknowledgments
131 | 
132 | Some of the algorithms used in ```libpopcnt.h``` are described in the paper
133 | [Faster Population Counts using AVX2 Instructions](https://arxiv.org/abs/1611.07612)
134 | by Daniel Lemire, Nathan Kurz and Wojciech Muła (23 Nov 2016). The AVX2 Harley Seal
135 | popcount algorithm used in ```libpopcnt.h``` has been copied from Wojciech Muła's
136 | [sse-popcount](https://github.com/WojciechMula/sse-popcount) GitHub repo.
137 | 


--------------------------------------------------------------------------------
/.github/workflows/ci.yml:
--------------------------------------------------------------------------------
  1 | name: ci
  2 | 
  3 | on:
  4 |   push:
  5 |     branches-ignore:
  6 |       - gh-pages
  7 |   pull_request:
  8 |     branches-ignore:
  9 |       - gh-pages
 10 | 
 11 | jobs:
 12 |   gcc_linux:
 13 |     strategy:
 14 |       matrix:
 15 |         platform: ['ubuntu-latest']
 16 |         config: ['Debug', 'Release']
 17 |     runs-on: ${{ matrix.platform }}
 18 |     env:
 19 |       CC: gcc
 20 |       CXX: g++
 21 |     steps:
 22 |       - uses: actions/checkout@v4
 23 |       - name: Build libpopcnt
 24 |         run: |
 25 |             cmake . -DCMAKE_BUILD_TYPE=${{matrix.config}} -DCMAKE_C_FLAGS="-Wall -Wextra -pedantic -Werror" -DCMAKE_CXX_FLAGS="-Wall -Wextra -pedantic -Werror"
 26 |             cmake --build . --parallel --verbose
 27 |       - name: CTest (unit tests)
 28 |         run: ctest -j2
 29 |       - name: benchmark
 30 |         run: ./benchmark 16389 100
 31 | 
 32 |   gcc_linux_march_native:
 33 |     strategy:
 34 |       matrix:
 35 |         platform: ['ubuntu-latest']
 36 |         config: ['Debug', 'Release']
 37 |     runs-on: ${{ matrix.platform }}
 38 |     env:
 39 |       CC: gcc
 40 |       CXX: g++
 41 |     steps:
 42 |       - uses: actions/checkout@v4
 43 |       - name: Build libpopcnt
 44 |         run: |
 45 |             cmake . -DCMAKE_BUILD_TYPE=${{matrix.config}} -DCMAKE_C_FLAGS="-march=native -Wall -Wextra -pedantic -Werror" -DCMAKE_CXX_FLAGS="-march=native -Wall -Wextra -pedantic -Werror"
 46 |             cmake --build . --parallel --verbose
 47 |       - name: CTest (unit tests)
 48 |         run: ctest -j2
 49 |       - name: benchmark
 50 |         run: ./benchmark 16389 100
 51 | 
 52 |   gcc_linux_valgrind: # NOTE(review): valgrind is installed below, but no step in this job runs a binary under it -- confirm intended
 53 |     runs-on: ubuntu-latest
 54 |     env:
 55 |       CC: gcc
 56 |       CXX: g++
 57 |     steps:
 58 |       - uses: actions/checkout@v4
 59 |       - name: Install valgrind
 60 |         run: |
 61 |             sudo apt update
 62 |             sudo apt install valgrind
 63 |       - name: Build libpopcnt
 64 |         run: |
 65 |             cmake . -DCMAKE_C_FLAGS="-Wall -Wextra -pedantic -Werror -g" -DCMAKE_CXX_FLAGS="-Wall -Wextra -pedantic -Werror -g"
 66 |             cmake --build . --parallel --verbose
 67 |       - name: CTest (unit tests)
 68 |         run: ctest -j2
 69 |       - name: benchmark
 70 |         run: ./benchmark 16389 100
 71 | 
 72 |   gcc_linux_sanitizers:
 73 |     # This test fails on Ubuntu 22.04 likely due to an Ubuntu or compiler bug.
 74 |     # See discussion at: https://github.com/quantumlib/Stim/issues/717#issuecomment-2002623560
 75 |     runs-on: ubuntu-20.04
 76 |     env:
 77 |       CC: gcc
 78 |       CXX: g++
 79 |     steps:
 80 |       - uses: actions/checkout@v4
 81 |       - name: Build libpopcnt
 82 |         run: |
 83 |             cmake . -DCMAKE_BUILD_TYPE=Debug -DCMAKE_C_FLAGS="-Wall -Wextra -pedantic -Werror -O1 -g -fsanitize=address -fsanitize=undefined -fno-sanitize-recover=all -fno-omit-frame-pointer" -DCMAKE_CXX_FLAGS="-Wall -Wextra -pedantic -Werror -O1 -g -fsanitize=address -fsanitize=undefined -fno-sanitize-recover=all -fno-omit-frame-pointer"
 84 |             cmake --build . --parallel --verbose
 85 |       - name: CTest (unit tests)
 86 |         run: ctest -j2 --output-on-failure
 87 |       - name: benchmark
 88 |         run: ./benchmark 16389 100
 89 | 
 90 |   clang_linux:
 91 |     strategy:
 92 |       matrix:
 93 |         platform: ['ubuntu-latest']
 94 |         config: ['Debug', 'Release']
 95 |     runs-on: ${{ matrix.platform }}
 96 |     env:
 97 |       CC: clang
 98 |       CXX: clang++
 99 |     steps:
100 |       - uses: actions/checkout@v4
101 |       - name: Build libpopcnt
102 |         run: |
103 |             cmake . -DCMAKE_BUILD_TYPE=${{matrix.config}} -DCMAKE_C_FLAGS="-Wall -Wextra -pedantic -Werror" -DCMAKE_CXX_FLAGS="-Wall -Wextra -pedantic -Werror"
104 |             cmake --build . --parallel --verbose
105 |       - name: CTest (unit tests)
106 |         run: ctest -j2
107 |       - name: benchmark
108 |         run: ./benchmark 16389 100
109 | 
110 |   clang_linux_march_native:
111 |     strategy:
112 |       matrix:
113 |         platform: ['ubuntu-latest']
114 |         config: ['Debug', 'Release']
115 |     runs-on: ${{ matrix.platform }}
116 |     env:
117 |       CC: clang
118 |       CXX: clang++
119 |     steps:
120 |       - uses: actions/checkout@v4
121 |       - name: Build libpopcnt
122 |         run: |
123 |             cmake . -DCMAKE_BUILD_TYPE=${{matrix.config}} -DCMAKE_C_FLAGS="-march=native -Wall -Wextra -pedantic -Werror" -DCMAKE_CXX_FLAGS="-march=native -Wall -Wextra -pedantic -Werror"
124 |             cmake --build . --parallel --verbose
125 |       - name: CTest (unit tests)
126 |         run: ctest -j2
127 |       - name: benchmark
128 |         run: ./benchmark 16389 100
129 | 
130 |   clang_macos:
131 |     strategy:
132 |       matrix:
133 |         platform: ['macos-latest']
134 |         config: ['Debug', 'Release']
135 |     runs-on: ${{ matrix.platform }}
136 |     env:
137 |       CC: clang
138 |       CXX: clang++
139 |     steps:
140 |       - uses: actions/checkout@v4
141 |       - name: Build libpopcnt
142 |         run: |
143 |             cmake . -DCMAKE_BUILD_TYPE=${{matrix.config}} -DCMAKE_INSTALL_PREFIX=$(pwd) -DCMAKE_C_FLAGS="-Wall -Wextra -pedantic -Werror" -DCMAKE_CXX_FLAGS="-Wno-c++11-long-long -Wall -Wextra -pedantic -Werror"
144 |             cmake --build . --parallel --verbose
145 |             cmake --install .
146 |       - name: CTest (unit tests)
147 |         run: ctest -j2
148 |       - name: benchmark
149 |         run: ./benchmark 16389 100
150 | 
151 |   msvc_windows_vs2022:
152 |     runs-on: windows-2022
153 |     steps:
154 |       - uses: actions/checkout@v4
155 |       - name: Build libpopcnt
156 |         run: |
157 |             cmake . -G "Visual Studio 17 2022" -DCMAKE_CXX_FLAGS="/W3 /WX /MP /EHsc"
158 |             cmake --build . --config Release --target install
159 |       - name: CTest (unit tests)
160 |         run: ctest -j2 -C Release --output-on-failure
161 |       - name: benchmark
162 |         run: Release\benchmark.exe 16389 100
163 | 
164 |   # See documentation: https://www.msys2.org/docs/ci/
165 |   msvc_windows_mingw64:
166 |     strategy:
167 |       matrix:
168 |         platform: ['windows-latest']
169 |         config: ['Debug', 'Release']
170 |     runs-on: ${{ matrix.platform }}
171 |     defaults:
172 |       run:
173 |         shell: msys2 {0}
174 |     steps:
175 |       - uses: actions/checkout@v4
176 |       - uses: msys2/setup-msys2@v2
177 |         with:
178 |           update: true
179 |           install: base-devel mingw-w64-x86_64-toolchain mingw-w64-x86_64-cmake
180 |       - name: Build libpopcnt
181 |         run: |
182 |             cmake . -G "Unix Makefiles" -DCMAKE_BUILD_TYPE=${{matrix.config}} -DCMAKE_C_FLAGS="-Wall -Wextra -pedantic -Werror" -DCMAKE_CXX_FLAGS="-Wall -Wextra -pedantic -Werror"
183 |             cmake --build . --parallel --verbose
184 |       - name: CTest (unit tests)
185 |         run: ctest -j2
186 |       - name: benchmark
187 |         run: ./benchmark 16389 100
188 | 


--------------------------------------------------------------------------------
/libpopcnt.h:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * libpopcnt.h - C/C++ library for counting the number of 1 bits (bit
  3 |  * population count) in an array as quickly as possible using
  4 |  * specialized CPU instructions i.e. POPCNT, AVX2, AVX512, NEON, SVE.
  5 |  *
  6 |  * Copyright (c) 2016 - 2024, Kim Walisch
  7 |  * Copyright (c) 2016 - 2018, Wojciech Muła
  8 |  *
  9 |  * All rights reserved.
 10 |  *
 11 |  * Redistribution and use in source and binary forms, with or without
 12 |  * modification, are permitted provided that the following conditions are met:
 13 |  *
 14 |  * 1. Redistributions of source code must retain the above copyright notice, this
 15 |  *    list of conditions and the following disclaimer.
 16 |  * 2. Redistributions in binary form must reproduce the above copyright notice,
 17 |  *    this list of conditions and the following disclaimer in the documentation
 18 |  *    and/or other materials provided with the distribution.
 19 |  *
 20 |  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 21 |  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 22 |  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 23 |  * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 24 |  * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 25 |  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 26 |  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 27 |  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 28 |  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 29 |  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 30 |  */
 31 | 
 32 | #ifndef LIBPOPCNT_H
 33 | #define LIBPOPCNT_H
 34 | 
 35 | #include <stdint.h>
 36 | 
 37 | #ifndef __has_builtin
 38 |   #define __has_builtin(x) 0
 39 | #endif
 40 | 
 41 | #ifndef __has_attribute
 42 |   #define __has_attribute(x) 0
 43 | #endif
 44 | 
 45 | #ifndef __has_include
 46 |   #define __has_include(x) 0
 47 | #endif
 48 | 
 49 | #ifdef __GNUC__
 50 |   #define LIBPOPCNT_GNUC_PREREQ(x, y) \
 51 |       (__GNUC__ > x || (__GNUC__ == x && __GNUC_MINOR__ >= y))
 52 | #else
 53 |   #define LIBPOPCNT_GNUC_PREREQ(x, y) 0
 54 | #endif
 55 | 
 56 | #ifdef __clang__
 57 |   #define LIBPOPCNT_CLANG_PREREQ(x, y) \
 58 |       (__clang_major__ > x || (__clang_major__ == x && __clang_minor__ >= y))
 59 | #else
 60 |   #define LIBPOPCNT_CLANG_PREREQ(x, y) 0
 61 | #endif
 62 | 
 63 | #if (_MSC_VER < 1900) && \
 64 |     !defined(__cplusplus)
 65 |   #define inline __inline
 66 | #endif
 67 | 
/* Target architecture is 32-bit or 64-bit x86 (GCC/Clang or MSVC macros) */
#if (defined(__i386__) || \
     defined(__x86_64__) || \
     defined(_M_IX86) || \
     defined(_M_X64))
  #define LIBPOPCNT_X86_OR_X64
#endif

/* Compiler provides the __builtin_popcount*() family */
#if LIBPOPCNT_GNUC_PREREQ(4, 2) || \
    __has_builtin(__builtin_popcount)
  #define LIBPOPCNT_HAVE_BUILTIN_POPCOUNT
#endif

/* Selects the inline assembly popcnt64() variants further below */
#if LIBPOPCNT_GNUC_PREREQ(4, 2) || \
    LIBPOPCNT_CLANG_PREREQ(3, 0)
  #define LIBPOPCNT_HAVE_ASM_POPCNT
#endif

/* x86 target and the POPCNT instruction is reachable, either via
 * inline assembly or via the MSVC intrinsics */
#if defined(LIBPOPCNT_X86_OR_X64) && \
   (defined(LIBPOPCNT_HAVE_ASM_POPCNT) || \
    defined(_MSC_VER))
  #define LIBPOPCNT_HAVE_POPCNT
#endif

/* GCC compiler */
#if defined(LIBPOPCNT_X86_OR_X64) && \
    LIBPOPCNT_GNUC_PREREQ(5, 0)
  #define LIBPOPCNT_HAVE_AVX2
#endif

/* GCC compiler */
#if defined(LIBPOPCNT_X86_OR_X64) && \
    LIBPOPCNT_GNUC_PREREQ(11, 0)
  #define LIBPOPCNT_HAVE_AVX512
#endif

/* Clang (Unix-like OSes) */
/* Requires the target function attribute; Apple builds additionally
 * need __apple_build_version__ >= 8000000. */
#if defined(LIBPOPCNT_X86_OR_X64) && !defined(_MSC_VER)
  #if LIBPOPCNT_CLANG_PREREQ(3, 8) && \
      __has_attribute(target) && \
      (!defined(__apple_build_version__) || __apple_build_version__ >= 8000000)
    #define LIBPOPCNT_HAVE_AVX2
  #endif
  #if LIBPOPCNT_CLANG_PREREQ(9, 0) && \
      __has_attribute(target) && \
      (!defined(__apple_build_version__) || __apple_build_version__ >= 8000000)
    #define LIBPOPCNT_HAVE_AVX512
  #endif
#endif

/* MSVC compatible compilers (Windows) */
#if defined(LIBPOPCNT_X86_OR_X64) && \
    defined(_MSC_VER)
  /*
   * There is an LLVM/Clang bug on Windows where function targets
   * for AVX2 and AVX512 fail to compile unless the user compiles
   * using the options /arch:AVX2 and /arch:AVX512.
   * All Clang versions <= 18.0 (from 2024) are affected by this bug.
   * However, I expect this bug will be fixed in the near future:
   * https://github.com/llvm/llvm-project/issues/53520
   */
  #if defined(__clang__)
    #if defined(__AVX2__)
      #define LIBPOPCNT_HAVE_AVX2
    #endif
    /* NOTE(review): confirm which compilers actually define __AVX512__;
     * the CPUID decision below also accepts the combination
     * __AVX512F__ && __AVX512BW__ && __AVX512VPOPCNTDQ__. */
    #if defined(__AVX512__)
      #define LIBPOPCNT_HAVE_AVX2
      #define LIBPOPCNT_HAVE_AVX512
    #endif
  /* MSVC 2017 or later does not require
   * /arch:AVX2 or /arch:AVX512 */
  #elif _MSC_VER >= 1910
    #define LIBPOPCNT_HAVE_AVX2
    #define LIBPOPCNT_HAVE_AVX512
  #endif
#endif
143 | 
/*
 * Only enable CPUID runtime checks if this is really
 * needed. E.g. do not enable if user has compiled
 * using -march=native on a CPU that supports AVX512.
 *
 * The condition has three parts that must all hold:
 * 1) The target is x86.
 * 2) popcnt() is able to cache the CPUID result: it uses a C++
 *    static local, _InterlockedCompareExchange (MSVC) or
 *    __sync_val_compare_and_swap for that purpose.
 * 3) At least one algorithm the compiler can generate (AVX512,
 *    AVX2, POPCNT) is not already enabled at compile time via
 *    the corresponding predefined macro.
 */
#if defined(LIBPOPCNT_X86_OR_X64) && \
   (defined(__cplusplus) || \
    defined(_MSC_VER) || \
   (LIBPOPCNT_GNUC_PREREQ(4, 2) || \
    __has_builtin(__sync_val_compare_and_swap))) && \
   ((defined(LIBPOPCNT_HAVE_AVX512) && !(defined(__AVX512__) || \
                                        (defined(__AVX512F__) && \
                                         defined(__AVX512BW__) && \
                                         defined(__AVX512VPOPCNTDQ__)))) || \
    (defined(LIBPOPCNT_HAVE_AVX2) && !defined(__AVX2__)) || \
    (defined(LIBPOPCNT_HAVE_POPCNT) && !defined(__POPCNT__)))
  #define LIBPOPCNT_HAVE_CPUID
#endif
162 | 
163 | #ifdef __cplusplus
164 | extern "C" {
165 | #endif
166 | 
/*
 * Portable population count of a 64-bit word using only integer
 * arithmetic (no special CPU instruction required).
 * It uses 12 arithmetic operations, one of which is a multiply,
 * which is fewer than any other known implementation on machines
 * with fast multiplication.
 * http://en.wikipedia.org/wiki/Hamming_weight#Efficient_implementation
 */
static inline uint64_t popcnt64_bitwise(uint64_t x)
{
  const uint64_t mask_1bit = 0x5555555555555555ull;
  const uint64_t mask_2bit = 0x3333333333333333ull;
  const uint64_t mask_4bit = 0x0F0F0F0F0F0F0F0Full;
  const uint64_t byte_ones = 0x0101010101010101ull;

  /* Each 2-bit field now holds the number of 1 bits it contained */
  uint64_t pairs = x - ((x >> 1) & mask_1bit);

  /* Sum neighbouring 2-bit fields into 4-bit fields */
  uint64_t nibbles = (pairs & mask_2bit) + ((pairs >> 2) & mask_2bit);

  /* Sum neighbouring 4-bit fields into bytes */
  uint64_t bytes = (nibbles + (nibbles >> 4)) & mask_4bit;

  /* The multiplication adds all 8 bytes into the most significant byte */
  return (bytes * byte_ones) >> 56;
}
186 | 
/*
 * popcnt64(x): number of 1 bits in the 64-bit word x.
 * Exactly one of the implementations below is compiled,
 * depending on the compiler and the target CPU.
 */
#if defined(LIBPOPCNT_HAVE_ASM_POPCNT) && defined(__x86_64__)

/* x86-64: one POPCNT instruction issued via inline assembly */
static inline uint64_t popcnt64(uint64_t x)
{
  __asm__ ("popcnt %1, %0" : "=r" (x) : "0" (x));
  return x;
}

#elif defined(LIBPOPCNT_HAVE_ASM_POPCNT) && defined(__i386__)

/* 32-bit x86: POPCNT instruction on a 32-bit operand */
static inline uint32_t popcnt32(uint32_t x)
{
  __asm__ ("popcnt %1, %0" : "=r" (x) : "0" (x));
  return x;
}

/* 32-bit x86: count the lower and upper halves separately */
static inline uint64_t popcnt64(uint64_t x)
{
  uint32_t lower = (uint32_t) x;
  uint32_t upper = (uint32_t)(x >> 32);

  return popcnt32(lower) + popcnt32(upper);
}

#elif defined(_MSC_VER) && defined(_M_X64)

#include <intrin.h>

/* MSVC, x64: 64-bit popcount intrinsic */
static inline uint64_t popcnt64(uint64_t x)
{
  return __popcnt64(x);
}

#elif defined(_MSC_VER) && defined(_M_IX86)

#include <intrin.h>

/* MSVC, 32-bit x86: 32-bit popcount intrinsic on both halves */
static inline uint64_t popcnt64(uint64_t x)
{
  uint32_t lower = (uint32_t) x;
  uint32_t upper = (uint32_t)(x >> 32);

  return __popcnt(lower) + __popcnt(upper);
}

/* non x86 CPUs */
#elif defined(LIBPOPCNT_HAVE_BUILTIN_POPCOUNT)

/* Let the compiler pick the best instruction sequence */
static inline uint64_t popcnt64(uint64_t x)
{
  return (uint64_t) __builtin_popcountll(x);
}

/* no hardware POPCNT,
 * use pure integer algorithm */
#else

static inline uint64_t popcnt64(uint64_t x)
{
  return popcnt64_bitwise(x);
}

#endif
250 | 
251 | #if defined(LIBPOPCNT_HAVE_CPUID)
252 | 
253 | #if defined(_MSC_VER)
254 |   #include <intrin.h>
255 |   #include <immintrin.h>
256 | #endif
257 | 
/* CPUID bits documentation: */
/* https://en.wikipedia.org/wiki/CPUID */

/*
 * Besides being compared against the CPUID registers, the
 * LIBPOPCNT_BIT_AVX2, LIBPOPCNT_BIT_AVX512_VPOPCNTDQ and
 * LIBPOPCNT_BIT_POPCNT values are also used as the bits of the
 * flags word returned by get_cpuid(); they are distinct values.
 */

/* %ebx bit flags (get_cpuid() tests them after run_cpuid(7, 0)) */
#define LIBPOPCNT_BIT_AVX2     (1 << 5)
#define LIBPOPCNT_BIT_AVX512F  (1 << 16)
#define LIBPOPCNT_BIT_AVX512BW (1 << 30)

/* %ecx bit flags */
/* (AVX512_VPOPCNTDQ is tested after run_cpuid(7, 0), */
/*  POPCNT is tested after run_cpuid(1, 0)) */
#define LIBPOPCNT_BIT_AVX512_VPOPCNTDQ (1 << 14)
#define LIBPOPCNT_BIT_POPCNT           (1 << 23)

/* xgetbv bit flags */
/* (compared against the value returned by get_xcr0()) */
#define LIBPOPCNT_XSTATE_SSE (1 << 1)
#define LIBPOPCNT_XSTATE_YMM (1 << 2)
#define LIBPOPCNT_XSTATE_ZMM (7 << 5)
274 | 
/*
 * Execute the CPUID instruction.
 * @eax:  Value loaded into EAX before CPUID (the requested leaf)
 * @ecx:  Value loaded into ECX before CPUID (the requested sub-leaf)
 * @abcd: Output array of 4 ints, receives EAX, EBX, ECX, EDX (in
 *        that order) as left by CPUID.
 */
static inline void run_cpuid(int eax, int ecx, int* abcd)
{
#if defined(_MSC_VER)
  /* The intrinsic writes the 4 registers directly into abcd[] */
  __cpuidex(abcd, eax, ecx);
#else
  int ebx = 0;
  int edx = 0;

  #if defined(__i386__) && \
      defined(__PIC__)
    /* In case of PIC under 32-bit EBX cannot be clobbered */
    /* EBX is therefore saved in EDI before CPUID and swapped back */
    /* afterwards, CPUID's EBX result is then read from EDI ("=D"). */
    __asm__ __volatile__("movl %%ebx, %%edi;"
                         "cpuid;"
                         "xchgl %%ebx, %%edi;"
                         : "+a" (eax),
                           "=D" (ebx),
                           "+c" (ecx),
                           "=d" (edx));
  #else
    __asm__ __volatile__("cpuid"
                         : "+a" (eax),
                           "+b" (ebx),
                           "+c" (ecx),
                           "=d" (edx));
  #endif

  abcd[0] = eax;
  abcd[1] = ebx;
  abcd[2] = ecx;
  abcd[3] = edx;
#endif
}
307 | 
308 | #if defined(LIBPOPCNT_HAVE_AVX2) || \
309 |     defined(LIBPOPCNT_HAVE_AVX512)
310 | 
/*
 * Read extended control register 0 (XGETBV with ECX = 0).
 * get_cpuid() only calls this after it has verified the
 * OSXSAVE CPUID bit.
 */
static inline uint64_t get_xcr0(void)
{
#if !defined(_MSC_VER)
  uint32_t lo32;
  uint32_t hi32;

  /* XGETBV returns its 64-bit result split across EDX:EAX */
  __asm__ __volatile__("xgetbv" : "=a"(lo32), "=d"(hi32) : "c"(0));
  return (((uint64_t) hi32) << 32) | lo32;
#else
  return _xgetbv(0);
#endif
}
323 | 
324 | #endif
325 | 
326 | static inline int get_cpuid(void)
327 | {
328 |   int flags = 0;
329 |   int abcd[4];
330 | 
331 |   run_cpuid(1, 0, abcd);
332 | 
333 |   if ((abcd[2] & LIBPOPCNT_BIT_POPCNT) == LIBPOPCNT_BIT_POPCNT)
334 |     flags |= LIBPOPCNT_BIT_POPCNT;
335 | 
336 | #if defined(LIBPOPCNT_HAVE_AVX2) || \
337 |     defined(LIBPOPCNT_HAVE_AVX512)
338 | 
339 |   int osxsave_mask = (1 << 27);
340 | 
341 |   /* ensure OS supports extended processor state management */
342 |   if ((abcd[2] & osxsave_mask) != osxsave_mask)
343 |     return 0;
344 | 
345 |   uint64_t ymm_mask = LIBPOPCNT_XSTATE_SSE | LIBPOPCNT_XSTATE_YMM;
346 |   uint64_t zmm_mask = LIBPOPCNT_XSTATE_SSE | LIBPOPCNT_XSTATE_YMM | LIBPOPCNT_XSTATE_ZMM;
347 |   uint64_t xcr0 = get_xcr0();
348 | 
349 |   if ((xcr0 & ymm_mask) == ymm_mask)
350 |   {
351 |     run_cpuid(7, 0, abcd);
352 | 
353 |     if ((abcd[1] & LIBPOPCNT_BIT_AVX2) == LIBPOPCNT_BIT_AVX2)
354 |       flags |= LIBPOPCNT_BIT_AVX2;
355 | 
356 |     if ((xcr0 & zmm_mask) == zmm_mask)
357 |     {
358 |       /* If all AVX512 features required by our popcnt_avx512() are supported */
359 |       /* then we add LIBPOPCNT_BIT_AVX512_VPOPCNTDQ to our CPUID flags. */
360 |       if ((abcd[1] & LIBPOPCNT_BIT_AVX512F) == LIBPOPCNT_BIT_AVX512F &&
361 |           (abcd[1] & LIBPOPCNT_BIT_AVX512BW) == LIBPOPCNT_BIT_AVX512BW &&
362 |           (abcd[2] & LIBPOPCNT_BIT_AVX512_VPOPCNTDQ) == LIBPOPCNT_BIT_AVX512_VPOPCNTDQ)
363 |         flags |= LIBPOPCNT_BIT_AVX512_VPOPCNTDQ;
364 |     }
365 |   }
366 | 
367 | #endif
368 | 
369 |   return flags;
370 | }
371 | 
372 | #endif /* cpuid */
373 | 
374 | #if defined(LIBPOPCNT_HAVE_AVX2) && \
375 |     __has_include(<immintrin.h>)
376 | 
377 | #include <immintrin.h>
378 | 
/*
 * Bitwise carry-save adder on 256-bit vectors: for every bit
 * position adds the bits of a, b and c.
 * @l: receives the sum bits   (a ^ b ^ c)
 * @h: receives the carry bits ((a & b) | ((a ^ b) & c))
 */
#if __has_attribute(target)
  __attribute__ ((target ("avx2")))
#endif
static inline void CSA256(__m256i* h, __m256i* l, __m256i a, __m256i b, __m256i c)
{
  __m256i a_xor_b = _mm256_xor_si256(a, b);
  __m256i a_and_b = _mm256_and_si256(a, b);
  __m256i c_and_xor = _mm256_and_si256(a_xor_b, c);

  *h = _mm256_or_si256(a_and_b, c_and_xor);
  *l = _mm256_xor_si256(a_xor_b, c);
}
388 | 
/*
 * Population count of a 256-bit vector.
 * Both nibbles of every byte are looked up in a 16 entry table;
 * _mm256_sad_epu8() then sums the per byte counts of each group
 * of 8 bytes, so the result holds four 64-bit partial counts.
 */
#if __has_attribute(target)
  __attribute__ ((target ("avx2")))
#endif
static inline __m256i popcnt256(__m256i v)
{
  const __m256i nibble_mask = _mm256_set1_epi8(0x0f);

  /* Entry n = 4 + popcount(n), same table in both 128-bit lanes */
  const __m256i table_plus = _mm256_setr_epi8(
      4, 5, 5, 6, 5, 6, 6, 7,
      5, 6, 6, 7, 6, 7, 7, 8,
      4, 5, 5, 6, 5, 6, 6, 7,
      5, 6, 6, 7, 6, 7, 7, 8
  );

  /* Entry n = 4 - popcount(n), same table in both 128-bit lanes */
  const __m256i table_minus = _mm256_setr_epi8(
      4, 3, 3, 2, 3, 2, 2, 1,
      3, 2, 2, 1, 2, 1, 1, 0,
      4, 3, 3, 2, 3, 2, 2, 1,
      3, 2, 2, 1, 2, 1, 1, 0
  );

  __m256i lo_nibbles = _mm256_and_si256(v, nibble_mask);
  __m256i hi_nibbles = _mm256_and_si256(_mm256_srli_epi16(v, 4), nibble_mask);
  __m256i cnt_plus = _mm256_shuffle_epi8(table_plus, lo_nibbles);
  __m256i cnt_minus = _mm256_shuffle_epi8(table_minus, hi_nibbles);

  /* |(4 + lo) - (4 - hi)| = lo + hi, summed over each 8 byte group */
  return _mm256_sad_epu8(cnt_plus, cnt_minus);
}
416 | 
/*
 * AVX2 Harley-Seal popcount (4th iteration).
 * The algorithm is based on the paper "Faster Population Counts
 * using AVX2 Instructions" by Daniel Lemire, Nathan Kurz and
 * Wojciech Mula (23 Nov 2016).
 * @see https://arxiv.org/abs/1611.07612
 *
 * @ptr:  Input data, read using unaligned 256-bit loads
 * @size: Number of 256-bit (32 byte) vectors to process
 * Returns the number of 1 bits in ptr[0..size-1].
 */
#if __has_attribute(target)
  __attribute__ ((target ("avx2")))
#endif
static inline uint64_t popcnt_avx2(const __m256i* ptr, uint64_t size)
{
  __m256i cnt = _mm256_setzero_si256();
  __m256i ones = _mm256_setzero_si256();
  __m256i twos = _mm256_setzero_si256();
  __m256i fours = _mm256_setzero_si256();
  __m256i eights = _mm256_setzero_si256();
  __m256i sixteens = _mm256_setzero_si256();
  __m256i twosA, twosB, foursA, foursB, eightsA, eightsB;

  uint64_t i = 0;
  /* Largest multiple of 16 vectors that is <= size */
  uint64_t limit = size - size % 16;
  uint64_t* cnt64;

  /*
   * Each iteration compresses 16 input vectors using carry-save
   * adders into the running ones/twos/fours/eights accumulators
   * plus one 'sixteens' vector, which is the only one that is
   * population counted inside the loop.
   */
  for(; i < limit; i += 16)
  {
    CSA256(&twosA, &ones, ones, _mm256_loadu_si256(ptr + i + 0), _mm256_loadu_si256(ptr + i + 1));
    CSA256(&twosB, &ones, ones, _mm256_loadu_si256(ptr + i + 2), _mm256_loadu_si256(ptr + i + 3));
    CSA256(&foursA, &twos, twos, twosA, twosB);
    CSA256(&twosA, &ones, ones, _mm256_loadu_si256(ptr + i + 4), _mm256_loadu_si256(ptr + i + 5));
    CSA256(&twosB, &ones, ones, _mm256_loadu_si256(ptr + i + 6), _mm256_loadu_si256(ptr + i + 7));
    CSA256(&foursB, &twos, twos, twosA, twosB);
    CSA256(&eightsA, &fours, fours, foursA, foursB);
    CSA256(&twosA, &ones, ones, _mm256_loadu_si256(ptr + i + 8), _mm256_loadu_si256(ptr + i + 9));
    CSA256(&twosB, &ones, ones, _mm256_loadu_si256(ptr + i + 10), _mm256_loadu_si256(ptr + i + 11));
    CSA256(&foursA, &twos, twos, twosA, twosB);
    CSA256(&twosA, &ones, ones, _mm256_loadu_si256(ptr + i + 12), _mm256_loadu_si256(ptr + i + 13));
    CSA256(&twosB, &ones, ones, _mm256_loadu_si256(ptr + i + 14), _mm256_loadu_si256(ptr + i + 15));
    CSA256(&foursB, &twos, twos, twosA, twosB);
    CSA256(&eightsB, &fours, fours, foursA, foursB);
    CSA256(&sixteens, &eights, eights, eightsA, eightsB);

    cnt = _mm256_add_epi64(cnt, popcnt256(sixteens));
  }

  /* cnt counted 'sixteens' bits so far: scale it by 16, then add */
  /* the remaining accumulators weighted by 8, 4, 2 and 1. */
  cnt = _mm256_slli_epi64(cnt, 4);
  cnt = _mm256_add_epi64(cnt, _mm256_slli_epi64(popcnt256(eights), 3));
  cnt = _mm256_add_epi64(cnt, _mm256_slli_epi64(popcnt256(fours), 2));
  cnt = _mm256_add_epi64(cnt, _mm256_slli_epi64(popcnt256(twos), 1));
  cnt = _mm256_add_epi64(cnt, popcnt256(ones));

  /* Remaining (size % 16) vectors are counted one by one */
  for(; i < size; i++)
    cnt = _mm256_add_epi64(cnt, popcnt256(_mm256_loadu_si256(ptr + i)));

  /* Horizontal sum of the four 64-bit partial counts */
  cnt64 = (uint64_t*) &cnt;

  return cnt64[0] +
         cnt64[1] +
         cnt64[2] +
         cnt64[3];
}
478 | 
479 | #endif
480 | 
481 | #if defined(LIBPOPCNT_HAVE_AVX512) && \
482 |     __has_include(<immintrin.h>)
483 | 
484 | #include <immintrin.h>
485 | 
/*
 * Count the 1 bits of a byte array using the AVX512
 * VPOPCNTDQ instruction.
 * @ptr8: Input data, read using unaligned loads
 * @size: Size of the data in bytes
 */
#if __has_attribute(target)
  __attribute__ ((target ("avx512f,avx512bw,avx512vpopcntdq")))
#endif
static inline uint64_t popcnt_avx512(const uint8_t* ptr8, uint64_t size)
{
  const uint64_t* words = (const uint64_t*) ptr8;
  uint64_t num_words = size / sizeof(uint64_t);
  uint64_t w = 0;
  uint64_t pos;
  __m512i total = _mm512_setzero_si512();

  /* Main loop: 4 vectors (256 bytes) per iteration */
  while (w + 32 <= num_words)
  {
    __m512i c0 = _mm512_popcnt_epi64(_mm512_loadu_epi64(&words[w + 0]));
    __m512i c1 = _mm512_popcnt_epi64(_mm512_loadu_epi64(&words[w + 8]));
    __m512i c2 = _mm512_popcnt_epi64(_mm512_loadu_epi64(&words[w + 16]));
    __m512i c3 = _mm512_popcnt_epi64(_mm512_loadu_epi64(&words[w + 24]));

    total = _mm512_add_epi64(total, c0);
    total = _mm512_add_epi64(total, c1);
    total = _mm512_add_epi64(total, c2);
    total = _mm512_add_epi64(total, c3);
    w += 32;
  }

  /* Remaining full vectors (64 bytes each) */
  while (w + 8 <= num_words)
  {
    __m512i c = _mm512_popcnt_epi64(_mm512_loadu_epi64(&words[w]));
    total = _mm512_add_epi64(total, c);
    w += 8;
  }

  /* Byte offset of the first unprocessed byte */
  pos = w * sizeof(uint64_t);

  /* Process last 63 bytes */
  if (pos < size)
  {
    /* One mask bit per remaining byte, masked-off bytes load as 0 */
    uint64_t unused = pos + 64 - size;
    __mmask64 tail_mask = (__mmask64) (0xffffffffffffffffull >> unused);
    __m512i tail = _mm512_maskz_loadu_epi8(tail_mask, &ptr8[pos]);
    total = _mm512_add_epi64(total, _mm512_popcnt_epi64(tail));
  }

  return _mm512_reduce_add_epi64(total);
}
534 | 
535 | #endif
536 | 
537 | /* x86 CPUs */
538 | #if defined(LIBPOPCNT_X86_OR_X64)
539 | 
/*
 * Count the number of 1 bits in the data array
 * @data: An array
 * @size: Size of data in bytes
 *
 * x86 implementation: picks the fastest available algorithm
 * (AVX512, AVX2, POPCNT, portable integer algorithm) either at
 * compile time or at runtime using CPUID.
 *
 * Declared 'static inline' like the popcnt() implementations of
 * the other CPU architectures, this also prevents unused function
 * warnings in translation units that include this header without
 * calling popcnt().
 */
static inline uint64_t popcnt(const void* data, uint64_t size)
{
/*
 * CPUID runtime checks are only enabled if this is needed.
 * E.g. CPUID is disabled when a user compiles his
 * code using -march=native on a CPU with AVX512.
 */
#if defined(LIBPOPCNT_HAVE_CPUID)
  #if defined(__cplusplus)
    /* C++11 thread-safe singleton */
    static const int cpuid = get_cpuid();
  #else
    /* -1 means CPUID has not been queried yet */
    static int cpuid_ = -1;
    int cpuid = cpuid_;
    if (cpuid == -1)
    {
      cpuid = get_cpuid();

      /* Publish the result, only the first writer replaces -1 */
      #if defined(_MSC_VER)
        /* The intrinsic expects a 'long volatile*' */
        _InterlockedCompareExchange((volatile long*) &cpuid_, cpuid, -1);
      #else
        __sync_val_compare_and_swap(&cpuid_, -1, cpuid);
      #endif
    }
  #endif
#endif

  const uint8_t* ptr = (const uint8_t*) data;
  uint64_t cnt = 0;
  uint64_t i = 0;

#if defined(LIBPOPCNT_HAVE_AVX512)
  #if defined(__AVX512__) || \
     (defined(__AVX512F__) && \
      defined(__AVX512BW__) && \
      defined(__AVX512VPOPCNTDQ__))
    /* For tiny arrays AVX512 is not worth it */
    if (i + 40 <= size)
  #else
    if ((cpuid & LIBPOPCNT_BIT_AVX512_VPOPCNTDQ) &&
        i + 40 <= size)
  #endif
      return popcnt_avx512(ptr, size);
#endif

#if defined(LIBPOPCNT_HAVE_AVX2)
  #if defined(__AVX2__)
    /* AVX2 requires arrays >= 512 bytes */
    if (i + 512 <= size)
  #else
    if ((cpuid & LIBPOPCNT_BIT_AVX2) &&
        i + 512 <= size)
  #endif
    {
      /* Count all full 32 byte vectors, the remaining */
      /* (size % 32) bytes are processed below. */
      const __m256i* ptr256 = (const __m256i*)(ptr + i);
      cnt += popcnt_avx2(ptr256, (size - i) / 32);
      i = size - size % 32;
    }
#endif

#if defined(LIBPOPCNT_HAVE_POPCNT)
  /* 
   * The user has compiled without -mpopcnt.
   * Unfortunately the MSVC compiler does not have
   * a POPCNT macro so we cannot get rid of the
   * runtime check for MSVC.
   */
  #if !defined(__POPCNT__)
    if (cpuid & LIBPOPCNT_BIT_POPCNT)
  #endif
    {
      if (i + 8 <= size)
      {
        uintptr_t rem = ((uintptr_t) &ptr[i]) % 8;

        /* Align &ptr[i] to an 8 byte boundary */
        if (rem != 0)
        {
          uint64_t val = 0;
          uint64_t bytes = (uint64_t) (8 - rem % 8);
          bytes = (bytes <= 7) ? bytes : 7;
          for (uint64_t j = 0; j < bytes; j++)
            val |= ((uint64_t) ptr[i + j]) << (j * 8);
          cnt += popcnt64(val);
          i += bytes;
        }
      }

      /* Main loop: one 64-bit word per iteration */
      for (; i + 8 <= size; i += 8)
        cnt += popcnt64(*(const uint64_t*)(ptr + i));

      /* Remaining (at most 7) bytes */
      if (i < size)
      {
        uint64_t val = 0;
        uint64_t bytes = (uint64_t) (size - i);
        bytes = (bytes <= 7) ? bytes : 7;
        for (uint64_t j = 0; j < bytes; j++)
          val |= ((uint64_t) ptr[i + j]) << (j * 8);
        cnt += popcnt64(val);
      }

      return cnt;
    }
#endif

/*
 * This code is used for:
 * 1) Compiler does not support POPCNT.
 * 2) x86 CPU does not support POPCNT (cpuid != POPCNT).
 */
#if !defined(LIBPOPCNT_HAVE_POPCNT) || \
    !defined(__POPCNT__)

  if (i + 8 <= size)
  {
    uintptr_t rem = ((uintptr_t) &ptr[i]) % 8;

    /* Align &ptr[i] to an 8 byte boundary */
    if (rem != 0)
    {
      uint64_t val = 0;
      uint64_t bytes = (uint64_t) (8 - rem % 8);
      bytes = (bytes <= 7) ? bytes : 7;
      for (uint64_t j = 0; j < bytes; j++)
        val |= ((uint64_t) ptr[i + j]) << (j * 8);
      cnt += popcnt64_bitwise(val);
      i += bytes;
    }
  }

  /* Main loop: one 64-bit word per iteration */
  for (; i + 8 <= size; i += 8)
    cnt += popcnt64_bitwise(*(const uint64_t*)(ptr + i));

  /* Remaining (at most 7) bytes */
  if (i < size)
  {
    uint64_t val = 0;
    uint64_t bytes = (uint64_t) (size - i);
    bytes = (bytes <= 7) ? bytes : 7;
    for (uint64_t j = 0; j < bytes; j++)
      val |= ((uint64_t) ptr[i + j]) << (j * 8);
    cnt += popcnt64_bitwise(val);
  }

  return cnt;
#endif
}
691 | 
692 | /* Compile with e.g. -march=armv8-a+sve to enable ARM SVE */
693 | #elif defined(__ARM_FEATURE_SVE) && \
694 |       __has_include(<arm_sve.h>)
695 | 
696 | #include <arm_sve.h>
697 | 
/*
 * Count the number of 1 bits in the data array
 * @data: An array
 * @size: Size of data in bytes
 *
 * ARM SVE implementation: counts the bits of all full 64-bit
 * words using 64-bit lanes, the remaining (size % 8) bytes are
 * counted using byte lanes.
 */
static inline uint64_t popcnt(const void* data, uint64_t size)
{
  const uint64_t* words = (const uint64_t*) data;
  uint64_t num_words = size / sizeof(uint64_t);
  uint64_t tail_bytes = size % sizeof(uint64_t);
  uint64_t lanes = svcntd();
  uint64_t w = 0;
  uint64_t cnt;
  svbool_t all = svptrue_b64();
  svbool_t pg;
  svuint64_t vsum = svdup_u64(0);

  /* Main loop: 4 full vectors per iteration */
  while (w + lanes * 4 <= num_words)
  {
    svuint64_t c0 = svcnt_u64_x(all, svld1_u64(all, &words[w + lanes * 0]));
    svuint64_t c1 = svcnt_u64_x(all, svld1_u64(all, &words[w + lanes * 1]));
    svuint64_t c2 = svcnt_u64_x(all, svld1_u64(all, &words[w + lanes * 2]));
    svuint64_t c3 = svcnt_u64_x(all, svld1_u64(all, &words[w + lanes * 3]));

    vsum = svadd_u64_x(all, vsum, c0);
    vsum = svadd_u64_x(all, vsum, c1);
    vsum = svadd_u64_x(all, vsum, c2);
    vsum = svadd_u64_x(all, vsum, c3);
    w += lanes * 4;
  }

  /* Remaining words: the predicate enables only lanes < num_words */
  for (pg = svwhilelt_b64(w, num_words);
       svptest_any(all, pg);
       pg = svwhilelt_b64(w, num_words))
  {
    /* Inactive lanes are zeroed and hence add nothing */
    svuint64_t c = svcnt_u64_z(pg, svld1_u64(pg, &words[w]));
    vsum = svadd_u64_x(all, vsum, c);
    w += lanes;
  }

  cnt = svaddv_u64(all, vsum);

  /* Last (size % 8) bytes */
  if (tail_bytes != 0)
  {
    const uint8_t* bytes = (const uint8_t*) data;
    uint64_t pos = size - tail_bytes;
    svbool_t pg8 = svwhilelt_b8(pos, size);
    svuint8_t c8 = svcnt_u8_z(pg8, svld1_u8(pg8, &bytes[pos]));
    cnt += svaddv_u8(pg8, c8);
  }

  return cnt;
}
754 | 
755 | #elif (defined(__ARM_NEON) || \
756 |        defined(__aarch64__) || \
757 |        defined(_M_ARM64)) && \
758 |       __has_include(<arm_neon.h>)
759 | 
760 | #include <arm_neon.h>
761 | 
/*
 * Add all 16 bytes of t to sum: the bytes are first widened by
 * pairwise additions (u8 -> u16 -> u32) and then pairwise added
 * and accumulated into the two 64-bit lanes of sum.
 */
static inline uint64x2_t vpadalq(uint64x2_t sum, uint8x16_t t)
{
  uint16x8_t sums16 = vpaddlq_u8(t);
  uint32x4_t sums32 = vpaddlq_u16(sums16);

  return vpadalq_u32(sum, sums32);
}
766 | 
/*
 * Count the number of 1 bits in the data array
 * @data: An array
 * @size: Size of data in bytes
 *
 * ARM NEON implementation: 64 byte chunks are counted using NEON,
 * the remaining bytes are counted using popcnt64().
 */
static inline uint64_t popcnt(const void* data, uint64_t size)
{
  const uint8_t* ptr = (const uint8_t*) data;
  const uint64_t chunk_size = 64;
  uint64_t cnt = 0;
  uint64_t pos = 0;

  if (size >= chunk_size)
  {
    uint64_t num_chunks = size / chunk_size;
    uint64_t chunk = 0;
    uint64_t lanes[2];
    uint64x2_t sum = vcombine_u64(vcreate_u64(0), vcreate_u64(0));
    uint8x16_t zero = vcombine_u8(vcreate_u8(0), vcreate_u8(0));

    /* num_chunks >= 1 here, hence the loop body runs at least once */
    while (chunk < num_chunks)
    {
      uint8x16_t acc0 = zero;
      uint8x16_t acc1 = zero;
      uint8x16_t acc2 = zero;
      uint8x16_t acc3 = zero;

      /*
       * The byte counters (acc0..acc3) must not exceed 255.
       * Each chunk adds at most 8 to a byte counter, hence the
       * counters are flushed into sum after at most 31 chunks
       * (31 * 8 = 248).
       */
      uint64_t stop = chunk + 31;
      if (stop > num_chunks)
        stop = num_chunks;

      /* Each iteration processes 64 bytes */
      while (chunk < stop)
      {
        uint8x16x4_t input = vld4q_u8(ptr);
        ptr += chunk_size;

        acc0 = vaddq_u8(acc0, vcntq_u8(input.val[0]));
        acc1 = vaddq_u8(acc1, vcntq_u8(input.val[1]));
        acc2 = vaddq_u8(acc2, vcntq_u8(input.val[2]));
        acc3 = vaddq_u8(acc3, vcntq_u8(input.val[3]));
        chunk++;
      }

      sum = vpadalq(sum, acc0);
      sum = vpadalq(sum, acc1);
      sum = vpadalq(sum, acc2);
      sum = vpadalq(sum, acc3);
    }

    /* ptr now points behind the last chunk, only the */
    /* remaining (size % 64) bytes are left to process. */
    size %= chunk_size;

    vst1q_u64(lanes, sum);
    cnt += lanes[0];
    cnt += lanes[1];
  }

  if (pos + 8 <= size)
  {
    uintptr_t misalign = ((uintptr_t) &ptr[pos]) % 8;

    /* Align &ptr[pos] to an 8 byte boundary */
    if (misalign != 0)
    {
      uint64_t word = 0;
      uint64_t head = (uint64_t) (8 - misalign % 8);
      if (head > 7)
        head = 7;
      for (uint64_t j = 0; j < head; j++)
        word |= ((uint64_t) ptr[pos + j]) << (j * 8);
      cnt += popcnt64(word);
      pos += head;
    }
  }

  /* One 64-bit word per iteration */
  while (pos + 8 <= size)
  {
    cnt += popcnt64(*(const uint64_t*)(ptr + pos));
    pos += 8;
  }

  /* Remaining (at most 7) bytes */
  if (pos < size)
  {
    uint64_t word = 0;
    uint64_t left = (uint64_t) (size - pos);
    if (left > 7)
      left = 7;
    for (uint64_t j = 0; j < left; j++)
      word |= ((uint64_t) ptr[pos + j]) << (j * 8);
    cnt += popcnt64(word);
  }

  return cnt;
}
860 | 
861 | /* all other CPUs */
862 | #else
863 | 
/*
 * Count the number of 1 bits in the data array
 * @data: An array
 * @size: Size of data in bytes
 *
 * Generic implementation: the bytes in front of the first 8 byte
 * aligned address, all following 64-bit words and the trailing
 * bytes are each counted using popcnt64().
 */
static inline uint64_t popcnt(const void* data, uint64_t size)
{
  const uint8_t* ptr = (const uint8_t*) data;
  uint64_t total = 0;
  uint64_t pos = 0;

  if (pos + 8 <= size)
  {
    uintptr_t misalign = ((uintptr_t) &ptr[pos]) % 8;

    /* Align &ptr[pos] to an 8 byte boundary */
    if (misalign != 0)
    {
      uint64_t word = 0;
      uint64_t head = (uint64_t) (8 - misalign % 8);
      if (head > 7)
        head = 7;
      for (uint64_t j = 0; j < head; j++)
        word |= ((uint64_t) ptr[pos + j]) << (j * 8);
      total += popcnt64(word);
      pos += head;
    }
  }

  /* One 64-bit word per iteration */
  while (pos + 8 <= size)
  {
    total += popcnt64(*(const uint64_t*)(ptr + pos));
    pos += 8;
  }

  /* Remaining (at most 7) bytes */
  if (pos < size)
  {
    uint64_t word = 0;
    uint64_t left = (uint64_t) (size - pos);
    if (left > 7)
      left = 7;
    for (uint64_t j = 0; j < left; j++)
      word |= ((uint64_t) ptr[pos + j]) << (j * 8);
    total += popcnt64(word);
  }

  return total;
}
907 | 
908 | #endif
909 | 
910 | #ifdef __cplusplus
911 | } /* extern "C" */
912 | #endif
913 | 
914 | #endif /* LIBPOPCNT_H */
915 | 


--------------------------------------------------------------------------------