├── .gitignore ├── .travis.yml ├── LICENSE ├── Makefile ├── README.rst ├── config.h ├── function_registry.cpp ├── original ├── ssse3_popcount-test.sh └── ssse3_popcount.c ├── popcnt-aarch64.cpp ├── popcnt-all.cpp ├── popcnt-avx2-cpu.cpp ├── popcnt-avx2-harley-seal.cpp ├── popcnt-avx2-lookup.cpp ├── popcnt-avx512-harley-seal.cpp ├── popcnt-avx512-vpopcnt.cpp ├── popcnt-avx512bw-lookup.cpp ├── popcnt-avx512vbmi-lookup.cpp ├── popcnt-bit-parallel-scalar.cpp ├── popcnt-bit-parallel-scalar32.cpp ├── popcnt-builtin.cpp ├── popcnt-cpu.cpp ├── popcnt-harley-seal.cpp ├── popcnt-lookup.cpp ├── popcnt-neon.cpp ├── popcnt-rvv.cpp ├── popcnt-sse-bit-parallel-better.cpp ├── popcnt-sse-bit-parallel.cpp ├── popcnt-sse-cpu.cpp ├── popcnt-sse-harley-seal.cpp ├── popcnt-sse-lookup.cpp ├── results ├── .gitignore ├── README.rst ├── arm │ ├── arm-64bit-clang3.8.0.csv │ ├── arm-64bit-clang3.8.0.metadata │ ├── arm-64bit-clang3.8.0.rst │ ├── arm-64bit-gcc4.8.5.csv │ ├── arm-64bit-gcc4.8.5.metadata │ ├── arm-64bit-gcc4.8.5.rst │ ├── armv7-32bit-gcc4.9.2.csv │ ├── armv7-32bit-gcc4.9.2.metadata │ ├── armv7-32bit-gcc4.9.2.rst │ └── run.sh ├── bulldozer │ ├── bulldozer-fx-8510-gcc4.8.4-sse.csv │ ├── bulldozer-fx-8510-gcc4.8.4-sse.metadata │ ├── bulldozer-fx-8510-gcc4.8.4-sse.rst │ └── run.sh ├── cannonlake │ ├── cannonlake-i3-8121U-gcc-8.3.1.csv │ ├── cannonlake-i3-8121U-gcc-8.3.1.metadata │ ├── cannonlake-i3-8121U-gcc-8.3.1.rst │ └── run.sh ├── cascadelake │ ├── cascadelake-Xeon-Gold-6240-gcc-8.3.0.csv │ ├── cascadelake-Xeon-Gold-6240-gcc-8.3.0.metadata │ ├── cascadelake-Xeon-Gold-6240-gcc-8.3.0.rst │ └── run.sh ├── haswell │ ├── haswell-i7-4770-clang3.8.0-avx2.csv │ ├── haswell-i7-4770-clang3.8.0-avx2.metadata │ ├── haswell-i7-4770-clang3.8.0-avx2.rst │ ├── haswell-i7-4770-gcc5.3.0-avx2.csv │ ├── haswell-i7-4770-gcc5.3.0-avx2.metadata │ ├── haswell-i7-4770-gcc5.3.0-avx2.rst │ └── run.sh ├── refresh_all.sh ├── report.sh ├── sandybridge-e │ ├── run.sh │ ├── sandybridgeE-i7-3930k-g++4.8-avx.csv 
│ ├── sandybridgeE-i7-3930k-g++4.8-avx.metadata │ ├── sandybridgeE-i7-3930k-g++4.8-avx.rst │ ├── sandybridgeE-i7-3930k-g++5.3-avx.csv │ ├── sandybridgeE-i7-3930k-g++5.3-avx.metadata │ └── sandybridgeE-i7-3930k-g++5.3-avx.rst ├── skylake-x │ ├── skylake-x-w-2104-gcc8.1.0.csv │ ├── skylake-x-w-2104-gcc8.1.0.metadata │ └── skylake-x-w-2104-gcc8.1.0.rst ├── skylake │ ├── run.sh │ ├── skylake-i7-6700-clang3.8.0-avx2.csv │ ├── skylake-i7-6700-clang3.8.0-avx2.metadata │ ├── skylake-i7-6700-clang3.8.0-avx2.rst │ ├── skylake-i7-6700-gcc5.3.0-avx2.csv │ ├── skylake-i7-6700-gcc5.3.0-avx2.metadata │ └── skylake-i7-6700-gcc5.3.0-avx2.rst └── westmere │ ├── run.sh │ ├── westmere-m540-gcc4.9.2-sse.csv │ ├── westmere-m540-gcc4.9.2-sse.metadata │ └── westmere-m540-gcc4.9.2-sse.rst ├── scripts ├── .gitignore ├── avx512vbmi-lookups.py ├── data.py ├── detail-pattern.rst ├── function_registry.py ├── main-pattern.rst ├── metadata.py ├── readme_listproc.py ├── report.py └── table.py ├── speed.cpp ├── sse_operators.cpp └── verify.cpp /.gitignore: -------------------------------------------------------------------------------- 1 | speed_* 2 | verify_* 3 | *.txt 4 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | # Use new trusty images, should yield newer compilers and packages 2 | sudo: required 3 | dist: precise 4 | language: cpp 5 | 6 | matrix: 7 | include: 8 | - compiler: gcc 9 | addons: 10 | apt: 11 | sources: 12 | - ubuntu-toolchain-r-test 13 | packages: 14 | - g++-5 15 | 16 | install: 17 | - export CXX=g++-5 18 | 19 | script: 20 | - make run_verify 21 | - make run_verify_avx 22 | - make run_verify_avx2 23 | 24 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2008-2016, Wojciech Muła 2 | Copyright (c) 2016, Kim 
Walisch 3 | Copyright (c) 2016, Dan Luu 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are 8 | met: 9 | 10 | 1. Redistributions of source code must retain the above copyright 11 | notice, this list of conditions and the following disclaimer. 12 | 13 | 2. Redistributions in binary form must reproduce the above copyright 14 | notice, this list of conditions and the following disclaimer in the 15 | documentation and/or other materials provided with the distribution. 16 | 17 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS 18 | IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 19 | TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A 20 | PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 21 | HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 22 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED 23 | TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 24 | PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 25 | LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 26 | NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 27 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 28 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | # user can do CXX=g++ make 2 | # It's more flexible to change from command line 3 | # The make builtin rule states CXX to be g++. 
4 | .PHONY: all help clean build-all \ 5 | x86 avx avx2 arm \ 6 | speed verify \ 7 | speed_avx verify_avx \ 8 | speed_avx2 verify_avx2 \ 9 | speed_avx512bw verify_avx512bw \ 10 | speed_avx512vbmi verify_avx512vbmi \ 11 | speed_avx512vpopcnt verify_avx512vpopcnt \ 12 | speed_arm verify_arm \ 13 | speed_aarch64 verify_aarch64 \ 14 | speed_rvv verify_rvv 15 | 16 | COMPILER=$(notdir $(CXX)) 17 | FLAGS=-std=c++17 -O2 -Wall -pedantic -Wextra -Wfatal-errors 18 | FLAGS_INTEL=$(FLAGS) -mpopcnt -fabi-version=6 19 | FLAGS_ARM=$(FLAGS) -mfpu=neon -DHAVE_NEON_INSTRUCTIONS 20 | # It seems that for AArch64 no extra flags are needed (NEON is always available) 21 | FLAGS_AARCH64=$(FLAGS) -DHAVE_NEON_INSTRUCTIONS -DHAVE_AARCH64_ARCHITECTURE 22 | 23 | # note: static is needed by Spike emulator 24 | FLAGS_RVV=$(FLAGS) -march=rv64gcv -DHAVE_RVV_INSTRUCTIONS -static 25 | 26 | FLAGS_SSE=$(FLAGS_INTEL) -mssse3 -DHAVE_SSE_INSTRUCTIONS 27 | FLAGS_AVX=$(FLAGS_INTEL) -mavx -DHAVE_AVX_INSTRUCTIONS 28 | FLAGS_AVX2=$(FLAGS_INTEL) -mavx2 -DHAVE_AVX2_INSTRUCTIONS 29 | FLAGS_AVX512BW=$(FLAGS_INTEL) -mavx512bw -DHAVE_AVX512BW_INSTRUCTIONS 30 | FLAGS_AVX512VBMI=$(FLAGS_INTEL) -mavx512vbmi -DHAVE_AVX512BW_INSTRUCTIONS -DHAVE_AVX512VBMI_INSTRUCTIONS 31 | FLAGS_AVX512VPOPCNT=$(FLAGS_INTEL) -mavx512bw -mavx512vpopcntdq -DHAVE_AVX512VPOPCNT_INSTRUCTIONS 32 | 33 | DEPS=popcnt-*.cpp function_registry.cpp sse_operators.cpp config.h 34 | ALL=speed_$(COMPILER) verify_$(COMPILER) 35 | ALL_AVX=speed_avx_$(COMPILER) verify_avx_$(COMPILER) 36 | ALL_AVX2=speed_avx2_$(COMPILER) verify_avx2_$(COMPILER) 37 | ALL_AVX512BW=speed_avx512bw_$(COMPILER) verify_avx512bw_$(COMPILER) 38 | ALL_AVX512VBMI=speed_avx512vbmi_$(COMPILER) verify_avx512vbmi_$(COMPILER) 39 | ALL_AVX512VPOPCNT=speed_avx512vpopcnt_$(COMPILER) verify_avx512vpopcnt_$(COMPILER) 40 | ALL_ARM=speed_arm_$(COMPILER) verify_arm_$(COMPILER) 41 | ALL_AARCH64=speed_aarch64_$(COMPILER) verify_aarch64_$(COMPILER) 42 | ALL_RVV=speed_rvv_$(COMPILER) 
verify_rvv_$(COMPILER) 43 | ALL_TARGETS=$(ALL) $(ALL_AVX) $(ALL_AVX2) $(ALL_AVX512) $(ALL_AVX512BW) $(ALL_AVX512VPOPCNT) $(ALL_RVV) 44 | 45 | all: $(ALL) 46 | 47 | help: 48 | @echo "Intel targets:" 49 | @echo "x86 - makes programs verify & speed (the default target)" 50 | @echo "run - runs benchmark program" 51 | @echo "run_verify - runs verification program" 52 | @echo 53 | @echo "avx - makes programs verify_avx & speed_avx" 54 | @echo "run_avx - runs benchmark program" 55 | @echo "run_verify_avx - runs verification program" 56 | @echo 57 | @echo "avx2 - makes programs verify_avx2 & speed_avx2" 58 | @echo "run_avx2 - runs benchmark program" 59 | @echo "run_verify_avx2 - runs verification program" 60 | @echo 61 | @echo "avx512bw - makes programs verify_avx512bw & speed_avx512bw" 62 | @echo "run_avx512bw - runs benchmark program" 63 | @echo "run_verify_avx512bw - runs verification program" 64 | @echo 65 | @echo "avx512vbmi - makes programs verify_avx512vbmi & speed_avx512vbmi" 66 | @echo "run_avx512vbmi - runs benchmark program" 67 | @echo "run_verify_avx512vbmi - runs verification program" 68 | @echo 69 | @echo "avx512vpopcnt - makes programs verify_avx512vpopcnt & speed_avx512vpopcnt" 70 | @echo "run_avx512vpopcnt - runs benchmark program" 71 | @echo "run_verify_avx512vpopcnt - runs verification program" 72 | @echo 73 | @echo "ARM Neon target:" 74 | @echo "arm - makes programs verify_arm & speed_arm (using Neon instructions)" 75 | @echo "run_arm - runs benchmark program" 76 | @echo "run_verify_arm - runs verification program" 77 | @echo 78 | @echo "RVV target:" 79 | @echo "run_verify_rvv - runs verification program" 80 | 81 | x86: $(ALL) 82 | 83 | avx: $(ALL_AVX) 84 | 85 | avx2: $(ALL_AVX2) 86 | 87 | arm: $(ALL_ARM) 88 | 89 | aarch64: $(ALL_AARCH64) 90 | 91 | avx512bw: $(ALL_AVX512BW) 92 | 93 | avx512vbmi: $(ALL_AVX512VBMI) 94 | 95 | avx512vpopcnt: $(ALL_AVX512VPOPCNT) 96 | 97 | speed_$(COMPILER): $(DEPS) speed.cpp 98 | $(CXX) $(FLAGS_SSE) speed.cpp -o $@ 99 | 100 
| verify_$(COMPILER): $(DEPS) verify.cpp 101 | $(CXX) $(FLAGS_SSE) verify.cpp -o $@ 102 | 103 | speed_avx_$(COMPILER): $(DEPS) speed.cpp 104 | $(CXX) $(FLAGS_AVX) speed.cpp -o $@ 105 | 106 | verify_avx_$(COMPILER): $(DEPS) verify.cpp 107 | $(CXX) $(FLAGS_AVX) verify.cpp -o $@ 108 | 109 | speed_avx2_$(COMPILER): $(DEPS) speed.cpp 110 | $(CXX) $(FLAGS_AVX2) speed.cpp -o $@ 111 | 112 | verify_avx2_$(COMPILER): $(DEPS) verify.cpp 113 | $(CXX) $(FLAGS_AVX2) verify.cpp -o $@ 114 | 115 | speed_avx512bw_$(COMPILER): $(DEPS) speed.cpp 116 | $(CXX) $(FLAGS_AVX512BW) speed.cpp -o $@ 117 | 118 | verify_avx512bw_$(COMPILER): $(DEPS) verify.cpp 119 | $(CXX) $(FLAGS_AVX512BW) verify.cpp -o $@ 120 | 121 | speed_avx512vbmi_$(COMPILER): $(DEPS) speed.cpp 122 | $(CXX) $(FLAGS_AVX512VBMI) speed.cpp -o $@ 123 | 124 | verify_avx512vbmi_$(COMPILER): $(DEPS) verify.cpp 125 | $(CXX) $(FLAGS_AVX512VBMI) verify.cpp -o $@ 126 | 127 | speed_avx512vpopcnt_$(COMPILER): $(DEPS) speed.cpp 128 | $(CXX) $(FLAGS_AVX512VPOPCNT) speed.cpp -o $@ 129 | 130 | verify_avx512vpopcnt_$(COMPILER): $(DEPS) verify.cpp 131 | $(CXX) $(FLAGS_AVX512VPOPCNT) verify.cpp -o $@ 132 | 133 | speed_arm_$(COMPILER): $(DEPS) speed.cpp 134 | $(CXX) $(FLAGS_ARM) speed.cpp -o $@ 135 | 136 | verify_arm_$(COMPILER): $(DEPS) verify.cpp 137 | $(CXX) $(FLAGS_ARM) verify.cpp -o $@ 138 | 139 | speed_aarch64_$(COMPILER): $(DEPS) speed.cpp 140 | $(CXX) $(FLAGS_AARCH64) speed.cpp -o $@ 141 | 142 | verify_aarch64_$(COMPILER): $(DEPS) verify.cpp 143 | $(CXX) $(FLAGS_AARCH64) verify.cpp -o $@ 144 | 145 | speed_rvv_$(COMPILER): $(DEPS) speed.cpp 146 | $(CXX) $(FLAGS_RVV) speed.cpp -o $@ 147 | 148 | verify_rvv_$(COMPILER): $(DEPS) verify.cpp 149 | $(CXX) $(FLAGS_RVV) verify.cpp -o $@ 150 | 151 | speed: speed_$(COMPILER) 152 | speed_avx: speed_avx_$(COMPILER) 153 | speed_avx2: speed_avx2_$(COMPILER) 154 | speed_avx512bw: speed_avx512bw_$(COMPILER) 155 | speed_avx512vbmi: speed_avx512vbmi_$(COMPILER) 156 | speed_avx512vpopcnt: 
speed_avx512vpopcnt_$(COMPILER) 157 | speed_arm: speed_arm_$(COMPILER) 158 | speed_aarch64: speed_aarch64_$(COMPILER) 159 | speed_rvv: speed_rvv_$(COMPILER) 160 | 161 | verify: verify_$(COMPILER) 162 | verify_avx: verify_avx_$(COMPILER) 163 | verify_avx2: verify_avx2_$(COMPILER) 164 | verify_avx512bw: verify_avx512bw_$(COMPILER) 165 | verify_avx512vbmi: verify_avx512vbmi_$(COMPILER) 166 | verify_avx512vpopcnt: verify_avx512vpopcnt_$(COMPILER) 167 | verify_arm: verify_arm_$(COMPILER) 168 | verify_aarch64: verify_aarch64_$(COMPILER) 169 | verify_rvv: verify_rvv_$(COMPILER) 170 | 171 | build-all: $(ALL_TARGETS) 172 | 173 | SIZE=10000000 174 | ITERS=100 175 | 176 | run: speed 177 | ./speed_$(COMPILER) $(SIZE) $(ITERS) 178 | 179 | run_avx: speed_avx 180 | ./speed_avx_$(COMPILER) $(SIZE) $(ITERS) 181 | 182 | run_avx2: speed_avx2 183 | ./speed_avx2_$(COMPILER) $(SIZE) $(ITERS) 184 | 185 | run_avx512bw: speed_avx512bw 186 | ./speed_avx512bw_$(COMPILER) $(SIZE) $(ITERS) 187 | 188 | run_avx512vbmi: speed_avx512vbmi 189 | ./speed_avx512vbmi_$(COMPILER) $(SIZE) $(ITERS) 190 | 191 | SIZE=1000000 192 | ITERS=100 193 | 194 | run_arm: speed_arm 195 | ./speed_arm_$(COMPILER) $(SIZE) $(ITERS) 196 | 197 | run_aarch64: speed_aarch64 198 | ./speed_aarch64_$(COMPILER) $(SIZE) $(ITERS) 199 | 200 | run_verify: verify_$(COMPILER) 201 | ./$^ 202 | 203 | run_verify_avx: verify_avx_$(COMPILER) 204 | ./$^ 205 | 206 | run_verify_avx2: verify_avx2_$(COMPILER) 207 | ./$^ 208 | 209 | run_verify_avx512bw: verify_avx512bw_$(COMPILER) 210 | ./$^ 211 | 212 | run_verify_avx512vbmi: verify_avx512vbmi_$(COMPILER) 213 | ./$^ 214 | 215 | run_verify_avx512vpopcnt: verify_avx512vpopcnt_$(COMPILER) 216 | ./$^ 217 | 218 | run_verify_arm: verify_arm_$(COMPILER) 219 | ./$^ 220 | 221 | SPIKE_ISA=rv64gcv_Zicntr 222 | 223 | run_verify_rvv: verify_rvv_$(COMPILER) 224 | # test for VLENB=16 225 | spike --isa=$(SPIKE_ISA) `which pk` $^ 226 | # test for VLENB=128 227 | spike --isa=$(SPIKE_ISA) --varch=vlen:512,elen:64 
`which pk` $^ 228 | 229 | run_speed_rvv: speed_rvv_$(COMPILER) 230 | # test for VLENB=16 231 | spike --isa=$(SPIKE_ISA) `which pk` $^ $(SIZE) $(ITERS) 232 | # test for VLENB=128 233 | spike --isa=$(SPIKE_ISA) --varch=vlen:512,elen:64 `which pk` $^ $(SIZE) $(ITERS) 234 | 235 | clean: 236 | rm -f $(ALL_TARGETS) $(ALL_ARM) 237 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | ======================================================================== 2 | SIMD popcount 3 | ======================================================================== 4 | 5 | Sample programs for my article http://0x80.pl/articles/sse-popcount.html 6 | 7 | .. image:: https://travis-ci.org/WojciechMula/sse-popcount.svg?branch=master 8 | :target: https://travis-ci.org/WojciechMula/sse-popcount 9 | 10 | Paper 11 | ------------------------------------------------------------------------ 12 | 13 | Daniel Lemire, Nathan Kurz and I published an article 14 | `Faster Population Counts using AVX2 Instructions`__. 15 | 16 | __ https://arxiv.org/abs/1611.07612 17 | 18 | 19 | Introduction 20 | ------------------------------------------------------------------------ 21 | 22 | Subdirectory **original** contains code from 2008 --- it is 32-bit 23 | and GCC-centric. The **root directory** contains fresh C++11 code, 24 | written with intrinsics and tested on 64-bit machines. 25 | 26 | There are two programs: 27 | 28 | * ``verify`` --- it tests if all non-lookup implementations counts 29 | bits properly; 30 | * ``speed`` --- benchmarks different implementations of popcount 31 | procedure; please read help to find all options (run the program 32 | without arguments). 
33 | 34 | There are several targets: 35 | 36 | * **default** --- builtin functions, SSE and popcnt instructions; 37 | * **AVX2** --- all above plus AVX2 implementations; 38 | * **AVX512BW** --- all above plus experimental AVX512BW code; 39 | * **AVX512VBMI** --- all above plus experimental AVX512VBMI code; 40 | * **AVX512 VPOPCNT** --- all above plus experimental AVX512 VPOPCNT 41 | code (should be compilable with very recent GCC__, software emulator 42 | doesn't support this extension yet); 43 | * **arm** --- builtin and ARM Neon implementations. 44 | 45 | Type ``make help`` to find out details. To run the default target 46 | benchmark simply type ``make run``. 47 | 48 | __ https://github.com/gcc-mirror/gcc/commit/e0aa57d6b04908affdf4655a6b4a9f2d4d03483b 49 | 50 | 51 | Available implementations 52 | ------------------------------------------------------------------------ 53 | 54 | +---------------------------------------+------------------------------------------------------------------+ 55 | | procedure | description | 56 | +=======================================+==================================================================+ 57 | | lookup-8 | lookup in std::uint8_t[256] LUT | 58 | +---------------------------------------+------------------------------------------------------------------+ 59 | | lookup-64 | lookup in std::uint64_t[256] LUT | 60 | +---------------------------------------+------------------------------------------------------------------+ 61 | | bit-parallel | naive bit parallel method | 62 | +---------------------------------------+------------------------------------------------------------------+ 63 | | bit-parallel-optimized | a bit better bit parallel | 64 | +---------------------------------------+------------------------------------------------------------------+ 65 | | bit-parallel-optimized2 | better utilization of 2- and 4-bit subwords | 66 | 
+---------------------------------------+------------------------------------------------------------------+ 67 | | bit-parallel-mul | bit-parallel with fewer instructions | 68 | +---------------------------------------+------------------------------------------------------------------+ 69 | | bit-parallel32 | naive bit parallel method (32 bit) | 70 | +---------------------------------------+------------------------------------------------------------------+ 71 | | bit-parallel-optimized32 | a bit better bit parallel (32 bit) | 72 | +---------------------------------------+------------------------------------------------------------------+ 73 | | harley-seal | Harley-Seal popcount (4th iteration) | 74 | +---------------------------------------+------------------------------------------------------------------+ 75 | | sse-bit-parallel | SSE implementation of bit-parallel-optimized (unrolled) | 76 | +---------------------------------------+------------------------------------------------------------------+ 77 | | sse-bit-parallel-original | SSE implementation of bit-parallel-optimized | 78 | +---------------------------------------+------------------------------------------------------------------+ 79 | | sse-bit-parallel-better | SSE implementation of bit-parallel with fewer instructions | 80 | +---------------------------------------+------------------------------------------------------------------+ 81 | | sse-harley-seal | SSE implementation of Harley-Seal | 82 | +---------------------------------------+------------------------------------------------------------------+ 83 | | sse-lookup | SSSE3 variant using pshufb instruction (unrolled) | 84 | +---------------------------------------+------------------------------------------------------------------+ 85 | | sse-lookup-original | SSSE3 variant using pshufb instruction | 86 | +---------------------------------------+------------------------------------------------------------------+ 87 | | avx2-lookup | AVX2 
variant using pshufb instruction (unrolled) | 88 | +---------------------------------------+------------------------------------------------------------------+ 89 | | avx2-lookup-original | AVX2 variant using pshufb instruction | 90 | +---------------------------------------+------------------------------------------------------------------+ 91 | | avx2-harley-seal | AVX2 implementation of Harley-Seal | 92 | +---------------------------------------+------------------------------------------------------------------+ 93 | | cpu | CPU instruction popcnt (64-bit variant) | 94 | +---------------------------------------+------------------------------------------------------------------+ 95 | | sse-cpu | load data with SSE, then count bits using popcnt | 96 | +---------------------------------------+------------------------------------------------------------------+ 97 | | avx2-cpu | load data with AVX2, then count bits using popcnt | 98 | +---------------------------------------+------------------------------------------------------------------+ 99 | | avx512-harley-seal | AVX512 implementation of Harley-Seal | 100 | +---------------------------------------+------------------------------------------------------------------+ 101 | | avx512bw-shuf | AVX512BW implementation uses shuffle instruction | 102 | +---------------------------------------+------------------------------------------------------------------+ 103 | | avx512vbmi-shuf | AVX512VBMI implementation uses shuffle instruction | 104 | +---------------------------------------+------------------------------------------------------------------+ 105 | | avx512-vpopcnt | AVX512 VPOPCNT | 106 | +---------------------------------------+------------------------------------------------------------------+ 107 | | builtin-popcnt | builtin for popcnt | 108 | +---------------------------------------+------------------------------------------------------------------+ 109 | | builtin-popcnt32 | builtin for popcnt (32-bit 
variant) | 110 | +---------------------------------------+------------------------------------------------------------------+ 111 | | builtin-popcnt-unrolled | unrolled builtin-popcnt | 112 | +---------------------------------------+------------------------------------------------------------------+ 113 | | builtin-popcnt-unrolled32 | unrolled builtin-popcnt32 | 114 | +---------------------------------------+------------------------------------------------------------------+ 115 | | builtin-popcnt-unrolled-errata | unrolled builtin-popcnt avoiding false-dependency | 116 | +---------------------------------------+------------------------------------------------------------------+ 117 | | builtin-popcnt-unrolled-errata-manual | unrolled builtin-popcnt avoiding false-dependency (asembly code) | 118 | +---------------------------------------+------------------------------------------------------------------+ 119 | | builtin-popcnt-movdq | builtin-popcnt where data is loaded via SSE registers | 120 | +---------------------------------------+------------------------------------------------------------------+ 121 | | builtin-popcnt-movdq-unrolled | builtin-popcnt-movdq unrolled | 122 | +---------------------------------------+------------------------------------------------------------------+ 123 | | builtin-popcnt-movdq-unrolled_manual | builtin-popcnt-movdq unrolled (assembly code) | 124 | +---------------------------------------+------------------------------------------------------------------+ 125 | | neon-vcnt | ARM Neon using VCNT | 126 | +---------------------------------------+------------------------------------------------------------------+ 127 | | neon-HS | Harley-Seal using Neon VCNT | 128 | +---------------------------------------+------------------------------------------------------------------+ 129 | | aarch64-cnt | ARMv8 Neon using CNT | 130 | +---------------------------------------+------------------------------------------------------------------+ 
131 | 132 | 133 | Performance results 134 | ------------------------------------------------------------------------ 135 | 136 | The subdirectory results__ contains performance results from various 137 | computers. If you can, please contribute. 138 | 139 | __ results/README.rst 140 | 141 | 142 | Acknowledgments 143 | ------------------------------------------------------------------------ 144 | 145 | * **Kim Walisch** (@kimwalisch) wrote Harley-Seal scalar implementation. 146 | * **Simon Lindholm** (@simonlindholm) added unrolled versions of procedures. 147 | * **Dan Luu** (@danluu) agreed to include his procedures (``builtin-*``) 148 | into this project. More details in Dan's article `Hand coded assembly 149 | beats intrinsics in speed and simplicity`__ 150 | 151 | __ http://danluu.com/assembly-intrinsics/ 152 | 153 | 154 | See also 155 | ------------------------------------------------------------------------ 156 | 157 | * libpopcnt__ --- library by Kim Walisch utilizing methods from our paper. 158 | 159 | __ https://github.com/kimwalisch/libpopcnt 160 | 161 | 162 | .. vim: nowrap 163 | -------------------------------------------------------------------------------- /config.h: -------------------------------------------------------------------------------- 1 | // does your shell support ANSI color sequences? 
2 | #define HAVE_ANSI_CONSOLE 1 3 | 4 | #if defined(HAVE_AVX512VPOPCNT_INSTRUCTIONS) 5 | # if !defined(HAVE_AVX512BW_INSTRUCTIONS) 6 | # define HAVE_AVX512BW_INSTRUCTIONS 7 | # endif 8 | #endif 9 | 10 | #if defined(HAVE_AVX512BW_INSTRUCTIONS) 11 | // AVX512 implies AVX2 & AVX 12 | # if !defined(HAVE_AVX_INSTRUCTIONS) 13 | # define HAVE_AVX_INSTRUCTIONS 14 | # endif 15 | # if !defined(HAVE_AVX2_INSTRUCTIONS) 16 | # define HAVE_AVX2_INSTRUCTIONS 17 | # endif 18 | #endif 19 | 20 | #if defined(HAVE_AVX_INSTRUCTIONS) || defined(HAVE_AVX2_INSTRUCTIONS) 21 | # if !defined(HAVE_SSE_INSTRUCTIONS) 22 | # define HAVE_SSE_INSTRUCTIONS 23 | # endif 24 | #endif 25 | 26 | #if !defined(HAVE_NEON_INSTRUCTIONS) && !defined(HAVE_RVV_INSTRUCTIONS) 27 | # define HAVE_POPCNT_INSTRUCTION 1 28 | #endif 29 | 30 | #if defined(HAVE_NEON_INSTRUCTIONS) 31 | # include 32 | #elif defined(HAVE_RVV_INSTRUCTIONS) 33 | # ifdef __riscv_v_intrinsic 34 | # include 35 | # endif 36 | #else 37 | # include 38 | # include 39 | #endif 40 | 41 | #define FORCE_INLINE inline __attribute__((always_inline)) 42 | -------------------------------------------------------------------------------- /function_registry.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | 7 | using function_ptr = std::uint64_t (*)(const uint8_t* data, const size_t size); 8 | using function_64_ptr = std::uint64_t (*)(const uint64_t* data, int size); 9 | 10 | class Function { 11 | public: 12 | bool is_trusted; 13 | std::string name; 14 | std::string help; 15 | function_ptr function; 16 | function_64_ptr function_64; 17 | }; 18 | 19 | 20 | class FunctionRegistry final { 21 | 22 | private: 23 | using NameList = std::vector; 24 | using FunctionList = std::vector; 25 | 26 | NameList available; 27 | FunctionList functions; 28 | 29 | int widest_name; 30 | 31 | public: 32 | FunctionRegistry() { 33 | build(); 34 | 35 | widest_name = 0; 36 | for (const auto& 
name: available) { 37 | widest_name = std::max(widest_name, static_cast(name.size())); 38 | } 39 | } 40 | 41 | public: 42 | bool has(const std::string& name) const; 43 | const Function& get(const std::string& name) const; 44 | 45 | const NameList& get_available() const { 46 | return available; 47 | } 48 | 49 | const FunctionList& get_functions() const { 50 | return functions; 51 | } 52 | 53 | public: 54 | int get_widest_name() const { 55 | return widest_name; 56 | } 57 | 58 | private: 59 | void build(); 60 | void add(const std::string& name, const std::string& help, function_ptr fn); 61 | void add(const std::string& name, const std::string& help, function_64_ptr fn); 62 | void add_trusted(const std::string& name, const std::string& help, function_ptr fn); 63 | }; 64 | 65 | 66 | bool FunctionRegistry::has(const std::string& name) const { 67 | 68 | return std::find(available.begin(), available.end(), name) != available.end(); 69 | } 70 | 71 | 72 | const Function& FunctionRegistry::get(const std::string& name) const { 73 | 74 | auto pred = [&name](const Function& item) { 75 | return item.name == name; 76 | }; 77 | 78 | const auto& iter = std::find_if(functions.begin(), functions.end(), pred); 79 | if (iter == functions.end()) { 80 | fprintf(stderr, "function %s not found", name.c_str()); 81 | assert(false && "function not found"); 82 | } 83 | 84 | return *iter; 85 | } 86 | 87 | 88 | void FunctionRegistry::build() { 89 | // definition start 90 | 91 | add_trusted("lookup-8", 92 | "lookup in std::uint8_t[256] LUT", 93 | popcnt_lookup_8bit); 94 | 95 | add_trusted("lookup-64", 96 | "lookup in std::uint64_t[256] LUT", 97 | popcnt_lookup_64bit); 98 | 99 | add("bit-parallel", 100 | "naive bit parallel method", 101 | popcnt_parallel_64bit_naive); 102 | 103 | add("bit-parallel-optimized", 104 | "a bit better bit parallel", 105 | popcnt_parallel_64bit_optimized); 106 | 107 | add("bit-parallel-optimized2", 108 | "better utilization of 2- and 4-bit subwords", 109 | 
popcnt_parallel_64bit_optimized2); 110 | 111 | add("bit-parallel-mul", 112 | "bit-parallel with fewer instructions", 113 | popcnt_parallel_64bit_mul); 114 | 115 | add("bit-parallel32", 116 | "naive bit parallel method (32 bit)", 117 | popcnt_parallel_32bit_naive); 118 | 119 | add("bit-parallel-optimized32", 120 | "a bit better bit parallel (32 bit)", 121 | popcnt_parallel_32bit_optimized); 122 | 123 | add("harley-seal", 124 | "Harley-Seal popcount (4th iteration)", 125 | popcnt_harley_seal); 126 | 127 | #if defined(HAVE_SSE_INSTRUCTIONS) 128 | add("sse-bit-parallel", 129 | "SSE implementation of bit-parallel-optimized (unrolled)", 130 | popcnt_SSE_bit_parallel); 131 | 132 | add("sse-bit-parallel-original", 133 | "SSE implementation of bit-parallel-optimized", 134 | popcnt_SSE_bit_parallel_original); 135 | 136 | add("sse-bit-parallel-better", 137 | "SSE implementation of bit-parallel with fewer instructions", 138 | popcnt_SSE_bit_parallel_better); 139 | 140 | add("sse-harley-seal", 141 | "SSE implementation of Harley-Seal", 142 | popcnt_SSE_harley_seal); 143 | 144 | add("sse-lookup", 145 | "SSSE3 variant using pshufb instruction (unrolled)", 146 | popcnt_SSE_lookup); 147 | 148 | add("sse-lookup-original", 149 | "SSSE3 variant using pshufb instruction", 150 | popcnt_SSE_lookup_original); 151 | #endif 152 | 153 | #if defined(HAVE_AVX2_INSTRUCTIONS) 154 | add("avx2-lookup", 155 | "AVX2 variant using pshufb instruction (unrolled)", 156 | popcnt_AVX2_lookup); 157 | 158 | add("avx2-lookup-original", 159 | "AVX2 variant using pshufb instruction", 160 | popcnt_AVX2_lookup_original); 161 | 162 | add("avx2-harley-seal", 163 | "AVX2 implementation of Harley-Seal", 164 | popcnt_AVX2_harley_seal); 165 | #endif 166 | 167 | 168 | #if defined(HAVE_POPCNT_INSTRUCTION) 169 | add("cpu", 170 | "CPU instruction popcnt (64-bit variant)", 171 | popcnt_cpu_64bit); 172 | 173 | add("sse-cpu", 174 | "load data with SSE, then count bits using popcnt", 175 | popcnt_cpu_64bit); 176 | #endif 177 
| 178 | #if defined(HAVE_AVX2_INSTRUCTIONS) 179 | add("avx2-cpu", 180 | "load data with AVX2, then count bits using popcnt", 181 | popcnt_AVX2_and_cpu); 182 | #endif 183 | 184 | #if defined(HAVE_AVX512BW_INSTRUCTIONS) 185 | add("avx512-harley-seal", 186 | "AVX512 implementation of Harley-Seal", 187 | popcnt_AVX512_harley_seal); 188 | 189 | add("avx512bw-shuf", 190 | "AVX512BW implementation uses shuffle instruction", 191 | popcnt_AVX512BW_lookup_original); 192 | #endif 193 | 194 | #if defined(HAVE_AVX512VBMI_INSTRUCTIONS) 195 | add("avx512vbmi-shuf", 196 | "AVX512VBMI implementation uses shuffle instruction", 197 | popcnt_AVX512VBMI_lookup); 198 | #endif 199 | 200 | #if defined(HAVE_AVX512VPOPCNT_INSTRUCTIONS) 201 | add("avx512-vpopcnt", 202 | "AVX512 VPOPCNT", 203 | popcnt_AVX512_vpopcnt); 204 | #endif 205 | 206 | add("builtin-popcnt", 207 | "builtin for popcnt", 208 | builtin_popcnt); 209 | 210 | add("builtin-popcnt32", 211 | "builtin for popcnt (32-bit variant)", 212 | builtin_popcnt32); 213 | 214 | add("builtin-popcnt-unrolled", 215 | "unrolled builtin-popcnt", 216 | builtin_popcnt_unrolled); 217 | 218 | add("builtin-popcnt-unrolled32", 219 | "unrolled builtin-popcnt32", 220 | builtin_popcnt_unrolled32); 221 | 222 | #if defined(HAVE_POPCNT_INSTRUCTION) 223 | add("builtin-popcnt-unrolled-errata", 224 | "unrolled builtin-popcnt avoiding false-dependency", 225 | builtin_popcnt_unrolled_errata); 226 | 227 | add("builtin-popcnt-unrolled-errata-manual", 228 | "unrolled builtin-popcnt avoiding false-dependency (asembly code)", 229 | builtin_popcnt_unrolled_errata_manual); 230 | 231 | add("builtin-popcnt-movdq", 232 | "builtin-popcnt where data is loaded via SSE registers", 233 | builtin_popcnt_movdq); 234 | 235 | add("builtin-popcnt-movdq-unrolled", 236 | "builtin-popcnt-movdq unrolled", 237 | builtin_popcnt_movdq_unrolled); 238 | 239 | add("builtin-popcnt-movdq-unrolled_manual", 240 | "builtin-popcnt-movdq unrolled (assembly code)", 241 | 
builtin_popcnt_movdq_unrolled_manual); 242 | #endif 243 | 244 | #if defined(HAVE_NEON_INSTRUCTIONS) 245 | add("neon-vcnt", 246 | "ARM Neon using VCNT", 247 | popcnt_neon_vcnt); 248 | 249 | add("neon-HS", 250 | "Harley-Seal using Neon VCNT", 251 | popcnt_neon_harley_seal); 252 | #endif 253 | 254 | #if defined(HAVE_AARCH64_ARCHITECTURE) 255 | add("aarch64-cnt", 256 | "ARMv8 Neon using CNT", 257 | popcnt_aarch64_cnt); 258 | #endif 259 | 260 | #if defined(HAVE_RVV_INSTRUCTIONS) 261 | add("rvv-1", 262 | "RVV generic implementation", 263 | popcnt_rvv_lookup); 264 | #endif 265 | // definition end 266 | } 267 | 268 | 269 | void FunctionRegistry::add(const std::string& name, const std::string& help, function_ptr fn) { 270 | 271 | available.push_back(name); 272 | functions.push_back({false, name, help, fn, nullptr}); 273 | } 274 | 275 | 276 | void FunctionRegistry::add(const std::string& name, const std::string& help, function_64_ptr fn) { 277 | 278 | available.push_back(name); 279 | functions.push_back({false, name, help, nullptr, fn}); 280 | } 281 | 282 | 283 | void FunctionRegistry::add_trusted(const std::string& name, const std::string& help, function_ptr fn) { 284 | 285 | available.push_back(name); 286 | functions.push_back({true, name, help, fn, nullptr}); 287 | } 288 | 289 | 290 | -------------------------------------------------------------------------------- /original/ssse3_popcount-test.sh: -------------------------------------------------------------------------------- 1 | # usage: sh ssse3_popcount.sh 2> ssse3_popcount_speedup.data 2 | 3 | for size in 1 8; do 4 | for proc in lookup "sse2-1" "sse2-2" "ssse3-1" "ssse3-2"; do 5 | /usr/bin/time -f "$proc $size %U" ./ssse3_popcount $proc $size 20000000 6 | done 7 | done 8 | for size in 32; do 9 | for proc in lookup "sse2-1" "sse2-2" "ssse3-1" "ssse3-2" "sse2-unrl" "ssse3-unrl"; do 10 | /usr/bin/time -f "$proc $size %U" ./ssse3_popcount $proc $size 2000000 11 | done 12 | done 13 | for size in 128 512 1024 2048; do 14 | 
for proc in lookup "sse2-1" "sse2-2" "ssse3-1" "ssse3-2" "sse2-unrl" "ssse3-unrl"; do 15 | /usr/bin/time -f "$proc $size %U" ./ssse3_popcount $proc $size 200000 16 | done 17 | done 18 | -------------------------------------------------------------------------------- /original/ssse3_popcount.c: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WojciechMula/sse-popcount/138c91e21c3e6dab7875521b5d33b995e0e4c85e/original/ssse3_popcount.c -------------------------------------------------------------------------------- /popcnt-aarch64.cpp: -------------------------------------------------------------------------------- 1 | // this is direct translation of popcnt_neon_vcnt 2 | 3 | uint64_t popcnt_aarch64_cnt(const uint8_t* data, const size_t size) 4 | { 5 | const size_t chunk_size = 16 * 4 * 4; 6 | 7 | uint8_t* ptr = const_cast(data); 8 | 9 | const size_t n = size / chunk_size; 10 | const size_t k = size % chunk_size; 11 | 12 | uint32x4_t sum = vcombine_u32(vcreate_u32(0), vcreate_u32(0)); 13 | 14 | for (size_t i=0; i < n; i++, ptr += chunk_size) { 15 | 16 | uint8x16x4_t input0 = vld4q_u8(ptr + 0 * 16 * 4); 17 | uint8x16x4_t input1 = vld4q_u8(ptr + 1 * 16 * 4); 18 | uint8x16x4_t input2 = vld4q_u8(ptr + 2 * 16 * 4); 19 | uint8x16x4_t input3 = vld4q_u8(ptr + 3 * 16 * 4); 20 | 21 | uint8x16_t t0 = vcntq_u8(input0.val[0]); 22 | t0 = vaddq_u8(t0, vcntq_u8(input0.val[1])); 23 | t0 = vaddq_u8(t0, vcntq_u8(input0.val[2])); 24 | t0 = vaddq_u8(t0, vcntq_u8(input0.val[3])); 25 | 26 | t0 = vaddq_u8(t0, vcntq_u8(input1.val[0])); 27 | t0 = vaddq_u8(t0, vcntq_u8(input1.val[1])); 28 | t0 = vaddq_u8(t0, vcntq_u8(input1.val[2])); 29 | t0 = vaddq_u8(t0, vcntq_u8(input1.val[3])); 30 | 31 | t0 = vaddq_u8(t0, vcntq_u8(input2.val[0])); 32 | t0 = vaddq_u8(t0, vcntq_u8(input2.val[1])); 33 | t0 = vaddq_u8(t0, vcntq_u8(input2.val[2])); 34 | t0 = vaddq_u8(t0, vcntq_u8(input2.val[3])); 35 | 36 | t0 = vaddq_u8(t0, 
vcntq_u8(input3.val[0])); 37 | t0 = vaddq_u8(t0, vcntq_u8(input3.val[1])); 38 | t0 = vaddq_u8(t0, vcntq_u8(input3.val[2])); 39 | t0 = vaddq_u8(t0, vcntq_u8(input3.val[3])); 40 | 41 | const uint16x8_t t1 = vpaddlq_u8(t0); 42 | 43 | sum = vpadalq_u16(sum, t1); 44 | } 45 | 46 | uint32_t scalar = 0; 47 | uint32_t tmp[4]; 48 | 49 | vst1q_u32(tmp, sum); 50 | for (int i=0; i < 4; i++) { 51 | scalar += tmp[i]; 52 | } 53 | 54 | for (size_t j=0; j < k; j++) { 55 | scalar += lookup8bit[ptr[j]]; 56 | } 57 | 58 | return scalar; 59 | } 60 | 61 | -------------------------------------------------------------------------------- /popcnt-all.cpp: -------------------------------------------------------------------------------- 1 | // includes all available implementations 2 | 3 | #include "popcnt-lookup.cpp" 4 | #include "popcnt-bit-parallel-scalar.cpp" 5 | #include "popcnt-bit-parallel-scalar32.cpp" 6 | #include "popcnt-harley-seal.cpp" 7 | #include "popcnt-builtin.cpp" 8 | 9 | #if defined(HAVE_SSE_INSTRUCTIONS) 10 | # include "sse_operators.cpp" 11 | # include "popcnt-sse-bit-parallel.cpp" 12 | # include "popcnt-sse-bit-parallel-better.cpp" 13 | # include "popcnt-sse-lookup.cpp" 14 | # include "popcnt-sse-harley-seal.cpp" 15 | # include "popcnt-sse-cpu.cpp" 16 | #endif 17 | 18 | #if defined(HAVE_POPCNT_INSTRUCTION) 19 | # include "popcnt-cpu.cpp" 20 | #endif 21 | 22 | #if defined(HAVE_AVX2_INSTRUCTIONS) 23 | # include "popcnt-avx2-lookup.cpp" 24 | # include "popcnt-avx2-harley-seal.cpp" 25 | # include "popcnt-avx2-cpu.cpp" 26 | #endif 27 | 28 | #if defined(HAVE_AVX512BW_INSTRUCTIONS) 29 | # include "popcnt-avx512-harley-seal.cpp" 30 | # include "popcnt-avx512bw-lookup.cpp" 31 | #endif 32 | 33 | #if defined(HAVE_AVX512VBMI_INSTRUCTIONS) 34 | # include "popcnt-avx512vbmi-lookup.cpp" 35 | #endif 36 | 37 | #if defined(HAVE_AVX512VPOPCNT_INSTRUCTIONS) 38 | # include "popcnt-avx512-vpopcnt.cpp" 39 | #endif 40 | 41 | #if defined(HAVE_NEON_INSTRUCTIONS) 42 | # include "popcnt-neon.cpp" 43 | 
#endif 44 | 45 | #if defined(HAVE_AARCH64_ARCHITECTURE) 46 | # include "popcnt-aarch64.cpp" 47 | #endif 48 | 49 | #if defined(HAVE_RVV_INSTRUCTIONS) 50 | # include "popcnt-rvv.cpp" 51 | #endif 52 | -------------------------------------------------------------------------------- /popcnt-avx2-cpu.cpp: -------------------------------------------------------------------------------- 1 | std::uint64_t popcnt_AVX2_and_cpu(const uint8_t* data, const size_t n) { 2 | 3 | #define ITER { \ 4 | const __m256i v = _mm256_loadu_si256(reinterpret_cast(data + i)); \ 5 | result += _popcnt64(_mm256_extract_epi64(v, 0)); \ 6 | result += _popcnt64(_mm256_extract_epi64(v, 1)); \ 7 | result += _popcnt64(_mm256_extract_epi64(v, 2)); \ 8 | result += _popcnt64(_mm256_extract_epi64(v, 3)); \ 9 | i += 32; \ 10 | } 11 | 12 | size_t i = 0; 13 | uint64_t result = 0; 14 | 15 | while (i + 4*32 <= n) { 16 | ITER ITER ITER ITER 17 | } 18 | 19 | #undef ITER 20 | 21 | for (/**/; i < n; i++) { 22 | result += lookup8bit[data[i]]; 23 | } 24 | 25 | return result; 26 | } 27 | 28 | -------------------------------------------------------------------------------- /popcnt-avx2-harley-seal.cpp: -------------------------------------------------------------------------------- 1 | namespace AVX2_harley_seal { 2 | 3 | __m256i popcount(const __m256i v) 4 | { 5 | const __m256i m1 = _mm256_set1_epi8(0x55); 6 | const __m256i m2 = _mm256_set1_epi8(0x33); 7 | const __m256i m4 = _mm256_set1_epi8(0x0F); 8 | 9 | const __m256i t1 = _mm256_sub_epi8(v, (_mm256_srli_epi16(v, 1) & m1)); 10 | const __m256i t2 = _mm256_add_epi8(t1 & m2, (_mm256_srli_epi16(t1, 2) & m2)); 11 | const __m256i t3 = _mm256_add_epi8(t2, _mm256_srli_epi16(t2, 4)) & m4; 12 | return _mm256_sad_epu8(t3, _mm256_setzero_si256()); 13 | } 14 | 15 | void CSA(__m256i& h, __m256i& l, __m256i a, __m256i b, __m256i c) 16 | { 17 | const __m256i u = a ^ b; 18 | h = (a & b) | (u & c); 19 | l = u ^ c; 20 | 21 | } 22 | 23 | uint64_t popcnt(const __m256i* data, const 
uint64_t size) 24 | { 25 | __m256i total = _mm256_setzero_si256(); 26 | __m256i ones = _mm256_setzero_si256(); 27 | __m256i twos = _mm256_setzero_si256(); 28 | __m256i fours = _mm256_setzero_si256(); 29 | __m256i eights = _mm256_setzero_si256(); 30 | __m256i sixteens = _mm256_setzero_si256(); 31 | __m256i twosA, twosB, foursA, foursB, eightsA, eightsB; 32 | 33 | const uint64_t limit = size - size % 16; 34 | uint64_t i = 0; 35 | 36 | for(; i < limit; i += 16) 37 | { 38 | CSA(twosA, ones, ones, data[i+0], data[i+1]); 39 | CSA(twosB, ones, ones, data[i+2], data[i+3]); 40 | CSA(foursA, twos, twos, twosA, twosB); 41 | CSA(twosA, ones, ones, data[i+4], data[i+5]); 42 | CSA(twosB, ones, ones, data[i+6], data[i+7]); 43 | CSA(foursB, twos, twos, twosA, twosB); 44 | CSA(eightsA,fours, fours, foursA, foursB); 45 | CSA(twosA, ones, ones, data[i+8], data[i+9]); 46 | CSA(twosB, ones, ones, data[i+10], data[i+11]); 47 | CSA(foursA, twos, twos, twosA, twosB); 48 | CSA(twosA, ones, ones, data[i+12], data[i+13]); 49 | CSA(twosB, ones, ones, data[i+14], data[i+15]); 50 | CSA(foursB, twos, twos, twosA, twosB); 51 | CSA(eightsB, fours, fours, foursA, foursB); 52 | CSA(sixteens, eights, eights, eightsA, eightsB); 53 | 54 | total = _mm256_add_epi64(total, popcount(sixteens)); 55 | } 56 | 57 | total = _mm256_slli_epi64(total, 4); // * 16 58 | total = _mm256_add_epi64(total, _mm256_slli_epi64(popcount(eights), 3)); // += 8 * ... 59 | total = _mm256_add_epi64(total, _mm256_slli_epi64(popcount(fours), 2)); // += 4 * ... 60 | total = _mm256_add_epi64(total, _mm256_slli_epi64(popcount(twos), 1)); // += 2 * ... 
61 | total = _mm256_add_epi64(total, popcount(ones)); 62 | 63 | for(; i < size; i++) 64 | total = _mm256_add_epi64(total, popcount(data[i])); 65 | 66 | 67 | return static_cast(_mm256_extract_epi64(total, 0)) 68 | + static_cast(_mm256_extract_epi64(total, 1)) 69 | + static_cast(_mm256_extract_epi64(total, 2)) 70 | + static_cast(_mm256_extract_epi64(total, 3)); 71 | } 72 | 73 | } // AVX2_harley_seal 74 | 75 | uint64_t popcnt_AVX2_harley_seal(const uint8_t* data, const size_t size) 76 | { 77 | uint64_t total = AVX2_harley_seal::popcnt((const __m256i*) data, size / 32); 78 | 79 | for (size_t i = size - size % 32; i < size; i++) 80 | total += lookup8bit[data[i]]; 81 | 82 | return total; 83 | } 84 | 85 | -------------------------------------------------------------------------------- /popcnt-avx2-lookup.cpp: -------------------------------------------------------------------------------- 1 | std::uint64_t popcnt_AVX2_lookup(const uint8_t* data, const size_t n) { 2 | 3 | size_t i = 0; 4 | 5 | const __m256i lookup = _mm256_setr_epi8( 6 | /* 0 */ 0, /* 1 */ 1, /* 2 */ 1, /* 3 */ 2, 7 | /* 4 */ 1, /* 5 */ 2, /* 6 */ 2, /* 7 */ 3, 8 | /* 8 */ 1, /* 9 */ 2, /* a */ 2, /* b */ 3, 9 | /* c */ 2, /* d */ 3, /* e */ 3, /* f */ 4, 10 | 11 | /* 0 */ 0, /* 1 */ 1, /* 2 */ 1, /* 3 */ 2, 12 | /* 4 */ 1, /* 5 */ 2, /* 6 */ 2, /* 7 */ 3, 13 | /* 8 */ 1, /* 9 */ 2, /* a */ 2, /* b */ 3, 14 | /* c */ 2, /* d */ 3, /* e */ 3, /* f */ 4 15 | ); 16 | 17 | const __m256i low_mask = _mm256_set1_epi8(0x0f); 18 | 19 | __m256i acc = _mm256_setzero_si256(); 20 | 21 | #define ITER { \ 22 | const __m256i vec = _mm256_loadu_si256(reinterpret_cast(data + i)); \ 23 | const __m256i lo = _mm256_and_si256(vec, low_mask); \ 24 | const __m256i hi = _mm256_and_si256(_mm256_srli_epi16(vec, 4), low_mask); \ 25 | const __m256i popcnt1 = _mm256_shuffle_epi8(lookup, lo); \ 26 | const __m256i popcnt2 = _mm256_shuffle_epi8(lookup, hi); \ 27 | local = _mm256_add_epi8(local, popcnt1); \ 28 | local = 
_mm256_add_epi8(local, popcnt2); \ 29 | i += 32; \ 30 | } 31 | 32 | while (i + 8*32 <= n) { 33 | __m256i local = _mm256_setzero_si256(); 34 | ITER ITER ITER ITER 35 | ITER ITER ITER ITER 36 | acc = _mm256_add_epi64(acc, _mm256_sad_epu8(local, _mm256_setzero_si256())); 37 | } 38 | 39 | __m256i local = _mm256_setzero_si256(); 40 | 41 | while (i + 32 <= n) { 42 | ITER; 43 | } 44 | 45 | acc = _mm256_add_epi64(acc, _mm256_sad_epu8(local, _mm256_setzero_si256())); 46 | 47 | #undef ITER 48 | 49 | uint64_t result = 0; 50 | 51 | result += static_cast(_mm256_extract_epi64(acc, 0)); 52 | result += static_cast(_mm256_extract_epi64(acc, 1)); 53 | result += static_cast(_mm256_extract_epi64(acc, 2)); 54 | result += static_cast(_mm256_extract_epi64(acc, 3)); 55 | 56 | for (/**/; i < n; i++) { 57 | result += lookup8bit[data[i]]; 58 | } 59 | 60 | return result; 61 | } 62 | 63 | 64 | std::uint64_t popcnt_AVX2_lookup_original(const uint8_t* data, const size_t n) { 65 | 66 | size_t i = 0; 67 | 68 | const __m256i lookup = _mm256_setr_epi8( 69 | /* 0 */ 0, /* 1 */ 1, /* 2 */ 1, /* 3 */ 2, 70 | /* 4 */ 1, /* 5 */ 2, /* 6 */ 2, /* 7 */ 3, 71 | /* 8 */ 1, /* 9 */ 2, /* a */ 2, /* b */ 3, 72 | /* c */ 2, /* d */ 3, /* e */ 3, /* f */ 4, 73 | 74 | /* 0 */ 0, /* 1 */ 1, /* 2 */ 1, /* 3 */ 2, 75 | /* 4 */ 1, /* 5 */ 2, /* 6 */ 2, /* 7 */ 3, 76 | /* 8 */ 1, /* 9 */ 2, /* a */ 2, /* b */ 3, 77 | /* c */ 2, /* d */ 3, /* e */ 3, /* f */ 4 78 | ); 79 | 80 | const __m256i low_mask = _mm256_set1_epi8(0x0f); 81 | 82 | __m256i acc = _mm256_setzero_si256(); 83 | 84 | while (i + 32 < n) { 85 | 86 | __m256i local = _mm256_setzero_si256(); 87 | 88 | for (int k=0; k < 255/8 && i + 32 < n; k++, i += 32) { 89 | const __m256i vec = _mm256_loadu_si256(reinterpret_cast(data + i)); 90 | const __m256i lo = _mm256_and_si256(vec, low_mask); 91 | const __m256i hi = _mm256_and_si256(_mm256_srli_epi16(vec, 4), low_mask); 92 | 93 | const __m256i popcnt1 = _mm256_shuffle_epi8(lookup, lo); 94 | const __m256i popcnt2 = 
_mm256_shuffle_epi8(lookup, hi); 95 | 96 | local = _mm256_add_epi8(local, popcnt1); 97 | local = _mm256_add_epi8(local, popcnt2); 98 | } 99 | 100 | acc = _mm256_add_epi64(acc, _mm256_sad_epu8(local, _mm256_setzero_si256())); 101 | } 102 | 103 | 104 | uint64_t result = 0; 105 | 106 | result += static_cast(_mm256_extract_epi64(acc, 0)); 107 | result += static_cast(_mm256_extract_epi64(acc, 1)); 108 | result += static_cast(_mm256_extract_epi64(acc, 2)); 109 | result += static_cast(_mm256_extract_epi64(acc, 3)); 110 | 111 | for (/**/; i < n; i++) { 112 | result += lookup8bit[data[i]]; 113 | } 114 | 115 | return result; 116 | } 117 | -------------------------------------------------------------------------------- /popcnt-avx512-harley-seal.cpp: -------------------------------------------------------------------------------- 1 | namespace AVX512_harley_seal { 2 | 3 | __m512i popcount(const __m512i v) 4 | { 5 | const __m512i m1 = _mm512_set1_epi8(0x55); 6 | const __m512i m2 = _mm512_set1_epi8(0x33); 7 | const __m512i m4 = _mm512_set1_epi8(0x0F); 8 | 9 | const __m512i t1 = _mm512_sub_epi8(v, (_mm512_srli_epi16(v, 1) & m1)); 10 | const __m512i t2 = _mm512_add_epi8(t1 & m2, (_mm512_srli_epi16(t1, 2) & m2)); 11 | const __m512i t3 = _mm512_add_epi8(t2, _mm512_srli_epi16(t2, 4)) & m4; 12 | return _mm512_sad_epu8(t3, _mm512_setzero_si512()); 13 | } 14 | 15 | void CSA(__m512i& h, __m512i& l, __m512i a, __m512i b, __m512i c) 16 | { 17 | /* 18 | c b a | l h 19 | ------+---- 20 | 0 0 0 | 0 0 21 | 0 0 1 | 1 0 22 | 0 1 0 | 1 0 23 | 0 1 1 | 0 1 24 | 1 0 0 | 1 0 25 | 1 0 1 | 0 1 26 | 1 1 0 | 0 1 27 | 1 1 1 | 1 1 28 | 29 | l - digit 30 | h - carry 31 | */ 32 | 33 | l = _mm512_ternarylogic_epi32(c, b, a, 0x96); 34 | h = _mm512_ternarylogic_epi32(c, b, a, 0xe8); 35 | } 36 | 37 | uint64_t popcnt(const __m512i* data, const uint64_t size) 38 | { 39 | __m512i total = _mm512_setzero_si512(); 40 | __m512i ones = _mm512_setzero_si512(); 41 | __m512i twos = _mm512_setzero_si512(); 42 | __m512i 
fours = _mm512_setzero_si512(); 43 | __m512i eights = _mm512_setzero_si512(); 44 | __m512i sixteens = _mm512_setzero_si512(); 45 | __m512i twosA, twosB, foursA, foursB, eightsA, eightsB; 46 | 47 | const uint64_t limit = size - size % 16; 48 | uint64_t i = 0; 49 | 50 | for(; i < limit; i += 16) 51 | { 52 | CSA(twosA, ones, ones, data[i+0], data[i+1]); 53 | CSA(twosB, ones, ones, data[i+2], data[i+3]); 54 | CSA(foursA, twos, twos, twosA, twosB); 55 | CSA(twosA, ones, ones, data[i+4], data[i+5]); 56 | CSA(twosB, ones, ones, data[i+6], data[i+7]); 57 | CSA(foursB, twos, twos, twosA, twosB); 58 | CSA(eightsA,fours, fours, foursA, foursB); 59 | CSA(twosA, ones, ones, data[i+8], data[i+9]); 60 | CSA(twosB, ones, ones, data[i+10], data[i+11]); 61 | CSA(foursA, twos, twos, twosA, twosB); 62 | CSA(twosA, ones, ones, data[i+12], data[i+13]); 63 | CSA(twosB, ones, ones, data[i+14], data[i+15]); 64 | CSA(foursB, twos, twos, twosA, twosB); 65 | CSA(eightsB, fours, fours, foursA, foursB); 66 | CSA(sixteens, eights, eights, eightsA, eightsB); 67 | 68 | total = _mm512_add_epi64(total, popcount(sixteens)); 69 | } 70 | 71 | total = _mm512_slli_epi64(total, 4); // * 16 72 | total = _mm512_add_epi64(total, _mm512_slli_epi64(popcount(eights), 3)); // += 8 * ... 73 | total = _mm512_add_epi64(total, _mm512_slli_epi64(popcount(fours), 2)); // += 4 * ... 74 | total = _mm512_add_epi64(total, _mm512_slli_epi64(popcount(twos), 1)); // += 2 * ... 
75 | total = _mm512_add_epi64(total, popcount(ones)); 76 | 77 | for(; i < size; i++) 78 | total = _mm512_add_epi64(total, popcount(data[i])); 79 | 80 | 81 | return simd_sum_epu64(total); 82 | } 83 | 84 | } // AVX512_harley_seal 85 | 86 | uint64_t popcnt_AVX512_harley_seal(const uint8_t* data, const size_t size) 87 | { 88 | uint64_t total = AVX512_harley_seal::popcnt((const __m512i*) data, size / 64); 89 | 90 | for (size_t i = size - size % 64; i < size; i++) 91 | total += lookup8bit[data[i]]; 92 | 93 | return total; 94 | } 95 | 96 | -------------------------------------------------------------------------------- /popcnt-avx512-vpopcnt.cpp: -------------------------------------------------------------------------------- 1 | std::uint64_t popcnt_AVX512_vpopcnt(const uint8_t *data, const size_t size) 2 | { 3 | 4 | const size_t chunks = size / 64; 5 | uint8_t *ptr = const_cast(data); 6 | const uint8_t *end = ptr + size; 7 | 8 | // count using AVX512 registers 9 | __m512i accumulator = _mm512_setzero_si512(); 10 | for (size_t i = 0; i < chunks; i++, ptr += 64) 11 | { 12 | const __m512i v = _mm512_loadu_si512((const __m512i *)ptr); 13 | const __m512i p = _mm512_popcnt_epi64(v); 14 | accumulator = _mm512_add_epi64(accumulator, p); 15 | } 16 | 17 | // use masked instrucitons for the tail 18 | if (ptr < end) 19 | { 20 | __mmask8 mask = (__mmask8)_bzhi_u32(0xFFFFFFFF, end - ptr); 21 | const __m512i v = _mm512_maskz_loadu_epi64(mask, ptr); 22 | const __m512i p = _mm512_popcnt_epi64(v); 23 | accumulator = _mm512_add_epi64(accumulator, p); 24 | } 25 | 26 | return _mm512_reduce_add_epi64(accumulator); 27 | } 28 | -------------------------------------------------------------------------------- /popcnt-avx512bw-lookup.cpp: -------------------------------------------------------------------------------- 1 | namespace custom { 2 | 3 | std::uint64_t _mm256_hsum_epi64(__m256i v) { 4 | return _mm256_extract_epi64(v, 0) 5 | + _mm256_extract_epi64(v, 1) 6 | + _mm256_extract_epi64(v, 2) 7 
| + _mm256_extract_epi64(v, 3); 8 | } 9 | 10 | std::uint64_t _mm512_hsum_epi64(__m512i v) { 11 | const __m256i t0 = _mm512_extracti64x4_epi64(v, 0); 12 | const __m256i t1 = _mm512_extracti64x4_epi64(v, 1); 13 | 14 | return _mm256_hsum_epi64(t0) 15 | + _mm256_hsum_epi64(t1); 16 | } 17 | 18 | } // namespace custom 19 | 20 | std::uint64_t popcnt_AVX512BW_lookup_original(const uint8_t* data, const size_t n) { 21 | 22 | size_t i = 0; 23 | 24 | const __m512i lookup = _mm512_setr_epi64( 25 | 0x0302020102010100llu, 0x0403030203020201llu, 26 | 0x0302020102010100llu, 0x0403030203020201llu, 27 | 0x0302020102010100llu, 0x0403030203020201llu, 28 | 0x0302020102010100llu, 0x0403030203020201llu 29 | ); 30 | 31 | const __m512i low_mask = _mm512_set1_epi8(0x0f); 32 | 33 | __m512i acc = _mm512_setzero_si512(); 34 | 35 | while (i + 64 < n) { 36 | 37 | __m512i local = _mm512_setzero_si512(); 38 | 39 | for (int k=0; k < 255/8 && i + 64 < n; k++, i += 64) { 40 | const __m512i vec = _mm512_loadu_si512(reinterpret_cast(data + i)); 41 | const __m512i lo = _mm512_and_si512(vec, low_mask); 42 | const __m512i hi = _mm512_and_si512(_mm512_srli_epi32(vec, 4), low_mask); 43 | 44 | const __m512i popcnt1 = _mm512_shuffle_epi8(lookup, lo); 45 | const __m512i popcnt2 = _mm512_shuffle_epi8(lookup, hi); 46 | 47 | local = _mm512_add_epi8(local, popcnt1); 48 | local = _mm512_add_epi8(local, popcnt2); 49 | } 50 | 51 | acc = _mm512_add_epi64(acc, _mm512_sad_epu8(local, _mm512_setzero_si512())); 52 | } 53 | 54 | 55 | uint64_t result = custom::_mm512_hsum_epi64(acc); 56 | 57 | for (/**/; i < n; i++) { 58 | result += lookup8bit[data[i]]; 59 | } 60 | 61 | return result; 62 | } 63 | -------------------------------------------------------------------------------- /popcnt-avx512vbmi-lookup.cpp: -------------------------------------------------------------------------------- 1 | std::uint64_t popcnt_AVX512VBMI_lookup(const uint8_t* data, const size_t n) { 2 | 3 | size_t i = 0; 4 | 5 | const __m512i lookup_lo = 
_mm512_setr_epi64(0x0302020102010100, 0x0403030203020201, 6 | 0x0403030203020201, 0x0504040304030302, 7 | 0x0403030203020201, 0x0504040304030302, 8 | 0x0504040304030302, 0x0605050405040403); 9 | const __m512i lookup_hi = _mm512_setr_epi64(0x0403030203020201, 0x0504040304030302, 10 | 0x0504040304030302, 0x0605050405040403, 11 | 0x0504040304030302, 0x0605050405040403, 12 | 0x0605050405040403, 0x0706060506050504); 13 | 14 | const __m512i lsb_mask = _mm512_set1_epi8(0x01); 15 | 16 | __m512i acc = _mm512_setzero_si512(); 17 | 18 | while (i + 64 < n) { 19 | 20 | __m512i local = _mm512_setzero_si512(); 21 | 22 | for (int k=0; k < 255/8 && i + 64 < n; k++, i += 64) { 23 | const __m512i vec = _mm512_loadu_si512(reinterpret_cast(data + i)); 24 | // get popcount from bits 6:0 25 | const __m512i lo = _mm512_permutex2var_epi8(lookup_lo, vec, lookup_hi); 26 | 27 | // and move 7th bit onto position 0 -- i.e. (x & 0x80 ? 1 : 0) 28 | const __m512i hi = _mm512_and_si512(_mm512_srli_epi32(vec, 7), lsb_mask); 29 | 30 | local = _mm512_add_epi8(local, lo); 31 | local = _mm512_add_epi8(local, hi); 32 | } 33 | 34 | acc = _mm512_add_epi64(acc, _mm512_sad_epu8(local, _mm512_setzero_si512())); 35 | } 36 | 37 | 38 | uint64_t result = custom::_mm512_hsum_epi64(acc); 39 | 40 | for (/**/; i < n; i++) { 41 | result += lookup8bit[data[i]]; 42 | } 43 | 44 | return result; 45 | } 46 | 47 | -------------------------------------------------------------------------------- /popcnt-bit-parallel-scalar.cpp: -------------------------------------------------------------------------------- 1 | std::uint64_t popcnt_parallel_64bit_naive(const uint8_t* data, const size_t n) { 2 | 3 | uint64_t result = 0; 4 | 5 | size_t i = 0; 6 | 7 | #define ITER { \ 8 | const uint64_t t1 = *reinterpret_cast(data + i); \ 9 | const uint64_t t2 = (t1 & 0x5555555555555555llu) + ((t1 >> 1) & 0x5555555555555555llu); \ 10 | const uint64_t t3 = (t2 & 0x3333333333333333llu) + ((t2 >> 2) & 0x3333333333333333llu); \ 11 | const uint64_t 
t4 = (t3 & 0x0f0f0f0f0f0f0f0fllu) + ((t3 >> 4) & 0x0f0f0f0f0f0f0f0fllu); \ 12 | const uint64_t t5 = (t4 & 0x00ff00ff00ff00ffllu) + ((t4 >> 8) & 0x00ff00ff00ff00ffllu); \ 13 | const uint64_t t6 = (t5 & 0x0000ffff0000ffffllu) + ((t5 >> 16) & 0x0000ffff0000ffffllu); \ 14 | const uint64_t t7 = (t6 & 0x00000000ffffffffllu) + ((t6 >> 32) & 0x00000000ffffffffllu); \ 15 | result += t7; \ 16 | i += 8; \ 17 | } 18 | 19 | while (i + 4*8 <= n) { 20 | ITER ITER ITER ITER 21 | } 22 | 23 | #undef ITER 24 | 25 | for (/**/; i < n; i++) { 26 | result += lookup8bit[data[i]]; 27 | } 28 | 29 | return result; 30 | } 31 | 32 | 33 | std::uint64_t popcnt_parallel_64bit_optimized(const uint8_t* data, const size_t n) { 34 | 35 | uint64_t result = 0; 36 | 37 | size_t i = 0; 38 | 39 | while (i + 4*8 <= n) { 40 | 41 | uint64_t partial = 0; // packed_byte 42 | 43 | #define ITER { \ 44 | const uint64_t t1 = *reinterpret_cast(data + i); \ 45 | const uint64_t t2 = (t1 & 0x5555555555555555llu) + ((t1 >> 1) & 0x5555555555555555llu); \ 46 | const uint64_t t3 = (t2 & 0x3333333333333333llu) + ((t2 >> 2) & 0x3333333333333333llu); \ 47 | const uint64_t t4 = (t3 & 0x0f0f0f0f0f0f0f0fllu) + ((t3 >> 4) & 0x0f0f0f0f0f0f0f0fllu); \ 48 | partial += t4; \ 49 | i += 8; \ 50 | } 51 | 52 | ITER ITER ITER ITER 53 | 54 | #undef ITER 55 | 56 | const uint64_t t5 = (partial & 0x00ff00ff00ff00ffllu) + ((partial >> 8) & 0x00ff00ff00ff00ffllu); 57 | const uint64_t t6 = (t5 & 0x0000ffff0000ffffllu) + ((t5 >> 16) & 0x0000ffff0000ffffllu); 58 | const uint64_t t7 = (t6 & 0x00000000ffffffffllu) + ((t6 >> 32) & 0x00000000ffffffffllu); 59 | 60 | result += t7; 61 | } 62 | 63 | for (/**/; i < n; i++) { 64 | result += lookup8bit[data[i]]; 65 | } 66 | 67 | return result; 68 | } 69 | 70 | 71 | // popcnt_mul from popcnt-harley-seal.cpp 72 | std::uint64_t popcnt_parallel_64bit_mul(const uint8_t* data, const size_t n) { 73 | 74 | uint64_t result = 0; 75 | 76 | size_t i = 0; 77 | 78 | for (/**/; i < n; i += 8) { 79 | 80 | const uint64_t m1 
= UINT64_C(0x5555555555555555); 81 | const uint64_t m2 = UINT64_C(0x3333333333333333); 82 | const uint64_t m4 = UINT64_C(0x0F0F0F0F0F0F0F0F); 83 | const uint64_t h01 = UINT64_C(0x0101010101010101); 84 | 85 | uint64_t x = *reinterpret_cast(data + i); 86 | x -= (x >> 1) & m1; 87 | x = (x & m2) + ((x >> 2) & m2); 88 | x = (x + (x >> 4)) & m4; 89 | 90 | result += (x * h01) >> 56; 91 | } 92 | 93 | for (/**/; i < n; i++) { 94 | result += lookup8bit[data[i]]; 95 | } 96 | 97 | return result; 98 | } 99 | 100 | 101 | std::uint64_t popcnt_parallel_64bit_optimized2(const uint8_t* data, const size_t n) { 102 | 103 | uint64_t result = 0; 104 | 105 | size_t i = 0; 106 | 107 | while (i + 7*8 <= n) { 108 | const uint64_t in0 = *reinterpret_cast(data + i + 8*0); 109 | const uint64_t in1 = *reinterpret_cast(data + i + 8*1); 110 | const uint64_t in2 = *reinterpret_cast(data + i + 8*2); 111 | const uint64_t in3 = *reinterpret_cast(data + i + 8*3); 112 | const uint64_t in4 = *reinterpret_cast(data + i + 8*4); 113 | const uint64_t in5 = *reinterpret_cast(data + i + 8*5); 114 | const uint64_t in6 = *reinterpret_cast(data + i + 8*6); 115 | i += 7*8; 116 | 117 | // 2-bit sums (we sum three 1-bit numbers: 3*1 = 2^2 - 1) 118 | const uint64_t a0 = in0 & 0x5555555555555555llu; 119 | const uint64_t a1 = (in0 >> 1) & 0x5555555555555555llu; 120 | const uint64_t a2 = in1 & 0x5555555555555555llu; 121 | const uint64_t A0 = (a0 + a1 + a2); 122 | 123 | const uint64_t a3 = (in1 >> 1) & 0x5555555555555555llu; 124 | const uint64_t a4 = in2 & 0x5555555555555555llu; 125 | const uint64_t a5 = (in2 >> 1) & 0x5555555555555555llu; 126 | const uint64_t A1 = (a3 + a4 + a5); 127 | 128 | const uint64_t a6 = in3 & 0x5555555555555555llu; 129 | const uint64_t a7 = (in3 >> 1) & 0x5555555555555555llu; 130 | const uint64_t a8 = in4 & 0x5555555555555555llu; 131 | const uint64_t A2 = (a6 + a7 + a8); 132 | 133 | const uint64_t a9 = (in4 >> 1) & 0x5555555555555555llu; 134 | const uint64_t a10 = in5 & 0x5555555555555555llu; 
135 | const uint64_t a11 = (in5 >> 1) & 0x5555555555555555llu; 136 | const uint64_t A3 = (a9 + a10 + a11); 137 | 138 | const uint64_t a12 = in6 & 0x5555555555555555llu; 139 | const uint64_t a13 = (in6 >> 1) & 0x5555555555555555llu; 140 | const uint64_t A4 = (a12 + a13 + 0); 141 | 142 | // 4-bit sums (we sum five 2-bit numbers: 5*3 = 15 = 2^4 - 1 143 | const uint64_t b0 = A0 & 0x3333333333333333llu; 144 | const uint64_t b1 = (A0 >> 2) & 0x3333333333333333llu; 145 | const uint64_t b2 = A1 & 0x3333333333333333llu; 146 | const uint64_t b3 = (A1 >> 2) & 0x3333333333333333llu; 147 | const uint64_t b4 = A2 & 0x3333333333333333llu; 148 | const uint64_t b5 = (A2 >> 2) & 0x3333333333333333llu; 149 | const uint64_t b6 = A3 & 0x3333333333333333llu; 150 | const uint64_t b7 = (A3 >> 2) & 0x3333333333333333llu; 151 | const uint64_t b8 = A4 & 0x3333333333333333llu; 152 | const uint64_t b9 = (A4 >> 2) & 0x3333333333333333llu; 153 | 154 | const uint64_t B0 = (b0 + b1 + b2 + b3 + b4); 155 | const uint64_t B1 = (b5 + b6 + b7 + b8 + b9); 156 | 157 | // horiz sum of 4-bit values 158 | const uint64_t hsum = 0x0101010101010101llu; 159 | 160 | const uint64_t c0 = B0 & 0x0f0f0f0f0f0f0f0fllu; 161 | const uint64_t c1 = (B0 >> 4) & 0x0f0f0f0f0f0f0f0fllu; 162 | const uint64_t c2 = B1 & 0x0f0f0f0f0f0f0f0fllu; 163 | const uint64_t c3 = (B1 >> 4) & 0x0f0f0f0f0f0f0f0fllu; 164 | 165 | result += ((c0 + c1) * hsum) >> 56; 166 | result += ((c2 + c3) * hsum) >> 56; 167 | } 168 | 169 | for (/**/; i < n; i++) { 170 | result += lookup8bit[data[i]]; 171 | } 172 | 173 | return result; 174 | } 175 | -------------------------------------------------------------------------------- /popcnt-bit-parallel-scalar32.cpp: -------------------------------------------------------------------------------- 1 | std::uint64_t popcnt_parallel_32bit_naive(const uint8_t* data, const size_t n) { 2 | 3 | uint32_t result = 0; 4 | 5 | size_t i = 0; 6 | 7 | #define ITER { \ 8 | const uint32_t t1 = *reinterpret_cast(data + i); \ 9 | 
const uint32_t t2 = (t1 & 0x55555555llu) + ((t1 >> 1) & 0x55555555llu); \ 10 | const uint32_t t3 = (t2 & 0x33333333llu) + ((t2 >> 2) & 0x33333333llu); \ 11 | const uint32_t t4 = (t3 & 0x0f0f0f0fllu) + ((t3 >> 4) & 0x0f0f0f0fllu); \ 12 | const uint32_t t5 = (t4 & 0x00ff00ffllu) + ((t4 >> 8) & 0x00ff00ffllu); \ 13 | const uint32_t t6 = (t5 & 0x0000ffffllu) + ((t5 >> 16) & 0x0000ffffllu); \ 14 | result += t6; \ 15 | i += 4; \ 16 | } 17 | 18 | while (i + 4*4 <= n) { 19 | ITER ITER ITER ITER 20 | } 21 | 22 | #undef ITER 23 | 24 | for (/**/; i < n; i++) { 25 | result += lookup8bit[data[i]]; 26 | } 27 | 28 | return result; 29 | } 30 | 31 | 32 | std::uint64_t popcnt_parallel_32bit_optimized(const uint8_t* data, const size_t n) { 33 | 34 | uint32_t result = 0; 35 | 36 | size_t i = 0; 37 | 38 | while (i + 4*4 <= n) { 39 | 40 | uint32_t partial = 0; // packed_byte 41 | 42 | #define ITER { \ 43 | const uint32_t t1 = *reinterpret_cast(data + i); \ 44 | const uint32_t t2 = (t1 & 0x55555555llu) + ((t1 >> 1) & 0x55555555llu); \ 45 | const uint32_t t3 = (t2 & 0x33333333llu) + ((t2 >> 2) & 0x33333333llu); \ 46 | const uint32_t t4 = (t3 & 0x0f0f0f0fllu) + ((t3 >> 4) & 0x0f0f0f0fllu); \ 47 | partial += t4; \ 48 | i += 4; \ 49 | } 50 | 51 | ITER ITER ITER ITER 52 | 53 | #undef ITER 54 | 55 | const uint32_t t5 = (partial & 0x00ff00ffllu) + ((partial >> 8) & 0x00ff00ffllu); 56 | const uint32_t t6 = (t5 & 0x0000ffffllu) + ((t5 >> 16) & 0x0000ffffllu); 57 | 58 | result += t6; 59 | } 60 | 61 | for (/**/; i < n; i++) { 62 | result += lookup8bit[data[i]]; 63 | } 64 | 65 | return result; 66 | } 67 | 68 | 69 | -------------------------------------------------------------------------------- /popcnt-builtin.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | 4 | // Note: this emits a popcnt with clang 3.4 but not with clang 3.0 5 | uint64_t builtin_popcnt(const uint64_t* buf, int len) { 6 | uint64_t cnt = 0; 7 | for (int i = 0; i < len; ++i) 
{ 8 | cnt += __builtin_popcountll(buf[i]); 9 | } 10 | return cnt; 11 | } 12 | 13 | uint64_t builtin_popcnt32(const uint64_t* buf64, int len64) { 14 | uint64_t cnt = 0; 15 | const uint32_t* buf = (const uint32_t*) buf64; 16 | int len = len64 * 2; 17 | for (int i = 0; i < len; ++i) { 18 | cnt += __builtin_popcount(buf[i]); 19 | } 20 | return cnt; 21 | } 22 | 23 | uint64_t builtin_popcnt_unrolled(const uint64_t* buf, int len) { 24 | assert(len % 4 == 0); 25 | uint64_t cnt = 0; 26 | for (int i = 0; i < len; i+=4) { 27 | cnt += __builtin_popcountll(buf[i]); 28 | cnt += __builtin_popcountll(buf[i+1]); 29 | cnt += __builtin_popcountll(buf[i+2]); 30 | cnt += __builtin_popcountll(buf[i+3]); 31 | } 32 | return cnt; 33 | } 34 | 35 | uint64_t builtin_popcnt_unrolled32(const uint64_t* buf64, int len64) { 36 | const uint32_t* buf = (const uint32_t*) buf64; 37 | int len = len64 * 2; 38 | assert(len % 4 == 0); 39 | uint64_t cnt = 0; 40 | for (int i = 0; i < len; i+=4) { 41 | cnt += __builtin_popcount(buf[i]); 42 | cnt += __builtin_popcount(buf[i+1]); 43 | cnt += __builtin_popcount(buf[i+2]); 44 | cnt += __builtin_popcount(buf[i+3]); 45 | } 46 | return cnt; 47 | } 48 | 49 | // Attempt to work around false depdency errata. 50 | // gcc is too smart to fall for this and re-creates the dependency unless 51 | // compiled with -funroll-loops or something similar. 52 | // This works with clang, though. 
53 | uint64_t builtin_popcnt_unrolled_errata(const uint64_t* buf, int len) { 54 | assert(len % 4 == 0); 55 | uint64_t cnt[4]; 56 | for (int i = 0; i < 4; ++i) { 57 | cnt[i] = 0; 58 | } 59 | 60 | for (int i = 0; i < len; i+=4) { 61 | cnt[0] += __builtin_popcountll(buf[i]); 62 | cnt[1] += __builtin_popcountll(buf[i+1]); 63 | cnt[2] += __builtin_popcountll(buf[i+2]); 64 | cnt[3] += __builtin_popcountll(buf[i+3]); 65 | } 66 | return cnt[0] + cnt[1] + cnt[2] + cnt[3]; 67 | } 68 | 69 | -------------------------------------------------------------------------------- /popcnt-cpu.cpp: -------------------------------------------------------------------------------- 1 | std::uint64_t popcnt_cpu_64bit(const uint8_t* data, const size_t n) { 2 | 3 | uint64_t result = 0; 4 | 5 | uint64_t v, i = 0; 6 | #define ITER { \ 7 | v = *reinterpret_cast(data + i); \ 8 | result += _popcnt64(v); \ 9 | i += 8; \ 10 | } 11 | 12 | while (i + 4*8 <= n) { 13 | ITER ITER ITER ITER 14 | } 15 | 16 | #undef ITER 17 | 18 | while (i < n) { 19 | result += lookup8bit[data[i]]; 20 | i++; 21 | } 22 | 23 | return result; 24 | } 25 | 26 | 27 | // Here's a version that doesn't rely on the compiler not doing 28 | // bad optimizations. 29 | // This code is from Alex Yee. 
30 | 31 | uint64_t builtin_popcnt_unrolled_errata_manual(const uint64_t* buf, int len) { 32 | assert(len % 4 == 0); 33 | uint64_t cnt[4]; 34 | for (int i = 0; i < 4; ++i) { 35 | cnt[i] = 0; 36 | } 37 | 38 | for (int i = 0; i < len; i+=4) { 39 | __asm__ __volatile__( 40 | "popcnt %4, %4 \n\t" 41 | "add %4, %0 \n\t" 42 | "popcnt %5, %5 \n\t" 43 | "add %5, %1 \n\t" 44 | "popcnt %6, %6 \n\t" 45 | "add %6, %2 \n\t" 46 | "popcnt %7, %7 \n\t" 47 | "add %7, %3 \n\t" 48 | : "+r" (cnt[0]), "+r" (cnt[1]), "+r" (cnt[2]), "+r" (cnt[3]) 49 | : "r" (buf[i]), "r" (buf[i+1]), "r" (buf[i+2]), "r" (buf[i+3]) 50 | ); 51 | } 52 | return cnt[0] + cnt[1] + cnt[2] + cnt[3]; 53 | } 54 | 55 | 56 | // This works as intended with clang, but gcc turns the MOVQ intrinsic into an xmm->mem 57 | // operation which defeats the purpose of using MOVQ. 58 | 59 | uint64_t builtin_popcnt_movdq(const uint64_t* buf, int len) { 60 | uint64_t cnt = 0; 61 | __m128i temp; 62 | __m128i temp2; 63 | uint64_t lower64; 64 | uint64_t upper64; 65 | 66 | for (int i = 0; i < len; i+=2) { 67 | temp = _mm_load_si128((__m128i*)&buf[i]); 68 | lower64 = _mm_cvtsi128_si64(temp); 69 | cnt += __builtin_popcountll(lower64); 70 | temp2 = (__m128i)_mm_movehl_ps((__m128)temp, (__m128)temp); 71 | upper64 = _mm_cvtsi128_si64(temp2); 72 | cnt += __builtin_popcountll(upper64); 73 | } 74 | return cnt; 75 | } 76 | 77 | // With gcc, this code has the same problem as the previous fn, where movq 78 | // gets translated into an xmm->mem movq. 79 | // Clang handles the movq correctly but it optimizes away the seperate cnt 80 | // variables, causing the popcnt false register dependcy to reduce performance. 
81 | 82 | uint64_t builtin_popcnt_movdq_unrolled(const uint64_t* buf, int len) { 83 | uint64_t cnt[4]; 84 | __m128i temp[2]; 85 | __m128i temp_upper[2]; 86 | uint64_t lower64[2]; 87 | uint64_t upper64[2]; 88 | 89 | for (int i = 0; i < 4; ++i) { 90 | cnt[i] = 0; 91 | } 92 | 93 | for (int i = 0; i < len; i+=4) { 94 | temp[0] = _mm_load_si128((__m128i*)&buf[i]); 95 | temp[1] = _mm_load_si128((__m128i*)&buf[i+2]); 96 | lower64[0] = _mm_cvtsi128_si64(temp[0]); 97 | lower64[1] = _mm_cvtsi128_si64(temp[1]); 98 | cnt[0] += __builtin_popcountll(lower64[0]); 99 | cnt[1] += __builtin_popcountll(lower64[1]); 100 | temp_upper[0] = (__m128i)_mm_movehl_ps((__m128)temp[0], (__m128)temp[0]); 101 | temp_upper[1] = (__m128i)_mm_movehl_ps((__m128)temp[1], (__m128)temp[1]); 102 | upper64[0] = _mm_cvtsi128_si64(temp_upper[0]); 103 | upper64[1] = _mm_cvtsi128_si64(temp_upper[1]); 104 | cnt[2] += __builtin_popcountll(upper64[0]); 105 | cnt[3] += __builtin_popcountll(upper64[1]); 106 | } 107 | 108 | __asm__ __volatile__("":::"memory"); // without this GCC 4.9.2 optimized out the loop 109 | 110 | return cnt[0] + cnt[1] + cnt[2] + cnt[3]; 111 | } 112 | 113 | uint64_t builtin_popcnt_movdq_unrolled_manual(const uint64_t* buf, int len) { 114 | uint64_t cnt[4]; 115 | 116 | for (int i = 0; i < 4; ++i) { 117 | cnt[i] = 0; 118 | } 119 | 120 | __m128i x0_upper = _mm_setzero_si128(); 121 | __m128i x1_upper = _mm_setzero_si128(); 122 | 123 | for (int i = 0; i < len; i+=4) { 124 | __m128i x0 = _mm_load_si128((__m128i*)&buf[i]); 125 | __m128i x1 = _mm_load_si128((__m128i*)&buf[i+2]); 126 | 127 | uint64_t dummy0; 128 | uint64_t dummy1; 129 | uint64_t dummy0_upper; 130 | uint64_t dummy1_upper; 131 | 132 | __asm__ __volatile__( 133 | "movhlps %10, %6 \n\t" 134 | "movhlps %11, %7 \n\t" 135 | "movq %10, %4 \n\t" 136 | "movq %11, %5 \n\t" 137 | "popcnt %4, %4 \n\t" 138 | "add %4, %0 \n\t" 139 | "popcnt %5, %5 \n\t" 140 | "add %5, %1 \n\t" 141 | "movq %6, %8 \n\t" 142 | "movq %7, %9 \n\t" 143 | "popcnt %8, %8 
\n\t" 144 | "add %8, %2 \n\t" 145 | "popcnt %9, %9 \n\t" 146 | "add %9, %3 \n\t" 147 | : "+r" (cnt[0]), "+r" (cnt[1]), "+r" (cnt[2]), "+r" (cnt[3]), 148 | "=&r" (dummy0), "=&r" (dummy1), "+x" (x0_upper), "+x" (x1_upper), 149 | "=&r" (dummy0_upper), "=&r" (dummy1_upper) 150 | : "x" (x0), "x" (x1) 151 | ); 152 | } 153 | return cnt[0] + cnt[1] + cnt[2] + cnt[3]; 154 | } 155 | 156 | -------------------------------------------------------------------------------- /popcnt-harley-seal.cpp: -------------------------------------------------------------------------------- 1 | namespace { 2 | 3 | /// This uses fewer arithmetic operations than any other known 4 | /// implementation on machines with fast multiplication. 5 | /// It uses 12 arithmetic operations, one of which is a multiply. 6 | /// http://en.wikipedia.org/wiki/Hamming_weight#Efficient_implementation 7 | /// 8 | uint64_t popcount_mul(uint64_t x) 9 | { 10 | const uint64_t m1 = UINT64_C(0x5555555555555555); 11 | const uint64_t m2 = UINT64_C(0x3333333333333333); 12 | const uint64_t m4 = UINT64_C(0x0F0F0F0F0F0F0F0F); 13 | const uint64_t h01 = UINT64_C(0x0101010101010101); 14 | 15 | x -= (x >> 1) & m1; 16 | x = (x & m2) + ((x >> 2) & m2); 17 | x = (x + (x >> 4)) & m4; 18 | return (x * h01) >> 56; 19 | } 20 | 21 | /// Carry-save adder (CSA). 22 | /// @see Chapter 5 in "Hacker's Delight". 23 | /// 24 | void CSA(uint64_t& h, uint64_t& l, uint64_t a, uint64_t b, uint64_t c) 25 | { 26 | uint64_t u = a ^ b; 27 | h = (a & b) | (u & c); 28 | l = u ^ c; 29 | } 30 | 31 | /// Harley-Seal popcount (4th iteration). 32 | /// The Harley-Seal popcount algorithm is one of the fastest algorithms 33 | /// for counting 1 bits in an array using only integer operations. 34 | /// This implementation uses only 5.69 instructions per 64-bit word. 35 | /// @see Chapter 5 in "Hacker's Delight" 2nd edition. 
36 | /// 37 | uint64_t popcnt_harley_seal_64bit(const uint64_t* data, const uint64_t size) 38 | { 39 | uint64_t total = 0; 40 | uint64_t ones = 0, twos = 0, fours = 0, eights = 0, sixteens = 0; 41 | uint64_t twosA, twosB, foursA, foursB, eightsA, eightsB; 42 | uint64_t limit = size - size % 16; 43 | uint64_t i = 0; 44 | 45 | for(; i < limit; i += 16) 46 | { 47 | CSA(twosA, ones, ones, data[i+0], data[i+1]); 48 | CSA(twosB, ones, ones, data[i+2], data[i+3]); 49 | CSA(foursA, twos, twos, twosA, twosB); 50 | CSA(twosA, ones, ones, data[i+4], data[i+5]); 51 | CSA(twosB, ones, ones, data[i+6], data[i+7]); 52 | CSA(foursB, twos, twos, twosA, twosB); 53 | CSA(eightsA,fours, fours, foursA, foursB); 54 | CSA(twosA, ones, ones, data[i+8], data[i+9]); 55 | CSA(twosB, ones, ones, data[i+10], data[i+11]); 56 | CSA(foursA, twos, twos, twosA, twosB); 57 | CSA(twosA, ones, ones, data[i+12], data[i+13]); 58 | CSA(twosB, ones, ones, data[i+14], data[i+15]); 59 | CSA(foursB, twos, twos, twosA, twosB); 60 | CSA(eightsB, fours, fours, foursA, foursB); 61 | CSA(sixteens, eights, eights, eightsA, eightsB); 62 | 63 | total += popcount_mul(sixteens); 64 | } 65 | 66 | total *= 16; 67 | total += 8 * popcount_mul(eights); 68 | total += 4 * popcount_mul(fours); 69 | total += 2 * popcount_mul(twos); 70 | total += 1 * popcount_mul(ones); 71 | 72 | for(; i < size; i++) 73 | total += popcount_mul(data[i]); 74 | 75 | return total; 76 | } 77 | 78 | } // namespace 79 | 80 | uint64_t popcnt_harley_seal(const uint8_t* data, const size_t size) 81 | { 82 | uint64_t total = popcnt_harley_seal_64bit((const uint64_t*) data, size / 8); 83 | 84 | for (size_t i = size - size % 8; i < size; i++) 85 | total += lookup8bit[data[i]]; 86 | 87 | return total; 88 | } 89 | -------------------------------------------------------------------------------- /popcnt-lookup.cpp: -------------------------------------------------------------------------------- 1 | // ---- lookup[256] 
// -------------------------------------------------------

// Table generators: each macro expands to the bit counts of 4 / 16 / 64 / 256
// consecutive byte values, where `n` is the number of set bits contributed by
// the (shared) higher bits. Both tables below are built from the same
// expansion, so they hold identical values and differ only in element type.
#define POPCNT_LUT_4(n)   (n), (n) + 1, (n) + 1, (n) + 2
#define POPCNT_LUT_16(n)  POPCNT_LUT_4(n), POPCNT_LUT_4((n) + 1), POPCNT_LUT_4((n) + 1), POPCNT_LUT_4((n) + 2)
#define POPCNT_LUT_64(n)  POPCNT_LUT_16(n), POPCNT_LUT_16((n) + 1), POPCNT_LUT_16((n) + 1), POPCNT_LUT_16((n) + 2)
#define POPCNT_LUT_256    POPCNT_LUT_64(0), POPCNT_LUT_64(1), POPCNT_LUT_64(1), POPCNT_LUT_64(2)

// lookup64bit[v] == number of set bits in the byte value v (64-bit entries)
uint64_t lookup64bit[256] = {
    POPCNT_LUT_256
};


// lookup8bit[v] == number of set bits in the byte value v (8-bit entries)
uint8_t lookup8bit[256] = {
    POPCNT_LUT_256
};

#undef POPCNT_LUT_256
#undef POPCNT_LUT_64
#undef POPCNT_LUT_16
#undef POPCNT_LUT_4


/// Counts the set bits in `n` bytes with one `lookup8bit` access per byte.
///
/// The main loop is unrolled four times; the last `n % 4` bytes are handled
/// by the second loop.
///
/// @param data  input bytes
/// @param n     number of bytes in `data`
/// @return total number of set bits in `data[0 .. n)`
std::uint64_t popcnt_lookup_8bit(const uint8_t* data, const size_t n) {

    // 64-bit accumulator, matching the return type (size_t may be 32 bits wide)
    std::uint64_t result = 0;

    size_t i = 0;
    while (i + 4 <= n) {
        result += lookup8bit[data[i]]; i++;
        result += lookup8bit[data[i]]; i++;
        result += lookup8bit[data[i]]; i++;
        result += lookup8bit[data[i]]; i++;
    }

    while (i < n) {
        result += lookup8bit[data[i]]; i++;
    }

    return result;
}


/// Counts the set bits in `n` bytes with one `lookup64bit` access per byte.
///
/// Same structure as `popcnt_lookup_8bit`; only the table element type
/// differs.
///
/// @param data  input bytes
/// @param n     number of bytes in `data`
/// @return total number of set bits in `data[0 .. n)`
std::uint64_t popcnt_lookup_64bit(const uint8_t* data, const size_t n) {

    // 64-bit accumulator, matching the return type (size_t may be 32 bits wide)
    std::uint64_t result = 0;

    size_t i = 0;
    while (i + 4 <= n) {
        result += lookup64bit[data[i]]; i++;
        result += lookup64bit[data[i]]; i++;
        result += lookup64bit[data[i]]; i++;
        result += lookup64bit[data[i]]; i++;
    }

    while (i < n) {
        result += lookup64bit[data[i]]; i++;
    }

    return result;
}

// --------------------------------------------------------------------------------
// /popcnt-neon.cpp:
-------------------------------------------------------------------------------- 1 | 2 | FORCE_INLINE uint64x2_t vpadalq(uint64x2_t sum, uint8x16_t t) 3 | { 4 | return vpadalq_u32(sum, vpaddlq_u16(vpaddlq_u8(t))); 5 | } 6 | 7 | 8 | uint64_t popcnt_neon_vcnt(const uint8_t* data, const size_t size) 9 | { 10 | uint64_t i = 0; 11 | uint64_t cnt = 0; 12 | uint64_t chunk_size = 64; 13 | 14 | if (size >= chunk_size) 15 | { 16 | uint64_t iters = size / chunk_size; 17 | const uint8_t* ptr = (const uint8_t*) data; 18 | uint64x2_t sum = vcombine_u64(vcreate_u64(0), vcreate_u64(0)); 19 | uint8x16_t zero = vcombine_u8(vcreate_u8(0), vcreate_u8(0)); 20 | 21 | do 22 | { 23 | uint8x16_t t0 = zero; 24 | uint8x16_t t1 = zero; 25 | uint8x16_t t2 = zero; 26 | uint8x16_t t3 = zero; 27 | 28 | /* 29 | * After every 31 iterations we need to add the 30 | * temporary sums (t0, t1, t2, t3) to the total sum. 31 | * We must ensure that the temporary sums <= 255 32 | * and 31 * 8 bits = 248 which is OK. 33 | */ 34 | uint64_t limit = (i + 31 < iters) ? 
i + 31 : iters; 35 | 36 | /* Each iteration processes 64 bytes */ 37 | for (; i < limit; i++) 38 | { 39 | uint8x16x4_t input = vld4q_u8(ptr); 40 | ptr += chunk_size; 41 | 42 | t0 = vaddq_u8(t0, vcntq_u8(input.val[0])); 43 | t1 = vaddq_u8(t1, vcntq_u8(input.val[1])); 44 | t2 = vaddq_u8(t2, vcntq_u8(input.val[2])); 45 | t3 = vaddq_u8(t3, vcntq_u8(input.val[3])); 46 | } 47 | 48 | sum = vpadalq(sum, t0); 49 | sum = vpadalq(sum, t1); 50 | sum = vpadalq(sum, t2); 51 | sum = vpadalq(sum, t3); 52 | } 53 | while (i < iters); 54 | 55 | uint64_t tmp[2]; 56 | vst1q_u64(tmp, sum); 57 | cnt += tmp[0]; 58 | cnt += tmp[1]; 59 | 60 | /* Convert back to byte index */ 61 | i *= chunk_size; 62 | } 63 | 64 | for (; i < size; i++) { 65 | cnt += lookup8bit[data[i]]; 66 | } 67 | 68 | return cnt; 69 | } 70 | 71 | 72 | void FORCE_INLINE CSA(uint8x16_t& h, uint8x16_t& l, uint8x16_t a, uint8x16_t b, uint8x16_t c) 73 | { 74 | uint8x16_t u = veorq_u8(a, b); 75 | h = vorrq_u8(vandq_u8(a, b), vandq_u8(u, c)); 76 | l = veorq_u8(u, c); 77 | } 78 | 79 | 80 | uint32x2_t FORCE_INLINE popcnt_neon_qreg(const uint8x16_t reg) { 81 | 82 | const uint8x16_t pcnt = vcntq_u8(reg); 83 | const uint16x8_t t0 = vpaddlq_u8(pcnt); 84 | const uint32x4_t t1 = vpaddlq_u16(t0); 85 | const uint32x2_t t2 = vadd_u32(vget_low_u32(t1), vget_high_u32(t1)); 86 | 87 | return t2; 88 | } 89 | 90 | 91 | uint64_t popcnt_neon_harley_seal(const uint8_t* data, const size_t size) 92 | { 93 | uint32x2_t total = vdup_n_u32(0); 94 | uint8x16_t ones, twos, fours, eights, sixteens; 95 | uint8x16_t twosA, twosB, foursA, foursB, eightsA, eightsB; 96 | uint64_t limit = size - size % (16*16); 97 | uint64_t i = 0; 98 | 99 | ones = twos = fours = eights = sixteens = vdupq_n_u8(0); 100 | 101 | uint8_t* ptr = const_cast(data); 102 | 103 | for(; i < limit; i += 16*16) 104 | { 105 | CSA(twosA, ones, ones, vld1q_u8(ptr + 16*0), vld1q_u8(ptr + 16*1)); 106 | CSA(twosB, ones, ones, vld1q_u8(ptr + 16*2), vld1q_u8(ptr + 16*3)); 107 | CSA(foursA, twos, 
twos, twosA, twosB); 108 | CSA(twosA, ones, ones, vld1q_u8(ptr + 16*4), vld1q_u8(ptr + 16*5)); 109 | CSA(twosB, ones, ones, vld1q_u8(ptr + 16*6), vld1q_u8(ptr + 16*7)); 110 | CSA(foursB, twos, twos, twosA, twosB); 111 | CSA(eightsA,fours, fours, foursA, foursB); 112 | CSA(twosA, ones, ones, vld1q_u8(ptr + 16*8), vld1q_u8(ptr + 16*9)); 113 | CSA(twosB, ones, ones, vld1q_u8(ptr + 16*10), vld1q_u8(ptr + 16*11)); 114 | CSA(foursA, twos, twos, twosA, twosB); 115 | CSA(twosA, ones, ones, vld1q_u8(ptr + 16*12), vld1q_u8(ptr + 16*13)); 116 | CSA(twosB, ones, ones, vld1q_u8(ptr + 16*14), vld1q_u8(ptr + 16*15)); 117 | CSA(foursB, twos, twos, twosA, twosB); 118 | CSA(eightsB, fours, fours, foursA, foursB); 119 | CSA(sixteens, eights, eights, eightsA, eightsB); 120 | 121 | total = vadd_u32(total, popcnt_neon_qreg(sixteens)); 122 | 123 | ptr += 16*16; 124 | } 125 | 126 | total = vshl_n_u32(total, 4); 127 | total = vadd_u32(total, vshl_n_u32(popcnt_neon_qreg(eights), 3)); 128 | total = vadd_u32(total, vshl_n_u32(popcnt_neon_qreg(fours), 2)); 129 | total = vadd_u32(total, vshl_n_u32(popcnt_neon_qreg(twos), 1)); 130 | total = vadd_u32(total, popcnt_neon_qreg(ones)); 131 | 132 | uint32_t scalar = 0; 133 | 134 | scalar += vget_lane_u32(total, 0); 135 | scalar += vget_lane_u32(total, 1); 136 | 137 | for(; i < size; i++) { 138 | scalar += lookup8bit[*ptr++]; 139 | } 140 | 141 | return scalar; 142 | } 143 | -------------------------------------------------------------------------------- /popcnt-rvv.cpp: -------------------------------------------------------------------------------- 1 | // The most generic procedure 2 | std::uint64_t popcnt_rvv_lookup(const uint8_t* data, const size_t n) { 3 | const unsigned vlenb = __riscv_vlenb(); 4 | const unsigned vlenb_m8 = vlenb * 8; 5 | const unsigned vlenb_log2 = __builtin_ctz(vlenb_m8); 6 | const unsigned vlenb_mask = ((1 << vlenb_log2) - 1); 7 | 8 | // Note: when VLEN * LMUL=8 >= 256, we can have a specialisation without the inner loop on 
`consumed` 9 | 10 | // 1. load lookup table 11 | const size_t vl_lookup = __riscv_vsetvl_e8m8(256); 12 | const vuint8m8_t lookup = __riscv_vle8_v_u8m8(lookup8bit, vl_lookup); 13 | 14 | std::uint64_t result = 0; 15 | 16 | const size_t vl = __riscv_vsetvlmax_e16m1(); 17 | 18 | const vuint16m1_t zero = __riscv_vmv_v_x_u16m1(0, vl); 19 | 20 | // 2. process input in chunks 21 | const uint8_t* end = data + n; 22 | while (data + vlenb_m8 < end) { 23 | const size_t vl = __riscv_vsetvlmax_e8m8(); 24 | vuint8m8_t vcounter = __riscv_vmv_v_x_u8m8(0, vl); 25 | 26 | for (int k=0; k < 255/8 && data + vlenb_m8 < end; k++, data += vlenb_m8) { 27 | // a. load input chunk 28 | vuint8m8_t input = __riscv_vle8_v_u8m8(data, vl); 29 | 30 | unsigned consumed = 0; 31 | while (consumed < 8) { 32 | // b. get lower log2 bits & add popcount 33 | const vuint8m8_t masked = __riscv_vand_vx_u8m8(input, vlenb_mask, vl); 34 | const vuint8m8_t popcount = __riscv_vrgather_vv_u8m8(lookup, masked, vl); 35 | 36 | // c. update local pointer 37 | vcounter = __riscv_vadd_vv_u8m8(vcounter, popcount, vl); 38 | 39 | // d. bring upper, unprocessed bits 40 | input = __riscv_vsrl_vx_u8m8(input, vlenb_log2, vl); 41 | consumed += vlenb_log2; 42 | } 43 | } 44 | 45 | { // update the main counter 46 | vuint16m1_t tmp = __riscv_vwredsumu(vcounter, zero, vl); 47 | const uint16_t tmp0 = __riscv_vmv_x(tmp); 48 | result += uint64_t(tmp0); 49 | } 50 | } 51 | 52 | // 3. 
process tail 53 | while (data < end) { 54 | result += lookup8bit[*data]; 55 | data += 1; 56 | } 57 | 58 | return result; 59 | } 60 | -------------------------------------------------------------------------------- /popcnt-sse-bit-parallel-better.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | https://en.wikipedia.org/wiki/Hamming_weight#Efficient_implementation 3 | 4 | the three first steps from popcount_2: 5 | 6 | int popcount_2(uint64_t x) { 7 | x -= (x >> 1) & m1; //put count of each 2 bits into those 2 bits 8 | x = (x & m2) + ((x >> 2) & m2); //put count of each 4 bits into those 4 bits 9 | x = (x + (x >> 4)) & m4; //put count of each 8 bits into those 8 bits 10 | x += x >> 8; //put count of each 16 bits into their lowest 8 bits 11 | x += x >> 16; //put count of each 32 bits into their lowest 8 bits 12 | x += x >> 32; //put count of each 64 bits into their lowest 8 bits 13 | return x & 0x7f; 14 | } 15 | */ 16 | 17 | std::uint64_t popcnt_SSE_bit_parallel_better(const uint8_t* data, const size_t n) { 18 | 19 | size_t i = 0; 20 | 21 | const __m128i m1 = _mm_set1_epi8(0x55); 22 | const __m128i m2 = _mm_set1_epi8(0x33); 23 | const __m128i m4 = _mm_set1_epi8(0x0f); 24 | 25 | __m128i acc = _mm_setzero_si128(); 26 | 27 | while (i + 4*16 < n) { 28 | 29 | __m128i partial = _mm_setzero_si128(); 30 | 31 | #define ITER { \ 32 | const __m128i t1 = _mm_loadu_si128(reinterpret_cast(data + i)); \ 33 | const __m128i t2 = _mm_sub_epi8(t1, _mm_srli_epi16(t1, 1) & m1); \ 34 | const __m128i t3 = _mm_add_epi8(t2 & m2, _mm_srli_epi16(t2, 2) & m2); \ 35 | const __m128i t4 = _mm_add_epi8(t3, _mm_srli_epi16(t3, 4)) & m4; \ 36 | partial = _mm_add_epi8(partial, t4); \ 37 | i += 16; \ 38 | } 39 | 40 | ITER ITER ITER ITER 41 | 42 | #undef ITER 43 | 44 | acc = _mm_add_epi64(acc, _mm_sad_epu8(partial, _mm_setzero_si128())); 45 | } 46 | 47 | size_t result = lower_qword(acc) + higher_qword(acc); 48 | 49 | for (/**/; i < n; i++) { 50 | result += 
lookup8bit[data[i]]; 51 | } 52 | 53 | return result; 54 | } 55 | 56 | -------------------------------------------------------------------------------- /popcnt-sse-bit-parallel.cpp: -------------------------------------------------------------------------------- 1 | std::uint64_t popcnt_SSE_bit_parallel(const uint8_t* data, const size_t n) { 2 | 3 | size_t i = 0; 4 | 5 | const __m128i pattern_2bit = _mm_set1_epi8(0x55); 6 | const __m128i pattern_4bit = _mm_set1_epi8(0x33); 7 | const __m128i pattern_16bit = _mm_set1_epi8(0x0f); 8 | 9 | __m128i acc = _mm_setzero_si128(); 10 | 11 | while (i + 4*16 < n) { 12 | 13 | __m128i partial = _mm_setzero_si128(); 14 | 15 | #define ITER { \ 16 | const __m128i t1 = _mm_loadu_si128(reinterpret_cast(data + i)); \ 17 | const __m128i t2 = (t1 & pattern_2bit) + ((t1 >> shift16(1)) & pattern_2bit); \ 18 | const __m128i t3 = (t2 & pattern_4bit) + ((t2 >> shift16(2)) & pattern_4bit); \ 19 | const __m128i t4 = (t3 & pattern_16bit) + ((t3 >> shift16(4)) & pattern_16bit); \ 20 | partial = _mm_add_epi8(partial, t4); \ 21 | i += 16; \ 22 | } 23 | 24 | ITER ITER ITER ITER 25 | 26 | #undef ITER 27 | 28 | acc = _mm_add_epi64(acc, _mm_sad_epu8(partial, _mm_setzero_si128())); 29 | } 30 | 31 | size_t result = lower_qword(acc) + higher_qword(acc); 32 | 33 | for (/**/; i < n; i++) { 34 | result += lookup8bit[data[i]]; 35 | } 36 | 37 | return result; 38 | } 39 | 40 | 41 | std::uint64_t popcnt_SSE_bit_parallel_original(const uint8_t* data, const size_t n) { 42 | 43 | size_t i = 0; 44 | 45 | const __m128i pattern_2bit = _mm_set1_epi8(0x55); 46 | const __m128i pattern_4bit = _mm_set1_epi8(0x33); 47 | const __m128i pattern_16bit = _mm_set1_epi8(0x0f); 48 | 49 | __m128i acc = _mm_setzero_si128(); 50 | 51 | while (i + 16 < n) { 52 | 53 | __m128i partial = _mm_setzero_si128(); 54 | 55 | for (int k=0; k < 255/8 && i + 16 < n; k++, i += 16) { 56 | 57 | const __m128i t1 = _mm_loadu_si128(reinterpret_cast(data + i)); 58 | const __m128i t2 = (t1 & pattern_2bit) + 
((t1 >> shift16(1)) & pattern_2bit); 59 | const __m128i t3 = (t2 & pattern_4bit) + ((t2 >> shift16(2)) & pattern_4bit); 60 | const __m128i t4 = (t3 & pattern_16bit) + ((t3 >> shift16(4)) & pattern_16bit); 61 | partial = _mm_add_epi8(partial, t4); 62 | } 63 | 64 | acc = _mm_add_epi64(acc, _mm_sad_epu8(partial, _mm_setzero_si128())); 65 | } 66 | 67 | size_t result = lower_qword(acc) + higher_qword(acc); 68 | 69 | for (/**/; i < n; i++) { 70 | result += lookup8bit[data[i]]; 71 | } 72 | 73 | return result; 74 | } 75 | -------------------------------------------------------------------------------- /popcnt-sse-cpu.cpp: -------------------------------------------------------------------------------- 1 | std::uint64_t popcnt_SSE_and_cpu(const uint8_t* data, const size_t n) { 2 | 3 | #define ITER { \ 4 | const __m128i v = _mm_loadu_si128(reinterpret_cast(data + i)); \ 5 | result += _popcnt64(lower_qword(v)); \ 6 | result += _popcnt64(higher_qword(v)); \ 7 | i += 16; \ 8 | } 9 | 10 | size_t i = 0; 11 | uint64_t result = 0; 12 | 13 | while (i + 4*16 <= n) { 14 | ITER ITER ITER ITER 15 | } 16 | 17 | #undef ITER 18 | 19 | for (/**/; i < n; i++) { 20 | result += lookup8bit[data[i]]; 21 | } 22 | 23 | return result; 24 | } 25 | 26 | -------------------------------------------------------------------------------- /popcnt-sse-harley-seal.cpp: -------------------------------------------------------------------------------- 1 | // direct translation of popcnt-harley-seal.cpp 2 | 3 | namespace SSE_harley_seal { 4 | 5 | __m128i popcount(const __m128i x) 6 | { 7 | const __m128i m1 = _mm_set1_epi8(0x55); 8 | const __m128i m2 = _mm_set1_epi8(0x33); 9 | const __m128i m4 = _mm_set1_epi8(0x0F); 10 | 11 | const __m128i t1 = x; 12 | const __m128i t2 = _mm_sub_epi8(t1, _mm_srli_epi16(t1, 1) & m1); 13 | const __m128i t3 = _mm_add_epi8(t2 & m2, _mm_srli_epi16(t2, 2) & m2); 14 | const __m128i t4 = _mm_add_epi8(t3, _mm_srli_epi16(t3, 4)) & m4; 15 | 16 | return _mm_sad_epu8(t4, _mm_setzero_si128()); 
17 | } 18 | 19 | void CSA(__m128i& h, __m128i& l, __m128i a, __m128i b, __m128i c) 20 | { 21 | const __m128i u = a ^ b; 22 | h = (a & b) | (u & c); 23 | l = u ^ c; 24 | 25 | } 26 | 27 | uint64_t popcnt(const __m128i* data, const uint64_t size) 28 | { 29 | __m128i total = _mm_setzero_si128(); 30 | __m128i ones = _mm_setzero_si128(); 31 | __m128i twos = _mm_setzero_si128(); 32 | __m128i fours = _mm_setzero_si128(); 33 | __m128i eights = _mm_setzero_si128(); 34 | __m128i sixteens = _mm_setzero_si128(); 35 | __m128i twosA, twosB, foursA, foursB, eightsA, eightsB; 36 | 37 | const uint64_t limit = size - size % 16; 38 | uint64_t i = 0; 39 | 40 | for(; i < limit; i += 16) 41 | { 42 | CSA(twosA, ones, ones, data[i+0], data[i+1]); 43 | CSA(twosB, ones, ones, data[i+2], data[i+3]); 44 | CSA(foursA, twos, twos, twosA, twosB); 45 | CSA(twosA, ones, ones, data[i+4], data[i+5]); 46 | CSA(twosB, ones, ones, data[i+6], data[i+7]); 47 | CSA(foursB, twos, twos, twosA, twosB); 48 | CSA(eightsA,fours, fours, foursA, foursB); 49 | CSA(twosA, ones, ones, data[i+8], data[i+9]); 50 | CSA(twosB, ones, ones, data[i+10], data[i+11]); 51 | CSA(foursA, twos, twos, twosA, twosB); 52 | CSA(twosA, ones, ones, data[i+12], data[i+13]); 53 | CSA(twosB, ones, ones, data[i+14], data[i+15]); 54 | CSA(foursB, twos, twos, twosA, twosB); 55 | CSA(eightsB, fours, fours, foursA, foursB); 56 | CSA(sixteens, eights, eights, eightsA, eightsB); 57 | 58 | total = _mm_add_epi64(total, popcount(sixteens)); 59 | } 60 | 61 | 62 | total = _mm_slli_epi64(total, 4); // * 16 63 | total = _mm_add_epi64(total, _mm_slli_epi64(popcount(eights), 3)); // += 8 * ... 64 | total = _mm_add_epi64(total, _mm_slli_epi64(popcount(fours), 2)); // += 4 * ... 65 | total = _mm_add_epi64(total, _mm_slli_epi64(popcount(twos), 1)); // += 2 * ... 
66 | total = _mm_add_epi64(total, popcount(ones)); 67 | 68 | for(; i < size; i++) 69 | total += popcount(data[i]); 70 | 71 | return lower_qword(total) + higher_qword(total); 72 | } 73 | 74 | } // SSE_harley_seal 75 | 76 | uint64_t popcnt_SSE_harley_seal(const uint8_t* data, const size_t size) 77 | { 78 | uint64_t total = SSE_harley_seal::popcnt((const __m128i*) data, size / 16); 79 | 80 | for (size_t i = size - size % 16; i < size; i++) 81 | total += lookup8bit[data[i]]; 82 | 83 | return total; 84 | } 85 | 86 | -------------------------------------------------------------------------------- /popcnt-sse-lookup.cpp: -------------------------------------------------------------------------------- 1 | std::uint64_t popcnt_SSE_lookup(const uint8_t* data, const size_t n) { 2 | 3 | size_t i = 0; 4 | 5 | const __m128i lookup = _mm_setr_epi8( 6 | /* 0 */ 0, /* 1 */ 1, /* 2 */ 1, /* 3 */ 2, 7 | /* 4 */ 1, /* 5 */ 2, /* 6 */ 2, /* 7 */ 3, 8 | /* 8 */ 1, /* 9 */ 2, /* a */ 2, /* b */ 3, 9 | /* c */ 2, /* d */ 3, /* e */ 3, /* f */ 4 10 | ); 11 | 12 | const __m128i low_mask = _mm_set1_epi8(0x0f); 13 | 14 | __m128i acc = _mm_setzero_si128(); 15 | 16 | #define ITER { \ 17 | const __m128i vec = _mm_loadu_si128(reinterpret_cast(data + i)); \ 18 | const __m128i lo = vec & low_mask; \ 19 | const __m128i hi = (vec >> shift16(4)) & low_mask; \ 20 | const __m128i popcnt1 = _mm_shuffle_epi8(lookup, lo); \ 21 | const __m128i popcnt2 = _mm_shuffle_epi8(lookup, hi); \ 22 | local = _mm_add_epi8(local, popcnt1); \ 23 | local = _mm_add_epi8(local, popcnt2); \ 24 | i += 16; \ 25 | } 26 | 27 | while (i + 8*16 <= n) { 28 | __m128i local = _mm_setzero_si128(); 29 | ITER ITER ITER ITER 30 | ITER ITER ITER ITER 31 | acc = _mm_add_epi64(acc, _mm_sad_epu8(local, _mm_setzero_si128())); 32 | } 33 | 34 | __m128i local = _mm_setzero_si128(); 35 | 36 | while (i + 16 <= n) { 37 | ITER 38 | } 39 | 40 | acc = _mm_add_epi64(acc, _mm_sad_epu8(local, _mm_setzero_si128())); 41 | 42 | #undef ITER 43 | 44 | size_t 
result = lower_qword(acc) + higher_qword(acc); 45 | 46 | for (/**/; i < n; i++) { 47 | result += lookup8bit[data[i]]; 48 | } 49 | 50 | return result; 51 | } 52 | 53 | 54 | std::uint64_t popcnt_SSE_lookup_original(const uint8_t* data, const size_t n) { 55 | 56 | size_t i = 0; 57 | 58 | const __m128i lookup = _mm_setr_epi8( 59 | /* 0 */ 0, /* 1 */ 1, /* 2 */ 1, /* 3 */ 2, 60 | /* 4 */ 1, /* 5 */ 2, /* 6 */ 2, /* 7 */ 3, 61 | /* 8 */ 1, /* 9 */ 2, /* a */ 2, /* b */ 3, 62 | /* c */ 2, /* d */ 3, /* e */ 3, /* f */ 4 63 | ); 64 | 65 | const __m128i low_mask = _mm_set1_epi8(0x0f); 66 | 67 | __m128i acc = _mm_setzero_si128(); 68 | 69 | while (i + 16 < n) { 70 | __m128i local = _mm_setzero_si128(); 71 | 72 | for (int k=0; k < 255/8 && i + 16 < n; k++, i += 16) { 73 | const __m128i vec = _mm_loadu_si128(reinterpret_cast(data + i)); 74 | const __m128i lo = vec & low_mask; 75 | const __m128i hi = (vec >> shift16(4)) & low_mask; 76 | const __m128i popcnt1 = _mm_shuffle_epi8(lookup, lo); 77 | const __m128i popcnt2 = _mm_shuffle_epi8(lookup, hi); 78 | local = _mm_add_epi8(local, popcnt1); 79 | local = _mm_add_epi8(local, popcnt2); 80 | } 81 | acc = _mm_add_epi64(acc, _mm_sad_epu8(local, _mm_setzero_si128())); 82 | } 83 | 84 | size_t result = lower_qword(acc) + higher_qword(acc); 85 | 86 | for (/**/; i < n; i++) { 87 | result += lookup8bit[data[i]]; 88 | } 89 | 90 | return result; 91 | } 92 | -------------------------------------------------------------------------------- /results/.gitignore: -------------------------------------------------------------------------------- 1 | *.html 2 | -------------------------------------------------------------------------------- /results/README.rst: -------------------------------------------------------------------------------- 1 | ================================================================================ 2 | Microbenchmark results 3 | ================================================================================ 4 | 5 | * Cannon 
Lake: 6 | 7 | * `GCC 8.3.1`__ 8 | 9 | * Cascade Lake: 10 | 11 | * `GCC 8.3.0`__ 12 | 13 | * Skylake-X: 14 | 15 | * `GCC 8.1.0`__ 16 | 17 | * Skylake: 18 | 19 | * `Clang 3.8.0`__ 20 | * `GCC 5.3.0`__ 21 | 22 | * Haswell 23 | 24 | * `Clang 3.8.0`__ 25 | * `GCC 5.3.0`__ 26 | 27 | * Sandybridge-E 28 | 29 | * `GCC 4.8.5`__ 30 | * `GCC 5.3.0`__ 31 | 32 | * Westmere 33 | 34 | * `GCC 4.9.2`__ 35 | 36 | * Bulldozer 37 | 38 | * `GCC 4.8.4`__ 39 | 40 | * ARMv7 41 | 42 | * `GCC 4.9.2`__ 43 | 44 | * ARMv8 45 | 46 | * `GCC 4.8.5`__ 47 | * `Clang 3.8.0`__ 48 | 49 | __ cannonlake/cannonlake-i3-8121U-gcc-8.3.1.rst 50 | __ cascadelake/cascadelake-Xeon-Gold-6240-gcc-8.3.0.rst 51 | __ skylake-x/skylake-x-w-2104-gcc8.1.0.rst 52 | __ skylake/skylake-i7-6700-clang3.8.0-avx2.rst 53 | __ skylake/skylake-i7-6700-gcc5.3.0-avx2.rst 54 | __ haswell/haswell-i7-4770-clang3.8.0-avx2.rst 55 | __ haswell/haswell-i7-4770-gcc5.3.0-avx2.rst 56 | __ sandybridge-e/sandybridgeE-i7-3930k-g++4.8-avx.rst 57 | __ sandybridge-e/sandybridgeE-i7-3930k-g++5.3-avx.rst 58 | __ westmere/westmere-m540-gcc4.9.2-sse.rst 59 | __ bulldozer/bulldozer-fx-8510-gcc4.8.4-sse.rst 60 | __ arm/armv7-32bit-gcc4.9.2.rst 61 | __ arm/arm-64bit-gcc4.8.5.rst 62 | __ arm/arm-64bit-clang3.8.0.rst 63 | -------------------------------------------------------------------------------- /results/arm/arm-64bit-clang3.8.0.csv: -------------------------------------------------------------------------------- 1 | lookup-8, 32, 10000000, 0.441582 2 | lookup-64, 32, 10000000, 0.459142 3 | bit-parallel, 32, 10000000, 0.329409 4 | bit-parallel-optimized, 32, 10000000, 0.241336 5 | bit-parallel-mul, 32, 10000000, 0.248093 6 | bit-parallel32, 32, 10000000, 0.353133 7 | bit-parallel-optimized32, 32, 10000000, 0.306443 8 | harley-seal, 32, 10000000, 0.606159 9 | builtin-popcnt, 32, 10000000, 0.117700 10 | builtin-popcnt32, 32, 10000000, 0.236088 11 | builtin-popcnt-unrolled, 32, 10000000, 0.092716 12 | builtin-popcnt-unrolled32, 32, 10000000, 0.229926 13
| neon-vcnt, 32, 10000000, 0.576741 14 | neon-HS, 32, 10000000, 0.724174 15 | aarch64-cnt, 32, 10000000, 0.570933 16 | lookup-8, 64, 5000000, 0.497284 17 | lookup-64, 64, 5000000, 0.509031 18 | bit-parallel, 64, 5000000, 0.311924 19 | bit-parallel-optimized, 64, 5000000, 0.226550 20 | bit-parallel-mul, 64, 5000000, 0.171573 21 | bit-parallel32, 64, 5000000, 0.335423 22 | bit-parallel-optimized32, 64, 5000000, 0.283957 23 | harley-seal, 64, 5000000, 0.367020 24 | builtin-popcnt, 64, 5000000, 0.097103 25 | builtin-popcnt32, 64, 5000000, 0.236084 26 | builtin-popcnt-unrolled, 64, 5000000, 0.092753 27 | builtin-popcnt-unrolled32, 64, 5000000, 0.229427 28 | neon-vcnt, 64, 5000000, 0.476642 29 | neon-HS, 64, 5000000, 0.566963 30 | aarch64-cnt, 64, 5000000, 0.473708 31 | lookup-8, 128, 2500000, 0.460572 32 | lookup-64, 128, 2500000, 0.466422 33 | bit-parallel, 128, 2500000, 0.303136 34 | bit-parallel-optimized, 128, 2500000, 0.219372 35 | bit-parallel-mul, 128, 2500000, 0.200095 36 | bit-parallel32, 128, 2500000, 0.326634 37 | bit-parallel-optimized32, 128, 2500000, 0.276213 38 | harley-seal, 128, 2500000, 0.120645 39 | builtin-popcnt, 128, 2500000, 0.094177 40 | builtin-popcnt32, 128, 2500000, 0.264833 41 | builtin-popcnt-unrolled, 128, 2500000, 0.092712 42 | builtin-popcnt-unrolled32, 128, 2500000, 0.229501 43 | neon-vcnt, 128, 2500000, 0.065477 44 | neon-HS, 128, 2500000, 0.475945 45 | aarch64-cnt, 128, 2500000, 0.425257 46 | lookup-8, 256, 1250000, 0.442136 47 | lookup-64, 256, 1250000, 0.442551 48 | bit-parallel, 256, 1250000, 0.298649 49 | bit-parallel-optimized, 256, 1250000, 0.215518 50 | bit-parallel-mul, 256, 1250000, 0.175065 51 | bit-parallel32, 256, 1250000, 0.338371 52 | bit-parallel-optimized32, 256, 1250000, 0.281774 53 | harley-seal, 256, 1250000, 0.093058 54 | builtin-popcnt, 256, 1250000, 0.108129 55 | builtin-popcnt32, 256, 1250000, 0.250090 56 | builtin-popcnt-unrolled, 256, 1250000, 0.092682 57 | builtin-popcnt-unrolled32, 256, 1250000, 0.237605 58 | 
neon-vcnt, 256, 1250000, 0.055594 59 | neon-HS, 256, 1250000, 0.091770 60 | aarch64-cnt, 256, 1250000, 0.056897 61 | lookup-8, 512, 1000000, 0.692899 62 | lookup-64, 512, 1000000, 0.695050 63 | bit-parallel, 512, 1000000, 0.487924 64 | bit-parallel-optimized, 512, 1000000, 0.355534 65 | bit-parallel-mul, 512, 1000000, 0.263675 66 | bit-parallel32, 512, 1000000, 0.527307 67 | bit-parallel-optimized32, 512, 1000000, 0.443540 68 | harley-seal, 512, 1000000, 0.127149 69 | builtin-popcnt, 512, 1000000, 0.161856 70 | builtin-popcnt32, 512, 1000000, 0.388484 71 | builtin-popcnt-unrolled, 512, 1000000, 0.155358 72 | builtin-popcnt-unrolled32, 512, 1000000, 0.373766 73 | neon-vcnt, 512, 1000000, 0.084454 74 | neon-HS, 512, 1000000, 0.128557 75 | aarch64-cnt, 512, 1000000, 0.082228 76 | lookup-8, 1024, 500000, 0.685299 77 | lookup-64, 1024, 500000, 0.685371 78 | bit-parallel, 1024, 500000, 0.478433 79 | bit-parallel-optimized, 1024, 500000, 0.346344 80 | bit-parallel-mul, 1024, 500000, 0.251572 81 | bit-parallel32, 1024, 500000, 0.516698 82 | bit-parallel-optimized32, 1024, 500000, 0.428121 83 | harley-seal, 1024, 500000, 0.116068 84 | builtin-popcnt, 1024, 500000, 0.156242 85 | builtin-popcnt32, 1024, 500000, 0.382531 86 | builtin-popcnt-unrolled, 1024, 500000, 0.151822 87 | builtin-popcnt-unrolled32, 1024, 500000, 0.370425 88 | neon-vcnt, 1024, 500000, 0.077243 89 | neon-HS, 1024, 500000, 0.116519 90 | aarch64-cnt, 1024, 500000, 0.077384 91 | lookup-8, 2048, 250000, 0.681751 92 | lookup-64, 2048, 250000, 0.682310 93 | bit-parallel, 2048, 250000, 0.475164 94 | bit-parallel-optimized, 2048, 250000, 0.343156 95 | bit-parallel-mul, 2048, 250000, 0.242102 96 | bit-parallel32, 2048, 250000, 0.513199 97 | bit-parallel-optimized32, 2048, 250000, 0.456079 98 | harley-seal, 2048, 250000, 0.112855 99 | builtin-popcnt, 2048, 250000, 0.153486 100 | builtin-popcnt32, 2048, 250000, 0.379641 101 | builtin-popcnt-unrolled, 2048, 250000, 0.150076 102 | builtin-popcnt-unrolled32, 2048, 
250000, 0.369033 103 | neon-vcnt, 2048, 250000, 0.078042 104 | neon-HS, 2048, 250000, 0.113880 105 | aarch64-cnt, 2048, 250000, 0.074639 106 | lookup-8, 4096, 125000, 0.679827 107 | lookup-64, 4096, 125000, 0.680086 108 | bit-parallel, 4096, 125000, 0.472903 109 | bit-parallel-optimized, 4096, 125000, 0.341028 110 | bit-parallel-mul, 4096, 125000, 0.243314 111 | bit-parallel32, 4096, 125000, 0.510773 112 | bit-parallel-optimized32, 4096, 125000, 0.454050 113 | harley-seal, 4096, 125000, 0.109090 114 | builtin-popcnt, 4096, 125000, 0.152040 115 | builtin-popcnt32, 4096, 125000, 0.378102 116 | builtin-popcnt-unrolled, 4096, 125000, 0.149179 117 | builtin-popcnt-unrolled32, 4096, 125000, 0.368040 118 | neon-vcnt, 4096, 125000, 0.075548 119 | neon-HS, 4096, 125000, 0.111219 120 | aarch64-cnt, 4096, 125000, 0.073878 121 | lookup-8, 32, 10000000, 0.441378 122 | lookup-64, 32, 10000000, 0.459200 123 | bit-parallel, 32, 10000000, 0.329540 124 | bit-parallel-optimized, 32, 10000000, 0.241284 125 | bit-parallel-mul, 32, 10000000, 0.247167 126 | bit-parallel32, 32, 10000000, 0.353064 127 | bit-parallel-optimized32, 32, 10000000, 0.308747 128 | harley-seal, 32, 10000000, 0.606144 129 | builtin-popcnt, 32, 10000000, 0.117691 130 | builtin-popcnt32, 32, 10000000, 0.236035 131 | builtin-popcnt-unrolled, 32, 10000000, 0.092680 132 | builtin-popcnt-unrolled32, 32, 10000000, 0.229689 133 | neon-vcnt, 32, 10000000, 0.576688 134 | neon-HS, 32, 10000000, 0.720784 135 | aarch64-cnt, 32, 10000000, 0.570795 136 | lookup-8, 64, 5000000, 0.497255 137 | lookup-64, 64, 5000000, 0.497257 138 | bit-parallel, 64, 5000000, 0.311882 139 | bit-parallel-optimized, 64, 5000000, 0.226672 140 | bit-parallel-mul, 64, 5000000, 0.171149 141 | bit-parallel32, 64, 5000000, 0.335465 142 | bit-parallel-optimized32, 64, 5000000, 0.282039 143 | harley-seal, 64, 5000000, 0.366915 144 | builtin-popcnt, 64, 5000000, 0.097097 145 | builtin-popcnt32, 64, 5000000, 0.236089 146 | builtin-popcnt-unrolled, 64, 5000000, 
0.093349 147 | builtin-popcnt-unrolled32, 64, 5000000, 0.229280 148 | neon-vcnt, 64, 5000000, 0.465313 149 | neon-HS, 64, 5000000, 0.539487 150 | aarch64-cnt, 64, 5000000, 0.464885 151 | lookup-8, 128, 2500000, 0.460493 152 | lookup-64, 128, 2500000, 0.460671 153 | bit-parallel, 128, 2500000, 0.303112 154 | bit-parallel-optimized, 128, 2500000, 0.219238 155 | bit-parallel-mul, 128, 2500000, 0.197132 156 | bit-parallel32, 128, 2500000, 0.326594 157 | bit-parallel-optimized32, 128, 2500000, 0.276007 158 | harley-seal, 128, 2500000, 0.120633 159 | builtin-popcnt, 128, 2500000, 0.094154 160 | builtin-popcnt32, 128, 2500000, 0.264802 161 | builtin-popcnt-unrolled, 128, 2500000, 0.092682 162 | builtin-popcnt-unrolled32, 128, 2500000, 0.229445 163 | neon-vcnt, 128, 2500000, 0.065468 164 | neon-HS, 128, 2500000, 0.465915 165 | aarch64-cnt, 128, 2500000, 0.425169 166 | lookup-8, 256, 1250000, 0.442123 167 | lookup-64, 256, 1250000, 0.445044 168 | bit-parallel, 256, 1250000, 0.298644 169 | bit-parallel-optimized, 256, 1250000, 0.215530 170 | bit-parallel-mul, 256, 1250000, 0.174333 171 | bit-parallel32, 256, 1250000, 0.341308 172 | bit-parallel-optimized32, 256, 1250000, 0.303117 173 | harley-seal, 256, 1250000, 0.093048 174 | builtin-popcnt, 256, 1250000, 0.108131 175 | builtin-popcnt32, 256, 1250000, 0.250091 176 | builtin-popcnt-unrolled, 256, 1250000, 0.092688 177 | builtin-popcnt-unrolled32, 256, 1250000, 0.237607 178 | neon-vcnt, 256, 1250000, 0.055705 179 | neon-HS, 256, 1250000, 0.091413 180 | aarch64-cnt, 256, 1250000, 0.056896 181 | lookup-8, 512, 1000000, 0.692648 182 | lookup-64, 512, 1000000, 0.695039 183 | bit-parallel, 512, 1000000, 0.487832 184 | bit-parallel-optimized, 512, 1000000, 0.355438 185 | bit-parallel-mul, 512, 1000000, 0.263622 186 | bit-parallel32, 512, 1000000, 0.527289 187 | bit-parallel-optimized32, 512, 1000000, 0.468412 188 | harley-seal, 512, 1000000, 0.127107 189 | builtin-popcnt, 512, 1000000, 0.161829 190 | builtin-popcnt32, 512, 1000000, 
0.388397 191 | builtin-popcnt-unrolled, 512, 1000000, 0.155356 192 | builtin-popcnt-unrolled32, 512, 1000000, 0.373668 193 | neon-vcnt, 512, 1000000, 0.084737 194 | neon-HS, 512, 1000000, 0.128518 195 | aarch64-cnt, 512, 1000000, 0.082383 196 | lookup-8, 1024, 500000, 0.685303 197 | lookup-64, 1024, 500000, 0.686584 198 | bit-parallel, 1024, 500000, 0.479296 199 | bit-parallel-optimized, 1024, 500000, 0.347242 200 | bit-parallel-mul, 1024, 500000, 0.250701 201 | bit-parallel32, 1024, 500000, 0.517843 202 | bit-parallel-optimized32, 1024, 500000, 0.460168 203 | harley-seal, 1024, 500000, 0.116073 204 | builtin-popcnt, 1024, 500000, 0.156237 205 | builtin-popcnt32, 1024, 500000, 0.382546 206 | builtin-popcnt-unrolled, 1024, 500000, 0.151841 207 | builtin-popcnt-unrolled32, 1024, 500000, 0.370440 208 | neon-vcnt, 1024, 500000, 0.077186 209 | neon-HS, 1024, 500000, 0.116516 210 | aarch64-cnt, 1024, 500000, 0.077384 211 | lookup-8, 2048, 250000, 0.681786 212 | lookup-64, 2048, 250000, 0.682281 213 | bit-parallel, 2048, 250000, 0.475075 214 | bit-parallel-optimized, 2048, 250000, 0.343189 215 | bit-parallel-mul, 2048, 250000, 0.242054 216 | bit-parallel32, 2048, 250000, 0.513219 217 | bit-parallel-optimized32, 2048, 250000, 0.456128 218 | harley-seal, 2048, 250000, 0.112849 219 | builtin-popcnt, 2048, 250000, 0.153461 220 | builtin-popcnt32, 2048, 250000, 0.379638 221 | builtin-popcnt-unrolled, 2048, 250000, 0.150085 222 | builtin-popcnt-unrolled32, 2048, 250000, 0.368940 223 | neon-vcnt, 2048, 250000, 0.077825 224 | neon-HS, 2048, 250000, 0.114076 225 | aarch64-cnt, 2048, 250000, 0.074633 226 | lookup-8, 4096, 125000, 0.680066 227 | lookup-64, 4096, 125000, 0.679877 228 | bit-parallel, 4096, 125000, 0.472755 229 | bit-parallel-optimized, 4096, 125000, 0.340924 230 | bit-parallel-mul, 4096, 125000, 0.243401 231 | bit-parallel32, 4096, 125000, 0.510907 232 | bit-parallel-optimized32, 4096, 125000, 0.454068 233 | harley-seal, 4096, 125000, 0.109106 234 | builtin-popcnt, 
4096, 125000, 0.152066 235 | builtin-popcnt32, 4096, 125000, 0.378157 236 | builtin-popcnt-unrolled, 4096, 125000, 0.149203 237 | builtin-popcnt-unrolled32, 4096, 125000, 0.368079 238 | neon-vcnt, 4096, 125000, 0.075149 239 | neon-HS, 4096, 125000, 0.111005 240 | aarch64-cnt, 4096, 125000, 0.073901 241 | lookup-8, 32, 10000000, 0.441436 242 | lookup-64, 32, 10000000, 0.458986 243 | bit-parallel, 32, 10000000, 0.329402 244 | bit-parallel-optimized, 32, 10000000, 0.241296 245 | bit-parallel-mul, 32, 10000000, 0.248751 246 | bit-parallel32, 32, 10000000, 0.353062 247 | bit-parallel-optimized32, 32, 10000000, 0.304034 248 | harley-seal, 32, 10000000, 0.606173 249 | builtin-popcnt, 32, 10000000, 0.117697 250 | builtin-popcnt32, 32, 10000000, 0.236038 251 | builtin-popcnt-unrolled, 32, 10000000, 0.092680 252 | builtin-popcnt-unrolled32, 32, 10000000, 0.229935 253 | neon-vcnt, 32, 10000000, 0.576710 254 | neon-HS, 32, 10000000, 0.721302 255 | aarch64-cnt, 32, 10000000, 0.570811 256 | lookup-8, 64, 5000000, 0.497289 257 | lookup-64, 64, 5000000, 0.509086 258 | bit-parallel, 64, 5000000, 0.311897 259 | bit-parallel-optimized, 64, 5000000, 0.226553 260 | bit-parallel-mul, 64, 5000000, 0.171651 261 | bit-parallel32, 64, 5000000, 0.335422 262 | bit-parallel-optimized32, 64, 5000000, 0.287247 263 | harley-seal, 64, 5000000, 0.366962 264 | builtin-popcnt, 64, 5000000, 0.097096 265 | builtin-popcnt32, 64, 5000000, 0.236810 266 | builtin-popcnt-unrolled, 64, 5000000, 0.092684 267 | builtin-popcnt-unrolled32, 64, 5000000, 0.229430 268 | neon-vcnt, 64, 5000000, 0.476672 269 | neon-HS, 64, 5000000, 0.554714 270 | aarch64-cnt, 64, 5000000, 0.473740 271 | lookup-8, 128, 2500000, 0.460538 272 | lookup-64, 128, 2500000, 0.466578 273 | bit-parallel, 128, 2500000, 0.303064 274 | bit-parallel-optimized, 128, 2500000, 0.219200 275 | bit-parallel-mul, 128, 2500000, 0.200078 276 | bit-parallel32, 128, 2500000, 0.326601 277 | bit-parallel-optimized32, 128, 2500000, 0.275919 278 | harley-seal, 
128, 2500000, 0.120634 279 | builtin-popcnt, 128, 2500000, 0.094151 280 | builtin-popcnt32, 128, 2500000, 0.264819 281 | builtin-popcnt-unrolled, 128, 2500000, 0.092685 282 | builtin-popcnt-unrolled32, 128, 2500000, 0.229434 283 | neon-vcnt, 128, 2500000, 0.065483 284 | neon-HS, 128, 2500000, 0.485485 285 | aarch64-cnt, 128, 2500000, 0.425154 286 | lookup-8, 256, 1250000, 0.442167 287 | lookup-64, 256, 1250000, 0.442175 288 | bit-parallel, 256, 1250000, 0.298757 289 | bit-parallel-optimized, 256, 1250000, 0.215544 290 | bit-parallel-mul, 256, 1250000, 0.175119 291 | bit-parallel32, 256, 1250000, 0.338420 292 | bit-parallel-optimized32, 256, 1250000, 0.281749 293 | harley-seal, 256, 1250000, 0.093058 294 | builtin-popcnt, 256, 1250000, 0.108150 295 | builtin-popcnt32, 256, 1250000, 0.250143 296 | builtin-popcnt-unrolled, 256, 1250000, 0.092696 297 | builtin-popcnt-unrolled32, 256, 1250000, 0.237699 298 | neon-vcnt, 256, 1250000, 0.055996 299 | neon-HS, 256, 1250000, 0.091127 300 | aarch64-cnt, 256, 1250000, 0.056908 301 | lookup-8, 512, 1000000, 0.692810 302 | lookup-64, 512, 1000000, 0.692690 303 | bit-parallel, 512, 1000000, 0.486104 304 | bit-parallel-optimized, 512, 1000000, 0.353687 305 | bit-parallel-mul, 512, 1000000, 0.260119 306 | bit-parallel32, 512, 1000000, 0.524934 307 | bit-parallel-optimized32, 512, 1000000, 0.435450 308 | harley-seal, 512, 1000000, 0.127110 309 | builtin-popcnt, 512, 1000000, 0.161822 310 | builtin-popcnt32, 512, 1000000, 0.388965 311 | builtin-popcnt-unrolled, 512, 1000000, 0.155358 312 | builtin-popcnt-unrolled32, 512, 1000000, 0.373671 313 | neon-vcnt, 512, 1000000, 0.082946 314 | neon-HS, 512, 1000000, 0.128598 315 | aarch64-cnt, 512, 1000000, 0.082388 316 | lookup-8, 1024, 500000, 0.685492 317 | lookup-64, 1024, 500000, 0.686556 318 | bit-parallel, 1024, 500000, 0.479366 319 | bit-parallel-optimized, 1024, 500000, 0.347205 320 | bit-parallel-mul, 1024, 500000, 0.250765 321 | bit-parallel32, 1024, 500000, 0.517997 322 | 
bit-parallel-optimized32, 1024, 500000, 0.460282 323 | harley-seal, 1024, 500000, 0.116128 324 | builtin-popcnt, 1024, 500000, 0.156260 325 | builtin-popcnt32, 1024, 500000, 0.382556 326 | builtin-popcnt-unrolled, 1024, 500000, 0.151827 327 | builtin-popcnt-unrolled32, 1024, 500000, 0.370487 328 | neon-vcnt, 1024, 500000, 0.077196 329 | neon-HS, 1024, 500000, 0.116536 330 | aarch64-cnt, 1024, 500000, 0.077423 331 | lookup-8, 2048, 250000, 0.681629 332 | lookup-64, 2048, 250000, 0.682229 333 | bit-parallel, 2048, 250000, 0.475029 334 | bit-parallel-optimized, 2048, 250000, 0.343077 335 | bit-parallel-mul, 2048, 250000, 0.241998 336 | bit-parallel32, 2048, 250000, 0.513136 337 | bit-parallel-optimized32, 2048, 250000, 0.456106 338 | harley-seal, 2048, 250000, 0.112838 339 | builtin-popcnt, 2048, 250000, 0.153442 340 | builtin-popcnt32, 2048, 250000, 0.379590 341 | builtin-popcnt-unrolled, 2048, 250000, 0.150071 342 | builtin-popcnt-unrolled32, 2048, 250000, 0.368809 343 | neon-vcnt, 2048, 250000, 0.077908 344 | neon-HS, 2048, 250000, 0.113951 345 | aarch64-cnt, 2048, 250000, 0.074668 346 | lookup-8, 4096, 125000, 0.679858 347 | lookup-64, 4096, 125000, 0.680148 348 | bit-parallel, 4096, 125000, 0.472983 349 | bit-parallel-optimized, 4096, 125000, 0.341068 350 | bit-parallel-mul, 4096, 125000, 0.243377 351 | bit-parallel32, 4096, 125000, 0.510887 352 | bit-parallel-optimized32, 4096, 125000, 0.454094 353 | harley-seal, 4096, 125000, 0.109095 354 | builtin-popcnt, 4096, 125000, 0.152064 355 | builtin-popcnt32, 4096, 125000, 0.378158 356 | builtin-popcnt-unrolled, 4096, 125000, 0.149182 357 | builtin-popcnt-unrolled32, 4096, 125000, 0.368035 358 | neon-vcnt, 4096, 125000, 0.075089 359 | neon-HS, 4096, 125000, 0.111170 360 | aarch64-cnt, 4096, 125000, 0.073923 361 | lookup-8, 32, 10000000, 0.441512 362 | lookup-64, 32, 10000000, 0.458985 363 | bit-parallel, 32, 10000000, 0.329353 364 | bit-parallel-optimized, 32, 10000000, 0.241286 365 | bit-parallel-mul, 32, 10000000, 
0.247165 366 | bit-parallel32, 32, 10000000, 0.353070 367 | bit-parallel-optimized32, 32, 10000000, 0.303578 368 | harley-seal, 32, 10000000, 0.606187 369 | builtin-popcnt, 32, 10000000, 0.117697 370 | builtin-popcnt32, 32, 10000000, 0.236247 371 | builtin-popcnt-unrolled, 32, 10000000, 0.092695 372 | builtin-popcnt-unrolled32, 32, 10000000, 0.229665 373 | neon-vcnt, 32, 10000000, 0.576682 374 | neon-HS, 32, 10000000, 0.718420 375 | aarch64-cnt, 32, 10000000, 0.570830 376 | lookup-8, 64, 5000000, 0.497341 377 | lookup-64, 64, 5000000, 0.509211 378 | bit-parallel, 64, 5000000, 0.311956 379 | bit-parallel-optimized, 64, 5000000, 0.226646 380 | bit-parallel-mul, 64, 5000000, 0.171861 381 | bit-parallel32, 64, 5000000, 0.335462 382 | bit-parallel-optimized32, 64, 5000000, 0.284381 383 | harley-seal, 64, 5000000, 0.366811 384 | builtin-popcnt, 64, 5000000, 0.097112 385 | builtin-popcnt32, 64, 5000000, 0.236879 386 | builtin-popcnt-unrolled, 64, 5000000, 0.092696 387 | builtin-popcnt-unrolled32, 64, 5000000, 0.229463 388 | neon-vcnt, 64, 5000000, 0.476727 389 | neon-HS, 64, 5000000, 0.567021 390 | aarch64-cnt, 64, 5000000, 0.473760 391 | lookup-8, 128, 2500000, 0.460635 392 | lookup-64, 128, 2500000, 0.466363 393 | bit-parallel, 128, 2500000, 0.303092 394 | bit-parallel-optimized, 128, 2500000, 0.219203 395 | bit-parallel-mul, 128, 2500000, 0.200079 396 | bit-parallel32, 128, 2500000, 0.326642 397 | bit-parallel-optimized32, 128, 2500000, 0.275821 398 | harley-seal, 128, 2500000, 0.120639 399 | builtin-popcnt, 128, 2500000, 0.094153 400 | builtin-popcnt32, 128, 2500000, 0.264820 401 | builtin-popcnt-unrolled, 128, 2500000, 0.092692 402 | builtin-popcnt-unrolled32, 128, 2500000, 0.229499 403 | neon-vcnt, 128, 2500000, 0.065476 404 | neon-HS, 128, 2500000, 0.465866 405 | aarch64-cnt, 128, 2500000, 0.425241 406 | lookup-8, 256, 1250000, 0.442126 407 | lookup-64, 256, 1250000, 0.445022 408 | bit-parallel, 256, 1250000, 0.298678 409 | bit-parallel-optimized, 256, 1250000, 
0.215524 410 | bit-parallel-mul, 256, 1250000, 0.174329 411 | bit-parallel32, 256, 1250000, 0.341468 412 | bit-parallel-optimized32, 256, 1250000, 0.303050 413 | harley-seal, 256, 1250000, 0.093049 414 | builtin-popcnt, 256, 1250000, 0.108131 415 | builtin-popcnt32, 256, 1250000, 0.250105 416 | builtin-popcnt-unrolled, 256, 1250000, 0.092680 417 | builtin-popcnt-unrolled32, 256, 1250000, 0.237649 418 | neon-vcnt, 256, 1250000, 0.056296 419 | neon-HS, 256, 1250000, 0.091060 420 | aarch64-cnt, 256, 1250000, 0.056888 421 | lookup-8, 512, 1000000, 0.692666 422 | lookup-64, 512, 1000000, 0.694974 423 | bit-parallel, 512, 1000000, 0.487859 424 | bit-parallel-optimized, 512, 1000000, 0.355440 425 | bit-parallel-mul, 512, 1000000, 0.263630 426 | bit-parallel32, 512, 1000000, 0.527275 427 | bit-parallel-optimized32, 512, 1000000, 0.468410 428 | harley-seal, 512, 1000000, 0.127106 429 | builtin-popcnt, 512, 1000000, 0.161824 430 | builtin-popcnt32, 512, 1000000, 0.388421 431 | builtin-popcnt-unrolled, 512, 1000000, 0.155350 432 | builtin-popcnt-unrolled32, 512, 1000000, 0.373689 433 | neon-vcnt, 512, 1000000, 0.084754 434 | neon-HS, 512, 1000000, 0.128523 435 | aarch64-cnt, 512, 1000000, 0.082287 436 | lookup-8, 1024, 500000, 0.685295 437 | lookup-64, 1024, 500000, 0.686439 438 | bit-parallel, 1024, 500000, 0.479331 439 | bit-parallel-optimized, 1024, 500000, 0.347182 440 | bit-parallel-mul, 1024, 500000, 0.250706 441 | bit-parallel32, 1024, 500000, 0.517862 442 | bit-parallel-optimized32, 1024, 500000, 0.460177 443 | harley-seal, 1024, 500000, 0.116081 444 | builtin-popcnt, 1024, 500000, 0.156232 445 | builtin-popcnt32, 1024, 500000, 0.382562 446 | builtin-popcnt-unrolled, 1024, 500000, 0.151818 447 | builtin-popcnt-unrolled32, 1024, 500000, 0.370459 448 | neon-vcnt, 1024, 500000, 0.077188 449 | neon-HS, 1024, 500000, 0.116523 450 | aarch64-cnt, 1024, 500000, 0.077386 451 | lookup-8, 2048, 250000, 0.681699 452 | lookup-64, 2048, 250000, 0.682344 453 | bit-parallel, 2048, 
250000, 0.475137 454 | bit-parallel-optimized, 2048, 250000, 0.343163 455 | bit-parallel-mul, 2048, 250000, 0.242081 456 | bit-parallel32, 2048, 250000, 0.513178 457 | bit-parallel-optimized32, 2048, 250000, 0.456133 458 | harley-seal, 2048, 250000, 0.112872 459 | builtin-popcnt, 2048, 250000, 0.153466 460 | builtin-popcnt32, 2048, 250000, 0.379646 461 | builtin-popcnt-unrolled, 2048, 250000, 0.150097 462 | builtin-popcnt-unrolled32, 2048, 250000, 0.368928 463 | neon-vcnt, 2048, 250000, 0.078084 464 | neon-HS, 2048, 250000, 0.113837 465 | aarch64-cnt, 2048, 250000, 0.074693 466 | lookup-8, 4096, 125000, 0.679796 467 | lookup-64, 4096, 125000, 0.679902 468 | bit-parallel, 4096, 125000, 0.472675 469 | bit-parallel-optimized, 4096, 125000, 0.340797 470 | bit-parallel-mul, 4096, 125000, 0.244873 471 | bit-parallel32, 4096, 125000, 0.510497 472 | bit-parallel-optimized32, 4096, 125000, 0.422497 473 | harley-seal, 4096, 125000, 0.109086 474 | builtin-popcnt, 4096, 125000, 0.152050 475 | builtin-popcnt32, 4096, 125000, 0.378128 476 | builtin-popcnt-unrolled, 4096, 125000, 0.149186 477 | builtin-popcnt-unrolled32, 4096, 125000, 0.368000 478 | neon-vcnt, 4096, 125000, 0.074806 479 | neon-HS, 4096, 125000, 0.110893 480 | aarch64-cnt, 4096, 125000, 0.073338 481 | lookup-8, 32, 10000000, 0.441485 482 | lookup-64, 32, 10000000, 0.459040 483 | bit-parallel, 32, 10000000, 0.329638 484 | bit-parallel-optimized, 32, 10000000, 0.241343 485 | bit-parallel-mul, 32, 10000000, 0.247172 486 | bit-parallel32, 32, 10000000, 0.353215 487 | bit-parallel-optimized32, 32, 10000000, 0.305807 488 | harley-seal, 32, 10000000, 0.606190 489 | builtin-popcnt, 32, 10000000, 0.117698 490 | builtin-popcnt32, 32, 10000000, 0.235751 491 | builtin-popcnt-unrolled, 32, 10000000, 0.092762 492 | builtin-popcnt-unrolled32, 32, 10000000, 0.229552 493 | neon-vcnt, 32, 10000000, 0.577724 494 | neon-HS, 32, 10000000, 0.715818 495 | aarch64-cnt, 32, 10000000, 0.570883 496 | lookup-8, 64, 5000000, 0.497813 497 | 
lookup-64, 64, 5000000, 0.509044 498 | bit-parallel, 64, 5000000, 0.313493 499 | bit-parallel-optimized, 64, 5000000, 0.226561 500 | bit-parallel-mul, 64, 5000000, 0.171959 501 | bit-parallel32, 64, 5000000, 0.335435 502 | bit-parallel-optimized32, 64, 5000000, 0.287760 503 | harley-seal, 64, 5000000, 0.366791 504 | builtin-popcnt, 64, 5000000, 0.097132 505 | builtin-popcnt32, 64, 5000000, 0.235386 506 | builtin-popcnt-unrolled, 64, 5000000, 0.092680 507 | builtin-popcnt-unrolled32, 64, 5000000, 0.229498 508 | neon-vcnt, 64, 5000000, 0.476666 509 | neon-HS, 64, 5000000, 0.545690 510 | aarch64-cnt, 64, 5000000, 0.473747 511 | lookup-8, 128, 2500000, 0.460587 512 | lookup-64, 128, 2500000, 0.466506 513 | bit-parallel, 128, 2500000, 0.303211 514 | bit-parallel-optimized, 128, 2500000, 0.219276 515 | bit-parallel-mul, 128, 2500000, 0.200133 516 | bit-parallel32, 128, 2500000, 0.326662 517 | bit-parallel-optimized32, 128, 2500000, 0.275690 518 | harley-seal, 128, 2500000, 0.120649 519 | builtin-popcnt, 128, 2500000, 0.094180 520 | builtin-popcnt32, 128, 2500000, 0.264879 521 | builtin-popcnt-unrolled, 128, 2500000, 0.092694 522 | builtin-popcnt-unrolled32, 128, 2500000, 0.229414 523 | neon-vcnt, 128, 2500000, 0.065473 524 | neon-HS, 128, 2500000, 0.480661 525 | aarch64-cnt, 128, 2500000, 0.425294 526 | lookup-8, 256, 1250000, 0.442101 527 | lookup-64, 256, 1250000, 0.445065 528 | bit-parallel, 256, 1250000, 0.298646 529 | bit-parallel-optimized, 256, 1250000, 0.215515 530 | bit-parallel-mul, 256, 1250000, 0.174338 531 | bit-parallel32, 256, 1250000, 0.341294 532 | bit-parallel-optimized32, 256, 1250000, 0.301112 533 | harley-seal, 256, 1250000, 0.093060 534 | builtin-popcnt, 256, 1250000, 0.108130 535 | builtin-popcnt32, 256, 1250000, 0.250107 536 | builtin-popcnt-unrolled, 256, 1250000, 0.092681 537 | builtin-popcnt-unrolled32, 256, 1250000, 0.237624 538 | neon-vcnt, 256, 1250000, 0.056109 539 | neon-HS, 256, 1250000, 0.092155 540 | aarch64-cnt, 256, 1250000, 0.056886 
541 | lookup-8, 512, 1000000, 0.694970 542 | lookup-64, 512, 1000000, 0.694963 543 | bit-parallel, 512, 1000000, 0.487858 544 | bit-parallel-optimized, 512, 1000000, 0.355425 545 | bit-parallel-mul, 512, 1000000, 0.263639 546 | bit-parallel32, 512, 1000000, 0.527249 547 | bit-parallel-optimized32, 512, 1000000, 0.468410 548 | harley-seal, 512, 1000000, 0.127111 549 | builtin-popcnt, 512, 1000000, 0.161885 550 | builtin-popcnt32, 512, 1000000, 0.388379 551 | builtin-popcnt-unrolled, 512, 1000000, 0.155354 552 | builtin-popcnt-unrolled32, 512, 1000000, 0.373663 553 | neon-vcnt, 512, 1000000, 0.083657 554 | neon-HS, 512, 1000000, 0.128519 555 | aarch64-cnt, 512, 1000000, 0.082383 556 | lookup-8, 1024, 500000, 0.685397 557 | lookup-64, 1024, 500000, 0.686562 558 | bit-parallel, 1024, 500000, 0.479430 559 | bit-parallel-optimized, 1024, 500000, 0.347248 560 | bit-parallel-mul, 1024, 500000, 0.250727 561 | bit-parallel32, 1024, 500000, 0.517907 562 | bit-parallel-optimized32, 1024, 500000, 0.460255 563 | harley-seal, 1024, 500000, 0.116101 564 | builtin-popcnt, 1024, 500000, 0.156270 565 | builtin-popcnt32, 1024, 500000, 0.382608 566 | builtin-popcnt-unrolled, 1024, 500000, 0.151836 567 | builtin-popcnt-unrolled32, 1024, 500000, 0.370550 568 | neon-vcnt, 1024, 500000, 0.077197 569 | neon-HS, 1024, 500000, 0.116593 570 | aarch64-cnt, 1024, 500000, 0.077406 571 | lookup-8, 2048, 250000, 0.681645 572 | lookup-64, 2048, 250000, 0.681608 573 | bit-parallel, 2048, 250000, 0.474606 574 | bit-parallel-optimized, 2048, 250000, 0.344134 575 | bit-parallel-mul, 2048, 250000, 0.247177 576 | bit-parallel32, 2048, 250000, 0.512586 577 | bit-parallel-optimized32, 2048, 250000, 0.424286 578 | harley-seal, 2048, 250000, 0.112843 579 | builtin-popcnt, 2048, 250000, 0.153440 580 | builtin-popcnt32, 2048, 250000, 0.379549 581 | builtin-popcnt-unrolled, 2048, 250000, 0.150058 582 | builtin-popcnt-unrolled32, 2048, 250000, 0.368825 583 | neon-vcnt, 2048, 250000, 0.077857 584 | neon-HS, 2048, 
250000, 0.113954 585 | aarch64-cnt, 2048, 250000, 0.074667 586 | lookup-8, 4096, 125000, 0.679858 587 | lookup-64, 4096, 125000, 0.680060 588 | bit-parallel, 4096, 125000, 0.472906 589 | bit-parallel-optimized, 4096, 125000, 0.341053 590 | bit-parallel-mul, 4096, 125000, 0.243311 591 | bit-parallel32, 4096, 125000, 0.510806 592 | bit-parallel-optimized32, 4096, 125000, 0.453994 593 | harley-seal, 4096, 125000, 0.109085 594 | builtin-popcnt, 4096, 125000, 0.152049 595 | builtin-popcnt32, 4096, 125000, 0.378130 596 | builtin-popcnt-unrolled, 4096, 125000, 0.149196 597 | builtin-popcnt-unrolled32, 4096, 125000, 0.368001 598 | neon-vcnt, 4096, 125000, 0.075443 599 | neon-HS, 4096, 125000, 0.110874 600 | aarch64-cnt, 4096, 125000, 0.073874 601 | -------------------------------------------------------------------------------- /results/arm/arm-64bit-clang3.8.0.metadata: -------------------------------------------------------------------------------- 1 | cpu=Cortex A57 (AMD Opteron A1100) 2 | architecture=64-bit 3 | compiler=clang 3.8.0 (tags/RELEASE_380/final 262553) 4 | runs=5 5 | spec=https://shop.softiron.com/product/overdrive-1000/ 6 | -------------------------------------------------------------------------------- /results/arm/arm-64bit-gcc4.8.5.csv: -------------------------------------------------------------------------------- 1 | lookup-8, 32, 10000000, 0.417926 2 | lookup-64, 32, 10000000, 0.432530 3 | bit-parallel, 32, 10000000, 0.305096 4 | bit-parallel-optimized, 32, 10000000, 0.244263 5 | bit-parallel-mul, 32, 10000000, 0.200080 6 | bit-parallel32, 32, 10000000, 0.488445 7 | bit-parallel-optimized32, 32, 10000000, 0.420894 8 | harley-seal, 32, 10000000, 0.317784 9 | builtin-popcnt, 32, 10000000, 0.506175 10 | builtin-popcnt32, 32, 10000000, 0.970984 11 | builtin-popcnt-unrolled, 32, 10000000, 0.488428 12 | builtin-popcnt-unrolled32, 32, 10000000, 0.953378 13 | neon-vcnt, 32, 10000000, 0.594403 14 | neon-HS, 32, 10000000, 0.785591 15 | lookup-8, 64, 5000000, 
0.456098 16 | lookup-64, 64, 5000000, 0.491358 17 | bit-parallel, 64, 5000000, 0.291289 18 | bit-parallel-optimized, 64, 5000000, 0.235378 19 | bit-parallel-mul, 64, 5000000, 0.163292 20 | bit-parallel32, 64, 5000000, 0.467856 21 | bit-parallel-optimized32, 64, 5000000, 0.413378 22 | harley-seal, 64, 5000000, 0.229493 23 | builtin-popcnt, 64, 5000000, 0.829735 24 | builtin-popcnt32, 64, 5000000, 0.929300 25 | builtin-popcnt-unrolled, 64, 5000000, 0.373433 26 | builtin-popcnt-unrolled32, 64, 5000000, 0.692674 27 | neon-vcnt, 64, 5000000, 0.479621 28 | neon-HS, 64, 5000000, 0.589748 29 | lookup-8, 128, 2500000, 0.414982 30 | lookup-64, 128, 2500000, 0.452015 31 | bit-parallel, 128, 2500000, 0.285427 32 | bit-parallel-optimized, 128, 2500000, 0.225171 33 | bit-parallel-mul, 128, 2500000, 0.187606 34 | bit-parallel32, 128, 2500000, 0.463460 35 | bit-parallel-optimized32, 128, 2500000, 0.409773 36 | harley-seal, 128, 2500000, 0.130938 37 | builtin-popcnt, 128, 2500000, 0.495840 38 | builtin-popcnt32, 128, 2500000, 0.965202 39 | builtin-popcnt-unrolled, 128, 2500000, 0.453208 40 | builtin-popcnt-unrolled32, 128, 2500000, 0.891947 41 | neon-vcnt, 128, 2500000, 0.090271 42 | neon-HS, 128, 2500000, 0.482137 43 | lookup-8, 256, 1250000, 0.395769 44 | lookup-64, 256, 1250000, 0.422958 45 | bit-parallel, 256, 1250000, 0.280981 46 | bit-parallel-optimized, 256, 1250000, 0.219938 47 | bit-parallel-mul, 256, 1250000, 0.164033 48 | bit-parallel32, 256, 1250000, 0.482525 49 | bit-parallel-optimized32, 256, 1250000, 0.428117 50 | harley-seal, 256, 1250000, 0.101531 51 | builtin-popcnt, 256, 1250000, 0.483281 52 | builtin-popcnt32, 256, 1250000, 0.953306 53 | builtin-popcnt-unrolled, 256, 1250000, 0.446496 54 | builtin-popcnt-unrolled32, 256, 1250000, 0.888746 55 | neon-vcnt, 256, 1250000, 0.075025 56 | neon-HS, 256, 1250000, 0.095978 57 | lookup-8, 512, 1000000, 0.617986 58 | lookup-64, 512, 1000000, 0.658603 59 | bit-parallel, 512, 1000000, 0.460777 60 | bit-parallel-optimized, 
512, 1000000, 0.362593 61 | bit-parallel-mul, 512, 1000000, 0.245406 62 | bit-parallel32, 512, 1000000, 0.753278 63 | bit-parallel-optimized32, 512, 1000000, 0.667360 64 | harley-seal, 512, 1000000, 0.138883 65 | builtin-popcnt, 512, 1000000, 0.763418 66 | builtin-popcnt32, 512, 1000000, 1.517166 67 | builtin-popcnt-unrolled, 512, 1000000, 0.709181 68 | builtin-popcnt-unrolled32, 512, 1000000, 1.407855 69 | neon-vcnt, 512, 1000000, 0.106769 70 | neon-HS, 512, 1000000, 0.131847 71 | lookup-8, 1024, 500000, 0.610653 72 | lookup-64, 1024, 500000, 0.649347 73 | bit-parallel, 1024, 500000, 0.451650 74 | bit-parallel-optimized, 1024, 500000, 0.353070 75 | bit-parallel-mul, 1024, 500000, 0.237585 76 | bit-parallel32, 1024, 500000, 0.743844 77 | bit-parallel-optimized32, 1024, 500000, 0.658507 78 | harley-seal, 1024, 500000, 0.127108 79 | builtin-popcnt, 1024, 500000, 0.836553 80 | builtin-popcnt32, 1024, 500000, 1.136867 81 | builtin-popcnt-unrolled, 1024, 500000, 0.517245 82 | builtin-popcnt-unrolled32, 1024, 500000, 1.024785 83 | neon-vcnt, 1024, 500000, 0.102239 84 | neon-HS, 1024, 500000, 0.116078 85 | lookup-8, 2048, 250000, 0.606467 86 | lookup-64, 2048, 250000, 0.681305 87 | bit-parallel, 2048, 250000, 0.446335 88 | bit-parallel-optimized, 2048, 250000, 0.347681 89 | bit-parallel-mul, 2048, 250000, 0.232440 90 | bit-parallel32, 2048, 250000, 0.738205 91 | bit-parallel-optimized32, 2048, 250000, 0.653473 92 | harley-seal, 2048, 250000, 0.123318 93 | builtin-popcnt, 2048, 250000, 1.283274 94 | builtin-popcnt32, 2048, 250000, 1.132485 95 | builtin-popcnt-unrolled, 2048, 250000, 0.512814 96 | builtin-popcnt-unrolled32, 2048, 250000, 1.020820 97 | neon-vcnt, 2048, 250000, 0.100544 98 | neon-HS, 2048, 250000, 0.110446 99 | lookup-8, 4096, 125000, 0.604539 100 | lookup-64, 4096, 125000, 0.642552 101 | bit-parallel, 4096, 125000, 0.444792 102 | bit-parallel-optimized, 4096, 125000, 0.346013 103 | bit-parallel-mul, 4096, 125000, 0.230745 104 | bit-parallel32, 4096, 125000, 
0.736777 105 | bit-parallel-optimized32, 4096, 125000, 0.651855 106 | harley-seal, 4096, 125000, 0.119316 107 | builtin-popcnt, 4096, 125000, 0.754467 108 | builtin-popcnt32, 4096, 125000, 1.507636 109 | builtin-popcnt-unrolled, 4096, 125000, 0.698612 110 | builtin-popcnt-unrolled32, 4096, 125000, 1.395229 111 | neon-vcnt, 4096, 125000, 0.096392 112 | neon-HS, 4096, 125000, 0.107254 113 | lookup-8, 32, 10000000, 0.417886 114 | lookup-64, 32, 10000000, 0.432626 115 | bit-parallel, 32, 10000000, 0.305459 116 | bit-parallel-optimized, 32, 10000000, 0.244241 117 | bit-parallel-mul, 32, 10000000, 0.200196 118 | bit-parallel32, 32, 10000000, 0.488494 119 | bit-parallel-optimized32, 32, 10000000, 0.420784 120 | harley-seal, 32, 10000000, 0.317822 121 | builtin-popcnt, 32, 10000000, 0.506084 122 | builtin-popcnt32, 32, 10000000, 0.971016 123 | builtin-popcnt-unrolled, 32, 10000000, 0.500212 124 | builtin-popcnt-unrolled32, 32, 10000000, 0.953382 125 | neon-vcnt, 32, 10000000, 0.594440 126 | neon-HS, 32, 10000000, 0.785930 127 | lookup-8, 64, 5000000, 0.456124 128 | lookup-64, 64, 5000000, 0.491365 129 | bit-parallel, 64, 5000000, 0.291298 130 | bit-parallel-optimized, 64, 5000000, 0.235406 131 | bit-parallel-mul, 64, 5000000, 0.163296 132 | bit-parallel32, 64, 5000000, 0.467811 133 | bit-parallel-optimized32, 64, 5000000, 0.413418 134 | harley-seal, 64, 5000000, 0.229501 135 | builtin-popcnt, 64, 5000000, 0.676548 136 | builtin-popcnt32, 64, 5000000, 0.759100 137 | builtin-popcnt-unrolled, 64, 5000000, 0.358985 138 | builtin-popcnt-unrolled32, 64, 5000000, 0.692674 139 | neon-vcnt, 64, 5000000, 0.485490 140 | neon-HS, 64, 5000000, 0.589832 141 | lookup-8, 128, 2500000, 0.415003 142 | lookup-64, 128, 2500000, 0.445795 143 | bit-parallel, 128, 2500000, 0.285432 144 | bit-parallel-optimized, 128, 2500000, 0.225127 145 | bit-parallel-mul, 128, 2500000, 0.187575 146 | bit-parallel32, 128, 2500000, 0.463445 147 | bit-parallel-optimized32, 128, 2500000, 0.409865 148 | 
harley-seal, 128, 2500000, 0.131003 149 | builtin-popcnt, 128, 2500000, 0.495820 150 | builtin-popcnt32, 128, 2500000, 0.965149 151 | builtin-popcnt-unrolled, 128, 2500000, 0.457626 152 | builtin-popcnt-unrolled32, 128, 2500000, 0.892792 153 | neon-vcnt, 128, 2500000, 0.090792 154 | neon-HS, 128, 2500000, 0.482097 155 | lookup-8, 256, 1250000, 0.395816 156 | lookup-64, 256, 1250000, 0.422950 157 | bit-parallel, 256, 1250000, 0.280990 158 | bit-parallel-optimized, 256, 1250000, 0.219949 159 | bit-parallel-mul, 256, 1250000, 0.164029 160 | bit-parallel32, 256, 1250000, 0.482530 161 | bit-parallel-optimized32, 256, 1250000, 0.428131 162 | harley-seal, 256, 1250000, 0.101511 163 | builtin-popcnt, 256, 1250000, 0.661692 164 | builtin-popcnt32, 256, 1250000, 0.719372 165 | builtin-popcnt-unrolled, 256, 1250000, 0.331726 166 | builtin-popcnt-unrolled32, 256, 1250000, 0.655385 167 | neon-vcnt, 256, 1250000, 0.074767 168 | neon-HS, 256, 1250000, 0.095993 169 | lookup-8, 512, 1000000, 0.617994 170 | lookup-64, 512, 1000000, 0.658654 171 | bit-parallel, 512, 1000000, 0.460780 172 | bit-parallel-optimized, 512, 1000000, 0.362501 173 | bit-parallel-mul, 512, 1000000, 0.245438 174 | bit-parallel32, 512, 1000000, 0.753247 175 | bit-parallel-optimized32, 512, 1000000, 0.667374 176 | harley-seal, 512, 1000000, 0.138883 177 | builtin-popcnt, 512, 1000000, 0.763340 178 | builtin-popcnt32, 512, 1000000, 1.515964 179 | builtin-popcnt-unrolled, 512, 1000000, 0.712119 180 | builtin-popcnt-unrolled32, 512, 1000000, 1.407691 181 | neon-vcnt, 512, 1000000, 0.106543 182 | neon-HS, 512, 1000000, 0.131834 183 | lookup-8, 1024, 500000, 0.610548 184 | lookup-64, 1024, 500000, 0.649349 185 | bit-parallel, 1024, 500000, 0.451661 186 | bit-parallel-optimized, 1024, 500000, 0.353063 187 | bit-parallel-mul, 1024, 500000, 0.237586 188 | bit-parallel32, 1024, 500000, 0.743820 189 | bit-parallel-optimized32, 1024, 500000, 0.658488 190 | harley-seal, 1024, 500000, 0.127105 191 | builtin-popcnt, 1024, 
500000, 0.758233 192 | builtin-popcnt32, 1024, 500000, 1.511139 193 | builtin-popcnt-unrolled, 1024, 500000, 0.702927 194 | builtin-popcnt-unrolled32, 1024, 500000, 1.400527 195 | neon-vcnt, 1024, 500000, 0.101861 196 | neon-HS, 1024, 500000, 0.115950 197 | lookup-8, 2048, 250000, 0.606606 198 | lookup-64, 2048, 250000, 0.681339 199 | bit-parallel, 2048, 250000, 0.446384 200 | bit-parallel-optimized, 2048, 250000, 0.347655 201 | bit-parallel-mul, 2048, 250000, 0.232547 202 | bit-parallel32, 2048, 250000, 0.738224 203 | bit-parallel-optimized32, 2048, 250000, 0.653599 204 | harley-seal, 2048, 250000, 0.123296 205 | builtin-popcnt, 2048, 250000, 0.589609 206 | builtin-popcnt32, 2048, 250000, 1.132600 207 | builtin-popcnt-unrolled, 2048, 250000, 0.512891 208 | builtin-popcnt-unrolled32, 2048, 250000, 1.020925 209 | neon-vcnt, 2048, 250000, 0.100615 210 | neon-HS, 2048, 250000, 0.110228 211 | lookup-8, 4096, 125000, 0.604583 212 | lookup-64, 4096, 125000, 0.642996 213 | bit-parallel, 4096, 125000, 0.444814 214 | bit-parallel-optimized, 4096, 125000, 0.346028 215 | bit-parallel-mul, 4096, 125000, 0.230744 216 | bit-parallel32, 4096, 125000, 0.736799 217 | bit-parallel-optimized32, 4096, 125000, 0.651864 218 | harley-seal, 4096, 125000, 0.119318 219 | builtin-popcnt, 4096, 125000, 0.754476 220 | builtin-popcnt32, 4096, 125000, 1.507642 221 | builtin-popcnt-unrolled, 4096, 125000, 0.698652 222 | builtin-popcnt-unrolled32, 4096, 125000, 1.395251 223 | neon-vcnt, 4096, 125000, 0.098039 224 | neon-HS, 4096, 125000, 0.107250 225 | lookup-8, 32, 10000000, 0.417896 226 | lookup-64, 32, 10000000, 0.432639 227 | bit-parallel, 32, 10000000, 0.303902 228 | bit-parallel-optimized, 32, 10000000, 0.244029 229 | bit-parallel-mul, 32, 10000000, 0.200121 230 | bit-parallel32, 32, 10000000, 0.488440 231 | bit-parallel-optimized32, 32, 10000000, 0.421096 232 | harley-seal, 32, 10000000, 0.317814 233 | builtin-popcnt, 32, 10000000, 0.506113 234 | builtin-popcnt32, 32, 10000000, 0.971081 235 
| builtin-popcnt-unrolled, 32, 10000000, 0.500334 236 | builtin-popcnt-unrolled32, 32, 10000000, 0.953329 237 | neon-vcnt, 32, 10000000, 0.594361 238 | neon-HS, 32, 10000000, 0.785191 239 | lookup-8, 64, 5000000, 0.456146 240 | lookup-64, 64, 5000000, 0.491365 241 | bit-parallel, 64, 5000000, 0.291278 242 | bit-parallel-optimized, 64, 5000000, 0.235402 243 | bit-parallel-mul, 64, 5000000, 0.163201 244 | bit-parallel32, 64, 5000000, 0.467811 245 | bit-parallel-optimized32, 64, 5000000, 0.413418 246 | harley-seal, 64, 5000000, 0.229500 247 | builtin-popcnt, 64, 5000000, 0.646910 248 | builtin-popcnt32, 64, 5000000, 0.759099 249 | builtin-popcnt-unrolled, 64, 5000000, 0.358994 250 | builtin-popcnt-unrolled32, 64, 5000000, 0.692911 251 | neon-vcnt, 64, 5000000, 0.485475 252 | neon-HS, 64, 5000000, 0.589838 253 | lookup-8, 128, 2500000, 0.414889 254 | lookup-64, 128, 2500000, 0.445761 255 | bit-parallel, 128, 2500000, 0.285413 256 | bit-parallel-optimized, 128, 2500000, 0.225087 257 | bit-parallel-mul, 128, 2500000, 0.187572 258 | bit-parallel32, 128, 2500000, 0.463427 259 | bit-parallel-optimized32, 128, 2500000, 0.409725 260 | harley-seal, 128, 2500000, 0.130993 261 | builtin-popcnt, 128, 2500000, 0.828241 262 | builtin-popcnt32, 128, 2500000, 1.060847 263 | builtin-popcnt-unrolled, 128, 2500000, 0.345856 264 | builtin-popcnt-unrolled32, 128, 2500000, 0.663626 265 | neon-vcnt, 128, 2500000, 0.090956 266 | neon-HS, 128, 2500000, 0.482093 267 | lookup-8, 256, 1250000, 0.395812 268 | lookup-64, 256, 1250000, 0.422943 269 | bit-parallel, 256, 1250000, 0.281002 270 | bit-parallel-optimized, 256, 1250000, 0.219938 271 | bit-parallel-mul, 256, 1250000, 0.164031 272 | bit-parallel32, 256, 1250000, 0.482552 273 | bit-parallel-optimized32, 256, 1250000, 0.428090 274 | harley-seal, 256, 1250000, 0.101513 275 | builtin-popcnt, 256, 1250000, 0.600183 276 | builtin-popcnt32, 256, 1250000, 0.719626 277 | builtin-popcnt-unrolled, 256, 1250000, 0.331737 278 | 
builtin-popcnt-unrolled32, 256, 1250000, 0.655385 279 | neon-vcnt, 256, 1250000, 0.074859 280 | neon-HS, 256, 1250000, 0.096002 281 | lookup-8, 512, 1000000, 0.618068 282 | lookup-64, 512, 1000000, 0.658578 283 | bit-parallel, 512, 1000000, 0.460772 284 | bit-parallel-optimized, 512, 1000000, 0.362580 285 | bit-parallel-mul, 512, 1000000, 0.245406 286 | bit-parallel32, 512, 1000000, 0.753303 287 | bit-parallel-optimized32, 512, 1000000, 0.667355 288 | harley-seal, 512, 1000000, 0.138883 289 | builtin-popcnt, 512, 1000000, 0.588002 290 | builtin-popcnt32, 512, 1000000, 1.140480 291 | builtin-popcnt-unrolled, 512, 1000000, 0.526156 292 | builtin-popcnt-unrolled32, 512, 1000000, 1.032865 293 | neon-vcnt, 512, 1000000, 0.106650 294 | neon-HS, 512, 1000000, 0.131833 295 | lookup-8, 1024, 500000, 0.610603 296 | lookup-64, 1024, 500000, 0.649360 297 | bit-parallel, 1024, 500000, 0.451655 298 | bit-parallel-optimized, 1024, 500000, 0.353063 299 | bit-parallel-mul, 1024, 500000, 0.237585 300 | bit-parallel32, 1024, 500000, 0.743830 301 | bit-parallel-optimized32, 1024, 500000, 0.658492 302 | harley-seal, 1024, 500000, 0.127105 303 | builtin-popcnt, 1024, 500000, 0.758232 304 | builtin-popcnt32, 1024, 500000, 1.511135 305 | builtin-popcnt-unrolled, 1024, 500000, 0.704377 306 | builtin-popcnt-unrolled32, 1024, 500000, 1.400533 307 | neon-vcnt, 1024, 500000, 0.101731 308 | neon-HS, 1024, 500000, 0.116071 309 | lookup-8, 2048, 250000, 0.606497 310 | lookup-64, 2048, 250000, 0.644808 311 | bit-parallel, 2048, 250000, 0.447076 312 | bit-parallel-optimized, 2048, 250000, 0.348367 313 | bit-parallel-mul, 2048, 250000, 0.232917 314 | bit-parallel32, 2048, 250000, 0.739087 315 | bit-parallel-optimized32, 2048, 250000, 0.654069 316 | harley-seal, 2048, 250000, 0.123283 317 | builtin-popcnt, 2048, 250000, 0.755747 318 | builtin-popcnt32, 2048, 250000, 1.508779 319 | builtin-popcnt-unrolled, 2048, 250000, 0.699840 320 | builtin-popcnt-unrolled32, 2048, 250000, 1.397009 321 | neon-vcnt, 
2048, 250000, 0.100430 322 | neon-HS, 2048, 250000, 0.110167 323 | lookup-8, 4096, 125000, 0.604587 324 | lookup-64, 4096, 125000, 0.642554 325 | bit-parallel, 4096, 125000, 0.444791 326 | bit-parallel-optimized, 4096, 125000, 0.346032 327 | bit-parallel-mul, 4096, 125000, 0.230752 328 | bit-parallel32, 4096, 125000, 0.736785 329 | bit-parallel-optimized32, 4096, 125000, 0.651846 330 | harley-seal, 4096, 125000, 0.119318 331 | builtin-popcnt, 4096, 125000, 0.754491 332 | builtin-popcnt32, 4096, 125000, 1.507653 333 | builtin-popcnt-unrolled, 4096, 125000, 0.698644 334 | builtin-popcnt-unrolled32, 4096, 125000, 1.395235 335 | neon-vcnt, 4096, 125000, 0.096151 336 | neon-HS, 4096, 125000, 0.107250 337 | lookup-8, 32, 10000000, 0.417871 338 | lookup-64, 32, 10000000, 0.432613 339 | bit-parallel, 32, 10000000, 0.302889 340 | bit-parallel-optimized, 32, 10000000, 0.244226 341 | bit-parallel-mul, 32, 10000000, 0.200217 342 | bit-parallel32, 32, 10000000, 0.488461 343 | bit-parallel-optimized32, 32, 10000000, 0.420815 344 | harley-seal, 32, 10000000, 0.317787 345 | builtin-popcnt, 32, 10000000, 0.425657 346 | builtin-popcnt32, 32, 10000000, 0.759196 347 | builtin-popcnt-unrolled, 32, 10000000, 0.405968 348 | builtin-popcnt-unrolled32, 32, 10000000, 0.747354 349 | neon-vcnt, 32, 10000000, 0.594438 350 | neon-HS, 32, 10000000, 0.785399 351 | lookup-8, 64, 5000000, 0.456125 352 | lookup-64, 64, 5000000, 0.491375 353 | bit-parallel, 64, 5000000, 0.291292 354 | bit-parallel-optimized, 64, 5000000, 0.235375 355 | bit-parallel-mul, 64, 5000000, 0.163025 356 | bit-parallel32, 64, 5000000, 0.467857 357 | bit-parallel-optimized32, 64, 5000000, 0.413391 358 | harley-seal, 64, 5000000, 0.229501 359 | builtin-popcnt, 64, 5000000, 0.419513 360 | builtin-popcnt32, 64, 5000000, 0.759121 361 | builtin-popcnt-unrolled, 64, 5000000, 0.373265 362 | builtin-popcnt-unrolled32, 64, 5000000, 0.692445 363 | neon-vcnt, 64, 5000000, 0.485479 364 | neon-HS, 64, 5000000, 0.589713 365 | lookup-8, 128, 
2500000, 0.414929 366 | lookup-64, 128, 2500000, 0.445880 367 | bit-parallel, 128, 2500000, 0.285430 368 | bit-parallel-optimized, 128, 2500000, 0.225113 369 | bit-parallel-mul, 128, 2500000, 0.187588 370 | bit-parallel32, 128, 2500000, 0.463483 371 | bit-parallel-optimized32, 128, 2500000, 0.409735 372 | harley-seal, 128, 2500000, 0.130937 373 | builtin-popcnt, 128, 2500000, 0.400273 374 | builtin-popcnt32, 128, 2500000, 0.732647 375 | builtin-popcnt-unrolled, 128, 2500000, 0.345748 376 | builtin-popcnt-unrolled32, 128, 2500000, 0.663803 377 | neon-vcnt, 128, 2500000, 0.092700 378 | neon-HS, 128, 2500000, 0.482165 379 | lookup-8, 256, 1250000, 0.395811 380 | lookup-64, 256, 1250000, 0.422961 381 | bit-parallel, 256, 1250000, 0.280981 382 | bit-parallel-optimized, 256, 1250000, 0.219930 383 | bit-parallel-mul, 256, 1250000, 0.164041 384 | bit-parallel32, 256, 1250000, 0.482537 385 | bit-parallel-optimized32, 256, 1250000, 0.428098 386 | harley-seal, 256, 1250000, 0.101551 387 | builtin-popcnt, 256, 1250000, 0.483511 388 | builtin-popcnt32, 256, 1250000, 0.953305 389 | builtin-popcnt-unrolled, 256, 1250000, 0.444393 390 | builtin-popcnt-unrolled32, 256, 1250000, 0.888847 391 | neon-vcnt, 256, 1250000, 0.074843 392 | neon-HS, 256, 1250000, 0.095985 393 | lookup-8, 512, 1000000, 0.618072 394 | lookup-64, 512, 1000000, 0.658572 395 | bit-parallel, 512, 1000000, 0.460771 396 | bit-parallel-optimized, 512, 1000000, 0.362528 397 | bit-parallel-mul, 512, 1000000, 0.245436 398 | bit-parallel32, 512, 1000000, 0.753254 399 | bit-parallel-optimized32, 512, 1000000, 0.667350 400 | harley-seal, 512, 1000000, 0.138914 401 | builtin-popcnt, 512, 1000000, 0.763348 402 | builtin-popcnt32, 512, 1000000, 1.515999 403 | builtin-popcnt-unrolled, 512, 1000000, 0.709143 404 | builtin-popcnt-unrolled32, 512, 1000000, 1.407678 405 | neon-vcnt, 512, 1000000, 0.107125 406 | neon-HS, 512, 1000000, 0.131833 407 | lookup-8, 1024, 500000, 0.610584 408 | lookup-64, 1024, 500000, 0.649357 409 | 
bit-parallel, 1024, 500000, 0.451649 410 | bit-parallel-optimized, 1024, 500000, 0.353068 411 | bit-parallel-mul, 1024, 500000, 0.237584 412 | bit-parallel32, 1024, 500000, 0.743839 413 | bit-parallel-optimized32, 1024, 500000, 0.658490 414 | harley-seal, 1024, 500000, 0.127114 415 | builtin-popcnt, 1024, 500000, 0.936116 416 | builtin-popcnt32, 1024, 500000, 1.135139 417 | builtin-popcnt-unrolled, 1024, 500000, 0.517216 418 | builtin-popcnt-unrolled32, 1024, 500000, 1.024844 419 | neon-vcnt, 1024, 500000, 0.102645 420 | neon-HS, 1024, 500000, 0.116143 421 | lookup-8, 2048, 250000, 0.606498 422 | lookup-64, 2048, 250000, 0.644897 423 | bit-parallel, 2048, 250000, 0.447130 424 | bit-parallel-optimized, 2048, 250000, 0.348438 425 | bit-parallel-mul, 2048, 250000, 0.232898 426 | bit-parallel32, 2048, 250000, 0.739177 427 | bit-parallel-optimized32, 2048, 250000, 0.654123 428 | harley-seal, 2048, 250000, 0.123286 429 | builtin-popcnt, 2048, 250000, 0.755840 430 | builtin-popcnt32, 2048, 250000, 1.508903 431 | builtin-popcnt-unrolled, 2048, 250000, 0.700558 432 | builtin-popcnt-unrolled32, 2048, 250000, 1.397109 433 | neon-vcnt, 2048, 250000, 0.099808 434 | neon-HS, 2048, 250000, 0.110315 435 | lookup-8, 4096, 125000, 0.604576 436 | lookup-64, 4096, 125000, 0.642549 437 | bit-parallel, 4096, 125000, 0.444824 438 | bit-parallel-optimized, 4096, 125000, 0.346010 439 | bit-parallel-mul, 4096, 125000, 0.230747 440 | bit-parallel32, 4096, 125000, 0.736792 441 | bit-parallel-optimized32, 4096, 125000, 0.651895 442 | harley-seal, 4096, 125000, 0.119310 443 | builtin-popcnt, 4096, 125000, 0.754517 444 | builtin-popcnt32, 4096, 125000, 1.507635 445 | builtin-popcnt-unrolled, 4096, 125000, 0.698676 446 | builtin-popcnt-unrolled32, 4096, 125000, 1.395243 447 | neon-vcnt, 4096, 125000, 0.097607 448 | neon-HS, 4096, 125000, 0.107263 449 | lookup-8, 32, 10000000, 0.417940 450 | lookup-64, 32, 10000000, 0.432546 451 | bit-parallel, 32, 10000000, 0.305017 452 | bit-parallel-optimized, 
32, 10000000, 0.244178 453 | bit-parallel-mul, 32, 10000000, 0.181275 454 | bit-parallel32, 32, 10000000, 0.488452 455 | bit-parallel-optimized32, 32, 10000000, 0.420826 456 | harley-seal, 32, 10000000, 0.317835 457 | builtin-popcnt, 32, 10000000, 0.464897 458 | builtin-popcnt32, 32, 10000000, 0.759159 459 | builtin-popcnt-unrolled, 32, 10000000, 0.406642 460 | builtin-popcnt-unrolled32, 32, 10000000, 0.747395 461 | neon-vcnt, 32, 10000000, 0.600280 462 | neon-HS, 32, 10000000, 0.761993 463 | lookup-8, 64, 5000000, 0.456070 464 | lookup-64, 64, 5000000, 0.491355 465 | bit-parallel, 64, 5000000, 0.291279 466 | bit-parallel-optimized, 64, 5000000, 0.235383 467 | bit-parallel-mul, 64, 5000000, 0.163217 468 | bit-parallel32, 64, 5000000, 0.467822 469 | bit-parallel-optimized32, 64, 5000000, 0.413381 470 | harley-seal, 64, 5000000, 0.229544 471 | builtin-popcnt, 64, 5000000, 0.818638 472 | builtin-popcnt32, 64, 5000000, 0.759113 473 | builtin-popcnt-unrolled, 64, 5000000, 0.373642 474 | builtin-popcnt-unrolled32, 64, 5000000, 0.691673 475 | neon-vcnt, 64, 5000000, 0.485478 476 | neon-HS, 64, 5000000, 0.589892 477 | lookup-8, 128, 2500000, 0.414992 478 | lookup-64, 128, 2500000, 0.457564 479 | bit-parallel, 128, 2500000, 0.285464 480 | bit-parallel-optimized, 128, 2500000, 0.225145 481 | bit-parallel-mul, 128, 2500000, 0.180969 482 | bit-parallel32, 128, 2500000, 0.463443 483 | bit-parallel-optimized32, 128, 2500000, 0.409840 484 | harley-seal, 128, 2500000, 0.130941 485 | builtin-popcnt, 128, 2500000, 0.396614 486 | builtin-popcnt32, 128, 2500000, 0.732686 487 | builtin-popcnt-unrolled, 128, 2500000, 0.345731 488 | builtin-popcnt-unrolled32, 128, 2500000, 0.664208 489 | neon-vcnt, 128, 2500000, 0.089942 490 | neon-HS, 128, 2500000, 0.476292 491 | lookup-8, 256, 1250000, 0.395811 492 | lookup-64, 256, 1250000, 0.422950 493 | bit-parallel, 256, 1250000, 0.280992 494 | bit-parallel-optimized, 256, 1250000, 0.219931 495 | bit-parallel-mul, 256, 1250000, 0.164031 496 | 
bit-parallel32, 256, 1250000, 0.482571 497 | bit-parallel-optimized32, 256, 1250000, 0.428090 498 | harley-seal, 256, 1250000, 0.101507 499 | builtin-popcnt, 256, 1250000, 0.483294 500 | builtin-popcnt32, 256, 1250000, 0.953316 501 | builtin-popcnt-unrolled, 256, 1250000, 0.446441 502 | builtin-popcnt-unrolled32, 256, 1250000, 0.888796 503 | neon-vcnt, 256, 1250000, 0.075118 504 | neon-HS, 256, 1250000, 0.095478 505 | lookup-8, 512, 1000000, 0.617903 506 | lookup-64, 512, 1000000, 0.658492 507 | bit-parallel, 512, 1000000, 0.460767 508 | bit-parallel-optimized, 512, 1000000, 0.362484 509 | bit-parallel-mul, 512, 1000000, 0.245385 510 | bit-parallel32, 512, 1000000, 0.753263 511 | bit-parallel-optimized32, 512, 1000000, 0.667305 512 | harley-seal, 512, 1000000, 0.138875 513 | builtin-popcnt, 512, 1000000, 0.849915 514 | builtin-popcnt32, 512, 1000000, 1.140412 515 | builtin-popcnt-unrolled, 512, 1000000, 0.526208 516 | builtin-popcnt-unrolled32, 512, 1000000, 1.032743 517 | neon-vcnt, 512, 1000000, 0.107749 518 | neon-HS, 512, 1000000, 0.131813 519 | lookup-8, 1024, 500000, 0.610617 520 | lookup-64, 1024, 500000, 0.649367 521 | bit-parallel, 1024, 500000, 0.451647 522 | bit-parallel-optimized, 1024, 500000, 0.353083 523 | bit-parallel-mul, 1024, 500000, 0.237582 524 | bit-parallel32, 1024, 500000, 0.743851 525 | bit-parallel-optimized32, 1024, 500000, 0.658472 526 | harley-seal, 1024, 500000, 0.127112 527 | builtin-popcnt, 1024, 500000, 0.758227 528 | builtin-popcnt32, 1024, 500000, 1.511185 529 | builtin-popcnt-unrolled, 1024, 500000, 0.704392 530 | builtin-popcnt-unrolled32, 1024, 500000, 1.400572 531 | neon-vcnt, 1024, 500000, 0.101798 532 | neon-HS, 1024, 500000, 0.115927 533 | lookup-8, 2048, 250000, 0.606530 534 | lookup-64, 2048, 250000, 0.644839 535 | bit-parallel, 2048, 250000, 0.447158 536 | bit-parallel-optimized, 2048, 250000, 0.348382 537 | bit-parallel-mul, 2048, 250000, 0.232898 538 | bit-parallel32, 2048, 250000, 0.739189 539 | 
bit-parallel-optimized32, 2048, 250000, 0.654146 540 | harley-seal, 2048, 250000, 0.123291 541 | builtin-popcnt, 2048, 250000, 0.589275 542 | builtin-popcnt32, 2048, 250000, 1.132540 543 | builtin-popcnt-unrolled, 2048, 250000, 0.512848 544 | builtin-popcnt-unrolled32, 2048, 250000, 1.020939 545 | neon-vcnt, 2048, 250000, 0.100079 546 | neon-HS, 2048, 250000, 0.110248 547 | lookup-8, 4096, 125000, 0.604591 548 | lookup-64, 4096, 125000, 0.642552 549 | bit-parallel, 4096, 125000, 0.444792 550 | bit-parallel-optimized, 4096, 125000, 0.346039 551 | bit-parallel-mul, 4096, 125000, 0.230761 552 | bit-parallel32, 4096, 125000, 0.736747 553 | bit-parallel-optimized32, 4096, 125000, 0.651874 554 | harley-seal, 4096, 125000, 0.119308 555 | builtin-popcnt, 4096, 125000, 0.754513 556 | builtin-popcnt32, 4096, 125000, 1.507628 557 | builtin-popcnt-unrolled, 4096, 125000, 0.698271 558 | builtin-popcnt-unrolled32, 4096, 125000, 1.395282 559 | neon-vcnt, 4096, 125000, 0.097226 560 | neon-HS, 4096, 125000, 0.107274 561 | -------------------------------------------------------------------------------- /results/arm/arm-64bit-gcc4.8.5.metadata: -------------------------------------------------------------------------------- 1 | cpu=Cortex A57 (AMD Opteron A1100) 2 | architecture=64-bit 3 | compiler=GCC 4.8.5 (SUSE Linux) 4 | runs=5 5 | spec=https://shop.softiron.com/product/overdrive-1000/ 6 | -------------------------------------------------------------------------------- /results/arm/armv7-32bit-gcc4.9.2.csv: -------------------------------------------------------------------------------- 1 | lookup-8, 32, 10000000, 0.985725 2 | lookup-64, 32, 10000000, 1.220601 3 | bit-parallel, 32, 10000000, 1.803319 4 | bit-parallel-optimized, 32, 10000000, 1.461279 5 | bit-parallel-mul, 32, 10000000, 1.227767 6 | bit-parallel32, 32, 10000000, 1.152607 7 | bit-parallel-optimized32, 32, 10000000, 0.985908 8 | harley-seal, 32, 10000000, 1.394531 9 | builtin-popcnt, 32, 10000000, 0.907007 10 | 
builtin-popcnt32, 32, 10000000, 1.636626 11 | builtin-popcnt-unrolled, 32, 10000000, 1.069284 12 | builtin-popcnt-unrolled32, 32, 10000000, 1.794872 13 | neon-vcnt, 32, 10000000, 2.308262 14 | neon-HS, 32, 10000000, 2.589560 15 | lookup-8, 64, 5000000, 0.928977 16 | lookup-64, 64, 5000000, 1.197677 17 | bit-parallel, 64, 5000000, 1.685783 18 | bit-parallel-optimized, 64, 5000000, 1.349290 19 | bit-parallel-mul, 64, 5000000, 1.133567 20 | bit-parallel32, 64, 5000000, 1.060994 21 | bit-parallel-optimized32, 64, 5000000, 0.893912 22 | harley-seal, 64, 5000000, 1.198813 23 | builtin-popcnt, 64, 5000000, 0.901408 24 | builtin-popcnt32, 64, 5000000, 1.679057 25 | builtin-popcnt-unrolled, 64, 5000000, 0.964972 26 | builtin-popcnt-unrolled32, 64, 5000000, 1.700271 27 | neon-vcnt, 64, 5000000, 2.089906 28 | neon-HS, 64, 5000000, 2.231522 29 | lookup-8, 128, 2500000, 0.864230 30 | lookup-64, 128, 2500000, 1.130737 31 | bit-parallel, 128, 2500000, 1.620060 32 | bit-parallel-optimized, 128, 2500000, 1.290692 33 | bit-parallel-mul, 128, 2500000, 1.097294 34 | bit-parallel32, 128, 2500000, 1.013321 35 | bit-parallel-optimized32, 128, 2500000, 0.859721 36 | harley-seal, 128, 2500000, 0.875713 37 | builtin-popcnt, 128, 2500000, 0.928242 38 | builtin-popcnt32, 128, 2500000, 1.638849 39 | builtin-popcnt-unrolled, 128, 2500000, 0.911323 40 | builtin-popcnt-unrolled32, 128, 2500000, 1.649693 41 | neon-vcnt, 128, 2500000, 0.185597 42 | neon-HS, 128, 2500000, 2.048049 43 | lookup-8, 256, 1250000, 0.833712 44 | lookup-64, 256, 1250000, 1.100987 45 | bit-parallel, 256, 1250000, 1.592540 46 | bit-parallel-optimized, 256, 1250000, 1.266066 47 | bit-parallel-mul, 256, 1250000, 1.067491 48 | bit-parallel32, 256, 1250000, 1.003103 49 | bit-parallel-optimized32, 256, 1250000, 0.836951 50 | harley-seal, 256, 1250000, 0.698730 51 | builtin-popcnt, 256, 1250000, 0.915045 52 | builtin-popcnt32, 256, 1250000, 1.622709 53 | builtin-popcnt-unrolled, 256, 1250000, 0.887420 54 | 
builtin-popcnt-unrolled32, 256, 1250000, 1.636294 55 | neon-vcnt, 256, 1250000, 0.142252 56 | neon-HS, 256, 1250000, 0.215171 57 | lookup-8, 512, 1000000, 1.306581 58 | lookup-64, 512, 1000000, 1.733276 59 | bit-parallel, 512, 1000000, 2.526488 60 | bit-parallel-optimized, 512, 1000000, 2.005194 61 | bit-parallel-mul, 512, 1000000, 1.679989 62 | bit-parallel32, 512, 1000000, 1.575064 63 | bit-parallel-optimized32, 512, 1000000, 1.308926 64 | harley-seal, 512, 1000000, 0.973332 65 | builtin-popcnt, 512, 1000000, 1.450636 66 | builtin-popcnt32, 512, 1000000, 2.577057 67 | builtin-popcnt-unrolled, 512, 1000000, 1.402799 68 | builtin-popcnt-unrolled32, 512, 1000000, 2.587956 69 | neon-vcnt, 512, 1000000, 0.191818 70 | neon-HS, 512, 1000000, 0.279654 71 | lookup-8, 1024, 500000, 1.295942 72 | lookup-64, 1024, 500000, 1.723668 73 | bit-parallel, 1024, 500000, 2.515489 74 | bit-parallel-optimized, 1024, 500000, 1.993608 75 | bit-parallel-mul, 1024, 500000, 1.670344 76 | bit-parallel32, 1024, 500000, 1.564330 77 | bit-parallel-optimized32, 1024, 500000, 1.297311 78 | harley-seal, 1024, 500000, 0.903717 79 | builtin-popcnt, 1024, 500000, 1.445857 80 | builtin-popcnt32, 1024, 500000, 2.574160 81 | builtin-popcnt-unrolled, 1024, 500000, 1.391179 82 | builtin-popcnt-unrolled32, 1024, 500000, 2.579558 83 | neon-vcnt, 1024, 500000, 0.174692 84 | neon-HS, 1024, 500000, 0.247834 85 | lookup-8, 2048, 250000, 1.287632 86 | lookup-64, 2048, 250000, 1.714808 87 | bit-parallel, 2048, 250000, 2.503552 88 | bit-parallel-optimized, 2048, 250000, 1.982959 89 | bit-parallel-mul, 2048, 250000, 1.661237 90 | bit-parallel32, 2048, 250000, 1.555010 91 | bit-parallel-optimized32, 2048, 250000, 1.288221 92 | harley-seal, 2048, 250000, 0.868396 93 | builtin-popcnt, 2048, 250000, 1.438058 94 | builtin-popcnt32, 2048, 250000, 2.566260 95 | builtin-popcnt-unrolled, 2048, 250000, 1.381764 96 | builtin-popcnt-unrolled32, 2048, 250000, 2.569126 97 | neon-vcnt, 2048, 250000, 0.167359 98 | neon-HS, 2048, 
250000, 0.231286 99 | lookup-8, 4096, 125000, 1.284558 100 | lookup-64, 4096, 125000, 1.711518 101 | bit-parallel, 4096, 125000, 2.499747 102 | bit-parallel-optimized, 4096, 125000, 1.979111 103 | bit-parallel-mul, 4096, 125000, 1.658133 104 | bit-parallel32, 4096, 125000, 1.551721 105 | bit-parallel-optimized32, 4096, 125000, 1.284797 106 | harley-seal, 4096, 125000, 0.849608 107 | builtin-popcnt, 4096, 125000, 1.435742 108 | builtin-popcnt32, 4096, 125000, 2.564447 109 | builtin-popcnt-unrolled, 4096, 125000, 1.378372 110 | builtin-popcnt-unrolled32, 4096, 125000, 2.565822 111 | neon-vcnt, 4096, 125000, 0.162050 112 | neon-HS, 4096, 125000, 0.224122 113 | lookup-8, 32, 10000000, 0.951001 114 | lookup-64, 32, 10000000, 1.219289 115 | bit-parallel, 32, 10000000, 1.801784 116 | bit-parallel-optimized, 32, 10000000, 1.459774 117 | bit-parallel-mul, 32, 10000000, 1.226212 118 | bit-parallel32, 32, 10000000, 1.151140 119 | bit-parallel-optimized32, 32, 10000000, 1.032193 120 | harley-seal, 32, 10000000, 1.393014 121 | builtin-popcnt, 32, 10000000, 0.906339 122 | builtin-popcnt32, 32, 10000000, 1.635252 123 | builtin-popcnt-unrolled, 32, 10000000, 1.067722 124 | builtin-popcnt-unrolled32, 32, 10000000, 1.793414 125 | neon-vcnt, 32, 10000000, 2.305474 126 | neon-HS, 32, 10000000, 2.588104 127 | lookup-8, 64, 5000000, 0.927181 128 | lookup-64, 64, 5000000, 1.193984 129 | bit-parallel, 64, 5000000, 1.680846 130 | bit-parallel-optimized, 64, 5000000, 1.347161 131 | bit-parallel-mul, 64, 5000000, 1.132332 132 | bit-parallel32, 64, 5000000, 1.059385 133 | bit-parallel-optimized32, 64, 5000000, 0.945414 134 | harley-seal, 64, 5000000, 1.197024 135 | builtin-popcnt, 64, 5000000, 0.893458 136 | builtin-popcnt32, 64, 5000000, 1.676656 137 | builtin-popcnt-unrolled, 64, 5000000, 0.963442 138 | builtin-popcnt-unrolled32, 64, 5000000, 1.697498 139 | neon-vcnt, 64, 5000000, 2.086923 140 | neon-HS, 64, 5000000, 2.228280 141 | lookup-8, 128, 2500000, 0.869428 142 | lookup-64, 128, 
2500000, 1.134951 143 | bit-parallel, 128, 2500000, 1.626365 144 | bit-parallel-optimized, 128, 2500000, 1.295581 145 | bit-parallel-mul, 128, 2500000, 1.101476 146 | bit-parallel32, 128, 2500000, 1.019929 147 | bit-parallel-optimized32, 128, 2500000, 0.852576 148 | harley-seal, 128, 2500000, 0.881419 149 | builtin-popcnt, 128, 2500000, 0.925642 150 | builtin-popcnt32, 128, 2500000, 1.645158 151 | builtin-popcnt-unrolled, 128, 2500000, 0.914705 152 | builtin-popcnt-unrolled32, 128, 2500000, 1.659385 153 | neon-vcnt, 128, 2500000, 0.186306 154 | neon-HS, 128, 2500000, 2.055924 155 | lookup-8, 256, 1250000, 0.833755 156 | lookup-64, 256, 1250000, 1.101079 157 | bit-parallel, 256, 1250000, 1.593196 158 | bit-parallel-optimized, 256, 1250000, 1.265218 159 | bit-parallel-mul, 256, 1250000, 1.067763 160 | bit-parallel32, 256, 1250000, 1.003270 161 | bit-parallel-optimized32, 256, 1250000, 0.837143 162 | harley-seal, 256, 1250000, 0.698866 163 | builtin-popcnt, 256, 1250000, 0.914346 164 | builtin-popcnt32, 256, 1250000, 1.623013 165 | builtin-popcnt-unrolled, 256, 1250000, 0.887054 166 | builtin-popcnt-unrolled32, 256, 1250000, 1.636600 167 | neon-vcnt, 256, 1250000, 0.142110 168 | neon-HS, 256, 1250000, 0.215178 169 | lookup-8, 512, 1000000, 1.307074 170 | lookup-64, 512, 1000000, 1.733643 171 | bit-parallel, 512, 1000000, 2.526870 172 | bit-parallel-optimized, 512, 1000000, 2.005513 173 | bit-parallel-mul, 512, 1000000, 1.680181 174 | bit-parallel32, 512, 1000000, 1.575320 175 | bit-parallel-optimized32, 512, 1000000, 1.309147 176 | harley-seal, 512, 1000000, 0.973455 177 | builtin-popcnt, 512, 1000000, 1.441958 178 | builtin-popcnt32, 512, 1000000, 2.577508 179 | builtin-popcnt-unrolled, 512, 1000000, 1.403062 180 | builtin-popcnt-unrolled32, 512, 1000000, 2.588361 181 | neon-vcnt, 512, 1000000, 0.191869 182 | neon-HS, 512, 1000000, 0.279422 183 | lookup-8, 1024, 500000, 1.293961 184 | lookup-64, 1024, 500000, 1.720972 185 | bit-parallel, 1024, 500000, 2.511511 186 | 
bit-parallel-optimized, 1024, 500000, 1.990422 187 | bit-parallel-mul, 1024, 500000, 1.667564 188 | bit-parallel32, 1024, 500000, 1.561767 189 | bit-parallel-optimized32, 1024, 500000, 1.295250 190 | harley-seal, 1024, 500000, 0.902151 191 | builtin-popcnt, 1024, 500000, 1.440897 192 | builtin-popcnt32, 1024, 500000, 2.570014 193 | builtin-popcnt-unrolled, 1024, 500000, 1.388985 194 | builtin-popcnt-unrolled32, 1024, 500000, 2.575424 195 | neon-vcnt, 1024, 500000, 0.174339 196 | neon-HS, 1024, 500000, 0.247356 197 | lookup-8, 2048, 250000, 1.289948 198 | lookup-64, 2048, 250000, 1.717610 199 | bit-parallel, 2048, 250000, 2.507756 200 | bit-parallel-optimized, 2048, 250000, 1.986315 201 | bit-parallel-mul, 2048, 250000, 1.664005 202 | bit-parallel32, 2048, 250000, 1.557490 203 | bit-parallel-optimized32, 2048, 250000, 1.290323 204 | harley-seal, 2048, 250000, 0.869842 205 | builtin-popcnt, 2048, 250000, 1.439898 206 | builtin-popcnt32, 2048, 250000, 2.570373 207 | builtin-popcnt-unrolled, 2048, 250000, 1.384136 208 | builtin-popcnt-unrolled32, 2048, 250000, 2.573174 209 | neon-vcnt, 2048, 250000, 0.167708 210 | neon-HS, 2048, 250000, 0.231686 211 | lookup-8, 4096, 125000, 1.284684 212 | lookup-64, 4096, 125000, 1.711672 213 | bit-parallel, 4096, 125000, 2.499709 214 | bit-parallel-optimized, 4096, 125000, 1.979078 215 | bit-parallel-mul, 4096, 125000, 1.658120 216 | bit-parallel32, 4096, 125000, 1.551580 217 | bit-parallel-optimized32, 4096, 125000, 1.284759 218 | harley-seal, 4096, 125000, 0.849619 219 | builtin-popcnt, 4096, 125000, 1.436648 220 | builtin-popcnt32, 4096, 125000, 2.564393 221 | builtin-popcnt-unrolled, 4096, 125000, 1.378229 222 | builtin-popcnt-unrolled32, 4096, 125000, 2.565773 223 | neon-vcnt, 4096, 125000, 0.162078 224 | neon-HS, 4096, 125000, 0.224111 225 | lookup-8, 32, 10000000, 0.972420 226 | lookup-64, 32, 10000000, 1.238415 227 | bit-parallel, 32, 10000000, 1.808543 228 | bit-parallel-optimized, 32, 10000000, 1.465251 229 | 
bit-parallel-mul, 32, 10000000, 1.230851 230 | bit-parallel32, 32, 10000000, 1.155620 231 | bit-parallel-optimized32, 32, 10000000, 0.988065 232 | harley-seal, 32, 10000000, 1.398309 233 | builtin-popcnt, 32, 10000000, 0.912780 234 | builtin-popcnt32, 32, 10000000, 1.655394 235 | builtin-popcnt-unrolled, 32, 10000000, 1.073545 236 | builtin-popcnt-unrolled32, 32, 10000000, 1.800399 237 | neon-vcnt, 32, 10000000, 2.313871 238 | neon-HS, 32, 10000000, 2.597768 239 | lookup-8, 64, 5000000, 0.928931 240 | lookup-64, 64, 5000000, 1.196002 241 | bit-parallel, 64, 5000000, 1.683743 242 | bit-parallel-optimized, 64, 5000000, 1.349572 243 | bit-parallel-mul, 64, 5000000, 1.133919 244 | bit-parallel32, 64, 5000000, 1.061125 245 | bit-parallel-optimized32, 64, 5000000, 0.893994 246 | harley-seal, 64, 5000000, 1.198974 247 | builtin-popcnt, 64, 5000000, 0.899457 248 | builtin-popcnt32, 64, 5000000, 1.679535 249 | builtin-popcnt-unrolled, 64, 5000000, 0.964997 250 | builtin-popcnt-unrolled32, 64, 5000000, 1.700253 251 | neon-vcnt, 64, 5000000, 2.090321 252 | neon-HS, 64, 5000000, 2.231913 253 | lookup-8, 128, 2500000, 0.864273 254 | lookup-64, 128, 2500000, 1.131122 255 | bit-parallel, 128, 2500000, 1.620613 256 | bit-parallel-optimized, 128, 2500000, 1.290890 257 | bit-parallel-mul, 128, 2500000, 1.097479 258 | bit-parallel32, 128, 2500000, 1.013525 259 | bit-parallel-optimized32, 128, 2500000, 0.850087 260 | harley-seal, 128, 2500000, 0.875848 261 | builtin-popcnt, 128, 2500000, 0.927725 262 | builtin-popcnt32, 128, 2500000, 1.639070 263 | builtin-popcnt-unrolled, 128, 2500000, 0.911307 264 | builtin-popcnt-unrolled32, 128, 2500000, 1.649589 265 | neon-vcnt, 128, 2500000, 0.185619 266 | neon-HS, 128, 2500000, 2.048365 267 | lookup-8, 256, 1250000, 0.832490 268 | lookup-64, 256, 1250000, 1.099262 269 | bit-parallel, 256, 1250000, 1.590107 270 | bit-parallel-optimized, 256, 1250000, 1.263772 271 | bit-parallel-mul, 256, 1250000, 1.066099 272 | bit-parallel32, 256, 1250000, 
1.001570 273 | bit-parallel-optimized32, 256, 1250000, 0.835635 274 | harley-seal, 256, 1250000, 0.697596 275 | builtin-popcnt, 256, 1250000, 0.908156 276 | builtin-popcnt32, 256, 1250000, 1.620334 277 | builtin-popcnt-unrolled, 256, 1250000, 0.885308 278 | builtin-popcnt-unrolled32, 256, 1250000, 1.633888 279 | neon-vcnt, 256, 1250000, 0.141820 280 | neon-HS, 256, 1250000, 0.214794 281 | lookup-8, 512, 1000000, 1.306595 282 | lookup-64, 512, 1000000, 1.733582 283 | bit-parallel, 512, 1000000, 2.526981 284 | bit-parallel-optimized, 512, 1000000, 2.005536 285 | bit-parallel-mul, 512, 1000000, 1.680176 286 | bit-parallel32, 512, 1000000, 1.575309 287 | bit-parallel-optimized32, 512, 1000000, 1.309319 288 | harley-seal, 512, 1000000, 0.973538 289 | builtin-popcnt, 512, 1000000, 1.446040 290 | builtin-popcnt32, 512, 1000000, 2.577488 291 | builtin-popcnt-unrolled, 512, 1000000, 1.403042 292 | builtin-popcnt-unrolled32, 512, 1000000, 2.588320 293 | neon-vcnt, 512, 1000000, 0.191871 294 | neon-HS, 512, 1000000, 0.279470 295 | lookup-8, 1024, 500000, 1.299501 296 | lookup-64, 1024, 500000, 1.727446 297 | bit-parallel, 1024, 500000, 2.520682 298 | bit-parallel-optimized, 1024, 500000, 1.997854 299 | bit-parallel-mul, 1024, 500000, 1.673787 300 | bit-parallel32, 1024, 500000, 1.567685 301 | bit-parallel-optimized32, 1024, 500000, 1.300090 302 | harley-seal, 1024, 500000, 0.906039 303 | builtin-popcnt, 1024, 500000, 1.443760 304 | builtin-popcnt32, 1024, 500000, 2.574330 305 | builtin-popcnt-unrolled, 1024, 500000, 1.391227 306 | builtin-popcnt-unrolled32, 1024, 500000, 2.579740 307 | neon-vcnt, 1024, 500000, 0.174806 308 | neon-HS, 1024, 500000, 0.247863 309 | lookup-8, 2048, 250000, 1.287831 310 | lookup-64, 2048, 250000, 1.714649 311 | bit-parallel, 2048, 250000, 2.503593 312 | bit-parallel-optimized, 2048, 250000, 1.982913 313 | bit-parallel-mul, 2048, 250000, 1.661272 314 | bit-parallel32, 2048, 250000, 1.554974 315 | bit-parallel-optimized32, 2048, 250000, 1.288237 316 
| harley-seal, 2048, 250000, 0.868377 317 | builtin-popcnt, 2048, 250000, 1.439598 318 | builtin-popcnt32, 2048, 250000, 2.566308 319 | builtin-popcnt-unrolled, 2048, 250000, 1.381795 320 | builtin-popcnt-unrolled32, 2048, 250000, 2.569012 321 | neon-vcnt, 2048, 250000, 0.167365 322 | neon-HS, 2048, 250000, 0.231295 323 | lookup-8, 4096, 125000, 1.286900 324 | lookup-64, 4096, 125000, 1.714772 325 | bit-parallel, 4096, 125000, 2.503962 326 | bit-parallel-optimized, 4096, 125000, 1.982492 327 | bit-parallel-mul, 4096, 125000, 1.660955 328 | bit-parallel32, 4096, 125000, 1.554275 329 | bit-parallel-optimized32, 4096, 125000, 1.286970 330 | harley-seal, 4096, 125000, 0.851158 331 | builtin-popcnt, 4096, 125000, 1.442359 332 | builtin-popcnt32, 4096, 125000, 2.568521 333 | builtin-popcnt-unrolled, 4096, 125000, 1.380493 334 | builtin-popcnt-unrolled32, 4096, 125000, 2.569916 335 | neon-vcnt, 4096, 125000, 0.162404 336 | neon-HS, 4096, 125000, 0.224520 337 | lookup-8, 32, 10000000, 0.951236 338 | lookup-64, 32, 10000000, 1.219092 339 | bit-parallel, 32, 10000000, 1.801807 340 | bit-parallel-optimized, 32, 10000000, 1.459761 341 | bit-parallel-mul, 32, 10000000, 1.226208 342 | bit-parallel32, 32, 10000000, 1.151122 343 | bit-parallel-optimized32, 32, 10000000, 1.047497 344 | harley-seal, 32, 10000000, 1.393029 345 | builtin-popcnt, 32, 10000000, 0.906503 346 | builtin-popcnt32, 32, 10000000, 1.639774 347 | builtin-popcnt-unrolled, 32, 10000000, 1.067675 348 | builtin-popcnt-unrolled32, 32, 10000000, 1.793444 349 | neon-vcnt, 32, 10000000, 2.305275 350 | neon-HS, 32, 10000000, 2.588129 351 | lookup-8, 64, 5000000, 0.927109 352 | lookup-64, 64, 5000000, 1.194003 353 | bit-parallel, 64, 5000000, 1.680812 354 | bit-parallel-optimized, 64, 5000000, 1.347184 355 | bit-parallel-mul, 64, 5000000, 1.130465 356 | bit-parallel32, 64, 5000000, 1.059390 357 | bit-parallel-optimized32, 64, 5000000, 0.892510 358 | harley-seal, 64, 5000000, 1.197037 359 | builtin-popcnt, 64, 5000000, 
0.894111 360 | builtin-popcnt32, 64, 5000000, 1.676706 361 | builtin-popcnt-unrolled, 64, 5000000, 0.963417 362 | builtin-popcnt-unrolled32, 64, 5000000, 1.697516 363 | neon-vcnt, 64, 5000000, 2.086878 364 | neon-HS, 64, 5000000, 2.228382 365 | lookup-8, 128, 2500000, 0.864035 366 | lookup-64, 128, 2500000, 1.130848 367 | bit-parallel, 128, 2500000, 1.620331 368 | bit-parallel-optimized, 128, 2500000, 1.290859 369 | bit-parallel-mul, 128, 2500000, 1.097597 370 | bit-parallel32, 128, 2500000, 1.013616 371 | bit-parallel-optimized32, 128, 2500000, 0.853931 372 | harley-seal, 128, 2500000, 0.875852 373 | builtin-popcnt, 128, 2500000, 0.930838 374 | builtin-popcnt32, 128, 2500000, 1.639078 375 | builtin-popcnt-unrolled, 128, 2500000, 0.911324 376 | builtin-popcnt-unrolled32, 128, 2500000, 1.649519 377 | neon-vcnt, 128, 2500000, 0.185608 378 | neon-HS, 128, 2500000, 2.048371 379 | lookup-8, 256, 1250000, 0.832473 380 | lookup-64, 256, 1250000, 1.099284 381 | bit-parallel, 256, 1250000, 1.590907 382 | bit-parallel-optimized, 256, 1250000, 1.262679 383 | bit-parallel-mul, 256, 1250000, 1.065923 384 | bit-parallel32, 256, 1250000, 1.001626 385 | bit-parallel-optimized32, 256, 1250000, 0.835642 386 | harley-seal, 256, 1250000, 0.697606 387 | builtin-popcnt, 256, 1250000, 0.911804 388 | builtin-popcnt32, 256, 1250000, 1.620586 389 | builtin-popcnt-unrolled, 256, 1250000, 0.885243 390 | builtin-popcnt-unrolled32, 256, 1250000, 1.633896 391 | neon-vcnt, 256, 1250000, 0.141814 392 | neon-HS, 256, 1250000, 0.214795 393 | lookup-8, 512, 1000000, 1.309059 394 | lookup-64, 512, 1000000, 1.736914 395 | bit-parallel, 512, 1000000, 2.531194 396 | bit-parallel-optimized, 512, 1000000, 2.008982 397 | bit-parallel-mul, 512, 1000000, 1.682945 398 | bit-parallel32, 512, 1000000, 1.577916 399 | bit-parallel-optimized32, 512, 1000000, 1.311307 400 | harley-seal, 512, 1000000, 0.975093 401 | builtin-popcnt, 512, 1000000, 1.449006 402 | builtin-popcnt32, 512, 1000000, 2.581605 403 | 
builtin-popcnt-unrolled, 512, 1000000, 1.405427 404 | builtin-popcnt-unrolled32, 512, 1000000, 2.592492 405 | neon-vcnt, 512, 1000000, 0.192348 406 | neon-HS, 512, 1000000, 0.279980 407 | lookup-8, 1024, 500000, 1.293997 408 | lookup-64, 1024, 500000, 1.720970 409 | bit-parallel, 1024, 500000, 2.511312 410 | bit-parallel-optimized, 1024, 500000, 1.990623 411 | bit-parallel-mul, 1024, 500000, 1.667585 412 | bit-parallel32, 1024, 500000, 1.561768 413 | bit-parallel-optimized32, 1024, 500000, 1.295197 414 | harley-seal, 1024, 500000, 0.902112 415 | builtin-popcnt, 1024, 500000, 1.439392 416 | builtin-popcnt32, 1024, 500000, 2.570365 417 | builtin-popcnt-unrolled, 1024, 500000, 1.388855 418 | builtin-popcnt-unrolled32, 1024, 500000, 2.575428 419 | neon-vcnt, 1024, 500000, 0.174352 420 | neon-HS, 1024, 500000, 0.247320 421 | lookup-8, 2048, 250000, 1.290009 422 | lookup-64, 2048, 250000, 1.717437 423 | bit-parallel, 2048, 250000, 2.507789 424 | bit-parallel-optimized, 2048, 250000, 1.986219 425 | bit-parallel-mul, 2048, 250000, 1.663948 426 | bit-parallel32, 2048, 250000, 1.557486 427 | bit-parallel-optimized32, 2048, 250000, 1.290328 428 | harley-seal, 2048, 250000, 0.869970 429 | builtin-popcnt, 2048, 250000, 1.440664 430 | builtin-popcnt32, 2048, 250000, 2.570338 431 | builtin-popcnt-unrolled, 2048, 250000, 1.384035 432 | builtin-popcnt-unrolled32, 2048, 250000, 2.573215 433 | neon-vcnt, 2048, 250000, 0.167637 434 | neon-HS, 2048, 250000, 0.231734 435 | lookup-8, 4096, 125000, 1.284792 436 | lookup-64, 4096, 125000, 1.711573 437 | bit-parallel, 4096, 125000, 2.499767 438 | bit-parallel-optimized, 4096, 125000, 1.979078 439 | bit-parallel-mul, 4096, 125000, 1.658093 440 | bit-parallel32, 4096, 125000, 1.551595 441 | bit-parallel-optimized32, 4096, 125000, 1.284746 442 | harley-seal, 4096, 125000, 0.849840 443 | builtin-popcnt, 4096, 125000, 1.408587 444 | builtin-popcnt32, 4096, 125000, 2.564388 445 | builtin-popcnt-unrolled, 4096, 125000, 1.378237 446 | 
builtin-popcnt-unrolled32, 4096, 125000, 2.565843 447 | neon-vcnt, 4096, 125000, 0.162068 448 | neon-HS, 4096, 125000, 0.224111 449 | lookup-8, 32, 10000000, 0.965503 450 | lookup-64, 32, 10000000, 1.227317 451 | bit-parallel, 32, 10000000, 1.801881 452 | bit-parallel-optimized, 32, 10000000, 1.459770 453 | bit-parallel-mul, 32, 10000000, 1.226186 454 | bit-parallel32, 32, 10000000, 1.151085 455 | bit-parallel-optimized32, 32, 10000000, 0.984321 456 | harley-seal, 32, 10000000, 1.393037 457 | builtin-popcnt, 32, 10000000, 0.908010 458 | builtin-popcnt32, 32, 10000000, 1.635973 459 | builtin-popcnt-unrolled, 32, 10000000, 1.067673 460 | builtin-popcnt-unrolled32, 32, 10000000, 1.793448 461 | neon-vcnt, 32, 10000000, 2.305252 462 | neon-HS, 32, 10000000, 2.588125 463 | lookup-8, 64, 5000000, 0.927124 464 | lookup-64, 64, 5000000, 1.193966 465 | bit-parallel, 64, 5000000, 1.680834 466 | bit-parallel-optimized, 64, 5000000, 1.347268 467 | bit-parallel-mul, 64, 5000000, 1.130282 468 | bit-parallel32, 64, 5000000, 1.059453 469 | bit-parallel-optimized32, 64, 5000000, 0.892523 470 | harley-seal, 64, 5000000, 1.199251 471 | builtin-popcnt, 64, 5000000, 0.890113 472 | builtin-popcnt32, 64, 5000000, 1.676810 473 | builtin-popcnt-unrolled, 64, 5000000, 0.963464 474 | builtin-popcnt-unrolled32, 64, 5000000, 1.697510 475 | neon-vcnt, 64, 5000000, 2.086867 476 | neon-HS, 64, 5000000, 2.228310 477 | lookup-8, 128, 2500000, 0.863995 478 | lookup-64, 128, 2500000, 1.130855 479 | bit-parallel, 128, 2500000, 1.620394 480 | bit-parallel-optimized, 128, 2500000, 1.290870 481 | bit-parallel-mul, 128, 2500000, 1.097458 482 | bit-parallel32, 128, 2500000, 1.013715 483 | bit-parallel-optimized32, 128, 2500000, 0.863894 484 | harley-seal, 128, 2500000, 0.875836 485 | builtin-popcnt, 128, 2500000, 0.928382 486 | builtin-popcnt32, 128, 2500000, 1.639087 487 | builtin-popcnt-unrolled, 128, 2500000, 0.911314 488 | builtin-popcnt-unrolled32, 128, 2500000, 1.649550 489 | neon-vcnt, 128, 2500000, 
0.185598 490 | neon-HS, 128, 2500000, 2.048371 491 | lookup-8, 256, 1250000, 0.832477 492 | lookup-64, 256, 1250000, 1.099364 493 | bit-parallel, 256, 1250000, 1.590109 494 | bit-parallel-optimized, 256, 1250000, 1.262703 495 | bit-parallel-mul, 256, 1250000, 1.065990 496 | bit-parallel32, 256, 1250000, 1.001574 497 | bit-parallel-optimized32, 256, 1250000, 0.835667 498 | harley-seal, 256, 1250000, 0.697690 499 | builtin-popcnt, 256, 1250000, 0.913032 500 | builtin-popcnt32, 256, 1250000, 1.620353 501 | builtin-popcnt-unrolled, 256, 1250000, 0.885259 502 | builtin-popcnt-unrolled32, 256, 1250000, 1.633893 503 | neon-vcnt, 256, 1250000, 0.141795 504 | neon-HS, 256, 1250000, 0.214802 505 | lookup-8, 512, 1000000, 1.308822 506 | lookup-64, 512, 1000000, 1.736549 507 | bit-parallel, 512, 1000000, 2.531174 508 | bit-parallel-optimized, 512, 1000000, 2.008855 509 | bit-parallel-mul, 512, 1000000, 1.683009 510 | bit-parallel32, 512, 1000000, 1.577856 511 | bit-parallel-optimized32, 512, 1000000, 1.311278 512 | harley-seal, 512, 1000000, 0.975128 513 | builtin-popcnt, 512, 1000000, 1.444352 514 | builtin-popcnt32, 512, 1000000, 2.581640 515 | builtin-popcnt-unrolled, 512, 1000000, 1.405398 516 | builtin-popcnt-unrolled32, 512, 1000000, 2.592554 517 | neon-vcnt, 512, 1000000, 0.192290 518 | neon-HS, 512, 1000000, 0.279941 519 | lookup-8, 1024, 500000, 1.293953 520 | lookup-64, 1024, 500000, 1.721078 521 | bit-parallel, 1024, 500000, 2.511335 522 | bit-parallel-optimized, 1024, 500000, 1.990429 523 | bit-parallel-mul, 1024, 500000, 1.667559 524 | bit-parallel32, 1024, 500000, 1.561751 525 | bit-parallel-optimized32, 1024, 500000, 1.295284 526 | harley-seal, 1024, 500000, 0.902118 527 | builtin-popcnt, 1024, 500000, 1.438992 528 | builtin-popcnt32, 1024, 500000, 2.569992 529 | builtin-popcnt-unrolled, 1024, 500000, 1.388903 530 | builtin-popcnt-unrolled32, 1024, 500000, 2.575608 531 | neon-vcnt, 1024, 500000, 0.174334 532 | neon-HS, 1024, 500000, 0.247324 533 | lookup-8, 
2048, 250000, 1.290016 534 | lookup-64, 2048, 250000, 1.717610 535 | bit-parallel, 2048, 250000, 2.507685 536 | bit-parallel-optimized, 2048, 250000, 1.986058 537 | bit-parallel-mul, 2048, 250000, 1.664310 538 | bit-parallel32, 2048, 250000, 1.558752 539 | bit-parallel-optimized32, 2048, 250000, 1.292605 540 | harley-seal, 2048, 250000, 0.869969 541 | builtin-popcnt, 2048, 250000, 1.442570 542 | builtin-popcnt32, 2048, 250000, 2.570431 543 | builtin-popcnt-unrolled, 2048, 250000, 1.384148 544 | builtin-popcnt-unrolled32, 2048, 250000, 2.573200 545 | neon-vcnt, 2048, 250000, 0.167634 546 | neon-HS, 2048, 250000, 0.231696 547 | lookup-8, 4096, 125000, 1.284632 548 | lookup-64, 4096, 125000, 1.711656 549 | bit-parallel, 4096, 125000, 2.499731 550 | bit-parallel-optimized, 4096, 125000, 1.979046 551 | bit-parallel-mul, 4096, 125000, 1.658145 552 | bit-parallel32, 4096, 125000, 1.551612 553 | bit-parallel-optimized32, 4096, 125000, 1.284752 554 | harley-seal, 4096, 125000, 0.849656 555 | builtin-popcnt, 4096, 125000, 1.432718 556 | builtin-popcnt32, 4096, 125000, 2.564379 557 | builtin-popcnt-unrolled, 4096, 125000, 1.378419 558 | builtin-popcnt-unrolled32, 4096, 125000, 2.565781 559 | neon-vcnt, 4096, 125000, 0.162081 560 | neon-HS, 4096, 125000, 0.224106 561 | -------------------------------------------------------------------------------- /results/arm/armv7-32bit-gcc4.9.2.metadata: -------------------------------------------------------------------------------- 1 | cpu=ARMv7 2 | architecture=32-bit 3 | compiler=GCC 4.9.2 (Linux raspberrypi) 4 | runs=5 5 | 6 | 7 | -------------------------------------------------------------------------------- /results/arm/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | prog="../../speed_arm_g++ --csv" 4 | #prog="../../speed_aarch64_clang++ --csv" 5 | iters=5 6 | tmp=tmp.csv 7 | result=result.csv 8 | 9 | rm -f $tmp 10 | for i in `seq $iters` 11 | do 12 | echo "run $i" 13 
| $prog 32 10000000 | tee -a $tmp 14 | $prog 64 5000000 | tee -a $tmp 15 | $prog 128 2500000 | tee -a $tmp 16 | $prog 256 1250000 | tee -a $tmp 17 | $prog 512 1000000 | tee -a $tmp 18 | $prog 1024 500000 | tee -a $tmp 19 | $prog 2048 250000 | tee -a $tmp 20 | $prog 4096 125000 | tee -a $tmp 21 | done 22 | 23 | mv $tmp $result 24 | -------------------------------------------------------------------------------- /results/bulldozer/bulldozer-fx-8510-gcc4.8.4-sse.metadata: -------------------------------------------------------------------------------- 1 | cpu=Bulldozer FX-8150 2 | architecture=SSE 3 | compiler=GCC 4.8.4 (Ubuntu) 4 | runs=5 5 | 6 | 7 | -------------------------------------------------------------------------------- /results/bulldozer/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | prog="../../speed_g++ --csv" 4 | iters=5 5 | tmp=tmp.csv 6 | result=result.csv 7 | 8 | rm -f $tmp 9 | for i in `seq $iters` 10 | do 11 | echo "run $i" 12 | $prog 32 100000000 | tee -a $tmp 13 | $prog 64 50000000 | tee -a $tmp 14 | $prog 128 25000000 | tee -a $tmp 15 | $prog 256 12500000 | tee -a $tmp 16 | $prog 512 10000000 | tee -a $tmp 17 | $prog 1024 5000000 | tee -a $tmp 18 | $prog 2048 2500000 | tee -a $tmp 19 | $prog 4096 1250000 | tee -a $tmp 20 | done 21 | 22 | mv $tmp $result 23 | -------------------------------------------------------------------------------- /results/cannonlake/cannonlake-i3-8121U-gcc-8.3.1.metadata: -------------------------------------------------------------------------------- 1 | cpu=Cannon Lake Core i3-8121U @ 2.20 GHz 2 | cpu_url=https://ark.intel.com/content/www/us/en/ark/products/136863/intel-core-i3-8121u-processor-4m-cache-up-to-3-20-ghz.html 3 | architecture=AVX512VBMI 4 | compiler=gcc version 8.3.1 20190311 (Red Hat 8.3.1-3) 5 | runs=5 6 | -------------------------------------------------------------------------------- /results/cannonlake/run.sh: 
-------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | prog="../../speed_avx512vbmi_g++ --csv" 4 | iters=5 5 | tmp=tmp.csv 6 | result=result.csv 7 | 8 | rm -f $tmp 9 | for i in `seq $iters` 10 | do 11 | echo "run $i" 12 | $prog 32 100000000 | tee -a $tmp 13 | $prog 64 50000000 | tee -a $tmp 14 | $prog 128 25000000 | tee -a $tmp 15 | $prog 256 12500000 | tee -a $tmp 16 | $prog 512 10000000 | tee -a $tmp 17 | $prog 1024 5000000 | tee -a $tmp 18 | $prog 2048 2500000 | tee -a $tmp 19 | $prog 4096 1250000 | tee -a $tmp 20 | done 21 | 22 | mv $tmp $result 23 | -------------------------------------------------------------------------------- /results/cascadelake/cascadelake-Xeon-Gold-6240-gcc-8.3.0.metadata: -------------------------------------------------------------------------------- 1 | cpu=Cascade Lake Xeon Gold 6240 @ 2.6 GHz 2 | cpu_url=https://ark.intel.com/content/www/es/es/ark/products/192443/intel-xeon-gold-6240-processor-24-75m-cache-2-60-ghz.html 3 | architecture=AVX512BW 4 | compiler=gcc version 8.3.0 (crosstool-NG 1.24.0) 5 | runs=5 6 | 7 | -------------------------------------------------------------------------------- /results/cascadelake/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | prog=$1 4 | iters=5 5 | tmp=tmp.csv 6 | result=result.csv 7 | 8 | rm -f $tmp 9 | for i in `seq $iters` 10 | do 11 | echo "run $i" 12 | taskset -c 0 $prog --csv 32 100000000 | tee -a $tmp 13 | taskset -c 0 $prog --csv 64 50000000 | tee -a $tmp 14 | taskset -c 0 $prog --csv 128 25000000 | tee -a $tmp 15 | taskset -c 0 $prog --csv 256 12500000 | tee -a $tmp 16 | taskset -c 0 $prog --csv 512 10000000 | tee -a $tmp 17 | taskset -c 0 $prog --csv 1024 5000000 | tee -a $tmp 18 | taskset -c 0 $prog --csv 2048 2500000 | tee -a $tmp 19 | taskset -c 0 $prog --csv 4096 1250000 | tee -a $tmp 20 | done 21 | 22 | mv $tmp $result 23 | 
#!/bin/bash
#
# Build the report for every "*.metadata" file found in one results
# subdirectory.
#
# Usage: ./report.sh subdirectory
#
# For each "<name>.metadata" the script runs report.py with
# --csv=<name>.csv and --output=<name>.rst; when rst2html is available the
# resulting .rst file is additionally rendered to <name>.html.

set -e

if [[ $1 == "" ]]; then
    echo "Usage: ./report.sh subdirectory"
    exit 1
fi
DIR="$1"

if [[ ! -d "${DIR}" ]]; then
    echo "'${DIR}' is not a directory"
    exit 1
fi

# report.py is searched for relative to the current directory first, then
# one and two levels up, so the script works from several starting points.
SCRIPT=scripts/report.py
if [[ ! -f "${SCRIPT}" ]]; then
    SCRIPT="../${SCRIPT}"
fi
if [[ ! -f "${SCRIPT}" ]]; then
    SCRIPT="../${SCRIPT}"
fi
if [[ ! -f "${SCRIPT}" ]]; then
    echo "Could not locate 'report.py'"
    exit 1
fi

# Kept as an array so a script path containing spaces stays one argument.
REPORT=(python2 "${SCRIPT}")

for file in "${DIR}"/*.metadata
do
    # When nothing matches, bash leaves the pattern unexpanded; skip that
    # literal instead of handing a non-existent file name to report.py.
    [[ -e "${file}" ]] || continue

    name="${DIR}/$(basename "${file}" .metadata)"
    CSV="${name}.csv"
    RST="${name}.rst"
    HTML="${name}.html"
    "${REPORT[@]}" --csv="${CSV}" --output="${RST}"
    if type rst2html > /dev/null
    then
        rst2html "${RST}" > "${HTML}"
        echo "${HTML} was created"
    fi
done
-------------------------------------------------------------------------------- 1 | cpu=SandyBridge-E Core i7-3930k CPU @ 3.2GHz 2 | architecture=AVX 3 | compiler=GCC 5.3.0 (Ubuntu) 4 | runs=5 5 | 6 | -------------------------------------------------------------------------------- /results/skylake-x/skylake-x-w-2104-gcc8.1.0.metadata: -------------------------------------------------------------------------------- 1 | cpu=Xeon W-2104 CPU @ 3.20GHz 2 | cpu_url=https://ark.intel.com/content/www/us/en/ark/products/125039/intel-xeon-w-2104-processor-8-25m-cache-3-20-ghz.html 3 | architecture=AVX512BW 4 | compiler=gcc version 8.1.0 (Ubuntu 8.1.0-5ubuntu1~16.04) 5 | runs=5 6 | -------------------------------------------------------------------------------- /results/skylake/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | prog="../../speed_avx2_g++ --csv" 4 | iters=5 5 | tmp=tmp.csv 6 | result=result.csv 7 | 8 | rm -f $tmp 9 | for i in `seq $iters` 10 | do 11 | echo "run $i" 12 | $prog 32 100000000 | tee -a $tmp 13 | $prog 64 50000000 | tee -a $tmp 14 | $prog 128 25000000 | tee -a $tmp 15 | $prog 256 12500000 | tee -a $tmp 16 | $prog 512 10000000 | tee -a $tmp 17 | $prog 1024 5000000 | tee -a $tmp 18 | $prog 2048 2500000 | tee -a $tmp 19 | $prog 4096 1250000 | tee -a $tmp 20 | done 21 | 22 | mv $tmp $result 23 | -------------------------------------------------------------------------------- /results/skylake/skylake-i7-6700-clang3.8.0-avx2.metadata: -------------------------------------------------------------------------------- 1 | cpu=Skylake Core i7-6700 CPU @ 3.40GHz 2 | architecture=AVX2 3 | compiler=3.8.0-svn257311-1~exp1 (Ubuntu) 4 | runs=5 5 | 6 | -------------------------------------------------------------------------------- /results/skylake/skylake-i7-6700-gcc5.3.0-avx2.metadata: -------------------------------------------------------------------------------- 1 | cpu=Skylake Core i7-6700 CPU @ 
3.40GHz 2 | architecture=AVX2 3 | compiler=GCC 5.3.0 (Ubuntu) 4 | runs=5 5 | 6 | -------------------------------------------------------------------------------- /results/westmere/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | prog="../../speed_g++ --csv" 4 | iters=5 5 | tmp=tmp.csv 6 | result=result.csv 7 | 8 | rm -f $tmp 9 | for i in `seq $iters` 10 | do 11 | echo "run $i" 12 | $prog 32 100000000 | tee -a $tmp 13 | $prog 64 50000000 | tee -a $tmp 14 | $prog 128 25000000 | tee -a $tmp 15 | $prog 256 12500000 | tee -a $tmp 16 | $prog 512 10000000 | tee -a $tmp 17 | $prog 1024 5000000 | tee -a $tmp 18 | $prog 2048 2500000 | tee -a $tmp 19 | $prog 4096 1250000 | tee -a $tmp 20 | done 21 | 22 | mv $tmp $result 23 | -------------------------------------------------------------------------------- /results/westmere/westmere-m540-gcc4.9.2-sse.metadata: -------------------------------------------------------------------------------- 1 | cpu=Core i5 M540 @ 2.53GHz 2 | architecture=SSE 3 | compiler=GCC 4.9.2 (Debian 4.9.2-10) 4 | runs=5 5 | 6 | -------------------------------------------------------------------------------- /scripts/.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | -------------------------------------------------------------------------------- /scripts/avx512vbmi-lookups.py: -------------------------------------------------------------------------------- 1 | def popcnt(x): 2 | n = 0 3 | while x > 0: 4 | n += x & 1 5 | x >>= 1 6 | 7 | return n 8 | 9 | 10 | v = [popcnt(x) for x in range(128)] 11 | print(v) 12 | s = ['%02x' % x for x in v] 13 | 14 | r = [] 15 | for i in range(0, 128, 8): 16 | qword = s[i:i+8] 17 | r.append('0x%s' % ''.join(reversed(qword))) 18 | 19 | print('_mm512_setr_epi64(%s)' % (', '.join(r[0:8]))) 20 | print('_mm512_setr_epi64(%s)' % (', '.join(r[8:16]))) 21 | 22 | 
from collections import OrderedDict


class Measurments(object):
    """Every timing recorded for one (procedure, size) pair."""

    def __init__(self, value):
        self.values = [value]

    def add_measurment(self, value):
        """Record the timing of one more run."""
        self.values.append(value)

    def value(self):
        """Return the representative timing: the smallest one recorded."""
        return min(self.values)

    def __len__(self):
        return len(self.values)


class Cell:
    """One result entry: a procedure, an input size and its best time."""

    def __init__(self, proc, size, measurments):
        self.procedure = proc
        self.size = size
        self.time = measurments.value()


class ExperimentData(object):
    """Timings parsed from a benchmark CSV, indexed by procedure and size.

    Attributes filled in by the constructor:
      procedures -- procedure names, in order of first appearance
      sizes      -- sorted list of the distinct input sizes
      data       -- maps (procedure, size) to its Measurments object
    """

    def __init__(self, csv_file):
        """Load csv_file, any iterable that yields the CSV's text lines."""

        self.procedures = [] # order of procedures is crucial
        self.sizes = set()
        self.data = {} # (procedure, size) => measurments

        self.__load_csv(csv_file)
        self.sizes = sorted(self.sizes) # now a list!


    def get(self, procedure, size):
        """Return the best time for (procedure, size); KeyError if absent."""
        return self.data[(procedure, size)].value()


    def data_for_procedure(self, procedure):
        """Return one Cell per size (ascending) for the given procedure."""

        assert procedure in self.procedures

        result = []
        for size in self.sizes:
            key = (procedure, size)
            cell = Cell(procedure, size, self.data[key])
            result.append(cell)

        return result


    def data_for_size(self, size):
        """Return one Cell per procedure (file order) for the given size."""

        result = []
        for procedure in self.procedures:
            key = (procedure, size)
            cell = Cell(procedure, size, self.data[key])
            result.append(cell)

        return result


    def get_shortest_time(self, size):
        """Return the smallest time any procedure achieved for this size."""
        return min(item.time for item in self.data_for_size(size))


    def __load_csv(self, csv_file):
        """Fill procedures, sizes and data from the lines of csv_file.

        Each non-blank line must hold four comma- and/or whitespace-separated
        fields: procedure name, input size, a third field that is ignored,
        and the measured time. Repeated (procedure, size) rows accumulate in
        one Measurments object. A non-blank line with a different number of
        fields still raises ValueError.
        """
        for line in csv_file:

            fields = line.replace(',', ' ').split()
            if not fields:
                # Blank line (for instance a trailing newline at the end of
                # the file): nothing to record, and unpacking would fail.
                continue

            proc, size, _, time = fields
            size = int(size)
            time = float(time)

            if proc not in self.procedures:
                self.procedures.append(proc)

            self.sizes.add(size)

            key = (proc, size)

            if key in self.data:
                self.data[key].add_measurment(time)
            else:
                self.data[key] = Measurments(time)
def load_metadata(file):
    """Parse ``key=value`` lines into a dict.

    ``file`` is any iterable of text lines.  Each line is split at its first
    ``=`` only, so a value may itself contain ``=`` (for instance a URL with a
    query string).  Keys and values are stripped of surrounding whitespace.
    Lines without ``=`` are ignored; when a key repeats, the last value wins.
    """
    d = {}
    for line in file:
        key, separator, value = line.partition("=")
        if not separator:
            # not a key=value line (blank line, stray text)
            continue

        d[key.strip()] = value.strip()

    return d
| 19 | -------------------------------------------------------------------------------- /scripts/report.py: -------------------------------------------------------------------------------- 1 | import os 2 | import os.path 3 | import data 4 | from table import Table 5 | from metadata import load_metadata 6 | from codecs import open 7 | from collections import OrderedDict 8 | from function_registry import FunctionRegistry 9 | 10 | 11 | TIME_PATTERN = '%0.5f' 12 | 13 | class Report(object): 14 | def __init__(self, options): 15 | 16 | self.options = options 17 | 18 | csv_path = options.input 19 | metadata_path = os.path.splitext(csv_path)[0] + ".metadata" 20 | 21 | with open(csv_path, 'rt') as f: 22 | self.data = data.ExperimentData(f) 23 | 24 | with open(metadata_path, 'rt') as f: 25 | self.metadata = load_metadata(f) 26 | 27 | 28 | def generate_rest(self): 29 | 30 | if 'cpu_url' in self.metadata: 31 | cpu = '`%s <%s>`_' % (self.metadata['cpu'], self.metadata['cpu_url']) 32 | else: 33 | cpu = self.metadata['cpu'] 34 | 35 | params = { 36 | 'CSV_FILE' : os.path.basename(self.options.input), 37 | 'ARCHITECTURE' : self.metadata["architecture"], 38 | 'RUNS' : self.metadata["runs"], 39 | 'CPU' : self.metadata["cpu"], 40 | 'CPU_DETAILS' : cpu, 41 | 'COMPILER' : self.metadata["compiler"], 42 | 'DATE' : self.options.date, 43 | 'PROCEDURES' : self.generate_procedures_descriptions(), 44 | 'TIME_TABLE' : self.generate_time_table(), 45 | 'TIME_GRAPHS' : self.generate_time_graphs_per_size(), 46 | 'SPEEDUP_TABLE' : self.generate_speedup_table(), 47 | } 48 | 49 | pattern = self._load_file('main-pattern.rst') 50 | 51 | return pattern % params 52 | 53 | 54 | def generate_time_table(self): 55 | 56 | table = Table() 57 | 58 | # prepare header 59 | header = ["procedure"] 60 | for size in self.data.sizes: 61 | header.append('%d B' % size) 62 | 63 | table.set_header(header) 64 | 65 | # get data 66 | for procedure in self.data.procedures: 67 | data = self.data.data_for_procedure(procedure) 
68 | row = [procedure] 69 | 70 | for item in data: 71 | fmt = TIME_PATTERN % item.time 72 | if item.time == self.data.get_shortest_time(item.size): 73 | row.append('**%s**' % fmt) 74 | else: 75 | row.append(fmt) 76 | 77 | table.add_row(row) 78 | 79 | return table 80 | 81 | 82 | def generate_time_graphs_per_size(self): 83 | 84 | pattern = self._load_file('detail-pattern.rst') 85 | 86 | result = '' 87 | 88 | for size in self.data.sizes: 89 | params = { 90 | 'SIZE' : size, 91 | 'TABLE' : self.generate_time_table_for_size(size), 92 | } 93 | 94 | result += pattern % params 95 | 96 | return result 97 | 98 | 99 | def generate_time_table_for_size(self, size): 100 | 101 | table = Table() 102 | table.set_header(["procedure", "time [s]", "relative time (less is better)"]) 103 | 104 | chars = 50 105 | 106 | data = self.data.data_for_size(size) 107 | max_time = max(item.time for item in data) 108 | 109 | for item in data: 110 | time = TIME_PATTERN % item.time 111 | bar = unicode_bar(item.time/max_time, chars) 112 | table.add_row([item.procedure, time, bar]) 113 | 114 | return table 115 | 116 | 117 | def generate_speedup_table(self): 118 | 119 | table = Table() 120 | 121 | # prepare header 122 | header = ["procedure"] 123 | for size in self.data.sizes: 124 | header.append('%d B' % size) 125 | 126 | table.set_header(header) 127 | 128 | reference_time = {} 129 | for size in self.data.sizes: 130 | time = self.data.get(self.data.procedures[0], size) 131 | reference_time[size] = time 132 | 133 | # get data 134 | for proc in self.data.procedures: 135 | measurments = self.data.data_for_procedure(proc) 136 | 137 | row = [proc] 138 | for item in measurments: 139 | speedup = reference_time[item.size] / item.time 140 | row.append('%0.2f' % speedup) 141 | 142 | table.add_row(row) 143 | 144 | return table 145 | 146 | 147 | def generate_procedures_descriptions(self): 148 | 149 | reg = FunctionRegistry() 150 | 151 | table = Table() 152 | header = ["procedure", "description"] 153 | 
def unicode_bar(value, width):
    """Return a horizontal bar drawn with Unicode block characters.

    ``value`` must lie in [0.0, 1.0] (asserted); it is the fraction of
    ``width`` character cells to fill.  The bar has a resolution of 1/8 of a
    cell: whole cells use the full block, the remainder one partial block.
    """
    fractions = (
        u'',        # 0 - empty
        u'\u258f',  # 1/8
        u'\u258e',  # 2/8
        u'\u258d',  # 3/8
        u'\u258c',  # 4/8
        u'\u258b',  # 5/8
        u'\u258a',  # 6/8
        u'\u2589',  # 7/8
    )

    block = u'\u2588'

    assert 0.0 <= value <= 1.0

    # divmod uses floor division, so the number of full cells stays an
    # integer regardless of how "/" behaves for ints in the running Python
    full_cells, eighths = divmod(int(value * width * 8), 8)

    return block * full_cells + fractions[eighths]
class RestructuredTextTableRenderer(object):
    """Renders a table object (anything exposing ``header`` and ``rows``)
    as a reStructuredText grid table."""

    def __init__(self, table):
        self.table = table
        self.widths = self._calculate_widths()
        self.padding = 1


    def get_header(self):
        return self.table.header


    def get_rows(self):
        return self.table.rows


    def _calculate_widths(self):
        """Return, per column, the length of its longest cell (header included)."""

        header = self.get_header()
        widths = [0] * len(header)

        for cells in [header] + self.get_rows():
            for column, cell in enumerate(cells):
                cell_width = len(cell)
                if cell_width > widths[column]:
                    widths[column] = cell_width

        return widths


    def _render_separator(self, fill):
        """Return a horizontal rule drawn with the single character ``fill``."""

        assert len(fill) == 1

        extra = 2 * self.padding
        segments = [fill * (width + extra) for width in self.widths]

        return '+' + ''.join(segment + '+' for segment in segments)


    def _render_row(self, row):
        """Return one table line with every cell left-justified in its column."""

        pad = ' ' * self.padding
        cells = ['%s%-*s%s|' % (pad, width, image, pad)
                 for image, width in zip(row, self.widths)]

        return '|' + ''.join(cells)


    def get_image(self): # rest = RestructuredText
        """Return the whole table as a single newline-joined string."""

        rule = self._render_separator('-')

        lines = [
            rule,
            self._render_row(self.get_header()),
            self._render_separator('='),
        ]

        for row in self.get_rows():
            lines += [self._render_row(row), rule]

        return '\n'.join(lines)
RestructuredTextTableRenderer(self) 96 | return renderer.get_image() 97 | 98 | 99 | if __name__ == '__main__': 100 | 101 | table = Table() 102 | 103 | table.set_header(["procedure", "size", "time"]) 104 | table.add_row(["foo", "100", "0.5"]) 105 | table.add_row(["bar", "105", "1.5"]) 106 | table.add_row(["baz", "111", "0.2"]) 107 | 108 | print table 109 | 110 | -------------------------------------------------------------------------------- /speed.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #include 5 | #include 6 | #include 7 | 8 | // -------------------------------------------------- 9 | 10 | #include "config.h" 11 | #include "popcnt-all.cpp" 12 | #include "function_registry.cpp" 13 | 14 | // -------------------------------------------------- 15 | 16 | 17 | class Error final { 18 | std::string message; 19 | 20 | public: 21 | Error(const std::string& msg) : message(msg) {} 22 | 23 | public: 24 | const char* c_str() const { 25 | return message.c_str(); 26 | } 27 | }; 28 | 29 | // -------------------------------------------------- 30 | 31 | class CommandLine final { 32 | 33 | public: 34 | bool print_help; 35 | bool print_csv; 36 | size_t size; 37 | size_t iteration_count; 38 | std::string executable; 39 | std::set functions; 40 | 41 | public: 42 | CommandLine(int argc, char* argv[], const FunctionRegistry& names); 43 | }; 44 | 45 | // -------------------------------------------------- 46 | 47 | CommandLine::CommandLine(int argc, char* argv[], const FunctionRegistry& names) 48 | : print_help(false) 49 | , print_csv(false) 50 | , size(0) 51 | , iteration_count(0) { 52 | 53 | int positional = 0; 54 | for (int i=1; i < argc; i++) { 55 | const std::string arg = argv[i]; 56 | 57 | if (arg == "--help" || arg == "-h") { 58 | print_help = true; 59 | return; 60 | } 61 | 62 | if (arg == "--csv") { 63 | print_csv = true; 64 | continue; 65 | } 66 | 67 | // positional arguments 68 | if (positional == 0) { 
69 | int tmp = std::atoi(arg.c_str()); 70 | if (tmp <= 0) { 71 | throw Error("Size must be greater than 0."); 72 | } 73 | 74 | if (tmp % 32 != 0) { 75 | throw Error("Size must be divisible by 32."); 76 | } 77 | 78 | size = tmp; 79 | 80 | } else if (positional == 1) { 81 | 82 | int tmp = std::atoi(arg.c_str()); 83 | if (tmp <= 0) { 84 | throw Error("Iteration count must be greater than 0."); 85 | } 86 | 87 | iteration_count = tmp; 88 | } else { 89 | if (names.has(arg)) { 90 | functions.insert(std::move(arg)); 91 | } else { 92 | throw Error("'" + arg + "' is not valid function name"); 93 | } 94 | } 95 | 96 | positional += 1; 97 | } 98 | 99 | if (positional < 2) { 100 | print_help = true; 101 | } 102 | } 103 | 104 | 105 | // -------------------------------------------------- 106 | 107 | class Application final { 108 | 109 | const CommandLine& cmd; 110 | const FunctionRegistry& names; 111 | std::uint8_t* data __attribute__((aligned(64))); 112 | 113 | uint64_t count; 114 | double time; 115 | 116 | struct Result { 117 | uint64_t count; 118 | double time; 119 | }; 120 | 121 | 122 | public: 123 | Application(const CommandLine& cmdline, const FunctionRegistry& names); 124 | ~Application(); 125 | 126 | int run(); 127 | 128 | private: 129 | void print_help(); 130 | void run_procedures(); 131 | void run_procedure(const std::string& name); 132 | 133 | template 134 | Result run(const std::string& name, FN function, double reference); 135 | }; 136 | 137 | 138 | Application::Application(const CommandLine& cmdline, const FunctionRegistry& names) 139 | : cmd(cmdline) 140 | , names(names) 141 | , data(nullptr) {} 142 | 143 | 144 | int Application::run() { 145 | 146 | if (cmd.print_help) { 147 | print_help(); 148 | } else { 149 | run_procedures(); 150 | } 151 | 152 | return 0; 153 | } 154 | 155 | void* aligned_alloc_crossplatform(size_t align, size_t size) { 156 | #ifdef _MSC_VER 157 | return _aligned_malloc(size, align); 158 | #elif defined(__cplusplus) && __cplusplus >= 201703L 159 
| // C++17 standard compliant compilers (GCC, Clang, and others supporting C++17) 160 | // Note: size must be an integral multiple of alignment. 161 | if (size % align != 0) { 162 | size = ((size / align) + 1) * align; // Adjust size to be a multiple of align 163 | } 164 | return std::aligned_alloc(align, size); 165 | #else 166 | // Fallback for non-C++17 compilers or different environments 167 | void* ptr = nullptr; 168 | if (posix_memalign(&ptr, align, size) != 0) { 169 | return nullptr; 170 | } 171 | return ptr; 172 | #endif 173 | } 174 | 175 | void Application::run_procedures() { 176 | 177 | data = reinterpret_cast(aligned_alloc_crossplatform(64, cmd.size)); 178 | if (!data) { 179 | throw std::bad_alloc(); 180 | } 181 | 182 | 183 | for (size_t i=0; i < cmd.size; i++) { 184 | data[i] = i; 185 | } 186 | 187 | count = 0; 188 | time = 0; 189 | 190 | if (!cmd.functions.empty()) { 191 | for (const auto& name: cmd.functions) { 192 | run_procedure(name); 193 | } 194 | } else { 195 | for (const auto& name: names.get_available()) { 196 | run_procedure(name); 197 | } 198 | } 199 | } 200 | 201 | Application::~Application() { 202 | free(data); 203 | } 204 | 205 | void Application::run_procedure(const std::string& name) { 206 | 207 | #define RUN(function_name, function) \ 208 | if (name == function_name) { \ 209 | auto result = run(name, function, time); \ 210 | count += result.count; \ 211 | if (time == 0.0) { \ 212 | time = result.time; \ 213 | } \ 214 | } 215 | 216 | RUN("lookup-8", popcnt_lookup_8bit) 217 | RUN("lookup-64", popcnt_lookup_64bit); 218 | RUN("bit-parallel", popcnt_parallel_64bit_naive); 219 | RUN("bit-parallel-optimized", popcnt_parallel_64bit_optimized); 220 | RUN("bit-parallel-optimized2", popcnt_parallel_64bit_optimized2); 221 | RUN("bit-parallel-mul", popcnt_parallel_64bit_mul); 222 | RUN("bit-parallel32", popcnt_parallel_32bit_naive); 223 | RUN("bit-parallel-optimized32", popcnt_parallel_32bit_optimized); 224 | RUN("harley-seal", popcnt_harley_seal); 
225 | #if defined(HAVE_SSE_INSTRUCTIONS) 226 | RUN("sse-bit-parallel", popcnt_SSE_bit_parallel); 227 | RUN("sse-bit-parallel-original", popcnt_SSE_bit_parallel_original); 228 | RUN("sse-bit-parallel-better", popcnt_SSE_bit_parallel_better); 229 | RUN("sse-harley-seal", popcnt_SSE_harley_seal); 230 | RUN("sse-lookup", popcnt_SSE_lookup); 231 | RUN("sse-lookup-original", popcnt_SSE_lookup_original); 232 | RUN("sse-cpu", popcnt_SSE_and_cpu); 233 | #endif 234 | 235 | #if defined(HAVE_AVX2_INSTRUCTIONS) 236 | RUN("avx2-lookup", popcnt_AVX2_lookup); 237 | RUN("avx2-lookup-original", popcnt_AVX2_lookup_original); 238 | RUN("avx2-harley-seal", popcnt_AVX2_harley_seal); 239 | RUN("avx2-cpu", popcnt_AVX2_and_cpu); 240 | #endif 241 | 242 | #if defined(HAVE_AVX512BW_INSTRUCTIONS) 243 | RUN("avx512-harley-seal", popcnt_AVX512_harley_seal); 244 | RUN("avx512bw-shuf", popcnt_AVX512BW_lookup_original); 245 | #endif 246 | 247 | #if defined(HAVE_AVX512VBMI_INSTRUCTIONS) 248 | RUN("avx512vbmi-shuf", popcnt_AVX512VBMI_lookup); 249 | #endif 250 | 251 | #if defined(HAVE_AVX512VPOPCNT_INSTRUCTIONS) 252 | RUN("avx512-vpopcnt", popcnt_AVX512_vpopcnt); 253 | #endif 254 | 255 | #if defined(HAVE_POPCNT_INSTRUCTION) 256 | RUN("cpu", popcnt_cpu_64bit); 257 | #endif 258 | 259 | #if defined(HAVE_NEON_INSTRUCTIONS) 260 | RUN("neon-vcnt", popcnt_neon_vcnt); 261 | RUN("neon-HS", popcnt_neon_harley_seal); 262 | #endif 263 | 264 | #if defined(HAVE_AARCH64_ARCHITECTURE) 265 | RUN("aarch64-cnt", popcnt_aarch64_cnt); 266 | #endif 267 | 268 | #if defined(HAVE_RVV_INSTRUCTIONS) 269 | RUN("rvv-1", popcnt_rvv_lookup); 270 | #endif 271 | 272 | #define RUN_BUILTIN(function_name, function) \ 273 | { \ 274 | auto wrapper = [](const uint8_t* data, size_t size) { \ 275 | return function(reinterpret_cast(data), size/8); \ 276 | }; \ 277 | RUN(function_name, wrapper); \ 278 | } 279 | 280 | RUN_BUILTIN("builtin-popcnt", builtin_popcnt); 281 | RUN_BUILTIN("builtin-popcnt32", builtin_popcnt32); 282 | 
RUN_BUILTIN("builtin-popcnt-unrolled", builtin_popcnt_unrolled); 283 | RUN_BUILTIN("builtin-popcnt-unrolled32", builtin_popcnt_unrolled32); 284 | #if defined(HAVE_POPCNT_INSTRUCTION) 285 | RUN_BUILTIN("builtin-popcnt-unrolled-errata", builtin_popcnt_unrolled_errata); 286 | RUN_BUILTIN("builtin-popcnt-unrolled-errata-manual", builtin_popcnt_unrolled_errata_manual); 287 | RUN_BUILTIN("builtin-popcnt-movdq", builtin_popcnt_movdq); 288 | RUN_BUILTIN("builtin-popcnt-movdq-unrolled", builtin_popcnt_movdq_unrolled); 289 | RUN_BUILTIN("builtin-popcnt-movdq-unrolled_manual", builtin_popcnt_movdq_unrolled_manual); 290 | #endif 291 | } 292 | 293 | 294 | template 295 | Application::Result Application::run(const std::string& name, FN function, double reference) { 296 | 297 | Result result; 298 | 299 | if (cmd.print_csv) { 300 | printf("%s, %zu, %zu, ", name.c_str(), cmd.size, cmd.iteration_count); 301 | fflush(stdout); 302 | } else { 303 | const auto& dsc = names.get(name); 304 | printf("%*s ... ", -names.get_widest_name(), dsc.name.c_str()); 305 | fflush(stdout); 306 | } 307 | 308 | size_t n = 0; 309 | size_t k = cmd.iteration_count; 310 | 311 | const auto t1 = std::chrono::high_resolution_clock::now(); 312 | while (k-- > 0) { 313 | n += function(data, cmd.size); 314 | } 315 | 316 | const auto t2 = std::chrono::high_resolution_clock::now(); 317 | 318 | const std::chrono::duration td = t2-t1; 319 | 320 | if (cmd.print_csv) { 321 | printf("%0.6f\n", td.count()); 322 | } else { 323 | //printf("reference result = %lu, time = %0.6f s", n, td.count()); 324 | printf("time = %0.6f s", td.count()); 325 | 326 | if (reference > 0.0) { 327 | const auto speedup = reference/td.count(); 328 | 329 | printf(" (speedup: %3.2f)", speedup); 330 | } 331 | 332 | printf("\n"); 333 | } 334 | 335 | result.count = n; // to prevent compiler from optimizing out the loop 336 | result.time = td.count(); 337 | 338 | fflush(stdout); 339 | 340 | return result; 341 | } 342 | 343 | void 
Application::print_help() { 344 | std::printf("usage: %s [--csv] buffer_size iteration_count [function(s)]\n", cmd.executable.c_str()); 345 | std::puts(""); 346 | std::puts("--csv - print results in CVS format:"); 347 | std::puts(" function name, buffer_size, iteration_count, time"); 348 | std::puts(""); 349 | std::puts("1. buffer_size - size of buffer in bytes"); 350 | std::puts("2. iteration_count - as the name states"); 351 | 352 | std::puts("3. one or more functions (if not given all will run):"); 353 | 354 | const int w = names.get_widest_name(); 355 | for (const auto& item: names.get_functions()) { 356 | 357 | std::printf(" * %*s - %s\n", -w, item.name.c_str(), item.help.c_str()); 358 | } 359 | } 360 | 361 | 362 | int main(int argc, char* argv[]) { 363 | 364 | try { 365 | FunctionRegistry names; 366 | CommandLine cmd(argc, argv, names); 367 | Application app(cmd, names); 368 | 369 | return app.run(); 370 | } catch (Error& e) { 371 | puts(e.c_str()); 372 | 373 | return EXIT_FAILURE; 374 | } 375 | 376 | return EXIT_SUCCESS; 377 | } 378 | 379 | -------------------------------------------------------------------------------- /sse_operators.cpp: -------------------------------------------------------------------------------- 1 | struct sse_vector final { 2 | union { 3 | __m128i v; 4 | uint8_t u8[16]; 5 | uint16_t u16[8]; 6 | uint32_t u32[4]; 7 | uint64_t u64[2]; 8 | }; 9 | 10 | sse_vector() = delete; 11 | sse_vector(sse_vector&) = delete; 12 | 13 | explicit sse_vector(const __m128i& vec): v(vec) {} 14 | }; 15 | 16 | 17 | __m128i operator&(sse_vector a, sse_vector b) { 18 | 19 | return _mm_and_si128(a.v, b.v); 20 | } 21 | 22 | 23 | __m128i operator|(sse_vector a, sse_vector b) { 24 | 25 | return _mm_or_si128(a.v, b.v); 26 | } 27 | 28 | 29 | __m128i operator^(sse_vector a, sse_vector b) { 30 | 31 | return _mm_xor_si128(a.v, b.v); 32 | } 33 | 34 | 35 | struct shift16 final { 36 | const unsigned bits; 37 | 38 | shift16() = delete; 39 | explicit shift16(unsigned bits) 
// Prints `str` followed by a newline.  When HAVE_ANSI_CONSOLE is enabled the
// text is wrapped in the ANSI escape sequence selecting `ansi_color` and the
// attributes are reset afterwards; otherwise the color argument is ignored.
#if HAVE_ANSI_CONSOLE
void puts(const char* str, int ansi_color) {
    static const char colored_line[] = "\033[%dm%s\033[0m\n";

    printf(colored_line, ansi_color, str);
}
#else
void puts(const char* str, int /* ansi_color */) {
    ::puts(str);
}
#endif // HAVE_ANSI_CONSOLE
// Checks every registered, non-trusted implementation against the current
// contents of `data`.
//
// The expected bit count is computed with popcnt_lookup_8bit; each remaining
// implementation is run on the same buffer and its result compared with it.
// Prints "OK" or "ERROR" per implementation and sets `failed` on any mismatch.
//
// name - label of the test pattern, used only in the printed header.
void Application::verify(const char* name) {

    // passed negated to printf below, which left-justifies the names
    const int w = names.get_widest_name();

    puts("");
    printf("test '%s':\n", name);

    // the value all other implementations have to reproduce
    const size_t reference = popcnt_lookup_8bit(data, size);

    for (auto& item: names.get_functions()) {
        // entries marked as trusted are not verified
        if (item.is_trusted) {
            continue;
        }

        printf("%*s : ", -w, item.name.c_str());
        size_t result;
        if (item.function) {
            result = item.function(data, size);
        } else {
            // NOTE(review): presumably function_64 expects a count of 64-bit
            // words, hence size/8 - confirm against function_registry.cpp
            result = item.function_64(reinterpret_cast(data), size/8);
        }

        if (result == reference) {
            puts("OK", GREEN);
        } else {
            puts("ERROR", RED);
            printf("result = %zu, reference = %zu\n", result, reference);
            failed = true;
        }
    }
}