├── .travis.yml
├── LICENSE
├── makefile
├── README.md
├── turbohist.c
├── time_.h
├── conf.h
├── sse_neon.h
└── turbohist_.c
/.travis.yml: -------------------------------------------------------------------------------- 1 | language: c 2 | 3 | compiler: 4 | - gcc 5 | - clang 6 | 7 | branches: 8 | only: 9 | - master 10 | 11 | script: 12 | - make 13 | - ./turbohist 14 | 15 | matrix: 16 | include: 17 | - name: Linux arm 18 | os: linux 19 | arch: arm64 20 | compiler: gcc 21 | 22 | - name: Windows-MinGW 23 | os: windows 24 | script: 25 | - mingw32-make 26 | - ./turbohist 27 | 28 | - name: macOS, xcode 29 | os: osx 30 | 31 | # - name: Linux amd64 32 | # os: linux 33 | # arch: amd64 34 | # - name: Power ppc64le 35 | # os: linux-ppc64le 36 | # compiler: gcc 37 | 38 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2016-2019, Powturbo 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions are 6 | met: 7 | 8 | 1. Redistributions of source code must retain the above copyright 9 | notice, this list of conditions and the following disclaimer. 10 | 11 | 2. Redistributions in binary form must reproduce the above copyright 12 | notice, this list of conditions and the following disclaimer in the 13 | documentation and/or other materials provided with the distribution. 14 | 15 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS 16 | IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 17 | TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A 18 | PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 19 | HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 20 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED 21 | TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 23 | LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 24 | NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
26 | 27 | - homepage : https://sites.google.com/site/powturbo/ 28 | - github : https://github.com/powturbo 29 | - twitter : https://twitter.com/powturbo 30 | - email : powturbo [_AT_] gmail [_DOT_] com 31 | -------------------------------------------------------------------------------- /makefile: -------------------------------------------------------------------------------- 1 | # powturbo (c) Copyright 2013-2022 2 | # Download or clone Turbo-Histogram: 3 | # git clone git://github.com/powturbo/Turbo-Histogram.git 4 | 5 | #uncomment to enable 6 | #https://github.com/nkurz/countbench (inline assembly) 7 | #COUNTBENCH=1 8 | #https://gist.github.com/rygorous/a86a5cf348922cdea357c928e32fc7e0 (delete/comment or rename main) 9 | #RYG=1 10 | # timer: rdtsc cycles/byte or wall time in MB/s 11 | #RDTSC=1 12 | #AVX2=1 13 | 14 | #------------------------------------------------------------------------------------- 15 | CC ?= gcc 16 | CXX ?= g++ 17 | #CC=clang 18 | #CXX=clang++ 19 | ASM ?= nasm 20 | 21 | OPT=-fstrict-aliasing 22 | ifeq (,$(findstring clang, $(CC))) 23 | OPT+=-falign-loops 24 | endif 25 | 26 | #------- OS/ARCH ------------------- 27 | ifneq (,$(filter Windows%,$(OS))) 28 | OS := Windows 29 | # CC=gcc 30 | # CXX=g++ 31 | ARCH=x86_64 32 | LDFLAGS+=-Wl,--stack,8194304 33 | FASM=win64 34 | else 35 | OS := $(shell uname -s) 36 | ARCH := $(shell uname -m) 37 | FASM=elf64 38 | 39 | ifneq (,$(findstring aarch64,$(CC))) 40 | ARCH = aarch64 41 | else ifneq (,$(findstring powerpc64le,$(CC))) 42 | ARCH = ppc64le 43 | endif 44 | endif 45 | 46 | ifeq ($(ARCH),ppc64le) 47 | _SSE=-D__SSSE3__ 48 | MARCH=-mcpu=power9 -mtune=power9 $(_SSE) 49 | else ifeq ($(ARCH),aarch64) 50 | MARCH+=-march=armv8-a 51 | ifneq (,$(findstring clang, $(CC))) 52 | MARCH+=-march=armv8-a 53 | OPT+=-fomit-frame-pointer 54 | else 55 | MARCH+=-march=armv8-a 56 | endif 57 | SSE=-march=armv8-a 58 | else ifeq ($(ARCH),$(filter $(ARCH),x86_64)) 59 | LDFLAG+=-lm 60 | # set minimum arch sandy bridge SSE4.1 + AVX 61 | _SSE=-march=corei7-avx -mtune=corei7-avx 62 | # SSE+=-mno-avx -mno-aes 63 | _AVX2=-march=haswell 64 | # CFLAGS=$(SSE) 65 | # CFLAGS=$(AVX2) 66 | endif 67 | 68 | ifeq ($(AVX2),1) 69 | MARCH=$(_AVX2) 70 | else 71 | MARCH=$(_SSE) 72 | endif 73 | 74 | CFLAGS+=$(MARCH) -w $(OPT) 75 | ifeq ($(STATIC),1) 76 | LDFLAGS+=-static 77 | endif 78 | 79 | ifeq ($(RDTSC),1) 80 | CFLAGS+=-D_RDTSC 81 | endif 82 | 83 | ifeq ($(COUNTBENCH),1) 84 | CFLAGS+=-D_COUNTBENCH 85 | endif 86 | 87 | ifeq ($(RYG),1) 88 | CFLAGS+=-D_RYG 89 | ASMLIB=histo_asm.o 90 | endif 91 | 92 | all: turbohist 93 | 94 | histo_asm.o: histo_asm.nas 95 | $(ASM) -f $(FASM) histo_asm.nas -o histo_asm.o 96 | 97 | turbohist: turbohist.o $(ASMLIB) 98 | $(CC) $^ $(LDFLAGS) -o turbohist 99 | 100 | .c.o: 101 | $(CC) -O3 $(CFLAGS) $< -c -o $@ 102 | 103 | 104 | ifeq ($(OS),Windows) 105 | clean: 106 | del /S *.o 107 | # del /S *.exe 108 | else 109 | clean: 110 | find . -name "turbohist" -type f -delete 111 | find . -name "*.o" -type f -delete 112 | find . -name "*~" -type f -delete 113 | find . 
-name "core" -type f -delete 114 | endif 115 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | TurboHist: Fastest Histogram Construction 2 | ========================================= 3 | 4 | - **~0.18 - 0.90 cycles per byte** 5 | - 100% C (C++ compatible header) without inline assembly 6 | - Both 32 and 64 bits supported 7 | - Portable scalar functions faster than SIMD functions 8 | - **Up to 22 times** faster than naive solution 9 | - :new: (2022.01) more faster, beats even other very fast assembler functions 10 | 11 | # Benchmark: 12 | - Single thread 13 | - Realistic and practical benchmark with large files. 14 | - No PURE cache benchmark 15 | 16 | #### - Uniform/Skewed distribution: 17 | - Uniform: [enwik9](http://mattmahoney.net/dc/text.html) 18 | - Skewed: enwik9 bwt generated w. libdivsufsort 19 | - 1GB zeros 20 | - Accurate benchmarking with command "turbohist file -I15" 21 | 22 | ###### Benchmark Intel CPU: i7-9700K 3.6GHz gcc 11.2 23 | Uniform distribution - enwik9 Text file, size=1.000.0000.000 24 | | Function | MB/s |Cycle/Byte|Language |Package | 25 | |----------------------------|-------:|---------:|----------|---------| 26 | | 1:hist_1_8 naiv 8 bits| 2761.01|1.3423 |C |TurboHist| 27 | | 2:hist_4_8 4 bins/ 8 bits| 2725.92|1.3249|C|TurboHist| 28 | | 3:hist_8_8 8 bins/ 8 bits| 2850.05|1.2627|C|TurboHist| 29 | | 4:hist_4_32 4 bins/32 bits| 3691.02|0.9660|C|TurboHist| 30 | | 5:hist_8_32 8 bins/32 bits| 3867.26|0.9561|C|TurboHist| 31 | | 6:hist_4_64 4 bins/64 bits|4040.55|0.9103|C|TurboHist| 32 | | 7:hist_8_64 8 bins/64 bits|**4053.37**|**0.9035**|C|TurboHist| 33 | | 8:histr_4_64 4/64+run | 3915.85|0.9668|C|TurboHist| 34 | | 9:histr_8_64 8/64+run | 3916.51|0.9286|C|TurboHist| 35 | |10:hist_4_128 4 bins/sse4.1 | 3643.20|1.0081|C|TurboHist| 36 | |11:hist_8_128 8 bins/sse4.1 | 3607.06|0.9845|C|TurboHist| 37 | |12:hist_4_256 4 bins/avx2 | 3522.27|1.0195|C|TurboHist| 38 | |13:hist_8_256 8 bins/avx2 | 3542.25|1.0366|C|TurboHist| 39 | |15:hist_8_64asm inline asm |**4161.87**|**0.8787**|inline asm|TurboHist| 40 | |18:count2x64 inline asm | 3963.91|0.9172|inline asm|Countbench| 41 | |20:histo_ref | 2702.57|1.3567|C|[Ryg](https://gist.github.com/rygorous/a86a5cf348922cdea357c928e32fc7e0)| 42 | |21:histo_cpp_1x | 1876.13|1.8236|C|Ryg| 43 | |22:histo_cpp_2x | 2664.78|1.5935|C|Ryg| 44 | |23:histo_cpp_4x | 2817.77|1.2944|C|Ryg| 45 | |24:histo_asm_scalar4 | 3130.08|1.1609|asm|Ryg| 46 | |25:histo_asm_scalar8 | 3353.08|1.0636|asm|Ryg| 47 | |26:histo_asm_scalar8_var | 3704.88|0.9856|asm|Ryg| 48 | |27:histo_asm_scalar8_var2 | 4085.48|0.8913|asm|Ryg| 49 | |28:histo_asm_scalar8_var3 | 4132.54|0.8870|asm|Ryg| 50 | |29:histo_asm_scalar8_var4 | 4083.92|0.8970|asm|Ryg| 51 | |30:histo_asm_scalar8_var5 | 4002.21|0.9025|asm|Ryg| 52 | |31:histo_asm_sse4 | 3153.01|1.1445|asm|Ryg| 53 | |32:memcpy |13724.29|0.2698|C| 54 | 55 | Skewed distribution - enwik9.bwt Text file, size=1.000.0000.000 56 | | Function | MB/s |Cycle/Byte|Language | 57 | |----------------------------|-------:|---------:|----------| 58 | | 1:hist_1_8 naiv 8 bits| 1170.89|3.0642|C|TurboHist| 59 | | 2:hist_4_8 4 bins/ 8 bits| 2707.74|1.3321|C|TurboHist| 60 | | 3:hist_8_8 8 bins/ 8 bits| 2804.08|1.3208|C|TurboHist| 61 | | 4:hist_4_32 4 bins/32 bits| 3118.54|1.1402|C|TurboHist| 62 | | 5:hist_8_32 8 bins/32 bits| 3780.16|0.9714|C|TurboHist| 63 | | 6:hist_4_64 4 bins/64 bits| 3646.25|0.9980|C|TurboHist| 64 | | 7:hist_8_64 
8 bins/64 bits| 3941.96|0.9282|C|TurboHist| 65 | | 8:histr_4_64 4/64+run | 5061.62|0.7270|C|TurboHist| 66 | | 9:histr_8_64 8/64+run |**5135.29**|**0.7229**|C|TurboHist| 67 | |10:hist_4_128 4 bins/sse4.1 | 3535.36|1.0365|C|TurboHist| 68 | |11:hist_8_128 8 bins/sse4.1 | 3654.41|0.9791|C|TurboHist| 69 | |12:hist_4_256 4 bins/avx2 | 3329.87|1.1022|C|TurboHist| 70 | |13:hist_8_256 8 bins/avx2 | 3540.36|1.0343|C|TurboHist| 71 | |15:hist_8_64asm inline asm | 4047.74|0.9013|inline asm|TurboHist| 72 | |18:count2x64 inline asm | 3969.92|0.9262|inline asm|[Countbench](https://github.com/nkurz/countbench)| 73 | |20:histo_ref | 1182.61|3.0718|C|Ryg| 74 | |21:histo_cpp_1x | 1213.42|2.9748|C|Ryg| 75 | |22:histo_cpp_2x | 2115.60|1.7373|C|Ryg| 76 | |23:histo_cpp_4x | 1801.97|2.0024|C|Ryg| 77 | |24:histo_asm_scalar4 | 3092.87|1.1561|asm|Ryg| 78 | |25:histo_asm_scalar8 | 3203.95|1.1139|asm|Ryg| 79 | |26:histo_asm_scalar8_var | 3460.45|1.0422|asm|Ryg| 80 | |27:histo_asm_scalar8_var2 | 3659.61|0.9878|asm|Ryg| 81 | |28:histo_asm_scalar8_var3 | 3769.96|0.9569|asm|Ryg| 82 | |29:histo_asm_scalar8_var4 | 3996.75|0.8905|asm|Ryg| 83 | |30:histo_asm_scalar8_var5 | 4642.10|0.7719|asm|Ryg| 84 | |31:histo_asm_sse4 | 3091.36|1.1670|asm|Ryg| 85 | |32:memcpy |15594.28|0.2412|C| 86 | 87 | All zeros: size=1.000.0000.000 88 | | Function | MB/s |Cycle/Byte|Language | 89 | |----------------------------|-------:|---------:|----------| 90 | | 1:hist_1_8 naiv 8 bits| 877.27|4.0805|C|TurboHist| 91 | | 2:hist_4_8 4 bins/ 8 bits| 2650.84|1.3485|C|TurboHist| 92 | | 3:hist_8_8 8 bins/ 8 bits| 2743.40|1.2994|C|TurboHist| 93 | | 4:hist_4_32 4 bins/32 bits| 2978.83|1.2006|C|TurboHist| 94 | | 5:hist_8_32 8 bins/32 bits| 3775.45|0.9555|C|TurboHist| 95 | | 6:hist_4_64 4 bins/64 bits| 3411.11|1.0530|C|TurboHist| 96 | | 7:hist_8_64 8 bins/64 bits| 3928.09|0.9342|C|TurboHist| 97 | | 8:histr_4_64 4/64+run |18998.87|0.1868|C|TurboHist| 98 | | 9:histr_8_64 8/64+run |**19629.28**|**0.1869**|C|TurboHist| 99 | |10:hist_4_128 4 bins/sse4.1 | 3365.40|1.0717|C|TurboHist| 100 | |11:hist_8_128 8 bins/sse4.1 | 3632.61|0.9950|C|TurboHist| 101 | |12:hist_4_256 4 bins/avx2 | 3112.15|1.1576|C|TurboHist| 102 | |13:hist_8_256 8 bins/avx2 | 3497.08|1.0205|C|TurboHist| 103 | |15:hist_8_64asm inline asm |4089.97|0.8817|inline asm|TurboHist| 104 | |18:count2x64 inline asm | 3881.98|0.9158|inline asm|Countbench| 105 | |20:histo_ref | 882.93|4.1072|C|Ryg| 106 | |21:histo_cpp_1x | 873.20|4.1069|C|Ryg| 107 | |22:histo_cpp_2x | 1720.19|2.0961|C|Ryg| 108 | |23:histo_cpp_4x | 1866.99|2.0817|C|Ryg| 109 | |24:histo_asm_scalar4 | 2995.84|1.1942|asm|Ryg| 110 | |25:histo_asm_scalar8 | 3107.30|1.1618|asm|Ryg| 111 | |26:histo_asm_scalar8_var | 3288.67|1.1143|asm|Ryg| 112 | |27:histo_asm_scalar8_var2 | 3290.92|1.0957|asm|Ryg| 113 | |28:histo_asm_scalar8_var3 | 3707.41|0.9763|asm|Ryg| 114 | |29:histo_asm_scalar8_var4 | 3988.01|0.9019|asm|Ryg| 115 | |30:histo_asm_scalar8_var5 |14076.09|0.2564|asm|Ryg| 116 | |31:histo_asm_sse4 | 3020.32|1.1975|asm|Ryg| 117 | |32:memcpy |14057.53|0.2636|C| 118 | 119 | (**bold** = pareto) MB=1.000.000 120 | - [Ryg](https://gist.github.com/rygorous/a86a5cf348922cdea357c928e32fc7e0)
121 | - [Countbench](https://github.com/nkurz/countbench) 122 | 123 | ## Compile: 124 | 125 | 126 | make 127 | or 128 | make AVX2=1 129 | 130 | ## Usage: 131 | 132 | 133 | turbohist [-e#] file [-I#] [-z] 134 | options: 135 | -e# # = function numbers separated by ',' 136 | -I# # = number of iterations 137 | set to -I15 for accurate timings 138 | -z set read buffer to zeros 139 | 140 | ### Examples: 141 | 142 | ./turbohist file 143 | ./turbohist -e1,7,9 file 144 | 145 | ### Environment: 146 | ###### OS/Compiler (32 + 64 bits): 147 | - Windows: MinGW-w64 makefile 148 | - Linux amd/intel: GNU GCC (>=4.6) 149 | - Linux amd/intel: Clang (>=3.2) 150 | - Linux arm: aarch64 ARMv8: gcc (>=6.3) 151 | - macOS: Xcode (>=9) + Apple M1 152 | - PowerPC ppc64le: gcc (>=8.0) 153 | 154 | Last update: 01 JAN 2022 155 | -------------------------------------------------------------------------------- /turbohist.c: -------------------------------------------------------------------------------- 1 | /** 2 | Copyright (c) 2013-2022, Powturbo 3 | - homepage : https://sites.google.com/site/powturbo/ 4 | - github : https://github.com/powturbo 5 | - twitter : https://twitter.com/powturbo 6 | - email : powturbo [_AT_] gmail [_DOT_] com 7 | All rights reserved. 8 | 9 | Redistribution and use in source and binary forms, with or without 10 | modification, are permitted provided that the following conditions are 11 | met: 12 | 13 | 1. Redistributions of source code must retain the above copyright 14 | notice, this list of conditions and the following disclaimer. 15 | 16 | 2. Redistributions in binary form must reproduce the above copyright 17 | notice, this list of conditions and the following disclaimer in the 18 | documentation and/or other materials provided with the distribution. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS 21 | IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 22 | TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A 23 | PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 24 | HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 25 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED 26 | TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 27 | PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 28 | LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 29 | NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 30 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
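 Note (added for illustration; not part of the original sources): in the function names benchmarked below, "4 bins"/"8 bins" refers to keeping several independent count tables so that runs of identical bytes do not serialize on a single counter (store-to-load dependency), and "32/64 bits" refers to how many input bits are read per load. The actual implementations live in turbohist_.c (not included in this listing); a minimal sketch of the multi-table idea, under those assumptions only, looks like:

   static void hist_4_sketch(const unsigned char *in, unsigned n, unsigned cnt[256]) {
     /* four private count tables to break the dependency chain on repeated bytes */
     unsigned c0[256] = {0}, c1[256] = {0}, c2[256] = {0}, c3[256] = {0}, i;
     for(i = 0; i + 4 <= n; i += 4) { c0[in[i]]++; c1[in[i+1]]++; c2[in[i+2]]++; c3[in[i+3]]++; }
     for(; i < n; i++) c0[in[i]]++;                       /* tail bytes */
     for(i = 0; i < 256; i++) cnt[i] = c0[i] + c1[i] + c2[i] + c3[i]; /* merge tables */
   }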
31 | **/ 32 | // Turbo histogram benchmark 33 | #include 34 | #include 35 | #include 36 | #ifdef __APPLE__ 37 | #include 38 | #else 39 | #include 40 | #endif 41 | #ifdef _MSC_VER 42 | #include "vs/getopt.h" 43 | #else 44 | #include 45 | #include 46 | #endif 47 | #include "conf.h" 48 | #include "time_.h" 49 | 50 | #include "turbohist_.c" 51 | #ifdef _RYG 52 | #include "histotest.cpp" 53 | #endif 54 | 55 | NOINLINE void libmemcpy(unsigned char *dst, unsigned char *src, int len) { 56 | void *(*memcpy_ptr)(void *, const void *, size_t) = memcpy; 57 | if (time(NULL) == 1) 58 | memcpy_ptr = NULL; 59 | memcpy_ptr(dst, src, len); 60 | } 61 | 62 | void usage(char *pgm) { 63 | fprintf(stderr, "\nTurboHist Copyright (c) 2013-2022 Powturbo %s\n", __DATE__); 64 | fprintf(stderr, "Usage: %s [options] [file]\n", pgm); 65 | fprintf(stderr, "Benchmark:\n"); 66 | fprintf(stderr, " -I# # = Number of runs (default=3)\n"); 67 | fprintf(stderr, " -z set the read buffer to zeros\n"); 68 | fprintf(stderr, "Ex. ./turbohist file -I15\n"); 69 | fprintf(stderr, " ./turbohist file -I15 -z\n"); 70 | fprintf(stderr, " ./turbohist file -e1,4,8,15 -I15\n"); 71 | exit(0); 72 | } 73 | 74 | int check(unsigned *cnt, unsigned n, unsigned *scnt) { unsigned i; for(i=0;i<256;i++) if(cnt[i]!=scnt[i]) { printf("Error sum at %d ", i); return 0; } printf(" %s", TM_MBS); } 75 | 76 | int bench(unsigned char *in, unsigned n, unsigned *cnt, unsigned id, unsigned *scnt) { 77 | switch(id) { 78 | case 1: TMBENCH(" 1:hist_1_8 naiv 8 bits", hist_1_8( in, n, cnt),n); break; 79 | case 2: TMBENCH(" 2:hist_4_8 4 bins/ 8 bits", hist_4_8( in, n, cnt),n); break; 80 | case 3: TMBENCH(" 3:hist_8_8 8 bins/ 8 bits", hist_8_8( in, n, cnt),n); break; 81 | case 4: TMBENCH(" 4:hist_4_32 4 bins/32 bits", hist_4_32( in, n, cnt),n); break; 82 | case 5: TMBENCH(" 5:hist_8_32 8 bins/32 bits", hist_8_32( in, n, cnt),n); break; 83 | case 6: TMBENCH(" 6:hist_4_64 4 bins/64 bits", hist_4_64( in, n, cnt),n); break; 84 | case 7: TMBENCH(" 7:hist_8_64 8 bins/64 bits", hist_8_64( in, n, cnt),n); break; 85 | case 8: TMBENCH(" 8:histr_4_64 4/64+run ", histr_4_64( in, n, cnt),n); break; 86 | case 9: TMBENCH(" 9:histr_8_64 8/64+run ", histr_8_64( in, n, cnt),n); break; 87 | #ifdef __ARM_NEON 88 | case 10: TMBENCH("10:hist_4_128 4 bins/neon ", hist_4_128( in, n, cnt),n); break; 89 | case 11: TMBENCH("11:hist_8_128 8 bins/neon ", hist_8_128( in, n, cnt),n); break; 90 | #else 91 | case 10: TMBENCH("10:hist_4_128 4 bins/sse4.1 ", hist_4_128( in, n, cnt),n); break; 92 | case 11: TMBENCH("11:hist_8_128 8 bins/sse4.1 ", hist_8_128( in, n, cnt),n); break; 93 | #endif 94 | #ifdef __AVX2__ 95 | case 12: TMBENCH("12:hist_4_256 4 bins/avx2 ", hist_4_256( in, n, cnt),n); break; 96 | case 13: TMBENCH("13:hist_8_256 8 bins/avx2 ", hist_8_256( in, n, cnt),n); break; 97 | #endif 98 | #ifdef __x86_64 99 | case 15: TMBENCH("15:hist_8_64asm inline asm ", hist_8_64a( in, n, cnt),n); break; 100 | #endif 101 | #ifdef _COUNTBENCH 102 | case 18: TMBENCH("18:count2x64 inline asm ", count2x64( in, n, cnt),n); break; 103 | // case 19: TMBENCH("19:count2x64c ", count2x64c( in, n, cnt),n); break; 104 | #endif 105 | #ifdef _RYG 106 | case 20: TMBENCH("20:histo_ref ", histo_ref( cnt, in, n),n); break; 107 | case 21: TMBENCH("21:histo_cpp_1x ", histo_cpp_1x( cnt, in, n),n); break; 108 | case 22: TMBENCH("22:histo_cpp_2x ", histo_cpp_2x( cnt, in, n),n); break; 109 | case 23: TMBENCH("23:histo_cpp_4x ", histo_cpp_4x( cnt, in, n),n); break; 110 | case 24: TMBENCH("24:histo_asm_scalar4 ", histo_asm_scalar4( 
cnt, in, n),n); break; 111 | case 25: TMBENCH("25:histo_asm_scalar8 ", histo_asm_scalar8( cnt, in, n),n); break; 112 | case 26: TMBENCH("26:histo_asm_scalar8_var ", histo_asm_scalar8_var( cnt, in, n),n); break; 113 | case 27: TMBENCH("27:histo_asm_scalar8_var2 ", histo_asm_scalar8_var2(cnt, in, n),n); break; 114 | case 28: TMBENCH("28:histo_asm_scalar8_var3 ", histo_asm_scalar8_var3(cnt, in, n),n); break; 115 | case 29: TMBENCH("29:histo_asm_scalar8_var4 ", histo_asm_scalar8_var4(cnt, in, n),n); break; 116 | case 30: TMBENCH("30:histo_asm_scalar8_var5 ", histo_asm_scalar8_var5(cnt, in, n),n); break; 117 | case 31: TMBENCH("31:histo_asm_sse4 ", histo_asm_sse4( cnt, in, n),n); break; 118 | #ifdef __AVX2__ 119 | case 37: TMBENCH("37:histo_asm_avx256_8x_1 ", histo_asm_avx256_8x_1( cnt, in, n),n); break; 120 | case 38: TMBENCH("38:histo_asm_avx256_8x_2 ", histo_asm_avx256_8x_2( cnt, in, n),n); break; 121 | case 39: TMBENCH("39:histo_asm_avx256_8x_3 ", histo_asm_avx256_8x_3( cnt, in, n),n); break; 122 | #endif 123 | #endif 124 | case 32: { unsigned char *cpy = malloc(n); if(cpy) { TMBENCH("32:memcpy ", libmemcpy(cpy, in, n),n); free(cpy); printf(" %s", TM_MBS); } } return 0; break; 125 | #define ID_LAST 32 126 | default: return 0; 127 | } 128 | check(cnt,n,scnt); 129 | return 1; 130 | } 131 | 132 | int main(int argc, char *argv[]) { 133 | unsigned char *finame = argv[1], *scmd = NULL, *in; 134 | unsigned n, fno, zero=0, scnt[256], cnt[256]; 135 | 136 | int c, digit_optind = 0; 137 | for(;;) { 138 | int this_option_optind = optind ? optind : 1; 139 | int option_index = 0; 140 | static struct option long_options[] = { 141 | { "help", 0, 0, 'h'}, 142 | { 0, 0, 0, 0} 143 | }; 144 | if((c = getopt_long(argc, argv, "e:hI:z", long_options, &option_index)) == -1) break; 145 | switch(c) { 146 | case 0: 147 | printf("Option %s", long_options[option_index].name); 148 | if(optarg) printf (" with arg %s", optarg); printf ("\n"); 149 | break; 150 | case 'I': if((tm_Rep = atoi(optarg))<=0) tm_rep =tm_Rep =1; break; 151 | case 'z': zero++; break; 152 | case 'e': scmd = optarg; break; 153 | case 'h': 154 | default: 155 | usage(argv[0]); 156 | exit(0); 157 | } 158 | } 159 | 160 | printf("\nTurboHist Copyright (c) 2013-2022 Powturbo %s\n", __DATE__); 161 | char _scmd[33]; 162 | sprintf(_scmd, "1-%d", ID_LAST); 163 | 164 | for(fno = optind; fno < argc; fno++) { 165 | finame = argv[fno]; 166 | 167 | FILE *fi = fopen(finame, "rb"); 168 | if(!fi) perror(finame), exit(1); // printf("'%s'\n", finame); 169 | 170 | fseek(fi, 0, SEEK_END); 171 | long long flen = ftell(fi); 172 | fseek(fi, 0, SEEK_SET); 173 | 174 | if(flen > GB) flen = GB; 175 | n = flen; 176 | if(!(in = (unsigned char*)malloc(n))) 177 | printf("malloc error\n"), exit(-1); 178 | n = fread(in, 1, n, fi); 179 | fclose(fi); 180 | if(n <= 0) 181 | exit(0); 182 | 183 | if(zero) memset(in, 0, n); 184 | int i; hist_1_8(in, n, scnt); // first run 185 | unsigned char *p = (scmd && (scmd[0] != '0' || scmd[1]))?scmd:_scmd; 186 | do { 187 | int id = strtoul(p, &p, 10),idx = id, i; 188 | if(id >= 0) { 189 | while(isspace(*p)) p++; if(*p == '-') { if((idx = strtoul(p+1, &p, 10)) < id) idx = id; if(idx > ID_LAST) idx = ID_LAST; } //printf("ID=%d,%d ", id, idx); 190 | for(i = id; i <= idx; i++) { 191 | if(bench(in, n, cnt, i, scnt)) printf("\t%s\n", finame); 192 | } 193 | } 194 | } while(*p++); 195 | printf("\n"); 196 | free(in); 197 | } 198 | } 199 | -------------------------------------------------------------------------------- /time_.h: 
-------------------------------------------------------------------------------- 1 | /** 2 | Copyright (C) powturbo 2013-2022 3 | GPL v2 License 4 | 5 | This program is free software; you can redistribute it and/or modify 6 | it under the terms of the GNU General Public License as published by 7 | the Free Software Foundation; either version 2 of the License, or 8 | (at your option) any later version. 9 | 10 | This program is distributed in the hope that it will be useful, 11 | but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | GNU General Public License for more details. 14 | 15 | You should have received a copy of the GNU General Public License along 16 | with this program; if not, write to the Free Software Foundation, Inc., 17 | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 18 | 19 | - homepage : https://sites.google.com/site/powturbo/ 20 | - github : https://github.com/powturbo 21 | - twitter : https://twitter.com/powturbo 22 | - email : powturbo [_AT_] gmail [_DOT_] com 23 | **/ 24 | // time_.h : parameter free high precision time/benchmark functions 25 | #include 26 | #include 27 | #ifdef _WIN32 28 | #include 29 | #ifndef sleep 30 | #define sleep(n) Sleep((n) * 1000) 31 | #endif 32 | typedef unsigned __int64 uint64_t; 33 | 34 | #else 35 | #include 36 | #include 37 | #define Sleep(ms) usleep((ms) * 1000) 38 | #endif 39 | 40 | #if defined (__i386__) || defined( __x86_64__ ) // ------------------ rdtsc -------------------------- 41 | #ifdef _MSC_VER 42 | #include // __rdtsc 43 | #else 44 | #include 45 | #endif 46 | 47 | #ifdef __corei7__ 48 | #define RDTSC_INI(_c_) do { unsigned _cl, _ch; \ 49 | __asm volatile ("couid\n\t" \ 50 | "rdtsc\n\t" \ 51 | "mov %%edx, %0\n" \ 52 | "mov %%eax, %1\n": "=r" (_ch), "=r" (_cl):: \ 53 | "%rax", "%rbx", "%rcx", "%rdx"); \ 54 | _c_ = (uint64_t)_ch << 32 | _cl; \ 55 | } while(0) 56 | 57 | #define RDTSC(_c_) do { unsigned _cl, _ch; \ 58 | __asm volatile("rdtscp\n" \ 59 | "mov %%edx, %0\n" \ 60 | "mov %%eax, %1\n" \ 61 | "cpuid\n\t": "=r" (_ch), "=r" (_cl):: "%rax",\ 62 | "%rbx", "%rcx", "%rdx");\ 63 | _c_ = (uint64_t)_ch << 32 | _cl;\ 64 | } while(0) 65 | #else 66 | #define RDTSC(_c_) do { unsigned _cl, _ch;\ 67 | /* __asm volatile ("cpuid \n"\ 68 | "rdtsc"\ 69 | : "=a"(_cl), "=d"(_ch)\ 70 | : "a"(0)\ 71 | : "%ebx", "%ecx");\ 72 | _c_ = (uint64_t)_ch << 32 | _cl;\ 73 | } while(0)*/ 74 | #define RDTSC(_c_) do { unsigned _cl, _ch;\ 75 | __asm volatile("rdtsc" : "=a"(_cl), "=d"(_ch) );\ 76 | _c_ = (uint64_t)_ch << 32 | _cl;\ 77 | } while(0) 78 | #endif 79 | 80 | #define RDTSC_INI(_c_) RDTSC(_c_) 81 | #else // ------------------ time -------------------------- 82 | #define RDTSC_INI(_c_) 83 | #define RDTSC(_c_) 84 | #endif 85 | 86 | #ifndef TM_F 87 | #define TM_F 1.0 // TM_F=4 -> MI/s 88 | #endif 89 | 90 | #ifdef _RDTSC //---------------------- rdtsc -------------------------------- 91 | #define TM_M (CLOCKS_PER_SEC*1000000ull) 92 | #define TM_PRE 4 93 | #define TM_MBS "cycle/byte" 94 | static double TMBS(unsigned l, double t) { return (double)t/(double)l; } 95 | 96 | typedef uint64_t tm_t; 97 | static tm_t tmtime() { uint64_t c; RDTSC(c); return c; } 98 | static tm_t tminit() { uint64_t c; __asm volatile("" ::: "memory"); RDTSC_INI(c); return c; } 99 | static double tmdiff(tm_t start, tm_t stop) { return (double)(stop - start); } 100 | static int tmiszero(tm_t t) { return !t; } 101 | #else //---------------------- time ----------------------------------- 102 | 
#define TM_M 1 103 | #define TM_PRE 2 104 | #define TM_MBS "MB/s" 105 | static double TMBS(unsigned l, double t) { return (l/t)/1000000.0; } 106 | 107 | #ifdef _WIN32 //-------- windows 108 | static LARGE_INTEGER tps; 109 | 110 | typedef unsigned __int64 tm_t; 111 | static tm_t tmtime() { LARGE_INTEGER tm; tm_t t; QueryPerformanceCounter(&tm); return tm.QuadPart; } 112 | static tm_t tminit() { tm_t t0,ts; QueryPerformanceFrequency(&tps); t0 = tmtime(); while((ts = tmtime())==t0) {}; return ts; } 113 | static double tmdiff(tm_t start, tm_t stop) { return (double)(stop - start)/tps.QuadPart; } 114 | static int tmiszero(tm_t t) { return !t; } 115 | #else // Linux & compatible / macOS 116 | #ifdef __APPLE__ 117 | #include 118 | #ifndef MAC_OS_X_VERSION_10_12 119 | #define MAC_OS_X_VERSION_10_12 101200 120 | #endif 121 | #define CIVETWEB_APPLE_HAVE_CLOCK_GETTIME (defined(__APPLE__) && defined(MAC_OS_X_VERSION_MIN_REQUIRED) && MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_12) 122 | #if !(CIVETWEB_APPLE_HAVE_CLOCK_GETTIME) 123 | #include 124 | #define CLOCK_REALTIME 0 125 | #define CLOCK_MONOTONIC 0 126 | int clock_gettime(int /*clk_id*/, struct timespec* t) { 127 | struct timeval now; 128 | int rv = gettimeofday(&now, NULL); 129 | if (rv) return rv; 130 | t->tv_sec = now.tv_sec; 131 | t->tv_nsec = now.tv_usec * 1000; 132 | return 0; 133 | } 134 | #endif 135 | #endif 136 | 137 | typedef struct timespec tm_t; 138 | static tm_t tmtime() { struct timespec tm; clock_gettime(CLOCK_MONOTONIC, &tm); return tm; } 139 | static double tmdiff(tm_t start, tm_t stop) { return (stop.tv_sec - start.tv_sec) + (double)(stop.tv_nsec - start.tv_nsec)/1e9f; } 140 | static tm_t tminit() { tm_t t0 = tmtime(),t; while(!tmdiff(t = tmtime(),t0)) {}; return t; } 141 | static int tmiszero(tm_t t) { return !(t.tv_sec|t.tv_nsec); } 142 | #endif 143 | #endif 144 | 145 | //---------------------------------------- bench ---------------------------------------------------------------------- 146 | // for each run, the function call is repeated until exceeding tm_tx seconds. 147 | // A run duration is always tm_tx seconds 148 | // The number of runs can be set with the program options -I and -J (specify -I15 -J15 for more precision) 149 | 150 | // sleep after every 8 runs to avoid cpu throttling. 
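// Illustrative usage note (added; not part of the original header): a caller wraps the
// function under test in TMBENCH together with the processed length, so TMBS() can report
// either MB/s (default wall-clock build) or cycles/byte (when built with -D_RDTSC).
// turbohist.c, for example, benchmarks one histogram variant with:
//   TMBENCH(" 7:hist_8_64 8 bins/64 bits", hist_8_64(in, n, cnt), n);
//   printf(" %s", TM_MBS);
// The -I option sets tm_Rep (number of runs); the minimum time over all runs is what gets reported.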
151 | #define TMSLEEP do { tm_T = tmtime(); if(tmiszero(tm_0)) tm_0 = tm_T; else if(tmdiff(tm_0, tm_T) > tm_TX) { if(tm_verbose) { printf("S \b\b");fflush(stdout); } sleep(tm_slp); tm_0=tmtime();} } while(0) 152 | 153 | // benchmark loop 154 | #define TMBEG(_tm_Reps_) { unsigned _tm_r,_tm_c = 0,_tm_R,_tm_Rx = _tm_Reps_,_tm_Rn = _tm_Reps_; double _tm_t;\ 155 | for(tm_rm = tm_rep, tm_tm = DBL_MAX, _tm_R = 0; _tm_R < _tm_Rn; _tm_R++) { tm_t _tm_t0 = tminit(); /*for each run*/\ 156 | for(_tm_r = 0;_tm_r < tm_rm;) { /*repeat tm_rm times */ 157 | 158 | #define TMEND(_len_) \ 159 | _tm_r++; if(tm_tm == DBL_MAX && (_tm_t = tmdiff(_tm_t0, tmtime())) > tm_tx) break;\ 160 | }\ 161 | /*1st run: break the loop after tm_tx=1 sec, calculate a new repeats 'tm_rm' to avoid calling time() after each function call*/\ 162 | /*other runs: break the loop only after 'tm_rm' repeats */ \ 163 | _tm_t = tmdiff(_tm_t0, tmtime());\ 164 | /*set min time, recalculte repeats tm_rm based on tm_tx, recalculte number of runs based on tm_TX*/\ 165 | if(_tm_t < tm_tm) { if(tm_tm == DBL_MAX) { tm_rm = _tm_r; _tm_Rn = tm_TX/_tm_t; _tm_Rn = _tm_Rn<_tm_Rx?_tm_Rn:_tm_Rx; /*printf("repeats=%u,%u,%.4f ", _tm_Rn, _tm_Rx, _tm_t);*/ } \ 166 | tm_tm = _tm_t; _tm_c++;\ 167 | } else if(_tm_t > tm_tm*1.15) TMSLEEP;/*force sleep at 15% divergence*/\ 168 | if(tm_verbose) { printf("%8.*f %2d_%.2d\b\b\b\b\b\b\b\b\b\b\b\b\b\b",TM_PRE, TMBS(_len_, tm_tm/tm_rm),_tm_R+1,_tm_c),fflush(stdout); }\ 169 | if((_tm_R & 7)==7) sleep(tm_slp); /*pause 20 secs after each 8 runs to avoid cpu trottling*/\ 170 | }\ 171 | } 172 | 173 | static unsigned tm_rep = 1u<<30, tm_Rep = 3, tm_Rep2 = 3, tm_rm, tm_RepMin = 1, tm_slp = 20, tm_verbose = 2; 174 | static tm_t tm_0, tm_T; 175 | static double tm_tm, tm_tx = 1.0*TM_M, tm_TX = 60.0*TM_M; 176 | 177 | static void tm_init(int _tm_Rep, int _tm_verbose) { tm_verbose = _tm_verbose; if(_tm_Rep) tm_Rep = _tm_Rep; } 178 | 179 | #define TMBENCH(_name_, _func_, _len_) do { if(tm_verbose>1) printf("%s ", _name_?_name_:#_func_);\ 180 | TMBEG(tm_Rep) _func_; TMEND(_len_); \ 181 | double dm = tm_tm, dr = tm_rm; if(tm_verbose) printf("%8.*f \b\b\b\b\b", TM_PRE, TMBS(_len_, dm/dr) );\ 182 | } while(0) 183 | 184 | // second TMBENCH. 
Example: use TMBENCH for encoding and TMBENCH2 for decoding 185 | #define TMBENCH2(_name_, _func_, _len_) do { \ 186 | TMBEG(tm_Rep2) _func_; TMEND(_len_);\ 187 | double dm = tm_tm, dr = tm_rm; if(tm_verbose) printf("%8.*f \b\b\b\b\b", TM_PRE,TMBS(_len_, dm/dr) );\ 188 | if(tm_verbose>1) printf("%s ", _name_?_name_:#_func_);\ 189 | } while(0) 190 | 191 | // Check 192 | #define TMBENCHT(_name_,_func_, _len_, _res_) do { \ 193 | TMBEG(tm_Rep) \ 194 | if(_func_ != _res_) { printf("ERROR: %lld != %lld", (long long)_func_, (long long)_res_ ); exit(0); };\ 195 | TMEND(_len_);\ 196 | if(tm_verbose) printf("%8.*f \b\b\b\b\b", TM_PRE, TMBS(_len_,(double)tm_tm/(double)tm_rm) );\ 197 | if(tm_verbose) printf("%s ", _name_?_name_:#_func_ );\ 198 | } while(0) 199 | 200 | static void pr(unsigned l, unsigned n) { 201 | double r = (double)l*100.0/n; 202 | if(r>0.1) printf("%10u %6.2f%% ", l, r); 203 | else if(r>0.01) printf("%10u %7.3f%% ", l, r); 204 | else printf("%10u %8.4f%% ", l, r); fflush(stdout); 205 | } 206 | 207 | //---------------------------------------------------------------------------------------------------------------------------------- 208 | #define Kb (1u<<10) 209 | #define Mb (1u<<20) 210 | #define Gb (1u<<30) 211 | #define KB 1000 212 | #define MB 1000000 213 | #define GB 1000000000 214 | 215 | static unsigned argtoi(char *s, unsigned def) { 216 | char *p; 217 | unsigned n = strtol(s, &p, 10),f = 1; 218 | switch(*p) { 219 | case 'K': f = KB; break; 220 | case 'M': f = MB; break; 221 | case 'G': f = GB; break; 222 | case 'k': f = Kb; break; 223 | case 'm': f = Mb; break; 224 | case 'g': f = Gb; break; 225 | case 'B': return n; break; 226 | case 'b': def = 0; 227 | default: if(!def) return n>=32?0xffffffffu:(1u << n); f = def; 228 | } 229 | return n*f; 230 | } 231 | static uint64_t argtol(char *s) { 232 | char *p; 233 | uint64_t n = strtol(s, &p, 10),f=1; 234 | switch(*p) { 235 | case 'K': f = KB; break; 236 | case 'M': f = MB; break; 237 | case 'G': f = GB; break; 238 | case 'k': f = Kb; break; 239 | case 'm': f = Mb; break; 240 | case 'g': f = Gb; break; 241 | case 'B': return n; break; 242 | case 'b': return 1u << n; 243 | default: f = MB; 244 | } 245 | return n*f; 246 | } 247 | 248 | static uint64_t argtot(char *s) { 249 | char *p; 250 | uint64_t n = strtol(s, &p, 10),f=1; 251 | switch(*p) { 252 | case 'h': f = 3600000; break; 253 | case 'm': f = 60000; break; 254 | case 's': f = 1000; break; 255 | case 'M': f = 1; break; 256 | default: f = 1000; 257 | } 258 | return n*f; 259 | } 260 | 261 | static void memrcpy(unsigned char *out, unsigned char *in, unsigned n) { int i; for(i = 0; i < n; i++) out[i] = ~in[i]; } 262 | 263 | -------------------------------------------------------------------------------- /conf.h: -------------------------------------------------------------------------------- 1 | /** 2 | Copyright (C) powturbo 2013-2019 3 | GPL v2 License 4 | 5 | This program is free software; you can redistribute it and/or modify 6 | it under the terms of the GNU General Public License as published by 7 | the Free Software Foundation; either version 2 of the License, or 8 | (at your option) any later version. 9 | 10 | This program is distributed in the hope that it will be useful, 11 | but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | GNU General Public License for more details. 
14 | 15 | You should have received a copy of the GNU General Public License along 16 | with this program; if not, write to the Free Software Foundation, Inc., 17 | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 18 | 19 | - homepage : https://sites.google.com/site/powturbo/ 20 | - github : https://github.com/powturbo 21 | - twitter : https://twitter.com/powturbo 22 | - email : powturbo [_AT_] gmail [_DOT_] com 23 | **/ 24 | 25 | // conf.h - config & common 26 | #ifndef CONF_H 27 | #define CONF_H 28 | //------------------------- Compiler ------------------------------------------ 29 | #if defined(__GNUC__) 30 | #include 31 | #define ALIGNED(t,v,n) t v __attribute__ ((aligned (n))) 32 | #define ALWAYS_INLINE inline __attribute__((always_inline)) 33 | #define NOINLINE __attribute__((noinline)) 34 | #define _PACKED __attribute__ ((packed)) 35 | #define likely(x) __builtin_expect((x),1) 36 | #define unlikely(x) __builtin_expect((x),0) 37 | 38 | #define popcnt32(_x_) __builtin_popcount(_x_) 39 | #define popcnt64(_x_) __builtin_popcountll(_x_) 40 | 41 | #if defined(__i386__) || defined(__x86_64__) 42 | //x,__bsr32: 1:0,2:1,3:1,4:2,5:2,6:2,7:2,8:3,9:3,10:3,11:3,12:3,13:3,14:3,15:3,16:4,17:4,18:4,19:4,20:4,21:4,22:4,23:4,24:4,25:4,26:4,27:4,28:4,29:4,30:4,31:4,32:5 43 | // x,bsr32: 0:0,1:1,2:2,3:2,4:3,5:3,6:3,7:3,8:4,9:4,10:4,11:4,12:4,13:4,14:4,15:4,16:5,17:5,18:5,19:5,20:5,21:5,22:5,23:5,24:5,25:5,26:5,27:5,28:5,29:5,30:5,31:5,32:6, 44 | static inline int __bsr32( int x) { asm("bsr %1,%0" : "=r" (x) : "rm" (x) ); return x; } 45 | static inline int bsr32( int x) { int b = -1; asm("bsrl %1,%0" : "+r" (b) : "rm" (x) ); return b + 1; } 46 | static inline int bsr64(uint64_t x ) { return x?64 - __builtin_clzll(x):0; } 47 | static inline int __bsr64(uint64_t x ) { return 63 - __builtin_clzll(x); } 48 | 49 | static inline unsigned rol32(unsigned x, int s) { asm ("roll %%cl,%0" :"=r" (x) :"0" (x),"c" (s)); return x; } 50 | static inline unsigned ror32(unsigned x, int s) { asm ("rorl %%cl,%0" :"=r" (x) :"0" (x),"c" (s)); return x; } 51 | static inline uint64_t rol64(uint64_t x, int s) { asm ("rolq %%cl,%0" :"=r" (x) :"0" (x),"c" (s)); return x; } 52 | static inline uint64_t ror64(uint64_t x, int s) { asm ("rorq %%cl,%0" :"=r" (x) :"0" (x),"c" (s)); return x; } 53 | #else 54 | static inline int __bsr32(unsigned x ) { return 31 - __builtin_clz( x); } 55 | static inline int bsr32(int x ) { return x?32 - __builtin_clz( x):0; } 56 | static inline int bsr64(uint64_t x) { return x?64 - __builtin_clzll(x):0; } 57 | 58 | static inline unsigned rol32(unsigned x, int s) { return x << s | x >> (32 - s); } 59 | static inline unsigned ror32(unsigned x, int s) { return x >> s | x << (32 - s); } 60 | static inline unsigned rol64(unsigned x, int s) { return x << s | x >> (64 - s); } 61 | static inline unsigned ror64(unsigned x, int s) { return x >> s | x << (64 - s); } 62 | #endif 63 | 64 | #define ctz64(_x_) __builtin_ctzll(_x_) 65 | #define ctz32(_x_) __builtin_ctz(_x_) // 0:32 ctz32(1< 4 || __GNUC__ == 4 && __GNUC_MINOR__ >= 8 71 | #define bswap16(x) __builtin_bswap16(x) 72 | #else 73 | static inline unsigned short bswap16(unsigned short x) { return __builtin_bswap32(x << 16); } 74 | #endif 75 | #define bswap32(x) __builtin_bswap32(x) 76 | #define bswap64(x) __builtin_bswap64(x) 77 | 78 | #elif _MSC_VER //---------------------------------------------------- 79 | #include 80 | #include 81 | #if _MSC_VER < 1600 82 | #include "vs/stdint.h" 83 | #define __builtin_prefetch(x,a) 84 | #define inline __inline 85 | 
#else 86 | #include 87 | #define __builtin_prefetch(x,a) _mm_prefetch(x, _MM_HINT_NTA) 88 | #endif 89 | 90 | #define ALIGNED(t,v,n) __declspec(align(n)) t v 91 | #define ALWAYS_INLINE __forceinline 92 | #define NOINLINE __declspec(noinline) 93 | #define THREADLOCAL __declspec(thread) 94 | #define likely(x) (x) 95 | #define unlikely(x) (x) 96 | 97 | static inline int __bsr32(unsigned x) { unsigned long z=0; _BitScanReverse(&z, x); return z; } 98 | static inline int bsr32( unsigned x) { unsigned long z; _BitScanReverse(&z, x); return x?z+1:0; } 99 | static inline int ctz32( unsigned x) { unsigned long z; _BitScanForward(&z, x); return x?z:32; } 100 | static inline int clz32( unsigned x) { unsigned long z; _BitScanReverse(&z, x); return x?31-z:32; } 101 | #if !defined(_M_ARM64) && !defined(_M_X64) 102 | static inline unsigned char _BitScanForward64(unsigned long* ret, uint64_t x) { 103 | unsigned long x0 = (unsigned long)x, top, bottom; _BitScanForward(&top, (unsigned long)(x >> 32)); _BitScanForward(&bottom, x0); 104 | *ret = x0 ? bottom : 32 + top; return x != 0; 105 | } 106 | static unsigned char _BitScanReverse64(unsigned long* ret, uint64_t x) { 107 | unsigned long x1 = (unsigned long)(x >> 32), top, bottom; _BitScanReverse(&top, x1); _BitScanReverse(&bottom, (unsigned long)x); 108 | *ret = x1 ? top + 32 : bottom; return x != 0; 109 | } 110 | #endif 111 | static inline int bsr64(uint64_t x) { unsigned long z=0; _BitScanReverse64(&z, x); return x?z+1:0; } 112 | static inline int ctz64(uint64_t x) { unsigned long z; _BitScanForward64(&z, x); return x?z:64; } 113 | static inline int clz64(uint64_t x) { unsigned long z; _BitScanReverse64(&z, x); return x?63-z:64; } 114 | 115 | #define rol32(x,s) _lrotl(x, s) 116 | #define ror32(x,s) _lrotr(x, s) 117 | 118 | #define bswap16(x) _byteswap_ushort(x) 119 | #define bswap32(x) _byteswap_ulong(x) 120 | #define bswap64(x) _byteswap_uint64(x) 121 | 122 | #define popcnt32(x) __popcnt(x) 123 | #ifdef _WIN64 124 | #define popcnt64(x) __popcnt64(x) 125 | #else 126 | #define popcnt64(x) (popcnt32(x) + popcnt32(x>>32)) 127 | #endif 128 | 129 | #define sleep(x) Sleep(x/1000) 130 | #define fseeko _fseeki64 131 | #define ftello _ftelli64 132 | #define strcasecmp _stricmp 133 | #define strncasecmp _strnicmp 134 | #define strtoull _strtoui64 135 | static inline double round(double num) { return (num > 0.0) ? 
floor(num + 0.5) : ceil(num - 0.5); } 136 | #endif 137 | 138 | #define __bsr8(_x_) __bsr32(_x_) 139 | #define __bsr16(_x_) __bsr32(_x_) 140 | #define bsr8(_x_) bsr32(_x_) 141 | #define bsr16(_x_) bsr32(_x_) 142 | #define ctz8(_x_) ctz32(_x_) 143 | #define ctz16(_x_) ctz32(_x_) 144 | #define clz8(_x_) (clz32(_x_)-24) 145 | #define clz16(_x_) (clz32(_x_)-16) 146 | 147 | #define popcnt8(x) popcnt32(x) 148 | #define popcnt16(x) popcnt32(x) 149 | 150 | //--------------- Unaligned memory access ------------------------------------- 151 | #ifdef UA_MEMCPY 152 | #include 153 | static inline unsigned short ctou16(const void *cp) { unsigned short x; memcpy(&x, cp, sizeof(x)); return x; } 154 | static inline unsigned ctou32(const void *cp) { unsigned x; memcpy(&x, cp, sizeof(x)); return x; } 155 | static inline unsigned long long ctou64(const void *cp) { unsigned long long x; memcpy(&x, cp, sizeof(x)); return x; } 156 | static inline size_t ctousz(const void *cp) { size_t x; memcpy(&x, cp, sizeof(x)); return x; } 157 | static inline float ctof32(const void *cp) { float x; memcpy(&x, cp, sizeof(x)); return x; } 158 | static inline double ctof64(const void *cp) { double x; memcpy(&x, cp, sizeof(x)); return x; } 159 | 160 | static inline void stou16( void *cp, unsigned short x) { memcpy(cp, &x, sizeof(x)); } 161 | static inline void stou32( void *cp, unsigned x) { memcpy(cp, &x, sizeof(x)); } 162 | static inline void stou64( void *cp, unsigned long long x) { memcpy(cp, &x, sizeof(x)); } 163 | static inline void stousz( void *cp, size_t x) { memcpy(cp, &x, sizeof(x)); } 164 | static inline void stof32( void *cp, float x) { memcpy(cp, &x, sizeof(x)); } 165 | static inline void stof64( void *cp, double x) { memcpy(cp, &x, sizeof(x)); } 166 | #elif defined(__i386__) || defined(__x86_64__) || \ 167 | defined(_M_IX86) || defined(_M_AMD64) || _MSC_VER ||\ 168 | defined(__powerpc__) || defined(__s390__) ||\ 169 | defined(__ARM_FEATURE_UNALIGNED) || defined(__aarch64__) || defined(__arm__) ||\ 170 | defined(__ARM_ARCH_4__) || defined(__ARM_ARCH_4T__) || \ 171 | defined(__ARM_ARCH_5__) || defined(__ARM_ARCH_5T__) || defined(__ARM_ARCH_5TE__) || defined(__ARM_ARCH_5TEJ__) || \ 172 | defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) || defined(__ARM_ARCH_6K__) || defined(__ARM_ARCH_6T2__) || defined(__ARM_ARCH_6Z__) || defined(__ARM_ARCH_6ZK__) 173 | #define ctou16(_cp_) (*(unsigned short *)(_cp_)) 174 | #define ctou32(_cp_) (*(unsigned *)(_cp_)) 175 | #define ctof32(_cp_) (*(float *)(_cp_)) 176 | 177 | #if defined(__i386__) || defined(__x86_64__) || defined(__powerpc__) || defined(__s390__) || defined(_MSC_VER) 178 | #define ctou64(_cp_) (*(uint64_t *)(_cp_)) 179 | #define ctof64(_cp_) (*(double *)(_cp_)) 180 | #elif defined(__ARM_FEATURE_UNALIGNED) 181 | struct _PACKED longu { uint64_t l; }; 182 | struct _PACKED doubleu { double d; }; 183 | #define ctou64(_cp_) ((struct longu *)(_cp_))->l 184 | #define ctof64(_cp_) ((struct doubleu *)(_cp_))->d 185 | #endif 186 | 187 | #elif defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A__) || defined(__ARM_ARCH_7M__) || defined(__ARM_ARCH_7R__) || defined(__ARM_ARCH_7S__) 188 | struct _PACKED shortu { unsigned short s; }; 189 | struct _PACKED unsignedu { unsigned u; }; 190 | struct _PACKED longu { uint64_t l; }; 191 | struct _PACKED floatu { float f; }; 192 | struct _PACKED doubleu { double d; }; 193 | 194 | #define ctou16(_cp_) ((struct shortu *)(_cp_))->s 195 | #define ctou32(_cp_) ((struct unsignedu *)(_cp_))->u 196 | #define ctou64(_cp_) ((struct longu *)(_cp_))->l 197 | 
#define ctof32(_cp_) ((struct floatu *)(_cp_))->f 198 | #define ctof64(_cp_) ((struct doubleu *)(_cp_))->d 199 | #else 200 | #error "unknown cpu" 201 | #endif 202 | 203 | #define ctou24(_cp_) (ctou32(_cp_) & 0xffffff) 204 | #define ctou48(_cp_) (ctou64(_cp_) & 0xffffffffffffull) 205 | #define ctou8(_cp_) (*(_cp_)) 206 | //--------------------- wordsize ---------------------------------------------- 207 | #if defined(__64BIT__) || defined(_LP64) || defined(__LP64__) || defined(_WIN64) ||\ 208 | defined(__x86_64__) || defined(_M_X64) ||\ 209 | defined(__ia64) || defined(_M_IA64) ||\ 210 | defined(__aarch64__) ||\ 211 | defined(__mips64) ||\ 212 | defined(__powerpc64__) || defined(__ppc64__) || defined(__PPC64__) ||\ 213 | defined(__s390x__) 214 | #define __WORDSIZE 64 215 | #else 216 | #define __WORDSIZE 32 217 | #endif 218 | #endif 219 | 220 | //---------------------misc --------------------------------------------------- 221 | //#define bzhi63(_u_, _b_) ((_u_) & ((1ull<<(_b_))-1)) // _b_ < 64 222 | //#define bzhi63(_u_, _b_) ((_u_) & ((1u <<(_b_))-1)) // _b_ < 32 223 | #define BZHI64(_u_, _b_) (_b_ == 64?0xffffffffffffffffull:((_u_) & ((1ull<<(_b_))-1))) // b Constant 224 | #define BZHI32(_u_, _b_) (_b_ == 32? 0xffffffffu :((_u_) & ((1u <<(_b_))-1))) 225 | #define BZHI16(_u_, _b_) BZHI32(_u_, _b_) 226 | #define BZHI8( _u_, _b_) BZHI32(_u_, _b_) 227 | 228 | #ifdef __AVX2__ 229 | #if defined(_MSC_VER) && !defined(__INTEL_COMPILER) 230 | #include 231 | #else 232 | #include 233 | #endif 234 | #define bzhi32(_u_, _b_) _bzhi_u32(_u_, _b_) // b variable 235 | #define bzhi31(_u_, _b_) _bzhi_u32(_u_, _b_) 236 | 237 | #if !(defined(_M_X64) || defined(__amd64__)) && (defined(__i386__) || defined(_M_IX86)) 238 | #define bzhi64(_u_, _b_) BZHI64(_u_, _b_) 239 | #define bzhi63(_u_, _b_) ((_u_) & ((1ull<<(_b_))-1)) 240 | #else 241 | #define bzhi64(_u_, _b_) _bzhi_u64(_u_, _b_) 242 | #define bzhi63(_u_, _b_) _bzhi_u64(_u_, _b_) 243 | #endif 244 | #else 245 | #define bzhi64(_u_, _b_) BZHI64(_u_, _b_) 246 | #define bzhi63(_u_, _b_) ((_u_) & ((1ull <<(_b_))-1)) 247 | #define bzhi32(_u_, _b_) ((_u_) & ((1ull <<(_b_))-1)) 248 | #define bzhi31(_u_, _b_) ((_u_) & ((1 <<(_b_))-1)) 249 | #endif 250 | 251 | #define bzhi16(_u_, _b_) bzhi31(_u_, _b_) 252 | #define bzhi8( _u_, _b_) bzhi31(_u_, _b_) 253 | 254 | 255 | #define SIZE_ROUNDUP(_n_, _a_) (((size_t)(_n_) + (size_t)((_a_) - 1)) & ~(size_t)((_a_) - 1)) 256 | #define ALIGN_DOWN(__ptr, __a) ((void *)((uintptr_t)(__ptr) & ~(uintptr_t)((__a) - 1))) 257 | 258 | #define TEMPLATE2_(_x_, _y_) _x_##_y_ 259 | #define T2(_x_, _y_) TEMPLATE2_(_x_,_y_) 260 | 261 | #define TEMPLATE3_(_x_,_y_,_z_) _x_##_y_##_z_ 262 | #define T3(_x_,_y_,_z_) TEMPLATE3_(_x_, _y_, _z_) 263 | 264 | #define CACHE_LINE_SIZE 64 265 | #define PREFETCH_DISTANCE (CACHE_LINE_SIZE*4) 266 | 267 | #define CLAMP(_x_, _low_, _high_) (((_x_) > (_high_)) ? (_high_) : (((_x_) < (_low_)) ? (_low_) : (_x_))) 268 | 269 | //--- NDEBUG ------- 270 | #include 271 | #ifdef _MSC_VER 272 | #ifdef NDEBUG 273 | #define AS(expr, fmt, ...) 274 | #define AC(expr, fmt, ...) do { if(!(expr)) { fprintf(stderr, fmt, ##__VA_ARGS__ ); fflush(stderr); exit(-1); } } while(0) 275 | #define die(fmt, ...) do { fprintf(stderr, fmt, ##__VA_ARGS__ ); fflush(stderr); exit(-1); } while(0) 276 | #else 277 | #define AS(expr, fmt, ...) do { if(!(expr)) { fflush(stdout);fprintf(stderr, "%s:%s:%d:", __FILE__, __FUNCTION__, __LINE__); fprintf(stderr, fmt, ##__VA_ARGS__ ); fflush(stderr); exit(-1); } } while(0) 278 | #define AC(expr, fmt, ...) 
do { if(!(expr)) { fflush(stdout);fprintf(stderr, "%s:%s:%d:", __FILE__, __FUNCTION__, __LINE__); fprintf(stderr, fmt, ##__VA_ARGS__ ); fflush(stderr); exit(-1); } } while(0) 279 | #define die(fmt, ...) do { fprintf(stderr, "%s:%s:%d:", __FILE__, __FUNCTION__, __LINE__); fprintf(stderr, fmt, ##__VA_ARGS__ ); fflush(stderr); exit(-1); } while(0) 280 | #endif 281 | #else 282 | #ifdef NDEBUG 283 | #define AS(expr, fmt,args...) 284 | #define AC(expr, fmt,args...) do { if(!(expr)) { fprintf(stderr, fmt, ## args ); fflush(stderr); exit(-1); } } while(0) 285 | #define die(fmt,args...) do { fprintf(stderr, fmt, ## args ); fflush(stderr); exit(-1); } while(0) 286 | #else 287 | #define AS(expr, fmt,args...) do { if(!(expr)) { fflush(stdout);fprintf(stderr, "%s:%s:%d:", __FILE__, __FUNCTION__, __LINE__); fprintf(stderr, fmt, ## args ); fflush(stderr); exit(-1); } } while(0) 288 | #define AC(expr, fmt,args...) do { if(!(expr)) { fflush(stdout);fprintf(stderr, "%s:%s:%d:", __FILE__, __FUNCTION__, __LINE__); fprintf(stderr, fmt, ## args ); fflush(stderr); exit(-1); } } while(0) 289 | #define die(fmt,args...) do { fprintf(stderr, "%s:%s:%d:", __FILE__, __FUNCTION__, __LINE__); fprintf(stderr, fmt, ## args ); fflush(stderr); exit(-1); } while(0) 290 | #endif 291 | #endif 292 | -------------------------------------------------------------------------------- /sse_neon.h: -------------------------------------------------------------------------------- 1 | /** 2 | Copyright (C) powturbo 2013-2019 3 | GPL v2 License 4 | 5 | This program is free software; you can redistribute it and/or modify 6 | it under the terms of the GNU General Public License as published by 7 | the Free Software Foundation; either version 2 of the License, or 8 | (at your option) any later version. 9 | 10 | This program is distributed in the hope that it will be useful, 11 | but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | GNU General Public License for more details. 14 | 15 | You should have received a copy of the GNU General Public License along 16 | with this program; if not, write to the Free Software Foundation, Inc., 17 | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
18 | 19 | - homepage : https://sites.google.com/site/powturbo/ 20 | - github : https://github.com/powturbo 21 | - twitter : https://twitter.com/powturbo 22 | - email : powturbo [_AT_] gmail [_DOT_] com 23 | **/ 24 | // intel sse to arm neon 25 | 26 | #ifndef _SSE_NEON_H_ 27 | #define _SSE_NEON_H_ 28 | #include "conf.h" 29 | 30 | #ifdef __ARM_NEON //-------------------------------------------------------------------------------------------------- 31 | #include 32 | #define __m128i uint32x4_t 33 | 34 | //#define USE_MACROS 35 | #define uint8x16_to_8x8x2(_a_) ((uint8x8x2_t) { vget_low_u8(_a_), vget_high_u8(_a_) }) 36 | 37 | #ifdef USE_MACROS //---------------------------- Set : _mm_set_epi/_mm_set1_epi ---------------------------------------------------------- 38 | #define _mm_set_epi8(u15,u14,u13,u12,\ 39 | u11,u10, u9, u8,\ 40 | u7,u6,u5,u4,\ 41 | u3,u2,u1,u0) ({ uint8_t __attribute__((aligned(16))) _u[16] = { u0,u1,u2,u3,u4,u5,u6,u7,u8,u9,u10,u11,u12,u13,u14,u15 }; (uint32x4_t)vld1q_u8( _u);}) 42 | #define _mm_set_epi16( u7,u6,u5,u4,\ 43 | u3,u2,u1,u0) ({ uint16_t __attribute__((aligned(16))) _u[ 8] = { u0,u1,u2,u3,u4,u5,u6,u7 }; (uint32x4_t)vld1q_u16(_u);}) 44 | #define _mm_set_epi32( u3,u2,u1,u0) ({ uint32_t __attribute__((aligned(16))) _u[ 4] = { u0,u1,u2,u3 }; vld1q_u32(_u);}) 45 | #define _mm_set_epi64x( u1,u0) ({ uint64_t __attribute__((aligned(16))) _u[ 2] = { u0,u1 }; (uint32x4_t)vld1q_u64(_u);}) 46 | #define _mm_set_epi32(u3, u2, u1, u0) vcombine_u32(vcreate_u32((uint64_t)u1 << 32 | u0), vcreate_u32((uint64_t)u3 << 32 | u2)) 47 | #define _mm_set_epi64x(u1, u0) (__m128i)vcombine_u64(vcreate_u64(u0), vcreate_u64(u1)) 48 | #else 49 | static ALWAYS_INLINE __m128i _mm_set_epi8( uint8_t u15, uint8_t u14, uint8_t u13, uint8_t u12, uint8_t u11, uint8_t u10, uint8_t u9, uint8_t u8, 50 | uint8_t u7, uint8_t u6, uint8_t u5, uint8_t u4, 51 | uint8_t u3, uint8_t u2, uint8_t u1, uint8_t u0) { 52 | uint8_t __attribute__((aligned(16))) u[16] = { u0,u1,u2,u3,u4,u5,u6,u7,u8,u9,u10,u11,u12,u13,u14,u15 }; return (uint32x4_t)vld1q_u8( u); } 53 | static ALWAYS_INLINE __m128i _mm_set_epi16( uint16_t u7, uint16_t u6, uint16_t u5, uint16_t u4, 54 | uint16_t u3, uint16_t u2, uint16_t u1, uint16_t u0) { uint16_t __attribute__((aligned(16))) u[ 8] = { u0,u1,u2,u3,u4,u5,u6,u7 }; return (uint32x4_t)vld1q_u16(u); } 55 | static ALWAYS_INLINE __m128i _mm_set_epi32( uint32_t u3, uint32_t u2, uint32_t u1, uint32_t u0) { uint32_t __attribute__((aligned(16))) u[ 4] = { u0,u1,u2,u3 }; return vld1q_u32(u); } 56 | static ALWAYS_INLINE __m128i _mm_set_epi64x( uint64_t u1, uint64_t u0) { uint64_t __attribute__((aligned(16))) u[ 2] = { u0,u1 }; return (uint32x4_t)vld1q_u64(u); } 57 | #endif 58 | 59 | #define _mm_set1_epi8( _u8_ ) (__m128i)vdupq_n_u8( _u8_ ) 60 | #define _mm_set1_epi16( _u16_) (__m128i)vdupq_n_u16(_u16_) 61 | #define _mm_set1_epi32( _u32_) vdupq_n_u32(_u32_) 62 | #define _mm_set1_epi64x(_u64_) (__m128i)vdupq_n_u64(_u64_) 63 | #define _mm_setzero_si128() vdupq_n_u32( 0 ) 64 | //---------------------------------------------- Arithmetic ----------------------------------------------------------------------- 65 | #define _mm_add_epi8( _a_,_b_) (__m128i)vaddq_u8((uint8x16_t)(_a_), (uint8x16_t)(_b_)) 66 | #define _mm_add_epi16( _a_,_b_) (__m128i)vaddq_u16((uint16x8_t)(_a_), (uint16x8_t)(_b_)) 67 | #define _mm_add_epi32( _a_,_b_) vaddq_u32( _a_, _b_ ) 68 | #define _mm_sub_epi16( _a_,_b_) (__m128i)vsubq_u16((uint16x8_t)(_a_), (uint16x8_t)(_b_)) 69 | #define _mm_sub_epi32( _a_,_b_) 
(__m128i)vsubq_u32((uint32x4_t)(_a_), (uint32x4_t)(_b_)) 70 | #define _mm_subs_epu8( _a_,_b_) (__m128i)vqsubq_u8((uint8x16_t)(_a_), (uint8x16_t)(_b_)) 71 | 72 | #define _mm_mullo_epi32(_a_,_b_) (__m128i)vmulq_s32(( int32x4_t)(_a_), ( int32x4_t)(_b_)) 73 | #define mm_mullo_epu32(_a_,_b_) vmulq_u32(_a_,_b_) 74 | #define _mm_mul_epu32( _a_,_b_) (__m128i)vmull_u32(vget_low_u32(_a_),vget_low_u32(_b_)) 75 | #define _mm_adds_epu16( _a_,_b_) (__m128i)vqaddq_u16((uint16x8_t)(_a_),(uint16x8_t)(_b_)) 76 | static ALWAYS_INLINE __m128i _mm_madd_epi16(__m128i a, __m128i b) { 77 | int32x4_t mlo = vmull_s16(vget_low_s16( (int16x8_t)a), vget_low_s16( (int16x8_t)b)); 78 | int32x4_t mhi = vmull_s16(vget_high_s16((int16x8_t)a), vget_high_s16((int16x8_t)b)); 79 | int32x2_t alo = vpadd_s32(vget_low_s32(mlo), vget_high_s32(mlo)); 80 | int32x2_t ahi = vpadd_s32(vget_low_s32(mhi), vget_high_s32(mhi)); 81 | return (__m128i)vcombine_s32(alo, ahi); 82 | } 83 | //---------------------------------------------- Special math functions ----------------------------------------------------------- 84 | #define _mm_min_epu8( _a_,_b_) (__m128i)vminq_u8((uint8x16_t)(_a_), (uint8x16_t)(_b_)) 85 | #define _mm_min_epu16( _a_,_b_) (__m128i)vminq_u16((uint16x8_t)(_a_), (uint16x8_t)(_b_)) 86 | #define _mm_min_epi16( _a_,_b_) (__m128i)vminq_s16((int16x8_t)(_a_), (int16x8_t)(_b_)) 87 | //---------------------------------------------- Logical -------------------------------------------------------------------------- 88 | #define mm_testnz_epu32(_a_) vmaxvq_u32(_a_) //vaddvq_u32(_a_) 89 | #define mm_testnz_epu8(_a_) vmaxv_u8(_a_) 90 | #define _mm_or_si128( _a_,_b_) (__m128i)vorrq_u32( (uint32x4_t)(_a_), (uint32x4_t)(_b_)) 91 | #define _mm_and_si128( _a_,_b_) (__m128i)vandq_u32( (uint32x4_t)(_a_), (uint32x4_t)(_b_)) 92 | #define _mm_xor_si128( _a_,_b_) (__m128i)veorq_u32( (uint32x4_t)(_a_), (uint32x4_t)(_b_)) 93 | //---------------------------------------------- Shift ---------------------------------------------------------------------------- 94 | #define _mm_slli_epi16( _a_,_m_) (__m128i)vshlq_n_u16((uint16x8_t)(_a_), _m_) 95 | #define _mm_slli_epi32( _a_,_m_) (__m128i)vshlq_n_u32((uint32x4_t)(_a_), _m_) 96 | #define _mm_slli_epi64( _a_,_m_) (__m128i)vshlq_n_u64((uint64x2_t)(_a_), _m_) 97 | #define _mm_slli_si128( _a_,_m_) (__m128i)vextq_u8(vdupq_n_u8(0), (uint8x16_t)(_a_), 16 - (_m_) ) // _m_: 1 - 15 98 | 99 | #define _mm_srli_epi16( _a_,_m_) (__m128i)vshrq_n_u16((uint16x8_t)(_a_), _m_) 100 | #define _mm_srli_epi32( _a_,_m_) (__m128i)vshrq_n_u32((uint32x4_t)(_a_), _m_) 101 | #define _mm_srli_epi64( _a_,_m_) (__m128i)vshlq_n_u64((uint64x2_t)(_a_), _m_) 102 | #define _mm_srli_si128( _a_,_m_) (__m128i)vextq_s8((int8x16_t)(_a_), vdupq_n_s8(0), (_m_)) 103 | 104 | #define _mm_srai_epi16( _a_,_m_) (__m128i)vshrq_n_s16((int16x8_t)(_a_), _m_) 105 | #define _mm_srai_epi32( _a_,_m_) (__m128i)vshrq_n_s32((int32x4_t)(_a_), _m_) 106 | #define _mm_srai_epi64( _a_,_m_) (__m128i)vshrq_n_s64((int64x2_t)(_a_), _m_) 107 | 108 | #define _mm_sllv_epi32( _a_,_b_) (__m128i)vshlq_u32((uint32x4_t)(_a_), (uint32x4_t)(_b_)) 109 | #define _mm_srlv_epi32( _a_,_b_) (__m128i)vshlq_u32((uint32x4_t)(_a_), vnegq_s32((int32x4_t)(_b_))) 110 | //---------------------------------------------- Compare --------- true/false->1/0 (all bits set) --------------------------------- 111 | #define _mm_cmpeq_epi8( _a_,_b_) (__m128i)vceqq_s8( ( int8x16_t)(_a_), ( int8x16_t)(_b_)) 112 | #define _mm_cmpeq_epi16(_a_,_b_) (__m128i)vceqq_s16(( int16x8_t)(_a_), ( int16x8_t)(_b_)) 113 | 
#define _mm_cmpeq_epi32(_a_,_b_) (__m128i)vceqq_s32(( int32x4_t)(_a_), ( int32x4_t)(_b_)) 114 | 115 | #define _mm_cmpgt_epi16(_a_,_b_) (__m128i)vcgtq_s16(( int16x8_t)(_a_), ( int16x8_t)(_b_)) 116 | #define _mm_cmpgt_epi32(_a_,_b_) (__m128i)vcgtq_s32(( int32x4_t)(_a_), ( int32x4_t)(_b_)) 117 | 118 | #define _mm_cmpgt_epu16(_a_,_b_) (__m128i)vcgtq_u16((uint16x8_t)(_a_), (uint16x8_t)(_b_)) 119 | #define mm_cmpgt_epu32(_a_,_b_) (__m128i)vcgtq_u32( _a_, _b_) 120 | //---------------------------------------------- Load ----------------------------------------------------------------------------- 121 | #define _mm_loadl_epi64( _u64p_) (__m128i)vcombine_s32(vld1_s32((int32_t const *)(_u64p_)), vcreate_s32(0)) 122 | #define mm_loadu_epi64p( _u64p_,_a_) (__m128i)vld1q_lane_u64((uint64_t *)(_u64p_), (uint64x2_t)(_a_), 0) 123 | #define _mm_loadu_si128( _ip_) vld1q_u32(_ip_) 124 | #define _mm_load_si128( _ip_) vld1q_u32(_ip_) 125 | //---------------------------------------------- Store ---------------------------------------------------------------------------- 126 | #define _mm_storel_epi64(_ip_,_a_) vst1q_lane_u64((uint64_t *)(_ip_), (uint64x2_t)(_a_), 0) 127 | #define _mm_storeu_si128(_ip_,_a_) vst1q_u32((__m128i *)(_ip_),_a_) 128 | //---------------------------------------------- Convert -------------------------------------------------------------------------- 129 | #define mm_cvtsi64_si128p(_u64p_,_a_) mm_loadu_epi64p(_u64p_,_a_) 130 | #define _mm_cvtsi64_si128(_a_) (__m128i)vdupq_n_u64(_a_) //vld1q_s64(_a_) 131 | //---------------------------------------------- Reverse bits/bytes --------------------------------------------------------------- 132 | #define mm_rbit_epi8(a) (__m128i)vrbitq_u8( (uint8x16_t)(a)) // reverse bits 133 | #define mm_rev_epi16(a) vrev16q_u8((uint8x16_t)(a)) // reverse bytes 134 | #define mm_rev_epi32(a) vrev32q_u8((uint8x16_t)(a)) 135 | #define mm_rev_epi64(a) vrev64q_u8((uint8x16_t)(a)) 136 | //--------------------------------------------- Insert/extract -------------------------------------------------------------------- 137 | #define mm_extract_epi32x(_a_,_u32_,_id_) vst1q_lane_u32((uint32_t *)&(_u32_), _a_, _id_) 138 | #define _mm_extract_epi64x(_a_,_u64_,_id_) vst1q_lane_u64((uint64_t *)&(_u64_), (uint64x2_t)(_a_), _id_) 139 | 140 | #define _mm_extract_epi8(_a_, _id_) vgetq_lane_u8( (uint8x16_t)(_a_), _id_) 141 | #define _mm_extract_epi16(_a_, _id_) vgetq_lane_u16(_a_, _id_) 142 | #define _mm_extract_epi32(_a_, _id_) vgetq_lane_u32(_a_, _id_) 143 | #define mm_extract_epu32(_a_, _id_) vgetq_lane_u32(_a_, _id_) 144 | #define _mm_cvtsi128_si32(_a_) vgetq_lane_u32((uint32x4_t)(_a_),0) 145 | #define _mm_cvtsi128_si64(_a_) vgetq_lane_u64((uint64x2_t)(_a_),0) 146 | 147 | #define _mm_insert_epu32p(_a_,_u32p_,_id_) vsetq_lane_u32(_x_, _a_, _id_) 148 | #define mm_insert_epi32p(_a_,_u32p_,_id_) vld1q_lane_u32(_u32p_, (uint32x4_t)(_a_), _id_) 149 | #define _mm_cvtsi32_si128(_a_) (__m128i)vsetq_lane_s32(_a_, vdupq_n_s32(0), 0) 150 | 151 | #define _mm_blendv_epi8(_a_,_b_,_m_) vbslq_u32(_m_,_b_,_a_) 152 | //---------------------------------------------- Miscellaneous -------------------------------------------------------------------- 153 | #define _mm_alignr_epi8(_a_,_b_,_m_) (__m128i)vextq_u8( (uint8x16_t)(_b_), (uint8x16_t)(_a_), _m_) 154 | #define _mm_packs_epi16( _a_,_b_) (__m128i)vcombine_s8( vqmovn_s16((int16x8_t)(_a_)), vqmovn_s16((int16x8_t)(_b_))) 155 | #define _mm_packs_epi32( _a_,_b_) (__m128i)vcombine_s16(vqmovn_s32((int32x4_t)(_a_)), vqmovn_s32((int32x4_t)(_b_))) 156 | 
157 | #define _mm_packs_epu16( _a_,_b_) (__m128i)vcombine_u8(vqmovn_u16((uint16x8_t)(_a_)), vqmovn_u16((uint16x8_t)(_b_))) 158 | #define _mm_packus_epi16( _a_,_b_) (__m128i)vcombine_u8(vqmovun_s16((int16x8_t)(_a_)), vqmovun_s16((int16x8_t)(_b_))) 159 | 160 | static ALWAYS_INLINE uint16_t _mm_movemask_epi8(__m128i v) { 161 | const uint8x16_t __attribute__ ((aligned (16))) m = {1, 1<<1, 1<<2, 1<<3, 1<<4, 1<<5, 1<<6, 1<<7, 1, 1<<1, 1<<2, 1<<3, 1<<4, 1<<5, 1<<6, 1<<7}; 162 | uint8x16_t mv = (uint8x16_t)vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(vandq_u8(vcltq_s8((int8x16_t)v, vdupq_n_s8(0)), m)))); 163 | return vgetq_lane_u8(mv, 8) << 8 | vgetq_lane_u8(mv, 0); 164 | } 165 | //-------- Neon movemask ------ All lanes must be 0 or -1 (=0xff, 0xffff or 0xffffffff) 166 | #ifdef __aarch64__ 167 | static ALWAYS_INLINE uint8_t mm_movemask_epi8s(uint8x8_t sv) { const uint8x8_t m = { 1, 1<<1, 1<<2, 1<<3, 1<<4, 1<< 5, 1<< 6, 1<<7 }; return vaddv_u8( vand_u8( sv, m)); } // short only ARM 168 | //static ALWAYS_INLINE uint16_t mm_movemask_epu16(uint32x4_t v) { const uint16x8_t m = { 1, 1<<2, 1<<4, 1<<6, 1<<8, 1<<10, 1<<12, 1<<14}; return vaddvq_u16(vandq_u16((uint16x8_t)v, m)); } 169 | static ALWAYS_INLINE uint16_t mm_movemask_epu16(__m128i v) { const uint16x8_t m = { 1, 1<<1, 1<<2, 1<<3, 1<<4, 1<< 5, 1<< 6, 1<<7 }; return vaddvq_u16(vandq_u16((uint16x8_t)v, m)); } 170 | static ALWAYS_INLINE uint32_t mm_movemask_epu32(__m128i v) { const uint32x4_t m = { 1, 1<<1, 1<<2, 1<<3 }; return vaddvq_u32(vandq_u32((uint32x4_t)v, m)); } 171 | static ALWAYS_INLINE uint64_t mm_movemask_epu64(__m128i v) { const uint64x2_t m = { 1, 1<<1 }; return vaddvq_u64(vandq_u64((uint64x2_t)v, m)); } 172 | #else 173 | static ALWAYS_INLINE uint32_t mm_movemask_epu32(uint32x4_t v) { const uint32x4_t mask = {1,2,4,8}, av = vandq_u32(v, mask), xv = vextq_u32(av, av, 2), ov = vorrq_u32(av, xv); return vgetq_lane_u32(vorrq_u32(ov, vextq_u32(ov, ov, 3)), 0); } 174 | #endif 175 | // --------------------------------------------- Swizzle : _mm_shuffle_epi8 / _mm_shuffle_epi32 / Pack/Unpack ----------------------------------------- 176 | #define _MM_SHUFFLE(u3,u2,u1,u0) ((u3) << 6 | (u2) << 4 | (u1) << 2 | (u0)) 177 | 178 | #define _mm_shuffle_epi8(_a_, _b_) (__m128i)vqtbl1q_u8((uint8x16_t)(_a_), (uint8x16_t)(_b_)) 179 | #if defined(__aarch64__) 180 | #define mm_shuffle_nnnn_epi32(_a_,_m_) (__m128i)vdupq_laneq_u32(_a_, _m_) 181 | #else 182 | #define mm_shuffle_nnnn_epi32(_a_,_m_) (__m128i)vdupq_n_u32(vgetq_lane_u32(_a_, _m_)) 183 | #endif 184 | 185 | #ifdef USE_MACROS 186 | #define mm_shuffle_2031_epi32(_a_) ({ uint32x4_t _zv = (uint32x4_t)vrev64q_u32(_a_); uint32x2x2_t _zw = vtrn_u32(vget_low_u32(_zv), vget_high_u32(_zv)); vcombine_u32(_zw.val[0], _zw.val[1]);}) 187 | #define mm_shuffle_3120_epi32(_a_) ({ uint32x4_t _zv = _a_; uint32x2x2_t _zw = vtrn_u32(vget_low_u32(_zv), vget_high_u32(_zv)); vcombine_u32(_zw.val[0], _zw.val[1]);}) 188 | #else 189 | static ALWAYS_INLINE __m128i mm_shuffle_2031_epi32(__m128i a) { uint32x4_t v = (uint32x4_t)vrev64q_u32(a); uint32x2x2_t z = vtrn_u32(vget_low_u32(v), vget_high_u32(v)); return vcombine_u32(z.val[0], z.val[1]);} 190 | static ALWAYS_INLINE __m128i mm_shuffle_3120_epi32(__m128i a) { uint32x2x2_t z = vtrn_u32(vget_low_u32(a), vget_high_u32(a)); return vcombine_u32(z.val[0], z.val[1]);} 191 | #endif 192 | 193 | #if defined(USE_MACROS) || defined(__clang__) 194 | #define _mm_shuffle_epi32(_a_, _m_) ({ const uint32x4_t _av =_a_;\ 195 | uint32x4_t _v = vmovq_n_u32(vgetq_lane_u32(_av, (_m_) & 0x3));\ 196 | _v = vsetq_lane_u32(vgetq_lane_u32(_av, ((_m_)
>> 2) & 0x3), _v, 1);\ 197 | _v = vsetq_lane_u32(vgetq_lane_u32(_av, ((_m_) >> 4) & 0x3), _v, 2);\ 198 | _v = vsetq_lane_u32(vgetq_lane_u32(_av, ((_m_) >> 6) & 0x3), _v, 3); _v;\ 199 | }) 200 | #define _mm_shuffle_epi32s(_a_, _m_) _mm_set_epi32(vgetq_lane_u32(_a_, ((_m_) ) & 0x3),\ 201 | vgetq_lane_u32(_a_, ((_m_) >> 2) & 0x3),\ 202 | vgetq_lane_u32(_a_, ((_m_) >> 4) & 0x3),\ 203 | vgetq_lane_u32(_a_, ((_m_) >> 6) & 0x3)) 204 | #else 205 | static ALWAYS_INLINE __m128i _mm_shuffle_epi32(__m128i _a_, const unsigned _m_) { const uint32x4_t _av =_a_; 206 | uint32x4_t _v = vmovq_n_u32(vgetq_lane_u32(_av, (_m_) & 0x3)); 207 | _v = vsetq_lane_u32(vgetq_lane_u32(_av, ((_m_) >> 2) & 0x3), _v, 1); 208 | _v = vsetq_lane_u32(vgetq_lane_u32(_av, ((_m_) >> 4) & 0x3), _v, 2); 209 | _v = vsetq_lane_u32(vgetq_lane_u32(_av, ((_m_) >> 6) & 0x3), _v, 3); 210 | return _v; 211 | } 212 | static ALWAYS_INLINE __m128i _mm_shuffle_epi32s(__m128i _a_, const unsigned _m_) { 213 | return _mm_set_epi32(vgetq_lane_u32(_a_, ((_m_) ) & 0x3), 214 | vgetq_lane_u32(_a_, ((_m_) >> 2) & 0x3), 215 | vgetq_lane_u32(_a_, ((_m_) >> 4) & 0x3), 216 | vgetq_lane_u32(_a_, ((_m_) >> 6) & 0x3)); 217 | } 218 | #endif 219 | #ifdef USE_MACROS 220 | #define _mm_unpacklo_epi8( _a_,_b_) ({ uint8x8x2_t _zv = vzip_u8 ( vget_low_u8( (uint8x16_t)(_a_)), vget_low_u8 ((uint8x16_t)(_b_))); (uint32x4_t)vcombine_u8( _zv.val[0], _zv.val[1]);}) 221 | #define _mm_unpacklo_epi16(_a_,_b_) ({ uint16x4x2_t _zv = vzip_u16( vget_low_u16((uint16x8_t)(_a_)), vget_low_u16((uint16x8_t)(_b_))); (uint32x4_t)vcombine_u16(_zv.val[0], _zv.val[1]);}) 222 | #define _mm_unpacklo_epi32(_a_,_b_) ({ uint32x2x2_t _zv = vzip_u32( vget_low_u32( _a_ ), vget_low_u32( _b_ )); vcombine_u32(_zv.val[0], _zv.val[1]);}) 223 | #define _mm_unpacklo_epi64(_a_,_b_) (uint32x4_t)vcombine_u64(vget_low_u64((uint64x2_t)(_a_)), vget_low_u64((uint64x2_t)(_b_))) 224 | 225 | #define _mm_unpackhi_epi8( _a_,_b_) ({ uint8x8x2_t _zv = vzip_u8 (vget_high_u8( (uint8x16_t)(_a_)), vget_high_u8( (uint8x16_t)(_b_))); (uint32x4_t)vcombine_u8( _zv.val[0], _zv.val[1]);}) 226 | #define _mm_unpackhi_epi16(_a_,_b_) ({ uint16x4x2_t _zv = vzip_u16(vget_high_u16((uint16x8_t)(_a_)), vget_high_u16((uint16x8_t)(_b_))); (uint32x4_t)vcombine_u16(_zv.val[0], _zv.val[1]);}) 227 | #define _mm_unpackhi_epi32(_a_,_b_) ({ uint32x2x2_t _zv = vzip_u32(vget_high_u32( _a_ ), vget_high_u32( _b_ )); vcombine_u32(_zv.val[0], _zv.val[1]);}) 228 | #define _mm_unpackhi_epi64(_a_,_b_) (uint32x4_t)vcombine_u64(vget_high_u64((uint64x2_t)(_a_)), vget_high_u64((uint64x2_t)(_b_))) 229 | #else 230 | static ALWAYS_INLINE __m128i _mm_unpacklo_epi8( __m128i _a_, __m128i _b_) { uint8x8x2_t _zv = vzip_u8 ( vget_low_u8( (uint8x16_t)(_a_)), vget_low_u8 ((uint8x16_t)(_b_))); return (uint32x4_t)vcombine_u8( _zv.val[0], _zv.val[1]);} 231 | static ALWAYS_INLINE __m128i _mm_unpacklo_epi16(__m128i _a_, __m128i _b_) { uint16x4x2_t _zv = vzip_u16( vget_low_u16((uint16x8_t)(_a_)), vget_low_u16((uint16x8_t)(_b_))); return (uint32x4_t)vcombine_u16(_zv.val[0], _zv.val[1]);} 232 | static ALWAYS_INLINE __m128i _mm_unpacklo_epi32(__m128i _a_, __m128i _b_) { uint32x2x2_t _zv = vzip_u32( vget_low_u32( _a_ ), vget_low_u32( _b_ )); return vcombine_u32(_zv.val[0], _zv.val[1]);} 233 | static ALWAYS_INLINE __m128i _mm_unpacklo_epi64(__m128i _a_, __m128i _b_) { return (uint32x4_t)vcombine_u64(vget_low_u64((uint64x2_t)(_a_)), vget_low_u64((uint64x2_t)(_b_))); } 234 | 235 | static ALWAYS_INLINE __m128i _mm_unpackhi_epi8( __m128i _a_, __m128i _b_) { uint8x8x2_t _zv = vzip_u8 
(vget_high_u8( (uint8x16_t)(_a_)), vget_high_u8( (uint8x16_t)(_b_))); return (uint32x4_t)vcombine_u8( _zv.val[0], _zv.val[1]); } 236 | static ALWAYS_INLINE __m128i _mm_unpackhi_epi16(__m128i _a_, __m128i _b_) { uint16x4x2_t _zv = vzip_u16(vget_high_u16((uint16x8_t)(_a_)), vget_high_u16((uint16x8_t)(_b_))); return (uint32x4_t)vcombine_u16(_zv.val[0], _zv.val[1]); } 237 | static ALWAYS_INLINE __m128i _mm_unpackhi_epi32(__m128i _a_, __m128i _b_) { uint32x2x2_t _zv = vzip_u32(vget_high_u32( _a_ ), vget_high_u32( _b_ )); return vcombine_u32(_zv.val[0], _zv.val[1]); } 238 | static ALWAYS_INLINE __m128i _mm_unpackhi_epi64(__m128i _a_, __m128i _b_) { return (uint32x4_t)vcombine_u64(vget_high_u64((uint64x2_t)(_a_)), vget_high_u64((uint64x2_t)(_b_))); } 239 | #endif 240 | 241 | #else //------------------------------------- intel SSE2/SSSE3 -------------------------------------------------------------- 242 | #define mm_movemask_epu32(_a_) _mm_movemask_ps(_mm_castsi128_ps(_a_)) 243 | #define mm_movemask_epu16(_a_) _mm_movemask_epi8(_a_) 244 | #define mm_loadu_epi64p( _u64p_,_a_) _a_ = _mm_cvtsi64_si128(ctou64(_u64p_)) 245 | 246 | #define mm_extract_epu32( _a_, _id_) _mm_extract_epi32(_a_, _id_) 247 | #define mm_extract_epi32x(_a_,_u32_, _id_) _u32_ = _mm_extract_epi32(_a_, _id_) 248 | #define mm_extract_epi64x(_a_,_u64_, _id_) _u64_ = _mm_extract_epi64(_a_, _id_) 249 | #define mm_insert_epi32p( _a_,_u32p_,_c_) _mm_insert_epi32( _a_,ctou32(_u32p_),_c_) 250 | 251 | #define mm_mullo_epu32( _a_,_b_) _mm_mullo_epi32(_a_,_b_) 252 | #define mm_cvtsi64_si128p(_u64p_,_a_) _a_ = _mm_cvtsi64_si128(ctou64(_u64p_)) 253 | 254 | #define mm_cmpgt_epu32( _a_, _b_) _mm_cmpgt_epi32(_mm_xor_si128(_a_, cv80000000), _mm_xor_si128(_b_, cv80000000)) 255 | 256 | #define mm_shuffle_nnnn_epi32(_a_, _n_) _mm_shuffle_epi32(_a_, _MM_SHUFFLE(_n_,_n_,_n_,_n_)) 257 | #define mm_shuffle_2031_epi32(_a_) _mm_shuffle_epi32(_a_, _MM_SHUFFLE(2,0,3,1)) 258 | #define mm_shuffle_3120_epi32(_a_) _mm_shuffle_epi32(_a_, _MM_SHUFFLE(3,1,2,0)) 259 | 260 | static ALWAYS_INLINE __m128i mm_rbit_epi8(__m128i v) { // reverse bits in bytes 261 | __m128i fv = _mm_set_epi8(15, 7,11, 3,13, 5, 9, 1,14, 6,10, 2,12, 4, 8, 0), cv0f_8 = _mm_set1_epi8(0xf); 262 | __m128i lv = _mm_shuffle_epi8(fv,_mm_and_si128( v, cv0f_8)); 263 | __m128i hv = _mm_shuffle_epi8(fv,_mm_and_si128(_mm_srli_epi64(v, 4), cv0f_8)); 264 | return _mm_or_si128(_mm_slli_epi64(lv,4), hv); 265 | } 266 | 267 | static ALWAYS_INLINE __m128i mm_rev_epi16(__m128i v) { return _mm_shuffle_epi8(v, _mm_set_epi8(14,15,12,13,10,11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1)); } // reverse vector bytes in uint??_t 268 | static ALWAYS_INLINE __m128i mm_rev_epi32(__m128i v) { return _mm_shuffle_epi8(v, _mm_set_epi8(12,13,14,15, 8, 9,10,11, 4, 5, 6, 7, 0, 1, 2, 3)); } 269 | static ALWAYS_INLINE __m128i mm_rev_epi64(__m128i v) { return _mm_shuffle_epi8(v, _mm_set_epi8( 8, 9,10,11,12,13,14,15, 0, 1, 2, 3, 4, 5, 6, 7)); } 270 | static ALWAYS_INLINE __m128i mm_rev_si128(__m128i v) { return _mm_shuffle_epi8(v, _mm_set_epi8( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15)); } 271 | #endif 272 | #endif 273 | -------------------------------------------------------------------------------- /turbohist_.c: -------------------------------------------------------------------------------- 1 | /** 2 | Copyright (c) 2013-2022, Powturbo 3 | - homepage : https://sites.google.com/site/powturbo/ 4 | - github : https://github.com/powturbo 5 | - twitter : https://twitter.com/powturbo 6 | - email : powturbo [_AT_] gmail [_DOT_] com 7 | All 
rights reserved. 8 | 9 | Redistribution and use in source and binary forms, with or without 10 | modification, are permitted provided that the following conditions are 11 | met: 12 | 13 | 1. Redistributions of source code must retain the above copyright 14 | notice, this list of conditions and the following disclaimer. 15 | 16 | 2. Redistributions in binary form must reproduce the above copyright 17 | notice, this list of conditions and the following disclaimer in the 18 | documentation and/or other materials provided with the distribution. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS 21 | IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 22 | TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A 23 | PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 24 | HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 25 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED 26 | TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 27 | PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 28 | LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 29 | NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 30 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 31 | **/ 32 | // 1D Histogram: hist[r]_X_Y r:run aware X: number bins used, Y: processing unit 1:8 bits, 4:32 bits, 8:64 bits 33 | #include "conf.h" 34 | #ifdef __ARM_NEON 35 | #define PREFETCH(_ip_,_rw_) 36 | #else 37 | #define PREFETCH(_ip_,_rw_) __builtin_prefetch(_ip_,_rw_) 38 | #endif 39 | 40 | #define CSIZE (256 + 8) 41 | typedef unsigned cnt_t; 42 | 43 | #if 1 // fast when auto-vectorization enabled (for ex. 
with gcc -O3) 44 | #define HISTEND(_c_,_cn_,_cnt_) { int _i,_j;\ 45 | memset(_cnt_, 0, 256*sizeof(_cnt_[0]));\ 46 | for(_i=0; _i < 256; _i++)\ 47 | for(_j=0; _j < _cn_;_j++) _cnt_[_i] += _c_[_j][_i];\ 48 | } 49 | 50 | #define HISTEND8(_c_,_cnt_) HISTEND(_c_,8,_cnt_) 51 | #define HISTEND4(_c_,_cnt_) HISTEND(_c_,4,_cnt_) 52 | #else 53 | static ALWAYS_INLINE void histend4(cnt_t c[4][CSIZE], cnt_t *__restrict cnt) { unsigned i; 54 | #ifdef __AVX2__ 55 | for(i = 0; i != 256; i+=8) { 56 | __m256i sv = _mm256_load_si256((const __m256i *)&c[0][i]); 57 | sv = _mm256_add_epi32(_mm256_load_si256((const __m256i *)&c[1][i]), sv); 58 | sv = _mm256_add_epi32(_mm256_load_si256((const __m256i *)&c[2][i]), sv); 59 | sv = _mm256_add_epi32(_mm256_load_si256((const __m256i *)&c[3][i]), sv); 60 | _mm256_storeu_si256((__m256i *)&cnt[i], sv); 61 | } 62 | #elif defined(__SSE2__) || defined(__ARM_NEON) 63 | for(i = 0; i != 256; i+=4) { 64 | __m128i sv = _mm_load_si128((const __m128i *)&c[0][i]); 65 | sv = _mm_add_epi32(_mm_load_si128((const __m128i *)&c[1][i]), sv); 66 | sv = _mm_add_epi32(_mm_load_si128((const __m128i *)&c[2][i]), sv); 67 | sv = _mm_add_epi32(_mm_load_si128((const __m128i *)&c[3][i]), sv); 68 | _mm_storeu_si128((__m128i *)&cnt[i], sv); 69 | } 70 | #else 71 | for(i = 0; i != 256; i++) cnt[i] = c[0][i]+c[1][i]+c[2][i]+c[3][i]; 72 | #endif 73 | } 74 | 75 | static ALWAYS_INLINE void histend8(cnt_t c[8][CSIZE], cnt_t *__restrict cnt) { unsigned i; 76 | #ifdef __AVX2__ 77 | for(i = 0; i != 256; i+=8) { 78 | __m256i v0 = _mm256_load_si256((const __m256i *)&c[0][i]); 79 | __m256i v1 = _mm256_load_si256((const __m256i *)&c[1][i]); 80 | __m256i s0 = _mm256_add_epi32(v0, v1); 81 | v0 = _mm256_load_si256((const __m256i *)&c[2][i]); 82 | v1 = _mm256_load_si256((const __m256i *)&c[3][i]); 83 | __m256i s1 = _mm256_add_epi32(v0, v1); 84 | s0 = _mm256_add_epi32(s0, s1); 85 | 86 | v0 = _mm256_load_si256((const __m256i *)&c[4][i]); 87 | v1 = _mm256_load_si256((const __m256i *)&c[5][i]); 88 | s1 = _mm256_add_epi32(v0, v1); 89 | v0 = _mm256_load_si256((const __m256i *)&c[6][i]); 90 | v1 = _mm256_load_si256((const __m256i *)&c[7][i]); 91 | s0 = _mm256_add_epi32(s0, v0); 92 | s1 = _mm256_add_epi32(s1, v1); 93 | 94 | _mm256_storeu_si256((__m256i *)&cnt[i], _mm256_add_epi32(s0, s1)); 95 | } 96 | #elif defined(__SSE2__) || defined(__ARM_NEON) 97 | for(i = 0; i != 256; i+=4) { 98 | __m128i v0 = _mm_load_si128((const __m128i *)&c[0][i]); 99 | __m128i v1 = _mm_load_si128((const __m128i *)&c[1][i]); 100 | __m128i sv = _mm_add_epi32(v0, v1); 101 | v0 = _mm_load_si128((const __m128i *)&c[2][i]); 102 | v1 = _mm_load_si128((const __m128i *)&c[3][i]); 103 | sv = _mm_add_epi32(sv, v0); 104 | sv = _mm_add_epi32(sv, v1); 105 | 106 | v0 = _mm_load_si128((const __m128i *)&c[4][i]); 107 | v1 = _mm_load_si128((const __m128i *)&c[5][i]); 108 | sv = _mm_add_epi32(sv, v0); 109 | sv = _mm_add_epi32(sv, v1); 110 | v0 = _mm_load_si128((const __m128i *)&c[6][i]); 111 | v1 = _mm_load_si128((const __m128i *)&c[7][i]); 112 | sv = _mm_add_epi32(sv, v0); 113 | _mm_storeu_si128((__m128i *)&cnt[i], _mm_add_epi32(sv, v1)); 114 | } 115 | #else 116 | for(i = 0; i != 256; i++) cnt[i] = c[0][i]+c[1][i]+c[2][i]+c[3][i]+c[4][i]+c[5][i]+c[6][i]+c[7][i]; 117 | #endif 118 | } 119 | 120 | #define HISTEND8(_c_,_cnt_) histend8(_c_,_cnt_) 121 | #define HISTEND4(_c_,_cnt_) histend4(_c_,_cnt_) 122 | #endif 123 | 124 | //---------------------------- 8 bits ------------------------------------------------------ 125 | static void hist_1_8(unsigned char *__restrict in,
unsigned inlen, cnt_t *__restrict cnt) { 126 | unsigned char *ip = in; 127 | 128 | memset(cnt, 0, 256*sizeof(cnt[0])); 129 | while(ip < in+inlen) cnt[*ip++]++; 130 | } 131 | 132 | static void hist_4_8(unsigned char *__restrict in, unsigned inlen, cnt_t *__restrict cnt) { 133 | cnt_t c[4][CSIZE] = {0},i; 134 | unsigned char *ip = in; 135 | 136 | while(ip != in+(inlen&~(4-1))) c[0][*ip++]++, c[1][*ip++]++, c[2][*ip++]++, c[3][*ip++]++; 137 | while(ip != in+ inlen ) c[0][*ip++]++; 138 | HISTEND4(c, cnt); 139 | } 140 | 141 | static void hist_8_8(unsigned char *__restrict in, unsigned inlen, cnt_t *__restrict cnt) { 142 | cnt_t c[8][CSIZE] = {0},i; 143 | unsigned char *ip = in; 144 | 145 | while(ip != in+(inlen&~(8-1))) c[0][*ip++]++, c[1][*ip++]++, c[2][*ip++]++, c[3][*ip++]++, c[4][*ip++]++, c[5][*ip++]++, c[6][*ip++]++, c[7][*ip++]++; 146 | while(ip != in+ inlen ) c[0][*ip++]++; 147 | HISTEND8(c, cnt); 148 | } 149 | 150 | //----------------------------- 32 bits -------------------------------------------------------- 151 | #if defined(__i386__) || defined(__x86_64__) 152 | #define CU32(_u_,_i_,_c_) {\ 153 | c[_i_+0][(unsigned char )(_u_) ]+=_c_;\ 154 | c[_i_+1][(unsigned short)(_u_)>>8]+=_c_; _u_>>=16;\ 155 | c[_i_+2][(unsigned char )(_u_) ]+=_c_;\ 156 | c[_i_+3][(unsigned short)(_u_)>>8]+=_c_;\ 157 | } 158 | #else 159 | #define CU32(_u_,_i_,_c_) {\ 160 | c[_i_+0][(unsigned char) (_u_) ]+=_c_;\ 161 | c[_i_+1][(unsigned char)((_u_)>> 8)]+=_c_;\ 162 | c[_i_+2][(unsigned char)((_u_)>>16)]+=_c_;\ 163 | c[_i_+3][ (_u_)>>24 ]+=_c_;\ 164 | } 165 | #endif 166 | 167 | #define UZ 4 // Load size 1x 32 bits = 4 bytes 168 | #define I132(_i_,_o_) { unsigned u1 = ctou32(ip+UZ+_i_*UZ*2+0); CU32(u0, 0, 1);\ 169 | u0 = ctou32(ip+UZ+_i_*UZ*2+4); CU32(u1,_o_,1);\ 170 | } 171 | 172 | #define N32 32 173 | static void hist_4_32(unsigned char *__restrict in, unsigned inlen, cnt_t *__restrict cnt) { 174 | #define IC 0 175 | cnt_t c[4][CSIZE] = {0}, i; 176 | unsigned char *ip = in; 177 | 178 | if(inlen >= UZ+N32) { 179 | unsigned u0 = ctou32(ip); 180 | for(; ip <= in+inlen-(UZ+N32); ip += N32) { 181 | I132(0,IC); I132(1,IC); I132(2,IC); I132(3,IC); 182 | PREFETCH(ip+512, 0); 183 | } 184 | } 185 | while(ip != in+inlen) c[0][*ip++]++; 186 | HISTEND4(c, cnt); 187 | } 188 | 189 | static void hist_8_32(unsigned char *__restrict in, unsigned inlen, unsigned *__restrict cnt) { 190 | #define IC 4 191 | cnt_t c[8][CSIZE] = {0}, i; 192 | unsigned char *ip = in; 193 | 194 | if(inlen >= UZ+N32) { 195 | unsigned u0 = ctou32(ip); 196 | for(; ip <= in+inlen-(UZ+N32); ip += N32) { 197 | I132(0,IC); I132(1,IC); I132(2,IC); I132(3,IC); //I132(4,IC); I132(5,IC); I132(6,IC); I132(7,IC); 198 | PREFETCH(ip+512, 0); 199 | } 200 | } 201 | while(ip != in+inlen) c[0][*ip++]++; 202 | HISTEND8(c, cnt); 203 | } 204 | 205 | //-------------------- 64 bits --------------------------------------------------- 206 | #if defined(__i386__) || defined(__x86_64__) 207 | #define CU64(_u_,_o_,_c_) { unsigned _x = _u_;\ 208 | c[0 ][(unsigned char )_x ]+=_c_;\ 209 | c[1 ][(unsigned short)_x>> 8]+=_c_; _x>>=16;\ 210 | c[2 ][(unsigned char )_x ]+=_c_;\ 211 | c[3 ][(unsigned short)_x>> 8]+=_c_; _x=(_u_)>>=32;\ 212 | c[0+_o_][(unsigned char )_x ]+=_c_;\ 213 | c[1+_o_][(unsigned short)_x>> 8]+=_c_; _x>>=16;\ 214 | c[2+_o_][(unsigned char )_x ]+=_c_;\ 215 | c[3+_o_][(unsigned short)_x>> 8]+=_c_;\ 216 | } 217 | #else 218 | #define CU64(_u_,_o_,_c_) { unsigned _x = _u_;\ 219 | c[0 ][(unsigned char) _x ]+=_c_;\ 220 | c[1 ][(unsigned char)(_x>> 8)]+=_c_;\ 221 | c[2 
][(unsigned char)(_x>>16)]+=_c_;\ 222 | c[3 ][ _x>>24 ]+=_c_; _x=(_u_)>>=32;\ 223 | c[0+_o_][(unsigned char) _x ]+=_c_;\ 224 | c[1+_o_][(unsigned char)(_x>> 8)]+=_c_;\ 225 | c[2+_o_][(unsigned char)(_x>>16)]+=_c_;\ 226 | c[3+_o_][ _x>>24 ]+=_c_;\ 227 | } 228 | #endif 229 | 230 | #define UZ 8 // Load size 1x 64 bits = 8 bytes 231 | #define I164(_i_,_o_) { uint64_t u1 = ctou64(ip+UZ+_i_*UZ*2+ 0); CU64(u0, _o_, 1);\ 232 | u0 = ctou64(ip+UZ+_i_*UZ*2+ 8); CU64(u1, _o_, 1);\ 233 | } 234 | 235 | #define N64 64 236 | static void hist_4_64(unsigned char *__restrict in, unsigned inlen, cnt_t *__restrict cnt) { 237 | #define IC 0 238 | cnt_t c[4][CSIZE] = {0}, i; 239 | unsigned char *ip = in; 240 | 241 | if(inlen >= UZ+N64) { 242 | uint64_t u0 = ctou64(ip); 243 | for(; ip <= in+inlen-(UZ+N64); ip += N64) { 244 | I164(0,IC); I164(1,IC); I164(2,IC); I164(3,IC); 245 | PREFETCH(ip+512, 0); 246 | } 247 | } 248 | while(ip != in+inlen) c[0][*ip++]++; 249 | HISTEND4(c, cnt); 250 | } 251 | 252 | static void hist_8_64(unsigned char *__restrict in, unsigned inlen, unsigned *__restrict cnt) { 253 | #define IC 4 254 | cnt_t c[8][CSIZE] = {0}, i; 255 | unsigned char *ip = in; 256 | 257 | if(inlen >= UZ+N64) { 258 | uint64_t u0 = ctou64(ip); 259 | for(; ip <= in+inlen-(UZ+N64); ip += N64) { 260 | I164(0,IC); I164(1,IC); I164(2,IC); I164(3,IC); 261 | PREFETCH(ip+512, 0); 262 | } 263 | } 264 | while(ip != in+inlen) c[0][*ip++]++; 265 | HISTEND8(c, cnt); 266 | } 267 | 268 | //----- hist_8_64a with inline assembly ----------------------------------------- 269 | #ifdef __x86_64 270 | #define RSHR(r, b) __asm volatile ("shr %1, %0": "+r" (r): "i" (b) ) 271 | 272 | #define CU16(x, u, offset, size, base, scale) \ 273 | __asm volatile (\ 274 | "movzbl %b1, %k0\n"\ 275 | "incl (%c2+0)*%c3(%4, %0, %c5)\n"\ 276 | "movzbl %h1, %k0\n"\ 277 | "incl (%c2+1)*%c3(%4, %0, %c5)\n"\ 278 | :"=&R" (x)\ 279 | :"Q" (u), "i" (offset), "i" (size), "r" (base), "i" (scale) \ 280 | :"memory"\ 281 | ) 282 | 283 | #define N64 64 284 | unsigned hist_8_64a(unsigned char *in, unsigned inlen, unsigned *__restrict cnt) { 285 | unsigned c[8][CSIZE]= {0}; 286 | unsigned char *ip = in; 287 | 288 | if(inlen >= 8+N64) { 289 | uint64_t u0 = ctou64(ip),b; 290 | for(; ip <= in+inlen-(8+N64); ip += N64) { 291 | uint64_t x, u1; 292 | #define ST(u) CU16(x, u, 0, CSIZE*4, c, 4);\ 293 | RSHR(u, 16); CU16(x, u, 2, CSIZE*4, c, 4);\ 294 | RSHR(u, 16); CU16(x, u, 4, CSIZE*4, c, 4);\ 295 | RSHR(u, 16); CU16(x, u, 6, CSIZE*4, c, 4); 296 | u1 = ctou64(ip+8+ 0); ST(u0); 297 | u0 = ctou64(ip+8+ 8); ST(u1); 298 | u1 = ctou64(ip+8+16); ST(u0); 299 | u0 = ctou64(ip+8+24); ST(u1); 300 | u1 = ctou64(ip+8+32); ST(u0); 301 | u0 = ctou64(ip+8+40); ST(u1); 302 | u1 = ctou64(ip+8+48); ST(u0); 303 | u0 = ctou64(ip+8+56); ST(u1); PREFETCH(ip+768, 0); 304 | } 305 | } 306 | while(ip < in+inlen) c[0][*ip++]++; 307 | HISTEND8(c, cnt); 308 | } 309 | #endif 310 | 311 | #define UZ 16 // Load size 2x 64 bits = 2*8 bytes 312 | #define CR64(u,v,_o_,_c_) if(likely(u!=v)) { CU64(u,_o_,1); CU64(v,_o_,1); } else if((u^(v<<8)) < (1<<8)) c[_c_][(unsigned char)u]+=UZ; else CU64(u, _o_,2) 313 | #define I2R64(_i_,_o_) { uint64_t u1 = ctou64(ip+UZ+_i_*UZ*2+ 0), v1 = ctou64(ip+UZ+_i_*UZ*2+ 8); CR64(u0,v0,_o_,_i_);\ 314 | u0 = ctou64(ip+UZ+_i_*UZ*2+16); v0 = ctou64(ip+UZ+_i_*UZ*2+24); CR64(u1,v1,_o_,_i_);\ 315 | } 316 | 317 | #define N64 64 318 | static void histr_4_64(unsigned char *__restrict in, unsigned inlen, cnt_t *__restrict cnt) { 319 | #define IC 0 320 | cnt_t c[4][CSIZE] = {0},i; 321 | unsigned 
char *ip = in,*in_; 322 | 323 | if(inlen >= UZ+N64) { 324 | uint64_t u0 = ctou64(ip), v0 = ctou64(ip+8); 325 | for(; ip <= in+inlen-(UZ+N64); ip += N64) { 326 | I2R64(0,IC); I2R64(1,IC); 327 | PREFETCH(ip+512, 0); 328 | } 329 | } 330 | while(ip != in+inlen) 331 | c[0][*ip++]++; 332 | HISTEND4(c, cnt); 333 | } 334 | 335 | static void histr_8_64(unsigned char *__restrict in, unsigned inlen, unsigned *__restrict cnt) { 336 | #define IC 4 337 | cnt_t c[8][CSIZE] = {0},i; 338 | unsigned char *ip = in,*in_; 339 | 340 | if(inlen >= UZ+N64) { 341 | uint64_t u0 = ctou64(ip), v0 = ctou64(ip+8); 342 | for(; ip <= in+inlen-(UZ+N64); ip += N64) { 343 | I2R64(0,IC); I2R64(1,IC); 344 | PREFETCH(ip+512, 0); 345 | } 346 | } 347 | while(ip != in+inlen) c[0][*ip++]++; 348 | HISTEND8(c, cnt); 349 | } 350 | 351 | #if defined(__SSE4_1__) || defined(__ARM_NEON) //---------- sse4.1 --------------------------------------- 352 | #ifdef __SSE4_1__ 353 | #include <smmintrin.h> 354 | #else 355 | #include "sse_neon.h" 356 | #endif 357 | static void hist_4_128(unsigned char *__restrict in, unsigned inlen, unsigned *__restrict cnt) { 358 | cnt_t c[4][CSIZE]={0},i; 359 | 360 | unsigned char *ip = in; 361 | if(inlen >= 32+64) { 362 | __m128i u0 = _mm_loadu_si128((__m128i*)ip), v0 = _mm_loadu_si128((__m128i*)(ip+16)); 363 | for(; ip <= in+inlen-(32+64); ip += 64) { 364 | __m128i u1 = _mm_loadu_si128((__m128i*)(ip+32)), v1 = _mm_loadu_si128((__m128i*)(ip+32+16)); 365 | c[0][_mm_extract_epi8(u0, 0)]++; 366 | c[1][_mm_extract_epi8(v0, 0)]++; 367 | c[2][_mm_extract_epi8(u0, 1)]++; 368 | c[3][_mm_extract_epi8(v0, 1)]++; 369 | c[0][_mm_extract_epi8(u0, 2)]++; 370 | c[1][_mm_extract_epi8(v0, 2)]++; 371 | c[2][_mm_extract_epi8(u0, 3)]++; 372 | c[3][_mm_extract_epi8(v0, 3)]++; 373 | c[0][_mm_extract_epi8(u0, 4)]++; 374 | c[1][_mm_extract_epi8(v0, 4)]++; 375 | c[2][_mm_extract_epi8(u0, 5)]++; 376 | c[3][_mm_extract_epi8(v0, 5)]++; 377 | c[0][_mm_extract_epi8(u0, 6)]++; 378 | c[1][_mm_extract_epi8(v0, 6)]++; 379 | c[2][_mm_extract_epi8(u0, 7)]++; 380 | c[3][_mm_extract_epi8(v0, 7)]++; 381 | c[0][_mm_extract_epi8(u0, 8)]++; 382 | c[1][_mm_extract_epi8(v0, 8)]++; 383 | c[2][_mm_extract_epi8(u0, 9)]++; 384 | c[3][_mm_extract_epi8(v0, 9)]++; 385 | c[0][_mm_extract_epi8(u0, 10)]++; 386 | c[1][_mm_extract_epi8(v0, 10)]++; 387 | c[2][_mm_extract_epi8(u0, 11)]++; 388 | c[3][_mm_extract_epi8(v0, 11)]++; 389 | c[0][_mm_extract_epi8(u0, 12)]++; 390 | c[1][_mm_extract_epi8(v0, 12)]++; 391 | c[2][_mm_extract_epi8(u0, 13)]++; 392 | c[3][_mm_extract_epi8(v0, 13)]++; 393 | c[0][_mm_extract_epi8(u0, 14)]++; 394 | c[1][_mm_extract_epi8(v0, 14)]++; 395 | c[2][_mm_extract_epi8(u0, 15)]++; 396 | c[3][_mm_extract_epi8(v0, 15)]++; 397 | 398 | u0 = _mm_loadu_si128((__m128i*)(ip+32+32)); v0 = _mm_loadu_si128((__m128i*)(ip+32+48)); 399 | c[0][_mm_extract_epi8(u1, 0)]++; 400 | c[1][_mm_extract_epi8(v1, 0)]++; 401 | c[2][_mm_extract_epi8(u1, 1)]++; 402 | c[3][_mm_extract_epi8(v1, 1)]++; 403 | c[0][_mm_extract_epi8(u1, 2)]++; 404 | c[1][_mm_extract_epi8(v1, 2)]++; 405 | c[2][_mm_extract_epi8(u1, 3)]++; 406 | c[3][_mm_extract_epi8(v1, 3)]++; 407 | c[0][_mm_extract_epi8(u1, 4)]++; 408 | c[1][_mm_extract_epi8(v1, 4)]++; 409 | c[2][_mm_extract_epi8(u1, 5)]++; 410 | c[3][_mm_extract_epi8(v1, 5)]++; 411 | c[0][_mm_extract_epi8(u1, 6)]++; 412 | c[1][_mm_extract_epi8(v1, 6)]++; 413 | c[2][_mm_extract_epi8(u1, 7)]++; 414 | c[3][_mm_extract_epi8(v1, 7)]++; 415 | c[0][_mm_extract_epi8(u1, 8)]++; 416 | c[1][_mm_extract_epi8(v1, 8)]++; 417 | c[2][_mm_extract_epi8(u1, 9)]++; 418 |
c[3][_mm_extract_epi8(v1, 9)]++; 419 | c[0][_mm_extract_epi8(u1, 10)]++; 420 | c[1][_mm_extract_epi8(v1, 10)]++; 421 | c[2][_mm_extract_epi8(u1, 11)]++; 422 | c[3][_mm_extract_epi8(v1, 11)]++; 423 | c[0][_mm_extract_epi8(u1, 12)]++; 424 | c[1][_mm_extract_epi8(v1, 12)]++; 425 | c[2][_mm_extract_epi8(u1, 13)]++; 426 | c[3][_mm_extract_epi8(v1, 13)]++; 427 | c[0][_mm_extract_epi8(u1, 14)]++; 428 | c[1][_mm_extract_epi8(v1, 14)]++; 429 | c[2][_mm_extract_epi8(u1, 15)]++; 430 | c[3][_mm_extract_epi8(v1, 15)]++; PREFETCH(ip+512, 0); 431 | } 432 | } 433 | while(ip < in+inlen) c[0][*ip++]++; 434 | HISTEND4(c, cnt); 435 | } 436 | 437 | unsigned hist_8_128(unsigned char *__restrict in, unsigned inlen, unsigned *__restrict cnt) { 438 | cnt_t c[8][CSIZE]={0},i; 439 | 440 | unsigned char *ip = in; 441 | if(inlen >= 32+64) { 442 | __m128i u0 = _mm_loadu_si128((__m128i*)ip), v0 = _mm_loadu_si128((__m128i*)(ip+16)); 443 | for(; ip <= in+inlen-(32+64); ip += 64) { 444 | __m128i u1 = _mm_loadu_si128((__m128i*)(ip+32)), v1 = _mm_loadu_si128((__m128i*)(ip+32+16)); 445 | c[0][_mm_extract_epi8(u0, 0)]++; 446 | c[1][_mm_extract_epi8(v0, 0)]++; 447 | c[2][_mm_extract_epi8(u0, 1)]++; 448 | c[3][_mm_extract_epi8(v0, 1)]++; 449 | c[4][_mm_extract_epi8(u0, 2)]++; 450 | c[5][_mm_extract_epi8(v0, 2)]++; 451 | c[6][_mm_extract_epi8(u0, 3)]++; 452 | c[7][_mm_extract_epi8(v0, 3)]++; 453 | c[0][_mm_extract_epi8(u0, 4)]++; 454 | c[1][_mm_extract_epi8(v0, 4)]++; 455 | c[2][_mm_extract_epi8(u0, 5)]++; 456 | c[3][_mm_extract_epi8(v0, 5)]++; 457 | c[4][_mm_extract_epi8(u0, 6)]++; 458 | c[5][_mm_extract_epi8(v0, 6)]++; 459 | c[6][_mm_extract_epi8(u0, 7)]++; 460 | c[7][_mm_extract_epi8(v0, 7)]++; 461 | c[0][_mm_extract_epi8(u0, 8)]++; 462 | c[1][_mm_extract_epi8(v0, 8)]++; 463 | c[2][_mm_extract_epi8(u0, 9)]++; 464 | c[3][_mm_extract_epi8(v0, 9)]++; 465 | c[4][_mm_extract_epi8(u0, 10)]++; 466 | c[5][_mm_extract_epi8(v0, 10)]++; 467 | c[6][_mm_extract_epi8(u0, 11)]++; 468 | c[7][_mm_extract_epi8(v0, 11)]++; 469 | c[0][_mm_extract_epi8(u0, 12)]++; 470 | c[1][_mm_extract_epi8(v0, 12)]++; 471 | c[2][_mm_extract_epi8(u0, 13)]++; 472 | c[3][_mm_extract_epi8(v0, 13)]++; 473 | c[4][_mm_extract_epi8(u0, 14)]++; 474 | c[5][_mm_extract_epi8(v0, 14)]++; 475 | c[6][_mm_extract_epi8(u0, 15)]++; 476 | c[7][_mm_extract_epi8(v0, 15)]++; 477 | 478 | u0 = _mm_loadu_si128((__m128i*)(ip+32+32)); v0 = _mm_loadu_si128((__m128i*)(ip+32+48)); 479 | c[0][_mm_extract_epi8(u1, 0)]++; 480 | c[1][_mm_extract_epi8(v1, 0)]++; 481 | c[2][_mm_extract_epi8(u1, 1)]++; 482 | c[3][_mm_extract_epi8(v1, 1)]++; 483 | c[4][_mm_extract_epi8(u1, 2)]++; 484 | c[5][_mm_extract_epi8(v1, 2)]++; 485 | c[6][_mm_extract_epi8(u1, 3)]++; 486 | c[7][_mm_extract_epi8(v1, 3)]++; 487 | c[0][_mm_extract_epi8(u1, 4)]++; 488 | c[1][_mm_extract_epi8(v1, 4)]++; 489 | c[2][_mm_extract_epi8(u1, 5)]++; 490 | c[3][_mm_extract_epi8(v1, 5)]++; 491 | c[4][_mm_extract_epi8(u1, 6)]++; 492 | c[5][_mm_extract_epi8(v1, 6)]++; 493 | c[6][_mm_extract_epi8(u1, 7)]++; 494 | c[7][_mm_extract_epi8(v1, 7)]++; 495 | c[0][_mm_extract_epi8(u1, 8)]++; 496 | c[1][_mm_extract_epi8(v1, 8)]++; 497 | c[2][_mm_extract_epi8(u1, 9)]++; 498 | c[3][_mm_extract_epi8(v1, 9)]++; 499 | c[4][_mm_extract_epi8(u1, 10)]++; 500 | c[5][_mm_extract_epi8(v1, 10)]++; 501 | c[6][_mm_extract_epi8(u1, 11)]++; 502 | c[7][_mm_extract_epi8(v1, 11)]++; 503 | c[0][_mm_extract_epi8(u1, 12)]++; 504 | c[1][_mm_extract_epi8(v1, 12)]++; 505 | c[2][_mm_extract_epi8(u1, 13)]++; 506 | c[3][_mm_extract_epi8(v1, 13)]++; 507 | c[4][_mm_extract_epi8(u1, 
14)]++; 508 | c[5][_mm_extract_epi8(v1, 14)]++; 509 | c[6][_mm_extract_epi8(u1, 15)]++; 510 | c[7][_mm_extract_epi8(v1, 15)]++; PREFETCH(ip+512, 0); 511 | } 512 | } 513 | while(ip < in+inlen) c[0][*ip++]++; 514 | HISTEND8(c, cnt); 515 | } 516 | #endif 517 | 518 | #ifdef __AVX2__ //---------------------------------- avx2 ----------------------------------------------- 519 | #include <immintrin.h> 520 | 521 | #define UZ 64 522 | #define N256 128 523 | 524 | static void hist_4_256(unsigned char *__restrict in, unsigned inlen, unsigned *__restrict cnt) { 525 | cnt_t c[4][CSIZE]={0},i; 526 | 527 | unsigned char *ip = in; 528 | if(inlen >= UZ+N256) { 529 | __m256i u0 = _mm256_loadu_si256((__m256i*)ip), v0 = _mm256_loadu_si256((__m256i*)(ip+32)); 530 | for(; ip <= in+((inlen-(UZ+N256))&~(N256-1)); ip += N256) { 531 | __m256i u1 = _mm256_loadu_si256((__m256i*)(ip+UZ)), v1 = _mm256_loadu_si256((__m256i*)(ip+UZ+32)); 532 | c[0][_mm256_extract_epi8(u0, 0)]++; 533 | c[1][_mm256_extract_epi8(v0, 0)]++; 534 | c[2][_mm256_extract_epi8(u0, 1)]++; 535 | c[3][_mm256_extract_epi8(v0, 1)]++; 536 | c[0][_mm256_extract_epi8(u0, 2)]++; 537 | c[1][_mm256_extract_epi8(v0, 2)]++; 538 | c[2][_mm256_extract_epi8(u0, 3)]++; 539 | c[3][_mm256_extract_epi8(v0, 3)]++; 540 | c[0][_mm256_extract_epi8(u0, 4)]++; 541 | c[1][_mm256_extract_epi8(v0, 4)]++; 542 | c[2][_mm256_extract_epi8(u0, 5)]++; 543 | c[3][_mm256_extract_epi8(v0, 5)]++; 544 | c[0][_mm256_extract_epi8(u0, 6)]++; 545 | c[1][_mm256_extract_epi8(v0, 6)]++; 546 | c[2][_mm256_extract_epi8(u0, 7)]++; 547 | c[3][_mm256_extract_epi8(v0, 7)]++; 548 | c[0][_mm256_extract_epi8(u0, 8)]++; 549 | c[1][_mm256_extract_epi8(v0, 8)]++; 550 | c[2][_mm256_extract_epi8(u0, 9)]++; 551 | c[3][_mm256_extract_epi8(v0, 9)]++; 552 | c[0][_mm256_extract_epi8(u0, 10)]++; 553 | c[1][_mm256_extract_epi8(v0, 10)]++; 554 | c[2][_mm256_extract_epi8(u0, 11)]++; 555 | c[3][_mm256_extract_epi8(v0, 11)]++; 556 | c[0][_mm256_extract_epi8(u0, 12)]++; 557 | c[1][_mm256_extract_epi8(v0, 12)]++; 558 | c[2][_mm256_extract_epi8(u0, 13)]++; 559 | c[3][_mm256_extract_epi8(v0, 13)]++; 560 | c[0][_mm256_extract_epi8(u0, 14)]++; 561 | c[1][_mm256_extract_epi8(v0, 14)]++; 562 | c[2][_mm256_extract_epi8(u0, 15)]++; 563 | c[3][_mm256_extract_epi8(v0, 15)]++; 564 | c[0][_mm256_extract_epi8(u0, 16)]++; 565 | c[1][_mm256_extract_epi8(v0, 16)]++; 566 | c[2][_mm256_extract_epi8(u0, 17)]++; 567 | c[3][_mm256_extract_epi8(v0, 17)]++; 568 | c[0][_mm256_extract_epi8(u0, 18)]++; 569 | c[1][_mm256_extract_epi8(v0, 18)]++; 570 | c[2][_mm256_extract_epi8(u0, 19)]++; 571 | c[3][_mm256_extract_epi8(v0, 19)]++; 572 | c[0][_mm256_extract_epi8(u0, 20)]++; 573 | c[1][_mm256_extract_epi8(v0, 20)]++; 574 | c[2][_mm256_extract_epi8(u0, 21)]++; 575 | c[3][_mm256_extract_epi8(v0, 21)]++; 576 | c[0][_mm256_extract_epi8(u0, 22)]++; 577 | c[1][_mm256_extract_epi8(v0, 22)]++; 578 | c[2][_mm256_extract_epi8(u0, 23)]++; 579 | c[3][_mm256_extract_epi8(v0, 23)]++; 580 | c[0][_mm256_extract_epi8(u0, 24)]++; 581 | c[1][_mm256_extract_epi8(v0, 24)]++; 582 | c[2][_mm256_extract_epi8(u0, 25)]++; 583 | c[3][_mm256_extract_epi8(v0, 25)]++; 584 | c[0][_mm256_extract_epi8(u0, 26)]++; 585 | c[1][_mm256_extract_epi8(v0, 26)]++; 586 | c[2][_mm256_extract_epi8(u0, 27)]++; 587 | c[3][_mm256_extract_epi8(v0, 27)]++; 588 | c[0][_mm256_extract_epi8(u0, 28)]++; 589 | c[1][_mm256_extract_epi8(v0, 28)]++; 590 | c[2][_mm256_extract_epi8(u0, 29)]++; 591 | c[3][_mm256_extract_epi8(v0, 29)]++; 592 | c[0][_mm256_extract_epi8(u0, 30)]++; 593 | c[1][_mm256_extract_epi8(v0, 30)]++;
594 | c[2][_mm256_extract_epi8(u0, 31)]++; 595 | c[3][_mm256_extract_epi8(v0, 31)]++; 596 | 597 | u0 = _mm256_loadu_si256((__m256i*)(ip+UZ+64)); v0 = _mm256_loadu_si256((__m256i*)(ip+UZ+96)); 598 | c[0][_mm256_extract_epi8(u1, 0)]++; 599 | c[1][_mm256_extract_epi8(v1, 0)]++; 600 | c[2][_mm256_extract_epi8(u1, 1)]++; 601 | c[3][_mm256_extract_epi8(v1, 1)]++; 602 | c[0][_mm256_extract_epi8(u1, 2)]++; 603 | c[1][_mm256_extract_epi8(v1, 2)]++; 604 | c[2][_mm256_extract_epi8(u1, 3)]++; 605 | c[3][_mm256_extract_epi8(v1, 3)]++; 606 | c[0][_mm256_extract_epi8(u1, 4)]++; 607 | c[1][_mm256_extract_epi8(v1, 4)]++; 608 | c[2][_mm256_extract_epi8(u1, 5)]++; 609 | c[3][_mm256_extract_epi8(v1, 5)]++; 610 | c[0][_mm256_extract_epi8(u1, 6)]++; 611 | c[1][_mm256_extract_epi8(v1, 6)]++; 612 | c[2][_mm256_extract_epi8(u1, 7)]++; 613 | c[3][_mm256_extract_epi8(v1, 7)]++; 614 | c[0][_mm256_extract_epi8(u1, 8)]++; 615 | c[1][_mm256_extract_epi8(v1, 8)]++; 616 | c[2][_mm256_extract_epi8(u1, 9)]++; 617 | c[3][_mm256_extract_epi8(v1, 9)]++; 618 | c[0][_mm256_extract_epi8(u1, 10)]++; 619 | c[1][_mm256_extract_epi8(v1, 10)]++; 620 | c[2][_mm256_extract_epi8(u1, 11)]++; 621 | c[3][_mm256_extract_epi8(v1, 11)]++; 622 | c[0][_mm256_extract_epi8(u1, 12)]++; 623 | c[1][_mm256_extract_epi8(v1, 12)]++; 624 | c[2][_mm256_extract_epi8(u1, 13)]++; 625 | c[3][_mm256_extract_epi8(v1, 13)]++; 626 | c[0][_mm256_extract_epi8(u1, 14)]++; 627 | c[1][_mm256_extract_epi8(v1, 14)]++; 628 | c[2][_mm256_extract_epi8(u1, 15)]++; 629 | c[3][_mm256_extract_epi8(v1, 15)]++; 630 | c[0][_mm256_extract_epi8(u1, 16)]++; 631 | c[1][_mm256_extract_epi8(v1, 16)]++; 632 | c[2][_mm256_extract_epi8(u1, 17)]++; 633 | c[3][_mm256_extract_epi8(v1, 17)]++; 634 | c[0][_mm256_extract_epi8(u1, 18)]++; 635 | c[1][_mm256_extract_epi8(v1, 18)]++; 636 | c[2][_mm256_extract_epi8(u1, 19)]++; 637 | c[3][_mm256_extract_epi8(v1, 19)]++; 638 | c[0][_mm256_extract_epi8(u1, 20)]++; 639 | c[1][_mm256_extract_epi8(v1, 20)]++; 640 | c[2][_mm256_extract_epi8(u1, 21)]++; 641 | c[3][_mm256_extract_epi8(v1, 21)]++; 642 | c[0][_mm256_extract_epi8(u1, 22)]++; 643 | c[1][_mm256_extract_epi8(v1, 22)]++; 644 | c[2][_mm256_extract_epi8(u1, 23)]++; 645 | c[3][_mm256_extract_epi8(v1, 23)]++; 646 | c[0][_mm256_extract_epi8(u1, 24)]++; 647 | c[1][_mm256_extract_epi8(v1, 24)]++; 648 | c[2][_mm256_extract_epi8(u1, 25)]++; 649 | c[3][_mm256_extract_epi8(v1, 25)]++; 650 | c[0][_mm256_extract_epi8(u1, 26)]++; 651 | c[1][_mm256_extract_epi8(v1, 26)]++; 652 | c[2][_mm256_extract_epi8(u1, 27)]++; 653 | c[3][_mm256_extract_epi8(v1, 27)]++; 654 | c[0][_mm256_extract_epi8(u1, 28)]++; 655 | c[1][_mm256_extract_epi8(v1, 28)]++; 656 | c[2][_mm256_extract_epi8(u1, 29)]++; 657 | c[3][_mm256_extract_epi8(v1, 29)]++; 658 | c[0][_mm256_extract_epi8(u1, 30)]++; 659 | c[1][_mm256_extract_epi8(v1, 30)]++; 660 | c[2][_mm256_extract_epi8(u1, 31)]++; 661 | c[3][_mm256_extract_epi8(v1, 31)]++; PREFETCH(ip+512, 0); 662 | } 663 | } 664 | while(ip < in+inlen) c[0][*ip++]++; 665 | HISTEND4(c, cnt); 666 | } 667 | 668 | #define UZ 64 669 | #define N256 128 670 | static void hist_8_256(unsigned char *__restrict in, unsigned inlen, unsigned *__restrict cnt) { 671 | cnt_t c[8][CSIZE]={0},i; 672 | 673 | unsigned char *ip = in; 674 | if(inlen >= UZ+N256) { 675 | __m256i u0 = _mm256_loadu_si256((__m256i*)ip), v0 = _mm256_loadu_si256((__m256i*)(ip+32)); 676 | for(; ip <= in+((inlen-(UZ+N256))&~(N256-1)); ip += N256) { 677 | __m256i u1 = _mm256_loadu_si256((__m256i*)(ip+UZ+0)), v1 = _mm256_loadu_si256((__m256i*)(ip+UZ+32)); 
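// The 64 bytes already in u0/v0 are now counted one lane at a time, spread round-robin over the 8
// count tables so consecutive increments rarely touch the same counter, while the u1/v1 load above
// brings in the next 64 bytes early to hide its latency.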
678 | c[0][_mm256_extract_epi8(u0, 0)]++; 679 | c[1][_mm256_extract_epi8(v0, 0)]++; 680 | c[2][_mm256_extract_epi8(u0, 1)]++; 681 | c[3][_mm256_extract_epi8(v0, 1)]++; 682 | c[4][_mm256_extract_epi8(u0, 2)]++; 683 | c[5][_mm256_extract_epi8(v0, 2)]++; 684 | c[6][_mm256_extract_epi8(u0, 3)]++; 685 | c[7][_mm256_extract_epi8(v0, 3)]++; 686 | c[0][_mm256_extract_epi8(u0, 4)]++; 687 | c[1][_mm256_extract_epi8(v0, 4)]++; 688 | c[2][_mm256_extract_epi8(u0, 5)]++; 689 | c[3][_mm256_extract_epi8(v0, 5)]++; 690 | c[4][_mm256_extract_epi8(u0, 6)]++; 691 | c[5][_mm256_extract_epi8(v0, 6)]++; 692 | c[6][_mm256_extract_epi8(u0, 7)]++; 693 | c[7][_mm256_extract_epi8(v0, 7)]++; 694 | c[0][_mm256_extract_epi8(u0, 8)]++; 695 | c[1][_mm256_extract_epi8(v0, 8)]++; 696 | c[2][_mm256_extract_epi8(u0, 9)]++; 697 | c[3][_mm256_extract_epi8(v0, 9)]++; 698 | c[4][_mm256_extract_epi8(u0, 10)]++; 699 | c[5][_mm256_extract_epi8(v0, 10)]++; 700 | c[6][_mm256_extract_epi8(u0, 11)]++; 701 | c[7][_mm256_extract_epi8(v0, 11)]++; 702 | c[0][_mm256_extract_epi8(u0, 12)]++; 703 | c[1][_mm256_extract_epi8(v0, 12)]++; 704 | c[2][_mm256_extract_epi8(u0, 13)]++; 705 | c[3][_mm256_extract_epi8(v0, 13)]++; 706 | c[4][_mm256_extract_epi8(u0, 14)]++; 707 | c[5][_mm256_extract_epi8(v0, 14)]++; 708 | c[6][_mm256_extract_epi8(u0, 15)]++; 709 | c[7][_mm256_extract_epi8(v0, 15)]++; 710 | c[0][_mm256_extract_epi8(u0, 16)]++; 711 | c[1][_mm256_extract_epi8(v0, 16)]++; 712 | c[2][_mm256_extract_epi8(u0, 17)]++; 713 | c[3][_mm256_extract_epi8(v0, 17)]++; 714 | c[4][_mm256_extract_epi8(u0, 18)]++; 715 | c[5][_mm256_extract_epi8(v0, 18)]++; 716 | c[6][_mm256_extract_epi8(u0, 19)]++; 717 | c[7][_mm256_extract_epi8(v0, 19)]++; 718 | c[0][_mm256_extract_epi8(u0, 20)]++; 719 | c[1][_mm256_extract_epi8(v0, 20)]++; 720 | c[2][_mm256_extract_epi8(u0, 21)]++; 721 | c[3][_mm256_extract_epi8(v0, 21)]++; 722 | c[4][_mm256_extract_epi8(u0, 22)]++; 723 | c[5][_mm256_extract_epi8(v0, 22)]++; 724 | c[6][_mm256_extract_epi8(u0, 23)]++; 725 | c[7][_mm256_extract_epi8(v0, 23)]++; 726 | c[0][_mm256_extract_epi8(u0, 24)]++; 727 | c[1][_mm256_extract_epi8(v0, 24)]++; 728 | c[2][_mm256_extract_epi8(u0, 25)]++; 729 | c[3][_mm256_extract_epi8(v0, 25)]++; 730 | c[4][_mm256_extract_epi8(u0, 26)]++; 731 | c[5][_mm256_extract_epi8(v0, 26)]++; 732 | c[6][_mm256_extract_epi8(u0, 27)]++; 733 | c[7][_mm256_extract_epi8(v0, 27)]++; 734 | c[0][_mm256_extract_epi8(u0, 28)]++; 735 | c[1][_mm256_extract_epi8(v0, 28)]++; 736 | c[2][_mm256_extract_epi8(u0, 29)]++; 737 | c[3][_mm256_extract_epi8(v0, 29)]++; 738 | c[4][_mm256_extract_epi8(u0, 30)]++; 739 | c[5][_mm256_extract_epi8(v0, 30)]++; 740 | c[6][_mm256_extract_epi8(u0, 31)]++; 741 | c[7][_mm256_extract_epi8(v0, 31)]++; 742 | 743 | u0 = _mm256_loadu_si256((__m256i*)(ip+UZ+64)); v0 = _mm256_loadu_si256((__m256i*)(ip+UZ+96)); 744 | c[0][_mm256_extract_epi8(u1, 0)]++; 745 | c[1][_mm256_extract_epi8(v1, 0)]++; 746 | c[2][_mm256_extract_epi8(u1, 1)]++; 747 | c[3][_mm256_extract_epi8(v1, 1)]++; 748 | c[4][_mm256_extract_epi8(u1, 2)]++; 749 | c[5][_mm256_extract_epi8(v1, 2)]++; 750 | c[6][_mm256_extract_epi8(u1, 3)]++; 751 | c[7][_mm256_extract_epi8(v1, 3)]++; 752 | c[0][_mm256_extract_epi8(u1, 4)]++; 753 | c[1][_mm256_extract_epi8(v1, 4)]++; 754 | c[2][_mm256_extract_epi8(u1, 5)]++; 755 | c[3][_mm256_extract_epi8(v1, 5)]++; 756 | c[4][_mm256_extract_epi8(u1, 6)]++; 757 | c[5][_mm256_extract_epi8(v1, 6)]++; 758 | c[6][_mm256_extract_epi8(u1, 7)]++; 759 | c[7][_mm256_extract_epi8(v1, 7)]++; 760 | c[0][_mm256_extract_epi8(u1, 8)]++; 
761 | c[1][_mm256_extract_epi8(v1, 8)]++; 762 | c[2][_mm256_extract_epi8(u1, 9)]++; 763 | c[3][_mm256_extract_epi8(v1, 9)]++; 764 | c[4][_mm256_extract_epi8(u1, 10)]++; 765 | c[5][_mm256_extract_epi8(v1, 10)]++; 766 | c[6][_mm256_extract_epi8(u1, 11)]++; 767 | c[7][_mm256_extract_epi8(v1, 11)]++; 768 | c[0][_mm256_extract_epi8(u1, 12)]++; 769 | c[1][_mm256_extract_epi8(v1, 12)]++; 770 | c[2][_mm256_extract_epi8(u1, 13)]++; 771 | c[3][_mm256_extract_epi8(v1, 13)]++; 772 | c[4][_mm256_extract_epi8(u1, 14)]++; 773 | c[5][_mm256_extract_epi8(v1, 14)]++; 774 | c[6][_mm256_extract_epi8(u1, 15)]++; 775 | c[7][_mm256_extract_epi8(v1, 15)]++; 776 | c[0][_mm256_extract_epi8(u1, 16)]++; 777 | c[1][_mm256_extract_epi8(v1, 16)]++; 778 | c[2][_mm256_extract_epi8(u1, 17)]++; 779 | c[3][_mm256_extract_epi8(v1, 17)]++; 780 | c[4][_mm256_extract_epi8(u1, 18)]++; 781 | c[5][_mm256_extract_epi8(v1, 18)]++; 782 | c[6][_mm256_extract_epi8(u1, 19)]++; 783 | c[7][_mm256_extract_epi8(v1, 19)]++; 784 | c[0][_mm256_extract_epi8(u1, 20)]++; 785 | c[1][_mm256_extract_epi8(v1, 20)]++; 786 | c[2][_mm256_extract_epi8(u1, 21)]++; 787 | c[3][_mm256_extract_epi8(v1, 21)]++; 788 | c[4][_mm256_extract_epi8(u1, 22)]++; 789 | c[5][_mm256_extract_epi8(v1, 22)]++; 790 | c[6][_mm256_extract_epi8(u1, 23)]++; 791 | c[7][_mm256_extract_epi8(v1, 23)]++; 792 | c[0][_mm256_extract_epi8(u1, 24)]++; 793 | c[1][_mm256_extract_epi8(v1, 24)]++; 794 | c[2][_mm256_extract_epi8(u1, 25)]++; 795 | c[3][_mm256_extract_epi8(v1, 25)]++; 796 | c[4][_mm256_extract_epi8(u1, 26)]++; 797 | c[5][_mm256_extract_epi8(v1, 26)]++; 798 | c[6][_mm256_extract_epi8(u1, 27)]++; 799 | c[7][_mm256_extract_epi8(v1, 27)]++; 800 | c[0][_mm256_extract_epi8(u1, 28)]++; 801 | c[1][_mm256_extract_epi8(v1, 28)]++; 802 | c[2][_mm256_extract_epi8(u1, 29)]++; 803 | c[3][_mm256_extract_epi8(v1, 29)]++; 804 | c[4][_mm256_extract_epi8(u1, 30)]++; 805 | c[5][_mm256_extract_epi8(v1, 30)]++; 806 | c[6][_mm256_extract_epi8(u1, 31)]++; 807 | c[7][_mm256_extract_epi8(v1, 31)]++; PREFETCH(ip+512, 0); 808 | } 809 | } 810 | while(ip < in+inlen) c[0][*ip++]++; 811 | HISTEND8(c, cnt); 812 | } 813 | #endif 814 | 815 | //------------------------------------------------------------------------- 816 | #ifdef _COUNTBENCH 817 | // "count2x64", fastest function in https://github.com/nkurz/countbench 818 | #define CSIZE (256+8) 819 | 820 | #define ASM_SHIFT_RIGHT(reg, bitsToShift) \ 821 | __asm volatile ("shr %1, %0": \ 822 | "+r" (reg): /* read and written */ \ 823 | "i" (bitsToShift) /* constant */ \ 824 | ) 825 | 826 | 827 | #define ASM_INC_TABLES(src0, src1, byte0, byte1, offset, size, base, scale) \ 828 | __asm volatile ("movzbl %b2, %k0\n" /* byte0 = src0 & 0xFF */ \ 829 | "movzbl %b3, %k1\n" /* byte1 = src1 & 0xFF */ \ 830 | "incl (%c4+0)*%c5(%6, %0, %c7)\n" /* count[i+0][byte0]++ */ \ 831 | "incl (%c4+1)*%c5(%6, %1, %c7)\n" /* count[i+1][byte1]++ */ \ 832 | "movzbl %h2, %k0\n" /* byte0 = (src0 & 0xFF00) >> 8 */ \ 833 | "movzbl %h3, %k1\n" /* byte1 = (src1 & 0xFF00) >> 8 */ \ 834 | "incl (%c4+2)*%c5(%6, %0, %c7)\n" /* count[i+2][byte0]++ */ \ 835 | "incl (%c4+3)*%c5(%6, %1, %c7)\n": /* count[i+3][byte1]++ */ \ 836 | "=&R" (byte0), /* write only (R == non REX) */ \ 837 | "=&R" (byte1): /* write only (R == non REX) */ \ 838 | "Q" (src0), /* read only (Q == must have rH) */ \ 839 | "Q" (src1), /* read only (Q == must have rH) */ \ 840 | "i" (offset), /* constant array offset */ \ 841 | "i" (size), /* constant array size */ \ 842 | "r" (base), /* read only array address */ \ 843 | "i" (scale): 
/* constant [1,2,4,8] */ \ 844 | "memory" /* clobbered (forces compiler to compute sum ) */ \ 845 | ) 846 | 847 | unsigned count2x64(unsigned char *src, unsigned srcSize, unsigned *__restrict cnt) 848 | { 849 | unsigned long long remainder = srcSize; 850 | if (srcSize < 32) goto handle_remainder; 851 | 852 | unsigned c[16][CSIZE]; 853 | memset(c, 0, sizeof(c)); 854 | 855 | remainder = srcSize % 16; 856 | srcSize -= remainder; 857 | const unsigned char *endSrc = src + srcSize; 858 | unsigned long long next0 = *(unsigned long long *)(src + 0); 859 | unsigned long long next1 = *(unsigned long long *)(src + 8); 860 | 861 | //IACA_START; 862 | 863 | while (src != endSrc) 864 | { 865 | unsigned long long byte0, byte1; 866 | unsigned long long data0 = next0; 867 | unsigned long long data1 = next1; 868 | 869 | src += 16; 870 | next0 = *(unsigned long long *)(src + 0); 871 | next1 = *(unsigned long long *)(src + 8); 872 | 873 | ASM_INC_TABLES(data0, data1, byte0, byte1, 0, CSIZE * 4, c, 4); 874 | 875 | ASM_SHIFT_RIGHT(data0, 16); 876 | ASM_SHIFT_RIGHT(data1, 16); 877 | ASM_INC_TABLES(data0, data1, byte0, byte1, 4, CSIZE * 4, c, 4); 878 | 879 | ASM_SHIFT_RIGHT(data0, 16); 880 | ASM_SHIFT_RIGHT(data1, 16); 881 | ASM_INC_TABLES(data0, data1, byte0, byte1, 8, CSIZE * 4, c, 4); 882 | 883 | ASM_SHIFT_RIGHT(data0, 16); 884 | ASM_SHIFT_RIGHT(data1, 16); 885 | ASM_INC_TABLES(data0, data1, byte0, byte1, 12, CSIZE * 4, c, 4); 886 | } 887 | 888 | //IACA_END; 889 | 890 | handle_remainder: 891 | for (size_t i = 0; i < remainder; i++) { 892 | unsigned long long byte = src[i]; 893 | c[0][byte]++; 894 | } 895 | memset(cnt, 0, 256*sizeof(cnt[0])); 896 | for(int i = 0; i < 256; i++) 897 | for (int idx=0; idx < 16; idx++) 898 | cnt[i] += c[idx][i]; 899 | } 900 | 901 | // Modified version of count2x64 by powturbo, using C instead of assembler 902 | #define C_SHIFT_RIGHT(reg, bitsToShift) reg >>= bitsToShift 903 | #define C_INC_TABLES(src0, src1, byte0, byte1, offset, size, c, scale) \ 904 | { \ 905 | byte0 = (unsigned char)src0;\ 906 | byte1 = (unsigned char)src1;\ 907 | c[offset+0][byte0]++;\ 908 | c[offset+1][byte1]++;\ 909 | byte0 = (unsigned char)(src0 >> 8);\ 910 | byte1 = (unsigned char)(src1 >> 8);\ 911 | c[offset+2][byte0]++; \ 912 | c[offset+3][byte1]++; \ 913 | } 914 | 915 | static void count2x64c(unsigned char *__restrict src, unsigned srcSize, unsigned *__restrict cnt) 916 | { 917 | unsigned long long remainder = srcSize; 918 | if (srcSize < 32) goto handle_remainder; 919 | 920 | unsigned c[16][CSIZE]; 921 | memset(c, 0, sizeof(c)); 922 | 923 | remainder = srcSize % 16; 924 | srcSize -= remainder; 925 | const unsigned char *endSrc = src + srcSize; 926 | unsigned long long next0 = *(unsigned long long *)(src + 0); 927 | unsigned long long next1 = *(unsigned long long *)(src + 8); 928 | 929 | //IACA_START; 930 | 931 | while (src != endSrc) 932 | { 933 | unsigned long long byte0, byte1; 934 | unsigned long long data0 = next0; 935 | unsigned long long data1 = next1; 936 | 937 | src += 16; 938 | next0 = *(unsigned long long *)(src + 0); 939 | next1 = *(unsigned long long *)(src + 8); 940 | 941 | C_INC_TABLES(data0, data1, byte0, byte1, 0, CSIZE * 4, c, 4); 942 | 943 | C_SHIFT_RIGHT(data0, 16); 944 | C_SHIFT_RIGHT(data1, 16); 945 | C_INC_TABLES(data0, data1, byte0, byte1, 4, CSIZE * 4, c, 4); 946 | 947 | C_SHIFT_RIGHT(data0, 16); 948 | C_SHIFT_RIGHT(data1, 16); 949 | C_INC_TABLES(data0, data1, byte0, byte1, 8, CSIZE * 4, c, 4); 950 | 951 | C_SHIFT_RIGHT(data0, 16); 952 | C_SHIFT_RIGHT(data1, 16); 953 | 
C_INC_TABLES(data0, data1, byte0, byte1, 12, CSIZE * 4, c, 4); 954 | } 955 | 956 | //IACA_END; 957 | 958 | handle_remainder: 959 | for (size_t i = 0; i < remainder; i++) { 960 | unsigned long long byte = src[i]; 961 | c[0][byte]++; 962 | } 963 | memset(cnt, 0, 256*sizeof(cnt[0])); 964 | for(int i = 0; i < 256; i++) 965 | for(int idx=0; idx < 16; idx++) 966 | cnt[i] += c[idx][i]; 967 | } 968 | #endif 969 | --------------------------------------------------------------------------------
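All of the histogram variants above follow the same pattern: split the byte counts over 4, 8 or 16
tables so that consecutive increments rarely touch the same counter (long runs of equal bytes would
otherwise serialize on store-to-load forwarding), then fold the tables once at the end with
HISTEND/HISTEND4/HISTEND8. A minimal, self-contained sketch of that technique, using a hypothetical
helper name rather than one of the repository's functions:

    #include <stdint.h>
    #include <string.h>

    /* 4-table byte histogram: the core idea behind hist_4_8 + HISTEND4 */
    static void hist4_sketch(const uint8_t *in, size_t n, uint32_t cnt[256]) {
        uint32_t c[4][256];
        memset(c, 0, sizeof(c));
        size_t i = 0;
        for (; i + 4 <= n; i += 4) {            /* 4 independent counters per iteration */
            c[0][in[i+0]]++; c[1][in[i+1]]++;
            c[2][in[i+2]]++; c[3][in[i+3]]++;
        }
        for (; i < n; i++) c[0][in[i]]++;       /* tail bytes */
        for (int k = 0; k < 256; k++)           /* fold the four tables */
            cnt[k] = c[0][k] + c[1][k] + c[2][k] + c[3][k];
    }

With gcc -O3 the folding loop auto-vectorizes, which is why the default HISTEND in turbohist_.c is
left as a plain scalar double loop.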