├── .travis.yml
├── LICENSE
├── makefile
├── README.md
├── turbohist.c
├── time_.h
├── conf.h
├── sse_neon.h
└── turbohist_.c
/.travis.yml: -------------------------------------------------------------------------------- 1 | language: c 2 | 3 | compiler: 4 | - gcc 5 | - clang 6 | 7 | branches: 8 | only: 9 | - master 10 | 11 | script: 12 | - make 13 | - ./turbohist 14 | 15 | matrix: 16 | include: 17 | - name: Linux arm 18 | os: linux 19 | arch: arm64 20 | compiler: gcc 21 | 22 | - name: Windows-MinGW 23 | os: windows 24 | script: 25 | - mingw32-make 26 | - ./turbohist 27 | 28 | - name: macOS, xcode 29 | os: osx 30 | 31 | # - name: Linux amd64 32 | # os: linux 33 | # arch: amd64 34 | # - name: Power ppc64le 35 | # os: linux-ppc64le 36 | # compiler: gcc 37 | 38 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2016-2019, Powturbo 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions are 6 | met: 7 | 8 | 1. Redistributions of source code must retain the above copyright 9 | notice, this list of conditions and the following disclaimer. 10 | 11 | 2. Redistributions in binary form must reproduce the above copyright 12 | notice, this list of conditions and the following disclaimer in the 13 | documentation and/or other materials provided with the distribution. 14 | 15 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS 16 | IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 17 | TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A 18 | PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 19 | HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 20 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED 21 | TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 23 | LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 24 | NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
26 | 27 | - homepage : https://sites.google.com/site/powturbo/ 28 | - github : https://github.com/powturbo 29 | - twitter : https://twitter.com/powturbo 30 | - email : powturbo [_AT_] gmail [_DOT_] com 31 | -------------------------------------------------------------------------------- /makefile: -------------------------------------------------------------------------------- 1 | # powturbo (c) Copyright 2013-2022 2 | # Download or clone Turbo-Histogram: 3 | # git clone git://github.com/powturbo/Turbo-Histogram.git 4 | 5 | #uncomment to enable 6 | #https://github.com/nkurz/countbench (inline assembly) 7 | #COUNTBENCH=1 8 | #https://gist.github.com/rygorous/a86a5cf348922cdea357c928e32fc7e0 (delete/comment or rename main) 9 | #RYG=1 10 | # timer: rdtsc cycles/byte or wall time in MB/s 11 | #RDTSC=1 12 | #AVX2=1 13 | 14 | #------------------------------------------------------------------------------------- 15 | CC ?= gcc 16 | CXX ?= g++ 17 | #CC=clang 18 | #CXX=clang++ 19 | ASM ?= nasm 20 | 21 | OPT=-fstrict-aliasing 22 | ifeq (,$(findstring clang, $(CC))) 23 | OPT+=-falign-loops 24 | endif 25 | 26 | #------- OS/ARCH ------------------- 27 | ifneq (,$(filter Windows%,$(OS))) 28 | OS := Windows 29 | # CC=gcc 30 | # CXX=g++ 31 | ARCH=x86_64 32 | LDFLAGS+=-Wl,--stack,8194304 33 | FASM=win64 34 | else 35 | OS := $(shell uname -s) 36 | ARCH := $(shell uname -m) 37 | FASM=elf64 38 | 39 | ifneq (,$(findstring aarch64,$(CC))) 40 | ARCH = aarch64 41 | else ifneq (,$(findstring powerpc64le,$(CC))) 42 | ARCH = ppc64le 43 | endif 44 | endif 45 | 46 | ifeq ($(ARCH),ppc64le) 47 | _SSE=-D__SSSE3__ 48 | MARCH=-mcpu=power9 -mtune=power9 $(_SSE) 49 | else ifeq ($(ARCH),aarch64) 50 | MARCH+=-march=armv8-a 51 | ifneq (,$(findstring clang, $(CC))) 52 | MARCH+=-march=armv8-a 53 | OPT+=-fomit-frame-pointer 54 | else 55 | MARCH+=-march=armv8-a 56 | endif 57 | SSE=-march=armv8-a 58 | else ifeq ($(ARCH),$(filter $(ARCH),x86_64)) 59 | LDFLAG+=-lm 60 | # set minimum arch sandy bridge SSE4.1 + AVX 61 | _SSE=-march=corei7-avx -mtune=corei7-avx 62 | # SSE+=-mno-avx -mno-aes 63 | _AVX2=-march=haswell 64 | # CFLAGS=$(SSE) 65 | # CFLAGS=$(AVX2) 66 | endif 67 | 68 | ifeq ($(AVX2),1) 69 | MARCH=$(_AVX2) 70 | else 71 | MARCH=$(_SSE) 72 | endif 73 | 74 | CFLAGS+=$(MARCH) -w $(OPT) 75 | ifeq ($(STATIC),1) 76 | LDFLAGS+=-static 77 | endif 78 | 79 | ifeq ($(RDTSC),1) 80 | CFLAGS+=-D_RDTSC 81 | endif 82 | 83 | ifeq ($(COUNTBENCH),1) 84 | CFLAGS+=-D_COUNTBENCH 85 | endif 86 | 87 | ifeq ($(RYG),1) 88 | CFLAGS+=-D_RYG 89 | ASMLIB=histo_asm.o 90 | endif 91 | 92 | all: turbohist 93 | 94 | histo_asm.o: histo_asm.nas 95 | $(ASM) -f $(FASM) histo_asm.nas -o histo_asm.o 96 | 97 | turbohist: turbohist.o $(ASMLIB) 98 | $(CC) $^ $(LDFLAGS) -o turbohist 99 | 100 | .c.o: 101 | $(CC) -O3 $(CFLAGS) $< -c -o $@ 102 | 103 | 104 | ifeq ($(OS),Windows) 105 | clean: 106 | del /S *.o 107 | # del /S *.exe 108 | else 109 | clean: 110 | find . -name "turbohist" -type f -delete 111 | find . -name "*.o" -type f -delete 112 | find . -name "*~" -type f -delete 113 | find . 
-name "core" -type f -delete 114 | endif 115 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | TurboHist: Fastest Histogram Construction 2 | ========================================= 3 | 4 | - **~0.18 - 0.90 cycles per byte** 5 | - 100% C (C++ compatible header) without inline assembly 6 | - Both 32 and 64 bits supported 7 | - Portable scalar functions faster than SIMD functions 8 | - **Up to 22 times** faster than naive solution 9 | - :new: (2022.01) more faster, beats even other very fast assembler functions 10 | 11 | # Benchmark: 12 | - Single thread 13 | - Realistic and practical benchmark with large files. 14 | - No PURE cache benchmark 15 | 16 | #### - Uniform/Skewed distribution: 17 | - Uniform: [enwik9](http://mattmahoney.net/dc/text.html) 18 | - Skewed: enwik9 bwt generated w. libdivsufsort 19 | - 1GB zeros 20 | - Accurate benchmarking with command "turbohist file -I15" 21 | 22 | ###### Benchmark Intel CPU: i7-9700K 3.6GHz gcc 11.2 23 | Uniform distribution - enwik9 Text file, size=1.000.0000.000 24 | | Function | MB/s |Cycle/Byte|Language |Package | 25 | |----------------------------|-------:|---------:|----------|---------| 26 | | 1:hist_1_8 naiv 8 bits| 2761.01|1.3423 |C |TurboHist| 27 | | 2:hist_4_8 4 bins/ 8 bits| 2725.92|1.3249|C|TurboHist| 28 | | 3:hist_8_8 8 bins/ 8 bits| 2850.05|1.2627|C|TurboHist| 29 | | 4:hist_4_32 4 bins/32 bits| 3691.02|0.9660|C|TurboHist| 30 | | 5:hist_8_32 8 bins/32 bits| 3867.26|0.9561|C|TurboHist| 31 | | 6:hist_4_64 4 bins/64 bits|4040.55|0.9103|C|TurboHist| 32 | | 7:hist_8_64 8 bins/64 bits|**4053.37**|**0.9035**|C|TurboHist| 33 | | 8:histr_4_64 4/64+run | 3915.85|0.9668|C|TurboHist| 34 | | 9:histr_8_64 8/64+run | 3916.51|0.9286|C|TurboHist| 35 | |10:hist_4_128 4 bins/sse4.1 | 3643.20|1.0081|C|TurboHist| 36 | |11:hist_8_128 8 bins/sse4.1 | 3607.06|0.9845|C|TurboHist| 37 | |12:hist_4_256 4 bins/avx2 | 3522.27|1.0195|C|TurboHist| 38 | |13:hist_8_256 8 bins/avx2 | 3542.25|1.0366|C|TurboHist| 39 | |15:hist_8_64asm inline asm |**4161.87**|**0.8787**|inline asm|TurboHist| 40 | |18:count2x64 inline asm | 3963.91|0.9172|inline asm|Countbench| 41 | |20:histo_ref | 2702.57|1.3567|C|[Ryg](https://gist.github.com/rygorous/a86a5cf348922cdea357c928e32fc7e0)| 42 | |21:histo_cpp_1x | 1876.13|1.8236|C|Ryg| 43 | |22:histo_cpp_2x | 2664.78|1.5935|C|Ryg| 44 | |23:histo_cpp_4x | 2817.77|1.2944|C|Ryg| 45 | |24:histo_asm_scalar4 | 3130.08|1.1609|asm|Ryg| 46 | |25:histo_asm_scalar8 | 3353.08|1.0636|asm|Ryg| 47 | |26:histo_asm_scalar8_var | 3704.88|0.9856|asm|Ryg| 48 | |27:histo_asm_scalar8_var2 | 4085.48|0.8913|asm|Ryg| 49 | |28:histo_asm_scalar8_var3 | 4132.54|0.8870|asm|Ryg| 50 | |29:histo_asm_scalar8_var4 | 4083.92|0.8970|asm|Ryg| 51 | |30:histo_asm_scalar8_var5 | 4002.21|0.9025|asm|Ryg| 52 | |31:histo_asm_sse4 | 3153.01|1.1445|asm|Ryg| 53 | |32:memcpy |13724.29|0.2698|C| 54 | 55 | Skewed distribution - enwik9.bwt Text file, size=1.000.0000.000 56 | | Function | MB/s |Cycle/Byte|Language | 57 | |----------------------------|-------:|---------:|----------| 58 | | 1:hist_1_8 naiv 8 bits| 1170.89|3.0642|C|TurboHist| 59 | | 2:hist_4_8 4 bins/ 8 bits| 2707.74|1.3321|C|TurboHist| 60 | | 3:hist_8_8 8 bins/ 8 bits| 2804.08|1.3208|C|TurboHist| 61 | | 4:hist_4_32 4 bins/32 bits| 3118.54|1.1402|C|TurboHist| 62 | | 5:hist_8_32 8 bins/32 bits| 3780.16|0.9714|C|TurboHist| 63 | | 6:hist_4_64 4 bins/64 bits| 3646.25|0.9980|C|TurboHist| 64 | | 7:hist_8_64 
8 bins/64 bits| 3941.96|0.9282|C|TurboHist| 65 | | 8:histr_4_64 4/64+run | 5061.62|0.7270|C|TurboHist| 66 | | 9:histr_8_64 8/64+run |**5135.29**|**0.7229**|C|TurboHist| 67 | |10:hist_4_128 4 bins/sse4.1 | 3535.36|1.0365|C|TurboHist| 68 | |11:hist_8_128 8 bins/sse4.1 | 3654.41|0.9791|C|TurboHist| 69 | |12:hist_4_256 4 bins/avx2 | 3329.87|1.1022|C|TurboHist| 70 | |13:hist_8_256 8 bins/avx2 | 3540.36|1.0343|C|TurboHist| 71 | |15:hist_8_64asm inline asm | 4047.74|0.9013|inline asm|TurboHist| 72 | |18:count2x64 inline asm | 3969.92|0.9262|inline asm|[Countbench](https://github.com/nkurz/countbench)| 73 | |20:histo_ref | 1182.61|3.0718|C|Ryg| 74 | |21:histo_cpp_1x | 1213.42|2.9748|C|Ryg| 75 | |22:histo_cpp_2x | 2115.60|1.7373|C|Ryg| 76 | |23:histo_cpp_4x | 1801.97|2.0024|C|Ryg| 77 | |24:histo_asm_scalar4 | 3092.87|1.1561|asm|Ryg| 78 | |25:histo_asm_scalar8 | 3203.95|1.1139|asm|Ryg| 79 | |26:histo_asm_scalar8_var | 3460.45|1.0422|asm|Ryg| 80 | |27:histo_asm_scalar8_var2 | 3659.61|0.9878|asm|Ryg| 81 | |28:histo_asm_scalar8_var3 | 3769.96|0.9569|asm|Ryg| 82 | |29:histo_asm_scalar8_var4 | 3996.75|0.8905|asm|Ryg| 83 | |30:histo_asm_scalar8_var5 | 4642.10|0.7719|asm|Ryg| 84 | |31:histo_asm_sse4 | 3091.36|1.1670|asm|Ryg| 85 | |32:memcpy |15594.28|0.2412|C| 86 | 87 | All zeros: size=1.000.0000.000 88 | | Function | MB/s |Cycle/Byte|Language | 89 | |----------------------------|-------:|---------:|----------| 90 | | 1:hist_1_8 naiv 8 bits| 877.27|4.0805|C|TurboHist| 91 | | 2:hist_4_8 4 bins/ 8 bits| 2650.84|1.3485|C|TurboHist| 92 | | 3:hist_8_8 8 bins/ 8 bits| 2743.40|1.2994|C|TurboHist| 93 | | 4:hist_4_32 4 bins/32 bits| 2978.83|1.2006|C|TurboHist| 94 | | 5:hist_8_32 8 bins/32 bits| 3775.45|0.9555|C|TurboHist| 95 | | 6:hist_4_64 4 bins/64 bits| 3411.11|1.0530|C|TurboHist| 96 | | 7:hist_8_64 8 bins/64 bits| 3928.09|0.9342|C|TurboHist| 97 | | 8:histr_4_64 4/64+run |18998.87|0.1868|C|TurboHist| 98 | | 9:histr_8_64 8/64+run |**19629.28**|**0.1869**|C|TurboHist| 99 | |10:hist_4_128 4 bins/sse4.1 | 3365.40|1.0717|C|TurboHist| 100 | |11:hist_8_128 8 bins/sse4.1 | 3632.61|0.9950|C|TurboHist| 101 | |12:hist_4_256 4 bins/avx2 | 3112.15|1.1576|C|TurboHist| 102 | |13:hist_8_256 8 bins/avx2 | 3497.08|1.0205|C|TurboHist| 103 | |15:hist_8_64asm inline asm |4089.97|0.8817|inline asm|TurboHist| 104 | |18:count2x64 inline asm | 3881.98|0.9158|inline asm|Countbench| 105 | |20:histo_ref | 882.93|4.1072|C|Ryg| 106 | |21:histo_cpp_1x | 873.20|4.1069|C|Ryg| 107 | |22:histo_cpp_2x | 1720.19|2.0961|C|Ryg| 108 | |23:histo_cpp_4x | 1866.99|2.0817|C|Ryg| 109 | |24:histo_asm_scalar4 | 2995.84|1.1942|asm|Ryg| 110 | |25:histo_asm_scalar8 | 3107.30|1.1618|asm|Ryg| 111 | |26:histo_asm_scalar8_var | 3288.67|1.1143|asm|Ryg| 112 | |27:histo_asm_scalar8_var2 | 3290.92|1.0957|asm|Ryg| 113 | |28:histo_asm_scalar8_var3 | 3707.41|0.9763|asm|Ryg| 114 | |29:histo_asm_scalar8_var4 | 3988.01|0.9019|asm|Ryg| 115 | |30:histo_asm_scalar8_var5 |14076.09|0.2564|asm|Ryg| 116 | |31:histo_asm_sse4 | 3020.32|1.1975|asm|Ryg| 117 | |32:memcpy |14057.53|0.2636|C| 118 | 119 | (**bold** = pareto) MB=1.000.000 120 | - [Ryg](https://gist.github.com/rygorous/a86a5cf348922cdea357c928e32fc7e0)
121 | - [Countbench](https://github.com/nkurz/countbench) 122 | 123 | ## Compile: 124 | 125 | 126 | make 127 | or 128 | make AVX2=1 129 | 130 | ## Usage: 131 | 132 | 133 | turbohist [-e#] file [-I#] [-z] 134 | options: 135 | -e# # = function numbers separated by ',' 136 | -I# # = number of iterations 137 | set to -I15 for accurate timings 138 | -z set read buffer to zeros 139 | 140 | ### Examples: 141 | 142 | ./turbohist file 143 | ./turbohist -e1,7,9 file 144 | 145 | ### Environment: 146 | ###### OS/Compiler (32 + 64 bits): 147 | - Windows: MinGW-w64 makefile 148 | - Linux amd/intel: GNU GCC (>=4.6) 149 | - Linux amd/intel: Clang (>=3.2) 150 | - Linux arm: aarch64 ARMv8: gcc (>=6.3) 151 | - macOS: Xcode (>=9) + Apple M1 152 | - PowerPC ppc64le: gcc (>=8.0) 153 | 154 | Last update: 01 JAN 2022 155 | -------------------------------------------------------------------------------- /turbohist.c: -------------------------------------------------------------------------------- 1 | /** 2 | Copyright (c) 2013-2022, Powturbo 3 | - homepage : https://sites.google.com/site/powturbo/ 4 | - github : https://github.com/powturbo 5 | - twitter : https://twitter.com/powturbo 6 | - email : powturbo [_AT_] gmail [_DOT_] com 7 | All rights reserved. 8 | 9 | Redistribution and use in source and binary forms, with or without 10 | modification, are permitted provided that the following conditions are 11 | met: 12 | 13 | 1. Redistributions of source code must retain the above copyright 14 | notice, this list of conditions and the following disclaimer. 15 | 16 | 2. Redistributions in binary form must reproduce the above copyright 17 | notice, this list of conditions and the following disclaimer in the 18 | documentation and/or other materials provided with the distribution. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS 21 | IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 22 | TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A 23 | PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 24 | HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 25 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED 26 | TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 27 | PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 28 | LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 29 | NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 30 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
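 Note (added for illustration; not part of the original sources): in the function names benchmarked below, "4 bins"/"8 bins" refers to keeping several independent count tables so that runs of identical bytes do not serialize on a single counter (store-to-load dependency), and "32/64 bits" refers to how many input bits are read per load. The actual implementations live in turbohist_.c (not included in this listing); a minimal sketch of the multi-table idea, under those assumptions only, looks like:

   static void hist_4_sketch(const unsigned char *in, unsigned n, unsigned cnt[256]) {
     /* four private count tables to break the dependency chain on repeated bytes */
     unsigned c0[256] = {0}, c1[256] = {0}, c2[256] = {0}, c3[256] = {0}, i;
     for(i = 0; i + 4 <= n; i += 4) { c0[in[i]]++; c1[in[i+1]]++; c2[in[i+2]]++; c3[in[i+3]]++; }
     for(; i < n; i++) c0[in[i]]++;                       /* tail bytes */
     for(i = 0; i < 256; i++) cnt[i] = c0[i] + c1[i] + c2[i] + c3[i]; /* merge tables */
   }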
31 | **/ 32 | // Turbo histogram benchmark 33 | #include 34 | #include 35 | #include 36 | #ifdef __APPLE__ 37 | #include 38 | #else 39 | #include 40 | #endif 41 | #ifdef _MSC_VER 42 | #include "vs/getopt.h" 43 | #else 44 | #include 45 | #include 46 | #endif 47 | #include "conf.h" 48 | #include "time_.h" 49 | 50 | #include "turbohist_.c" 51 | #ifdef _RYG 52 | #include "histotest.cpp" 53 | #endif 54 | 55 | NOINLINE void libmemcpy(unsigned char *dst, unsigned char *src, int len) { 56 | void *(*memcpy_ptr)(void *, const void *, size_t) = memcpy; 57 | if (time(NULL) == 1) 58 | memcpy_ptr = NULL; 59 | memcpy_ptr(dst, src, len); 60 | } 61 | 62 | void usage(char *pgm) { 63 | fprintf(stderr, "\nTurboHist Copyright (c) 2013-2022 Powturbo %s\n", __DATE__); 64 | fprintf(stderr, "Usage: %s [options] [file]\n", pgm); 65 | fprintf(stderr, "Benchmark:\n"); 66 | fprintf(stderr, " -I# # = Number of runs (default=3)\n"); 67 | fprintf(stderr, " -z set the read buffer to zeros\n"); 68 | fprintf(stderr, "Ex. ./turbohist file -I15\n"); 69 | fprintf(stderr, " ./turbohist file -I15 -z\n"); 70 | fprintf(stderr, " ./turbohist file -e1,4,8,15 -I15\n"); 71 | exit(0); 72 | } 73 | 74 | int check(unsigned *cnt, unsigned n, unsigned *scnt) { unsigned i; for(i=0;i<256;i++) if(cnt[i]!=scnt[i]) { printf("Error sum at %d ", i); return 0; } printf(" %s", TM_MBS); } 75 | 76 | int bench(unsigned char *in, unsigned n, unsigned *cnt, unsigned id, unsigned *scnt) { 77 | switch(id) { 78 | case 1: TMBENCH(" 1:hist_1_8 naiv 8 bits", hist_1_8( in, n, cnt),n); break; 79 | case 2: TMBENCH(" 2:hist_4_8 4 bins/ 8 bits", hist_4_8( in, n, cnt),n); break; 80 | case 3: TMBENCH(" 3:hist_8_8 8 bins/ 8 bits", hist_8_8( in, n, cnt),n); break; 81 | case 4: TMBENCH(" 4:hist_4_32 4 bins/32 bits", hist_4_32( in, n, cnt),n); break; 82 | case 5: TMBENCH(" 5:hist_8_32 8 bins/32 bits", hist_8_32( in, n, cnt),n); break; 83 | case 6: TMBENCH(" 6:hist_4_64 4 bins/64 bits", hist_4_64( in, n, cnt),n); break; 84 | case 7: TMBENCH(" 7:hist_8_64 8 bins/64 bits", hist_8_64( in, n, cnt),n); break; 85 | case 8: TMBENCH(" 8:histr_4_64 4/64+run ", histr_4_64( in, n, cnt),n); break; 86 | case 9: TMBENCH(" 9:histr_8_64 8/64+run ", histr_8_64( in, n, cnt),n); break; 87 | #ifdef __ARM_NEON 88 | case 10: TMBENCH("10:hist_4_128 4 bins/neon ", hist_4_128( in, n, cnt),n); break; 89 | case 11: TMBENCH("11:hist_8_128 8 bins/neon ", hist_8_128( in, n, cnt),n); break; 90 | #else 91 | case 10: TMBENCH("10:hist_4_128 4 bins/sse4.1 ", hist_4_128( in, n, cnt),n); break; 92 | case 11: TMBENCH("11:hist_8_128 8 bins/sse4.1 ", hist_8_128( in, n, cnt),n); break; 93 | #endif 94 | #ifdef __AVX2__ 95 | case 12: TMBENCH("12:hist_4_256 4 bins/avx2 ", hist_4_256( in, n, cnt),n); break; 96 | case 13: TMBENCH("13:hist_8_256 8 bins/avx2 ", hist_8_256( in, n, cnt),n); break; 97 | #endif 98 | #ifdef __x86_64 99 | case 15: TMBENCH("15:hist_8_64asm inline asm ", hist_8_64a( in, n, cnt),n); break; 100 | #endif 101 | #ifdef _COUNTBENCH 102 | case 18: TMBENCH("18:count2x64 inline asm ", count2x64( in, n, cnt),n); break; 103 | // case 19: TMBENCH("19:count2x64c ", count2x64c( in, n, cnt),n); break; 104 | #endif 105 | #ifdef _RYG 106 | case 20: TMBENCH("20:histo_ref ", histo_ref( cnt, in, n),n); break; 107 | case 21: TMBENCH("21:histo_cpp_1x ", histo_cpp_1x( cnt, in, n),n); break; 108 | case 22: TMBENCH("22:histo_cpp_2x ", histo_cpp_2x( cnt, in, n),n); break; 109 | case 23: TMBENCH("23:histo_cpp_4x ", histo_cpp_4x( cnt, in, n),n); break; 110 | case 24: TMBENCH("24:histo_asm_scalar4 ", histo_asm_scalar4( 
cnt, in, n),n); break; 111 | case 25: TMBENCH("25:histo_asm_scalar8 ", histo_asm_scalar8( cnt, in, n),n); break; 112 | case 26: TMBENCH("26:histo_asm_scalar8_var ", histo_asm_scalar8_var( cnt, in, n),n); break; 113 | case 27: TMBENCH("27:histo_asm_scalar8_var2 ", histo_asm_scalar8_var2(cnt, in, n),n); break; 114 | case 28: TMBENCH("28:histo_asm_scalar8_var3 ", histo_asm_scalar8_var3(cnt, in, n),n); break; 115 | case 29: TMBENCH("29:histo_asm_scalar8_var4 ", histo_asm_scalar8_var4(cnt, in, n),n); break; 116 | case 30: TMBENCH("30:histo_asm_scalar8_var5 ", histo_asm_scalar8_var5(cnt, in, n),n); break; 117 | case 31: TMBENCH("31:histo_asm_sse4 ", histo_asm_sse4( cnt, in, n),n); break; 118 | #ifdef __AVX2__ 119 | case 37: TMBENCH("37:histo_asm_avx256_8x_1 ", histo_asm_avx256_8x_1( cnt, in, n),n); break; 120 | case 38: TMBENCH("38:histo_asm_avx256_8x_2 ", histo_asm_avx256_8x_2( cnt, in, n),n); break; 121 | case 39: TMBENCH("39:histo_asm_avx256_8x_3 ", histo_asm_avx256_8x_3( cnt, in, n),n); break; 122 | #endif 123 | #endif 124 | case 32: { unsigned char *cpy = malloc(n); if(cpy) { TMBENCH("32:memcpy ", libmemcpy(cpy, in, n),n); free(cpy); printf(" %s", TM_MBS); } } return 0; break; 125 | #define ID_LAST 32 126 | default: return 0; 127 | } 128 | check(cnt,n,scnt); 129 | return 1; 130 | } 131 | 132 | int main(int argc, char *argv[]) { 133 | unsigned char *finame = argv[1], *scmd = NULL, *in; 134 | unsigned n, fno, zero=0, scnt[256], cnt[256]; 135 | 136 | int c, digit_optind = 0; 137 | for(;;) { 138 | int this_option_optind = optind ? optind : 1; 139 | int option_index = 0; 140 | static struct option long_options[] = { 141 | { "help", 0, 0, 'h'}, 142 | { 0, 0, 0, 0} 143 | }; 144 | if((c = getopt_long(argc, argv, "e:hI:z", long_options, &option_index)) == -1) break; 145 | switch(c) { 146 | case 0: 147 | printf("Option %s", long_options[option_index].name); 148 | if(optarg) printf (" with arg %s", optarg); printf ("\n"); 149 | break; 150 | case 'I': if((tm_Rep = atoi(optarg))<=0) tm_rep =tm_Rep =1; break; 151 | case 'z': zero++; break; 152 | case 'e': scmd = optarg; break; 153 | case 'h': 154 | default: 155 | usage(argv[0]); 156 | exit(0); 157 | } 158 | } 159 | 160 | printf("\nTurboHist Copyright (c) 2013-2022 Powturbo %s\n", __DATE__); 161 | char _scmd[33]; 162 | sprintf(_scmd, "1-%d", ID_LAST); 163 | 164 | for(fno = optind; fno < argc; fno++) { 165 | finame = argv[fno]; 166 | 167 | FILE *fi = fopen(finame, "rb"); 168 | if(!fi) perror(finame), exit(1); // printf("'%s'\n", finame); 169 | 170 | fseek(fi, 0, SEEK_END); 171 | long long flen = ftell(fi); 172 | fseek(fi, 0, SEEK_SET); 173 | 174 | if(flen > GB) flen = GB; 175 | n = flen; 176 | if(!(in = (unsigned char*)malloc(n))) 177 | printf("malloc error\n"), exit(-1); 178 | n = fread(in, 1, n, fi); 179 | fclose(fi); 180 | if(n <= 0) 181 | exit(0); 182 | 183 | if(zero) memset(in, 0, n); 184 | int i; hist_1_8(in, n, scnt); // first run 185 | unsigned char *p = (scmd && (scmd[0] != '0' || scmd[1]))?scmd:_scmd; 186 | do { 187 | int id = strtoul(p, &p, 10),idx = id, i; 188 | if(id >= 0) { 189 | while(isspace(*p)) p++; if(*p == '-') { if((idx = strtoul(p+1, &p, 10)) < id) idx = id; if(idx > ID_LAST) idx = ID_LAST; } //printf("ID=%d,%d ", id, idx); 190 | for(i = id; i <= idx; i++) { 191 | if(bench(in, n, cnt, i, scnt)) printf("\t%s\n", finame); 192 | } 193 | } 194 | } while(*p++); 195 | printf("\n"); 196 | free(in); 197 | } 198 | } 199 | -------------------------------------------------------------------------------- /time_.h: 
-------------------------------------------------------------------------------- 1 | /** 2 | Copyright (C) powturbo 2013-2022 3 | GPL v2 License 4 | 5 | This program is free software; you can redistribute it and/or modify 6 | it under the terms of the GNU General Public License as published by 7 | the Free Software Foundation; either version 2 of the License, or 8 | (at your option) any later version. 9 | 10 | This program is distributed in the hope that it will be useful, 11 | but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | GNU General Public License for more details. 14 | 15 | You should have received a copy of the GNU General Public License along 16 | with this program; if not, write to the Free Software Foundation, Inc., 17 | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 18 | 19 | - homepage : https://sites.google.com/site/powturbo/ 20 | - github : https://github.com/powturbo 21 | - twitter : https://twitter.com/powturbo 22 | - email : powturbo [_AT_] gmail [_DOT_] com 23 | **/ 24 | // time_.h : parameter free high precision time/benchmark functions 25 | #include 26 | #include 27 | #ifdef _WIN32 28 | #include 29 | #ifndef sleep 30 | #define sleep(n) Sleep((n) * 1000) 31 | #endif 32 | typedef unsigned __int64 uint64_t; 33 | 34 | #else 35 | #include 36 | #include 37 | #define Sleep(ms) usleep((ms) * 1000) 38 | #endif 39 | 40 | #if defined (__i386__) || defined( __x86_64__ ) // ------------------ rdtsc -------------------------- 41 | #ifdef _MSC_VER 42 | #include // __rdtsc 43 | #else 44 | #include 45 | #endif 46 | 47 | #ifdef __corei7__ 48 | #define RDTSC_INI(_c_) do { unsigned _cl, _ch; \ 49 | __asm volatile ("couid\n\t" \ 50 | "rdtsc\n\t" \ 51 | "mov %%edx, %0\n" \ 52 | "mov %%eax, %1\n": "=r" (_ch), "=r" (_cl):: \ 53 | "%rax", "%rbx", "%rcx", "%rdx"); \ 54 | _c_ = (uint64_t)_ch << 32 | _cl; \ 55 | } while(0) 56 | 57 | #define RDTSC(_c_) do { unsigned _cl, _ch; \ 58 | __asm volatile("rdtscp\n" \ 59 | "mov %%edx, %0\n" \ 60 | "mov %%eax, %1\n" \ 61 | "cpuid\n\t": "=r" (_ch), "=r" (_cl):: "%rax",\ 62 | "%rbx", "%rcx", "%rdx");\ 63 | _c_ = (uint64_t)_ch << 32 | _cl;\ 64 | } while(0) 65 | #else 66 | #define RDTSC(_c_) do { unsigned _cl, _ch;\ 67 | /* __asm volatile ("cpuid \n"\ 68 | "rdtsc"\ 69 | : "=a"(_cl), "=d"(_ch)\ 70 | : "a"(0)\ 71 | : "%ebx", "%ecx");\ 72 | _c_ = (uint64_t)_ch << 32 | _cl;\ 73 | } while(0)*/ 74 | #define RDTSC(_c_) do { unsigned _cl, _ch;\ 75 | __asm volatile("rdtsc" : "=a"(_cl), "=d"(_ch) );\ 76 | _c_ = (uint64_t)_ch << 32 | _cl;\ 77 | } while(0) 78 | #endif 79 | 80 | #define RDTSC_INI(_c_) RDTSC(_c_) 81 | #else // ------------------ time -------------------------- 82 | #define RDTSC_INI(_c_) 83 | #define RDTSC(_c_) 84 | #endif 85 | 86 | #ifndef TM_F 87 | #define TM_F 1.0 // TM_F=4 -> MI/s 88 | #endif 89 | 90 | #ifdef _RDTSC //---------------------- rdtsc -------------------------------- 91 | #define TM_M (CLOCKS_PER_SEC*1000000ull) 92 | #define TM_PRE 4 93 | #define TM_MBS "cycle/byte" 94 | static double TMBS(unsigned l, double t) { return (double)t/(double)l; } 95 | 96 | typedef uint64_t tm_t; 97 | static tm_t tmtime() { uint64_t c; RDTSC(c); return c; } 98 | static tm_t tminit() { uint64_t c; __asm volatile("" ::: "memory"); RDTSC_INI(c); return c; } 99 | static double tmdiff(tm_t start, tm_t stop) { return (double)(stop - start); } 100 | static int tmiszero(tm_t t) { return !t; } 101 | #else //---------------------- time ----------------------------------- 102 | 
#define TM_M 1 103 | #define TM_PRE 2 104 | #define TM_MBS "MB/s" 105 | static double TMBS(unsigned l, double t) { return (l/t)/1000000.0; } 106 | 107 | #ifdef _WIN32 //-------- windows 108 | static LARGE_INTEGER tps; 109 | 110 | typedef unsigned __int64 tm_t; 111 | static tm_t tmtime() { LARGE_INTEGER tm; tm_t t; QueryPerformanceCounter(&tm); return tm.QuadPart; } 112 | static tm_t tminit() { tm_t t0,ts; QueryPerformanceFrequency(&tps); t0 = tmtime(); while((ts = tmtime())==t0) {}; return ts; } 113 | static double tmdiff(tm_t start, tm_t stop) { return (double)(stop - start)/tps.QuadPart; } 114 | static int tmiszero(tm_t t) { return !t; } 115 | #else // Linux & compatible / macOS 116 | #ifdef __APPLE__ 117 | #include 118 | #ifndef MAC_OS_X_VERSION_10_12 119 | #define MAC_OS_X_VERSION_10_12 101200 120 | #endif 121 | #define CIVETWEB_APPLE_HAVE_CLOCK_GETTIME (defined(__APPLE__) && defined(MAC_OS_X_VERSION_MIN_REQUIRED) && MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_12) 122 | #if !(CIVETWEB_APPLE_HAVE_CLOCK_GETTIME) 123 | #include 124 | #define CLOCK_REALTIME 0 125 | #define CLOCK_MONOTONIC 0 126 | int clock_gettime(int /*clk_id*/, struct timespec* t) { 127 | struct timeval now; 128 | int rv = gettimeofday(&now, NULL); 129 | if (rv) return rv; 130 | t->tv_sec = now.tv_sec; 131 | t->tv_nsec = now.tv_usec * 1000; 132 | return 0; 133 | } 134 | #endif 135 | #endif 136 | 137 | typedef struct timespec tm_t; 138 | static tm_t tmtime() { struct timespec tm; clock_gettime(CLOCK_MONOTONIC, &tm); return tm; } 139 | static double tmdiff(tm_t start, tm_t stop) { return (stop.tv_sec - start.tv_sec) + (double)(stop.tv_nsec - start.tv_nsec)/1e9f; } 140 | static tm_t tminit() { tm_t t0 = tmtime(),t; while(!tmdiff(t = tmtime(),t0)) {}; return t; } 141 | static int tmiszero(tm_t t) { return !(t.tv_sec|t.tv_nsec); } 142 | #endif 143 | #endif 144 | 145 | //---------------------------------------- bench ---------------------------------------------------------------------- 146 | // for each run, the function call is repeated until exceeding tm_tx seconds. 147 | // A run duration is always tm_tx seconds 148 | // The number of runs can be set with the program options -I and -J (specify -I15 -J15 for more precision) 149 | 150 | // sleep after every 8 runs to avoid cpu throttling. 
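// Illustrative usage note (added; not part of the original header): a caller wraps the
// function under test in TMBENCH together with the processed length, so TMBS() can report
// either MB/s (default wall-clock build) or cycles/byte (when built with -D_RDTSC).
// turbohist.c, for example, benchmarks one histogram variant with:
//   TMBENCH(" 7:hist_8_64 8 bins/64 bits", hist_8_64(in, n, cnt), n);
//   printf(" %s", TM_MBS);
// The -I option sets tm_Rep (number of runs); the minimum time over all runs is what gets reported.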
151 | #define TMSLEEP do { tm_T = tmtime(); if(tmiszero(tm_0)) tm_0 = tm_T; else if(tmdiff(tm_0, tm_T) > tm_TX) { if(tm_verbose) { printf("S \b\b");fflush(stdout); } sleep(tm_slp); tm_0=tmtime();} } while(0) 152 | 153 | // benchmark loop 154 | #define TMBEG(_tm_Reps_) { unsigned _tm_r,_tm_c = 0,_tm_R,_tm_Rx = _tm_Reps_,_tm_Rn = _tm_Reps_; double _tm_t;\ 155 | for(tm_rm = tm_rep, tm_tm = DBL_MAX, _tm_R = 0; _tm_R < _tm_Rn; _tm_R++) { tm_t _tm_t0 = tminit(); /*for each run*/\ 156 | for(_tm_r = 0;_tm_r < tm_rm;) { /*repeat tm_rm times */ 157 | 158 | #define TMEND(_len_) \ 159 | _tm_r++; if(tm_tm == DBL_MAX && (_tm_t = tmdiff(_tm_t0, tmtime())) > tm_tx) break;\ 160 | }\ 161 | /*1st run: break the loop after tm_tx=1 sec, calculate a new repeats 'tm_rm' to avoid calling time() after each function call*/\ 162 | /*other runs: break the loop only after 'tm_rm' repeats */ \ 163 | _tm_t = tmdiff(_tm_t0, tmtime());\ 164 | /*set min time, recalculte repeats tm_rm based on tm_tx, recalculte number of runs based on tm_TX*/\ 165 | if(_tm_t < tm_tm) { if(tm_tm == DBL_MAX) { tm_rm = _tm_r; _tm_Rn = tm_TX/_tm_t; _tm_Rn = _tm_Rn<_tm_Rx?_tm_Rn:_tm_Rx; /*printf("repeats=%u,%u,%.4f ", _tm_Rn, _tm_Rx, _tm_t);*/ } \ 166 | tm_tm = _tm_t; _tm_c++;\ 167 | } else if(_tm_t > tm_tm*1.15) TMSLEEP;/*force sleep at 15% divergence*/\ 168 | if(tm_verbose) { printf("%8.*f %2d_%.2d\b\b\b\b\b\b\b\b\b\b\b\b\b\b",TM_PRE, TMBS(_len_, tm_tm/tm_rm),_tm_R+1,_tm_c),fflush(stdout); }\ 169 | if((_tm_R & 7)==7) sleep(tm_slp); /*pause 20 secs after each 8 runs to avoid cpu trottling*/\ 170 | }\ 171 | } 172 | 173 | static unsigned tm_rep = 1u<<30, tm_Rep = 3, tm_Rep2 = 3, tm_rm, tm_RepMin = 1, tm_slp = 20, tm_verbose = 2; 174 | static tm_t tm_0, tm_T; 175 | static double tm_tm, tm_tx = 1.0*TM_M, tm_TX = 60.0*TM_M; 176 | 177 | static void tm_init(int _tm_Rep, int _tm_verbose) { tm_verbose = _tm_verbose; if(_tm_Rep) tm_Rep = _tm_Rep; } 178 | 179 | #define TMBENCH(_name_, _func_, _len_) do { if(tm_verbose>1) printf("%s ", _name_?_name_:#_func_);\ 180 | TMBEG(tm_Rep) _func_; TMEND(_len_); \ 181 | double dm = tm_tm, dr = tm_rm; if(tm_verbose) printf("%8.*f \b\b\b\b\b", TM_PRE, TMBS(_len_, dm/dr) );\ 182 | } while(0) 183 | 184 | // second TMBENCH. 
Example: use TMBENCH for encoding and TMBENCH2 for decoding 185 | #define TMBENCH2(_name_, _func_, _len_) do { \ 186 | TMBEG(tm_Rep2) _func_; TMEND(_len_);\ 187 | double dm = tm_tm, dr = tm_rm; if(tm_verbose) printf("%8.*f \b\b\b\b\b", TM_PRE,TMBS(_len_, dm/dr) );\ 188 | if(tm_verbose>1) printf("%s ", _name_?_name_:#_func_);\ 189 | } while(0) 190 | 191 | // Check 192 | #define TMBENCHT(_name_,_func_, _len_, _res_) do { \ 193 | TMBEG(tm_Rep) \ 194 | if(_func_ != _res_) { printf("ERROR: %lld != %lld", (long long)_func_, (long long)_res_ ); exit(0); };\ 195 | TMEND(_len_);\ 196 | if(tm_verbose) printf("%8.*f \b\b\b\b\b", TM_PRE, TMBS(_len_,(double)tm_tm/(double)tm_rm) );\ 197 | if(tm_verbose) printf("%s ", _name_?_name_:#_func_ );\ 198 | } while(0) 199 | 200 | static void pr(unsigned l, unsigned n) { 201 | double r = (double)l*100.0/n; 202 | if(r>0.1) printf("%10u %6.2f%% ", l, r); 203 | else if(r>0.01) printf("%10u %7.3f%% ", l, r); 204 | else printf("%10u %8.4f%% ", l, r); fflush(stdout); 205 | } 206 | 207 | //---------------------------------------------------------------------------------------------------------------------------------- 208 | #define Kb (1u<<10) 209 | #define Mb (1u<<20) 210 | #define Gb (1u<<30) 211 | #define KB 1000 212 | #define MB 1000000 213 | #define GB 1000000000 214 | 215 | static unsigned argtoi(char *s, unsigned def) { 216 | char *p; 217 | unsigned n = strtol(s, &p, 10),f = 1; 218 | switch(*p) { 219 | case 'K': f = KB; break; 220 | case 'M': f = MB; break; 221 | case 'G': f = GB; break; 222 | case 'k': f = Kb; break; 223 | case 'm': f = Mb; break; 224 | case 'g': f = Gb; break; 225 | case 'B': return n; break; 226 | case 'b': def = 0; 227 | default: if(!def) return n>=32?0xffffffffu:(1u << n); f = def; 228 | } 229 | return n*f; 230 | } 231 | static uint64_t argtol(char *s) { 232 | char *p; 233 | uint64_t n = strtol(s, &p, 10),f=1; 234 | switch(*p) { 235 | case 'K': f = KB; break; 236 | case 'M': f = MB; break; 237 | case 'G': f = GB; break; 238 | case 'k': f = Kb; break; 239 | case 'm': f = Mb; break; 240 | case 'g': f = Gb; break; 241 | case 'B': return n; break; 242 | case 'b': return 1u << n; 243 | default: f = MB; 244 | } 245 | return n*f; 246 | } 247 | 248 | static uint64_t argtot(char *s) { 249 | char *p; 250 | uint64_t n = strtol(s, &p, 10),f=1; 251 | switch(*p) { 252 | case 'h': f = 3600000; break; 253 | case 'm': f = 60000; break; 254 | case 's': f = 1000; break; 255 | case 'M': f = 1; break; 256 | default: f = 1000; 257 | } 258 | return n*f; 259 | } 260 | 261 | static void memrcpy(unsigned char *out, unsigned char *in, unsigned n) { int i; for(i = 0; i < n; i++) out[i] = ~in[i]; } 262 | 263 | -------------------------------------------------------------------------------- /conf.h: -------------------------------------------------------------------------------- 1 | /** 2 | Copyright (C) powturbo 2013-2019 3 | GPL v2 License 4 | 5 | This program is free software; you can redistribute it and/or modify 6 | it under the terms of the GNU General Public License as published by 7 | the Free Software Foundation; either version 2 of the License, or 8 | (at your option) any later version. 9 | 10 | This program is distributed in the hope that it will be useful, 11 | but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | GNU General Public License for more details. 
14 | 15 | You should have received a copy of the GNU General Public License along 16 | with this program; if not, write to the Free Software Foundation, Inc., 17 | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 18 | 19 | - homepage : https://sites.google.com/site/powturbo/ 20 | - github : https://github.com/powturbo 21 | - twitter : https://twitter.com/powturbo 22 | - email : powturbo [_AT_] gmail [_DOT_] com 23 | **/ 24 | 25 | // conf.h - config & common 26 | #ifndef CONF_H 27 | #define CONF_H 28 | //------------------------- Compiler ------------------------------------------ 29 | #if defined(__GNUC__) 30 | #include 31 | #define ALIGNED(t,v,n) t v __attribute__ ((aligned (n))) 32 | #define ALWAYS_INLINE inline __attribute__((always_inline)) 33 | #define NOINLINE __attribute__((noinline)) 34 | #define _PACKED __attribute__ ((packed)) 35 | #define likely(x) __builtin_expect((x),1) 36 | #define unlikely(x) __builtin_expect((x),0) 37 | 38 | #define popcnt32(_x_) __builtin_popcount(_x_) 39 | #define popcnt64(_x_) __builtin_popcountll(_x_) 40 | 41 | #if defined(__i386__) || defined(__x86_64__) 42 | //x,__bsr32: 1:0,2:1,3:1,4:2,5:2,6:2,7:2,8:3,9:3,10:3,11:3,12:3,13:3,14:3,15:3,16:4,17:4,18:4,19:4,20:4,21:4,22:4,23:4,24:4,25:4,26:4,27:4,28:4,29:4,30:4,31:4,32:5 43 | // x,bsr32: 0:0,1:1,2:2,3:2,4:3,5:3,6:3,7:3,8:4,9:4,10:4,11:4,12:4,13:4,14:4,15:4,16:5,17:5,18:5,19:5,20:5,21:5,22:5,23:5,24:5,25:5,26:5,27:5,28:5,29:5,30:5,31:5,32:6, 44 | static inline int __bsr32( int x) { asm("bsr %1,%0" : "=r" (x) : "rm" (x) ); return x; } 45 | static inline int bsr32( int x) { int b = -1; asm("bsrl %1,%0" : "+r" (b) : "rm" (x) ); return b + 1; } 46 | static inline int bsr64(uint64_t x ) { return x?64 - __builtin_clzll(x):0; } 47 | static inline int __bsr64(uint64_t x ) { return 63 - __builtin_clzll(x); } 48 | 49 | static inline unsigned rol32(unsigned x, int s) { asm ("roll %%cl,%0" :"=r" (x) :"0" (x),"c" (s)); return x; } 50 | static inline unsigned ror32(unsigned x, int s) { asm ("rorl %%cl,%0" :"=r" (x) :"0" (x),"c" (s)); return x; } 51 | static inline uint64_t rol64(uint64_t x, int s) { asm ("rolq %%cl,%0" :"=r" (x) :"0" (x),"c" (s)); return x; } 52 | static inline uint64_t ror64(uint64_t x, int s) { asm ("rorq %%cl,%0" :"=r" (x) :"0" (x),"c" (s)); return x; } 53 | #else 54 | static inline int __bsr32(unsigned x ) { return 31 - __builtin_clz( x); } 55 | static inline int bsr32(int x ) { return x?32 - __builtin_clz( x):0; } 56 | static inline int bsr64(uint64_t x) { return x?64 - __builtin_clzll(x):0; } 57 | 58 | static inline unsigned rol32(unsigned x, int s) { return x << s | x >> (32 - s); } 59 | static inline unsigned ror32(unsigned x, int s) { return x >> s | x << (32 - s); } 60 | static inline unsigned rol64(unsigned x, int s) { return x << s | x >> (64 - s); } 61 | static inline unsigned ror64(unsigned x, int s) { return x >> s | x << (64 - s); } 62 | #endif 63 | 64 | #define ctz64(_x_) __builtin_ctzll(_x_) 65 | #define ctz32(_x_) __builtin_ctz(_x_) // 0:32 ctz32(1< 4 || __GNUC__ == 4 && __GNUC_MINOR__ >= 8 71 | #define bswap16(x) __builtin_bswap16(x) 72 | #else 73 | static inline unsigned short bswap16(unsigned short x) { return __builtin_bswap32(x << 16); } 74 | #endif 75 | #define bswap32(x) __builtin_bswap32(x) 76 | #define bswap64(x) __builtin_bswap64(x) 77 | 78 | #elif _MSC_VER //---------------------------------------------------- 79 | #include 80 | #include 81 | #if _MSC_VER < 1600 82 | #include "vs/stdint.h" 83 | #define __builtin_prefetch(x,a) 84 | #define inline __inline 85 | 
#else 86 | #include 87 | #define __builtin_prefetch(x,a) _mm_prefetch(x, _MM_HINT_NTA) 88 | #endif 89 | 90 | #define ALIGNED(t,v,n) __declspec(align(n)) t v 91 | #define ALWAYS_INLINE __forceinline 92 | #define NOINLINE __declspec(noinline) 93 | #define THREADLOCAL __declspec(thread) 94 | #define likely(x) (x) 95 | #define unlikely(x) (x) 96 | 97 | static inline int __bsr32(unsigned x) { unsigned long z=0; _BitScanReverse(&z, x); return z; } 98 | static inline int bsr32( unsigned x) { unsigned long z; _BitScanReverse(&z, x); return x?z+1:0; } 99 | static inline int ctz32( unsigned x) { unsigned long z; _BitScanForward(&z, x); return x?z:32; } 100 | static inline int clz32( unsigned x) { unsigned long z; _BitScanReverse(&z, x); return x?31-z:32; } 101 | #if !defined(_M_ARM64) && !defined(_M_X64) 102 | static inline unsigned char _BitScanForward64(unsigned long* ret, uint64_t x) { 103 | unsigned long x0 = (unsigned long)x, top, bottom; _BitScanForward(&top, (unsigned long)(x >> 32)); _BitScanForward(&bottom, x0); 104 | *ret = x0 ? bottom : 32 + top; return x != 0; 105 | } 106 | static unsigned char _BitScanReverse64(unsigned long* ret, uint64_t x) { 107 | unsigned long x1 = (unsigned long)(x >> 32), top, bottom; _BitScanReverse(&top, x1); _BitScanReverse(&bottom, (unsigned long)x); 108 | *ret = x1 ? top + 32 : bottom; return x != 0; 109 | } 110 | #endif 111 | static inline int bsr64(uint64_t x) { unsigned long z=0; _BitScanReverse64(&z, x); return x?z+1:0; } 112 | static inline int ctz64(uint64_t x) { unsigned long z; _BitScanForward64(&z, x); return x?z:64; } 113 | static inline int clz64(uint64_t x) { unsigned long z; _BitScanReverse64(&z, x); return x?63-z:64; } 114 | 115 | #define rol32(x,s) _lrotl(x, s) 116 | #define ror32(x,s) _lrotr(x, s) 117 | 118 | #define bswap16(x) _byteswap_ushort(x) 119 | #define bswap32(x) _byteswap_ulong(x) 120 | #define bswap64(x) _byteswap_uint64(x) 121 | 122 | #define popcnt32(x) __popcnt(x) 123 | #ifdef _WIN64 124 | #define popcnt64(x) __popcnt64(x) 125 | #else 126 | #define popcnt64(x) (popcnt32(x) + popcnt32(x>>32)) 127 | #endif 128 | 129 | #define sleep(x) Sleep(x/1000) 130 | #define fseeko _fseeki64 131 | #define ftello _ftelli64 132 | #define strcasecmp _stricmp 133 | #define strncasecmp _strnicmp 134 | #define strtoull _strtoui64 135 | static inline double round(double num) { return (num > 0.0) ? 
floor(num + 0.5) : ceil(num - 0.5); } 136 | #endif 137 | 138 | #define __bsr8(_x_) __bsr32(_x_) 139 | #define __bsr16(_x_) __bsr32(_x_) 140 | #define bsr8(_x_) bsr32(_x_) 141 | #define bsr16(_x_) bsr32(_x_) 142 | #define ctz8(_x_) ctz32(_x_) 143 | #define ctz16(_x_) ctz32(_x_) 144 | #define clz8(_x_) (clz32(_x_)-24) 145 | #define clz16(_x_) (clz32(_x_)-16) 146 | 147 | #define popcnt8(x) popcnt32(x) 148 | #define popcnt16(x) popcnt32(x) 149 | 150 | //--------------- Unaligned memory access ------------------------------------- 151 | #ifdef UA_MEMCPY 152 | #include 153 | static inline unsigned short ctou16(const void *cp) { unsigned short x; memcpy(&x, cp, sizeof(x)); return x; } 154 | static inline unsigned ctou32(const void *cp) { unsigned x; memcpy(&x, cp, sizeof(x)); return x; } 155 | static inline unsigned long long ctou64(const void *cp) { unsigned long long x; memcpy(&x, cp, sizeof(x)); return x; } 156 | static inline size_t ctousz(const void *cp) { size_t x; memcpy(&x, cp, sizeof(x)); return x; } 157 | static inline float ctof32(const void *cp) { float x; memcpy(&x, cp, sizeof(x)); return x; } 158 | static inline double ctof64(const void *cp) { double x; memcpy(&x, cp, sizeof(x)); return x; } 159 | 160 | static inline void stou16( void *cp, unsigned short x) { memcpy(cp, &x, sizeof(x)); } 161 | static inline void stou32( void *cp, unsigned x) { memcpy(cp, &x, sizeof(x)); } 162 | static inline void stou64( void *cp, unsigned long long x) { memcpy(cp, &x, sizeof(x)); } 163 | static inline void stousz( void *cp, size_t x) { memcpy(cp, &x, sizeof(x)); } 164 | static inline void stof32( void *cp, float x) { memcpy(cp, &x, sizeof(x)); } 165 | static inline void stof64( void *cp, double x) { memcpy(cp, &x, sizeof(x)); } 166 | #elif defined(__i386__) || defined(__x86_64__) || \ 167 | defined(_M_IX86) || defined(_M_AMD64) || _MSC_VER ||\ 168 | defined(__powerpc__) || defined(__s390__) ||\ 169 | defined(__ARM_FEATURE_UNALIGNED) || defined(__aarch64__) || defined(__arm__) ||\ 170 | defined(__ARM_ARCH_4__) || defined(__ARM_ARCH_4T__) || \ 171 | defined(__ARM_ARCH_5__) || defined(__ARM_ARCH_5T__) || defined(__ARM_ARCH_5TE__) || defined(__ARM_ARCH_5TEJ__) || \ 172 | defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) || defined(__ARM_ARCH_6K__) || defined(__ARM_ARCH_6T2__) || defined(__ARM_ARCH_6Z__) || defined(__ARM_ARCH_6ZK__) 173 | #define ctou16(_cp_) (*(unsigned short *)(_cp_)) 174 | #define ctou32(_cp_) (*(unsigned *)(_cp_)) 175 | #define ctof32(_cp_) (*(float *)(_cp_)) 176 | 177 | #if defined(__i386__) || defined(__x86_64__) || defined(__powerpc__) || defined(__s390__) || defined(_MSC_VER) 178 | #define ctou64(_cp_) (*(uint64_t *)(_cp_)) 179 | #define ctof64(_cp_) (*(double *)(_cp_)) 180 | #elif defined(__ARM_FEATURE_UNALIGNED) 181 | struct _PACKED longu { uint64_t l; }; 182 | struct _PACKED doubleu { double d; }; 183 | #define ctou64(_cp_) ((struct longu *)(_cp_))->l 184 | #define ctof64(_cp_) ((struct doubleu *)(_cp_))->d 185 | #endif 186 | 187 | #elif defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A__) || defined(__ARM_ARCH_7M__) || defined(__ARM_ARCH_7R__) || defined(__ARM_ARCH_7S__) 188 | struct _PACKED shortu { unsigned short s; }; 189 | struct _PACKED unsignedu { unsigned u; }; 190 | struct _PACKED longu { uint64_t l; }; 191 | struct _PACKED floatu { float f; }; 192 | struct _PACKED doubleu { double d; }; 193 | 194 | #define ctou16(_cp_) ((struct shortu *)(_cp_))->s 195 | #define ctou32(_cp_) ((struct unsignedu *)(_cp_))->u 196 | #define ctou64(_cp_) ((struct longu *)(_cp_))->l 197 | 
#define ctof32(_cp_) ((struct floatu *)(_cp_))->f 198 | #define ctof64(_cp_) ((struct doubleu *)(_cp_))->d 199 | #else 200 | #error "unknown cpu" 201 | #endif 202 | 203 | #define ctou24(_cp_) (ctou32(_cp_) & 0xffffff) 204 | #define ctou48(_cp_) (ctou64(_cp_) & 0xffffffffffffull) 205 | #define ctou8(_cp_) (*(_cp_)) 206 | //--------------------- wordsize ---------------------------------------------- 207 | #if defined(__64BIT__) || defined(_LP64) || defined(__LP64__) || defined(_WIN64) ||\ 208 | defined(__x86_64__) || defined(_M_X64) ||\ 209 | defined(__ia64) || defined(_M_IA64) ||\ 210 | defined(__aarch64__) ||\ 211 | defined(__mips64) ||\ 212 | defined(__powerpc64__) || defined(__ppc64__) || defined(__PPC64__) ||\ 213 | defined(__s390x__) 214 | #define __WORDSIZE 64 215 | #else 216 | #define __WORDSIZE 32 217 | #endif 218 | #endif 219 | 220 | //---------------------misc --------------------------------------------------- 221 | //#define bzhi63(_u_, _b_) ((_u_) & ((1ull<<(_b_))-1)) // _b_ < 64 222 | //#define bzhi63(_u_, _b_) ((_u_) & ((1u <<(_b_))-1)) // _b_ < 32 223 | #define BZHI64(_u_, _b_) (_b_ == 64?0xffffffffffffffffull:((_u_) & ((1ull<<(_b_))-1))) // b Constant 224 | #define BZHI32(_u_, _b_) (_b_ == 32? 0xffffffffu :((_u_) & ((1u <<(_b_))-1))) 225 | #define BZHI16(_u_, _b_) BZHI32(_u_, _b_) 226 | #define BZHI8( _u_, _b_) BZHI32(_u_, _b_) 227 | 228 | #ifdef __AVX2__ 229 | #if defined(_MSC_VER) && !defined(__INTEL_COMPILER) 230 | #include 231 | #else 232 | #include 233 | #endif 234 | #define bzhi32(_u_, _b_) _bzhi_u32(_u_, _b_) // b variable 235 | #define bzhi31(_u_, _b_) _bzhi_u32(_u_, _b_) 236 | 237 | #if !(defined(_M_X64) || defined(__amd64__)) && (defined(__i386__) || defined(_M_IX86)) 238 | #define bzhi64(_u_, _b_) BZHI64(_u_, _b_) 239 | #define bzhi63(_u_, _b_) ((_u_) & ((1ull<<(_b_))-1)) 240 | #else 241 | #define bzhi64(_u_, _b_) _bzhi_u64(_u_, _b_) 242 | #define bzhi63(_u_, _b_) _bzhi_u64(_u_, _b_) 243 | #endif 244 | #else 245 | #define bzhi64(_u_, _b_) BZHI64(_u_, _b_) 246 | #define bzhi63(_u_, _b_) ((_u_) & ((1ull <<(_b_))-1)) 247 | #define bzhi32(_u_, _b_) ((_u_) & ((1ull <<(_b_))-1)) 248 | #define bzhi31(_u_, _b_) ((_u_) & ((1 <<(_b_))-1)) 249 | #endif 250 | 251 | #define bzhi16(_u_, _b_) bzhi31(_u_, _b_) 252 | #define bzhi8( _u_, _b_) bzhi31(_u_, _b_) 253 | 254 | 255 | #define SIZE_ROUNDUP(_n_, _a_) (((size_t)(_n_) + (size_t)((_a_) - 1)) & ~(size_t)((_a_) - 1)) 256 | #define ALIGN_DOWN(__ptr, __a) ((void *)((uintptr_t)(__ptr) & ~(uintptr_t)((__a) - 1))) 257 | 258 | #define TEMPLATE2_(_x_, _y_) _x_##_y_ 259 | #define T2(_x_, _y_) TEMPLATE2_(_x_,_y_) 260 | 261 | #define TEMPLATE3_(_x_,_y_,_z_) _x_##_y_##_z_ 262 | #define T3(_x_,_y_,_z_) TEMPLATE3_(_x_, _y_, _z_) 263 | 264 | #define CACHE_LINE_SIZE 64 265 | #define PREFETCH_DISTANCE (CACHE_LINE_SIZE*4) 266 | 267 | #define CLAMP(_x_, _low_, _high_) (((_x_) > (_high_)) ? (_high_) : (((_x_) < (_low_)) ? (_low_) : (_x_))) 268 | 269 | //--- NDEBUG ------- 270 | #include 271 | #ifdef _MSC_VER 272 | #ifdef NDEBUG 273 | #define AS(expr, fmt, ...) 274 | #define AC(expr, fmt, ...) do { if(!(expr)) { fprintf(stderr, fmt, ##__VA_ARGS__ ); fflush(stderr); exit(-1); } } while(0) 275 | #define die(fmt, ...) do { fprintf(stderr, fmt, ##__VA_ARGS__ ); fflush(stderr); exit(-1); } while(0) 276 | #else 277 | #define AS(expr, fmt, ...) do { if(!(expr)) { fflush(stdout);fprintf(stderr, "%s:%s:%d:", __FILE__, __FUNCTION__, __LINE__); fprintf(stderr, fmt, ##__VA_ARGS__ ); fflush(stderr); exit(-1); } } while(0) 278 | #define AC(expr, fmt, ...) 
do { if(!(expr)) { fflush(stdout);fprintf(stderr, "%s:%s:%d:", __FILE__, __FUNCTION__, __LINE__); fprintf(stderr, fmt, ##__VA_ARGS__ ); fflush(stderr); exit(-1); } } while(0) 279 | #define die(fmt, ...) do { fprintf(stderr, "%s:%s:%d:", __FILE__, __FUNCTION__, __LINE__); fprintf(stderr, fmt, ##__VA_ARGS__ ); fflush(stderr); exit(-1); } while(0) 280 | #endif 281 | #else 282 | #ifdef NDEBUG 283 | #define AS(expr, fmt,args...) 284 | #define AC(expr, fmt,args...) do { if(!(expr)) { fprintf(stderr, fmt, ## args ); fflush(stderr); exit(-1); } } while(0) 285 | #define die(fmt,args...) do { fprintf(stderr, fmt, ## args ); fflush(stderr); exit(-1); } while(0) 286 | #else 287 | #define AS(expr, fmt,args...) do { if(!(expr)) { fflush(stdout);fprintf(stderr, "%s:%s:%d:", __FILE__, __FUNCTION__, __LINE__); fprintf(stderr, fmt, ## args ); fflush(stderr); exit(-1); } } while(0) 288 | #define AC(expr, fmt,args...) do { if(!(expr)) { fflush(stdout);fprintf(stderr, "%s:%s:%d:", __FILE__, __FUNCTION__, __LINE__); fprintf(stderr, fmt, ## args ); fflush(stderr); exit(-1); } } while(0) 289 | #define die(fmt,args...) do { fprintf(stderr, "%s:%s:%d:", __FILE__, __FUNCTION__, __LINE__); fprintf(stderr, fmt, ## args ); fflush(stderr); exit(-1); } while(0) 290 | #endif 291 | #endif 292 | -------------------------------------------------------------------------------- /sse_neon.h: -------------------------------------------------------------------------------- 1 | /** 2 | Copyright (C) powturbo 2013-2019 3 | GPL v2 License 4 | 5 | This program is free software; you can redistribute it and/or modify 6 | it under the terms of the GNU General Public License as published by 7 | the Free Software Foundation; either version 2 of the License, or 8 | (at your option) any later version. 9 | 10 | This program is distributed in the hope that it will be useful, 11 | but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | GNU General Public License for more details. 14 | 15 | You should have received a copy of the GNU General Public License along 16 | with this program; if not, write to the Free Software Foundation, Inc., 17 | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
18 | 19 | - homepage : https://sites.google.com/site/powturbo/ 20 | - github : https://github.com/powturbo 21 | - twitter : https://twitter.com/powturbo 22 | - email : powturbo [_AT_] gmail [_DOT_] com 23 | **/ 24 | // intel sse to arm neon 25 | 26 | #ifndef _SSE_NEON_H_ 27 | #define _SSE_NEON_H_ 28 | #include "conf.h" 29 | 30 | #ifdef __ARM_NEON //-------------------------------------------------------------------------------------------------- 31 | #include 32 | #define __m128i uint32x4_t 33 | 34 | //#define USE_MACROS 35 | #define uint8x16_to_8x8x2(_a_) ((uint8x8x2_t) { vget_low_u8(_a_), vget_high_u8(_a_) }) 36 | 37 | #ifdef USE_MACROS //---------------------------- Set : _mm_set_epi/_mm_set1_epi ---------------------------------------------------------- 38 | #define _mm_set_epi8(u15,u14,u13,u12,\ 39 | u11,u10, u9, u8,\ 40 | u7,u6,u5,u4,\ 41 | u3,u2,u1,u0) ({ uint8_t __attribute__((aligned(16))) _u[16] = { u0,u1,u2,u3,u4,u5,u6,u7,u8,u9,u10,u11,u12,u13,u14,u15 }; (uint32x4_t)vld1q_u8( _u);}) 42 | #define _mm_set_epi16( u7,u6,u5,u4,\ 43 | u3,u2,u1,u0) ({ uint16_t __attribute__((aligned(16))) _u[ 8] = { u0,u1,u2,u3,u4,u5,u6,u7 }; (uint32x4_t)vld1q_u16(_u);}) 44 | #define _mm_set_epi32( u3,u2,u1,u0) ({ uint32_t __attribute__((aligned(16))) _u[ 4] = { u0,u1,u2,u3 }; vld1q_u32(_u);}) 45 | #define _mm_set_epi64x( u1,u0) ({ uint64_t __attribute__((aligned(16))) _u[ 2] = { u0,u1 }; (uint32x4_t)vld1q_u64(_u);}) 46 | #define _mm_set_epi32(u3, u2, u1, u0) vcombine_u32(vcreate_u32((uint64_t)u1 << 32 | u0), vcreate_u32((uint64_t)u3 << 32 | u2)) 47 | #define _mm_set_epi64x(u1, u0) (__m128i)vcombine_u64(vcreate_u64(u0), vcreate_u64(u1)) 48 | #else 49 | static ALWAYS_INLINE __m128i _mm_set_epi8( uint8_t u15, uint8_t u14, uint8_t u13, uint8_t u12, uint8_t u11, uint8_t u10, uint8_t u9, uint8_t u8, 50 | uint8_t u7, uint8_t u6, uint8_t u5, uint8_t u4, 51 | uint8_t u3, uint8_t u2, uint8_t u1, uint8_t u0) { 52 | uint8_t __attribute__((aligned(16))) u[16] = { u0,u1,u2,u3,u4,u5,u6,u7,u8,u9,u10,u11,u12,u13,u14,u15 }; return (uint32x4_t)vld1q_u8( u); } 53 | static ALWAYS_INLINE __m128i _mm_set_epi16( uint16_t u7, uint16_t u6, uint16_t u5, uint16_t u4, 54 | uint16_t u3, uint16_t u2, uint16_t u1, uint16_t u0) { uint16_t __attribute__((aligned(16))) u[ 8] = { u0,u1,u2,u3,u4,u5,u6,u7 }; return (uint32x4_t)vld1q_u16(u); } 55 | static ALWAYS_INLINE __m128i _mm_set_epi32( uint32_t u3, uint32_t u2, uint32_t u1, uint32_t u0) { uint32_t __attribute__((aligned(16))) u[ 4] = { u0,u1,u2,u3 }; return vld1q_u32(u); } 56 | static ALWAYS_INLINE __m128i _mm_set_epi64x( uint64_t u1, uint64_t u0) { uint64_t __attribute__((aligned(16))) u[ 2] = { u0,u1 }; return (uint32x4_t)vld1q_u64(u); } 57 | #endif 58 | 59 | #define _mm_set1_epi8( _u8_ ) (__m128i)vdupq_n_u8( _u8_ ) 60 | #define _mm_set1_epi16( _u16_) (__m128i)vdupq_n_u16(_u16_) 61 | #define _mm_set1_epi32( _u32_) vdupq_n_u32(_u32_) 62 | #define _mm_set1_epi64x(_u64_) (__m128i)vdupq_n_u64(_u64_) 63 | #define _mm_setzero_si128() vdupq_n_u32( 0 ) 64 | //---------------------------------------------- Arithmetic ----------------------------------------------------------------------- 65 | #define _mm_add_epi8( _a_,_b_) (__m128i)vaddq_u8((uint8x16_t)(_a_), (uint8x16_t)(_b_)) 66 | #define _mm_add_epi16( _a_,_b_) (__m128i)vaddq_u16((uint16x8_t)(_a_), (uint16x8_t)(_b_)) 67 | #define _mm_add_epi32( _a_,_b_) vaddq_u32( _a_, _b_ ) 68 | #define _mm_sub_epi16( _a_,_b_) (__m128i)vsubq_u16((uint16x8_t)(_a_), (uint16x8_t)(_b_)) 69 | #define _mm_sub_epi32( _a_,_b_) 
(__m128i)vsubq_u32((uint32x4_t)(_a_), (uint32x4_t)(_b_)) 70 | #define _mm_subs_epu8( _a_,_b_) (__m128i)vqsubq_u8((uint8x16_t)(_a_), (uint8x16_t)(_b_)) 71 | 72 | #define _mm_mullo_epi32(_a_,_b_) (__m128i)vmulq_s32(( int32x4_t)(_a_), ( int32x4_t)(_b_)) 73 | #define mm_mullo_epu32(_a_,_b_) vmulq_u32(_a_,_b_) 74 | #define _mm_mul_epu32( _a_,_b_) (__m128i)vmull_u32(vget_low_u32(_a_),vget_low_u32(_b_)) 75 | #define _mm_adds_epu16( _a_,_b_) (__m128i)vqaddq_u16((uint16x8_t)(_a_),(uint16x8_t)(_b_)) 76 | static ALWAYS_INLINE __m128i _mm_madd_epi16(__m128i a, __m128i b) { 77 | int32x4_t mlo = vmull_s16(vget_low_s16( (int16x8_t)a), vget_low_s16( (int16x8_t)b)); 78 | int32x4_t mhi = vmull_s16(vget_high_s16((int16x8_t)a), vget_high_s16((int16x8_t)b)); 79 | int32x2_t alo = vpadd_s32(vget_low_s32(mlo), vget_high_s32(mlo)); 80 | int32x2_t ahi = vpadd_s32(vget_low_s32(mhi), vget_high_s32(mhi)); 81 | return (__m128i)vcombine_s32(alo, ahi); 82 | } 83 | //---------------------------------------------- Special math functions ----------------------------------------------------------- 84 | #define _mm_min_epu8( _a_,_b_) (__m128i)vminq_u8((uint8x16_t)(_a_), (uint8x16_t)(_b_)) 85 | #define _mm_min_epu16( _a_,_b_) (__m128i)vminq_u16((uint16x8_t)(_a_), (uint16x8_t)(_b_)) 86 | #define _mm_min_epi16( _a_,_b_) (__m128i)vminq_s16((int16x8_t)(_a_), (int16x8_t)(_b_)) 87 | //---------------------------------------------- Logical -------------------------------------------------------------------------- 88 | #define mm_testnz_epu32(_a_) vmaxvq_u32(_a_) //vaddvq_u32(_a_) 89 | #define mm_testnz_epu8(_a_) vmaxv_u8(_a_) 90 | #define _mm_or_si128( _a_,_b_) (__m128i)vorrq_u32( (uint32x4_t)(_a_), (uint32x4_t)(_b_)) 91 | #define _mm_and_si128( _a_,_b_) (__m128i)vandq_u32( (uint32x4_t)(_a_), (uint32x4_t)(_b_)) 92 | #define _mm_xor_si128( _a_,_b_) (__m128i)veorq_u32( (uint32x4_t)(_a_), (uint32x4_t)(_b_)) 93 | //---------------------------------------------- Shift ---------------------------------------------------------------------------- 94 | #define _mm_slli_epi16( _a_,_m_) (__m128i)vshlq_n_u16((uint16x8_t)(_a_), _m_) 95 | #define _mm_slli_epi32( _a_,_m_) (__m128i)vshlq_n_u32((uint32x4_t)(_a_), _m_) 96 | #define _mm_slli_epi64( _a_,_m_) (__m128i)vshlq_n_u64((uint64x2_t)(_a_), _m_) 97 | #define _mm_slli_si128( _a_,_m_) (__m128i)vextq_u8(vdupq_n_u8(0), (uint8x16_t)(_a_), 16 - (_m_) ) // _m_: 1 - 15 98 | 99 | #define _mm_srli_epi16( _a_,_m_) (__m128i)vshrq_n_u16((uint16x8_t)(_a_), _m_) 100 | #define _mm_srli_epi32( _a_,_m_) (__m128i)vshrq_n_u32((uint32x4_t)(_a_), _m_) 101 | #define _mm_srli_epi64( _a_,_m_) (__m128i)vshlq_n_u64((uint64x2_t)(_a_), _m_) 102 | #define _mm_srli_si128( _a_,_m_) (__m128i)vextq_s8((int8x16_t)(_a_), vdupq_n_s8(0), (_m_)) 103 | 104 | #define _mm_srai_epi16( _a_,_m_) (__m128i)vshrq_n_s16((int16x8_t)(_a_), _m_) 105 | #define _mm_srai_epi32( _a_,_m_) (__m128i)vshrq_n_s32((int32x4_t)(_a_), _m_) 106 | #define _mm_srai_epi64( _a_,_m_) (__m128i)vshrq_n_s64((int64x2_t)(_a_), _m_) 107 | 108 | #define _mm_sllv_epi32( _a_,_b_) (__m128i)vshlq_u32((uint32x4_t)(_a_), (uint32x4_t)(_b_)) 109 | #define _mm_srlv_epi32( _a_,_b_) (__m128i)vshlq_u32((uint32x4_t)(_a_), vnegq_s32((int32x4_t)(_b_))) 110 | //---------------------------------------------- Compare --------- true/false->1/0 (all bits set) --------------------------------- 111 | #define _mm_cmpeq_epi8( _a_,_b_) (__m128i)vceqq_s8( ( int8x16_t)(_a_), ( int8x16_t)(_b_)) 112 | #define _mm_cmpeq_epi16(_a_,_b_) (__m128i)vceqq_s16(( int16x8_t)(_a_), ( int16x8_t)(_b_)) 113 | 
#define _mm_cmpeq_epi32(_a_,_b_) (__m128i)vceqq_s32(( int32x4_t)(_a_), ( int32x4_t)(_b_)) 114 | 115 | #define _mm_cmpgt_epi16(_a_,_b_) (__m128i)vcgtq_s16(( int16x8_t)(_a_), ( int16x8_t)(_b_)) 116 | #define _mm_cmpgt_epi32(_a_,_b_) (__m128i)vcgtq_s32(( int32x4_t)(_a_), ( int32x4_t)(_b_)) 117 | 118 | #define _mm_cmpgt_epu16(_a_,_b_) (__m128i)vcgtq_u16((uint16x8_t)(_a_), (uint16x8_t)(_b_)) 119 | #define mm_cmpgt_epu32(_a_,_b_) (__m128i)vcgtq_u32( _a_, _b_) 120 | //---------------------------------------------- Load ----------------------------------------------------------------------------- 121 | #define _mm_loadl_epi64( _u64p_) (__m128i)vcombine_s32(vld1_s32((int32_t const *)(_u64p_)), vcreate_s32(0)) 122 | #define mm_loadu_epi64p( _u64p_,_a_) (__m128i)vld1q_lane_u64((uint64_t *)(_u64p_), (uint64x2_t)(_a_), 0) 123 | #define _mm_loadu_si128( _ip_) vld1q_u32(_ip_) 124 | #define _mm_load_si128( _ip_) vld1q_u32(_ip_) 125 | //---------------------------------------------- Store ---------------------------------------------------------------------------- 126 | #define _mm_storel_epi64(_ip_,_a_) vst1q_lane_u64((uint64_t *)(_ip_), (uint64x2_t)(_a_), 0) 127 | #define _mm_storeu_si128(_ip_,_a_) vst1q_u32((__m128i *)(_ip_),_a_) 128 | //---------------------------------------------- Convert -------------------------------------------------------------------------- 129 | #define mm_cvtsi64_si128p(_u64p_,_a_) mm_loadu_epi64p(_u64p_,_a_) 130 | #define _mm_cvtsi64_si128(_a_) (__m128i)vdupq_n_u64(_a_) //vld1q_s64(_a_) 131 | //---------------------------------------------- Reverse bits/bytes --------------------------------------------------------------- 132 | #define mm_rbit_epi8(a) (__m128i)vrbitq_u8( (uint8x16_t)(a)) // reverse bits 133 | #define mm_rev_epi16(a) vrev16q_u8((uint8x16_t)(a)) // reverse bytes 134 | #define mm_rev_epi32(a) vrev32q_u8((uint8x16_t)(a)) 135 | #define mm_rev_epi64(a) vrev64q_u8((uint8x16_t)(a)) 136 | //--------------------------------------------- Insert/extract -------------------------------------------------------------------- 137 | #define mm_extract_epi32x(_a_,_u32_,_id_) vst1q_lane_u32((uint32_t *)&(_u32_), _a_, _id_) 138 | #define _mm_extract_epi64x(_a_,_u64_,_id_) vst1q_lane_u64((uint64_t *)&(_u64_), (uint64x2_t)(_a_), _id_) 139 | 140 | #define _mm_extract_epi8(_a_, _id_) vgetq_lane_u8( (uint8x16_t)(_a_), _id_) 141 | #define _mm_extract_epi16(_a_, _id_) vgetq_lane_u16(_a_, _id_) 142 | #define _mm_extract_epi32(_a_, _id_) vgetq_lane_u32(_a_, _id_) 143 | #define mm_extract_epu32(_a_, _id_) vgetq_lane_u32(_a_, _id_) 144 | #define _mm_cvtsi128_si32(_a_) vgetq_lane_u32((uint32x4_t)(_a_),0) 145 | #define _mm_cvtsi128_si64(_a_) vgetq_lane_u64((uint64x2_t)(_a_),0) 146 | 147 | #define _mm_insert_epu32p(_a_,_u32p_,_id_) vsetq_lane_u32(_x_, _a_, _id_) 148 | #define mm_insert_epi32p(_a_,_u32p_,_id_) vld1q_lane_u32(_u32p_, (uint32x4_t)(_a_), _id_) 149 | #define _mm_cvtsi32_si128(_a_) (__m128i)vsetq_lane_s32(_a_, vdupq_n_s32(0), 0) 150 | 151 | #define _mm_blendv_epi8(_a_,_b_,_m_) vbslq_u32(_m_,_b_,_a_) 152 | //---------------------------------------------- Miscellaneous -------------------------------------------------------------------- 153 | #define _mm_alignr_epi8(_a_,_b_,_m_) (__m128i)vextq_u8( (uint8x16_t)(_b_), (uint8x16_t)(_a_), _m_) 154 | #define _mm_packs_epi16( _a_,_b_) (__m128i)vcombine_s8( vqmovn_s16((int16x8_t)(_a_)), vqmovn_s16((int16x8_t)(_b_))) 155 | #define _mm_packs_epi32( _a_,_b_) (__m128i)vcombine_s16(vqmovn_s32((int32x4_t)(_a_)), vqmovn_s32((int32x4_t)(_b_))) 156 | 
157 | #define _mm_packs_epu16( _a_,_b_) (__m128i)vcombine_u8(vqmovn_u16((uint16x8_t)(_a_)), vqmovn_u16((uint16x8_t)(_b_))) 158 | #define _mm_packus_epi16( _a_,_b_) (__m128i)vcombine_u8(vqmovun_s16((int16x8_t)(_a_)), vqmovun_s16((int16x8_t)(_b_))) 159 | 160 | static ALWAYS_INLINE uint16_t _mm_movemask_epi8(__m128i v) { 161 | const uint8x16_t __attribute__ ((aligned (16))) m = {1, 1<<1, 1<<2, 1<<3, 1<<4, 1<<5, 1<<6, 1<<7, 1, 1<<1, 1<<2, 1<<3, 1<<4, 1<<5, 1<<6, 1<<7}; 162 | uint8x16_t mv = (uint8x16_t)vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(vandq_u8(vcltq_s8((int8x16_t)v, vdupq_n_s8(0)), m)))); 163 | return vgetq_lane_u8(mv, 8) << 8 | vgetq_lane_u8(mv, 0); 164 | } 165 | //-------- Neon movemask ------ All lanes must be 0 or -1 (=0xff, 0xffff or 0xffffffff) 166 | #ifdef __aarch64__ 167 | static ALWAYS_INLINE uint8_t mm_movemask_epi8s(uint8x8_t sv) { const uint8x8_t m = { 1, 1<<1, 1<<2, 1<<3, 1<<4, 1<< 5, 1<< 6, 1<<7 }; return vaddv_u8( vand_u8( sv, m)); } // short only ARM 168 | //static ALWAYS_INLINE uint16_t mm_movemask_epu16(uint32x4_t v) { const uint16x8_t m = { 1, 1<<2, 1<<4, 1<<6, 1<<8, 1<<10, 1<<12, 1<<14}; return vaddvq_u16(vandq_u16((uint16x8_t)v, m)); } 169 | static ALWAYS_INLINE uint16_t mm_movemask_epu16(__m128i v) { const uint16x8_t m = { 1, 1<<1, 1<<2, 1<<3, 1<<4, 1<< 5, 1<< 6, 1<<7 }; return vaddvq_u16(vandq_u16((uint16x8_t)v, m)); } 170 | static ALWAYS_INLINE uint32_t mm_movemask_epu32(__m128i v) { const uint32x4_t m = { 1, 1<<1, 1<<2, 1<<3 }; return vaddvq_u32(vandq_u32((uint32x4_t)v, m)); } 171 | static ALWAYS_INLINE uint64_t mm_movemask_epu64(__m128i v) { const uint64x2_t m = { 1, 1<<1 }; return vaddvq_u64(vandq_u64((uint64x2_t)v, m)); } 172 | #else 173 | static ALWAYS_INLINE uint32_t mm_movemask_epu32(uint32x4_t v) { const uint32x4_t mask = {1,2,4,8}, av = vandq_u32(v, mask), xv = vextq_u32(av, av, 2), ov = vorrq_u32(av, xv); return vgetq_lane_u32(vorrq_u32(ov, vextq_u32(ov, ov, 3)), 0); } 174 | #endif 175 | // --------------------------------------------- Swizzle : _mm_shuffle_epi8 / _mm_shuffle_epi32 / Pack/Unpack ----------------------------------------- 176 | #define _MM_SHUFFLE(u3,u2,u1,u0) ((u3) << 6 | (u2) << 4 | (u1) << 2 | (u0)) 177 | 178 | #define _mm_shuffle_epi8(_a_, _b_) (__m128i)vqtbl1q_u8((uint8x16_t)(_a_), (uint8x16_t)(_b_)) 179 | #if defined(__aarch64__) 180 | #define mm_shuffle_nnnn_epi32(_a_,_m_) (__m128i)vdupq_laneq_u32(_a_, _m_) 181 | #else 182 | #define mm_shuffle_nnnn_epi32(_a_,_m_) (__m128i)vdupq_n_u32(vgetq_lane_u32(_a_, _m_)) 183 | #endif 184 | 185 | #ifdef USE_MACROS 186 | #define mm_shuffle_2031_epi32(_a_) ({ uint32x4_t _zv = (uint32x4_t)vrev64q_u32(_a_); uint32x2x2_t _zw = vtrn_u32(vget_low_u32(_zv), vget_high_u32(_zv)); vcombine_u32(_zw.val[0], _zw.val[1]);}) 187 | #define mm_shuffle_3120_epi32(_a_) ({ uint32x4_t _zv = _a_; uint32x2x2_t _zw = vtrn_u32(vget_low_u32(_zv), vget_high_u32(_zv)); vcombine_u32(_zw.val[0], _zw.val[1]);}) 188 | #else 189 | static ALWAYS_INLINE __m128i mm_shuffle_2031_epi32(__m128i a) { uint32x4_t v = (uint32x4_t)vrev64q_u32(a); uint32x2x2_t z = vtrn_u32(vget_low_u32(v), vget_high_u32(v)); return vcombine_u32(z.val[0], z.val[1]);} 190 | static ALWAYS_INLINE __m128i mm_shuffle_3120_epi32(__m128i a) { uint32x2x2_t z = vtrn_u32(vget_low_u32(a), vget_high_u32(a)); return vcombine_u32(z.val[0], z.val[1]);} 191 | #endif 192 | 193 | #if defined(USE_MACROS) || defined(__clang__) 194 | #define _mm_shuffle_epi32(_a_, _m_) ({ const uint32x4_t _av =_a_;\ 195 | uint32x4_t _v = vmovq_n_u32(vgetq_lane_u32(_av, (_m_) & 0x3));\ 196 | _v = vsetq_lane_u32(vgetq_lane_u32(_av, ((_m_)
>> 2) & 0x3), _v, 1);\ 197 | _v = vsetq_lane_u32(vgetq_lane_u32(_av, ((_m_) >> 4) & 0x3), _v, 2);\ 198 | _v = vsetq_lane_u32(vgetq_lane_u32(_av, ((_m_) >> 6) & 0x3), _v, 3); _v;\ 199 | }) 200 | #define _mm_shuffle_epi32s(_a_, _m_) _mm_set_epi32(vgetq_lane_u32(_a_, ((_m_) ) & 0x3),\ 201 | vgetq_lane_u32(_a_, ((_m_) >> 2) & 0x3),\ 202 | vgetq_lane_u32(_a_, ((_m_) >> 4) & 0x3),\ 203 | vgetq_lane_u32(_a_, ((_m_) >> 6) & 0x3)) 204 | #else 205 | static ALWAYS_INLINE __m128i _mm_shuffle_epi32(__m128i _a_, const unsigned _m_) { const uint32x4_t _av =_a_; 206 | uint32x4_t _v = vmovq_n_u32(vgetq_lane_u32(_av, (_m_) & 0x3)); 207 | _v = vsetq_lane_u32(vgetq_lane_u32(_av, ((_m_) >> 2) & 0x3), _v, 1); 208 | _v = vsetq_lane_u32(vgetq_lane_u32(_av, ((_m_) >> 4) & 0x3), _v, 2); 209 | _v = vsetq_lane_u32(vgetq_lane_u32(_av, ((_m_) >> 6) & 0x3), _v, 3); 210 | return _v; 211 | } 212 | static ALWAYS_INLINE __m128i _mm_shuffle_epi32s(__m128i _a_, const unsigned _m_) { 213 | return _mm_set_epi32(vgetq_lane_u32(_a_, ((_m_) ) & 0x3), 214 | vgetq_lane_u32(_a_, ((_m_) >> 2) & 0x3), 215 | vgetq_lane_u32(_a_, ((_m_) >> 4) & 0x3), 216 | vgetq_lane_u32(_a_, ((_m_) >> 6) & 0x3)); 217 | } 218 | #endif 219 | #ifdef USE_MACROS 220 | #define _mm_unpacklo_epi8( _a_,_b_) ({ uint8x8x2_t _zv = vzip_u8 ( vget_low_u8( (uint8x16_t)(_a_)), vget_low_u8 ((uint8x16_t)(_b_))); (uint32x4_t)vcombine_u8( _zv.val[0], _zv.val[1]);}) 221 | #define _mm_unpacklo_epi16(_a_,_b_) ({ uint16x4x2_t _zv = vzip_u16( vget_low_u16((uint16x8_t)(_a_)), vget_low_u16((uint16x8_t)(_b_))); (uint32x4_t)vcombine_u16(_zv.val[0], _zv.val[1]);}) 222 | #define _mm_unpacklo_epi32(_a_,_b_) ({ uint32x2x2_t _zv = vzip_u32( vget_low_u32( _a_ ), vget_low_u32( _b_ )); vcombine_u32(_zv.val[0], _zv.val[1]);}) 223 | #define _mm_unpacklo_epi64(_a_,_b_) (uint32x4_t)vcombine_u64(vget_low_u64((uint64x2_t)(_a_)), vget_low_u64((uint64x2_t)(_b_))) 224 | 225 | #define _mm_unpackhi_epi8( _a_,_b_) ({ uint8x8x2_t _zv = vzip_u8 (vget_high_u8( (uint8x16_t)(_a_)), vget_high_u8( (uint8x16_t)(_b_))); (uint32x4_t)vcombine_u8( _zv.val[0], _zv.val[1]);}) 226 | #define _mm_unpackhi_epi16(_a_,_b_) ({ uint16x4x2_t _zv = vzip_u16(vget_high_u16((uint16x8_t)(_a_)), vget_high_u16((uint16x8_t)(_b_))); (uint32x4_t)vcombine_u16(_zv.val[0], _zv.val[1]);}) 227 | #define _mm_unpackhi_epi32(_a_,_b_) ({ uint32x2x2_t _zv = vzip_u32(vget_high_u32( _a_ ), vget_high_u32( _b_ )); vcombine_u32(_zv.val[0], _zv.val[1]);}) 228 | #define _mm_unpackhi_epi64(_a_,_b_) (uint32x4_t)vcombine_u64(vget_high_u64((uint64x2_t)(_a_)), vget_high_u64((uint64x2_t)(_b_))) 229 | #else 230 | static ALWAYS_INLINE __m128i _mm_unpacklo_epi8( __m128i _a_, __m128i _b_) { uint8x8x2_t _zv = vzip_u8 ( vget_low_u8( (uint8x16_t)(_a_)), vget_low_u8 ((uint8x16_t)(_b_))); return (uint32x4_t)vcombine_u8( _zv.val[0], _zv.val[1]);} 231 | static ALWAYS_INLINE __m128i _mm_unpacklo_epi16(__m128i _a_, __m128i _b_) { uint16x4x2_t _zv = vzip_u16( vget_low_u16((uint16x8_t)(_a_)), vget_low_u16((uint16x8_t)(_b_))); return (uint32x4_t)vcombine_u16(_zv.val[0], _zv.val[1]);} 232 | static ALWAYS_INLINE __m128i _mm_unpacklo_epi32(__m128i _a_, __m128i _b_) { uint32x2x2_t _zv = vzip_u32( vget_low_u32( _a_ ), vget_low_u32( _b_ )); return vcombine_u32(_zv.val[0], _zv.val[1]);} 233 | static ALWAYS_INLINE __m128i _mm_unpacklo_epi64(__m128i _a_, __m128i _b_) { return (uint32x4_t)vcombine_u64(vget_low_u64((uint64x2_t)(_a_)), vget_low_u64((uint64x2_t)(_b_))); } 234 | 235 | static ALWAYS_INLINE __m128i _mm_unpackhi_epi8( __m128i _a_, __m128i _b_) { uint8x8x2_t _zv = vzip_u8 
(vget_high_u8( (uint8x16_t)(_a_)), vget_high_u8( (uint8x16_t)(_b_))); return (uint32x4_t)vcombine_u8( _zv.val[0], _zv.val[1]); } 236 | static ALWAYS_INLINE __m128i _mm_unpackhi_epi16(__m128i _a_, __m128i _b_) { uint16x4x2_t _zv = vzip_u16(vget_high_u16((uint16x8_t)(_a_)), vget_high_u16((uint16x8_t)(_b_))); return (uint32x4_t)vcombine_u16(_zv.val[0], _zv.val[1]); } 237 | static ALWAYS_INLINE __m128i _mm_unpackhi_epi32(__m128i _a_, __m128i _b_) { uint32x2x2_t _zv = vzip_u32(vget_high_u32( _a_ ), vget_high_u32( _b_ )); return vcombine_u32(_zv.val[0], _zv.val[1]); } 238 | static ALWAYS_INLINE __m128i _mm_unpackhi_epi64(__m128i _a_, __m128i _b_) { return (uint32x4_t)vcombine_u64(vget_high_u64((uint64x2_t)(_a_)), vget_high_u64((uint64x2_t)(_b_))); } 239 | #endif 240 | 241 | #else //------------------------------------- intel SSE2/SSSE3 -------------------------------------------------------------- 242 | #define mm_movemask_epu32(_a_) _mm_movemask_ps(_mm_castsi128_ps(_a_)) 243 | #define mm_movemask_epu16(_a_) _mm_movemask_epi8(_a_) 244 | #define mm_loadu_epi64p( _u64p_,_a_) _a_ = _mm_cvtsi64_si128(ctou64(_u64p_)) 245 | 246 | #define mm_extract_epu32( _a_, _id_) _mm_extract_epi32(_a_, _id_) 247 | #define mm_extract_epi32x(_a_,_u32_, _id_) _u32_ = _mm_extract_epi32(_a_, _id_) 248 | #define mm_extract_epi64x(_a_,_u64_, _id_) _u64_ = _mm_extract_epi64(_a_, _id_) 249 | #define mm_insert_epi32p( _a_,_u32p_,_c_) _mm_insert_epi32( _a_,ctou32(_u32p_),_c_) 250 | 251 | #define mm_mullo_epu32( _a_,_b_) _mm_mullo_epi32(_a_,_b_) 252 | #define mm_cvtsi64_si128p(_u64p_,_a_) _a_ = _mm_cvtsi64_si128(ctou64(_u64p_)) 253 | 254 | #define mm_cmpgt_epu32( _a_, _b_) _mm_cmpgt_epi32(_mm_xor_si128(_a_, cv80000000), _mm_xor_si128(_b_, cv80000000)) 255 | 256 | #define mm_shuffle_nnnn_epi32(_a_, _n_) _mm_shuffle_epi32(_a_, _MM_SHUFFLE(_n_,_n_,_n_,_n_)) 257 | #define mm_shuffle_2031_epi32(_a_) _mm_shuffle_epi32(_a_, _MM_SHUFFLE(2,0,3,1)) 258 | #define mm_shuffle_3120_epi32(_a_) _mm_shuffle_epi32(_a_, _MM_SHUFFLE(3,1,2,0)) 259 | 260 | static ALWAYS_INLINE __m128i mm_rbit_epi8(__m128i v) { // reverse bits in bytes 261 | __m128i fv = _mm_set_epi8(15, 7,11, 3,13, 5, 9, 1,14, 6,10, 2,12, 4, 8, 0), cv0f_8 = _mm_set1_epi8(0xf); 262 | __m128i lv = _mm_shuffle_epi8(fv,_mm_and_si128( v, cv0f_8)); 263 | __m128i hv = _mm_shuffle_epi8(fv,_mm_and_si128(_mm_srli_epi64(v, 4), cv0f_8)); 264 | return _mm_or_si128(_mm_slli_epi64(lv,4), hv); 265 | } 266 | 267 | static ALWAYS_INLINE __m128i mm_rev_epi16(__m128i v) { return _mm_shuffle_epi8(v, _mm_set_epi8(14,15,12,13,10,11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1)); } // reverse vector bytes in uint??_t 268 | static ALWAYS_INLINE __m128i mm_rev_epi32(__m128i v) { return _mm_shuffle_epi8(v, _mm_set_epi8(12,13,14,15, 8, 9,10,11, 4, 5, 6, 7, 0, 1, 2, 3)); } 269 | static ALWAYS_INLINE __m128i mm_rev_epi64(__m128i v) { return _mm_shuffle_epi8(v, _mm_set_epi8( 8, 9,10,11,12,13,14,15, 0, 1, 2, 3, 4, 5, 6, 7)); } 270 | static ALWAYS_INLINE __m128i mm_rev_si128(__m128i v) { return _mm_shuffle_epi8(v, _mm_set_epi8( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15)); } 271 | #endif 272 | #endif 273 | -------------------------------------------------------------------------------- /turbohist_.c: -------------------------------------------------------------------------------- 1 | /** 2 | Copyright (c) 2013-2022, Powturbo 3 | - homepage : https://sites.google.com/site/powturbo/ 4 | - github : https://github.com/powturbo 5 | - twitter : https://twitter.com/powturbo 6 | - email : powturbo [_AT_] gmail [_DOT_] com 7 | All 
rights reserved. 8 | 9 | Redistribution and use in source and binary forms, with or without 10 | modification, are permitted provided that the following conditions are 11 | met: 12 | 13 | 1. Redistributions of source code must retain the above copyright 14 | notice, this list of conditions and the following disclaimer. 15 | 16 | 2. Redistributions in binary form must reproduce the above copyright 17 | notice, this list of conditions and the following disclaimer in the 18 | documentation and/or other materials provided with the distribution. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS 21 | IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 22 | TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A 23 | PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 24 | HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 25 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED 26 | TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 27 | PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 28 | LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 29 | NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 30 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 31 | **/ 32 | // 1D Histogram: hist[r]_X_Y r:run aware X: number bins used, Y: processing unit 1:8 bits, 4:32 bits, 8:64 bits 33 | #include "conf.h" 34 | #ifdef __ARM_NEON 35 | #define PREFETCH(_ip_,_rw_) 36 | #else 37 | #define PREFETCH(_ip_,_rw_) __builtin_prefetch(_ip_,_rw_) 38 | #endif 39 | 40 | #define CSIZE (256 + 8) 41 | typedef unsigned cnt_t; 42 | 43 | #if 1 // fast when auto-vectorization enabled (for ex. 
with gcc -O3) 44 | #define HISTEND(_c_,_cn_,_cnt_) { int _i,_j;\ 45 | memset(_cnt_, 0, 256*sizeof(_cnt_[0]));\ 46 | for(_i=0; _i < 256; _i++)\ 47 | for(_j=0; _j < _cn_;_j++) _cnt_[_i] += _c_[_j][_i];\ 48 | } 49 | 50 | #define HISTEND8(_c_,_cnt_) HISTEND(_c_,8,_cnt_) 51 | #define HISTEND4(_c_,_cnt_) HISTEND(_c_,4,_cnt_) 52 | #else 53 | static ALWAYS_INLINE void histend4(cnt_t c[4][CSIZE], cnt_t *__restrict cnt) { unsigned i; 54 | #ifdef __AVX2__ 55 | for(i = 0; i != 256; i+=8) { 56 | __m256i sv = _mm256_load_si256((const __m256i *)&c[0][i]); 57 | sv = _mm256_add_epi32(_mm256_load_si256((const __m256i *)&c[1][i]), sv); 58 | sv = _mm256_add_epi32(_mm256_load_si256((const __m256i *)&c[2][i]), sv); 59 | sv = _mm256_add_epi32(_mm256_load_si256((const __m256i *)&c[3][i]), sv); 60 | _mm256_storeu_si256((__m256i *)&cnt[i], sv); 61 | } 62 | #elif defined(__SSE2__) || defined(__ARM_NEON) 63 | for(i = 0; i != 256; i+=4) { 64 | __m128i sv = _mm_load_si128((const __m128i *)&c[0][i]); 65 | sv = _mm_add_epi32(_mm_load_si128((const __m128i *)&c[1][i]), sv); 66 | sv = _mm_add_epi32(_mm_load_si128((const __m128i *)&c[2][i]), sv); 67 | sv = _mm_add_epi32(_mm_load_si128((const __m128i *)&c[3][i]), sv); 68 | _mm_storeu_si128((__m128i *)&cnt[i], sv); 69 | } 70 | #else 71 | for(i = 0; i != 256; i++) cnt[i] = c[0][i]+c[1][i]+c[2][i]+c[3][i]; 72 | #endif 73 | } 74 | 75 | static ALWAYS_INLINE void histend8(cnt_t c[8][CSIZE], cnt_t *__restrict cnt) { unsigned i; 76 | #ifdef __AVX2__ 77 | for(i = 0; i != 256; i+=8) { 78 | __m256i v0 = _mm256_load_si256((const __m256i *)&c[0][i]); 79 | __m256i v1 = _mm256_load_si256((const __m256i *)&c[1][i]); 80 | __m256i s0 = _mm256_add_epi32(v0, v1); 81 | v0 = _mm256_load_si256((const __m256i *)&c[2][i]); 82 | v1 = _mm256_load_si256((const __m256i *)&c[3][i]); 83 | __m256i s1 = _mm256_add_epi32(v0, v1); 84 | s0 = _mm256_add_epi32(s0, s1); 85 | 86 | v0 = _mm256_load_si256((const __m256i *)&c[4][i]); 87 | v1 = _mm256_load_si256((const __m256i *)&c[5][i]); 88 | s1 = _mm256_add_epi32(v0, v1); 89 | v0 = _mm256_load_si256((const __m256i *)&c[6][i]); 90 | v1 = _mm256_load_si256((const __m256i *)&c[7][i]); 91 | s0 = _mm256_add_epi32(s0, v0); 92 | s1 = _mm256_add_epi32(s1, v1); 93 | 94 | _mm256_storeu_si256((__m256i *)&cnt[i], _mm256_add_epi32(s0, s1)); 95 | } 96 | #elif defined(__SSE2__) || defined(__ARM_NEON) 97 | for(i = 0; i != 256; i+=4) { 98 | __m128i v0 = _mm_load_si128((const __m128i *)&c[0][i]); 99 | __m128i v1 = _mm_load_si128((const __m128i *)&c[1][i]); 100 | __m128i sv = _mm_add_epi32(v0, v1); 101 | v0 = _mm_load_si128((const __m128i *)&c[2][i]); 102 | v1 = _mm_load_si128((const __m128i *)&c[3][i]); 103 | sv = _mm_add_epi32(sv, v0); 104 | sv = _mm_add_epi32(sv, v1); 105 | 106 | v0 = _mm_load_si128((const __m128i *)&c[4][i]); 107 | v1 = _mm_load_si128((const __m128i *)&c[5][i]); 108 | sv = _mm_add_epi32(sv, v0); 109 | sv = _mm_add_epi32(sv, v1); 110 | v0 = _mm_load_si128((const __m128i *)&c[6][i]); 111 | v1 = _mm_load_si128((const __m128i *)&c[7][i]); 112 | sv = _mm_add_epi32(sv, v0); 113 | _mm_storeu_si128((__m128i *)&cnt[i], _mm_add_epi32(sv, v1)); 114 | } 115 | #else 116 | for(i = 0; i != 256; i++) cnt[i] = c[0][i]+c[1][i]+c[2][i]+c[3][i]+c[4][i]+c[5][i]+c[6][i]+c[7][i]; 117 | #endif 118 | } 119 | 120 | #define HISTEND8(_c_,_cnt_) histend8(_c_,_cnt_) 121 | #define HISTEND4(_c_,_cnt_) histend4(_c_,_cnt_) 122 | #endif 123 | 124 | //---------------------------- 8 bits ------------------------------------------------------ 125 | static void hist_1_8(unsigned char *__restrict in,
unsigned inlen, cnt_t *__restrict cnt) { 126 | unsigned char *ip = in; 127 | 128 | memset(cnt, 0, 256*sizeof(cnt[0])); 129 | while(ip < in+inlen) cnt[*ip++]++; 130 | } 131 | 132 | static void hist_4_8(unsigned char *__restrict in, unsigned inlen, cnt_t *__restrict cnt) { 133 | cnt_t c[4][CSIZE] = {0},i; 134 | unsigned char *ip = in; 135 | 136 | while(ip != in+(inlen&~(4-1))) c[0][*ip++]++, c[1][*ip++]++, c[2][*ip++]++, c[3][*ip++]++; 137 | while(ip != in+ inlen ) c[0][*ip++]++; 138 | HISTEND4(c, cnt); 139 | } 140 | 141 | static void hist_8_8(unsigned char *__restrict in, unsigned inlen, cnt_t *__restrict cnt) { 142 | cnt_t c[8][CSIZE] = {0},i; 143 | unsigned char *ip = in; 144 | 145 | while(ip != in+(inlen&~(8-1))) c[0][*ip++]++, c[1][*ip++]++, c[2][*ip++]++, c[3][*ip++]++, c[4][*ip++]++, c[5][*ip++]++, c[6][*ip++]++, c[7][*ip++]++; 146 | while(ip != in+ inlen ) c[0][*ip++]++; 147 | HISTEND8(c, cnt); 148 | } 149 | 150 | //----------------------------- 32 bits -------------------------------------------------------- 151 | #if defined(__i386__) || defined(__x86_64__) 152 | #define CU32(_u_,_i_,_c_) {\ 153 | c[_i_+0][(unsigned char )(_u_) ]+=_c_;\ 154 | c[_i_+1][(unsigned short)(_u_)>>8]+=_c_; _u_>>=16;\ 155 | c[_i_+2][(unsigned char )(_u_) ]+=_c_;\ 156 | c[_i_+3][(unsigned short)(_u_)>>8]+=_c_;\ 157 | } 158 | #else 159 | #define CU32(_u_,_i_,_c_) {\ 160 | c[_i_+0][(unsigned char) (_u_) ]+=_c_;\ 161 | c[_i_+1][(unsigned char)((_u_)>> 8)]+=_c_;\ 162 | c[_i_+2][(unsigned char)((_u_)>>16)]+=_c_;\ 163 | c[_i_+3][ (_u_)>>24 ]+=_c_;\ 164 | } 165 | #endif 166 | 167 | #define UZ 4 // Load size 1x 32 bits = 4 bytes 168 | #define I132(_i_,_o_) { unsigned u1 = ctou32(ip+UZ+_i_*UZ*2+0); CU32(u0, 0, 1);\ 169 | u0 = ctou32(ip+UZ+_i_*UZ*2+4); CU32(u1,_o_,1);\ 170 | } 171 | 172 | #define N32 32 173 | static void hist_4_32(unsigned char *__restrict in, unsigned inlen, cnt_t *__restrict cnt) { 174 | #define IC 0 175 | cnt_t c[4][CSIZE] = {0}, i; 176 | unsigned char *ip = in; 177 | 178 | if(inlen >= UZ+N32) { 179 | unsigned u0 = ctou32(ip); 180 | for(; ip <= in+inlen-(UZ+N32); ip += N32) { 181 | I132(0,IC); I132(1,IC); I132(2,IC); I132(3,IC); 182 | PREFETCH(ip+512, 0); 183 | } 184 | } 185 | while(ip != in+inlen) c[0][*ip++]++; 186 | HISTEND4(c, cnt); 187 | } 188 | 189 | static void hist_8_32(unsigned char *__restrict in, unsigned inlen, unsigned *__restrict cnt) { 190 | #define IC 4 191 | cnt_t c[8][CSIZE] = {0}, i; 192 | unsigned char *ip = in; 193 | 194 | if(inlen >= UZ+N32) { 195 | unsigned u0 = ctou32(ip); 196 | for(; ip <= in+inlen-(UZ+N32); ip += N32) { 197 | I132(0,IC); I132(1,IC); I132(2,IC); I132(3,IC); //I132(4,IC); I132(5,IC); I132(6,IC); I132(7,IC); 198 | PREFETCH(ip+512, 0); 199 | } 200 | } 201 | while(ip != in+inlen) c[0][*ip++]++; 202 | HISTEND8(c, cnt); 203 | } 204 | 205 | //-------------------- 64 bits --------------------------------------------------- 206 | #if defined(__i386__) || defined(__x86_64__) 207 | #define CU64(_u_,_o_,_c_) { unsigned _x = _u_;\ 208 | c[0 ][(unsigned char )_x ]+=_c_;\ 209 | c[1 ][(unsigned short)_x>> 8]+=_c_; _x>>=16;\ 210 | c[2 ][(unsigned char )_x ]+=_c_;\ 211 | c[3 ][(unsigned short)_x>> 8]+=_c_; _x=(_u_)>>=32;\ 212 | c[0+_o_][(unsigned char )_x ]+=_c_;\ 213 | c[1+_o_][(unsigned short)_x>> 8]+=_c_; _x>>=16;\ 214 | c[2+_o_][(unsigned char )_x ]+=_c_;\ 215 | c[3+_o_][(unsigned short)_x>> 8]+=_c_;\ 216 | } 217 | #else 218 | #define CU64(_u_,_o_,_c_) { unsigned _x = _u_;\ 219 | c[0 ][(unsigned char) _x ]+=_c_;\ 220 | c[1 ][(unsigned char)(_x>> 8)]+=_c_;\ 221 | c[2 
][(unsigned char)(_x>>16)]+=_c_;\ 222 | c[3 ][ _x>>24 ]+=_c_; _x=(_u_)>>=32;\ 223 | c[0+_o_][(unsigned char) _x ]+=_c_;\ 224 | c[1+_o_][(unsigned char)(_x>> 8)]+=_c_;\ 225 | c[2+_o_][(unsigned char)(_x>>16)]+=_c_;\ 226 | c[3+_o_][ _x>>24 ]+=_c_;\ 227 | } 228 | #endif 229 | 230 | #define UZ 8 // Load size 1x 64 bits = 8 bytes 231 | #define I164(_i_,_o_) { uint64_t u1 = ctou64(ip+UZ+_i_*UZ*2+ 0); CU64(u0, _o_, 1);\ 232 | u0 = ctou64(ip+UZ+_i_*UZ*2+ 8); CU64(u1, _o_, 1);\ 233 | } 234 | 235 | #define N64 64 236 | static void hist_4_64(unsigned char *__restrict in, unsigned inlen, cnt_t *__restrict cnt) { 237 | #define IC 0 238 | cnt_t c[4][CSIZE] = {0}, i; 239 | unsigned char *ip = in; 240 | 241 | if(inlen >= UZ+N64) { 242 | uint64_t u0 = ctou64(ip); 243 | for(; ip <= in+inlen-(UZ+N64); ip += N64) { 244 | I164(0,IC); I164(1,IC); I164(2,IC); I164(3,IC); 245 | PREFETCH(ip+512, 0); 246 | } 247 | } 248 | while(ip != in+inlen) c[0][*ip++]++; 249 | HISTEND4(c, cnt); 250 | } 251 | 252 | static void hist_8_64(unsigned char *__restrict in, unsigned inlen, unsigned *__restrict cnt) { 253 | #define IC 4 254 | cnt_t c[8][CSIZE] = {0}, i; 255 | unsigned char *ip = in; 256 | 257 | if(inlen >= UZ+N64) { 258 | uint64_t u0 = ctou64(ip); 259 | for(; ip <= in+inlen-(UZ+N64); ip += N64) { 260 | I164(0,IC); I164(1,IC); I164(2,IC); I164(3,IC); 261 | PREFETCH(ip+512, 0); 262 | } 263 | } 264 | while(ip != in+inlen) c[0][*ip++]++; 265 | HISTEND8(c, cnt); 266 | } 267 | 268 | //----- hist_8_64a with inline assembly ----------------------------------------- 269 | #ifdef __x86_64 270 | #define RSHR(r, b) __asm volatile ("shr %1, %0": "+r" (r): "i" (b) ) 271 | 272 | #define CU16(x, u, offset, size, base, scale) \ 273 | __asm volatile (\ 274 | "movzbl %b1, %k0\n"\ 275 | "incl (%c2+0)*%c3(%4, %0, %c5)\n"\ 276 | "movzbl %h1, %k0\n"\ 277 | "incl (%c2+1)*%c3(%4, %0, %c5)\n"\ 278 | :"=&R" (x)\ 279 | :"Q" (u), "i" (offset), "i" (size), "r" (base), "i" (scale) \ 280 | :"memory"\ 281 | ) 282 | 283 | #define N64 64 284 | unsigned hist_8_64a(unsigned char *in, unsigned inlen, unsigned *__restrict cnt) { 285 | unsigned c[8][CSIZE]= {0}; 286 | unsigned char *ip = in; 287 | 288 | if(inlen >= 8+N64) { 289 | uint64_t u0 = ctou64(ip),b; 290 | for(; ip <= in+inlen-(8+N64); ip += N64) { 291 | uint64_t x, u1; 292 | #define ST(u) CU16(x, u, 0, CSIZE*4, c, 4);\ 293 | RSHR(u, 16); CU16(x, u, 2, CSIZE*4, c, 4);\ 294 | RSHR(u, 16); CU16(x, u, 4, CSIZE*4, c, 4);\ 295 | RSHR(u, 16); CU16(x, u, 6, CSIZE*4, c, 4); 296 | u1 = ctou64(ip+8+ 0); ST(u0); 297 | u0 = ctou64(ip+8+ 8); ST(u1); 298 | u1 = ctou64(ip+8+16); ST(u0); 299 | u0 = ctou64(ip+8+24); ST(u1); 300 | u1 = ctou64(ip+8+32); ST(u0); 301 | u0 = ctou64(ip+8+40); ST(u1); 302 | u1 = ctou64(ip+8+48); ST(u0); 303 | u0 = ctou64(ip+8+56); ST(u1); PREFETCH(ip+768, 0); 304 | } 305 | } 306 | while(ip < in+inlen) c[0][*ip++]++; 307 | HISTEND8(c, cnt); 308 | } 309 | #endif 310 | 311 | #define UZ 16 // Load size 2x 64 bits = 2*8 bytes 312 | #define CR64(u,v,_o_,_c_) if(likely(u!=v)) { CU64(u,_o_,1); CU64(v,_o_,1); } else if((u^(v<<8)) < (1<<8)) c[_c_][(unsigned char)u]+=UZ; else CU64(u, _o_,2) 313 | #define I2R64(_i_,_o_) { uint64_t u1 = ctou64(ip+UZ+_i_*UZ*2+ 0), v1 = ctou64(ip+UZ+_i_*UZ*2+ 8); CR64(u0,v0,_o_,_i_);\ 314 | u0 = ctou64(ip+UZ+_i_*UZ*2+16); v0 = ctou64(ip+UZ+_i_*UZ*2+24); CR64(u1,v1,_o_,_i_);\ 315 | } 316 | 317 | #define N64 64 318 | static void histr_4_64(unsigned char *__restrict in, unsigned inlen, cnt_t *__restrict cnt) { 319 | #define IC 0 320 | cnt_t c[4][CSIZE] = {0},i; 321 | unsigned 
char *ip = in,*in_; 322 | 323 | if(inlen >= UZ+N64) { 324 | uint64_t u0 = ctou64(ip), v0 = ctou64(ip+8); 325 | for(; ip <= in+inlen-(UZ+N64); ip += N64) { 326 | I2R64(0,IC); I2R64(1,IC); 327 | PREFETCH(ip+512, 0); 328 | } 329 | } 330 | while(ip != in+inlen) 331 | c[0][*ip++]++; 332 | HISTEND4(c, cnt); 333 | } 334 | 335 | static void histr_8_64(unsigned char *__restrict in, unsigned inlen, unsigned *__restrict cnt) { 336 | #define IC 4 337 | cnt_t c[8][CSIZE] = {0},i; 338 | unsigned char *ip = in,*in_; 339 | 340 | if(inlen >= UZ+N64) { 341 | uint64_t u0 = ctou64(ip), v0 = ctou64(ip+8); 342 | for(; ip <= in+inlen-(UZ+N64); ip += N64) { 343 | I2R64(0,IC); I2R64(1,IC); 344 | PREFETCH(ip+512, 0); 345 | } 346 | } 347 | while(ip != in+inlen) c[0][*ip++]++; 348 | HISTEND8(c, cnt); 349 | } 350 | 351 | #if defined(__SSE4_1__) || defined(__ARM_NEON) //---------- sse4.1 --------------------------------------- 352 | #ifdef __SSE4_1__ 353 | #include <smmintrin.h> 354 | #else 355 | #include "sse_neon.h" 356 | #endif 357 | static void hist_4_128(unsigned char *__restrict in, unsigned inlen, unsigned *__restrict cnt) { 358 | cnt_t c[4][CSIZE]={0},i; 359 | 360 | unsigned char *ip = in; 361 | if(inlen >= 32+64) { 362 | __m128i u0 = _mm_loadu_si128((__m128i*)ip), v0 = _mm_loadu_si128((__m128i*)(ip+16)); 363 | for(; ip <= in+inlen-(32+64); ip += 64) { 364 | __m128i u1 = _mm_loadu_si128((__m128i*)(ip+32)), v1 = _mm_loadu_si128((__m128i*)(ip+32+16)); 365 | c[0][_mm_extract_epi8(u0, 0)]++; 366 | c[1][_mm_extract_epi8(v0, 0)]++; 367 | c[2][_mm_extract_epi8(u0, 1)]++; 368 | c[3][_mm_extract_epi8(v0, 1)]++; 369 | c[0][_mm_extract_epi8(u0, 2)]++; 370 | c[1][_mm_extract_epi8(v0, 2)]++; 371 | c[2][_mm_extract_epi8(u0, 3)]++; 372 | c[3][_mm_extract_epi8(v0, 3)]++; 373 | c[0][_mm_extract_epi8(u0, 4)]++; 374 | c[1][_mm_extract_epi8(v0, 4)]++; 375 | c[2][_mm_extract_epi8(u0, 5)]++; 376 | c[3][_mm_extract_epi8(v0, 5)]++; 377 | c[0][_mm_extract_epi8(u0, 6)]++; 378 | c[1][_mm_extract_epi8(v0, 6)]++; 379 | c[2][_mm_extract_epi8(u0, 7)]++; 380 | c[3][_mm_extract_epi8(v0, 7)]++; 381 | c[0][_mm_extract_epi8(u0, 8)]++; 382 | c[1][_mm_extract_epi8(v0, 8)]++; 383 | c[2][_mm_extract_epi8(u0, 9)]++; 384 | c[3][_mm_extract_epi8(v0, 9)]++; 385 | c[0][_mm_extract_epi8(u0, 10)]++; 386 | c[1][_mm_extract_epi8(v0, 10)]++; 387 | c[2][_mm_extract_epi8(u0, 11)]++; 388 | c[3][_mm_extract_epi8(v0, 11)]++; 389 | c[0][_mm_extract_epi8(u0, 12)]++; 390 | c[1][_mm_extract_epi8(v0, 12)]++; 391 | c[2][_mm_extract_epi8(u0, 13)]++; 392 | c[3][_mm_extract_epi8(v0, 13)]++; 393 | c[0][_mm_extract_epi8(u0, 14)]++; 394 | c[1][_mm_extract_epi8(v0, 14)]++; 395 | c[2][_mm_extract_epi8(u0, 15)]++; 396 | c[3][_mm_extract_epi8(v0, 15)]++; 397 | 398 | u0 = _mm_loadu_si128((__m128i*)(ip+32+32)); v0 = _mm_loadu_si128((__m128i*)(ip+32+48)); 399 | c[0][_mm_extract_epi8(u1, 0)]++; 400 | c[1][_mm_extract_epi8(v1, 0)]++; 401 | c[2][_mm_extract_epi8(u1, 1)]++; 402 | c[3][_mm_extract_epi8(v1, 1)]++; 403 | c[0][_mm_extract_epi8(u1, 2)]++; 404 | c[1][_mm_extract_epi8(v1, 2)]++; 405 | c[2][_mm_extract_epi8(u1, 3)]++; 406 | c[3][_mm_extract_epi8(v1, 3)]++; 407 | c[0][_mm_extract_epi8(u1, 4)]++; 408 | c[1][_mm_extract_epi8(v1, 4)]++; 409 | c[2][_mm_extract_epi8(u1, 5)]++; 410 | c[3][_mm_extract_epi8(v1, 5)]++; 411 | c[0][_mm_extract_epi8(u1, 6)]++; 412 | c[1][_mm_extract_epi8(v1, 6)]++; 413 | c[2][_mm_extract_epi8(u1, 7)]++; 414 | c[3][_mm_extract_epi8(v1, 7)]++; 415 | c[0][_mm_extract_epi8(u1, 8)]++; 416 | c[1][_mm_extract_epi8(v1, 8)]++; 417 | c[2][_mm_extract_epi8(u1, 9)]++; 418 |
c[3][_mm_extract_epi8(v1, 9)]++; 419 | c[0][_mm_extract_epi8(u1, 10)]++; 420 | c[1][_mm_extract_epi8(v1, 10)]++; 421 | c[2][_mm_extract_epi8(u1, 11)]++; 422 | c[3][_mm_extract_epi8(v1, 11)]++; 423 | c[0][_mm_extract_epi8(u1, 12)]++; 424 | c[1][_mm_extract_epi8(v1, 12)]++; 425 | c[2][_mm_extract_epi8(u1, 13)]++; 426 | c[3][_mm_extract_epi8(v1, 13)]++; 427 | c[0][_mm_extract_epi8(u1, 14)]++; 428 | c[1][_mm_extract_epi8(v1, 14)]++; 429 | c[2][_mm_extract_epi8(u1, 15)]++; 430 | c[3][_mm_extract_epi8(v1, 15)]++; PREFETCH(ip+512, 0); 431 | } 432 | } 433 | while(ip < in+inlen) c[0][*ip++]++; 434 | HISTEND4(c, cnt); 435 | } 436 | 437 | unsigned hist_8_128(unsigned char *__restrict in, unsigned inlen, unsigned *__restrict cnt) { 438 | cnt_t c[8][CSIZE]={0},i; 439 | 440 | unsigned char *ip = in; 441 | if(inlen >= 32+64) { 442 | __m128i u0 = _mm_loadu_si128((__m128i*)ip), v0 = _mm_loadu_si128((__m128i*)(ip+16)); 443 | for(; ip <= in+inlen-(32+64); ip += 64) { 444 | __m128i u1 = _mm_loadu_si128((__m128i*)(ip+32)), v1 = _mm_loadu_si128((__m128i*)(ip+32+16)); 445 | c[0][_mm_extract_epi8(u0, 0)]++; 446 | c[1][_mm_extract_epi8(v0, 0)]++; 447 | c[2][_mm_extract_epi8(u0, 1)]++; 448 | c[3][_mm_extract_epi8(v0, 1)]++; 449 | c[4][_mm_extract_epi8(u0, 2)]++; 450 | c[5][_mm_extract_epi8(v0, 2)]++; 451 | c[6][_mm_extract_epi8(u0, 3)]++; 452 | c[7][_mm_extract_epi8(v0, 3)]++; 453 | c[0][_mm_extract_epi8(u0, 4)]++; 454 | c[1][_mm_extract_epi8(v0, 4)]++; 455 | c[2][_mm_extract_epi8(u0, 5)]++; 456 | c[3][_mm_extract_epi8(v0, 5)]++; 457 | c[4][_mm_extract_epi8(u0, 6)]++; 458 | c[5][_mm_extract_epi8(v0, 6)]++; 459 | c[6][_mm_extract_epi8(u0, 7)]++; 460 | c[7][_mm_extract_epi8(v0, 7)]++; 461 | c[0][_mm_extract_epi8(u0, 8)]++; 462 | c[1][_mm_extract_epi8(v0, 8)]++; 463 | c[2][_mm_extract_epi8(u0, 9)]++; 464 | c[3][_mm_extract_epi8(v0, 9)]++; 465 | c[4][_mm_extract_epi8(u0, 10)]++; 466 | c[5][_mm_extract_epi8(v0, 10)]++; 467 | c[6][_mm_extract_epi8(u0, 11)]++; 468 | c[7][_mm_extract_epi8(v0, 11)]++; 469 | c[0][_mm_extract_epi8(u0, 12)]++; 470 | c[1][_mm_extract_epi8(v0, 12)]++; 471 | c[2][_mm_extract_epi8(u0, 13)]++; 472 | c[3][_mm_extract_epi8(v0, 13)]++; 473 | c[4][_mm_extract_epi8(u0, 14)]++; 474 | c[5][_mm_extract_epi8(v0, 14)]++; 475 | c[6][_mm_extract_epi8(u0, 15)]++; 476 | c[7][_mm_extract_epi8(v0, 15)]++; 477 | 478 | u0 = _mm_loadu_si128((__m128i*)(ip+32+32)); v0 = _mm_loadu_si128((__m128i*)(ip+32+48)); 479 | c[0][_mm_extract_epi8(u1, 0)]++; 480 | c[1][_mm_extract_epi8(v1, 0)]++; 481 | c[2][_mm_extract_epi8(u1, 1)]++; 482 | c[3][_mm_extract_epi8(v1, 1)]++; 483 | c[4][_mm_extract_epi8(u1, 2)]++; 484 | c[5][_mm_extract_epi8(v1, 2)]++; 485 | c[6][_mm_extract_epi8(u1, 3)]++; 486 | c[7][_mm_extract_epi8(v1, 3)]++; 487 | c[0][_mm_extract_epi8(u1, 4)]++; 488 | c[1][_mm_extract_epi8(v1, 4)]++; 489 | c[2][_mm_extract_epi8(u1, 5)]++; 490 | c[3][_mm_extract_epi8(v1, 5)]++; 491 | c[4][_mm_extract_epi8(u1, 6)]++; 492 | c[5][_mm_extract_epi8(v1, 6)]++; 493 | c[6][_mm_extract_epi8(u1, 7)]++; 494 | c[7][_mm_extract_epi8(v1, 7)]++; 495 | c[0][_mm_extract_epi8(u1, 8)]++; 496 | c[1][_mm_extract_epi8(v1, 8)]++; 497 | c[2][_mm_extract_epi8(u1, 9)]++; 498 | c[3][_mm_extract_epi8(v1, 9)]++; 499 | c[4][_mm_extract_epi8(u1, 10)]++; 500 | c[5][_mm_extract_epi8(v1, 10)]++; 501 | c[6][_mm_extract_epi8(u1, 11)]++; 502 | c[7][_mm_extract_epi8(v1, 11)]++; 503 | c[0][_mm_extract_epi8(u1, 12)]++; 504 | c[1][_mm_extract_epi8(v1, 12)]++; 505 | c[2][_mm_extract_epi8(u1, 13)]++; 506 | c[3][_mm_extract_epi8(v1, 13)]++; 507 | c[4][_mm_extract_epi8(u1, 
14)]++; 508 | c[5][_mm_extract_epi8(v1, 14)]++; 509 | c[6][_mm_extract_epi8(u1, 15)]++; 510 | c[7][_mm_extract_epi8(v1, 15)]++; PREFETCH(ip+512, 0); 511 | } 512 | } 513 | while(ip < in+inlen) c[0][*ip++]++; 514 | HISTEND8(c, cnt); 515 | } 516 | #endif 517 | 518 | #ifdef __AVX2__ //---------------------------------- avx2 ----------------------------------------------- 519 | #include <immintrin.h> 520 | 521 | #define UZ 64 522 | #define N256 128 523 | 524 | static void hist_4_256(unsigned char *__restrict in, unsigned inlen, unsigned *__restrict cnt) { 525 | cnt_t c[4][CSIZE]={0},i; 526 | 527 | unsigned char *ip = in; 528 | if(inlen >= UZ+N256) { 529 | __m256i u0 = _mm256_loadu_si256((__m256i*)ip), v0 = _mm256_loadu_si256((__m256i*)(ip+32)); 530 | for(; ip <= in+((inlen-(UZ+N256))&~(N256-1)); ip += N256) { 531 | __m256i u1 = _mm256_loadu_si256((__m256i*)(ip+UZ)), v1 = _mm256_loadu_si256((__m256i*)(ip+UZ+32)); 532 | c[0][_mm256_extract_epi8(u0, 0)]++; 533 | c[1][_mm256_extract_epi8(v0, 0)]++; 534 | c[2][_mm256_extract_epi8(u0, 1)]++; 535 | c[3][_mm256_extract_epi8(v0, 1)]++; 536 | c[0][_mm256_extract_epi8(u0, 2)]++; 537 | c[1][_mm256_extract_epi8(v0, 2)]++; 538 | c[2][_mm256_extract_epi8(u0, 3)]++; 539 | c[3][_mm256_extract_epi8(v0, 3)]++; 540 | c[0][_mm256_extract_epi8(u0, 4)]++; 541 | c[1][_mm256_extract_epi8(v0, 4)]++; 542 | c[2][_mm256_extract_epi8(u0, 5)]++; 543 | c[3][_mm256_extract_epi8(v0, 5)]++; 544 | c[0][_mm256_extract_epi8(u0, 6)]++; 545 | c[1][_mm256_extract_epi8(v0, 6)]++; 546 | c[2][_mm256_extract_epi8(u0, 7)]++; 547 | c[3][_mm256_extract_epi8(v0, 7)]++; 548 | c[0][_mm256_extract_epi8(u0, 8)]++; 549 | c[1][_mm256_extract_epi8(v0, 8)]++; 550 | c[2][_mm256_extract_epi8(u0, 9)]++; 551 | c[3][_mm256_extract_epi8(v0, 9)]++; 552 | c[0][_mm256_extract_epi8(u0, 10)]++; 553 | c[1][_mm256_extract_epi8(v0, 10)]++; 554 | c[2][_mm256_extract_epi8(u0, 11)]++; 555 | c[3][_mm256_extract_epi8(v0, 11)]++; 556 | c[0][_mm256_extract_epi8(u0, 12)]++; 557 | c[1][_mm256_extract_epi8(v0, 12)]++; 558 | c[2][_mm256_extract_epi8(u0, 13)]++; 559 | c[3][_mm256_extract_epi8(v0, 13)]++; 560 | c[0][_mm256_extract_epi8(u0, 14)]++; 561 | c[1][_mm256_extract_epi8(v0, 14)]++; 562 | c[2][_mm256_extract_epi8(u0, 15)]++; 563 | c[3][_mm256_extract_epi8(v0, 15)]++; 564 | c[0][_mm256_extract_epi8(u0, 16)]++; 565 | c[1][_mm256_extract_epi8(v0, 16)]++; 566 | c[2][_mm256_extract_epi8(u0, 17)]++; 567 | c[3][_mm256_extract_epi8(v0, 17)]++; 568 | c[0][_mm256_extract_epi8(u0, 18)]++; 569 | c[1][_mm256_extract_epi8(v0, 18)]++; 570 | c[2][_mm256_extract_epi8(u0, 19)]++; 571 | c[3][_mm256_extract_epi8(v0, 19)]++; 572 | c[0][_mm256_extract_epi8(u0, 20)]++; 573 | c[1][_mm256_extract_epi8(v0, 20)]++; 574 | c[2][_mm256_extract_epi8(u0, 21)]++; 575 | c[3][_mm256_extract_epi8(v0, 21)]++; 576 | c[0][_mm256_extract_epi8(u0, 22)]++; 577 | c[1][_mm256_extract_epi8(v0, 22)]++; 578 | c[2][_mm256_extract_epi8(u0, 23)]++; 579 | c[3][_mm256_extract_epi8(v0, 23)]++; 580 | c[0][_mm256_extract_epi8(u0, 24)]++; 581 | c[1][_mm256_extract_epi8(v0, 24)]++; 582 | c[2][_mm256_extract_epi8(u0, 25)]++; 583 | c[3][_mm256_extract_epi8(v0, 25)]++; 584 | c[0][_mm256_extract_epi8(u0, 26)]++; 585 | c[1][_mm256_extract_epi8(v0, 26)]++; 586 | c[2][_mm256_extract_epi8(u0, 27)]++; 587 | c[3][_mm256_extract_epi8(v0, 27)]++; 588 | c[0][_mm256_extract_epi8(u0, 28)]++; 589 | c[1][_mm256_extract_epi8(v0, 28)]++; 590 | c[2][_mm256_extract_epi8(u0, 29)]++; 591 | c[3][_mm256_extract_epi8(v0, 29)]++; 592 | c[0][_mm256_extract_epi8(u0, 30)]++; 593 | c[1][_mm256_extract_epi8(v0, 30)]++;
594 | c[2][_mm256_extract_epi8(u0, 31)]++; 595 | c[3][_mm256_extract_epi8(v0, 31)]++; 596 | 597 | u0 = _mm256_loadu_si256((__m256i*)(ip+UZ+64)); v0 = _mm256_loadu_si256((__m256i*)(ip+UZ+96)); 598 | c[0][_mm256_extract_epi8(u1, 0)]++; 599 | c[1][_mm256_extract_epi8(v1, 0)]++; 600 | c[2][_mm256_extract_epi8(u1, 1)]++; 601 | c[3][_mm256_extract_epi8(v1, 1)]++; 602 | c[0][_mm256_extract_epi8(u1, 2)]++; 603 | c[1][_mm256_extract_epi8(v1, 2)]++; 604 | c[2][_mm256_extract_epi8(u1, 3)]++; 605 | c[3][_mm256_extract_epi8(v1, 3)]++; 606 | c[0][_mm256_extract_epi8(u1, 4)]++; 607 | c[1][_mm256_extract_epi8(v1, 4)]++; 608 | c[2][_mm256_extract_epi8(u1, 5)]++; 609 | c[3][_mm256_extract_epi8(v1, 5)]++; 610 | c[0][_mm256_extract_epi8(u1, 6)]++; 611 | c[1][_mm256_extract_epi8(v1, 6)]++; 612 | c[2][_mm256_extract_epi8(u1, 7)]++; 613 | c[3][_mm256_extract_epi8(v1, 7)]++; 614 | c[0][_mm256_extract_epi8(u1, 8)]++; 615 | c[1][_mm256_extract_epi8(v1, 8)]++; 616 | c[2][_mm256_extract_epi8(u1, 9)]++; 617 | c[3][_mm256_extract_epi8(v1, 9)]++; 618 | c[0][_mm256_extract_epi8(u1, 10)]++; 619 | c[1][_mm256_extract_epi8(v1, 10)]++; 620 | c[2][_mm256_extract_epi8(u1, 11)]++; 621 | c[3][_mm256_extract_epi8(v1, 11)]++; 622 | c[0][_mm256_extract_epi8(u1, 12)]++; 623 | c[1][_mm256_extract_epi8(v1, 12)]++; 624 | c[2][_mm256_extract_epi8(u1, 13)]++; 625 | c[3][_mm256_extract_epi8(v1, 13)]++; 626 | c[0][_mm256_extract_epi8(u1, 14)]++; 627 | c[1][_mm256_extract_epi8(v1, 14)]++; 628 | c[2][_mm256_extract_epi8(u1, 15)]++; 629 | c[3][_mm256_extract_epi8(v1, 15)]++; 630 | c[0][_mm256_extract_epi8(u1, 16)]++; 631 | c[1][_mm256_extract_epi8(v1, 16)]++; 632 | c[2][_mm256_extract_epi8(u1, 17)]++; 633 | c[3][_mm256_extract_epi8(v1, 17)]++; 634 | c[0][_mm256_extract_epi8(u1, 18)]++; 635 | c[1][_mm256_extract_epi8(v1, 18)]++; 636 | c[2][_mm256_extract_epi8(u1, 19)]++; 637 | c[3][_mm256_extract_epi8(v1, 19)]++; 638 | c[0][_mm256_extract_epi8(u1, 20)]++; 639 | c[1][_mm256_extract_epi8(v1, 20)]++; 640 | c[2][_mm256_extract_epi8(u1, 21)]++; 641 | c[3][_mm256_extract_epi8(v1, 21)]++; 642 | c[0][_mm256_extract_epi8(u1, 22)]++; 643 | c[1][_mm256_extract_epi8(v1, 22)]++; 644 | c[2][_mm256_extract_epi8(u1, 23)]++; 645 | c[3][_mm256_extract_epi8(v1, 23)]++; 646 | c[0][_mm256_extract_epi8(u1, 24)]++; 647 | c[1][_mm256_extract_epi8(v1, 24)]++; 648 | c[2][_mm256_extract_epi8(u1, 25)]++; 649 | c[3][_mm256_extract_epi8(v1, 25)]++; 650 | c[0][_mm256_extract_epi8(u1, 26)]++; 651 | c[1][_mm256_extract_epi8(v1, 26)]++; 652 | c[2][_mm256_extract_epi8(u1, 27)]++; 653 | c[3][_mm256_extract_epi8(v1, 27)]++; 654 | c[0][_mm256_extract_epi8(u1, 28)]++; 655 | c[1][_mm256_extract_epi8(v1, 28)]++; 656 | c[2][_mm256_extract_epi8(u1, 29)]++; 657 | c[3][_mm256_extract_epi8(v1, 29)]++; 658 | c[0][_mm256_extract_epi8(u1, 30)]++; 659 | c[1][_mm256_extract_epi8(v1, 30)]++; 660 | c[2][_mm256_extract_epi8(u1, 31)]++; 661 | c[3][_mm256_extract_epi8(v1, 31)]++; PREFETCH(ip+512, 0); 662 | } 663 | } 664 | while(ip < in+inlen) c[0][*ip++]++; 665 | HISTEND4(c, cnt); 666 | } 667 | 668 | #define UZ 64 669 | #define N256 128 670 | static void hist_8_256(unsigned char *__restrict in, unsigned inlen, unsigned *__restrict cnt) { 671 | cnt_t c[8][CSIZE]={0},i; 672 | 673 | unsigned char *ip = in; 674 | if(inlen >= UZ+N256) { 675 | __m256i u0 = _mm256_loadu_si256((__m256i*)ip), v0 = _mm256_loadu_si256((__m256i*)(ip+32)); 676 | for(; ip <= in+((inlen-(UZ+N256))&~(N256-1)); ip += N256) { 677 | __m256i u1 = _mm256_loadu_si256((__m256i*)(ip+UZ+0)), v1 = _mm256_loadu_si256((__m256i*)(ip+UZ+32)); 
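// The 64 bytes already in u0/v0 are now counted one lane at a time, spread round-robin over the 8
// count tables so consecutive increments rarely touch the same counter, while the u1/v1 load above
// brings in the next 64 bytes early to hide its latency.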
678 | c[0][_mm256_extract_epi8(u0, 0)]++; 679 | c[1][_mm256_extract_epi8(v0, 0)]++; 680 | c[2][_mm256_extract_epi8(u0, 1)]++; 681 | c[3][_mm256_extract_epi8(v0, 1)]++; 682 | c[4][_mm256_extract_epi8(u0, 2)]++; 683 | c[5][_mm256_extract_epi8(v0, 2)]++; 684 | c[6][_mm256_extract_epi8(u0, 3)]++; 685 | c[7][_mm256_extract_epi8(v0, 3)]++; 686 | c[0][_mm256_extract_epi8(u0, 4)]++; 687 | c[1][_mm256_extract_epi8(v0, 4)]++; 688 | c[2][_mm256_extract_epi8(u0, 5)]++; 689 | c[3][_mm256_extract_epi8(v0, 5)]++; 690 | c[4][_mm256_extract_epi8(u0, 6)]++; 691 | c[5][_mm256_extract_epi8(v0, 6)]++; 692 | c[6][_mm256_extract_epi8(u0, 7)]++; 693 | c[7][_mm256_extract_epi8(v0, 7)]++; 694 | c[0][_mm256_extract_epi8(u0, 8)]++; 695 | c[1][_mm256_extract_epi8(v0, 8)]++; 696 | c[2][_mm256_extract_epi8(u0, 9)]++; 697 | c[3][_mm256_extract_epi8(v0, 9)]++; 698 | c[4][_mm256_extract_epi8(u0, 10)]++; 699 | c[5][_mm256_extract_epi8(v0, 10)]++; 700 | c[6][_mm256_extract_epi8(u0, 11)]++; 701 | c[7][_mm256_extract_epi8(v0, 11)]++; 702 | c[0][_mm256_extract_epi8(u0, 12)]++; 703 | c[1][_mm256_extract_epi8(v0, 12)]++; 704 | c[2][_mm256_extract_epi8(u0, 13)]++; 705 | c[3][_mm256_extract_epi8(v0, 13)]++; 706 | c[4][_mm256_extract_epi8(u0, 14)]++; 707 | c[5][_mm256_extract_epi8(v0, 14)]++; 708 | c[6][_mm256_extract_epi8(u0, 15)]++; 709 | c[7][_mm256_extract_epi8(v0, 15)]++; 710 | c[0][_mm256_extract_epi8(u0, 16)]++; 711 | c[1][_mm256_extract_epi8(v0, 16)]++; 712 | c[2][_mm256_extract_epi8(u0, 17)]++; 713 | c[3][_mm256_extract_epi8(v0, 17)]++; 714 | c[4][_mm256_extract_epi8(u0, 18)]++; 715 | c[5][_mm256_extract_epi8(v0, 18)]++; 716 | c[6][_mm256_extract_epi8(u0, 19)]++; 717 | c[7][_mm256_extract_epi8(v0, 19)]++; 718 | c[0][_mm256_extract_epi8(u0, 20)]++; 719 | c[1][_mm256_extract_epi8(v0, 20)]++; 720 | c[2][_mm256_extract_epi8(u0, 21)]++; 721 | c[3][_mm256_extract_epi8(v0, 21)]++; 722 | c[4][_mm256_extract_epi8(u0, 22)]++; 723 | c[5][_mm256_extract_epi8(v0, 22)]++; 724 | c[6][_mm256_extract_epi8(u0, 23)]++; 725 | c[7][_mm256_extract_epi8(v0, 23)]++; 726 | c[0][_mm256_extract_epi8(u0, 24)]++; 727 | c[1][_mm256_extract_epi8(v0, 24)]++; 728 | c[2][_mm256_extract_epi8(u0, 25)]++; 729 | c[3][_mm256_extract_epi8(v0, 25)]++; 730 | c[4][_mm256_extract_epi8(u0, 26)]++; 731 | c[5][_mm256_extract_epi8(v0, 26)]++; 732 | c[6][_mm256_extract_epi8(u0, 27)]++; 733 | c[7][_mm256_extract_epi8(v0, 27)]++; 734 | c[0][_mm256_extract_epi8(u0, 28)]++; 735 | c[1][_mm256_extract_epi8(v0, 28)]++; 736 | c[2][_mm256_extract_epi8(u0, 29)]++; 737 | c[3][_mm256_extract_epi8(v0, 29)]++; 738 | c[4][_mm256_extract_epi8(u0, 30)]++; 739 | c[5][_mm256_extract_epi8(v0, 30)]++; 740 | c[6][_mm256_extract_epi8(u0, 31)]++; 741 | c[7][_mm256_extract_epi8(v0, 31)]++; 742 | 743 | u0 = _mm256_loadu_si256((__m256i*)(ip+UZ+64)); v0 = _mm256_loadu_si256((__m256i*)(ip+UZ+96)); 744 | c[0][_mm256_extract_epi8(u1, 0)]++; 745 | c[1][_mm256_extract_epi8(v1, 0)]++; 746 | c[2][_mm256_extract_epi8(u1, 1)]++; 747 | c[3][_mm256_extract_epi8(v1, 1)]++; 748 | c[4][_mm256_extract_epi8(u1, 2)]++; 749 | c[5][_mm256_extract_epi8(v1, 2)]++; 750 | c[6][_mm256_extract_epi8(u1, 3)]++; 751 | c[7][_mm256_extract_epi8(v1, 3)]++; 752 | c[0][_mm256_extract_epi8(u1, 4)]++; 753 | c[1][_mm256_extract_epi8(v1, 4)]++; 754 | c[2][_mm256_extract_epi8(u1, 5)]++; 755 | c[3][_mm256_extract_epi8(v1, 5)]++; 756 | c[4][_mm256_extract_epi8(u1, 6)]++; 757 | c[5][_mm256_extract_epi8(v1, 6)]++; 758 | c[6][_mm256_extract_epi8(u1, 7)]++; 759 | c[7][_mm256_extract_epi8(v1, 7)]++; 760 | c[0][_mm256_extract_epi8(u1, 8)]++; 
761 | c[1][_mm256_extract_epi8(v1, 8)]++; 762 | c[2][_mm256_extract_epi8(u1, 9)]++; 763 | c[3][_mm256_extract_epi8(v1, 9)]++; 764 | c[4][_mm256_extract_epi8(u1, 10)]++; 765 | c[5][_mm256_extract_epi8(v1, 10)]++; 766 | c[6][_mm256_extract_epi8(u1, 11)]++; 767 | c[7][_mm256_extract_epi8(v1, 11)]++; 768 | c[0][_mm256_extract_epi8(u1, 12)]++; 769 | c[1][_mm256_extract_epi8(v1, 12)]++; 770 | c[2][_mm256_extract_epi8(u1, 13)]++; 771 | c[3][_mm256_extract_epi8(v1, 13)]++; 772 | c[4][_mm256_extract_epi8(u1, 14)]++; 773 | c[5][_mm256_extract_epi8(v1, 14)]++; 774 | c[6][_mm256_extract_epi8(u1, 15)]++; 775 | c[7][_mm256_extract_epi8(v1, 15)]++; 776 | c[0][_mm256_extract_epi8(u1, 16)]++; 777 | c[1][_mm256_extract_epi8(v1, 16)]++; 778 | c[2][_mm256_extract_epi8(u1, 17)]++; 779 | c[3][_mm256_extract_epi8(v1, 17)]++; 780 | c[4][_mm256_extract_epi8(u1, 18)]++; 781 | c[5][_mm256_extract_epi8(v1, 18)]++; 782 | c[6][_mm256_extract_epi8(u1, 19)]++; 783 | c[7][_mm256_extract_epi8(v1, 19)]++; 784 | c[0][_mm256_extract_epi8(u1, 20)]++; 785 | c[1][_mm256_extract_epi8(v1, 20)]++; 786 | c[2][_mm256_extract_epi8(u1, 21)]++; 787 | c[3][_mm256_extract_epi8(v1, 21)]++; 788 | c[4][_mm256_extract_epi8(u1, 22)]++; 789 | c[5][_mm256_extract_epi8(v1, 22)]++; 790 | c[6][_mm256_extract_epi8(u1, 23)]++; 791 | c[7][_mm256_extract_epi8(v1, 23)]++; 792 | c[0][_mm256_extract_epi8(u1, 24)]++; 793 | c[1][_mm256_extract_epi8(v1, 24)]++; 794 | c[2][_mm256_extract_epi8(u1, 25)]++; 795 | c[3][_mm256_extract_epi8(v1, 25)]++; 796 | c[4][_mm256_extract_epi8(u1, 26)]++; 797 | c[5][_mm256_extract_epi8(v1, 26)]++; 798 | c[6][_mm256_extract_epi8(u1, 27)]++; 799 | c[7][_mm256_extract_epi8(v1, 27)]++; 800 | c[0][_mm256_extract_epi8(u1, 28)]++; 801 | c[1][_mm256_extract_epi8(v1, 28)]++; 802 | c[2][_mm256_extract_epi8(u1, 29)]++; 803 | c[3][_mm256_extract_epi8(v1, 29)]++; 804 | c[4][_mm256_extract_epi8(u1, 30)]++; 805 | c[5][_mm256_extract_epi8(v1, 30)]++; 806 | c[6][_mm256_extract_epi8(u1, 31)]++; 807 | c[7][_mm256_extract_epi8(v1, 31)]++; PREFETCH(ip+512, 0); 808 | } 809 | } 810 | while(ip < in+inlen) c[0][*ip++]++; 811 | HISTEND8(c, cnt); 812 | } 813 | #endif 814 | 815 | //------------------------------------------------------------------------- 816 | #ifdef _COUNTBENCH 817 | // "count2x64", fastest function in https://github.com/nkurz/countbench 818 | #define CSIZE (256+8) 819 | 820 | #define ASM_SHIFT_RIGHT(reg, bitsToShift) \ 821 | __asm volatile ("shr %1, %0": \ 822 | "+r" (reg): /* read and written */ \ 823 | "i" (bitsToShift) /* constant */ \ 824 | ) 825 | 826 | 827 | #define ASM_INC_TABLES(src0, src1, byte0, byte1, offset, size, base, scale) \ 828 | __asm volatile ("movzbl %b2, %k0\n" /* byte0 = src0 & 0xFF */ \ 829 | "movzbl %b3, %k1\n" /* byte1 = src1 & 0xFF */ \ 830 | "incl (%c4+0)*%c5(%6, %0, %c7)\n" /* count[i+0][byte0]++ */ \ 831 | "incl (%c4+1)*%c5(%6, %1, %c7)\n" /* count[i+1][byte1]++ */ \ 832 | "movzbl %h2, %k0\n" /* byte0 = (src0 & 0xFF00) >> 8 */ \ 833 | "movzbl %h3, %k1\n" /* byte1 = (src1 & 0xFF00) >> 8 */ \ 834 | "incl (%c4+2)*%c5(%6, %0, %c7)\n" /* count[i+2][byte0]++ */ \ 835 | "incl (%c4+3)*%c5(%6, %1, %c7)\n": /* count[i+3][byte1]++ */ \ 836 | "=&R" (byte0), /* write only (R == non REX) */ \ 837 | "=&R" (byte1): /* write only (R == non REX) */ \ 838 | "Q" (src0), /* read only (Q == must have rH) */ \ 839 | "Q" (src1), /* read only (Q == must have rH) */ \ 840 | "i" (offset), /* constant array offset */ \ 841 | "i" (size), /* constant array size */ \ 842 | "r" (base), /* read only array address */ \ 843 | "i" (scale): 
/* constant [1,2,4,8] */ \ 844 | "memory" /* clobbered (forces compiler to compute sum ) */ \ 845 | ) 846 | 847 | unsigned count2x64(unsigned char *src, unsigned srcSize, unsigned *__restrict cnt) 848 | { 849 | unsigned long long remainder = srcSize; 850 | if (srcSize < 32) goto handle_remainder; 851 | 852 | unsigned c[16][CSIZE]; 853 | memset(c, 0, sizeof(c)); 854 | 855 | remainder = srcSize % 16; 856 | srcSize -= remainder; 857 | const unsigned char *endSrc = src + srcSize; 858 | unsigned long long next0 = *(unsigned long long *)(src + 0); 859 | unsigned long long next1 = *(unsigned long long *)(src + 8); 860 | 861 | //IACA_START; 862 | 863 | while (src != endSrc) 864 | { 865 | unsigned long long byte0, byte1; 866 | unsigned long long data0 = next0; 867 | unsigned long long data1 = next1; 868 | 869 | src += 16; 870 | next0 = *(unsigned long long *)(src + 0); 871 | next1 = *(unsigned long long *)(src + 8); 872 | 873 | ASM_INC_TABLES(data0, data1, byte0, byte1, 0, CSIZE * 4, c, 4); 874 | 875 | ASM_SHIFT_RIGHT(data0, 16); 876 | ASM_SHIFT_RIGHT(data1, 16); 877 | ASM_INC_TABLES(data0, data1, byte0, byte1, 4, CSIZE * 4, c, 4); 878 | 879 | ASM_SHIFT_RIGHT(data0, 16); 880 | ASM_SHIFT_RIGHT(data1, 16); 881 | ASM_INC_TABLES(data0, data1, byte0, byte1, 8, CSIZE * 4, c, 4); 882 | 883 | ASM_SHIFT_RIGHT(data0, 16); 884 | ASM_SHIFT_RIGHT(data1, 16); 885 | ASM_INC_TABLES(data0, data1, byte0, byte1, 12, CSIZE * 4, c, 4); 886 | } 887 | 888 | //IACA_END; 889 | 890 | handle_remainder: 891 | for (size_t i = 0; i < remainder; i++) { 892 | unsigned long long byte = src[i]; 893 | c[0][byte]++; 894 | } 895 | memset(cnt, 0, 256*sizeof(cnt[0])); 896 | for(int i = 0; i < 256; i++) 897 | for (int idx=0; idx < 16; idx++) 898 | cnt[i] += c[idx][i]; 899 | } 900 | 901 | // Modified version of count2x64 by powturbo, using C instead of assembler 902 | #define C_SHIFT_RIGHT(reg, bitsToShift) reg >>= bitsToShift 903 | #define C_INC_TABLES(src0, src1, byte0, byte1, offset, size, c, scale) \ 904 | { \ 905 | byte0 = (unsigned char)src0;\ 906 | byte1 = (unsigned char)src1;\ 907 | c[offset+0][byte0]++;\ 908 | c[offset+1][byte1]++;\ 909 | byte0 = (unsigned char)(src0 >> 8);\ 910 | byte1 = (unsigned char)(src1 >> 8);\ 911 | c[offset+2][byte0]++; \ 912 | c[offset+3][byte1]++; \ 913 | } 914 | 915 | static void count2x64c(unsigned char *__restrict src, unsigned srcSize, unsigned *__restrict cnt) 916 | { 917 | unsigned long long remainder = srcSize; 918 | if (srcSize < 32) goto handle_remainder; 919 | 920 | unsigned c[16][CSIZE]; 921 | memset(c, 0, sizeof(c)); 922 | 923 | remainder = srcSize % 16; 924 | srcSize -= remainder; 925 | const unsigned char *endSrc = src + srcSize; 926 | unsigned long long next0 = *(unsigned long long *)(src + 0); 927 | unsigned long long next1 = *(unsigned long long *)(src + 8); 928 | 929 | //IACA_START; 930 | 931 | while (src != endSrc) 932 | { 933 | unsigned long long byte0, byte1; 934 | unsigned long long data0 = next0; 935 | unsigned long long data1 = next1; 936 | 937 | src += 16; 938 | next0 = *(unsigned long long *)(src + 0); 939 | next1 = *(unsigned long long *)(src + 8); 940 | 941 | C_INC_TABLES(data0, data1, byte0, byte1, 0, CSIZE * 4, c, 4); 942 | 943 | C_SHIFT_RIGHT(data0, 16); 944 | C_SHIFT_RIGHT(data1, 16); 945 | C_INC_TABLES(data0, data1, byte0, byte1, 4, CSIZE * 4, c, 4); 946 | 947 | C_SHIFT_RIGHT(data0, 16); 948 | C_SHIFT_RIGHT(data1, 16); 949 | C_INC_TABLES(data0, data1, byte0, byte1, 8, CSIZE * 4, c, 4); 950 | 951 | C_SHIFT_RIGHT(data0, 16); 952 | C_SHIFT_RIGHT(data1, 16); 953 | 
C_INC_TABLES(data0, data1, byte0, byte1, 12, CSIZE * 4, c, 4); 954 | } 955 | 956 | //IACA_END; 957 | 958 | handle_remainder: 959 | for (size_t i = 0; i < remainder; i++) { 960 | unsigned long long byte = src[i]; 961 | c[0][byte]++; 962 | } 963 | memset(cnt, 0, 256*sizeof(cnt[0])); 964 | for(int i = 0; i < 256; i++) 965 | for(int idx=0; idx < 16; idx++) 966 | cnt[i] += c[idx][i]; 967 | } 968 | #endif 969 | --------------------------------------------------------------------------------
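All of the histogram variants above follow the same pattern: split the byte counts over 4, 8 or 16
tables so that consecutive increments rarely touch the same counter (long runs of equal bytes would
otherwise serialize on store-to-load forwarding), then fold the tables once at the end with
HISTEND/HISTEND4/HISTEND8. A minimal, self-contained sketch of that technique, using a hypothetical
helper name rather than one of the repository's functions:

    #include <stdint.h>
    #include <string.h>

    /* 4-table byte histogram: the core idea behind hist_4_8 + HISTEND4 */
    static void hist4_sketch(const uint8_t *in, size_t n, uint32_t cnt[256]) {
        uint32_t c[4][256];
        memset(c, 0, sizeof(c));
        size_t i = 0;
        for (; i + 4 <= n; i += 4) {            /* 4 independent counters per iteration */
            c[0][in[i+0]]++; c[1][in[i+1]]++;
            c[2][in[i+2]]++; c[3][in[i+3]]++;
        }
        for (; i < n; i++) c[0][in[i]]++;       /* tail bytes */
        for (int k = 0; k < 256; k++)           /* fold the four tables */
            cnt[k] = c[0][k] + c[1][k] + c[2][k] + c[3][k];
    }

With gcc -O3 the folding loop auto-vectorizes, which is why the default HISTEND in turbohist_.c is
left as a plain scalar double loop.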