├── .travis.yml
├── LICENSE
├── makefile
├── README.md
├── turbohist.c
├── time_.h
├── conf.h
├── sse_neon.h
└── turbohist_.c
/.travis.yml:
--------------------------------------------------------------------------------
1 | language: c
2 |
3 | compiler:
4 | - gcc
5 | - clang
6 |
7 | branches:
8 | only:
9 | - master
10 |
11 | script:
12 | - make
13 | - ./turbohist
14 |
15 | matrix:
16 | include:
17 | - name: Linux arm
18 | os: linux
19 | arch: arm64
20 | compiler: gcc
21 |
22 | - name: Windows-MinGW
23 | os: windows
24 | script:
25 | - mingw32-make
26 | - ./turbohist
27 |
28 | - name: macOS, xcode
29 | os: osx
30 |
31 | # - name: Linux amd64
32 | # os: linux
33 | # arch: amd64
34 | # - name: Power ppc64le
35 | # os: linux-ppc64le
36 | # compiler: gcc
37 |
38 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Copyright (c) 2016-2019, Powturbo
2 | All rights reserved.
3 |
4 | Redistribution and use in source and binary forms, with or without
5 | modification, are permitted provided that the following conditions are
6 | met:
7 |
8 | 1. Redistributions of source code must retain the above copyright
9 | notice, this list of conditions and the following disclaimer.
10 |
11 | 2. Redistributions in binary form must reproduce the above copyright
12 | notice, this list of conditions and the following disclaimer in the
13 | documentation and/or other materials provided with the distribution.
14 |
15 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
16 | IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
17 | TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
18 | PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
19 | HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
20 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
21 | TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22 | PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
23 | LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
24 | NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 |
27 | - homepage : https://sites.google.com/site/powturbo/
28 | - github : https://github.com/powturbo
29 | - twitter : https://twitter.com/powturbo
30 | - email : powturbo [_AT_] gmail [_DOT_] com
31 |
--------------------------------------------------------------------------------
/makefile:
--------------------------------------------------------------------------------
1 | # powturbo (c) Copyright 2013-2022
2 | # Download or clone Turbo-Histogram:
3 | # git clone git://github.com/powturbo/Turbo-Histogram.git
4 |
5 | #uncomment to enable
6 | #https://github.com/nkurz/countbench (inline assembly)
7 | #COUNTBENCH=1
8 | #https://gist.github.com/rygorous/a86a5cf348922cdea357c928e32fc7e0 (delete/comment or rename main)
9 | #RYG=1
10 | # timer: rdtsc cycles/byte or wall time in MB/s
11 | #RDTSC=1
12 | #AVX2=1
13 |
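# example invocation: make AVX2=1 RDTSC=1   (the variables can also be enabled by uncommenting the lines above)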
14 | #-------------------------------------------------------------------------------------
15 | CC ?= gcc
16 | CXX ?= g++
17 | #CC=clang
18 | #CXX=clang++
19 | ASM ?= nasm
20 |
21 | OPT=-fstrict-aliasing
22 | ifeq (,$(findstring clang, $(CC)))
23 | OPT+=-falign-loops
24 | endif
25 |
26 | #------- OS/ARCH -------------------
27 | ifneq (,$(filter Windows%,$(OS)))
28 | OS := Windows
29 | # CC=gcc
30 | # CXX=g++
31 | ARCH=x86_64
32 | LDFLAGS+=-Wl,--stack,8194304
33 | FASM=win64
34 | else
35 | OS := $(shell uname -s)
36 | ARCH := $(shell uname -m)
37 | FASM=elf64
38 |
39 | ifneq (,$(findstring aarch64,$(CC)))
40 | ARCH = aarch64
41 | else ifneq (,$(findstring powerpc64le,$(CC)))
42 | ARCH = ppc64le
43 | endif
44 | endif
45 |
46 | ifeq ($(ARCH),ppc64le)
47 | _SSE=-D__SSSE3__
48 | MARCH=-mcpu=power9 -mtune=power9 $(_SSE)
49 | else ifeq ($(ARCH),aarch64)
50 | MARCH+=-march=armv8-a
51 | ifneq (,$(findstring clang, $(CC)))
52 | MARCH+=-march=armv8-a
53 | OPT+=-fomit-frame-pointer
54 | else
55 | MARCH+=-march=armv8-a
56 | endif
57 | SSE=-march=armv8-a
58 | else ifeq ($(ARCH),$(filter $(ARCH),x86_64))
59 | LDFLAGS+=-lm
60 | # set minimum arch sandy bridge SSE4.1 + AVX
61 | _SSE=-march=corei7-avx -mtune=corei7-avx
62 | # SSE+=-mno-avx -mno-aes
63 | _AVX2=-march=haswell
64 | # CFLAGS=$(SSE)
65 | # CFLAGS=$(AVX2)
66 | endif
67 |
68 | ifeq ($(AVX2),1)
69 | MARCH+=$(_AVX2)
70 | else
71 | MARCH+=$(_SSE)
72 | endif
73 |
74 | CFLAGS+=$(MARCH) -w $(OPT)
75 | ifeq ($(STATIC),1)
76 | LDFLAGS+=-static
77 | endif
78 |
79 | ifeq ($(RDTSC),1)
80 | CFLAGS+=-D_RDTSC
81 | endif
82 |
83 | ifeq ($(COUNTBENCH),1)
84 | CFLAGS+=-D_COUNTBENCH
85 | endif
86 |
87 | ifeq ($(RYG),1)
88 | CFLAGS+=-D_RYG
89 | ASMLIB=histo_asm.o
90 | endif
91 |
92 | all: turbohist
93 |
94 | histo_asm.o: histo_asm.nas
95 | $(ASM) -f $(FASM) histo_asm.nas -o histo_asm.o
96 |
97 | turbohist: turbohist.o $(ASMLIB)
98 | $(CC) $^ $(LDFLAGS) -o turbohist
99 |
100 | .c.o:
101 | $(CC) -O3 $(CFLAGS) $< -c -o $@
102 |
103 |
104 | ifeq ($(OS),Windows)
105 | clean:
106 | del /S *.o
107 | # del /S *.exe
108 | else
109 | clean:
110 | find . -name "turbohist" -type f -delete
111 | find . -name "*.o" -type f -delete
112 | find . -name "*~" -type f -delete
113 | find . -name "core" -type f -delete
114 | endif
115 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | TurboHist: Fastest Histogram Construction
2 | =========================================
3 |
4 | - **~0.18 - 0.90 cycles per byte**
5 | - 100% C (C++ compatible header) without inline assembly
6 | - Both 32 and 64 bits supported
7 | - Portable scalar functions faster than SIMD functions (multi-bin counting, sketched below)
8 | - **Up to 22 times** faster than a naive solution
9 | - :new: (2022.01) even faster, beating other very fast assembly functions
10 |
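The speed of the `hist_*` functions comes mainly from counting into several separate bins so that consecutive bytes rarely increment the same counter, avoiding the store-to-load stalls a single-table histogram suffers on runs of identical symbols. A minimal 4-bin sketch of the idea (illustrative only, not the actual `hist_4_32`/`hist_8_64` code, which additionally reads the input in 32/64-bit words):

    #include <stddef.h>

    /* illustrative 4-bin byte histogram: consecutive bytes update different tables */
    static void hist4_sketch(const unsigned char *in, size_t n, unsigned cnt[256]) {
      unsigned c0[256] = {0}, c1[256] = {0}, c2[256] = {0}, c3[256] = {0};
      size_t i = 0;
      for(; i + 4 <= n; i += 4) {
        c0[in[i]]++; c1[in[i+1]]++; c2[in[i+2]]++; c3[in[i+3]]++;
      }
      for(; i < n; i++) c0[in[i]]++;                      /* tail bytes */
      for(int j = 0; j < 256; j++) cnt[j] = c0[j] + c1[j] + c2[j] + c3[j];
    }

Merging the bins at the end costs only 4x256 additions, which is negligible for large inputs.
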
11 | # Benchmark:
12 | - Single thread
13 | - Realistic and practical benchmark with large files.
14 | - Not a pure in-cache benchmark
15 |
16 | #### - Uniform/Skewed distribution:
17 | - Uniform: [enwik9](http://mattmahoney.net/dc/text.html)
18 | - Skewed: enwik9 BWT generated with libdivsufsort
19 | - 1GB zeros
20 | - Accurate benchmarking with command "turbohist file -I15"
21 |
22 | ###### Benchmark Intel CPU: i7-9700K 3.6GHz gcc 11.2
23 | Uniform distribution - enwik9 Text file, size=1.000.000.000
24 | | Function | MB/s |Cycle/Byte|Language |Package |
25 | |----------------------------|-------:|---------:|----------|---------|
26 | | 1:hist_1_8 naiv 8 bits| 2761.01|1.3423 |C |TurboHist|
27 | | 2:hist_4_8 4 bins/ 8 bits| 2725.92|1.3249|C|TurboHist|
28 | | 3:hist_8_8 8 bins/ 8 bits| 2850.05|1.2627|C|TurboHist|
29 | | 4:hist_4_32 4 bins/32 bits| 3691.02|0.9660|C|TurboHist|
30 | | 5:hist_8_32 8 bins/32 bits| 3867.26|0.9561|C|TurboHist|
31 | | 6:hist_4_64 4 bins/64 bits|4040.55|0.9103|C|TurboHist|
32 | | 7:hist_8_64 8 bins/64 bits|**4053.37**|**0.9035**|C|TurboHist|
33 | | 8:histr_4_64 4/64+run | 3915.85|0.9668|C|TurboHist|
34 | | 9:histr_8_64 8/64+run | 3916.51|0.9286|C|TurboHist|
35 | |10:hist_4_128 4 bins/sse4.1 | 3643.20|1.0081|C|TurboHist|
36 | |11:hist_8_128 8 bins/sse4.1 | 3607.06|0.9845|C|TurboHist|
37 | |12:hist_4_256 4 bins/avx2 | 3522.27|1.0195|C|TurboHist|
38 | |13:hist_8_256 8 bins/avx2 | 3542.25|1.0366|C|TurboHist|
39 | |15:hist_8_64asm inline asm |**4161.87**|**0.8787**|inline asm|TurboHist|
40 | |18:count2x64 inline asm | 3963.91|0.9172|inline asm|Countbench|
41 | |20:histo_ref | 2702.57|1.3567|C|[Ryg](https://gist.github.com/rygorous/a86a5cf348922cdea357c928e32fc7e0)|
42 | |21:histo_cpp_1x | 1876.13|1.8236|C|Ryg|
43 | |22:histo_cpp_2x | 2664.78|1.5935|C|Ryg|
44 | |23:histo_cpp_4x | 2817.77|1.2944|C|Ryg|
45 | |24:histo_asm_scalar4 | 3130.08|1.1609|asm|Ryg|
46 | |25:histo_asm_scalar8 | 3353.08|1.0636|asm|Ryg|
47 | |26:histo_asm_scalar8_var | 3704.88|0.9856|asm|Ryg|
48 | |27:histo_asm_scalar8_var2 | 4085.48|0.8913|asm|Ryg|
49 | |28:histo_asm_scalar8_var3 | 4132.54|0.8870|asm|Ryg|
50 | |29:histo_asm_scalar8_var4 | 4083.92|0.8970|asm|Ryg|
51 | |30:histo_asm_scalar8_var5 | 4002.21|0.9025|asm|Ryg|
52 | |31:histo_asm_sse4 | 3153.01|1.1445|asm|Ryg|
53 | |32:memcpy |13724.29|0.2698|C|
54 |
55 | Skewed distribution - enwik9.bwt Text file, size=1.000.000.000
56 | | Function                   | MB/s   |Cycle/Byte|Language  |Package  |
57 | |----------------------------|-------:|---------:|----------|---------|
58 | | 1:hist_1_8 naiv 8 bits| 1170.89|3.0642|C|TurboHist|
59 | | 2:hist_4_8 4 bins/ 8 bits| 2707.74|1.3321|C|TurboHist|
60 | | 3:hist_8_8 8 bins/ 8 bits| 2804.08|1.3208|C|TurboHist|
61 | | 4:hist_4_32 4 bins/32 bits| 3118.54|1.1402|C|TurboHist|
62 | | 5:hist_8_32 8 bins/32 bits| 3780.16|0.9714|C|TurboHist|
63 | | 6:hist_4_64 4 bins/64 bits| 3646.25|0.9980|C|TurboHist|
64 | | 7:hist_8_64 8 bins/64 bits| 3941.96|0.9282|C|TurboHist|
65 | | 8:histr_4_64 4/64+run | 5061.62|0.7270|C|TurboHist|
66 | | 9:histr_8_64 8/64+run |**5135.29**|**0.7229**|C|TurboHist|
67 | |10:hist_4_128 4 bins/sse4.1 | 3535.36|1.0365|C|TurboHist|
68 | |11:hist_8_128 8 bins/sse4.1 | 3654.41|0.9791|C|TurboHist|
69 | |12:hist_4_256 4 bins/avx2 | 3329.87|1.1022|C|TurboHist|
70 | |13:hist_8_256 8 bins/avx2 | 3540.36|1.0343|C|TurboHist|
71 | |15:hist_8_64asm inline asm | 4047.74|0.9013|inline asm|TurboHist|
72 | |18:count2x64 inline asm | 3969.92|0.9262|inline asm|[Countbench](https://github.com/nkurz/countbench)|
73 | |20:histo_ref | 1182.61|3.0718|C|Ryg|
74 | |21:histo_cpp_1x | 1213.42|2.9748|C|Ryg|
75 | |22:histo_cpp_2x | 2115.60|1.7373|C|Ryg|
76 | |23:histo_cpp_4x | 1801.97|2.0024|C|Ryg|
77 | |24:histo_asm_scalar4 | 3092.87|1.1561|asm|Ryg|
78 | |25:histo_asm_scalar8 | 3203.95|1.1139|asm|Ryg|
79 | |26:histo_asm_scalar8_var | 3460.45|1.0422|asm|Ryg|
80 | |27:histo_asm_scalar8_var2 | 3659.61|0.9878|asm|Ryg|
81 | |28:histo_asm_scalar8_var3 | 3769.96|0.9569|asm|Ryg|
82 | |29:histo_asm_scalar8_var4 | 3996.75|0.8905|asm|Ryg|
83 | |30:histo_asm_scalar8_var5 | 4642.10|0.7719|asm|Ryg|
84 | |31:histo_asm_sse4 | 3091.36|1.1670|asm|Ryg|
85 | |32:memcpy |15594.28|0.2412|C|
86 |
87 | All zeros: size=1.000.000.000
88 | | Function                   | MB/s   |Cycle/Byte|Language  |Package  |
89 | |----------------------------|-------:|---------:|----------|---------|
90 | | 1:hist_1_8 naiv 8 bits| 877.27|4.0805|C|TurboHist|
91 | | 2:hist_4_8 4 bins/ 8 bits| 2650.84|1.3485|C|TurboHist|
92 | | 3:hist_8_8 8 bins/ 8 bits| 2743.40|1.2994|C|TurboHist|
93 | | 4:hist_4_32 4 bins/32 bits| 2978.83|1.2006|C|TurboHist|
94 | | 5:hist_8_32 8 bins/32 bits| 3775.45|0.9555|C|TurboHist|
95 | | 6:hist_4_64 4 bins/64 bits| 3411.11|1.0530|C|TurboHist|
96 | | 7:hist_8_64 8 bins/64 bits| 3928.09|0.9342|C|TurboHist|
97 | | 8:histr_4_64 4/64+run |18998.87|0.1868|C|TurboHist|
98 | | 9:histr_8_64 8/64+run |**19629.28**|**0.1869**|C|TurboHist|
99 | |10:hist_4_128 4 bins/sse4.1 | 3365.40|1.0717|C|TurboHist|
100 | |11:hist_8_128 8 bins/sse4.1 | 3632.61|0.9950|C|TurboHist|
101 | |12:hist_4_256 4 bins/avx2 | 3112.15|1.1576|C|TurboHist|
102 | |13:hist_8_256 8 bins/avx2 | 3497.08|1.0205|C|TurboHist|
103 | |15:hist_8_64asm inline asm |4089.97|0.8817|inline asm|TurboHist|
104 | |18:count2x64 inline asm | 3881.98|0.9158|inline asm|Countbench|
105 | |20:histo_ref | 882.93|4.1072|C|Ryg|
106 | |21:histo_cpp_1x | 873.20|4.1069|C|Ryg|
107 | |22:histo_cpp_2x | 1720.19|2.0961|C|Ryg|
108 | |23:histo_cpp_4x | 1866.99|2.0817|C|Ryg|
109 | |24:histo_asm_scalar4 | 2995.84|1.1942|asm|Ryg|
110 | |25:histo_asm_scalar8 | 3107.30|1.1618|asm|Ryg|
111 | |26:histo_asm_scalar8_var | 3288.67|1.1143|asm|Ryg|
112 | |27:histo_asm_scalar8_var2 | 3290.92|1.0957|asm|Ryg|
113 | |28:histo_asm_scalar8_var3 | 3707.41|0.9763|asm|Ryg|
114 | |29:histo_asm_scalar8_var4 | 3988.01|0.9019|asm|Ryg|
115 | |30:histo_asm_scalar8_var5 |14076.09|0.2564|asm|Ryg|
116 | |31:histo_asm_sse4 | 3020.32|1.1975|asm|Ryg|
117 | |32:memcpy |14057.53|0.2636|C|
118 |
119 | (**bold** = Pareto frontier) MB=1.000.000
120 | - [Ryg](https://gist.github.com/rygorous/a86a5cf348922cdea357c928e32fc7e0)
121 | - [Countbench](https://github.com/nkurz/countbench)
122 |
123 | ## Compile:
124 |
125 |
126 | make
127 | or
128 | make AVX2=1
129 |
130 | ## Usage:
131 |
132 |
133 | turbohist [-e#] file [-I#] [-z]
134 | options:
135 | -e# # = function numbers separated by ,
136 | -I# # = number of iterations
137 | set to -I15 for accurate timings
138 | -z set read buffer to zeros
139 |
140 | ### Examples:
141 |
142 | ./turbohist file
143 | ./turbohist file -e1,7,9
144 |
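The benchmark id parser in turbohist.c also accepts ranges (e.g. `1-13`), so selecting a whole block of functions should work as:

    ./turbohist file -e1-13 -I15
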
145 | ### Environment:
146 | ###### OS/Compiler (32 + 64 bits):
147 | - Windows: MinGW-w64 makefile
148 | - Linux amd/intel: GNU GCC (>=4.6)
149 | - Linux amd/intel: Clang (>=3.2)
150 | - Linux arm: aarch64 ARMv8: gcc (>=6.3)
151 | - macOS: Xcode (>=9) + Apple M1
152 | - PowerPC ppc64le: gcc (>=8.0)
153 |
154 | Last update: 01 JAN 2022
155 |
--------------------------------------------------------------------------------
/turbohist.c:
--------------------------------------------------------------------------------
1 | /**
2 | Copyright (c) 2013-2022, Powturbo
3 | - homepage : https://sites.google.com/site/powturbo/
4 | - github : https://github.com/powturbo
5 | - twitter : https://twitter.com/powturbo
6 | - email : powturbo [_AT_] gmail [_DOT_] com
7 | All rights reserved.
8 |
9 | Redistribution and use in source and binary forms, with or without
10 | modification, are permitted provided that the following conditions are
11 | met:
12 |
13 | 1. Redistributions of source code must retain the above copyright
14 | notice, this list of conditions and the following disclaimer.
15 |
16 | 2. Redistributions in binary form must reproduce the above copyright
17 | notice, this list of conditions and the following disclaimer in the
18 | documentation and/or other materials provided with the distribution.
19 |
20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
21 | IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
22 | TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
23 | PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
24 | HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
25 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
26 | TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
27 | PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
28 | LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
29 | NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
30 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31 | **/
32 | // Turbo histogram benchmark
33 | #include <stdio.h>
34 | #include <stdlib.h>
35 | #include <string.h>
36 | #ifdef __APPLE__
37 | #include <sys/malloc.h>
38 | #else
39 | #include <malloc.h>
40 | #endif
41 | #ifdef _MSC_VER
42 | #include "vs/getopt.h"
43 | #else
44 | #include <getopt.h>
45 | #include <ctype.h>
46 | #endif
47 | #include "conf.h"
48 | #include "time_.h"
49 |
50 | #include "turbohist_.c"
51 | #ifdef _RYG
52 | #include "histotest.cpp"
53 | #endif
54 |
55 | NOINLINE void libmemcpy(unsigned char *dst, unsigned char *src, int len) {
56 |   void *(*memcpy_ptr)(void *, const void *, size_t) = memcpy;
57 |   if (time(NULL) == 1)    // never true in practice: keeps memcpy_ptr opaque to the optimizer,
58 |     memcpy_ptr = NULL;    // so the memcpy baseline cannot be inlined or elided by the compiler
59 |   memcpy_ptr(dst, src, len);
60 | }
61 |
62 | void usage(char *pgm) {
63 | fprintf(stderr, "\nTurboHist Copyright (c) 2013-2022 Powturbo %s\n", __DATE__);
64 | fprintf(stderr, "Usage: %s [options] [file]\n", pgm);
65 | fprintf(stderr, "Benchmark:\n");
66 | fprintf(stderr, " -I# # = Number of runs (default=3)\n");
67 | fprintf(stderr, " -z set the read buffer to zeros\n");
68 | fprintf(stderr, "Ex. ./turbohist file -I15\n");
69 | fprintf(stderr, " ./turbohist file -I15 -z\n");
70 | fprintf(stderr, " ./turbohist file -e1,4,8,15 -I15\n");
71 | exit(0);
72 | }
73 |
74 | int check(unsigned *cnt, unsigned n, unsigned *scnt) { unsigned i; for(i=0;i<256;i++) if(cnt[i]!=scnt[i]) { printf("Error sum at %d ", i); return 0; } printf(" %s", TM_MBS); return 1; }
75 |
76 | int bench(unsigned char *in, unsigned n, unsigned *cnt, unsigned id, unsigned *scnt) {
77 | switch(id) {
78 | case 1: TMBENCH(" 1:hist_1_8 naiv 8 bits", hist_1_8( in, n, cnt),n); break;
79 | case 2: TMBENCH(" 2:hist_4_8 4 bins/ 8 bits", hist_4_8( in, n, cnt),n); break;
80 | case 3: TMBENCH(" 3:hist_8_8 8 bins/ 8 bits", hist_8_8( in, n, cnt),n); break;
81 | case 4: TMBENCH(" 4:hist_4_32 4 bins/32 bits", hist_4_32( in, n, cnt),n); break;
82 | case 5: TMBENCH(" 5:hist_8_32 8 bins/32 bits", hist_8_32( in, n, cnt),n); break;
83 | case 6: TMBENCH(" 6:hist_4_64 4 bins/64 bits", hist_4_64( in, n, cnt),n); break;
84 | case 7: TMBENCH(" 7:hist_8_64 8 bins/64 bits", hist_8_64( in, n, cnt),n); break;
85 | case 8: TMBENCH(" 8:histr_4_64 4/64+run ", histr_4_64( in, n, cnt),n); break;
86 | case 9: TMBENCH(" 9:histr_8_64 8/64+run ", histr_8_64( in, n, cnt),n); break;
87 | #ifdef __ARM_NEON
88 | case 10: TMBENCH("10:hist_4_128 4 bins/neon ", hist_4_128( in, n, cnt),n); break;
89 | case 11: TMBENCH("11:hist_8_128 8 bins/neon ", hist_8_128( in, n, cnt),n); break;
90 | #else
91 | case 10: TMBENCH("10:hist_4_128 4 bins/sse4.1 ", hist_4_128( in, n, cnt),n); break;
92 | case 11: TMBENCH("11:hist_8_128 8 bins/sse4.1 ", hist_8_128( in, n, cnt),n); break;
93 | #endif
94 | #ifdef __AVX2__
95 | case 12: TMBENCH("12:hist_4_256 4 bins/avx2 ", hist_4_256( in, n, cnt),n); break;
96 | case 13: TMBENCH("13:hist_8_256 8 bins/avx2 ", hist_8_256( in, n, cnt),n); break;
97 | #endif
98 | #ifdef __x86_64
99 | case 15: TMBENCH("15:hist_8_64asm inline asm ", hist_8_64a( in, n, cnt),n); break;
100 | #endif
101 | #ifdef _COUNTBENCH
102 | case 18: TMBENCH("18:count2x64 inline asm ", count2x64( in, n, cnt),n); break;
103 | // case 19: TMBENCH("19:count2x64c ", count2x64c( in, n, cnt),n); break;
104 | #endif
105 | #ifdef _RYG
106 | case 20: TMBENCH("20:histo_ref ", histo_ref( cnt, in, n),n); break;
107 | case 21: TMBENCH("21:histo_cpp_1x ", histo_cpp_1x( cnt, in, n),n); break;
108 | case 22: TMBENCH("22:histo_cpp_2x ", histo_cpp_2x( cnt, in, n),n); break;
109 | case 23: TMBENCH("23:histo_cpp_4x ", histo_cpp_4x( cnt, in, n),n); break;
110 | case 24: TMBENCH("24:histo_asm_scalar4 ", histo_asm_scalar4( cnt, in, n),n); break;
111 | case 25: TMBENCH("25:histo_asm_scalar8 ", histo_asm_scalar8( cnt, in, n),n); break;
112 | case 26: TMBENCH("26:histo_asm_scalar8_var ", histo_asm_scalar8_var( cnt, in, n),n); break;
113 | case 27: TMBENCH("27:histo_asm_scalar8_var2 ", histo_asm_scalar8_var2(cnt, in, n),n); break;
114 | case 28: TMBENCH("28:histo_asm_scalar8_var3 ", histo_asm_scalar8_var3(cnt, in, n),n); break;
115 | case 29: TMBENCH("29:histo_asm_scalar8_var4 ", histo_asm_scalar8_var4(cnt, in, n),n); break;
116 | case 30: TMBENCH("30:histo_asm_scalar8_var5 ", histo_asm_scalar8_var5(cnt, in, n),n); break;
117 | case 31: TMBENCH("31:histo_asm_sse4 ", histo_asm_sse4( cnt, in, n),n); break;
118 | #ifdef __AVX2__
119 | case 37: TMBENCH("37:histo_asm_avx256_8x_1 ", histo_asm_avx256_8x_1( cnt, in, n),n); break;
120 | case 38: TMBENCH("38:histo_asm_avx256_8x_2 ", histo_asm_avx256_8x_2( cnt, in, n),n); break;
121 | case 39: TMBENCH("39:histo_asm_avx256_8x_3 ", histo_asm_avx256_8x_3( cnt, in, n),n); break;
122 | #endif
123 | #endif
124 | case 32: { unsigned char *cpy = malloc(n); if(cpy) { TMBENCH("32:memcpy ", libmemcpy(cpy, in, n),n); free(cpy); printf(" %s", TM_MBS); } } return 0; break;
125 | #define ID_LAST 32
126 | default: return 0;
127 | }
128 | check(cnt,n,scnt);
129 | return 1;
130 | }
131 |
132 | int main(int argc, char *argv[]) {
133 | unsigned char *finame = argv[1], *scmd = NULL, *in;
134 | unsigned n, fno, zero=0, scnt[256], cnt[256];
135 |
136 | int c, digit_optind = 0;
137 | for(;;) {
138 | int this_option_optind = optind ? optind : 1;
139 | int option_index = 0;
140 | static struct option long_options[] = {
141 | { "help", 0, 0, 'h'},
142 | { 0, 0, 0, 0}
143 | };
144 | if((c = getopt_long(argc, argv, "e:hI:z", long_options, &option_index)) == -1) break;
145 | switch(c) {
146 | case 0:
147 | printf("Option %s", long_options[option_index].name);
148 | if(optarg) printf (" with arg %s", optarg); printf ("\n");
149 | break;
150 | case 'I': if((tm_Rep = atoi(optarg))<=0) tm_rep =tm_Rep =1; break;
151 | case 'z': zero++; break;
152 | case 'e': scmd = optarg; break;
153 | case 'h':
154 | default:
155 | usage(argv[0]);
156 | exit(0);
157 | }
158 | }
159 |
160 | printf("\nTurboHist Copyright (c) 2013-2022 Powturbo %s\n", __DATE__);
161 | char _scmd[33];
162 | sprintf(_scmd, "1-%d", ID_LAST);
163 |
164 | for(fno = optind; fno < argc; fno++) {
165 | finame = argv[fno];
166 |
167 | FILE *fi = fopen(finame, "rb");
168 | if(!fi) perror(finame), exit(1); // printf("'%s'\n", finame);
169 |
170 | fseek(fi, 0, SEEK_END);
171 | long long flen = ftell(fi);
172 | fseek(fi, 0, SEEK_SET);
173 |
174 | if(flen > GB) flen = GB;
175 | n = flen;
176 | if(!(in = (unsigned char*)malloc(n)))
177 | printf("malloc error\n"), exit(-1);
178 | n = fread(in, 1, n, fi);
179 | fclose(fi);
180 | if(n <= 0)
181 | exit(0);
182 |
183 | if(zero) memset(in, 0, n);
184 | hist_1_8(in, n, scnt); // reference histogram (scnt) used by check() to verify every benchmarked function
185 | unsigned char *p = (scmd && (scmd[0] != '0' || scmd[1]))?scmd:_scmd;
186 | do {
187 | int id = strtoul(p, &p, 10),idx = id, i;
188 | if(id >= 0) {
189 | while(isspace(*p)) p++; if(*p == '-') { if((idx = strtoul(p+1, &p, 10)) < id) idx = id; if(idx > ID_LAST) idx = ID_LAST; } //printf("ID=%d,%d ", id, idx);
190 | for(i = id; i <= idx; i++) {
191 | if(bench(in, n, cnt, i, scnt)) printf("\t%s\n", finame);
192 | }
193 | }
194 | } while(*p++);
195 | printf("\n");
196 | free(in);
197 | }
198 | }
199 |
--------------------------------------------------------------------------------
/time_.h:
--------------------------------------------------------------------------------
1 | /**
2 | Copyright (C) powturbo 2013-2022
3 | GPL v2 License
4 |
5 | This program is free software; you can redistribute it and/or modify
6 | it under the terms of the GNU General Public License as published by
7 | the Free Software Foundation; either version 2 of the License, or
8 | (at your option) any later version.
9 |
10 | This program is distributed in the hope that it will be useful,
11 | but WITHOUT ANY WARRANTY; without even the implied warranty of
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 | GNU General Public License for more details.
14 |
15 | You should have received a copy of the GNU General Public License along
16 | with this program; if not, write to the Free Software Foundation, Inc.,
17 | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
18 |
19 | - homepage : https://sites.google.com/site/powturbo/
20 | - github : https://github.com/powturbo
21 | - twitter : https://twitter.com/powturbo
22 | - email : powturbo [_AT_] gmail [_DOT_] com
23 | **/
24 | // time_.h : parameter free high precision time/benchmark functions
25 | #include <time.h>
26 | #include <float.h>
27 | #ifdef _WIN32
28 | #include <windows.h>
29 | #ifndef sleep
30 | #define sleep(n) Sleep((n) * 1000)
31 | #endif
32 | typedef unsigned __int64 uint64_t;
33 |
34 | #else
35 | #include <unistd.h>
36 | #include <sys/time.h>
37 | #define Sleep(ms) usleep((ms) * 1000)
38 | #endif
39 |
40 | #if defined (__i386__) || defined( __x86_64__ ) // ------------------ rdtsc --------------------------
41 | #ifdef _MSC_VER
42 | #include <intrin.h> // __rdtsc
43 | #else
44 | #include <x86intrin.h>
45 | #endif
46 |
47 | #ifdef __corei7__
48 | #define RDTSC_INI(_c_) do { unsigned _cl, _ch; \
49 | __asm volatile ("cpuid\n\t" \
50 | "rdtsc\n\t" \
51 | "mov %%edx, %0\n" \
52 | "mov %%eax, %1\n": "=r" (_ch), "=r" (_cl):: \
53 | "%rax", "%rbx", "%rcx", "%rdx"); \
54 | _c_ = (uint64_t)_ch << 32 | _cl; \
55 | } while(0)
56 |
57 | #define RDTSC(_c_) do { unsigned _cl, _ch; \
58 | __asm volatile("rdtscp\n" \
59 | "mov %%edx, %0\n" \
60 | "mov %%eax, %1\n" \
61 | "cpuid\n\t": "=r" (_ch), "=r" (_cl):: "%rax",\
62 | "%rbx", "%rcx", "%rdx");\
63 | _c_ = (uint64_t)_ch << 32 | _cl;\
64 | } while(0)
65 | #else
66 | /* #define RDTSC(_c_) do { unsigned _cl, _ch;\
67 | __asm volatile ("cpuid \n"\
68 | "rdtsc"\
69 | : "=a"(_cl), "=d"(_ch)\
70 | : "a"(0)\
71 | : "%ebx", "%ecx");\
72 | _c_ = (uint64_t)_ch << 32 | _cl;\
73 | } while(0)*/
74 | #define RDTSC(_c_) do { unsigned _cl, _ch;\
75 | __asm volatile("rdtsc" : "=a"(_cl), "=d"(_ch) );\
76 | _c_ = (uint64_t)_ch << 32 | _cl;\
77 | } while(0)
78 | #define RDTSC_INI(_c_) RDTSC(_c_)
79 | #endif
80 |
81 | #else // ------------------ time --------------------------
82 | #define RDTSC_INI(_c_)
83 | #define RDTSC(_c_)
84 | #endif
85 |
86 | #ifndef TM_F
87 | #define TM_F 1.0 // TM_F=4 -> MI/s
88 | #endif
89 |
90 | #ifdef _RDTSC //---------------------- rdtsc --------------------------------
91 | #define TM_M (CLOCKS_PER_SEC*1000000ull)
92 | #define TM_PRE 4
93 | #define TM_MBS "cycle/byte"
94 | static double TMBS(unsigned l, double t) { return (double)t/(double)l; }
95 |
96 | typedef uint64_t tm_t;
97 | static tm_t tmtime() { uint64_t c; RDTSC(c); return c; }
98 | static tm_t tminit() { uint64_t c; __asm volatile("" ::: "memory"); RDTSC_INI(c); return c; }
99 | static double tmdiff(tm_t start, tm_t stop) { return (double)(stop - start); }
100 | static int tmiszero(tm_t t) { return !t; }
101 | #else //---------------------- time -----------------------------------
102 | #define TM_M 1
103 | #define TM_PRE 2
104 | #define TM_MBS "MB/s"
105 | static double TMBS(unsigned l, double t) { return (l/t)/1000000.0; }
106 |
107 | #ifdef _WIN32 //-------- windows
108 | static LARGE_INTEGER tps;
109 |
110 | typedef unsigned __int64 tm_t;
111 | static tm_t tmtime() { LARGE_INTEGER tm; tm_t t; QueryPerformanceCounter(&tm); return tm.QuadPart; }
112 | static tm_t tminit() { tm_t t0,ts; QueryPerformanceFrequency(&tps); t0 = tmtime(); while((ts = tmtime())==t0) {}; return ts; }
113 | static double tmdiff(tm_t start, tm_t stop) { return (double)(stop - start)/tps.QuadPart; }
114 | static int tmiszero(tm_t t) { return !t; }
115 | #else // Linux & compatible / MacOS
116 | #ifdef __APPLE__
117 | #include <AvailabilityMacros.h>
118 | #ifndef MAC_OS_X_VERSION_10_12
119 | #define MAC_OS_X_VERSION_10_12 101200
120 | #endif
121 | #define CIVETWEB_APPLE_HAVE_CLOCK_GETTIME (defined(__APPLE__) && defined(MAC_OS_X_VERSION_MIN_REQUIRED) && MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_12)
122 | #if !(CIVETWEB_APPLE_HAVE_CLOCK_GETTIME)
123 | #include <sys/time.h>
124 | #define CLOCK_REALTIME 0
125 | #define CLOCK_MONOTONIC 0
126 | int clock_gettime(int /*clk_id*/, struct timespec* t) {
127 | struct timeval now;
128 | int rv = gettimeofday(&now, NULL);
129 | if (rv) return rv;
130 | t->tv_sec = now.tv_sec;
131 | t->tv_nsec = now.tv_usec * 1000;
132 | return 0;
133 | }
134 | #endif
135 | #endif
136 |
137 | typedef struct timespec tm_t;
138 | static tm_t tmtime() { struct timespec tm; clock_gettime(CLOCK_MONOTONIC, &tm); return tm; }
139 | static double tmdiff(tm_t start, tm_t stop) { return (stop.tv_sec - start.tv_sec) + (double)(stop.tv_nsec - start.tv_nsec)/1e9f; }
140 | static tm_t tminit() { tm_t t0 = tmtime(),t; while(!tmdiff(t = tmtime(),t0)) {}; return t; }
141 | static int tmiszero(tm_t t) { return !(t.tv_sec|t.tv_nsec); }
142 | #endif
143 | #endif
144 |
145 | //---------------------------------------- bench ----------------------------------------------------------------------
146 | // For each benchmark, the function call is repeated until tm_tx seconds are exceeded,
147 | // so a single run always lasts about tm_tx seconds.
148 | // The number of runs can be set with the program options -I and -J (specify -I15 -J15 for more precision)
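// Example (illustrative, mirroring the calls in turbohist.c): time one function over a buffer of n bytes:
//   unsigned cnt[256];
//   TMBENCH(" 7:hist_8_64 8 bins/64 bits", hist_8_64(in, n, cnt), n);  // prints MB/s, or cycles/byte with _RDTSC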
149 |
150 | // Sleep after every 8 runs to avoid cpu throttling.
151 | #define TMSLEEP do { tm_T = tmtime(); if(tmiszero(tm_0)) tm_0 = tm_T; else if(tmdiff(tm_0, tm_T) > tm_TX) { if(tm_verbose) { printf("S \b\b");fflush(stdout); } sleep(tm_slp); tm_0=tmtime();} } while(0)
152 |
153 | // benchmark loop
154 | #define TMBEG(_tm_Reps_) { unsigned _tm_r,_tm_c = 0,_tm_R,_tm_Rx = _tm_Reps_,_tm_Rn = _tm_Reps_; double _tm_t;\
155 | for(tm_rm = tm_rep, tm_tm = DBL_MAX, _tm_R = 0; _tm_R < _tm_Rn; _tm_R++) { tm_t _tm_t0 = tminit(); /*for each run*/\
156 | for(_tm_r = 0;_tm_r < tm_rm;) { /*repeat tm_rm times */
157 |
158 | #define TMEND(_len_) \
159 | _tm_r++; if(tm_tm == DBL_MAX && (_tm_t = tmdiff(_tm_t0, tmtime())) > tm_tx) break;\
160 | }\
161 | /*1st run: break the loop after tm_tx=1 sec, calculate a new repeat count 'tm_rm' to avoid calling time() after each function call*/\
162 | /*other runs: break the loop only after 'tm_rm' repeats */ \
163 | _tm_t = tmdiff(_tm_t0, tmtime());\
164 | /*set min time, recalculate repeats tm_rm based on tm_tx, recalculate number of runs based on tm_TX*/\
165 | if(_tm_t < tm_tm) { if(tm_tm == DBL_MAX) { tm_rm = _tm_r; _tm_Rn = tm_TX/_tm_t; _tm_Rn = _tm_Rn<_tm_Rx?_tm_Rn:_tm_Rx; /*printf("repeats=%u,%u,%.4f ", _tm_Rn, _tm_Rx, _tm_t);*/ } \
166 | tm_tm = _tm_t; _tm_c++;\
167 | } else if(_tm_t > tm_tm*1.15) TMSLEEP;/*force sleep at 15% divergence*/\
168 | if(tm_verbose) { printf("%8.*f %2d_%.2d\b\b\b\b\b\b\b\b\b\b\b\b\b\b",TM_PRE, TMBS(_len_, tm_tm/tm_rm),_tm_R+1,_tm_c),fflush(stdout); }\
169 | if((_tm_R & 7)==7) sleep(tm_slp); /*pause 20 secs after each 8 runs to avoid cpu throttling*/\
170 | }\
171 | }
172 |
173 | static unsigned tm_rep = 1u<<30, tm_Rep = 3, tm_Rep2 = 3, tm_rm, tm_RepMin = 1, tm_slp = 20, tm_verbose = 2;
174 | static tm_t tm_0, tm_T;
175 | static double tm_tm, tm_tx = 1.0*TM_M, tm_TX = 60.0*TM_M;
176 |
177 | static void tm_init(int _tm_Rep, int _tm_verbose) { tm_verbose = _tm_verbose; if(_tm_Rep) tm_Rep = _tm_Rep; }
178 |
179 | #define TMBENCH(_name_, _func_, _len_) do { if(tm_verbose>1) printf("%s ", _name_?_name_:#_func_);\
180 | TMBEG(tm_Rep) _func_; TMEND(_len_); \
181 | double dm = tm_tm, dr = tm_rm; if(tm_verbose) printf("%8.*f \b\b\b\b\b", TM_PRE, TMBS(_len_, dm/dr) );\
182 | } while(0)
183 |
184 | // second TMBENCH. Example: use TMBENCH for encoding and TMBENCH2 for decoding
185 | #define TMBENCH2(_name_, _func_, _len_) do { \
186 | TMBEG(tm_Rep2) _func_; TMEND(_len_);\
187 | double dm = tm_tm, dr = tm_rm; if(tm_verbose) printf("%8.*f \b\b\b\b\b", TM_PRE,TMBS(_len_, dm/dr) );\
188 | if(tm_verbose>1) printf("%s ", _name_?_name_:#_func_);\
189 | } while(0)
190 |
191 | // Check
192 | #define TMBENCHT(_name_,_func_, _len_, _res_) do { \
193 | TMBEG(tm_Rep) \
194 | if(_func_ != _res_) { printf("ERROR: %lld != %lld", (long long)_func_, (long long)_res_ ); exit(0); };\
195 | TMEND(_len_);\
196 | if(tm_verbose) printf("%8.*f \b\b\b\b\b", TM_PRE, TMBS(_len_,(double)tm_tm/(double)tm_rm) );\
197 | if(tm_verbose) printf("%s ", _name_?_name_:#_func_ );\
198 | } while(0)
199 |
200 | static void pr(unsigned l, unsigned n) {
201 | double r = (double)l*100.0/n;
202 | if(r>0.1) printf("%10u %6.2f%% ", l, r);
203 | else if(r>0.01) printf("%10u %7.3f%% ", l, r);
204 | else printf("%10u %8.4f%% ", l, r); fflush(stdout);
205 | }
206 |
207 | //----------------------------------------------------------------------------------------------------------------------------------
208 | #define Kb (1u<<10)
209 | #define Mb (1u<<20)
210 | #define Gb (1u<<30)
211 | #define KB 1000
212 | #define MB 1000000
213 | #define GB 1000000000
214 |
215 | static unsigned argtoi(char *s, unsigned def) {
216 | char *p;
217 | unsigned n = strtol(s, &p, 10),f = 1;
218 | switch(*p) {
219 | case 'K': f = KB; break;
220 | case 'M': f = MB; break;
221 | case 'G': f = GB; break;
222 | case 'k': f = Kb; break;
223 | case 'm': f = Mb; break;
224 | case 'g': f = Gb; break;
225 | case 'B': return n; break;
226 | case 'b': def = 0;
227 | default: if(!def) return n>=32?0xffffffffu:(1u << n); f = def;
228 | }
229 | return n*f;
230 | }
231 | static uint64_t argtol(char *s) {
232 | char *p;
233 | uint64_t n = strtol(s, &p, 10),f=1;
234 | switch(*p) {
235 | case 'K': f = KB; break;
236 | case 'M': f = MB; break;
237 | case 'G': f = GB; break;
238 | case 'k': f = Kb; break;
239 | case 'm': f = Mb; break;
240 | case 'g': f = Gb; break;
241 | case 'B': return n; break;
242 | case 'b': return 1ull << n;
243 | default: f = MB;
244 | }
245 | return n*f;
246 | }
247 |
248 | static uint64_t argtot(char *s) {
249 | char *p;
250 | uint64_t n = strtol(s, &p, 10),f=1;
251 | switch(*p) {
252 | case 'h': f = 3600000; break;
253 | case 'm': f = 60000; break;
254 | case 's': f = 1000; break;
255 | case 'M': f = 1; break;
256 | default: f = 1000;
257 | }
258 | return n*f;
259 | }
260 |
261 | static void memrcpy(unsigned char *out, unsigned char *in, unsigned n) { int i; for(i = 0; i < n; i++) out[i] = ~in[i]; }
262 |
263 |
--------------------------------------------------------------------------------
/conf.h:
--------------------------------------------------------------------------------
1 | /**
2 | Copyright (C) powturbo 2013-2019
3 | GPL v2 License
4 |
5 | This program is free software; you can redistribute it and/or modify
6 | it under the terms of the GNU General Public License as published by
7 | the Free Software Foundation; either version 2 of the License, or
8 | (at your option) any later version.
9 |
10 | This program is distributed in the hope that it will be useful,
11 | but WITHOUT ANY WARRANTY; without even the implied warranty of
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 | GNU General Public License for more details.
14 |
15 | You should have received a copy of the GNU General Public License along
16 | with this program; if not, write to the Free Software Foundation, Inc.,
17 | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
18 |
19 | - homepage : https://sites.google.com/site/powturbo/
20 | - github : https://github.com/powturbo
21 | - twitter : https://twitter.com/powturbo
22 | - email : powturbo [_AT_] gmail [_DOT_] com
23 | **/
24 |
25 | // conf.h - config & common
26 | #ifndef CONF_H
27 | #define CONF_H
28 | //------------------------- Compiler ------------------------------------------
29 | #if defined(__GNUC__)
30 | #include <stdint.h>
31 | #define ALIGNED(t,v,n) t v __attribute__ ((aligned (n)))
32 | #define ALWAYS_INLINE inline __attribute__((always_inline))
33 | #define NOINLINE __attribute__((noinline))
34 | #define _PACKED __attribute__ ((packed))
35 | #define likely(x) __builtin_expect((x),1)
36 | #define unlikely(x) __builtin_expect((x),0)
37 |
38 | #define popcnt32(_x_) __builtin_popcount(_x_)
39 | #define popcnt64(_x_) __builtin_popcountll(_x_)
40 |
41 | #if defined(__i386__) || defined(__x86_64__)
42 | //x,__bsr32: 1:0,2:1,3:1,4:2,5:2,6:2,7:2,8:3,9:3,10:3,11:3,12:3,13:3,14:3,15:3,16:4,17:4,18:4,19:4,20:4,21:4,22:4,23:4,24:4,25:4,26:4,27:4,28:4,29:4,30:4,31:4,32:5
43 | // x,bsr32: 0:0,1:1,2:2,3:2,4:3,5:3,6:3,7:3,8:4,9:4,10:4,11:4,12:4,13:4,14:4,15:4,16:5,17:5,18:5,19:5,20:5,21:5,22:5,23:5,24:5,25:5,26:5,27:5,28:5,29:5,30:5,31:5,32:6,
44 | static inline int __bsr32( int x) { asm("bsr %1,%0" : "=r" (x) : "rm" (x) ); return x; }
45 | static inline int bsr32( int x) { int b = -1; asm("bsrl %1,%0" : "+r" (b) : "rm" (x) ); return b + 1; }
46 | static inline int bsr64(uint64_t x ) { return x?64 - __builtin_clzll(x):0; }
47 | static inline int __bsr64(uint64_t x ) { return 63 - __builtin_clzll(x); }
48 |
49 | static inline unsigned rol32(unsigned x, int s) { asm ("roll %%cl,%0" :"=r" (x) :"0" (x),"c" (s)); return x; }
50 | static inline unsigned ror32(unsigned x, int s) { asm ("rorl %%cl,%0" :"=r" (x) :"0" (x),"c" (s)); return x; }
51 | static inline uint64_t rol64(uint64_t x, int s) { asm ("rolq %%cl,%0" :"=r" (x) :"0" (x),"c" (s)); return x; }
52 | static inline uint64_t ror64(uint64_t x, int s) { asm ("rorq %%cl,%0" :"=r" (x) :"0" (x),"c" (s)); return x; }
53 | #else
54 | static inline int __bsr32(unsigned x ) { return 31 - __builtin_clz( x); }
55 | static inline int bsr32(int x ) { return x?32 - __builtin_clz( x):0; }
56 | static inline int bsr64(uint64_t x) { return x?64 - __builtin_clzll(x):0; }
57 |
58 | static inline unsigned rol32(unsigned x, int s) { return x << s | x >> (32 - s); }
59 | static inline unsigned ror32(unsigned x, int s) { return x >> s | x << (32 - s); }
60 | static inline uint64_t rol64(uint64_t x, int s) { return x << s | x >> (64 - s); }
61 | static inline uint64_t ror64(uint64_t x, int s) { return x >> s | x << (64 - s); }
62 | #endif
63 |
64 | #define ctz64(_x_) __builtin_ctzll(_x_)
65 | #define ctz32(_x_) __builtin_ctz(_x_)     // 0:32  ctz32(1<<_x_) = _x_
66 | #define clz64(_x_) __builtin_clzll(_x_)
67 | #define clz32(_x_) __builtin_clz(_x_)
68 |
69 |
70 | #if __GNUC__ > 4 || __GNUC__ == 4 && __GNUC_MINOR__ >= 8
71 | #define bswap16(x) __builtin_bswap16(x)
72 | #else
73 | static inline unsigned short bswap16(unsigned short x) { return __builtin_bswap32(x << 16); }
74 | #endif
75 | #define bswap32(x) __builtin_bswap32(x)
76 | #define bswap64(x) __builtin_bswap64(x)
77 |
78 | #elif _MSC_VER //----------------------------------------------------
79 | #include <windows.h>
80 | #include <intrin.h>
81 | #if _MSC_VER < 1600
82 | #include "vs/stdint.h"
83 | #define __builtin_prefetch(x,a)
84 | #define inline __inline
85 | #else
86 | #include <stdint.h>
87 | #define __builtin_prefetch(x,a) _mm_prefetch(x, _MM_HINT_NTA)
88 | #endif
89 |
90 | #define ALIGNED(t,v,n) __declspec(align(n)) t v
91 | #define ALWAYS_INLINE __forceinline
92 | #define NOINLINE __declspec(noinline)
93 | #define THREADLOCAL __declspec(thread)
94 | #define likely(x) (x)
95 | #define unlikely(x) (x)
96 |
97 | static inline int __bsr32(unsigned x) { unsigned long z=0; _BitScanReverse(&z, x); return z; }
98 | static inline int bsr32( unsigned x) { unsigned long z; _BitScanReverse(&z, x); return x?z+1:0; }
99 | static inline int ctz32( unsigned x) { unsigned long z; _BitScanForward(&z, x); return x?z:32; }
100 | static inline int clz32( unsigned x) { unsigned long z; _BitScanReverse(&z, x); return x?31-z:32; }
101 | #if !defined(_M_ARM64) && !defined(_M_X64)
102 | static inline unsigned char _BitScanForward64(unsigned long* ret, uint64_t x) {
103 | unsigned long x0 = (unsigned long)x, top, bottom; _BitScanForward(&top, (unsigned long)(x >> 32)); _BitScanForward(&bottom, x0);
104 | *ret = x0 ? bottom : 32 + top; return x != 0;
105 | }
106 | static unsigned char _BitScanReverse64(unsigned long* ret, uint64_t x) {
107 | unsigned long x1 = (unsigned long)(x >> 32), top, bottom; _BitScanReverse(&top, x1); _BitScanReverse(&bottom, (unsigned long)x);
108 | *ret = x1 ? top + 32 : bottom; return x != 0;
109 | }
110 | #endif
111 | static inline int bsr64(uint64_t x) { unsigned long z=0; _BitScanReverse64(&z, x); return x?z+1:0; }
112 | static inline int ctz64(uint64_t x) { unsigned long z; _BitScanForward64(&z, x); return x?z:64; }
113 | static inline int clz64(uint64_t x) { unsigned long z; _BitScanReverse64(&z, x); return x?63-z:64; }
114 |
115 | #define rol32(x,s) _lrotl(x, s)
116 | #define ror32(x,s) _lrotr(x, s)
117 |
118 | #define bswap16(x) _byteswap_ushort(x)
119 | #define bswap32(x) _byteswap_ulong(x)
120 | #define bswap64(x) _byteswap_uint64(x)
121 |
122 | #define popcnt32(x) __popcnt(x)
123 | #ifdef _WIN64
124 | #define popcnt64(x) __popcnt64(x)
125 | #else
126 | #define popcnt64(x) (popcnt32(x) + popcnt32(x>>32))
127 | #endif
128 |
129 | #define sleep(x) Sleep(x/1000)
130 | #define fseeko _fseeki64
131 | #define ftello _ftelli64
132 | #define strcasecmp _stricmp
133 | #define strncasecmp _strnicmp
134 | #define strtoull _strtoui64
135 | static inline double round(double num) { return (num > 0.0) ? floor(num + 0.5) : ceil(num - 0.5); }
136 | #endif
137 |
138 | #define __bsr8(_x_) __bsr32(_x_)
139 | #define __bsr16(_x_) __bsr32(_x_)
140 | #define bsr8(_x_) bsr32(_x_)
141 | #define bsr16(_x_) bsr32(_x_)
142 | #define ctz8(_x_) ctz32(_x_)
143 | #define ctz16(_x_) ctz32(_x_)
144 | #define clz8(_x_) (clz32(_x_)-24)
145 | #define clz16(_x_) (clz32(_x_)-16)
146 |
147 | #define popcnt8(x) popcnt32(x)
148 | #define popcnt16(x) popcnt32(x)
149 |
150 | //--------------- Unaligned memory access -------------------------------------
151 | #ifdef UA_MEMCPY
152 | #include <string.h>
153 | static inline unsigned short ctou16(const void *cp) { unsigned short x; memcpy(&x, cp, sizeof(x)); return x; }
154 | static inline unsigned ctou32(const void *cp) { unsigned x; memcpy(&x, cp, sizeof(x)); return x; }
155 | static inline unsigned long long ctou64(const void *cp) { unsigned long long x; memcpy(&x, cp, sizeof(x)); return x; }
156 | static inline size_t ctousz(const void *cp) { size_t x; memcpy(&x, cp, sizeof(x)); return x; }
157 | static inline float ctof32(const void *cp) { float x; memcpy(&x, cp, sizeof(x)); return x; }
158 | static inline double ctof64(const void *cp) { double x; memcpy(&x, cp, sizeof(x)); return x; }
159 |
160 | static inline void stou16( void *cp, unsigned short x) { memcpy(cp, &x, sizeof(x)); }
161 | static inline void stou32( void *cp, unsigned x) { memcpy(cp, &x, sizeof(x)); }
162 | static inline void stou64( void *cp, unsigned long long x) { memcpy(cp, &x, sizeof(x)); }
163 | static inline void stousz( void *cp, size_t x) { memcpy(cp, &x, sizeof(x)); }
164 | static inline void stof32( void *cp, float x) { memcpy(cp, &x, sizeof(x)); }
165 | static inline void stof64( void *cp, double x) { memcpy(cp, &x, sizeof(x)); }
166 | #elif defined(__i386__) || defined(__x86_64__) || \
167 | defined(_M_IX86) || defined(_M_AMD64) || _MSC_VER ||\
168 | defined(__powerpc__) || defined(__s390__) ||\
169 | defined(__ARM_FEATURE_UNALIGNED) || defined(__aarch64__) || defined(__arm__) ||\
170 | defined(__ARM_ARCH_4__) || defined(__ARM_ARCH_4T__) || \
171 | defined(__ARM_ARCH_5__) || defined(__ARM_ARCH_5T__) || defined(__ARM_ARCH_5TE__) || defined(__ARM_ARCH_5TEJ__) || \
172 | defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) || defined(__ARM_ARCH_6K__) || defined(__ARM_ARCH_6T2__) || defined(__ARM_ARCH_6Z__) || defined(__ARM_ARCH_6ZK__)
173 | #define ctou16(_cp_) (*(unsigned short *)(_cp_))
174 | #define ctou32(_cp_) (*(unsigned *)(_cp_))
175 | #define ctof32(_cp_) (*(float *)(_cp_))
176 |
177 | #if defined(__i386__) || defined(__x86_64__) || defined(__powerpc__) || defined(__s390__) || defined(_MSC_VER)
178 | #define ctou64(_cp_) (*(uint64_t *)(_cp_))
179 | #define ctof64(_cp_) (*(double *)(_cp_))
180 | #elif defined(__ARM_FEATURE_UNALIGNED)
181 | struct _PACKED longu { uint64_t l; };
182 | struct _PACKED doubleu { double d; };
183 | #define ctou64(_cp_) ((struct longu *)(_cp_))->l
184 | #define ctof64(_cp_) ((struct doubleu *)(_cp_))->d
185 | #endif
186 |
187 | #elif defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A__) || defined(__ARM_ARCH_7M__) || defined(__ARM_ARCH_7R__) || defined(__ARM_ARCH_7S__)
188 | struct _PACKED shortu { unsigned short s; };
189 | struct _PACKED unsignedu { unsigned u; };
190 | struct _PACKED longu { uint64_t l; };
191 | struct _PACKED floatu { float f; };
192 | struct _PACKED doubleu { double d; };
193 |
194 | #define ctou16(_cp_) ((struct shortu *)(_cp_))->s
195 | #define ctou32(_cp_) ((struct unsignedu *)(_cp_))->u
196 | #define ctou64(_cp_) ((struct longu *)(_cp_))->l
197 | #define ctof32(_cp_) ((struct floatu *)(_cp_))->f
198 | #define ctof64(_cp_) ((struct doubleu *)(_cp_))->d
199 | #else
200 | #error "unknown cpu"
201 | #endif
202 |
203 | #define ctou24(_cp_) (ctou32(_cp_) & 0xffffff)
204 | #define ctou48(_cp_) (ctou64(_cp_) & 0xffffffffffffull)
205 | #define ctou8(_cp_) (*(_cp_))
206 | //--------------------- wordsize ----------------------------------------------
207 | #if defined(__64BIT__) || defined(_LP64) || defined(__LP64__) || defined(_WIN64) ||\
208 | defined(__x86_64__) || defined(_M_X64) ||\
209 | defined(__ia64) || defined(_M_IA64) ||\
210 | defined(__aarch64__) ||\
211 | defined(__mips64) ||\
212 | defined(__powerpc64__) || defined(__ppc64__) || defined(__PPC64__) ||\
213 | defined(__s390x__)
214 | #define __WORDSIZE 64
215 | #else
216 | #define __WORDSIZE 32
217 | #endif
218 | #endif
219 |
220 | //---------------------misc ---------------------------------------------------
221 | //#define bzhi63(_u_, _b_) ((_u_) & ((1ull<<(_b_))-1)) // _b_ < 64
222 | //#define bzhi63(_u_, _b_) ((_u_) & ((1u <<(_b_))-1)) // _b_ < 32
223 | #define BZHI64(_u_, _b_) (_b_ == 64?0xffffffffffffffffull:((_u_) & ((1ull<<(_b_))-1))) // b Constant
224 | #define BZHI32(_u_, _b_) (_b_ == 32? 0xffffffffu :((_u_) & ((1u <<(_b_))-1)))
225 | #define BZHI16(_u_, _b_) BZHI32(_u_, _b_)
226 | #define BZHI8( _u_, _b_) BZHI32(_u_, _b_)
227 |
228 | #ifdef __AVX2__
229 | #if defined(_MSC_VER) && !defined(__INTEL_COMPILER)
230 | #include <intrin.h>
231 | #else
232 | #include <x86intrin.h>
233 | #endif
234 | #define bzhi32(_u_, _b_) _bzhi_u32(_u_, _b_) // b variable
235 | #define bzhi31(_u_, _b_) _bzhi_u32(_u_, _b_)
236 |
237 | #if !(defined(_M_X64) || defined(__amd64__)) && (defined(__i386__) || defined(_M_IX86))
238 | #define bzhi64(_u_, _b_) BZHI64(_u_, _b_)
239 | #define bzhi63(_u_, _b_) ((_u_) & ((1ull<<(_b_))-1))
240 | #else
241 | #define bzhi64(_u_, _b_) _bzhi_u64(_u_, _b_)
242 | #define bzhi63(_u_, _b_) _bzhi_u64(_u_, _b_)
243 | #endif
244 | #else
245 | #define bzhi64(_u_, _b_) BZHI64(_u_, _b_)
246 | #define bzhi63(_u_, _b_) ((_u_) & ((1ull <<(_b_))-1))
247 | #define bzhi32(_u_, _b_) ((_u_) & ((1ull <<(_b_))-1))
248 | #define bzhi31(_u_, _b_) ((_u_) & ((1 <<(_b_))-1))
249 | #endif
250 |
251 | #define bzhi16(_u_, _b_) bzhi31(_u_, _b_)
252 | #define bzhi8( _u_, _b_) bzhi31(_u_, _b_)
253 |
254 |
255 | #define SIZE_ROUNDUP(_n_, _a_) (((size_t)(_n_) + (size_t)((_a_) - 1)) & ~(size_t)((_a_) - 1))
256 | #define ALIGN_DOWN(__ptr, __a) ((void *)((uintptr_t)(__ptr) & ~(uintptr_t)((__a) - 1)))
257 |
258 | #define TEMPLATE2_(_x_, _y_) _x_##_y_
259 | #define T2(_x_, _y_) TEMPLATE2_(_x_,_y_)
260 |
261 | #define TEMPLATE3_(_x_,_y_,_z_) _x_##_y_##_z_
262 | #define T3(_x_,_y_,_z_) TEMPLATE3_(_x_, _y_, _z_)
263 |
264 | #define CACHE_LINE_SIZE 64
265 | #define PREFETCH_DISTANCE (CACHE_LINE_SIZE*4)
266 |
267 | #define CLAMP(_x_, _low_, _high_) (((_x_) > (_high_)) ? (_high_) : (((_x_) < (_low_)) ? (_low_) : (_x_)))
268 |
269 | //--- NDEBUG -------
270 | #include
271 | #ifdef _MSC_VER
272 | #ifdef NDEBUG
273 | #define AS(expr, fmt, ...)
274 | #define AC(expr, fmt, ...) do { if(!(expr)) { fprintf(stderr, fmt, ##__VA_ARGS__ ); fflush(stderr); exit(-1); } } while(0)
275 | #define die(fmt, ...) do { fprintf(stderr, fmt, ##__VA_ARGS__ ); fflush(stderr); exit(-1); } while(0)
276 | #else
277 | #define AS(expr, fmt, ...) do { if(!(expr)) { fflush(stdout);fprintf(stderr, "%s:%s:%d:", __FILE__, __FUNCTION__, __LINE__); fprintf(stderr, fmt, ##__VA_ARGS__ ); fflush(stderr); exit(-1); } } while(0)
278 | #define AC(expr, fmt, ...) do { if(!(expr)) { fflush(stdout);fprintf(stderr, "%s:%s:%d:", __FILE__, __FUNCTION__, __LINE__); fprintf(stderr, fmt, ##__VA_ARGS__ ); fflush(stderr); exit(-1); } } while(0)
279 | #define die(fmt, ...) do { fprintf(stderr, "%s:%s:%d:", __FILE__, __FUNCTION__, __LINE__); fprintf(stderr, fmt, ##__VA_ARGS__ ); fflush(stderr); exit(-1); } while(0)
280 | #endif
281 | #else
282 | #ifdef NDEBUG
283 | #define AS(expr, fmt,args...)
284 | #define AC(expr, fmt,args...) do { if(!(expr)) { fprintf(stderr, fmt, ## args ); fflush(stderr); exit(-1); } } while(0)
285 | #define die(fmt,args...) do { fprintf(stderr, fmt, ## args ); fflush(stderr); exit(-1); } while(0)
286 | #else
287 | #define AS(expr, fmt,args...) do { if(!(expr)) { fflush(stdout);fprintf(stderr, "%s:%s:%d:", __FILE__, __FUNCTION__, __LINE__); fprintf(stderr, fmt, ## args ); fflush(stderr); exit(-1); } } while(0)
288 | #define AC(expr, fmt,args...) do { if(!(expr)) { fflush(stdout);fprintf(stderr, "%s:%s:%d:", __FILE__, __FUNCTION__, __LINE__); fprintf(stderr, fmt, ## args ); fflush(stderr); exit(-1); } } while(0)
289 | #define die(fmt,args...) do { fprintf(stderr, "%s:%s:%d:", __FILE__, __FUNCTION__, __LINE__); fprintf(stderr, fmt, ## args ); fflush(stderr); exit(-1); } while(0)
290 | #endif
291 | #endif
292 |
--------------------------------------------------------------------------------
/sse_neon.h:
--------------------------------------------------------------------------------
1 | /**
2 | Copyright (C) powturbo 2013-2019
3 | GPL v2 License
4 |
5 | This program is free software; you can redistribute it and/or modify
6 | it under the terms of the GNU General Public License as published by
7 | the Free Software Foundation; either version 2 of the License, or
8 | (at your option) any later version.
9 |
10 | This program is distributed in the hope that it will be useful,
11 | but WITHOUT ANY WARRANTY; without even the implied warranty of
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 | GNU General Public License for more details.
14 |
15 | You should have received a copy of the GNU General Public License along
16 | with this program; if not, write to the Free Software Foundation, Inc.,
17 | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
18 |
19 | - homepage : https://sites.google.com/site/powturbo/
20 | - github : https://github.com/powturbo
21 | - twitter : https://twitter.com/powturbo
22 | - email : powturbo [_AT_] gmail [_DOT_] com
23 | **/
24 | // intel sse to arm neon
25 |
26 | #ifndef _SSE_NEON_H_
27 | #define _SSE_NEON_H_
28 | #include "conf.h"
29 |
30 | #ifdef __ARM_NEON //--------------------------------------------------------------------------------------------------
31 | #include <arm_neon.h>
32 | #define __m128i uint32x4_t
33 |
34 | //#define USE_MACROS
35 | #define uint8x16_to_8x8x2(_a_) ((uint8x8x2_t) { vget_low_u8(_a_), vget_high_u8(_a_) })
36 |
37 | #ifdef USE_MACROS //---------------------------- Set : _mm_set_epi/_mm_set1_epi ----------------------------------------------------------
38 | #define _mm_set_epi8(u15,u14,u13,u12,\
39 | u11,u10, u9, u8,\
40 | u7,u6,u5,u4,\
41 | u3,u2,u1,u0) ({ uint8_t __attribute__((aligned(16))) _u[16] = { u0,u1,u2,u3,u4,u5,u6,u7,u8,u9,u10,u11,u12,u13,u14,u15 }; (uint32x4_t)vld1q_u8( _u);})
42 | #define _mm_set_epi16( u7,u6,u5,u4,\
43 | u3,u2,u1,u0) ({ uint16_t __attribute__((aligned(16))) _u[ 8] = { u0,u1,u2,u3,u4,u5,u6,u7 }; (uint32x4_t)vld1q_u16(_u);})
44 | #define _mm_set_epi32( u3,u2,u1,u0) ({ uint32_t __attribute__((aligned(16))) _u[ 4] = { u0,u1,u2,u3 }; vld1q_u32(_u);})
45 | #define _mm_set_epi64x( u1,u0) ({ uint64_t __attribute__((aligned(16))) _u[ 2] = { u0,u1 }; (uint32x4_t)vld1q_u64(_u);})
46 | #define _mm_set_epi32(u3, u2, u1, u0) vcombine_u32(vcreate_u32((uint64_t)u1 << 32 | u0), vcreate_u32((uint64_t)u3 << 32 | u2))
47 | #define _mm_set_epi64x(u1, u0) (__m128i)vcombine_u64(vcreate_u64(u0), vcreate_u64(u1))
48 | #else
49 | static ALWAYS_INLINE __m128i _mm_set_epi8( uint8_t u15, uint8_t u14, uint8_t u13, uint8_t u12, uint8_t u11, uint8_t u10, uint8_t u9, uint8_t u8,
50 | uint8_t u7, uint8_t u6, uint8_t u5, uint8_t u4,
51 | uint8_t u3, uint8_t u2, uint8_t u1, uint8_t u0) {
52 | uint8_t __attribute__((aligned(16))) u[16] = { u0,u1,u2,u3,u4,u5,u6,u7,u8,u9,u10,u11,u12,u13,u14,u15 }; return (uint32x4_t)vld1q_u8( u); }
53 | static ALWAYS_INLINE __m128i _mm_set_epi16( uint16_t u7, uint16_t u6, uint16_t u5, uint16_t u4,
54 | uint16_t u3, uint16_t u2, uint16_t u1, uint16_t u0) { uint16_t __attribute__((aligned(16))) u[ 8] = { u0,u1,u2,u3,u4,u5,u6,u7 }; return (uint32x4_t)vld1q_u16(u); }
55 | static ALWAYS_INLINE __m128i _mm_set_epi32( uint32_t u3, uint32_t u2, uint32_t u1, uint32_t u0) { uint32_t __attribute__((aligned(16))) u[ 4] = { u0,u1,u2,u3 }; return vld1q_u32(u); }
56 | static ALWAYS_INLINE __m128i _mm_set_epi64x( uint64_t u1, uint64_t u0) { uint64_t __attribute__((aligned(16))) u[ 2] = { u0,u1 }; return (uint32x4_t)vld1q_u64(u); }
57 | #endif
58 |
59 | #define _mm_set1_epi8( _u8_ ) (__m128i)vdupq_n_u8( _u8_ )
60 | #define _mm_set1_epi16( _u16_) (__m128i)vdupq_n_u16(_u16_)
61 | #define _mm_set1_epi32( _u32_) vdupq_n_u32(_u32_)
62 | #define _mm_set1_epi64x(_u64_) (__m128i)vdupq_n_u64(_u64_)
63 | #define _mm_setzero_si128() vdupq_n_u32( 0 )
64 | //---------------------------------------------- Arithmetic -----------------------------------------------------------------------
65 | #define _mm_add_epi8( _a_,_b_) (__m128i)vaddq_u8((uint8x16_t)(_a_), (uint8x16_t)(_b_))
66 | #define _mm_add_epi16( _a_,_b_) (__m128i)vaddq_u16((uint16x8_t)(_a_), (uint16x8_t)(_b_))
67 | #define _mm_add_epi32( _a_,_b_) vaddq_u32( _a_, _b_ )
68 | #define _mm_sub_epi16( _a_,_b_) (__m128i)vsubq_u16((uint16x8_t)(_a_), (uint16x8_t)(_b_))
69 | #define _mm_sub_epi32( _a_,_b_) (__m128i)vsubq_u32((uint32x4_t)(_a_), (uint32x4_t)(_b_))
70 | #define _mm_subs_epu8( _a_,_b_) (__m128i)vqsubq_u8((uint8x16_t)(_a_), (uint8x16_t)(_b_))
71 |
72 | #define _mm_mullo_epi32(_a_,_b_) (__m128i)vmulq_s32(( int32x4_t)(_a_), ( int32x4_t)(_b_))
73 | #define mm_mullo_epu32(_a_,_b_) vmulq_u32(_a_,_b_)
74 | #define _mm_mul_epu32( _a_,_b_) (__m128i)vmull_u32(vget_low_u32(_a_),vget_low_u32(_b_))
75 | #define _mm_adds_epu16( _a_,_b_) (__m128i)vqaddq_u16((uint16x8_t)(_a_),(uint16x8_t)(_b_))
76 | static ALWAYS_INLINE __m128i _mm_madd_epi16(__m128i a, __m128i b) {
77 | int32x4_t mlo = vmull_s16(vget_low_s16( (int16x8_t)a), vget_low_s16( (int16x8_t)b));
78 | int32x4_t mhi = vmull_s16(vget_high_s16((int16x8_t)a), vget_high_s16((int16x8_t)b));
79 | int32x2_t alo = vpadd_s32(vget_low_s32(mlo), vget_high_s32(mlo));
80 | int32x2_t ahi = vpadd_s32(vget_low_s32(mhi), vget_high_s32(mhi));
81 | return (__m128i)vcombine_s32(alo, ahi);
82 | }
83 | //---------------------------------------------- Special math functions -----------------------------------------------------------
84 | #define _mm_min_epu8( _a_,_b_) (__m128i)vminq_u8((uint8x16_t)(_a_), (uint8x16_t)(_b_))
85 | #define _mm_min_epu16( _a_,_b_) (__m128i)vminq_u16((uint16x8_t)(_a_), (uint16x8_t)(_b_))
86 | #define _mm_min_epi16( _a_,_b_) (__m128i)vminq_s16((int16x8_t)(_a_), (int16x8_t)(_b_))
87 | //---------------------------------------------- Logical --------------------------------------------------------------------------
88 | #define mm_testnz_epu32(_a_) vmaxvq_u32(_a_) //vaddvq_u32(_a_)
89 | #define mm_testnz_epu8(_a_) vmaxv_u8(_a_)
90 | #define _mm_or_si128( _a_,_b_) (__m128i)vorrq_u32( (uint32x4_t)(_a_), (uint32x4_t)(_b_))
91 | #define _mm_and_si128( _a_,_b_) (__m128i)vandq_u32( (uint32x4_t)(_a_), (uint32x4_t)(_b_))
92 | #define _mm_xor_si128( _a_,_b_) (__m128i)veorq_u32( (uint32x4_t)(_a_), (uint32x4_t)(_b_))
93 | //---------------------------------------------- Shift ----------------------------------------------------------------------------
94 | #define _mm_slli_epi16( _a_,_m_) (__m128i)vshlq_n_u16((uint16x8_t)(_a_), _m_)
95 | #define _mm_slli_epi32( _a_,_m_) (__m128i)vshlq_n_u32((uint32x4_t)(_a_), _m_)
96 | #define _mm_slli_epi64( _a_,_m_) (__m128i)vshlq_n_u64((uint64x2_t)(_a_), _m_)
97 | #define _mm_slli_si128( _a_,_m_) (__m128i)vextq_u8(vdupq_n_u8(0), (uint8x16_t)(_a_), 16 - (_m_) ) // _m_: 1 - 15
98 |
99 | #define _mm_srli_epi16( _a_,_m_) (__m128i)vshrq_n_u16((uint16x8_t)(_a_), _m_)
100 | #define _mm_srli_epi32( _a_,_m_) (__m128i)vshrq_n_u32((uint32x4_t)(_a_), _m_)
101 | #define _mm_srli_epi64( _a_,_m_) (__m128i)vshrq_n_u64((uint64x2_t)(_a_), _m_)
102 | #define _mm_srli_si128( _a_,_m_) (__m128i)vextq_s8((int8x16_t)(_a_), vdupq_n_s8(0), (_m_))
103 |
104 | #define _mm_srai_epi16( _a_,_m_) (__m128i)vshrq_n_s16((int16x8_t)(_a_), _m_)
105 | #define _mm_srai_epi32( _a_,_m_) (__m128i)vshrq_n_s32((int32x4_t)(_a_), _m_)
106 | #define _mm_srai_epi64( _a_,_m_) (__m128i)vshrq_n_s64((int64x2_t)(_a_), _m_)
107 |
108 | #define _mm_sllv_epi32( _a_,_b_) (__m128i)vshlq_u32((uint32x4_t)(_a_), (uint32x4_t)(_b_))
109 | #define _mm_srlv_epi32( _a_,_b_) (__m128i)vshlq_u32((uint32x4_t)(_a_), vnegq_s32((int32x4_t)(_b_)))
110 | //---------------------------------------------- Compare --------- true/false->1/0 (all bits set) ---------------------------------
111 | #define _mm_cmpeq_epi8( _a_,_b_) (__m128i)vceqq_s8( ( int8x16_t)(_a_), ( int8x16_t)(_b_))
112 | #define _mm_cmpeq_epi16(_a_,_b_) (__m128i)vceqq_s16(( int16x8_t)(_a_), ( int16x8_t)(_b_))
113 | #define _mm_cmpeq_epi32(_a_,_b_) (__m128i)vceqq_s32(( int32x4_t)(_a_), ( int32x4_t)(_b_))
114 |
115 | #define _mm_cmpgt_epi16(_a_,_b_) (__m128i)vcgtq_s16(( int16x8_t)(_a_), ( int16x8_t)(_b_))
116 | #define _mm_cmpgt_epi32(_a_,_b_) (__m128i)vcgtq_s32(( int32x4_t)(_a_), ( int32x4_t)(_b_))
117 |
118 | #define _mm_cmpgt_epu16(_a_,_b_) (__m128i)vcgtq_u16((uint16x8_t)(_a_), (uint16x8_t)(_b_))
119 | #define mm_cmpgt_epu32(_a_,_b_) (__m128i)vcgtq_u32( _a_, _b_)
120 | //---------------------------------------------- Load -----------------------------------------------------------------------------
121 | #define _mm_loadl_epi64( _u64p_) (__m128i)vcombine_s32(vld1_s32((int32_t const *)(_u64p_)), vcreate_s32(0))
122 | #define mm_loadu_epi64p( _u64p_,_a_) (__m128i)vld1q_lane_u64((uint64_t *)(_u64p_), (uint64x2_t)(_a_), 0)
123 | #define _mm_loadu_si128( _ip_) vld1q_u32((const uint32_t *)(_ip_))
124 | #define _mm_load_si128( _ip_) vld1q_u32((const uint32_t *)(_ip_))
125 | //---------------------------------------------- Store ----------------------------------------------------------------------------
126 | #define _mm_storel_epi64(_ip_,_a_) vst1q_lane_u64((uint64_t *)(_ip_), (uint64x2_t)(_a_), 0)
127 | #define _mm_storeu_si128(_ip_,_a_) vst1q_u32((uint32_t *)(_ip_),_a_)
128 | //---------------------------------------------- Convert --------------------------------------------------------------------------
129 | #define mm_cvtsi64_si128p(_u64p_,_a_) mm_loadu_epi64p(_u64p_,_a_)
130 | #define _mm_cvtsi64_si128(_a_) (__m128i)vdupq_n_u64(_a_) //vld1q_s64(_a_)
131 | //---------------------------------------------- Reverse bits/bytes ---------------------------------------------------------------
132 | #define mm_rbit_epi8(a) (__m128i)vrbitq_u8( (uint8x16_t)(a)) // reverse bits
133 | #define mm_rev_epi16(a) vrev16q_u8((uint8x16_t)(a)) // reverse bytes
134 | #define mm_rev_epi32(a) vrev32q_u8((uint8x16_t)(a))
135 | #define mm_rev_epi64(a) vrev64q_u8((uint8x16_t)(a))
136 | //--------------------------------------------- Insert/extract --------------------------------------------------------------------
137 | #define mm_extract_epi32x(_a_,_u32_,_id_) vst1q_lane_u32((uint32_t *)&(_u32_), _a_, _id_)
138 | #define _mm_extract_epi64x(_a_,_u64_,_id_) vst1q_lane_u64((uint64_t *)&(_u64_), (uint64x2_t)(_a_), _id_)
139 |
140 | #define _mm_extract_epi8(_a_, _id_) vgetq_lane_u8( (uint8x16_t)(_a_), _id_)
141 | #define _mm_extract_epi16(_a_, _id_) vgetq_lane_u16(_a_, _id_)
142 | #define _mm_extract_epi32(_a_, _id_) vgetq_lane_u32(_a_, _id_)
143 | #define mm_extract_epu32(_a_, _id_) vgetq_lane_u32(_a_, _id_)
144 | #define _mm_cvtsi128_si32(_a_) vgetq_lane_u32((uint32x4_t)(_a_),0)
145 | #define _mm_cvtsi128_si64(_a_) vgetq_lane_u64((uint64x2_t)(_a_),0)
146 |
147 | #define _mm_insert_epu32p(_a_,_u32p_,_id_) vsetq_lane_u32(*(uint32_t *)(_u32p_), _a_, _id_)
148 | #define mm_insert_epi32p(_a_,_u32p_,_id_) vld1q_lane_u32(_u32p_, (uint32x4_t)(_a_), _id_)
149 | #define _mm_cvtsi32_si128(_a_) (__m128i)vsetq_lane_s32(_a_, vdupq_n_s32(0), 0)
150 |
151 | #define _mm_blendv_epi8(_a_,_b_,_m_) vbslq_u32(_m_,_b_,_a_)
152 | //---------------------------------------------- Miscellaneous --------------------------------------------------------------------
153 | #define _mm_alignr_epi8(_a_,_b_,_m_) (__m128i)vextq_u8( (uint8x16_t)(_b_), (uint8x16_t)(_a_), _m_)
154 | #define _mm_packs_epi16( _a_,_b_) (__m128i)vcombine_s8( vqmovn_s16((int16x8_t)(_a_)), vqmovn_s16((int16x8_t)(_b_)))
155 | #define _mm_packs_epi32( _a_,_b_) (__m128i)vcombine_s16(vqmovn_s32((int32x4_t)(_a_)), vqmovn_s32((int32x4_t)(_b_)))
156 |
157 | #define _mm_packs_epu16( _a_,_b_) (__m128i)vcombine_u8(vqmovn_u16((uint16x8_t)(_a_)), vqmovn_u16((uint16x8_t)(_b_)))
158 | #define _mm_packus_epi16( _a_,_b_) (__m128i)vcombine_u8(vqmovun_s16((int16x8_t)(_a_)), vqmovun_s16((int16x8_t)(_b_)))
159 |
160 | static ALWAYS_INLINE uint16_t _mm_movemask_epi8(__m128i v) {
161 | const uint8x16_t __attribute__ ((aligned (16))) m = {1, 1<<1, 1<<2, 1<<3, 1<<4, 1<<5, 1<<6, 1<<7, 1, 1<<1, 1<<2, 1<<3, 1<<4, 1<<5, 1<<6, 1<<7};
162 | uint8x16_t mv = (uint8x16_t)vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(vandq_u8(vcltq_s8((int8x16_t)v, vdupq_n_s8(0)), m))));
163 | return vgetq_lane_u8(mv, 8) << 8 | vgetq_lane_u8(mv, 0);
164 | }
165 | //-------- Neon movemask ------ All lanes must be 0 or -1 (=0xff, 0xffff or 0xffffffff)
166 | #ifdef __aarch64__
167 | static ALWAYS_INLINE uint8_t mm_movemask_epi8s(uint8x8_t sv) { const uint8x8_t m = { 1, 1<<1, 1<<2, 1<<3, 1<<4, 1<< 5, 1<< 6, 1<<7 }; return vaddv_u8( vand_u8( sv, m)); } // short only ARM
168 | //static ALWAYS_INLINE uint16_t mm_movemask_epu16(uint32x4_t v) { const uint16x8_t m = { 1, 1<<2, 1<<4, 1<<6, 1<<8, 1<<10, 1<<12, 1<<14}; return vaddvq_u16(vandq_u16((uint16x8_t)v, m)); }
169 | static ALWAYS_INLINE uint16_t mm_movemask_epu16(__m128i v) { const uint16x8_t m = { 1, 1<<1, 1<<2, 1<<3, 1<<4, 1<< 5, 1<< 6, 1<<7 }; return vaddvq_u16(vandq_u16((uint16x8_t)v, m)); }
170 | static ALWAYS_INLINE uint32_t mm_movemask_epu32(__m128i v) { const uint32x4_t m = { 1, 1<<1, 1<<2, 1<<3 }; return vaddvq_u32(vandq_u32((uint32x4_t)v, m)); }
171 | static ALWAYS_INLINE uint64_t mm_movemask_epu64(__m128i v) { const uint64x2_t m = { 1, 1<<1 }; return vaddvq_u64(vandq_u64((uint64x2_t)v, m)); }
172 | #else
173 | static ALWAYS_INLINE uint32_t mm_movemask_epu32(uint32x4_t v) { const uint32x4_t mask = {1,2,4,8}, av = vandq_u32(v, mask), xv = vextq_u32(av, av, 2), ov = vorrq_u32(av, xv); return vgetq_lane_u32(vorrq_u32(ov, vextq_u32(ov, ov, 3)), 0); }
174 | #endif
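// Usage sketch (hedged): these helpers emulate SSE movemask for vectors whose
// lanes are all-ones or all-zero (e.g. compare results), packing one bit per
// lane into the low bits of the result:
//   uint32x4_t eq = vceqq_u32(a, b);        // each lane: 0 or 0xffffffff
//   unsigned   m  = mm_movemask_epu32(eq);  // bit i set <=> lane i of a == lane i of b
//   if(m == 0xf) { /* all four 32-bit lanes are equal */ }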
175 | // --------------------------------------------- Swizzle : _mm_shuffle_epi8 / _mm_shuffle_epi32 / Pack/Unpack -----------------------------------------
176 | #define _MM_SHUFFLE(u3,u2,u1,u0) ((u3) << 6 | (u2) << 4 | (u1) << 2 | (u0))
177 |
178 | #define _mm_shuffle_epi8(_a_, _b_) (__m128i)vqtbl1q_u8((uint8x16_t)(_a_), (uint8x16_t)(_b_))
179 | #if defined(__aarch64__)
180 | #define mm_shuffle_nnnn_epi32(_a_,_m_) (__m128i)vdupq_laneq_u32(_a_, _m_)
181 | #else
182 | #define mm_shuffle_nnnn_epi32(_a_,_m_) (__m128i)vdupq_n_u32(vgetq_lane_u32(_a_, _m_))
183 | #endif
184 |
185 | #ifdef USE_MACROS
186 | #define mm_shuffle_2031_epi32(_a_) ({ uint32x4_t _av = (uint32x4_t)vrev64q_u32(_a_); uint32x2x2_t _zv = vtrn_u32(vget_low_u32(_av), vget_high_u32(_av)); vcombine_u32(_zv.val[0], _zv.val[1]);})
187 | #define mm_shuffle_3120_epi32(_a_) ({ uint32x4_t _av = _a_; uint32x2x2_t _zv = vtrn_u32(vget_low_u32(_av), vget_high_u32(_av)); vcombine_u32(_zv.val[0], _zv.val[1]);})
188 | #else
189 | static ALWAYS_INLINE __m128i mm_shuffle_2031_epi32(__m128i a) { uint32x4_t v = (uint32x4_t)vrev64q_u32(a); uint32x2x2_t z = vtrn_u32(vget_low_u32(v), vget_high_u32(v)); return vcombine_u32(z.val[0], z.val[1]);}
190 | static ALWAYS_INLINE __m128i mm_shuffle_3120_epi32(__m128i a) { uint32x2x2_t z = vtrn_u32(vget_low_u32(a), vget_high_u32(a)); return vcombine_u32(z.val[0], z.val[1]);}
191 | #endif
192 |
193 | #if defined(USE_MACROS) || defined(__clang__)
194 | #define _mm_shuffle_epi32(_a_, _m_) ({ const uint32x4_t _av =_a_;\
195 | uint32x4_t _v = vmovq_n_u32(vgetq_lane_u32(_av, (_m_) & 0x3));\
196 | _v = vsetq_lane_u32(vgetq_lane_u32(_av, ((_m_) >> 2) & 0x3), _v, 1);\
197 | _v = vsetq_lane_u32(vgetq_lane_u32(_av, ((_m_) >> 4) & 0x3), _v, 2);\
198 | _v = vsetq_lane_u32(vgetq_lane_u32(_av, ((_m_) >> 6) & 0x3), _v, 3); _v;\
199 | })
200 | #define _mm_shuffle_epi32s(_a_, _m_) _mm_set_epi32(vgetq_lane_u32(_a_, ((_m_) ) & 0x3),\
201 | vgetq_lane_u32(_a_, ((_m_) >> 2) & 0x3),\
202 | vgetq_lane_u32(_a_, ((_m_) >> 4) & 0x3),\
203 | vgetq_lane_u32(_a_, ((_m_) >> 6) & 0x3))
204 | #else
205 | static ALWAYS_INLINE __m128i _mm_shuffle_epi32(__m128i _a_, const unsigned _m_) { const uint32x4_t _av =_a_;
206 | uint32x4_t _v = vmovq_n_u32(vgetq_lane_u32(_av, (_m_) & 0x3));
207 | _v = vsetq_lane_u32(vgetq_lane_u32(_av, ((_m_) >> 2) & 0x3), _v, 1);
208 | _v = vsetq_lane_u32(vgetq_lane_u32(_av, ((_m_) >> 4) & 0x3), _v, 2);
209 | _v = vsetq_lane_u32(vgetq_lane_u32(_av, ((_m_) >> 6) & 0x3), _v, 3);
210 | return _v;
211 | }
212 | static ALWAYS_INLINE __m128i _mm_shuffle_epi32s(__m128i _a_, const unsigned _m_) {
213 | return _mm_set_epi32(vgetq_lane_u32(_a_, ((_m_) ) & 0x3),
214 | vgetq_lane_u32(_a_, ((_m_) >> 2) & 0x3),
215 | vgetq_lane_u32(_a_, ((_m_) >> 4) & 0x3),
216 | vgetq_lane_u32(_a_, ((_m_) >> 6) & 0x3));
217 | }
218 | #endif
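// Usage sketch (hedged): _MM_SHUFFLE packs four 2-bit source-lane selectors,
// highest destination lane first, so reversing the 32-bit lanes of v is:
//   __m128i r = _mm_shuffle_epi32(v, _MM_SHUFFLE(0,1,2,3)); // r[0]=v[3], r[1]=v[2], r[2]=v[1], r[3]=v[0]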
219 | #ifdef USE_MACROS
220 | #define _mm_unpacklo_epi8( _a_,_b_) ({ uint8x8x2_t _zv = vzip_u8 ( vget_low_u8( (uint8x16_t)(_a_)), vget_low_u8 ((uint8x16_t)(_b_))); (uint32x4_t)vcombine_u8( _zv.val[0], _zv.val[1]);})
221 | #define _mm_unpacklo_epi16(_a_,_b_) ({ uint16x4x2_t _zv = vzip_u16( vget_low_u16((uint16x8_t)(_a_)), vget_low_u16((uint16x8_t)(_b_))); (uint32x4_t)vcombine_u16(_zv.val[0], _zv.val[1]);})
222 | #define _mm_unpacklo_epi32(_a_,_b_) ({ uint32x2x2_t _zv = vzip_u32( vget_low_u32( _a_ ), vget_low_u32( _b_ )); vcombine_u32(_zv.val[0], _zv.val[1]);})
223 | #define _mm_unpacklo_epi64(_a_,_b_) (uint32x4_t)vcombine_u64(vget_low_u64((uint64x2_t)(_a_)), vget_low_u64((uint64x2_t)(_b_)))
224 |
225 | #define _mm_unpackhi_epi8( _a_,_b_) ({ uint8x8x2_t _zv = vzip_u8 (vget_high_u8( (uint8x16_t)(_a_)), vget_high_u8( (uint8x16_t)(_b_))); (uint32x4_t)vcombine_u8( _zv.val[0], _zv.val[1]);})
226 | #define _mm_unpackhi_epi16(_a_,_b_) ({ uint16x4x2_t _zv = vzip_u16(vget_high_u16((uint16x8_t)(_a_)), vget_high_u16((uint16x8_t)(_b_))); (uint32x4_t)vcombine_u16(_zv.val[0], _zv.val[1]);})
227 | #define _mm_unpackhi_epi32(_a_,_b_) ({ uint32x2x2_t _zv = vzip_u32(vget_high_u32( _a_ ), vget_high_u32( _b_ )); vcombine_u32(_zv.val[0], _zv.val[1]);})
228 | #define _mm_unpackhi_epi64(_a_,_b_) (uint32x4_t)vcombine_u64(vget_high_u64((uint64x2_t)(_a_)), vget_high_u64((uint64x2_t)(_b_)))
229 | #else
230 | static ALWAYS_INLINE __m128i _mm_unpacklo_epi8( __m128i _a_, __m128i _b_) { uint8x8x2_t _zv = vzip_u8 ( vget_low_u8( (uint8x16_t)(_a_)), vget_low_u8 ((uint8x16_t)(_b_))); return (uint32x4_t)vcombine_u8( _zv.val[0], _zv.val[1]);}
231 | static ALWAYS_INLINE __m128i _mm_unpacklo_epi16(__m128i _a_, __m128i _b_) { uint16x4x2_t _zv = vzip_u16( vget_low_u16((uint16x8_t)(_a_)), vget_low_u16((uint16x8_t)(_b_))); return (uint32x4_t)vcombine_u16(_zv.val[0], _zv.val[1]);}
232 | static ALWAYS_INLINE __m128i _mm_unpacklo_epi32(__m128i _a_, __m128i _b_) { uint32x2x2_t _zv = vzip_u32( vget_low_u32( _a_ ), vget_low_u32( _b_ )); return vcombine_u32(_zv.val[0], _zv.val[1]);}
233 | static ALWAYS_INLINE __m128i _mm_unpacklo_epi64(__m128i _a_, __m128i _b_) { return (uint32x4_t)vcombine_u64(vget_low_u64((uint64x2_t)(_a_)), vget_low_u64((uint64x2_t)(_b_))); }
234 |
235 | static ALWAYS_INLINE __m128i _mm_unpackhi_epi8( __m128i _a_, __m128i _b_) { uint8x8x2_t _zv = vzip_u8 (vget_high_u8( (uint8x16_t)(_a_)), vget_high_u8( (uint8x16_t)(_b_))); return (uint32x4_t)vcombine_u8( _zv.val[0], _zv.val[1]); }
236 | static ALWAYS_INLINE __m128i _mm_unpackhi_epi16(__m128i _a_, __m128i _b_) { uint16x4x2_t _zv = vzip_u16(vget_high_u16((uint16x8_t)(_a_)), vget_high_u16((uint16x8_t)(_b_))); return (uint32x4_t)vcombine_u16(_zv.val[0], _zv.val[1]); }
237 | static ALWAYS_INLINE __m128i _mm_unpackhi_epi32(__m128i _a_, __m128i _b_) { uint32x2x2_t _zv = vzip_u32(vget_high_u32( _a_ ), vget_high_u32( _b_ )); return vcombine_u32(_zv.val[0], _zv.val[1]); }
238 | static ALWAYS_INLINE __m128i _mm_unpackhi_epi64(__m128i _a_, __m128i _b_) { return (uint32x4_t)vcombine_u64(vget_high_u64((uint64x2_t)(_a_)), vget_high_u64((uint64x2_t)(_b_))); }
239 | #endif
240 |
241 | #else //------------------------------------- intel SSE2/SSSE3 --------------------------------------------------------------
242 | #define mm_movemask_epu32(_a_) _mm_movemask_ps(_mm_castsi128_ps(_a_))
243 | #define mm_movemask_epu16(_a_) _mm_movemask_epi8(_a_)
244 | #define mm_loadu_epi64p( _u64p_,_a_) _a_ = _mm_cvtsi64_si128(ctou64(_u64p_))
245 |
246 | #define mm_extract_epu32( _a_, _id_) _mm_extract_epi32(_a_, _id_)
247 | #define mm_extract_epi32x(_a_,_u32_, _id_) _u32_ = _mm_extract_epi32(_a_, _id_)
248 | #define mm_extract_epi64x(_a_,_u64_, _id_) _u64_ = _mm_extract_epi64(_a_, _id_)
249 | #define mm_insert_epi32p( _a_,_u32p_,_c_) _mm_insert_epi32( _a_,ctou32(_u32p_),_c_)
250 |
251 | #define mm_mullo_epu32( _a_,_b_) _mm_mullo_epi32(_a_,_b_)
252 | #define mm_cvtsi64_si128p(_u64p_,_a_) _a_ = _mm_cvtsi64_si128(ctou64(_u64p_))
253 |
254 | #define mm_cmpgt_epu32( _a_, _b_) _mm_cmpgt_epi32(_mm_xor_si128(_a_, cv80000000), _mm_xor_si128(_b_, cv80000000))
255 |
256 | #define mm_shuffle_nnnn_epi32(_a_, _n_) _mm_shuffle_epi32(_a_, _MM_SHUFFLE(_n_,_n_,_n_,_n_))
257 | #define mm_shuffle_2031_epi32(_a_) _mm_shuffle_epi32(_a_, _MM_SHUFFLE(2,0,3,1))
258 | #define mm_shuffle_3120_epi32(_a_) _mm_shuffle_epi32(_a_, _MM_SHUFFLE(3,1,2,0))
259 |
260 | static ALWAYS_INLINE __m128i mm_rbit_epi8(__m128i v) { // reverse bits in bytes
261 | __m128i fv = _mm_set_epi8(15, 7,11, 3,13, 5, 9, 1,14, 6,10, 2,12, 4, 8, 0), cv0f_8 = _mm_set1_epi8(0xf);
262 | __m128i lv = _mm_shuffle_epi8(fv,_mm_and_si128( v, cv0f_8));
263 | __m128i hv = _mm_shuffle_epi8(fv,_mm_and_si128(_mm_srli_epi64(v, 4), cv0f_8));
264 | return _mm_or_si128(_mm_slli_epi64(lv,4), hv);
265 | }
266 |
267 | static ALWAYS_INLINE __m128i mm_rev_epi16(__m128i v) { return _mm_shuffle_epi8(v, _mm_set_epi8(14,15,12,13,10,11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1)); } // reverse vector bytes in uint??_t
268 | static ALWAYS_INLINE __m128i mm_rev_epi32(__m128i v) { return _mm_shuffle_epi8(v, _mm_set_epi8(12,13,14,15, 8, 9,10,11, 4, 5, 6, 7, 0, 1, 2, 3)); }
269 | static ALWAYS_INLINE __m128i mm_rev_epi64(__m128i v) { return _mm_shuffle_epi8(v, _mm_set_epi8( 8, 9,10,11,12,13,14,15, 0, 1, 2, 3, 4, 5, 6, 7)); }
270 | static ALWAYS_INLINE __m128i mm_rev_si128(__m128i v) { return _mm_shuffle_epi8(v, _mm_set_epi8( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15)); }
271 | #endif
272 | #endif
273 |
--------------------------------------------------------------------------------
/turbohist_.c:
--------------------------------------------------------------------------------
1 | /**
2 | Copyright (c) 2013-2022, Powturbo
3 | - homepage : https://sites.google.com/site/powturbo/
4 | - github : https://github.com/powturbo
5 | - twitter : https://twitter.com/powturbo
6 | - email : powturbo [_AT_] gmail [_DOT_] com
7 | All rights reserved.
8 |
9 | Redistribution and use in source and binary forms, with or without
10 | modification, are permitted provided that the following conditions are
11 | met:
12 |
13 | 1. Redistributions of source code must retain the above copyright
14 | notice, this list of conditions and the following disclaimer.
15 |
16 | 2. Redistributions in binary form must reproduce the above copyright
17 | notice, this list of conditions and the following disclaimer in the
18 | documentation and/or other materials provided with the distribution.
19 |
20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
21 | IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
22 | TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
23 | PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
24 | HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
25 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
26 | TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
27 | PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
28 | LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
29 | NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
30 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31 | **/
32 | // 1D histogram: hist[r]_X_Y  r: run-aware variant, X: number of count tables used, Y: processing-unit width in bits (8, 32, 64, 128 or 256)
33 | #include "conf.h"
34 | #ifdef __ARM_NEON
35 | #define PREFETCH(_ip_,_rw_)
36 | #else
37 | #define PREFETCH(_ip_,_rw_) __builtin_prefetch(_ip_,_rw_)
38 | #endif
39 |
40 | #define CSIZE (256 + 8)
41 | typedef unsigned cnt_t;
42 |
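// Rationale (hedged): the hist_X_* kernels below accumulate into several count
// tables c[X][CSIZE] instead of one, so that runs of identical bytes update
// different counters and do not serialize on a single cnt[b]++ dependency chain;
// the extra 8 entries in CSIZE presumably pad the tables apart to reduce
// cache-set aliasing between them. HISTEND* merges the partial tables at the end.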
43 | #if 1 // fast when auto-vectorization is enabled (e.g. gcc -O3)
44 | #define HISTEND(_c_,_cn_,_cnt_) { int _i,_j;\
45 | memset(_cnt_, 0, 256*sizeof(_cnt_[0]));\
46 | for(_i=0; _i < 256; _i++)\
47 | for(_j=0; _j < _cn_;_j++) _cnt_[_i] += _c_[_j][_i];\
48 | }
49 |
50 | #define HISTEND8(_c_,_cnt_) HISTEND(_c_,8,_cnt_)
51 | #define HISTEND4(_c_,_cnt_) HISTEND(_c_,4,_cnt_)
52 | #else
53 | static ALWAYS_INLINE void histend4(cnt_t c[4][CSIZE], cnt_t *__restrict cnt) { unsigned i;
54 | #ifdef __AVX2__
55 | for(i = 0; i != 256; i+=8) {
56 | __m256i sv = _mm256_load_si256((const __m256i *)&c[0][i]);
57 | sv = _mm256_add_epi32(_mm256_load_si256((const __m256i *)&c[1][i]), sv);
58 | sv = _mm256_add_epi32(_mm256_load_si256((const __m256i *)&c[2][i]), sv);
59 | sv = _mm256_add_epi32(_mm256_load_si256((const __m256i *)&c[3][i]), sv);
60 | _mm256_storeu_si256((__m256i *)&cnt[i], sv);
61 | }
62 | #elif defined(__SSE2__) || defined(__ARM_NEON)
63 | for(i = 0; i != 256; i+=4) {
64 | __m128i sv = _mm_load_si128((const __m128i *)&c[0][i]);
65 |     sv = _mm_add_epi32(_mm_load_si128((const __m128i *)&c[1][i]), sv);
66 |     sv = _mm_add_epi32(_mm_load_si128((const __m128i *)&c[2][i]), sv);
67 |     sv = _mm_add_epi32(_mm_load_si128((const __m128i *)&c[3][i]), sv);
68 | _mm_storeu_si128((__m128i *)&cnt[i], sv);
69 | }
70 | #else
71 | for(i = 0; i != 256; i++) cnt[i] = c[0][i]+c[1][i]+c[2][i]+c[3][i];
72 | #endif
73 | }
74 |
75 | static ALWAYS_INLINE void histend8(cnt_t c[8][CSIZE], cnt_t *__restrict cnt) { unsigned i;
76 | #ifdef __AVX2__
77 | for(i = 0; i != 256; i+=8) {
78 | __m256i v0 = _mm256_load_si256((const __m256i *)&c[0][i]);
79 | __m256i v1 = _mm256_load_si256((const __m256i *)&c[1][i]);
80 | __m256i s0 = _mm256_add_epi32(v0, v1);
81 | v0 = _mm256_load_si256((const __m256i *)&c[2][i]);
82 | v1 = _mm256_load_si256((const __m256i *)&c[3][i]);
83 | __m256i s1 = _mm256_add_epi32(v0, v1);
84 | s0 = _mm256_add_epi32(s0, s1);
85 |
86 | v0 = _mm256_load_si256((const __m256i *)&c[4][i]);
87 | v1 = _mm256_load_si256((const __m256i *)&c[5][i]);
88 | s1 = _mm256_add_epi32(v0, v1);
89 | v0 = _mm256_load_si256((const __m256i *)&c[6][i]);
90 | v1 = _mm256_load_si256((const __m256i *)&c[7][i]);
91 | s0 = _mm256_add_epi32(s0, v0);
92 | s1 = _mm256_add_epi32(s1, v1);
93 |
94 | _mm256_storeu_si256((__m256i *)&cnt[i], _mm256_add_epi32(s0, s1));
95 | }
96 | #elif defined(__SSE2__) || defined(__ARM_NEON)
97 | for(i = 0; i != 256; i+=4) {
98 | __m128i v0 = _mm_load_si128((const __m128i *)&c[0][i]);
99 | __m128i v1 = _mm_load_si128((const __m128i *)&c[1][i]);
100 | __m128i sv = _mm_add_epi32(v0, v1);
101 | v0 = _mm_load_si128((const __m128i *)&c[2][i]);
102 | v1 = _mm_load_si128((const __m128i *)&c[3][i]);
103 | sv = _mm_add_epi32(sv, v0);
104 | sv = _mm_add_epi32(sv, v1);
105 |
106 | v0 = _mm_load_si128((const __m128i *)&c[4][i]);
107 | v1 = _mm_load_si128((const __m128i *)&c[5][i]);
108 | sv = _mm_add_epi32(sv, v0);
109 | sv = _mm_add_epi32(sv, v1);
110 | v0 = _mm_load_si128((const __m128i *)&c[6][i]);
111 | v1 = _mm_load_si128((const __m128i *)&c[7][i]);
112 | sv = _mm_add_epi32(sv, v0);
113 | _mm_storeu_si128((__m128i *)&cnt[i], _mm_add_epi32(sv, v1));
114 | }
115 | #else
116 | for(i = 0; i != 256; i++) cnt[i] = c[0][i]+c[1][i]+c[2][i]+c[3][i]+c[4][i]+c[5][i]+c[6][i]+c[7][i];
117 | #endif
118 | }
119 |
120 | #define HISTEND8(_c_,_cnt_) histend8(_c_,_cnt_)
121 | #define HISTEND4(_c_,_cnt_) histend4(_c_,_cnt_)
122 | #endif
123 |
124 | //---------------------------- 8 bits ------------------------------------------------------
125 | static void hist_1_8(unsigned char *__restrict in, unsigned inlen, cnt_t *__restrict cnt) {
126 | unsigned char *ip = in;
127 |
128 | memset(cnt, 0, 256*sizeof(cnt[0]));
129 | while(ip < in+inlen) cnt[*ip++]++;
130 | }
131 |
132 | static void hist_4_8(unsigned char *__restrict in, unsigned inlen, cnt_t *__restrict cnt) {
133 | cnt_t c[4][CSIZE] = {0},i;
134 | unsigned char *ip = in;
135 |
136 | while(ip != in+(inlen&~(4-1))) c[0][*ip++]++, c[1][*ip++]++, c[2][*ip++]++, c[3][*ip++]++;
137 | while(ip != in+ inlen ) c[0][*ip++]++;
138 | HISTEND4(c, cnt);
139 | }
140 |
141 | static void hist_8_8(unsigned char *__restrict in, unsigned inlen, cnt_t *__restrict cnt) {
142 | cnt_t c[8][CSIZE] = {0},i;
143 | unsigned char *ip = in;
144 |
145 | while(ip != in+(inlen&~(8-1))) c[0][*ip++]++, c[1][*ip++]++, c[2][*ip++]++, c[3][*ip++]++, c[4][*ip++]++, c[5][*ip++]++, c[6][*ip++]++, c[7][*ip++]++;
146 | while(ip != in+ inlen ) c[0][*ip++]++;
147 | HISTEND8(c, cnt);
148 | }
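// Usage sketch (hedged; these functions are static and normally reached through
// turbohist.c, which includes this file):
//   unsigned char buf[65536];              // input bytes
//   cnt_t cnt[256];                        // one counter per byte value
//   /* ... fill buf ... */
//   hist_8_8(buf, sizeof(buf), cnt);       // cnt[b] = number of occurrences of byte b
//   /* the 256 counters sum to the input length */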
149 |
150 | //----------------------------- 32 bits --------------------------------------------------------
151 | #if defined(__i386__) || defined(__x86_64__)
152 | #define CU32(_u_,_i_,_c_) {\
153 | c[_i_+0][(unsigned char )(_u_) ]+=_c_;\
154 | c[_i_+1][(unsigned short)(_u_)>>8]+=_c_; _u_>>=16;\
155 | c[_i_+2][(unsigned char )(_u_) ]+=_c_;\
156 | c[_i_+3][(unsigned short)(_u_)>>8]+=_c_;\
157 | }
158 | #else
159 | #define CU32(_u_,_i_,_c_) {\
160 | c[_i_+0][(unsigned char) (_u_) ]+=_c_;\
161 | c[_i_+1][(unsigned char)((_u_)>> 8)]+=_c_;\
162 | c[_i_+2][(unsigned char)((_u_)>>16)]+=_c_;\
163 | c[_i_+3][ (_u_)>>24 ]+=_c_;\
164 | }
165 | #endif
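// Note (hedged): the x86 variant of CU32 keeps 16-bit intermediates so the
// compiler can address the high byte of a 16-bit register directly (compare the
// "movzbl %h.." forms in the inline-assembly version further down), saving a
// shift per odd byte; other targets simply shift and mask each byte.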
166 |
167 | #define UZ 4 // Load size 1x 32 bits = 4 bytes
168 | #define I132(_i_,_o_) { unsigned u1 = ctou32(ip+UZ+_i_*UZ*2+0); CU32(u0, 0, 1);\
169 | u0 = ctou32(ip+UZ+_i_*UZ*2+4); CU32(u1,_o_,1);\
170 | }
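// Note (hedged): I132 double-buffers the input stream: while the bytes of the
// previously loaded word u0 are being counted, the next 32-bit word u1 has
// already been loaded, so the table updates hide the latency of the next load.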
171 |
172 | #define N32 32
173 | static void hist_4_32(unsigned char *__restrict in, unsigned inlen, cnt_t *__restrict cnt) {
174 | #define IC 0
175 | cnt_t c[4][CSIZE] = {0}, i;
176 | unsigned char *ip = in;
177 |
178 | if(inlen >= UZ+N32) {
179 | unsigned u0 = ctou32(ip);
180 | for(; ip <= in+inlen-(UZ+N32); ip += N32) {
181 | I132(0,IC); I132(1,IC); I132(2,IC); I132(3,IC);
182 | PREFETCH(ip+512, 0);
183 | }
184 | }
185 | while(ip != in+inlen) c[0][*ip++]++;
186 | HISTEND4(c, cnt);
187 | }
188 |
189 | static void hist_8_32(unsigned char *__restrict in, unsigned inlen, unsigned *__restrict cnt) {
190 | #define IC 4
191 | cnt_t c[8][CSIZE] = {0}, i;
192 | unsigned char *ip = in;
193 |
194 | if(inlen >= UZ+N32) {
195 | unsigned u0 = ctou32(ip);
196 | for(; ip <= in+inlen-(UZ+N32); ip += N32) {
197 | I132(0,IC); I132(1,IC); I132(2,IC); I132(3,IC); //I132(4,IC); I132(5,IC); I132(6,IC); I132(7,IC);
198 | PREFETCH(ip+512, 0);
199 | }
200 | }
201 | while(ip != in+inlen) c[0][*ip++]++;
202 | HISTEND8(c, cnt);
203 | }
204 |
205 | //-------------------- 64 bits ---------------------------------------------------
206 | #if defined(__i386__) || defined(__x86_64__)
207 | #define CU64(_u_,_o_,_c_) { unsigned _x = _u_;\
208 | c[0 ][(unsigned char )_x ]+=_c_;\
209 | c[1 ][(unsigned short)_x>> 8]+=_c_; _x>>=16;\
210 | c[2 ][(unsigned char )_x ]+=_c_;\
211 | c[3 ][(unsigned short)_x>> 8]+=_c_; _x=(_u_)>>=32;\
212 | c[0+_o_][(unsigned char )_x ]+=_c_;\
213 | c[1+_o_][(unsigned short)_x>> 8]+=_c_; _x>>=16;\
214 | c[2+_o_][(unsigned char )_x ]+=_c_;\
215 | c[3+_o_][(unsigned short)_x>> 8]+=_c_;\
216 | }
217 | #else
218 | #define CU64(_u_,_o_,_c_) { unsigned _x = _u_;\
219 | c[0 ][(unsigned char) _x ]+=_c_;\
220 | c[1 ][(unsigned char)(_x>> 8)]+=_c_;\
221 | c[2 ][(unsigned char)(_x>>16)]+=_c_;\
222 | c[3 ][ _x>>24 ]+=_c_; _x=(_u_)>>=32;\
223 | c[0+_o_][(unsigned char) _x ]+=_c_;\
224 | c[1+_o_][(unsigned char)(_x>> 8)]+=_c_;\
225 | c[2+_o_][(unsigned char)(_x>>16)]+=_c_;\
226 | c[3+_o_][ _x>>24 ]+=_c_;\
227 | }
228 | #endif
229 |
230 | #define UZ 8 // Load size 1x 64 bits = 8 bytes
231 | #define I164(_i_,_o_) { uint64_t u1 = ctou64(ip+UZ+_i_*UZ*2+ 0); CU64(u0, _o_, 1);\
232 | u0 = ctou64(ip+UZ+_i_*UZ*2+ 8); CU64(u1, _o_, 1);\
233 | }
234 |
235 | #define N64 64
236 | static void hist_4_64(unsigned char *__restrict in, unsigned inlen, cnt_t *__restrict cnt) {
237 | #define IC 0
238 | cnt_t c[4][CSIZE] = {0}, i;
239 | unsigned char *ip = in;
240 |
241 | if(inlen >= UZ+N64) {
242 | uint64_t u0 = ctou64(ip);
243 | for(; ip <= in+inlen-(UZ+N64); ip += N64) {
244 | I164(0,IC); I164(1,IC); I164(2,IC); I164(3,IC);
245 | PREFETCH(ip+512, 0);
246 | }
247 | }
248 | while(ip != in+inlen) c[0][*ip++]++;
249 | HISTEND4(c, cnt);
250 | }
251 |
252 | static void hist_8_64(unsigned char *__restrict in, unsigned inlen, unsigned *__restrict cnt) {
253 | #define IC 4
254 | cnt_t c[8][CSIZE] = {0}, i;
255 | unsigned char *ip = in;
256 |
257 | if(inlen >= UZ+N64) {
258 | uint64_t u0 = ctou64(ip);
259 | for(; ip <= in+inlen-(UZ+N64); ip += N64) {
260 | I164(0,IC); I164(1,IC); I164(2,IC); I164(3,IC);
261 | PREFETCH(ip+512, 0);
262 | }
263 | }
264 | while(ip != in+inlen) c[0][*ip++]++;
265 | HISTEND8(c, cnt);
266 | }
267 |
268 | //----- hist_8_64a with inline assembly -----------------------------------------
269 | #ifdef __x86_64
270 | #define RSHR(r, b) __asm volatile ("shr %1, %0": "+r" (r): "i" (b) )
271 |
272 | #define CU16(x, u, offset, size, base, scale) \
273 | __asm volatile (\
274 | "movzbl %b1, %k0\n"\
275 | "incl (%c2+0)*%c3(%4, %0, %c5)\n"\
276 | "movzbl %h1, %k0\n"\
277 | "incl (%c2+1)*%c3(%4, %0, %c5)\n"\
278 | :"=&R" (x)\
279 | :"Q" (u), "i" (offset), "i" (size), "r" (base), "i" (scale) \
280 | :"memory"\
281 | )
282 |
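// Note (hedged): in CU16 the memory operand "(%c2+0)*%c3(%4, %0, %c5)" expands
// to  base + (offset+0)*CSIZE*4 + byte*4, i.e. the counter c[offset+0][byte],
// which the "incl" bumps in a single instruction; "movzbl %b1" / "movzbl %h1"
// extract the low and high byte of the current 16 bits of the 64-bit word.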
283 | #define N64 64
284 | unsigned hist_8_64a(unsigned char *in, unsigned inlen, unsigned *__restrict cnt) {
285 | unsigned c[8][CSIZE]= {0};
286 | unsigned char *ip = in;
287 |
288 | if(inlen >= 8+N64) {
289 | uint64_t u0 = ctou64(ip),b;
290 | for(; ip <= in+inlen-(8+N64); ip += N64) {
291 | uint64_t x, u1;
292 | #define ST(u) CU16(x, u, 0, CSIZE*4, c, 4);\
293 | RSHR(u, 16); CU16(x, u, 2, CSIZE*4, c, 4);\
294 | RSHR(u, 16); CU16(x, u, 4, CSIZE*4, c, 4);\
295 | RSHR(u, 16); CU16(x, u, 6, CSIZE*4, c, 4);
296 | u1 = ctou64(ip+8+ 0); ST(u0);
297 | u0 = ctou64(ip+8+ 8); ST(u1);
298 | u1 = ctou64(ip+8+16); ST(u0);
299 | u0 = ctou64(ip+8+24); ST(u1);
300 | u1 = ctou64(ip+8+32); ST(u0);
301 | u0 = ctou64(ip+8+40); ST(u1);
302 | u1 = ctou64(ip+8+48); ST(u0);
303 | u0 = ctou64(ip+8+56); ST(u1); PREFETCH(ip+768, 0);
304 | }
305 | }
306 | while(ip < in+inlen) c[0][*ip++]++;
307 | HISTEND8(c, cnt);
308 | }
309 | #endif
310 |
311 | #define UZ 16 // Load size 2x 64 bits = 2*8 bytes
312 | #define CR64(u,v,_o_,_c_) if(likely(u!=v)) { CU64(u,_o_,1); CU64(v,_o_,1); } else if((u^(v<<8)) < (1<<8)) c[_c_][(unsigned char)u]+=UZ; else CU64(u, _o_,2)
313 | #define I2R64(_i_,_o_) { uint64_t u1 = ctou64(ip+UZ+_i_*UZ*2+ 0), v1 = ctou64(ip+UZ+_i_*UZ*2+ 8); CR64(u0,v0,_o_,_i_);\
314 | u0 = ctou64(ip+UZ+_i_*UZ*2+16); v0 = ctou64(ip+UZ+_i_*UZ*2+24); CR64(u1,v1,_o_,_i_);\
315 | }
316 |
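// How the run-aware step works (hedged): CR64(u,v,..) compares two adjacent
// 64-bit words. If they differ, both are counted byte by byte. If they are
// equal, (u ^ (v<<8)) < (1<<8) is true exactly when every byte of the 16-byte
// block has the same value, so a single counter is bumped by UZ (=16) in one
// add; otherwise u is counted once with weight 2. Example: a run of 'A' bytes
// gives u == v == 0x4141414141414141, the xor test leaves only the low byte,
// and c[_c_]['A'] += 16 replaces sixteen scalar increments.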
317 | #define N64 64
318 | static void histr_4_64(unsigned char *__restrict in, unsigned inlen, cnt_t *__restrict cnt) {
319 | #define IC 0
320 | cnt_t c[4][CSIZE] = {0},i;
321 | unsigned char *ip = in,*in_;
322 |
323 | if(inlen >= UZ+N64) {
324 | uint64_t u0 = ctou64(ip), v0 = ctou64(ip+8);
325 | for(; ip <= in+inlen-(UZ+N64); ip += N64) {
326 | I2R64(0,IC); I2R64(1,IC);
327 | PREFETCH(ip+512, 0);
328 | }
329 | }
330 | while(ip != in+inlen)
331 | c[0][*ip++]++;
332 | HISTEND4(c, cnt);
333 | }
334 |
335 | static void histr_8_64(unsigned char *__restrict in, unsigned inlen, unsigned *__restrict cnt) {
336 | #define IC 4
337 | cnt_t c[8][CSIZE] = {0},i;
338 | unsigned char *ip = in,*in_;
339 |
340 | if(inlen >= UZ+N64) {
341 | uint64_t u0 = ctou64(ip), v0 = ctou64(ip+8);
342 | for(; ip <= in+inlen-(UZ+N64); ip += N64) {
343 | I2R64(0,IC); I2R64(1,IC);
344 | PREFETCH(ip+512, 0);
345 | }
346 | }
347 | while(ip != in+inlen) c[0][*ip++]++;
348 | HISTEND8(c, cnt);
349 | }
350 |
351 | #if defined(__SSE4_1__) || defined(__ARM_NEON) //---------- sse4.1 ---------------------------------------
352 | #ifdef __SSE4_1__
353 | #include <smmintrin.h>   // SSE4.1 intrinsics
354 | #else
355 | #include "sse_neon.h"
356 | #endif
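// Note: on ARM the sse_neon.h shim maps _mm_loadu_si128 to vld1q_u32 and
// _mm_extract_epi8 to vgetq_lane_u8, so the same 128-bit code below serves
// both the SSE4.1 and the NEON build.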
357 | static void hist_4_128(unsigned char *__restrict in, unsigned inlen, unsigned *__restrict cnt) {
358 | cnt_t c[4][CSIZE]={0},i;
359 |
360 | unsigned char *ip = in;
361 | if(inlen >= 32+64) {
362 | __m128i u0 = _mm_loadu_si128((__m128i*)ip), v0 = _mm_loadu_si128((__m128i*)(ip+16));
363 | for(; ip <= in+inlen-(32+64); ip += 64) {
364 | __m128i u1 = _mm_loadu_si128((__m128i*)(ip+32)), v1 = _mm_loadu_si128((__m128i*)(ip+32+16));
365 | c[0][_mm_extract_epi8(u0, 0)]++;
366 | c[1][_mm_extract_epi8(v0, 0)]++;
367 | c[2][_mm_extract_epi8(u0, 1)]++;
368 | c[3][_mm_extract_epi8(v0, 1)]++;
369 | c[0][_mm_extract_epi8(u0, 2)]++;
370 | c[1][_mm_extract_epi8(v0, 2)]++;
371 | c[2][_mm_extract_epi8(u0, 3)]++;
372 | c[3][_mm_extract_epi8(v0, 3)]++;
373 | c[0][_mm_extract_epi8(u0, 4)]++;
374 | c[1][_mm_extract_epi8(v0, 4)]++;
375 | c[2][_mm_extract_epi8(u0, 5)]++;
376 | c[3][_mm_extract_epi8(v0, 5)]++;
377 | c[0][_mm_extract_epi8(u0, 6)]++;
378 | c[1][_mm_extract_epi8(v0, 6)]++;
379 | c[2][_mm_extract_epi8(u0, 7)]++;
380 | c[3][_mm_extract_epi8(v0, 7)]++;
381 | c[0][_mm_extract_epi8(u0, 8)]++;
382 | c[1][_mm_extract_epi8(v0, 8)]++;
383 | c[2][_mm_extract_epi8(u0, 9)]++;
384 | c[3][_mm_extract_epi8(v0, 9)]++;
385 | c[0][_mm_extract_epi8(u0, 10)]++;
386 | c[1][_mm_extract_epi8(v0, 10)]++;
387 | c[2][_mm_extract_epi8(u0, 11)]++;
388 | c[3][_mm_extract_epi8(v0, 11)]++;
389 | c[0][_mm_extract_epi8(u0, 12)]++;
390 | c[1][_mm_extract_epi8(v0, 12)]++;
391 | c[2][_mm_extract_epi8(u0, 13)]++;
392 | c[3][_mm_extract_epi8(v0, 13)]++;
393 | c[0][_mm_extract_epi8(u0, 14)]++;
394 | c[1][_mm_extract_epi8(v0, 14)]++;
395 | c[2][_mm_extract_epi8(u0, 15)]++;
396 | c[3][_mm_extract_epi8(v0, 15)]++;
397 |
398 | u0 = _mm_loadu_si128((__m128i*)(ip+32+32)); v0 = _mm_loadu_si128((__m128i*)(ip+32+48));
399 | c[0][_mm_extract_epi8(u1, 0)]++;
400 | c[1][_mm_extract_epi8(v1, 0)]++;
401 | c[2][_mm_extract_epi8(u1, 1)]++;
402 | c[3][_mm_extract_epi8(v1, 1)]++;
403 | c[0][_mm_extract_epi8(u1, 2)]++;
404 | c[1][_mm_extract_epi8(v1, 2)]++;
405 | c[2][_mm_extract_epi8(u1, 3)]++;
406 | c[3][_mm_extract_epi8(v1, 3)]++;
407 | c[0][_mm_extract_epi8(u1, 4)]++;
408 | c[1][_mm_extract_epi8(v1, 4)]++;
409 | c[2][_mm_extract_epi8(u1, 5)]++;
410 | c[3][_mm_extract_epi8(v1, 5)]++;
411 | c[0][_mm_extract_epi8(u1, 6)]++;
412 | c[1][_mm_extract_epi8(v1, 6)]++;
413 | c[2][_mm_extract_epi8(u1, 7)]++;
414 | c[3][_mm_extract_epi8(v1, 7)]++;
415 | c[0][_mm_extract_epi8(u1, 8)]++;
416 | c[1][_mm_extract_epi8(v1, 8)]++;
417 | c[2][_mm_extract_epi8(u1, 9)]++;
418 | c[3][_mm_extract_epi8(v1, 9)]++;
419 | c[0][_mm_extract_epi8(u1, 10)]++;
420 | c[1][_mm_extract_epi8(v1, 10)]++;
421 | c[2][_mm_extract_epi8(u1, 11)]++;
422 | c[3][_mm_extract_epi8(v1, 11)]++;
423 | c[0][_mm_extract_epi8(u1, 12)]++;
424 | c[1][_mm_extract_epi8(v1, 12)]++;
425 | c[2][_mm_extract_epi8(u1, 13)]++;
426 | c[3][_mm_extract_epi8(v1, 13)]++;
427 | c[0][_mm_extract_epi8(u1, 14)]++;
428 | c[1][_mm_extract_epi8(v1, 14)]++;
429 | c[2][_mm_extract_epi8(u1, 15)]++;
430 | c[3][_mm_extract_epi8(v1, 15)]++; PREFETCH(ip+512, 0);
431 | }
432 | }
433 | while(ip < in+inlen) c[0][*ip++]++;
434 | HISTEND4(c, cnt);
435 | }
436 |
437 | static void hist_8_128(unsigned char *__restrict in, unsigned inlen, unsigned *__restrict cnt) {
438 | cnt_t c[8][CSIZE]={0},i;
439 |
440 | unsigned char *ip = in;
441 | if(inlen >= 32+64) {
442 | __m128i u0 = _mm_loadu_si128((__m128i*)ip), v0 = _mm_loadu_si128((__m128i*)(ip+16));
443 | for(; ip <= in+inlen-(32+64); ip += 64) {
444 | __m128i u1 = _mm_loadu_si128((__m128i*)(ip+32)), v1 = _mm_loadu_si128((__m128i*)(ip+32+16));
445 | c[0][_mm_extract_epi8(u0, 0)]++;
446 | c[1][_mm_extract_epi8(v0, 0)]++;
447 | c[2][_mm_extract_epi8(u0, 1)]++;
448 | c[3][_mm_extract_epi8(v0, 1)]++;
449 | c[4][_mm_extract_epi8(u0, 2)]++;
450 | c[5][_mm_extract_epi8(v0, 2)]++;
451 | c[6][_mm_extract_epi8(u0, 3)]++;
452 | c[7][_mm_extract_epi8(v0, 3)]++;
453 | c[0][_mm_extract_epi8(u0, 4)]++;
454 | c[1][_mm_extract_epi8(v0, 4)]++;
455 | c[2][_mm_extract_epi8(u0, 5)]++;
456 | c[3][_mm_extract_epi8(v0, 5)]++;
457 | c[4][_mm_extract_epi8(u0, 6)]++;
458 | c[5][_mm_extract_epi8(v0, 6)]++;
459 | c[6][_mm_extract_epi8(u0, 7)]++;
460 | c[7][_mm_extract_epi8(v0, 7)]++;
461 | c[0][_mm_extract_epi8(u0, 8)]++;
462 | c[1][_mm_extract_epi8(v0, 8)]++;
463 | c[2][_mm_extract_epi8(u0, 9)]++;
464 | c[3][_mm_extract_epi8(v0, 9)]++;
465 | c[4][_mm_extract_epi8(u0, 10)]++;
466 | c[5][_mm_extract_epi8(v0, 10)]++;
467 | c[6][_mm_extract_epi8(u0, 11)]++;
468 | c[7][_mm_extract_epi8(v0, 11)]++;
469 | c[0][_mm_extract_epi8(u0, 12)]++;
470 | c[1][_mm_extract_epi8(v0, 12)]++;
471 | c[2][_mm_extract_epi8(u0, 13)]++;
472 | c[3][_mm_extract_epi8(v0, 13)]++;
473 | c[4][_mm_extract_epi8(u0, 14)]++;
474 | c[5][_mm_extract_epi8(v0, 14)]++;
475 | c[6][_mm_extract_epi8(u0, 15)]++;
476 | c[7][_mm_extract_epi8(v0, 15)]++;
477 |
478 | u0 = _mm_loadu_si128((__m128i*)(ip+32+32)); v0 = _mm_loadu_si128((__m128i*)(ip+32+48));
479 | c[0][_mm_extract_epi8(u1, 0)]++;
480 | c[1][_mm_extract_epi8(v1, 0)]++;
481 | c[2][_mm_extract_epi8(u1, 1)]++;
482 | c[3][_mm_extract_epi8(v1, 1)]++;
483 | c[4][_mm_extract_epi8(u1, 2)]++;
484 | c[5][_mm_extract_epi8(v1, 2)]++;
485 | c[6][_mm_extract_epi8(u1, 3)]++;
486 | c[7][_mm_extract_epi8(v1, 3)]++;
487 | c[0][_mm_extract_epi8(u1, 4)]++;
488 | c[1][_mm_extract_epi8(v1, 4)]++;
489 | c[2][_mm_extract_epi8(u1, 5)]++;
490 | c[3][_mm_extract_epi8(v1, 5)]++;
491 | c[4][_mm_extract_epi8(u1, 6)]++;
492 | c[5][_mm_extract_epi8(v1, 6)]++;
493 | c[6][_mm_extract_epi8(u1, 7)]++;
494 | c[7][_mm_extract_epi8(v1, 7)]++;
495 | c[0][_mm_extract_epi8(u1, 8)]++;
496 | c[1][_mm_extract_epi8(v1, 8)]++;
497 | c[2][_mm_extract_epi8(u1, 9)]++;
498 | c[3][_mm_extract_epi8(v1, 9)]++;
499 | c[4][_mm_extract_epi8(u1, 10)]++;
500 | c[5][_mm_extract_epi8(v1, 10)]++;
501 | c[6][_mm_extract_epi8(u1, 11)]++;
502 | c[7][_mm_extract_epi8(v1, 11)]++;
503 | c[0][_mm_extract_epi8(u1, 12)]++;
504 | c[1][_mm_extract_epi8(v1, 12)]++;
505 | c[2][_mm_extract_epi8(u1, 13)]++;
506 | c[3][_mm_extract_epi8(v1, 13)]++;
507 | c[4][_mm_extract_epi8(u1, 14)]++;
508 | c[5][_mm_extract_epi8(v1, 14)]++;
509 | c[6][_mm_extract_epi8(u1, 15)]++;
510 | c[7][_mm_extract_epi8(v1, 15)]++; PREFETCH(ip+512, 0);
511 | }
512 | }
513 | while(ip < in+inlen) c[0][*ip++]++;
514 | HISTEND8(c, cnt);
515 | }
516 | #endif
517 |
518 | #ifdef __AVX2__ //---------------------------------- avx2 -----------------------------------------------
519 | #include <immintrin.h>   // AVX2 intrinsics
520 |
521 | #define UZ 64
522 | #define N256 128
523 |
524 | static void hist_4_256(unsigned char *__restrict in, unsigned inlen, unsigned *__restrict cnt) {
525 | cnt_t c[4][CSIZE]={0},i;
526 |
527 | unsigned char *ip = in;
528 | if(inlen >= UZ+N256) {
529 | __m256i u0 = _mm256_loadu_si256((__m256i*)ip), v0 = _mm256_loadu_si256((__m256i*)(ip+32));
530 | for(; ip <= in+((inlen-(UZ+N256))&~(N256-1)); ip += N256) {
531 | __m256i u1 = _mm256_loadu_si256((__m256i*)(ip+UZ)), v1 = _mm256_loadu_si256((__m256i*)(ip+UZ+32));
532 | c[0][_mm256_extract_epi8(u0, 0)]++;
533 | c[1][_mm256_extract_epi8(v0, 0)]++;
534 | c[2][_mm256_extract_epi8(u0, 1)]++;
535 | c[3][_mm256_extract_epi8(v0, 1)]++;
536 | c[0][_mm256_extract_epi8(u0, 2)]++;
537 | c[1][_mm256_extract_epi8(v0, 2)]++;
538 | c[2][_mm256_extract_epi8(u0, 3)]++;
539 | c[3][_mm256_extract_epi8(v0, 3)]++;
540 | c[0][_mm256_extract_epi8(u0, 4)]++;
541 | c[1][_mm256_extract_epi8(v0, 4)]++;
542 | c[2][_mm256_extract_epi8(u0, 5)]++;
543 | c[3][_mm256_extract_epi8(v0, 5)]++;
544 | c[0][_mm256_extract_epi8(u0, 6)]++;
545 | c[1][_mm256_extract_epi8(v0, 6)]++;
546 | c[2][_mm256_extract_epi8(u0, 7)]++;
547 | c[3][_mm256_extract_epi8(v0, 7)]++;
548 | c[0][_mm256_extract_epi8(u0, 8)]++;
549 | c[1][_mm256_extract_epi8(v0, 8)]++;
550 | c[2][_mm256_extract_epi8(u0, 9)]++;
551 | c[3][_mm256_extract_epi8(v0, 9)]++;
552 | c[0][_mm256_extract_epi8(u0, 10)]++;
553 | c[1][_mm256_extract_epi8(v0, 10)]++;
554 | c[2][_mm256_extract_epi8(u0, 11)]++;
555 | c[3][_mm256_extract_epi8(v0, 11)]++;
556 | c[0][_mm256_extract_epi8(u0, 12)]++;
557 | c[1][_mm256_extract_epi8(v0, 12)]++;
558 | c[2][_mm256_extract_epi8(u0, 13)]++;
559 | c[3][_mm256_extract_epi8(v0, 13)]++;
560 | c[0][_mm256_extract_epi8(u0, 14)]++;
561 | c[1][_mm256_extract_epi8(v0, 14)]++;
562 | c[2][_mm256_extract_epi8(u0, 15)]++;
563 | c[3][_mm256_extract_epi8(v0, 15)]++;
564 | c[0][_mm256_extract_epi8(u0, 16)]++;
565 | c[1][_mm256_extract_epi8(v0, 16)]++;
566 | c[2][_mm256_extract_epi8(u0, 17)]++;
567 | c[3][_mm256_extract_epi8(v0, 17)]++;
568 | c[0][_mm256_extract_epi8(u0, 18)]++;
569 | c[1][_mm256_extract_epi8(v0, 18)]++;
570 | c[2][_mm256_extract_epi8(u0, 19)]++;
571 | c[3][_mm256_extract_epi8(v0, 19)]++;
572 | c[0][_mm256_extract_epi8(u0, 20)]++;
573 | c[1][_mm256_extract_epi8(v0, 20)]++;
574 | c[2][_mm256_extract_epi8(u0, 21)]++;
575 | c[3][_mm256_extract_epi8(v0, 21)]++;
576 | c[0][_mm256_extract_epi8(u0, 22)]++;
577 | c[1][_mm256_extract_epi8(v0, 22)]++;
578 | c[2][_mm256_extract_epi8(u0, 23)]++;
579 | c[3][_mm256_extract_epi8(v0, 23)]++;
580 | c[0][_mm256_extract_epi8(u0, 24)]++;
581 | c[1][_mm256_extract_epi8(v0, 24)]++;
582 | c[2][_mm256_extract_epi8(u0, 25)]++;
583 | c[3][_mm256_extract_epi8(v0, 25)]++;
584 | c[0][_mm256_extract_epi8(u0, 26)]++;
585 | c[1][_mm256_extract_epi8(v0, 26)]++;
586 | c[2][_mm256_extract_epi8(u0, 27)]++;
587 | c[3][_mm256_extract_epi8(v0, 27)]++;
588 | c[0][_mm256_extract_epi8(u0, 28)]++;
589 | c[1][_mm256_extract_epi8(v0, 28)]++;
590 | c[2][_mm256_extract_epi8(u0, 29)]++;
591 | c[3][_mm256_extract_epi8(v0, 29)]++;
592 | c[0][_mm256_extract_epi8(u0, 30)]++;
593 | c[1][_mm256_extract_epi8(v0, 30)]++;
594 | c[2][_mm256_extract_epi8(u0, 31)]++;
595 | c[3][_mm256_extract_epi8(v0, 31)]++;
596 |
597 | u0 = _mm256_loadu_si256((__m256i*)(ip+UZ+64)); v0 = _mm256_loadu_si256((__m256i*)(ip+UZ+96));
598 | c[0][_mm256_extract_epi8(u1, 0)]++;
599 | c[1][_mm256_extract_epi8(v1, 0)]++;
600 | c[2][_mm256_extract_epi8(u1, 1)]++;
601 | c[3][_mm256_extract_epi8(v1, 1)]++;
602 | c[0][_mm256_extract_epi8(u1, 2)]++;
603 | c[1][_mm256_extract_epi8(v1, 2)]++;
604 | c[2][_mm256_extract_epi8(u1, 3)]++;
605 | c[3][_mm256_extract_epi8(v1, 3)]++;
606 | c[0][_mm256_extract_epi8(u1, 4)]++;
607 | c[1][_mm256_extract_epi8(v1, 4)]++;
608 | c[2][_mm256_extract_epi8(u1, 5)]++;
609 | c[3][_mm256_extract_epi8(v1, 5)]++;
610 | c[0][_mm256_extract_epi8(u1, 6)]++;
611 | c[1][_mm256_extract_epi8(v1, 6)]++;
612 | c[2][_mm256_extract_epi8(u1, 7)]++;
613 | c[3][_mm256_extract_epi8(v1, 7)]++;
614 | c[0][_mm256_extract_epi8(u1, 8)]++;
615 | c[1][_mm256_extract_epi8(v1, 8)]++;
616 | c[2][_mm256_extract_epi8(u1, 9)]++;
617 | c[3][_mm256_extract_epi8(v1, 9)]++;
618 | c[0][_mm256_extract_epi8(u1, 10)]++;
619 | c[1][_mm256_extract_epi8(v1, 10)]++;
620 | c[2][_mm256_extract_epi8(u1, 11)]++;
621 | c[3][_mm256_extract_epi8(v1, 11)]++;
622 | c[0][_mm256_extract_epi8(u1, 12)]++;
623 | c[1][_mm256_extract_epi8(v1, 12)]++;
624 | c[2][_mm256_extract_epi8(u1, 13)]++;
625 | c[3][_mm256_extract_epi8(v1, 13)]++;
626 | c[0][_mm256_extract_epi8(u1, 14)]++;
627 | c[1][_mm256_extract_epi8(v1, 14)]++;
628 | c[2][_mm256_extract_epi8(u1, 15)]++;
629 | c[3][_mm256_extract_epi8(v1, 15)]++;
630 | c[0][_mm256_extract_epi8(u1, 16)]++;
631 | c[1][_mm256_extract_epi8(v1, 16)]++;
632 | c[2][_mm256_extract_epi8(u1, 17)]++;
633 | c[3][_mm256_extract_epi8(v1, 17)]++;
634 | c[0][_mm256_extract_epi8(u1, 18)]++;
635 | c[1][_mm256_extract_epi8(v1, 18)]++;
636 | c[2][_mm256_extract_epi8(u1, 19)]++;
637 | c[3][_mm256_extract_epi8(v1, 19)]++;
638 | c[0][_mm256_extract_epi8(u1, 20)]++;
639 | c[1][_mm256_extract_epi8(v1, 20)]++;
640 | c[2][_mm256_extract_epi8(u1, 21)]++;
641 | c[3][_mm256_extract_epi8(v1, 21)]++;
642 | c[0][_mm256_extract_epi8(u1, 22)]++;
643 | c[1][_mm256_extract_epi8(v1, 22)]++;
644 | c[2][_mm256_extract_epi8(u1, 23)]++;
645 | c[3][_mm256_extract_epi8(v1, 23)]++;
646 | c[0][_mm256_extract_epi8(u1, 24)]++;
647 | c[1][_mm256_extract_epi8(v1, 24)]++;
648 | c[2][_mm256_extract_epi8(u1, 25)]++;
649 | c[3][_mm256_extract_epi8(v1, 25)]++;
650 | c[0][_mm256_extract_epi8(u1, 26)]++;
651 | c[1][_mm256_extract_epi8(v1, 26)]++;
652 | c[2][_mm256_extract_epi8(u1, 27)]++;
653 | c[3][_mm256_extract_epi8(v1, 27)]++;
654 | c[0][_mm256_extract_epi8(u1, 28)]++;
655 | c[1][_mm256_extract_epi8(v1, 28)]++;
656 | c[2][_mm256_extract_epi8(u1, 29)]++;
657 | c[3][_mm256_extract_epi8(v1, 29)]++;
658 | c[0][_mm256_extract_epi8(u1, 30)]++;
659 | c[1][_mm256_extract_epi8(v1, 30)]++;
660 | c[2][_mm256_extract_epi8(u1, 31)]++;
661 | c[3][_mm256_extract_epi8(v1, 31)]++; PREFETCH(ip+512, 0);
662 | }
663 | }
664 | while(ip < in+inlen) c[0][*ip++]++;
665 | HISTEND4(c, cnt);
666 | }
667 |
668 | #define UZ 64
669 | #define N256 128
670 | static void hist_8_256(unsigned char *__restrict in, unsigned inlen, unsigned *__restrict cnt) {
671 | cnt_t c[8][CSIZE]={0},i;
672 |
673 | unsigned char *ip = in;
674 | if(inlen >= UZ+N256) {
675 | __m256i u0 = _mm256_loadu_si256((__m256i*)ip), v0 = _mm256_loadu_si256((__m256i*)(ip+32));
676 | for(; ip <= in+((inlen-(UZ+N256))&~(N256-1)); ip += N256) {
677 | __m256i u1 = _mm256_loadu_si256((__m256i*)(ip+UZ+0)), v1 = _mm256_loadu_si256((__m256i*)(ip+UZ+32));
678 | c[0][_mm256_extract_epi8(u0, 0)]++;
679 | c[1][_mm256_extract_epi8(v0, 0)]++;
680 | c[2][_mm256_extract_epi8(u0, 1)]++;
681 | c[3][_mm256_extract_epi8(v0, 1)]++;
682 | c[4][_mm256_extract_epi8(u0, 2)]++;
683 | c[5][_mm256_extract_epi8(v0, 2)]++;
684 | c[6][_mm256_extract_epi8(u0, 3)]++;
685 | c[7][_mm256_extract_epi8(v0, 3)]++;
686 | c[0][_mm256_extract_epi8(u0, 4)]++;
687 | c[1][_mm256_extract_epi8(v0, 4)]++;
688 | c[2][_mm256_extract_epi8(u0, 5)]++;
689 | c[3][_mm256_extract_epi8(v0, 5)]++;
690 | c[4][_mm256_extract_epi8(u0, 6)]++;
691 | c[5][_mm256_extract_epi8(v0, 6)]++;
692 | c[6][_mm256_extract_epi8(u0, 7)]++;
693 | c[7][_mm256_extract_epi8(v0, 7)]++;
694 | c[0][_mm256_extract_epi8(u0, 8)]++;
695 | c[1][_mm256_extract_epi8(v0, 8)]++;
696 | c[2][_mm256_extract_epi8(u0, 9)]++;
697 | c[3][_mm256_extract_epi8(v0, 9)]++;
698 | c[4][_mm256_extract_epi8(u0, 10)]++;
699 | c[5][_mm256_extract_epi8(v0, 10)]++;
700 | c[6][_mm256_extract_epi8(u0, 11)]++;
701 | c[7][_mm256_extract_epi8(v0, 11)]++;
702 | c[0][_mm256_extract_epi8(u0, 12)]++;
703 | c[1][_mm256_extract_epi8(v0, 12)]++;
704 | c[2][_mm256_extract_epi8(u0, 13)]++;
705 | c[3][_mm256_extract_epi8(v0, 13)]++;
706 | c[4][_mm256_extract_epi8(u0, 14)]++;
707 | c[5][_mm256_extract_epi8(v0, 14)]++;
708 | c[6][_mm256_extract_epi8(u0, 15)]++;
709 | c[7][_mm256_extract_epi8(v0, 15)]++;
710 | c[0][_mm256_extract_epi8(u0, 16)]++;
711 | c[1][_mm256_extract_epi8(v0, 16)]++;
712 | c[2][_mm256_extract_epi8(u0, 17)]++;
713 | c[3][_mm256_extract_epi8(v0, 17)]++;
714 | c[4][_mm256_extract_epi8(u0, 18)]++;
715 | c[5][_mm256_extract_epi8(v0, 18)]++;
716 | c[6][_mm256_extract_epi8(u0, 19)]++;
717 | c[7][_mm256_extract_epi8(v0, 19)]++;
718 | c[0][_mm256_extract_epi8(u0, 20)]++;
719 | c[1][_mm256_extract_epi8(v0, 20)]++;
720 | c[2][_mm256_extract_epi8(u0, 21)]++;
721 | c[3][_mm256_extract_epi8(v0, 21)]++;
722 | c[4][_mm256_extract_epi8(u0, 22)]++;
723 | c[5][_mm256_extract_epi8(v0, 22)]++;
724 | c[6][_mm256_extract_epi8(u0, 23)]++;
725 | c[7][_mm256_extract_epi8(v0, 23)]++;
726 | c[0][_mm256_extract_epi8(u0, 24)]++;
727 | c[1][_mm256_extract_epi8(v0, 24)]++;
728 | c[2][_mm256_extract_epi8(u0, 25)]++;
729 | c[3][_mm256_extract_epi8(v0, 25)]++;
730 | c[4][_mm256_extract_epi8(u0, 26)]++;
731 | c[5][_mm256_extract_epi8(v0, 26)]++;
732 | c[6][_mm256_extract_epi8(u0, 27)]++;
733 | c[7][_mm256_extract_epi8(v0, 27)]++;
734 | c[0][_mm256_extract_epi8(u0, 28)]++;
735 | c[1][_mm256_extract_epi8(v0, 28)]++;
736 | c[2][_mm256_extract_epi8(u0, 29)]++;
737 | c[3][_mm256_extract_epi8(v0, 29)]++;
738 | c[4][_mm256_extract_epi8(u0, 30)]++;
739 | c[5][_mm256_extract_epi8(v0, 30)]++;
740 | c[6][_mm256_extract_epi8(u0, 31)]++;
741 | c[7][_mm256_extract_epi8(v0, 31)]++;
742 |
743 | u0 = _mm256_loadu_si256((__m256i*)(ip+UZ+64)); v0 = _mm256_loadu_si256((__m256i*)(ip+UZ+96));
744 | c[0][_mm256_extract_epi8(u1, 0)]++;
745 | c[1][_mm256_extract_epi8(v1, 0)]++;
746 | c[2][_mm256_extract_epi8(u1, 1)]++;
747 | c[3][_mm256_extract_epi8(v1, 1)]++;
748 | c[4][_mm256_extract_epi8(u1, 2)]++;
749 | c[5][_mm256_extract_epi8(v1, 2)]++;
750 | c[6][_mm256_extract_epi8(u1, 3)]++;
751 | c[7][_mm256_extract_epi8(v1, 3)]++;
752 | c[0][_mm256_extract_epi8(u1, 4)]++;
753 | c[1][_mm256_extract_epi8(v1, 4)]++;
754 | c[2][_mm256_extract_epi8(u1, 5)]++;
755 | c[3][_mm256_extract_epi8(v1, 5)]++;
756 | c[4][_mm256_extract_epi8(u1, 6)]++;
757 | c[5][_mm256_extract_epi8(v1, 6)]++;
758 | c[6][_mm256_extract_epi8(u1, 7)]++;
759 | c[7][_mm256_extract_epi8(v1, 7)]++;
760 | c[0][_mm256_extract_epi8(u1, 8)]++;
761 | c[1][_mm256_extract_epi8(v1, 8)]++;
762 | c[2][_mm256_extract_epi8(u1, 9)]++;
763 | c[3][_mm256_extract_epi8(v1, 9)]++;
764 | c[4][_mm256_extract_epi8(u1, 10)]++;
765 | c[5][_mm256_extract_epi8(v1, 10)]++;
766 | c[6][_mm256_extract_epi8(u1, 11)]++;
767 | c[7][_mm256_extract_epi8(v1, 11)]++;
768 | c[0][_mm256_extract_epi8(u1, 12)]++;
769 | c[1][_mm256_extract_epi8(v1, 12)]++;
770 | c[2][_mm256_extract_epi8(u1, 13)]++;
771 | c[3][_mm256_extract_epi8(v1, 13)]++;
772 | c[4][_mm256_extract_epi8(u1, 14)]++;
773 | c[5][_mm256_extract_epi8(v1, 14)]++;
774 | c[6][_mm256_extract_epi8(u1, 15)]++;
775 | c[7][_mm256_extract_epi8(v1, 15)]++;
776 | c[0][_mm256_extract_epi8(u1, 16)]++;
777 | c[1][_mm256_extract_epi8(v1, 16)]++;
778 | c[2][_mm256_extract_epi8(u1, 17)]++;
779 | c[3][_mm256_extract_epi8(v1, 17)]++;
780 | c[4][_mm256_extract_epi8(u1, 18)]++;
781 | c[5][_mm256_extract_epi8(v1, 18)]++;
782 | c[6][_mm256_extract_epi8(u1, 19)]++;
783 | c[7][_mm256_extract_epi8(v1, 19)]++;
784 | c[0][_mm256_extract_epi8(u1, 20)]++;
785 | c[1][_mm256_extract_epi8(v1, 20)]++;
786 | c[2][_mm256_extract_epi8(u1, 21)]++;
787 | c[3][_mm256_extract_epi8(v1, 21)]++;
788 | c[4][_mm256_extract_epi8(u1, 22)]++;
789 | c[5][_mm256_extract_epi8(v1, 22)]++;
790 | c[6][_mm256_extract_epi8(u1, 23)]++;
791 | c[7][_mm256_extract_epi8(v1, 23)]++;
792 | c[0][_mm256_extract_epi8(u1, 24)]++;
793 | c[1][_mm256_extract_epi8(v1, 24)]++;
794 | c[2][_mm256_extract_epi8(u1, 25)]++;
795 | c[3][_mm256_extract_epi8(v1, 25)]++;
796 | c[4][_mm256_extract_epi8(u1, 26)]++;
797 | c[5][_mm256_extract_epi8(v1, 26)]++;
798 | c[6][_mm256_extract_epi8(u1, 27)]++;
799 | c[7][_mm256_extract_epi8(v1, 27)]++;
800 | c[0][_mm256_extract_epi8(u1, 28)]++;
801 | c[1][_mm256_extract_epi8(v1, 28)]++;
802 | c[2][_mm256_extract_epi8(u1, 29)]++;
803 | c[3][_mm256_extract_epi8(v1, 29)]++;
804 | c[4][_mm256_extract_epi8(u1, 30)]++;
805 | c[5][_mm256_extract_epi8(v1, 30)]++;
806 | c[6][_mm256_extract_epi8(u1, 31)]++;
807 | c[7][_mm256_extract_epi8(v1, 31)]++; PREFETCH(ip+512, 0);
808 | }
809 | }
810 | while(ip < in+inlen) c[0][*ip++]++;
811 | HISTEND8(c, cnt);
812 | }
813 | #endif
814 |
815 | //-------------------------------------------------------------------------
816 | #ifdef _COUNTBENCH
817 | // "count2x64", fastest function in https://github.com/nkurz/countbench
818 | #define CSIZE (256+8)
819 |
820 | #define ASM_SHIFT_RIGHT(reg, bitsToShift) \
821 | __asm volatile ("shr %1, %0": \
822 | "+r" (reg): /* read and written */ \
823 | "i" (bitsToShift) /* constant */ \
824 | )
825 |
826 |
827 | #define ASM_INC_TABLES(src0, src1, byte0, byte1, offset, size, base, scale) \
828 | __asm volatile ("movzbl %b2, %k0\n" /* byte0 = src0 & 0xFF */ \
829 | "movzbl %b3, %k1\n" /* byte1 = src1 & 0xFF */ \
830 | "incl (%c4+0)*%c5(%6, %0, %c7)\n" /* count[i+0][byte0]++ */ \
831 | "incl (%c4+1)*%c5(%6, %1, %c7)\n" /* count[i+1][byte1]++ */ \
832 | "movzbl %h2, %k0\n" /* byte0 = (src0 & 0xFF00) >> 8 */ \
833 | "movzbl %h3, %k1\n" /* byte1 = (src1 & 0xFF00) >> 8 */ \
834 | "incl (%c4+2)*%c5(%6, %0, %c7)\n" /* count[i+2][byte0]++ */ \
835 | "incl (%c4+3)*%c5(%6, %1, %c7)\n": /* count[i+3][byte1]++ */ \
836 | "=&R" (byte0), /* write only (R == non REX) */ \
837 | "=&R" (byte1): /* write only (R == non REX) */ \
838 | "Q" (src0), /* read only (Q == must have rH) */ \
839 | "Q" (src1), /* read only (Q == must have rH) */ \
840 | "i" (offset), /* constant array offset */ \
841 | "i" (size), /* constant array size */ \
842 | "r" (base), /* read only array address */ \
843 | "i" (scale): /* constant [1,2,4,8] */ \
844 | "memory" /* clobbered (forces compiler to compute sum ) */ \
845 | )
846 |
847 | unsigned count2x64(unsigned char *src, unsigned srcSize, unsigned *__restrict cnt)
848 | {
849 | unsigned long long remainder = srcSize;
850 | if (srcSize < 32) goto handle_remainder;
851 |
852 | unsigned c[16][CSIZE];
853 | memset(c, 0, sizeof(c));
854 |
855 | remainder = srcSize % 16;
856 | srcSize -= remainder;
857 | const unsigned char *endSrc = src + srcSize;
858 | unsigned long long next0 = *(unsigned long long *)(src + 0);
859 | unsigned long long next1 = *(unsigned long long *)(src + 8);
860 |
861 | //IACA_START;
862 |
863 | while (src != endSrc)
864 | {
865 | unsigned long long byte0, byte1;
866 | unsigned long long data0 = next0;
867 | unsigned long long data1 = next1;
868 |
869 | src += 16;
870 | next0 = *(unsigned long long *)(src + 0);
871 | next1 = *(unsigned long long *)(src + 8);
872 |
873 | ASM_INC_TABLES(data0, data1, byte0, byte1, 0, CSIZE * 4, c, 4);
874 |
875 | ASM_SHIFT_RIGHT(data0, 16);
876 | ASM_SHIFT_RIGHT(data1, 16);
877 | ASM_INC_TABLES(data0, data1, byte0, byte1, 4, CSIZE * 4, c, 4);
878 |
879 | ASM_SHIFT_RIGHT(data0, 16);
880 | ASM_SHIFT_RIGHT(data1, 16);
881 | ASM_INC_TABLES(data0, data1, byte0, byte1, 8, CSIZE * 4, c, 4);
882 |
883 | ASM_SHIFT_RIGHT(data0, 16);
884 | ASM_SHIFT_RIGHT(data1, 16);
885 | ASM_INC_TABLES(data0, data1, byte0, byte1, 12, CSIZE * 4, c, 4);
886 | }
887 |
888 | //IACA_END;
889 |
890 | handle_remainder:
891 | for (size_t i = 0; i < remainder; i++) {
892 | unsigned long long byte = src[i];
893 | c[0][byte]++;
894 | }
895 | memset(cnt, 0, 256*sizeof(cnt[0]));
896 | for(int i = 0; i < 256; i++)
897 | for (int idx=0; idx < 16; idx++)
898 | cnt[i] += c[idx][i];
899 | }
900 |
901 | // Modified version of count2x64 by powturbo, using C instead of assembler
902 | #define C_SHIFT_RIGHT(reg, bitsToShift) reg >>= bitsToShift
903 | #define C_INC_TABLES(src0, src1, byte0, byte1, offset, size, c, scale) \
904 | { \
905 | byte0 = (unsigned char)src0;\
906 | byte1 = (unsigned char)src1;\
907 | c[offset+0][byte0]++;\
908 | c[offset+1][byte1]++;\
909 | byte0 = (unsigned char)(src0 >> 8);\
910 | byte1 = (unsigned char)(src1 >> 8);\
911 | c[offset+2][byte0]++; \
912 | c[offset+3][byte1]++; \
913 | }
914 |
915 | static void count2x64c(unsigned char *__restrict src, unsigned srcSize, unsigned *__restrict cnt)
916 | {
917 | unsigned long long remainder = srcSize;
918 | if (srcSize < 32) goto handle_remainder;
919 |
920 | unsigned c[16][CSIZE];
921 | memset(c, 0, sizeof(c));
922 |
923 | remainder = srcSize % 16;
924 | srcSize -= remainder;
925 | const unsigned char *endSrc = src + srcSize;
926 | unsigned long long next0 = *(unsigned long long *)(src + 0);
927 | unsigned long long next1 = *(unsigned long long *)(src + 8);
928 |
929 | //IACA_START;
930 |
931 | while (src != endSrc)
932 | {
933 | unsigned long long byte0, byte1;
934 | unsigned long long data0 = next0;
935 | unsigned long long data1 = next1;
936 |
937 | src += 16;
938 | next0 = *(unsigned long long *)(src + 0);
939 | next1 = *(unsigned long long *)(src + 8);
940 |
941 | C_INC_TABLES(data0, data1, byte0, byte1, 0, CSIZE * 4, c, 4);
942 |
943 | C_SHIFT_RIGHT(data0, 16);
944 | C_SHIFT_RIGHT(data1, 16);
945 | C_INC_TABLES(data0, data1, byte0, byte1, 4, CSIZE * 4, c, 4);
946 |
947 | C_SHIFT_RIGHT(data0, 16);
948 | C_SHIFT_RIGHT(data1, 16);
949 | C_INC_TABLES(data0, data1, byte0, byte1, 8, CSIZE * 4, c, 4);
950 |
951 | C_SHIFT_RIGHT(data0, 16);
952 | C_SHIFT_RIGHT(data1, 16);
953 | C_INC_TABLES(data0, data1, byte0, byte1, 12, CSIZE * 4, c, 4);
954 | }
955 |
956 | //IACA_END;
957 |
958 | handle_remainder:
959 | for (size_t i = 0; i < remainder; i++) {
960 | unsigned long long byte = src[i];
961 | c[0][byte]++;
962 | }
963 | memset(cnt, 0, 256*sizeof(cnt[0]));
964 | for(int i = 0; i < 256; i++)
965 | for(int idx=0; idx < 16; idx++)
966 | cnt[i] += c[idx][i];
967 | }
968 | #endif
969 |
--------------------------------------------------------------------------------