├── .gitignore ├── LICENSE ├── Makefile ├── README.rst ├── aarch64-strstr-v2.cpp ├── avx2-naive-strstr.cpp ├── avx2-naive-strstr64.cpp ├── avx2-naive-unrolled-strstr.cpp ├── avx2-strstr-v2-clang-specific.cpp ├── avx2-strstr-v2.cpp ├── avx2-strstr.cpp ├── avx512bw-strstr-v2.cpp ├── avx512bw-strstr-v3.cpp ├── avx512f-strstr-v2.cpp ├── avx512f-strstr.cpp ├── common.h ├── data └── placeholder ├── fixed-memcmp.cpp ├── make_words.sh ├── neon-strstr-v2.cpp ├── original ├── sse4_strstr-test.py └── sse4_strstr.c ├── results ├── armv7-32bit-gcc4.9.2.txt ├── armv8-64bit-clang3.8.0.txt ├── bulldozer-fx-8510-gcc4.8.4-sse.txt ├── cascadelake-Gold-5217-gcc-7.4.0-avx512bw.txt ├── haswell-i7-4770-gcc5.4.1-avx2.txt ├── knights-landing-7210-gcc5.3.0-avx512f.txt ├── postprocess.py ├── skylake-i7-6700-gcc5.4.1-avx2.txt ├── skylake-i9-7900-gcc-5.4.1-avx512bw.txt └── westmere-m540-gcc6.2.0-sse4.txt ├── scalar.cpp ├── src ├── all.h ├── all_procedures.cpp ├── application_base.cpp ├── benchmark.cpp ├── benchmark.h ├── speedup.cpp ├── unittests.cpp └── validate.cpp ├── sse-naive-strstr.cpp ├── sse2-needle4.cpp ├── sse2-strstr.cpp ├── sse4-strstr-unrolled.cpp ├── sse4-strstr.cpp ├── sse4.2-strstr.cpp ├── swar32-strstr-v2.cpp ├── swar64-strstr-v2.cpp └── utils ├── ansi.cpp ├── avx2.cpp ├── avx512.cpp ├── bits.cpp ├── neon.cpp └── sse.cpp /.gitignore: -------------------------------------------------------------------------------- 1 | speedup_sse4 2 | benchmark_sse4 3 | unittests_sse4 4 | validate_sse4 5 | 6 | speedup_avx2 7 | benchmark_avx2 8 | unittests_avx2 9 | validate_avx2 10 | 11 | speedup_avx512f 12 | benchmark_avx512f 13 | unittests_avx512f 14 | validate_avx512f 15 | 16 | unittests_avx512bw 17 | benchmark_avx512bw 18 | validate_avx512bw 19 | speedup_avx512bw 20 | 21 | speedup_arm 22 | unittests_arm 23 | validate_arm 24 | 25 | speedup_aarch64 26 | unittests_aarch64 27 | validate_aarch64 28 | 29 | data/i386.txt 30 | data/words 31 | 32 | tags 33 | 
-------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2008-2016, Wojciech Muła 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions are 6 | met: 7 | 8 | 1. Redistributions of source code must retain the above copyright 9 | notice, this list of conditions and the following disclaimer. 10 | 11 | 2. Redistributions in binary form must reproduce the above copyright 12 | notice, this list of conditions and the following disclaimer in the 13 | documentation and/or other materials provided with the distribution. 14 | 15 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS 16 | IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 17 | TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A 18 | PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 19 | HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 20 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED 21 | TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 23 | LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 24 | NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: all clean compile_intel 2 | 3 | FLAGS=-std=c++11 -O3 -Wall -Wextra -pedantic -I. 
$(CXXFLAGS) 4 | FLAGS_INTEL=$(FLAGS) -DHAVE_SSE_INSTRUCTIONS 5 | FLAGS_SSE4=$(FLAGS_INTEL) -msse4.2 6 | FLAGS_AVX2=$(FLAGS_INTEL) -mavx2 -DHAVE_AVX2_INSTRUCTIONS 7 | FLAGS_AVX512F=$(FLAGS_INTEL) -mavx512f -DHAVE_AVX2_INSTRUCTIONS -DHAVE_AVX512F_INSTRUCTIONS 8 | FLAGS_AVX512BW=$(FLAGS_INTEL) -mavx512bw -DHAVE_AVX2_INSTRUCTIONS -DHAVE_AVX512F_INSTRUCTIONS -DHAVE_AVX512BW_INSTRUCTIONS 9 | FLAGS_ARM=$(FLAGS) -mfpu=neon -DHAVE_NEON_INSTRUCTIONS 10 | FLAGS_AARCH64=$(FLAGS) -DHAVE_NEON_INSTRUCTIONS -DHAVE_AARCH64_ARCHITECTURE 11 | 12 | DEPS=utils/ansi.cpp utils/bits.cpp common.h fixed-memcmp.cpp 13 | DEPS_SCALAR=swar64-strstr-v2.cpp swar32-strstr-v2.cpp scalar.cpp 14 | DEPS_SSE4=sse4-strstr.cpp sse4-strstr-unrolled.cpp sse4.2-strstr.cpp sse2-strstr.cpp sse-naive-strstr.cpp sse2-needle4.cpp utils/sse.cpp $(DEPS) $(DEPS_SCALAR) 15 | DEPS_AVX2=avx2-*.cpp utils/avx2.cpp $(DEPS_SSE4) 16 | DEPS_AVX512F=avx512f-*.cpp utils/avx512.cpp $(DEPS_AVX2) 17 | DEPS_AVX512BW=avx512bw-*.cpp utils/avx512.cpp $(DEPS_AVX512F) 18 | DEPS_ARM=neon-strstr-v2.cpp $(DEPS) $(DEPS_SCALAR) 19 | DEPS_AARCH64=aarch64-strstr-v2.cpp $(DEPS_ARM) 20 | 21 | ALL_INTEL=\ 22 | validate_sse4 \ 23 | speedup_sse4 \ 24 | benchmark_sse4 \ 25 | unittests_sse4 \ 26 | validate_avx2 \ 27 | speedup_avx2 \ 28 | benchmark_avx2 \ 29 | unittests_avx2 \ 30 | validate_avx512f \ 31 | speedup_avx512f \ 32 | benchmark_avx512f \ 33 | unittests_avx512f \ 34 | speedup_avx512bw \ 35 | benchmark_avx512bw \ 36 | validate_avx512bw \ 37 | unittests_avx512bw \ 38 | 39 | ALL_ARM=\ 40 | validate_arm \ 41 | unittests_arm \ 42 | speedup_arm 43 | 44 | ALL_AARCH64=\ 45 | validate_aarch64 \ 46 | unittests_aarch64 \ 47 | speedup_aarch64 48 | 49 | ALL=$(ALL_INTEL) $(ALL_ARM) $(ALL_AARCH64) 50 | 51 | all: 52 | @echo "select target test_ARCH or run_ARCH" 53 | @echo 54 | @echo "test_ARCH runs unit and validation tests" 55 | @echo "run_ARCH runs performance tests" 56 | @echo 57 | @echo "ARCH might be:" 58 | @echo "* sse4" 59 | @echo "* avx2" 60 | 
@echo "* avx512f" 61 | @echo "* avx512bw" 62 | @echo "* arm" 63 | @echo "* aarch64" 64 | 65 | build_intel: $(ALL_INTEL) 66 | build_arm: $(ALL_ARM) 67 | build_aarch64: $(ALL_AARCH64) 68 | 69 | UNITTESTS_DEPS=src/unittests.cpp src/all_procedures.cpp 70 | VALIDATE_DEPS=src/validate.cpp src/application_base.cpp src/all_procedures.cpp 71 | SPEEDUP_DEPS=src/speedup.cpp src/application_base.cpp src/all_procedures.cpp 72 | BENCHMARK_DEPS=src/benchmark.cpp src/benchmark.h src/application_base.cpp src/all_procedures.cpp 73 | 74 | validate_sse4: $(VALIDATE_DEPS) $(DEPS_SSE4) 75 | $(CXX) $(FLAGS_SSE4) src/validate.cpp -o $@ 76 | 77 | speedup_sse4: $(SPEEDUP_DEPS) $(DEPS_SSE4) 78 | $(CXX) $(FLAGS_SSE4) -DNDEBUG src/speedup.cpp -o $@ 79 | 80 | benchmark_sse4: $(BENCHMARK_DEPS) $(DEPS_SSE4) 81 | $(CXX) $(FLAGS_SSE4) -DNDEBUG src/benchmark.cpp -o $@ 82 | 83 | unittests_sse4: $(UNITTESTS_DEPS) $(DEPS_SSE4) 84 | $(CXX) $(FLAGS_SSE4) src/unittests.cpp -o $@ 85 | 86 | validate_avx2: $(VALIDATE_DEPS) $(DEPS_AVX2) 87 | $(CXX) $(FLAGS_AVX2) src/validate.cpp -o $@ 88 | 89 | speedup_avx2: $(SPEEDUP_DEPS) $(DEPS_AVX2) 90 | $(CXX) $(FLAGS_AVX2) -DNDEBUG src/speedup.cpp -o $@ 91 | # built with FLAGS_AVX2, so it must track the AVX2 sources as prerequisites 92 | benchmark_avx2: $(BENCHMARK_DEPS) $(DEPS_AVX2) 93 | $(CXX) $(FLAGS_AVX2) -DNDEBUG src/benchmark.cpp -o $@ 94 | 95 | unittests_avx2: $(UNITTESTS_DEPS) $(DEPS_AVX2) 96 | $(CXX) $(FLAGS_AVX2) src/unittests.cpp -o $@ 97 | 98 | validate_avx512f: $(VALIDATE_DEPS) $(DEPS_AVX512F) 99 | $(CXX) $(FLAGS_AVX512F) src/validate.cpp -o $@ 100 | # built with FLAGS_AVX512F, so it must track the AVX512F sources as prerequisites 101 | benchmark_avx512f: $(BENCHMARK_DEPS) $(DEPS_AVX512F) 102 | $(CXX) $(FLAGS_AVX512F) -DNDEBUG src/benchmark.cpp -o $@ 103 | 104 | speedup_avx512f: $(SPEEDUP_DEPS) $(DEPS_AVX512F) 105 | $(CXX) $(FLAGS_AVX512F) -DNDEBUG src/speedup.cpp -o $@ 106 | 107 | unittests_avx512f: $(UNITTESTS_DEPS) $(DEPS_AVX512F) 108 | $(CXX) $(FLAGS_AVX512F) src/unittests.cpp -o $@ 109 | 110 | validate_avx512bw: $(VALIDATE_DEPS) $(DEPS_AVX512BW) 111 | $(CXX) $(FLAGS_AVX512BW) src/validate.cpp -o $@ 112 | 113 | 
speedup_avx512bw: $(SPEEDUP_DEPS) $(DEPS_AVX512BW) 114 | $(CXX) $(FLAGS_AVX512BW) -DNDEBUG src/speedup.cpp -o $@ 115 | # built with FLAGS_AVX512BW, so it must track the AVX512BW sources as prerequisites 116 | benchmark_avx512bw: $(BENCHMARK_DEPS) $(DEPS_AVX512BW) 117 | $(CXX) $(FLAGS_AVX512BW) -DNDEBUG src/benchmark.cpp -o $@ 118 | 119 | unittests_avx512bw: $(UNITTESTS_DEPS) $(DEPS_AVX512BW) 120 | $(CXX) $(FLAGS_AVX512BW) src/unittests.cpp -o $@ 121 | 122 | validate_arm: $(VALIDATE_DEPS) $(DEPS_ARM) 123 | $(CXX) $(FLAGS_ARM) src/validate.cpp -o $@ 124 | 125 | speedup_arm: $(SPEEDUP_DEPS) $(DEPS_ARM) 126 | $(CXX) $(FLAGS_ARM) -DNDEBUG src/speedup.cpp -o $@ 127 | 128 | unittests_arm: $(UNITTESTS_DEPS) $(DEPS_ARM) 129 | $(CXX) $(FLAGS_ARM) src/unittests.cpp -o $@ 130 | 131 | validate_aarch64: $(VALIDATE_DEPS) $(DEPS_AARCH64) 132 | $(CXX) $(FLAGS_AARCH64) src/validate.cpp -o $@ 133 | 134 | speedup_aarch64: $(SPEEDUP_DEPS) $(DEPS_AARCH64) 135 | $(CXX) $(FLAGS_AARCH64) -DNDEBUG src/speedup.cpp -o $@ 136 | # built with FLAGS_AARCH64, so it must track aarch64-strstr-v2.cpp like the other aarch64 targets 137 | unittests_aarch64: $(UNITTESTS_DEPS) $(DEPS_AARCH64) 138 | $(CXX) $(FLAGS_AARCH64) src/unittests.cpp -o $@ 139 | 140 | data/i386.txt: 141 | wget http://css.csail.mit.edu/6.858/2013/readings/i386.txt 142 | mv i386.txt data/i386.txt 143 | 144 | data/words: data/i386.txt 145 | sh make_words.sh $^ $@ 146 | 147 | test_sse4: unittests_sse4 validate_sse4 data/words data/i386.txt 148 | ./unittests_sse4 149 | ./validate_sse4 data/i386.txt data/words 150 | 151 | run_sse4: speedup_sse4 data/words data/i386.txt 152 | ./speedup_sse4 data/i386.txt data/words 153 | 154 | test_avx2: unittests_avx2 validate_avx2 data/words data/i386.txt 155 | ./unittests_avx2 156 | ./validate_avx2 data/i386.txt data/words 157 | 158 | run_avx2: speedup_avx2 data/words data/i386.txt 159 | ./speedup_avx2 data/i386.txt data/words 160 | 161 | test_avx512f: unittests_avx512f validate_avx512f data/words data/i386.txt 162 | ./unittests_avx512f 163 | ./validate_avx512f data/i386.txt data/words 164 | 165 | run_avx512f: speedup_avx512f data/words data/i386.txt 166 | ./speedup_avx512f data/i386.txt 
data/words 167 | 168 | run_avx512bw: speedup_avx512bw data/words data/i386.txt 169 | ./speedup_avx512bw data/i386.txt data/words 170 | 171 | test_avx512bw: unittests_avx512bw validate_avx512bw data/words data/i386.txt 172 | ./unittests_avx512bw 173 | ./validate_avx512bw data/i386.txt data/words 174 | 175 | test_arm: unittests_arm validate_arm data/words data/i386.txt 176 | ./unittests_arm 177 | ./validate_arm data/i386.txt data/words 178 | 179 | run_arm: speedup_arm data/words data/i386.txt 180 | # my Raspberry Pi is slow, repeat count = 1 is enough 181 | ./$< data/i386.txt data/words 1 182 | 183 | test_aarch64: unittests_aarch64 validate_aarch64 data/words data/i386.txt 184 | ./unittests_aarch64 185 | ./validate_aarch64 data/i386.txt data/words 186 | 187 | run_aarch64: speedup_aarch64 data/words data/i386.txt 188 | ./$< data/i386.txt data/words 1 189 | 190 | compile_intel: $(ALL_INTEL) 191 | 192 | clean: 193 | rm -f $(ALL) 194 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | ================================================================================ 2 | SIMD-friendly algorithms for substring searching 3 | ================================================================================ 4 | 5 | Sample programs for article "SIMD-friendly algorithms for substring searching" 6 | (http://0x80.pl/articles/simd-strfind.html). 7 | 8 | The **root directory** contains C++11 procedures implemented using intrinsics 9 | for SSE, SSE4, AVX2, AVX512F, AVX512BW and ARM Neon (both ARMv7 and ARMv8). 10 | 11 | The subdirectory **original** contains 32-bit programs with inline assembly, 12 | written in 2008 for another article__. 
13 | 14 | __ http://0x80.pl/articles/sse4_substring_locate.html 15 | 16 | 17 | Usage 18 | ------------------------------------------------------------------------ 19 | 20 | To run unit and validation tests type ``make test_ARCH``, to run 21 | performance tests type ``make run_ARCH``. Value ``ARCH`` selects 22 | the CPU architecture: 23 | 24 | * sse4, 25 | * avx2, 26 | * avx512f, 27 | * avx512bw, 28 | * arm, 29 | * aarch64. 30 | 31 | 32 | Performance results 33 | ------------------------------------------------------------------------ 34 | 35 | The subdirectory ``results`` contains raw timings from various computers. 36 | -------------------------------------------------------------------------------- /aarch64-strstr-v2.cpp: -------------------------------------------------------------------------------- 1 | size_t FORCE_INLINE aarch64_strstr_anysize(const char* s, size_t n, const char* needle, size_t k) { 2 | 3 | assert(k > 0); 4 | assert(n > 0); 5 | 6 | const uint8x16_t first = vdupq_n_u8(needle[0]); 7 | const uint8x16_t last = vdupq_n_u8(needle[k - 1]); 8 | 9 | const uint8_t* ptr = reinterpret_cast(s); 10 | 11 | for (size_t i = 0; i < n; i += 16) { 12 | 13 | const uint8x16_t block_first = vld1q_u8(ptr + i); 14 | const uint8x16_t block_last = vld1q_u8(ptr + i + k - 1); 15 | 16 | const uint8x16_t eq_first = vceqq_u8(first, block_first); 17 | const uint8x16_t eq_last = vceqq_u8(last, block_last); 18 | const uint8x16_t pred_16 = vandq_u8(eq_first, eq_last); 19 | 20 | uint64_t mask; 21 | 22 | mask = vgetq_lane_u64(vreinterpretq_u64_u8(pred_16), 0); 23 | if (mask) { 24 | for (int j=0; j < 8; j++) { 25 | if ((mask & 0xff) && (memcmp(s + i + j + 1, needle + 1, k - 2) == 0)) { 26 | return i + j; 27 | } 28 | 29 | mask >>= 8; 30 | } 31 | } 32 | 33 | mask = vgetq_lane_u64(vreinterpretq_u64_u8(pred_16), 1); 34 | if (mask) { 35 | for (int j=0; j < 8; j++) { 36 | if ((mask & 0xff) && (memcmp(s + i + j + 8 + 1, needle + 1, k - 2) == 0)) { 37 | return i + j + 8; 38 | } 39 | 40 | 
mask >>= 8; 41 | } 42 | } 43 | } 44 | 45 | return std::string::npos; 46 | } 47 | 48 | // ------------------------------------------------------------------------ 49 | 50 | template 51 | size_t FORCE_INLINE aarch64_strstr_memcmp(const char* s, size_t n, const char* needle, MEMCMP memcmp_fun) { 52 | 53 | assert(k > 0); 54 | assert(n > 0); 55 | 56 | const uint8x16_t first = vdupq_n_u8(needle[0]); 57 | const uint8x16_t last = vdupq_n_u8(needle[k - 1]); 58 | 59 | const uint8_t* ptr = reinterpret_cast(s); 60 | 61 | for (size_t i = 0; i < n; i += 16) { 62 | 63 | const uint8x16_t block_first = vld1q_u8(ptr + i); 64 | const uint8x16_t block_last = vld1q_u8(ptr + i + k - 1); 65 | 66 | const uint8x16_t eq_first = vceqq_u8(first, block_first); 67 | const uint8x16_t eq_last = vceqq_u8(last, block_last); 68 | const uint8x16_t pred_16 = vandq_u8(eq_first, eq_last); 69 | 70 | uint64_t mask; 71 | int j; 72 | 73 | mask = vgetq_lane_u64(vreinterpretq_u64_u8(pred_16), 0); 74 | j = 0; 75 | while (mask) { 76 | if ((mask & 0xff) && (memcmp_fun(s + i + j + 1, needle + 1))) { 77 | return i + j; 78 | } 79 | 80 | mask >>= 8; 81 | j += 1; 82 | } 83 | 84 | mask = vgetq_lane_u64(vreinterpretq_u64_u8(pred_16), 1); 85 | j = 0; 86 | while (mask) { 87 | if ((mask & 0xff) && (memcmp_fun(s + i + j + 8 + 1, needle + 1))) { 88 | return i + j + 8; 89 | } 90 | 91 | mask >>= 8; 92 | j += 1; 93 | } 94 | } 95 | 96 | return std::string::npos; 97 | } 98 | 99 | // ------------------------------------------------------------------------ 100 | // Returns the offset of the first occurrence of needle (k bytes) in s (n bytes), or std::string::npos; dispatches on k. 101 | size_t aarch64_strstr_v2(const char* s, size_t n, const char* needle, size_t k) { 102 | 103 | size_t result = std::string::npos; 104 | 105 | if (n < k) { 106 | return result; 107 | } 108 | 109 | switch (k) { 110 | case 0: 111 | return 0; 112 | 113 | case 1: { 114 | const char* res = static_cast<const char*>(memchr(s, needle[0], n)); // bounded by n: s need not be NUL-terminated 115 | 116 | return (res != nullptr) ? 
res - s : std::string::npos; 117 | } 118 | 119 | case 2: 120 | result = aarch64_strstr_memcmp<2>(s, n, needle, always_true); 121 | break; 122 | 123 | case 3: 124 | result = aarch64_strstr_memcmp<3>(s, n, needle, memcmp1); 125 | break; 126 | 127 | case 4: 128 | result = aarch64_strstr_memcmp<4>(s, n, needle, memcmp2); 129 | break; 130 | 131 | case 5: 132 | result = aarch64_strstr_memcmp<5>(s, n, needle, memcmp4); 133 | break; 134 | 135 | case 6: 136 | result = aarch64_strstr_memcmp<6>(s, n, needle, memcmp4); 137 | break; 138 | 139 | case 7: 140 | result = aarch64_strstr_memcmp<7>(s, n, needle, memcmp5); 141 | break; 142 | 143 | case 8: 144 | result = aarch64_strstr_memcmp<8>(s, n, needle, memcmp6); 145 | break; 146 | 147 | case 9: 148 | result = aarch64_strstr_memcmp<9>(s, n, needle, memcmp8); 149 | break; 150 | 151 | case 10: 152 | result = aarch64_strstr_memcmp<10>(s, n, needle, memcmp8); 153 | break; 154 | 155 | case 11: 156 | result = aarch64_strstr_memcmp<11>(s, n, needle, memcmp9); 157 | break; 158 | 159 | case 12: 160 | result = aarch64_strstr_memcmp<12>(s, n, needle, memcmp10); 161 | break; 162 | 163 | default: 164 | result = aarch64_strstr_anysize(s, n, needle, k); 165 | break; 166 | } 167 | 168 | if (result <= n - k) { 169 | return result; 170 | } else { 171 | return std::string::npos; 172 | } 173 | } 174 | 175 | // ------------------------------------------------------------------------ 176 | 177 | size_t aarch64_strstr_v2(const std::string& s, const std::string& needle) { 178 | 179 | return aarch64_strstr_v2(s.data(), s.size(), needle.data(), needle.size()); 180 | } 181 | 182 | 183 | -------------------------------------------------------------------------------- /avx2-naive-strstr.cpp: -------------------------------------------------------------------------------- 1 | // Method described in https://arxiv.org/pdf/1612.01506.pdf 2 | // 3 | // Implementation by Daniel Lemire 4 | // 
https://github.com/lemire/Code-used-on-Daniel-Lemire-s-blog/blob/master/extra/simd/substring/substring.c 5 | 6 | size_t FORCE_INLINE avx2_naive_strstr_anysize(const char* s, size_t n, const char* needle, size_t k) { 7 | 8 | assert(k > 0); 9 | assert(n > 0); 10 | 11 | if (n == k) { 12 | return (memcmp(s, needle, k) == 0) ? 0 : std::string::npos; 13 | } 14 | 15 | for (size_t i = 0; i < n - k + 1; i += 32) { 16 | uint32_t found = 0xffffffff; 17 | for (size_t j = 0; (j < k) && (found != 0) ; ++j) { 18 | const __m256i textvector = _mm256_loadu_si256((const __m256i *)(s + i + j)); 19 | const __m256i needlevector = _mm256_set1_epi8(needle[j]); 20 | uint32_t bitmask = _mm256_movemask_epi8(_mm256_cmpeq_epi8(textvector, needlevector)); 21 | found = found & bitmask; 22 | } 23 | if (found != 0) { 24 | return i + __builtin_ctz(found); 25 | } 26 | } 27 | 28 | return std::string::npos; 29 | } 30 | 31 | 32 | // ------------------------------------------------------------------------ 33 | 34 | size_t avx2_naive_strstr(const char* s, size_t n, const char* needle, size_t k) { 35 | 36 | size_t result = std::string::npos; 37 | 38 | if (n < k) { 39 | return result; 40 | } 41 | 42 | result = avx2_naive_strstr_anysize(s, n, needle, k); 43 | 44 | if (result <= n - k) { 45 | return result; 46 | } else { 47 | return std::string::npos; 48 | } 49 | } 50 | 51 | // ------------------------------------------------------------------------ 52 | 53 | size_t avx2_naive_strstr(const std::string& s, const std::string& needle) { 54 | 55 | return avx2_naive_strstr(s.data(), s.size(), needle.data(), needle.size()); 56 | } 57 | 58 | 59 | -------------------------------------------------------------------------------- /avx2-naive-strstr64.cpp: -------------------------------------------------------------------------------- 1 | // Method described in https://arxiv.org/pdf/1612.01506.pdf 2 | // 3 | // Implementation by Daniel Lemire 4 | // https://github.com/WojciechMula/sse4-strstr/issues/2 5 | // Tests first and last needle bytes at 64 positions per step, then verifies the bytes in between with memcmp. 6 | size_t 
FORCE_INLINE avx2_naive_strstr_anysize64(const char* s, size_t n, const char* needle, size_t k) { 7 | 8 | assert(k > 0); 9 | assert(n > 0); 10 | const __m256i first = _mm256_set1_epi8(needle[0]); 11 | const __m256i last = _mm256_set1_epi8(needle[k - 1]); 12 | for (size_t i = 0; i < n; i += 64) { 13 | 14 | const __m256i block_first1 = _mm256_loadu_si256((const __m256i*)(s + i)); 15 | const __m256i block_last1 = _mm256_loadu_si256((const __m256i*)(s + i + k - 1)); 16 | 17 | const __m256i block_first2 = _mm256_loadu_si256((const __m256i*)(s + i + 32)); 18 | const __m256i block_last2 = _mm256_loadu_si256((const __m256i*)(s + i + k - 1 + 32)); 19 | 20 | const __m256i eq_first1 = _mm256_cmpeq_epi8(first, block_first1); 21 | const __m256i eq_last1 = _mm256_cmpeq_epi8(last, block_last1); 22 | 23 | const __m256i eq_first2 = _mm256_cmpeq_epi8(first, block_first2); 24 | const __m256i eq_last2 = _mm256_cmpeq_epi8(last, block_last2); 25 | 26 | const uint32_t mask1 = _mm256_movemask_epi8(_mm256_and_si256(eq_first1, eq_last1)); 27 | const uint32_t mask2 = _mm256_movemask_epi8(_mm256_and_si256(eq_first2, eq_last2)); 28 | uint64_t mask = mask1 | ((uint64_t)mask2 << 32); 29 | 30 | while (mask != 0) { 31 | const int bitpos = __builtin_ctzll(mask); // for k <= 2 the first/last test already covers the whole needle; skipping memcmp avoids a k - 2 underflow when k == 1 32 | if (k <= 2 || memcmp(s + i + bitpos + 1, needle + 1, k - 2) == 0) { 33 | return i + bitpos; 34 | } 35 | mask = bits::clear_leftmost_set(mask); 36 | } 37 | } 38 | 39 | return std::string::npos; 40 | } 41 | 42 | 43 | // ------------------------------------------------------------------------ 44 | // Returns the offset of the first occurrence of needle (k bytes) in s (n bytes), or std::string::npos. 45 | size_t avx2_naive_strstr64(const char* s, size_t n, const char* needle, size_t k) { 46 | 47 | size_t result = std::string::npos; 48 | 49 | if (n < k) { 50 | return result; 51 | } 52 | if (k == 0) { return 0; } // empty needle matches at offset 0; also keeps needle[k - 1] in bounds 53 | result = avx2_naive_strstr_anysize64(s, n, needle, k); 54 | 55 | if (result <= n - k) { 56 | return result; 57 | } else { 58 | return std::string::npos; 59 | } 60 | } 61 | 62 | // ------------------------------------------------------------------------ 63 | 64 | 
size_t avx2_naive_strstr64(const std::string& s, const std::string& needle) { 65 | 66 | return avx2_naive_strstr64(s.data(), s.size(), needle.data(), needle.size()); 67 | } 68 | 69 | 70 | -------------------------------------------------------------------------------- /avx2-naive-unrolled-strstr.cpp: -------------------------------------------------------------------------------- 1 | // Method described in https://arxiv.org/pdf/1612.01506.pdf 2 | // 3 | // Implementation by Daniel Lemire 4 | // Returns the offset of the first candidate match or npos; the caller rejects offsets greater than n - k. 5 | size_t FORCE_INLINE avx2_naive_strstr_unrolled_anysize(const char* s, size_t n, const char* needle, size_t k) { 6 | 7 | // assert(n % 32 == 0); // deliberately commented out 8 | // note: the 32-byte loads may read past s + n; every start offset in 9 | // [0, n - k] is examined, so matches at the end of the data are found. 10 | for (size_t i = 0; i < n - k + 1; i += 32) { 11 | uint32_t found = 0xFFFFFFFF; // 32 1-bits 12 | size_t j = 0; 13 | for (; (j + 3 < k) && (found != 0) ; j += 4) { 14 | __m256i textvector1 = _mm256_loadu_si256((const __m256i *)(s + i + j)); 15 | __m256i needlevector1 = _mm256_set1_epi8(needle[j]); 16 | __m256i textvector2 = _mm256_loadu_si256((const __m256i *)(s + i + j + 1)); 17 | __m256i needlevector2 = _mm256_set1_epi8(needle[j + 1]); 18 | __m256i cmp1 = _mm256_cmpeq_epi8(textvector1, needlevector1); 19 | __m256i cmp2 = _mm256_cmpeq_epi8(textvector2, needlevector2); 20 | __m256i textvector3 = _mm256_loadu_si256((const __m256i *)(s + i + j + 2)); 21 | __m256i needlevector3 = _mm256_set1_epi8(needle[j + 2]); 22 | __m256i textvector4 = _mm256_loadu_si256((const __m256i *)(s + i + j + 3)); 23 | __m256i needlevector4 = _mm256_set1_epi8(needle[j + 3]); 24 | __m256i cmp3 = _mm256_cmpeq_epi8(textvector3, needlevector3); 25 | __m256i cmp4 = _mm256_cmpeq_epi8(textvector4, needlevector4); 26 | __m256i cmp12 = _mm256_and_si256(cmp1,cmp2); 27 | __m256i cmp34 = _mm256_and_si256(cmp3,cmp4); 28 | uint32_t bitmask = _mm256_movemask_epi8(_mm256_and_si256(cmp12,cmp34)); 29 | found = found & bitmask; 30 | } 
31 | for (; (j < k) && (found != 0) ; ++j) { 32 | __m256i textvector = _mm256_loadu_si256((const __m256i *)(s + i + j)); 33 | __m256i needlevector = _mm256_set1_epi8(needle[j]); 34 | uint32_t bitmask = _mm256_movemask_epi8(_mm256_cmpeq_epi8(textvector, needlevector)); 35 | found = found & bitmask; 36 | } 37 | if(found != 0) { 38 | // got a match... maybe 39 | return i + __builtin_ctz(found); 40 | } 41 | } 42 | 43 | return std::string::npos; 44 | } 45 | 46 | 47 | // ------------------------------------------------------------------------ 48 | 49 | size_t avx2_naive_unrolled_strstr(const char* s, size_t n, const char* needle, size_t k) { 50 | 51 | size_t result = std::string::npos; 52 | 53 | if (n < k) { 54 | return result; 55 | } 56 | 57 | result = avx2_naive_strstr_unrolled_anysize(s, n, needle, k); 58 | 59 | if (result <= n - k) { 60 | return result; 61 | } else { 62 | return std::string::npos; 63 | } 64 | } 65 | 66 | // ------------------------------------------------------------------------ 67 | 68 | size_t avx2_naive_unrolled_strstr(const std::string& s, const std::string& needle) { 69 | 70 | return avx2_naive_unrolled_strstr(s.data(), s.size(), needle.data(), needle.size()); 71 | } 72 | 73 | 74 | -------------------------------------------------------------------------------- /avx2-strstr-v2-clang-specific.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | The following templates implement the loop, where K is a template parameter. 3 | 4 | for (unsigned i=1; i < K; i++) { 5 | const __m256i substring = _mm256_alignr_epi8(next1, curr, i); 6 | eq = _mm256_and_si256(eq, _mm256_cmpeq_epi8(substring, broadcasted[i])); 7 | } 8 | 9 | Clang complains that the loop parameter `i` is a variable and it cannot be 10 | applied as a parameter _mm256_alignr_epi8. GCC somehow deals with it. 
11 | */ 12 | 13 | #ifdef __clang__ 14 | 15 | template 16 | struct inner_loop_aux; 17 | 18 | template 19 | struct inner_loop_aux { 20 | void operator()(__m256i& eq, const __m256i& next1, const __m256i& curr, const __m256i (&broadcasted)[K]) { 21 | const __m256i substring = _mm256_alignr_epi8(next1, curr, i); 22 | eq = _mm256_and_si256(eq, _mm256_cmpeq_epi8(substring, broadcasted[i])); 23 | inner_loop_aux()(eq, next1, curr, broadcasted); 24 | } 25 | }; 26 | 27 | template 28 | struct inner_loop_aux { 29 | void operator()(__m256i&, const __m256i&, const __m256i&, const __m256i (&)[K]) { 30 | // nop 31 | } 32 | }; 33 | 34 | template 35 | struct inner_loop { 36 | void operator()(__m256i& eq, const __m256i& next1, const __m256i& curr, const __m256i (&broadcasted)[K]) { 37 | static_assert(K > 0, "wrong value"); 38 | inner_loop_aux()(eq, next1, curr, broadcasted); 39 | } 40 | }; 41 | 42 | #endif 43 | -------------------------------------------------------------------------------- /avx2-strstr-v2.cpp: -------------------------------------------------------------------------------- 1 | // implements scheme described in http://0x80.pl/articles/simd-friendly-karp-rabin.html 2 | 3 | size_t FORCE_INLINE avx2_strstr_anysize(const char* s, size_t n, const char* needle, size_t k) { 4 | 5 | assert(k > 0); 6 | assert(n > 0); 7 | 8 | const __m256i first = _mm256_set1_epi8(needle[0]); 9 | const __m256i last = _mm256_set1_epi8(needle[k - 1]); 10 | 11 | for (size_t i = 0; i < n; i += 32) { 12 | 13 | const __m256i block_first = _mm256_loadu_si256(reinterpret_cast(s + i)); 14 | const __m256i block_last = _mm256_loadu_si256(reinterpret_cast(s + i + k - 1)); 15 | 16 | const __m256i eq_first = _mm256_cmpeq_epi8(first, block_first); 17 | const __m256i eq_last = _mm256_cmpeq_epi8(last, block_last); 18 | 19 | uint32_t mask = _mm256_movemask_epi8(_mm256_and_si256(eq_first, eq_last)); 20 | 21 | while (mask != 0) { 22 | 23 | const auto bitpos = bits::get_first_bit_set(mask); 24 | 25 | if (memcmp(s + 
i + bitpos + 1, needle + 1, k - 2) == 0) { 26 | return i + bitpos; 27 | } 28 | 29 | mask = bits::clear_leftmost_set(mask); 30 | } 31 | } 32 | 33 | return std::string::npos; 34 | } 35 | 36 | #include "avx2-strstr-v2-clang-specific.cpp" 37 | // Compares all K needle bytes in registers at 32 positions per step; returns the first candidate offset or npos. 38 | template <size_t K> 39 | size_t FORCE_INLINE avx2_strstr_eq(const char* s, size_t n, const char* needle) { 40 | 41 | static_assert(K > 0 && K < 16, "K must be in range [1..15]"); 42 | assert(n > 0); 43 | 44 | __m256i broadcasted[K]; 45 | for (unsigned i=0; i < K; i++) { 46 | broadcasted[i] = _mm256_set1_epi8(needle[i]); 47 | } 48 | 49 | __m256i curr = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(s)); 50 | 51 | for (size_t i = 0; i < n; i += 32) { 52 | 53 | const __m256i next = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(s + i + 32)); 54 | 55 | __m256i eq = _mm256_cmpeq_epi8(curr, broadcasted[0]); 56 | 57 | // AVX2 palignr works on 128-bit lanes, thus some extra work is needed 58 | // 59 | // curr = [a, b] (2 x 128 bit) 60 | // next = [c, d] 61 | // substring = [palignr(b, a, i), palignr(c, b, i)] 62 | // next1 = [b, c]: selector 0x21 puts the high lane of curr in the low half 63 | // and the low lane of next in the high half (no uninitialized read) 64 | const __m256i next1 = _mm256_permute2x128_si256(curr, next, 0x21); 65 | 66 | #ifndef __clang__ 67 | for (unsigned i=1; i < K; i++) { 68 | const __m256i substring = _mm256_alignr_epi8(next1, curr, i); 69 | eq = _mm256_and_si256(eq, _mm256_cmpeq_epi8(substring, broadcasted[i])); 70 | } 71 | #else 72 | inner_loop<K>()(eq, next1, curr, broadcasted); 73 | #endif 74 | 75 | curr = next; 76 | 77 | const uint32_t mask = _mm256_movemask_epi8(eq); 78 | if (mask != 0) { 79 | return i + bits::get_first_bit_set(mask); 80 | } 81 | } 82 | 83 | return std::string::npos; 84 | } 85 | 86 | template 87 | size_t FORCE_INLINE avx2_strstr_memcmp(const char* s, size_t n, const char* needle, MEMCMP memcmp_fun) { 88 | 89 | assert(k > 0); 90 | assert(n > 0); 91 | 92 | const __m256i first = _mm256_set1_epi8(needle[0]); 93 | const __m256i last = 
_mm256_set1_epi8(needle[k - 1]); 94 | 95 | for (size_t i = 0; i < n; i += 32) { 96 | 97 | const __m256i block_first = _mm256_loadu_si256(reinterpret_cast(s + i)); 98 | const __m256i block_last = _mm256_loadu_si256(reinterpret_cast(s + i + k - 1)); 99 | 100 | const __m256i eq_first = _mm256_cmpeq_epi8(first, block_first); 101 | const __m256i eq_last = _mm256_cmpeq_epi8(last, block_last); 102 | 103 | uint32_t mask = _mm256_movemask_epi8(_mm256_and_si256(eq_first, eq_last)); 104 | 105 | while (mask != 0) { 106 | 107 | const auto bitpos = bits::get_first_bit_set(mask); 108 | 109 | if (memcmp_fun(s + i + bitpos + 1, needle + 1)) { 110 | return i + bitpos; 111 | } 112 | 113 | mask = bits::clear_leftmost_set(mask); 114 | } 115 | } 116 | 117 | return std::string::npos; 118 | } 119 | 120 | // ------------------------------------------------------------------------ 121 | // Returns the offset of the first occurrence of needle (k bytes) in s (n bytes), or std::string::npos; dispatches on k. 122 | size_t avx2_strstr_v2(const char* s, size_t n, const char* needle, size_t k) { 123 | 124 | size_t result = std::string::npos; 125 | 126 | if (n < k) { 127 | return result; 128 | } 129 | 130 | switch (k) { 131 | case 0: 132 | return 0; 133 | 134 | case 1: { 135 | const char* res = static_cast<const char*>(memchr(s, needle[0], n)); // bounded by n: s need not be NUL-terminated 136 | 137 | return (res != nullptr) ? 
res - s : std::string::npos; 138 | } 139 | 140 | case 2: 141 | result = avx2_strstr_eq<2>(s, n, needle); 142 | break; 143 | 144 | case 3: 145 | result = avx2_strstr_memcmp<3>(s, n, needle, memcmp1); 146 | break; 147 | 148 | case 4: 149 | result = avx2_strstr_memcmp<4>(s, n, needle, memcmp2); 150 | break; 151 | 152 | case 5: 153 | // Note: use memcmp4 rather memcmp3, as the last character 154 | // of needle is already proven to be equal 155 | result = avx2_strstr_memcmp<5>(s, n, needle, memcmp4); 156 | break; 157 | 158 | case 6: 159 | result = avx2_strstr_memcmp<6>(s, n, needle, memcmp4); 160 | break; 161 | 162 | case 7: 163 | result = avx2_strstr_memcmp<7>(s, n, needle, memcmp5); 164 | break; 165 | 166 | case 8: 167 | result = avx2_strstr_memcmp<8>(s, n, needle, memcmp6); 168 | break; 169 | 170 | case 9: 171 | // Note: use memcmp8 rather memcmp7 for the same reason as above. 172 | result = avx2_strstr_memcmp<9>(s, n, needle, memcmp8); 173 | break; 174 | 175 | case 10: 176 | result = avx2_strstr_memcmp<10>(s, n, needle, memcmp8); 177 | break; 178 | 179 | case 11: 180 | result = avx2_strstr_memcmp<11>(s, n, needle, memcmp9); 181 | break; 182 | 183 | case 12: 184 | result = avx2_strstr_memcmp<12>(s, n, needle, memcmp10); 185 | break; 186 | 187 | default: 188 | result = avx2_strstr_anysize(s, n, needle, k); 189 | break; 190 | } 191 | 192 | if (result <= n - k) { 193 | return result; 194 | } else { 195 | return std::string::npos; 196 | } 197 | } 198 | 199 | // ------------------------------------------------------------------------ 200 | 201 | size_t avx2_strstr_v2(const std::string& s, const std::string& needle) { 202 | 203 | return avx2_strstr_v2(s.data(), s.size(), needle.data(), needle.size()); 204 | } 205 | 206 | 207 | -------------------------------------------------------------------------------- /avx2-strstr.cpp: -------------------------------------------------------------------------------- 1 | size_t avx2_strstr_long(const char* s, size_t n, const char* 
neddle, size_t neddle_size) { 2 | 3 | assert(neddle_size > 4); 4 | assert(n > 0); 5 | 6 | const uint32_t prefix32 = *reinterpret_cast(neddle); 7 | const __m256i prefix = _mm256_set1_epi32(prefix32); 8 | const __m256i zeros = _mm256_setzero_si256(); 9 | 10 | const __m256i permute = _mm256_setr_epi32( 11 | 0, 1, 2, 0, 12 | 2, 3, 4, 0 13 | ); 14 | 15 | for (size_t i = 0; i < n; i += 16) { 16 | 17 | const __m256i in = _mm256_loadu_si256(reinterpret_cast(s + i)); 18 | /* 19 | [00|01|02|03|04|05|06|07|08|09|10|11|12|13|14|15|16|17|18|19|20|21|22|23|24|25|26|27|28|29|30|31] 20 | lane | boundary 21 | [00|01|02|03|04|05|06|07|08|09|10|11|??|??|??|??|08|09|10|11|12|13|14|15|16|17|18|19|??|??|??|??] 22 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 23 | */ 24 | const __m256i data = _mm256_permutevar8x32_epi32(in, permute); 25 | const __m256i result = _mm256_mpsadbw_epu8(data, prefix, 0); 26 | 27 | const __m256i cmp = _mm256_cmpeq_epi16(result, zeros); 28 | 29 | uint32_t mask = _mm256_movemask_epi8(cmp) & 0x55555555u; 30 | 31 | while (mask != 0) { 32 | 33 | const auto bitpos = bits::get_first_bit_set(mask)/2; 34 | 35 | if (memcmp(s + i + bitpos + 4, neddle + 4, neddle_size - 4) == 0) { 36 | return i + bitpos; 37 | } 38 | 39 | mask = bits::clear_leftmost_set(mask); 40 | } 41 | } 42 | 43 | return std::string::npos; 44 | } 45 | 46 | // ------------------------------------------------------------------------ 47 | 48 | size_t avx2_strstr_len4(const char* s, size_t n, const char* neddle) { 49 | 50 | assert(n > 0); 51 | 52 | const uint32_t prefix32 = *reinterpret_cast(neddle); 53 | const __m256i prefix = _mm256_set1_epi32(prefix32); 54 | const __m256i zeros = _mm256_setzero_si256(); 55 | 56 | const __m256i permute = _mm256_setr_epi32( 57 | 0, 1, 2, 0, 58 | 2, 3, 4, 0 59 | ); 60 | 61 | for (size_t i = 0; i < n; i += 16) { 62 | 63 | const __m256i in = _mm256_loadu_si256(reinterpret_cast(s + i)); 64 | const __m256i data = _mm256_permutevar8x32_epi32(in, 
permute); 65 | const __m256i result = _mm256_mpsadbw_epu8(data, prefix, 0); 66 | 67 | const __m256i cmp = _mm256_cmpeq_epi16(result, zeros); 68 | 69 | const uint32_t mask = _mm256_movemask_epi8(cmp) & 0x55555555u; 70 | 71 | if (mask != 0) { 72 | return i + bits::get_first_bit_set(mask)/2; 73 | } 74 | } 75 | 76 | return std::string::npos; 77 | } 78 | 79 | // ------------------------------------------------------------------------ 80 | 81 | size_t avx2_strstr(const char* s, size_t n, const char* neddle, size_t neddle_size) { 82 | 83 | size_t result = std::string::npos; 84 | 85 | if (n < neddle_size) { 86 | return result; 87 | } 88 | 89 | switch (neddle_size) { 90 | case 0: 91 | return 0; 92 | 93 | case 1: { 94 | const char* res = reinterpret_cast(strchr(s, neddle[0])); 95 | 96 | return (res != nullptr) ? res - s : std::string::npos; 97 | } 98 | case 2: 99 | case 3: 100 | { 101 | const char* res = reinterpret_cast(strstr(s, neddle)); 102 | 103 | return (res != nullptr) ? res - s : std::string::npos; 104 | } 105 | 106 | case 4: 107 | result = avx2_strstr_len4(s, n, neddle); 108 | break; 109 | 110 | default: 111 | result = avx2_strstr_long(s, n, neddle, neddle_size); 112 | break; 113 | } 114 | 115 | 116 | if (result <= n - neddle_size) { 117 | return result; 118 | } else { 119 | return std::string::npos; 120 | } 121 | } 122 | 123 | // -------------------------------------------------- 124 | 125 | size_t avx2_strstr(const std::string& s, const std::string& neddle) { 126 | 127 | return avx2_strstr(s.data(), s.size(), neddle.data(), neddle.size()); 128 | } 129 | 130 | -------------------------------------------------------------------------------- /avx512bw-strstr-v2.cpp: -------------------------------------------------------------------------------- 1 | // implements scheme described in http://0x80.pl/articles/simd-friendly-karp-rabin.html 2 | 3 | size_t avx512bw_strstr_v2_anysize(const char* string, size_t n, const char* needle, size_t k) { 4 | 5 | assert(n > 0); 6 | 
assert(k > 0); 7 | 8 | const __m512i first = _mm512_set1_epi8(needle[0]); 9 | const __m512i last = _mm512_set1_epi8(needle[k - 1]); 10 | 11 | char* haystack = const_cast(string); 12 | char* end = haystack + n; 13 | 14 | for (/**/; haystack < end; haystack += 64) { 15 | 16 | const __m512i block_first = _mm512_loadu_si512(haystack + 0); 17 | const __m512i block_last = _mm512_loadu_si512(haystack + k - 1); 18 | 19 | uint64_t mask = _mm512_cmpeq_epi8_mask(block_first, first) 20 | & _mm512_cmpeq_epi8_mask(block_last, last); 21 | 22 | while (mask != 0) { 23 | 24 | const uint64_t bitpos = bits::get_first_bit_set(mask); 25 | const char* s = reinterpret_cast(haystack); 26 | 27 | if (memcmp(s + bitpos + 1, needle + 1, k - 2) == 0) { 28 | return (s - string) + bitpos; 29 | } 30 | 31 | mask = bits::clear_leftmost_set(mask); 32 | } 33 | } 34 | 35 | return size_t(-1); 36 | } 37 | 38 | 39 | template 40 | size_t avx512bw_strstr_v2_memcmp(const char* string, size_t n, const char* needle, MEMCMP memeq_fun) { 41 | 42 | assert(n > 0); 43 | assert(k > 0); 44 | 45 | const __m512i first = _mm512_set1_epi8(needle[0]); 46 | const __m512i last = _mm512_set1_epi8(needle[k - 1]); 47 | 48 | char* haystack = const_cast(string); 49 | char* end = haystack + n; 50 | 51 | for (/**/; haystack < end; haystack += 64) { 52 | 53 | const __m512i block_first = _mm512_loadu_si512(haystack + 0); 54 | const __m512i block_last = _mm512_loadu_si512(haystack + k - 1); 55 | 56 | uint64_t mask = _mm512_cmpeq_epi8_mask(block_first, first) 57 | & _mm512_cmpeq_epi8_mask(block_last, last); 58 | 59 | while (mask != 0) { 60 | 61 | const uint64_t bitpos = bits::get_first_bit_set(mask); 62 | const char* s = reinterpret_cast(haystack); 63 | 64 | if (memeq_fun(s + bitpos + 1, needle + 1)) { 65 | return (s - string) + bitpos; 66 | } 67 | 68 | mask = bits::clear_leftmost_set(mask); 69 | } 70 | } 71 | 72 | return size_t(-1); 73 | } 74 | 75 | // ------------------------------------------------------------------------ 76 | 77 | 
size_t avx512bw_strstr_v2(const char* s, size_t n, const char* needle, size_t k) { 78 | 79 | size_t result = std::string::npos; 80 | 81 | if (n < k) { 82 | return result; 83 | } 84 | 85 | switch (k) { 86 | case 0: 87 | return 0; 88 | 89 | case 1: { 90 | const char* res = reinterpret_cast(strchr(s, needle[0])); 91 | 92 | return (res != nullptr) ? res - s : std::string::npos; 93 | } 94 | 95 | case 2: 96 | result = avx512bw_strstr_v2_memcmp<2>(s, n, needle, always_true); 97 | break; 98 | 99 | case 3: 100 | result = avx512bw_strstr_v2_memcmp<3>(s, n, needle, memcmp1); 101 | break; 102 | 103 | case 4: 104 | result = avx512bw_strstr_v2_memcmp<4>(s, n, needle, memcmp2); 105 | break; 106 | 107 | case 5: 108 | result = avx512bw_strstr_v2_memcmp<5>(s, n, needle, memcmp3); 109 | break; 110 | 111 | case 6: 112 | result = avx512bw_strstr_v2_memcmp<6>(s, n, needle, memcmp4); 113 | break; 114 | 115 | case 7: 116 | result = avx512bw_strstr_v2_memcmp<7>(s, n, needle, memcmp5); 117 | break; 118 | 119 | case 8: 120 | result = avx512bw_strstr_v2_memcmp<8>(s, n, needle, memcmp6); 121 | break; 122 | 123 | case 9: 124 | result = avx512bw_strstr_v2_memcmp<9>(s, n, needle, memcmp7); 125 | break; 126 | 127 | case 10: 128 | result = avx512bw_strstr_v2_memcmp<10>(s, n, needle, memcmp8); 129 | break; 130 | 131 | case 11: 132 | result = avx512bw_strstr_v2_memcmp<11>(s, n, needle, memcmp9); 133 | break; 134 | 135 | case 12: 136 | result = avx512bw_strstr_v2_memcmp<12>(s, n, needle, memcmp10); 137 | break; 138 | 139 | default: 140 | result = avx512bw_strstr_v2_anysize(s, n, needle, k); 141 | break; 142 | } 143 | 144 | if (result <= n - k) { 145 | return result; 146 | } else { 147 | return std::string::npos; 148 | } 149 | } 150 | 151 | // -------------------------------------------------- 152 | 153 | size_t avx512bw_strstr_v2(const std::string& s, const std::string& needle) { 154 | 155 | return avx512bw_strstr_v2(s.data(), s.size(), needle.data(), needle.size()); 156 | } 157 | 158 | 
-------------------------------------------------------------------------------- /avx512bw-strstr-v3.cpp: -------------------------------------------------------------------------------- 1 | // implements scheme described in http://0x80.pl/articles/simd-friendly-karp-rabin.html 2 | 3 | size_t avx512bw_strstr_v3_anysize(const char* string, size_t n, const char* needle, size_t k) { 4 | 5 | assert(n > 0); 6 | assert(k > 0); 7 | 8 | const __m512i first = _mm512_set1_epi8(needle[0]); 9 | const __m512i last = _mm512_set1_epi8(needle[k - 1]); 10 | 11 | char* haystack = const_cast(string); 12 | char* end = haystack + n; 13 | 14 | for (/**/; haystack < end; haystack += 64) { 15 | 16 | const __m512i block_first = _mm512_loadu_si512(haystack + 0); 17 | const __mmask64 first_eq = _mm512_cmpeq_epi8_mask(block_first, first); 18 | 19 | if (first_eq == 0) 20 | continue; 21 | 22 | const __m512i block_last = _mm512_loadu_si512(haystack + k - 1); 23 | uint64_t mask = _mm512_mask_cmpeq_epi8_mask(first_eq, block_last, last); 24 | 25 | while (mask != 0) { 26 | 27 | const uint64_t bitpos = bits::get_first_bit_set(mask); 28 | const char* s = reinterpret_cast(haystack); 29 | 30 | if (memcmp(s + bitpos + 1, needle + 1, k - 2) == 0) { 31 | return (s - string) + bitpos; 32 | } 33 | 34 | mask = bits::clear_leftmost_set(mask); 35 | } 36 | } 37 | 38 | return size_t(-1); 39 | } 40 | 41 | 42 | template 43 | size_t avx512bw_strstr_v3_memcmp(const char* string, size_t n, const char* needle, MEMCMP memeq_fun) { 44 | 45 | assert(n > 0); 46 | assert(k > 0); 47 | 48 | const __m512i first = _mm512_set1_epi8(needle[0]); 49 | const __m512i last = _mm512_set1_epi8(needle[k - 1]); 50 | 51 | char* haystack = const_cast(string); 52 | char* end = haystack + n; 53 | 54 | for (/**/; haystack < end; haystack += 64) { 55 | 56 | const __m512i block_first = _mm512_loadu_si512(haystack + 0); 57 | const __mmask64 first_eq = _mm512_cmpeq_epi8_mask(block_first, first); 58 | 59 | if (first_eq == 0) 60 | continue; 61 | 62 
| const __m512i block_last = _mm512_loadu_si512(haystack + k - 1); 63 | uint64_t mask = _mm512_mask_cmpeq_epi8_mask(first_eq, block_last, last); 64 | 65 | while (mask != 0) { 66 | 67 | const uint64_t bitpos = bits::get_first_bit_set(mask); 68 | const char* s = reinterpret_cast(haystack); 69 | 70 | if (memeq_fun(s + bitpos + 1, needle + 1)) { 71 | return (s - string) + bitpos; 72 | } 73 | 74 | mask = bits::clear_leftmost_set(mask); 75 | } 76 | } 77 | 78 | return size_t(-1); 79 | } 80 | 81 | // ------------------------------------------------------------------------ 82 | 83 | size_t avx512bw_strstr_v3(const char* s, size_t n, const char* needle, size_t k) { 84 | 85 | size_t result = std::string::npos; 86 | 87 | if (n < k) { 88 | return result; 89 | } 90 | 91 | switch (k) { 92 | case 0: 93 | return 0; 94 | 95 | case 1: { 96 | const char* res = reinterpret_cast(strchr(s, needle[0])); 97 | 98 | return (res != nullptr) ? res - s : std::string::npos; 99 | } 100 | 101 | case 2: 102 | result = avx512bw_strstr_v3_memcmp<2>(s, n, needle, always_true); 103 | break; 104 | 105 | case 3: 106 | result = avx512bw_strstr_v3_memcmp<3>(s, n, needle, memcmp1); 107 | break; 108 | 109 | case 4: 110 | result = avx512bw_strstr_v3_memcmp<4>(s, n, needle, memcmp2); 111 | break; 112 | 113 | case 5: 114 | result = avx512bw_strstr_v3_memcmp<5>(s, n, needle, memcmp3); 115 | break; 116 | 117 | case 6: 118 | result = avx512bw_strstr_v3_memcmp<6>(s, n, needle, memcmp4); 119 | break; 120 | 121 | case 7: 122 | result = avx512bw_strstr_v3_memcmp<7>(s, n, needle, memcmp5); 123 | break; 124 | 125 | case 8: 126 | result = avx512bw_strstr_v3_memcmp<8>(s, n, needle, memcmp6); 127 | break; 128 | 129 | case 9: 130 | result = avx512bw_strstr_v3_memcmp<9>(s, n, needle, memcmp7); 131 | break; 132 | 133 | case 10: 134 | result = avx512bw_strstr_v3_memcmp<10>(s, n, needle, memcmp8); 135 | break; 136 | 137 | case 11: 138 | result = avx512bw_strstr_v3_memcmp<11>(s, n, needle, memcmp9); 139 | break; 140 | 141 | case 
12: 142 | result = avx512bw_strstr_v3_memcmp<12>(s, n, needle, memcmp10); 143 | break; 144 | 145 | default: 146 | result = avx512bw_strstr_v3_anysize(s, n, needle, k); 147 | break; 148 | } 149 | 150 | if (result <= n - k) { 151 | return result; 152 | } else { 153 | return std::string::npos; 154 | } 155 | } 156 | 157 | // -------------------------------------------------- 158 | 159 | size_t avx512bw_strstr_v3(const std::string& s, const std::string& needle) { 160 | 161 | return avx512bw_strstr_v3(s.data(), s.size(), needle.data(), needle.size()); 162 | } 163 | 164 | -------------------------------------------------------------------------------- /avx512f-strstr-v2.cpp: -------------------------------------------------------------------------------- 1 | // implements scheme described in http://0x80.pl/articles/simd-friendly-karp-rabin.html 2 | 3 | __mmask16 FORCE_INLINE zero_byte_mask(const __m512i v) { 4 | 5 | const __m512i v01 = _mm512_set1_epi8(0x01); 6 | const __m512i v80 = _mm512_set1_epi8(int8_t(0x80)); 7 | 8 | const __m512i v1 = _mm512_sub_epi32(v, v01); 9 | // tmp1 = (v - 0x01010101) & ~v & 0x80808080 10 | const __m512i tmp1 = _mm512_ternarylogic_epi32(v1, v, v80, 0x20); 11 | 12 | return _mm512_test_epi32_mask(tmp1, tmp1); 13 | } 14 | 15 | 16 | size_t avx512f_strstr_v2_anysize(const char* string, size_t n, const char* needle, size_t k) { 17 | 18 | assert(n > 0); 19 | assert(k > 0); 20 | 21 | const __m512i first = _mm512_set1_epi8(needle[0]); 22 | const __m512i last = _mm512_set1_epi8(needle[k - 1]); 23 | 24 | char* haystack = const_cast(string); 25 | char* end = haystack + n; 26 | 27 | for (/**/; haystack < end; haystack += 64) { 28 | 29 | const __m512i block_first = _mm512_loadu_si512(haystack + 0); 30 | const __m512i block_last = _mm512_loadu_si512(haystack + k - 1); 31 | 32 | #if 0 33 | const __m512i first_zeros = _mm512_xor_si512(block_first, first); 34 | const __m512i last_zeros = _mm512_xor_si512(block_last, last); 35 | const __m512i zeros = 
_mm512_or_si512(first_zeros, last_zeros); 36 | #else 37 | const __m512i first_zeros = _mm512_xor_si512(block_first, first); 38 | /* 39 | first_zeros | block_last | last | first_zeros | (block_last ^ last) 40 | ------------+------------+------+------------------------------------ 41 | 0 | 0 | 0 | 0 42 | 0 | 0 | 1 | 1 43 | 0 | 1 | 0 | 1 44 | 0 | 1 | 1 | 0 45 | 1 | 0 | 0 | 1 46 | 1 | 0 | 1 | 1 47 | 1 | 1 | 0 | 1 48 | 1 | 1 | 1 | 1 49 | */ 50 | const __m512i zeros = _mm512_ternarylogic_epi32(first_zeros, block_last, last, 0xf6); 51 | #endif 52 | 53 | uint32_t mask = zero_byte_mask(zeros); 54 | while (mask) { 55 | 56 | const uint64_t p = __builtin_ctz(mask); 57 | 58 | if (memcmp(haystack + 4*p + 0, needle, k) == 0) { 59 | return (haystack - string) + 4*p + 0; 60 | } 61 | 62 | if (memcmp(haystack + 4*p + 1, needle, k) == 0) { 63 | return (haystack - string) + 4*p + 1; 64 | } 65 | 66 | if (memcmp(haystack + 4*p + 2, needle, k) == 0) { 67 | return (haystack - string) + 4*p + 2; 68 | } 69 | 70 | if (memcmp(haystack + 4*p + 3, needle, k) == 0) { 71 | return (haystack - string) + 4*p + 3; 72 | } 73 | 74 | mask = bits::clear_leftmost_set(mask); 75 | } 76 | } 77 | 78 | return size_t(-1); 79 | } 80 | 81 | 82 | template 83 | size_t avx512f_strstr_v2_memcmp(const char* string, size_t n, const char* needle, MEMCMP memeq_fun) { 84 | 85 | assert(n > 0); 86 | assert(k > 0); 87 | 88 | const __m512i first = _mm512_set1_epi8(needle[0]); 89 | const __m512i last = _mm512_set1_epi8(needle[k - 1]); 90 | 91 | char* haystack = const_cast(string); 92 | char* end = haystack + n; 93 | 94 | for (/**/; haystack < end; haystack += 64) { 95 | 96 | const __m512i block_first = _mm512_loadu_si512(haystack + 0); 97 | const __m512i block_last = _mm512_loadu_si512(haystack + k - 1); 98 | 99 | const __m512i first_zeros = _mm512_xor_si512(block_first, first); 100 | const __m512i zeros = _mm512_ternarylogic_epi32(first_zeros, block_last, last, 0xf6); 101 | 102 | uint32_t mask = zero_byte_mask(zeros); 103 | 
while (mask) { 104 | 105 | const uint64_t p = __builtin_ctz(mask); 106 | 107 | if (memeq_fun(haystack + 4*p + 0, needle)) { 108 | return (haystack - string) + 4*p + 0; 109 | } 110 | 111 | if (memeq_fun(haystack + 4*p + 1, needle)) { 112 | return (haystack - string) + 4*p + 1; 113 | } 114 | 115 | if (memeq_fun(haystack + 4*p + 2, needle)) { 116 | return (haystack - string) + 4*p + 2; 117 | } 118 | 119 | if (memeq_fun(haystack + 4*p + 3, needle)) { 120 | return (haystack - string) + 4*p + 3; 121 | } 122 | 123 | mask = bits::clear_leftmost_set(mask); 124 | } 125 | } 126 | 127 | return size_t(-1); 128 | } 129 | 130 | // ------------------------------------------------------------------------ 131 | 132 | size_t avx512f_strstr_v2(const char* s, size_t n, const char* needle, size_t k) { 133 | 134 | size_t result = std::string::npos; 135 | 136 | if (n < k) { 137 | return result; 138 | } 139 | 140 | switch (k) { 141 | case 0: 142 | return 0; 143 | 144 | case 1: { 145 | const char* res = reinterpret_cast(strchr(s, needle[0])); 146 | 147 | return (res != nullptr) ? 
res - s : std::string::npos; 148 | } 149 | 150 | case 2: 151 | result = avx512f_strstr_v2_memcmp<2>(s, n, needle, memcmp2); 152 | break; 153 | 154 | case 3: 155 | result = avx512f_strstr_v2_memcmp<3>(s, n, needle, memcmp3); 156 | break; 157 | 158 | case 4: 159 | result = avx512f_strstr_v2_memcmp<4>(s, n, needle, memcmp4); 160 | break; 161 | 162 | case 5: 163 | result = avx512f_strstr_v2_memcmp<5>(s, n, needle, memcmp5); 164 | break; 165 | 166 | case 6: 167 | result = avx512f_strstr_v2_memcmp<6>(s, n, needle, memcmp6); 168 | break; 169 | 170 | case 7: 171 | result = avx512f_strstr_v2_memcmp<7>(s, n, needle, memcmp7); 172 | break; 173 | 174 | case 8: 175 | result = avx512f_strstr_v2_memcmp<8>(s, n, needle, memcmp8); 176 | break; 177 | 178 | case 9: 179 | result = avx512f_strstr_v2_memcmp<9>(s, n, needle, memcmp9); 180 | break; 181 | 182 | case 10: 183 | result = avx512f_strstr_v2_memcmp<10>(s, n, needle, memcmp10); 184 | break; 185 | 186 | case 11: 187 | result = avx512f_strstr_v2_memcmp<11>(s, n, needle, memcmp11); 188 | break; 189 | 190 | case 12: 191 | result = avx512f_strstr_v2_memcmp<12>(s, n, needle, memcmp12); 192 | break; 193 | 194 | default: 195 | result = avx512f_strstr_v2_anysize(s, n, needle, k); 196 | break; 197 | } 198 | 199 | if (result <= n - k) { 200 | return result; 201 | } else { 202 | return std::string::npos; 203 | } 204 | } 205 | 206 | // -------------------------------------------------- 207 | 208 | size_t avx512f_strstr_v2(const std::string& s, const std::string& needle) { 209 | 210 | return avx512f_strstr_v2(s.data(), s.size(), needle.data(), needle.size()); 211 | } 212 | 213 | -------------------------------------------------------------------------------- /avx512f-strstr.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | string - pointer to the string 3 | n - string length in bytes 4 | needle - pointer to another string 5 | n - needle length in bytes 6 | */ 7 | size_t avx512f_strstr_long(const 
char* string, size_t n, const char* needle, size_t k) { 8 | 9 | assert(n > 0); 10 | assert(k > 4); 11 | 12 | __m512i curr; 13 | __m512i next; 14 | __m512i v0, v1, v2, v3; 15 | 16 | char* haystack = const_cast(string); 17 | char* last = haystack + n; 18 | 19 | const uint32_t prf = *(uint32_t*)needle; // the first 4 bytes of needle 20 | const __m512i prefix = _mm512_set1_epi32(prf); 21 | 22 | next = _mm512_loadu_si512(haystack); 23 | 24 | for (/**/; haystack < last; haystack += 64) { 25 | 26 | curr = next; 27 | next = _mm512_loadu_si512(haystack + 64); 28 | const __m512i shft = _mm512_alignr_epi32(next, curr, 1); 29 | 30 | v0 = curr; 31 | 32 | { 33 | const __m512i t1 = _mm512_srli_epi32(curr, 8); 34 | const __m512i t2 = _mm512_slli_epi32(shft, 24); 35 | v1 = _mm512_or_si512(t1, t2); 36 | } 37 | { 38 | const __m512i t1 = _mm512_srli_epi32(curr, 16); 39 | const __m512i t2 = _mm512_slli_epi32(shft, 16); 40 | v2 = _mm512_or_si512(t1, t2); 41 | } 42 | { 43 | const __m512i t1 = _mm512_srli_epi32(curr, 24); 44 | const __m512i t2 = _mm512_slli_epi32(shft, 8); 45 | v3 = _mm512_or_si512(t1, t2); 46 | } 47 | 48 | uint16_t m0 = _mm512_cmpeq_epi32_mask(v0, prefix); 49 | uint16_t m1 = _mm512_cmpeq_epi32_mask(v1, prefix); 50 | uint16_t m2 = _mm512_cmpeq_epi32_mask(v2, prefix); 51 | uint16_t m3 = _mm512_cmpeq_epi32_mask(v3, prefix); 52 | 53 | int index = 64; 54 | while (m0 | m1 | m2 | m3) { 55 | if (m0) { 56 | int pos = __builtin_ctz(m0) * 4 + 0; 57 | m0 = m0 & (m0 - 1); 58 | 59 | if (pos < index && memcmp(haystack + pos + 4, needle + 4, k - 4) == 0) { 60 | index = pos; 61 | } 62 | } 63 | 64 | if (m1) { 65 | int pos = __builtin_ctz(m1) * 4 + 1; 66 | m1 = m1 & (m1 - 1); 67 | 68 | if (pos < index && memcmp(haystack + pos + 4, needle + 4, k - 4) == 0) { 69 | index = pos; 70 | } 71 | } 72 | 73 | if (m2) { 74 | int pos = __builtin_ctz(m2) * 4 + 2; 75 | m2 = m2 & (m2 - 1); 76 | 77 | if (pos < index && memcmp(haystack + pos + 4, needle + 4, k - 4) == 0) { 78 | index = pos; 79 | } 80 | } 81 
| 82 | if (m3) { 83 | int pos = __builtin_ctz(m3) * 4 + 3; 84 | m3 = m3 & (m3 - 1); 85 | 86 | if (pos < index && memcmp(haystack + pos + 4, needle + 4, k - 4) == 0) { 87 | index = pos; 88 | } 89 | } 90 | } 91 | 92 | if (index < 64) { 93 | return (haystack - string) + index; 94 | } 95 | } 96 | 97 | return size_t(-1); 98 | } 99 | 100 | // ------------------------------------------------------------------------ 101 | 102 | size_t avx512f_strstr_eq4(const char* string, size_t n, const char* needle) { 103 | 104 | assert(n > 0); 105 | 106 | __m512i curr; 107 | __m512i next; 108 | __m512i v0, v1, v2, v3; 109 | 110 | char* haystack = const_cast(string); 111 | char* last = haystack + n; 112 | 113 | const uint32_t prf = *(uint32_t*)needle; // the first 4 bytes of needle 114 | const __m512i prefix = _mm512_set1_epi32(prf); 115 | 116 | next = _mm512_loadu_si512(haystack); 117 | 118 | for (/**/; haystack < last; haystack += 64) { 119 | 120 | curr = next; 121 | next = _mm512_loadu_si512(haystack + 64); 122 | const __m512i shft = _mm512_alignr_epi32(next, curr, 1); 123 | 124 | v0 = curr; 125 | 126 | { 127 | const __m512i t1 = _mm512_srli_epi32(curr, 8); 128 | const __m512i t2 = _mm512_slli_epi32(shft, 24); 129 | v1 = _mm512_or_si512(t1, t2); 130 | } 131 | { 132 | const __m512i t1 = _mm512_srli_epi32(curr, 16); 133 | const __m512i t2 = _mm512_slli_epi32(shft, 16); 134 | v2 = _mm512_or_si512(t1, t2); 135 | } 136 | { 137 | const __m512i t1 = _mm512_srli_epi32(curr, 24); 138 | const __m512i t2 = _mm512_slli_epi32(shft, 8); 139 | v3 = _mm512_or_si512(t1, t2); 140 | } 141 | 142 | uint16_t m0 = _mm512_cmpeq_epi32_mask(v0, prefix); 143 | uint16_t m1 = _mm512_cmpeq_epi32_mask(v1, prefix); 144 | uint16_t m2 = _mm512_cmpeq_epi32_mask(v2, prefix); 145 | uint16_t m3 = _mm512_cmpeq_epi32_mask(v3, prefix); 146 | 147 | int index = 64; 148 | if (m0) { 149 | int pos = __builtin_ctz(m0) * 4 + 0; 150 | if (pos < index) { 151 | index = pos; 152 | } 153 | } 154 | 155 | if (m1) { 156 | int pos = 
__builtin_ctz(m1) * 4 + 1; 157 | if (pos < index) { 158 | index = pos; 159 | } 160 | } 161 | 162 | if (m2) { 163 | int pos = __builtin_ctz(m2) * 4 + 2; 164 | if (pos < index) { 165 | index = pos; 166 | } 167 | } 168 | 169 | if (m3) { 170 | int pos = __builtin_ctz(m3) * 4 + 3; 171 | if (pos < index) { 172 | index = pos; 173 | } 174 | } 175 | 176 | if (index < 64) { 177 | return (haystack - string) + index; 178 | } 179 | 180 | assert(m0 == 0 && m1 == 0 && m2 == 0 && m3 == 0); 181 | } 182 | 183 | return size_t(-1); 184 | } 185 | 186 | // ------------------------------------------------------------------------ 187 | 188 | size_t avx512f_strstr(const char* s, size_t n, const char* needle, size_t needle_size) { 189 | 190 | size_t result = std::string::npos; 191 | 192 | if (n < needle_size) { 193 | return result; 194 | } 195 | 196 | switch (needle_size) { 197 | case 0: 198 | return 0; 199 | 200 | case 1: { 201 | const char* res = reinterpret_cast(strchr(s, needle[0])); 202 | 203 | return (res != nullptr) ? res - s : std::string::npos; 204 | } 205 | case 2: 206 | case 3: { 207 | const char* res = reinterpret_cast(strstr(s, needle)); 208 | 209 | return (res != nullptr) ? 
res - s : std::string::npos; 210 | } 211 | 212 | case 4: 213 | result = avx512f_strstr_eq4(s, n, needle); 214 | break; 215 | 216 | default: 217 | result = avx512f_strstr_long(s, n, needle, needle_size); 218 | break; 219 | } 220 | 221 | 222 | if (result <= n - needle_size) { 223 | return result; 224 | } else { 225 | return std::string::npos; 226 | } 227 | } 228 | 229 | // -------------------------------------------------- 230 | 231 | size_t avx512f_strstr(const std::string& s, const std::string& needle) { 232 | 233 | return avx512f_strstr(s.data(), s.size(), needle.data(), needle.size()); 234 | } 235 | -------------------------------------------------------------------------------- /common.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #define FORCE_INLINE inline __attribute__((always_inline)) 4 | #define MAYBE_UNUSED inline __attribute__((unused)) 5 | 6 | #if defined(HAVE_NEON_INSTRUCTIONS) 7 | # include 8 | # define USE_SIMPLE_MEMCMP // for fixed-memcmp.cpp 9 | #else 10 | # include 11 | #endif 12 | -------------------------------------------------------------------------------- /data/placeholder: -------------------------------------------------------------------------------- 1 | placeholder 2 | -------------------------------------------------------------------------------- /fixed-memcmp.cpp: -------------------------------------------------------------------------------- 1 | // #define USE_SIMPLE_MEMCMP // when defined simpler expressions are used 2 | 3 | namespace { 4 | 5 | MAYBE_UNUSED 6 | bool always_true(const char*, const char*) { 7 | return true; 8 | } 9 | 10 | MAYBE_UNUSED 11 | bool memcmp1(const char* a, const char* b) { 12 | return a[0] == b[0]; 13 | } 14 | 15 | MAYBE_UNUSED 16 | bool memcmp2(const char* a, const char* b) { 17 | const uint16_t A = *reinterpret_cast(a); 18 | const uint16_t B = *reinterpret_cast(b); 19 | return A == B; 20 | } 21 | 22 | MAYBE_UNUSED 23 | bool memcmp3(const char* 
a, const char* b) { 24 | 25 | #ifdef USE_SIMPLE_MEMCMP 26 | return memcmp2(a, b) && memcmp1(a + 2, b + 2); 27 | #else 28 | const uint32_t A = *reinterpret_cast(a); 29 | const uint32_t B = *reinterpret_cast(b); 30 | return (A & 0x00ffffff) == (B & 0x00ffffff); 31 | #endif 32 | } 33 | 34 | MAYBE_UNUSED 35 | bool memcmp4(const char* a, const char* b) { 36 | 37 | const uint32_t A = *reinterpret_cast(a); 38 | const uint32_t B = *reinterpret_cast(b); 39 | return A == B; 40 | } 41 | 42 | MAYBE_UNUSED 43 | bool memcmp5(const char* a, const char* b) { 44 | 45 | #ifdef USE_SIMPLE_MEMCMP 46 | return memcmp4(a, b) && memcmp1(a + 4, b + 4); 47 | #else 48 | const uint64_t A = *reinterpret_cast(a); 49 | const uint64_t B = *reinterpret_cast(b); 50 | return ((A ^ B) & 0x000000fffffffffflu) == 0; 51 | #endif 52 | } 53 | 54 | MAYBE_UNUSED 55 | bool memcmp6(const char* a, const char* b) { 56 | 57 | #ifdef USE_SIMPLE_MEMCMP 58 | return memcmp4(a, b) && memcmp2(a + 4, b + 4); 59 | #else 60 | const uint64_t A = *reinterpret_cast(a); 61 | const uint64_t B = *reinterpret_cast(b); 62 | return ((A ^ B) & 0x0000fffffffffffflu) == 0; 63 | #endif 64 | } 65 | 66 | MAYBE_UNUSED 67 | bool memcmp7(const char* a, const char* b) { 68 | 69 | #ifdef USE_SIMPLE_MEMCMP 70 | return memcmp4(a, b) && memcmp3(a + 4, b + 4); 71 | #else 72 | const uint64_t A = *reinterpret_cast(a); 73 | const uint64_t B = *reinterpret_cast(b); 74 | return ((A ^ B) & 0x00fffffffffffffflu) == 0; 75 | #endif 76 | } 77 | 78 | MAYBE_UNUSED 79 | bool memcmp8(const char* a, const char* b) { 80 | 81 | const uint64_t A = *reinterpret_cast(a); 82 | const uint64_t B = *reinterpret_cast(b); 83 | return A == B; 84 | } 85 | 86 | MAYBE_UNUSED 87 | bool memcmp9(const char* a, const char* b) { 88 | 89 | const uint64_t A = *reinterpret_cast(a); 90 | const uint64_t B = *reinterpret_cast(b); 91 | return (A == B) & (a[8] == b[8]); 92 | } 93 | 94 | MAYBE_UNUSED 95 | bool memcmp10(const char* a, const char* b) { 96 | 97 | const uint64_t Aq = 
*reinterpret_cast(a); 98 | const uint64_t Bq = *reinterpret_cast(b); 99 | const uint16_t Aw = *reinterpret_cast(a + 8); 100 | const uint16_t Bw = *reinterpret_cast(b + 8); 101 | return (Aq == Bq) & (Aw == Bw); 102 | } 103 | 104 | MAYBE_UNUSED 105 | bool memcmp11(const char* a, const char* b) { 106 | 107 | #ifdef USE_SIMPLE_MEMCMP 108 | return memcmp8(a, b) && memcmp3(a + 8, b + 8); 109 | #else 110 | const uint64_t Aq = *reinterpret_cast(a); 111 | const uint64_t Bq = *reinterpret_cast(b); 112 | const uint32_t Ad = *reinterpret_cast(a + 8); 113 | const uint32_t Bd = *reinterpret_cast(b + 8); 114 | return (Aq == Bq) & ((Ad & 0x00ffffff) == (Bd & 0x00ffffff)); 115 | #endif 116 | } 117 | 118 | MAYBE_UNUSED 119 | bool memcmp12(const char* a, const char* b) { 120 | 121 | const uint64_t Aq = *reinterpret_cast(a); 122 | const uint64_t Bq = *reinterpret_cast(b); 123 | const uint32_t Ad = *reinterpret_cast(a + 8); 124 | const uint32_t Bd = *reinterpret_cast(b + 8); 125 | return (Aq == Bq) & (Ad == Bd); 126 | } 127 | 128 | } 129 | 130 | -------------------------------------------------------------------------------- /make_words.sh: -------------------------------------------------------------------------------- 1 | # split words 2 | cat $1 \ 3 | | tr -s -c "a-zA-Z" "\n" \ 4 | | sort -u \ 5 | > $2 6 | -------------------------------------------------------------------------------- /neon-strstr-v2.cpp: -------------------------------------------------------------------------------- 1 | size_t FORCE_INLINE neon_strstr_anysize(const char* s, size_t n, const char* needle, size_t k) { 2 | 3 | assert(k > 0); 4 | assert(n > 0); 5 | 6 | const uint8x16_t first = vdupq_n_u8(needle[0]); 7 | const uint8x16_t last = vdupq_n_u8(needle[k - 1]); 8 | const uint8x8_t half = vdup_n_u8(0x0f); 9 | 10 | const uint8_t* ptr = reinterpret_cast(s); 11 | 12 | union { 13 | uint8_t tmp[8]; 14 | uint32_t word[2]; 15 | }; 16 | 17 | for (size_t i = 0; i < n; i += 16) { 18 | 19 | const uint8x16_t block_first = 
vld1q_u8(ptr + i); 20 | const uint8x16_t block_last = vld1q_u8(ptr + i + k - 1); 21 | 22 | const uint8x16_t eq_first = vceqq_u8(first, block_first); 23 | const uint8x16_t eq_last = vceqq_u8(last, block_last); 24 | const uint8x16_t pred_16 = vandq_u8(eq_first, eq_last); 25 | const uint8x8_t pred_8 = vbsl_u8(half, vget_low_u8(pred_16), vget_high_u8(pred_16)); 26 | 27 | vst1_u8(tmp, pred_8); 28 | 29 | if ((word[0] | word[1]) == 0) { 30 | continue; 31 | } 32 | 33 | #if 0 34 | for (int j=0; j < 8; j++) { 35 | if ((tmp[j] & 0x0f) && (memcmp(s + i + j + 1, needle + 1, k - 2) == 0)) { 36 | return i + j; 37 | } 38 | } 39 | 40 | for (int j=0; j < 8; j++) { 41 | if ((tmp[j] & 0xf0) && (memcmp(s + i + j + 1 + 8, needle + 1, k - 2) == 0)) { 42 | return i + j + 8; 43 | } 44 | } 45 | #else 46 | // the above loops unrolled 47 | uint32_t v; 48 | 49 | #define RETURN_IF_EQ(MASK, SHIFT) \ 50 | if ((v & MASK) && memcmp(s + i + SHIFT + 1, needle + 1, k - 2) == 0) { \ 51 | return i + SHIFT; \ 52 | } 53 | 54 | #define COMPARE(MASK, WORD_IDX, SHIFT) \ 55 | v = word[WORD_IDX]; \ 56 | RETURN_IF_EQ(MASK, SHIFT + 0); \ 57 | v >>= 8; \ 58 | RETURN_IF_EQ(MASK, SHIFT + 1); \ 59 | v >>= 8; \ 60 | RETURN_IF_EQ(MASK, SHIFT + 2); \ 61 | v >>= 8; \ 62 | RETURN_IF_EQ(MASK, SHIFT + 3); 63 | 64 | COMPARE(0x0f, 0, 0); 65 | COMPARE(0x0f, 1, 4); 66 | COMPARE(0xf0, 0, 8); 67 | COMPARE(0xf0, 1, 12); 68 | 69 | #undef RETURN_IF_EQ 70 | #undef COMPARE 71 | 72 | #endif 73 | } 74 | 75 | return std::string::npos; 76 | } 77 | 78 | // ------------------------------------------------------------------------ 79 | 80 | template 81 | size_t FORCE_INLINE neon_strstr_memcmp(const char* s, size_t n, const char* needle, MEMCMP memcmp_fun) { 82 | 83 | assert(k > 0); 84 | assert(n > 0); 85 | 86 | const uint8x16_t first = vdupq_n_u8(needle[0]); 87 | const uint8x16_t last = vdupq_n_u8(needle[k - 1]); 88 | const uint8x8_t half = vdup_n_u8(0x0f); 89 | 90 | const uint8_t* ptr = reinterpret_cast(s); 91 | 92 | union { 93 | uint8_t 
tmp[8]; 94 | uint32_t word[2]; 95 | }; 96 | 97 | for (size_t i = 0; i < n; i += 16) { 98 | 99 | const uint8x16_t block_first = vld1q_u8(ptr + i); 100 | const uint8x16_t block_last = vld1q_u8(ptr + i + k - 1); 101 | 102 | const uint8x16_t eq_first = vceqq_u8(first, block_first); 103 | const uint8x16_t eq_last = vceqq_u8(last, block_last); 104 | const uint8x16_t pred_16 = vandq_u8(eq_first, eq_last); 105 | const uint8x8_t pred_8 = vbsl_u8(half, vget_low_u8(pred_16), vget_high_u8(pred_16)); 106 | 107 | vst1_u8(tmp, pred_8); 108 | 109 | if ((word[0] | word[1]) == 0) { 110 | continue; 111 | } 112 | 113 | #if 0 114 | for (int j=0; j < 8; j++) { 115 | if ((tmp[j] & 0x0f) && memcmp_fun(s + i + j + 1, needle + 1)) { 116 | return i + j; 117 | } 118 | } 119 | 120 | for (int j=0; j < 8; j++) { 121 | if ((tmp[j] & 0xf0) && memcmp_fun(s + i + j + 1 + 8, needle + 1)) { 122 | return i + j + 8; 123 | } 124 | } 125 | #else 126 | // the above loops unrolled 127 | uint32_t v; 128 | 129 | #define RETURN_IF_EQ(MASK, SHIFT) \ 130 | if ((v & MASK) && memcmp_fun(s + i + SHIFT + 1, needle + 1)) { \ 131 | return i + SHIFT; \ 132 | } 133 | 134 | #define COMPARE(MASK, WORD_IDX, SHIFT) \ 135 | v = word[WORD_IDX]; \ 136 | RETURN_IF_EQ(MASK, SHIFT + 0); \ 137 | v >>= 8; \ 138 | RETURN_IF_EQ(MASK, SHIFT + 1); \ 139 | v >>= 8; \ 140 | RETURN_IF_EQ(MASK, SHIFT + 2); \ 141 | v >>= 8; \ 142 | RETURN_IF_EQ(MASK, SHIFT + 3); 143 | 144 | COMPARE(0x0f, 0, 0); 145 | COMPARE(0x0f, 1, 4); 146 | COMPARE(0xf0, 0, 8); 147 | COMPARE(0xf0, 1, 12); 148 | 149 | #undef RETURN_IF_EQ 150 | #undef COMPARE 151 | 152 | #endif 153 | } 154 | 155 | return std::string::npos; 156 | } 157 | 158 | // ------------------------------------------------------------------------ 159 | 160 | size_t neon_strstr_v2(const char* s, size_t n, const char* needle, size_t k) { 161 | 162 | size_t result = std::string::npos; 163 | 164 | if (n < k) { 165 | return result; 166 | } 167 | 168 | switch (k) { 169 | case 0: 170 | return 0; 171 | 172 | 
case 1: { 173 | const char* res = reinterpret_cast(strchr(s, needle[0])); 174 | 175 | return (res != nullptr) ? res - s : std::string::npos; 176 | } 177 | 178 | case 2: 179 | result = neon_strstr_memcmp<2>(s, n, needle, always_true); 180 | break; 181 | 182 | case 3: 183 | result = neon_strstr_memcmp<3>(s, n, needle, memcmp1); 184 | break; 185 | 186 | case 4: 187 | result = neon_strstr_memcmp<4>(s, n, needle, memcmp2); 188 | break; 189 | 190 | case 5: 191 | result = neon_strstr_memcmp<5>(s, n, needle, memcmp4); 192 | break; 193 | 194 | case 6: 195 | result = neon_strstr_memcmp<6>(s, n, needle, memcmp4); 196 | break; 197 | 198 | case 7: 199 | result = neon_strstr_memcmp<7>(s, n, needle, memcmp5); 200 | break; 201 | 202 | case 8: 203 | result = neon_strstr_memcmp<8>(s, n, needle, memcmp6); 204 | break; 205 | 206 | case 9: 207 | result = neon_strstr_memcmp<9>(s, n, needle, memcmp8); 208 | break; 209 | 210 | case 10: 211 | result = neon_strstr_memcmp<10>(s, n, needle, memcmp8); 212 | break; 213 | 214 | case 11: 215 | result = neon_strstr_memcmp<11>(s, n, needle, memcmp9); 216 | break; 217 | 218 | case 12: 219 | result = neon_strstr_memcmp<12>(s, n, needle, memcmp10); 220 | break; 221 | 222 | default: 223 | result = neon_strstr_anysize(s, n, needle, k); 224 | break; 225 | } 226 | 227 | if (result <= n - k) { 228 | return result; 229 | } else { 230 | return std::string::npos; 231 | } 232 | } 233 | 234 | // ------------------------------------------------------------------------ 235 | 236 | size_t neon_strstr_v2(const std::string& s, const std::string& needle) { 237 | 238 | return neon_strstr_v2(s.data(), s.size(), needle.data(), needle.size()); 239 | } 240 | 241 | 242 | -------------------------------------------------------------------------------- /original/sse4_strstr-test.py: -------------------------------------------------------------------------------- 1 | import sys, os, random 2 | 3 | filename = "" 4 | try: 5 | filename = sys.argv[1] 6 | string = open(filename, 
"r").read() 7 | except: 8 | print "can't open '%s'" % filename 9 | sys.exit(1) 10 | 11 | try: 12 | random.seed(int(sys.argv[3])) 13 | except: 14 | pass 15 | 16 | def time_command(command): 17 | os.system('/usr/bin/time -o /tmp/measure -f "%U" ' + command) 18 | f = open("/tmp/measure", "r") 19 | t = float(f.read()) 20 | f.close() 21 | return t 22 | 23 | 24 | def time(command1, command2, iters=10): 25 | while True: 26 | t1 = time_command(command1.replace("__iters__", str(iters))) 27 | if t1 > 1: 28 | t2 = time_command(command2.replace("__iters__", str(iters))) 29 | return iters, t1, t2 30 | else: 31 | iters *= 10 32 | 33 | 34 | def compare(filename, wordpos, word, wordlen): 35 | word = word.replace("%", "%%") 36 | cmd1 = './a.out "%s" libc __iters__ "%s" > /dev/null' % (filename, word) 37 | cmd2 = './a.out "%s" sse4 __iters__ "%s" > /dev/null' % (filename, word) 38 | _, t1, t2 = time(cmd1, cmd2) 39 | 40 | return "[%d,%d] libc=%0.3fs sse4=%0.3fs speedup=%0.2f" % (wordpos, wordlen, t1, t2, t1/t2) 41 | 42 | 43 | logname = "sse4.log" 44 | lognumber = 1 45 | while True: 46 | if not os.path.exists(logname): 47 | log = open(logname, "w") 48 | break 49 | else: 50 | logname = "sse4%d.log" % lognumber 51 | lognumber += 1 52 | 53 | 54 | try: 55 | for n in xrange(4, 64): 56 | i1 = random.randint( 0, 64) 57 | i2 = random.randint( 65, 1024) 58 | i3 = random.randint(1024, len(string)-n) 59 | print "length", n 60 | for i in [i1, i2, i3]: 61 | word = string[i:i+n] 62 | for c in "\\`()<>{}\"": 63 | word = word.replace(c, "\\" + c) 64 | 65 | cmd = './a.out "%s" verify 1 "%s"' % (filename, word) 66 | err = os.system(cmd) 67 | if err: 68 | print repr(string[i:i+l]) 69 | sys.exit(1) 70 | else: 71 | s = compare(filename, i, word, n) 72 | log.write(s + "\n") 73 | print s 74 | except: 75 | import traceback 76 | traceback.print_exc() 77 | log.close() 78 | -------------------------------------------------------------------------------- /original/sse4_strstr.c: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/WojciechMula/sse4-strstr/9cdc4b6df817c8a8c67a1cebdc7e18a1da35f407/original/sse4_strstr.c -------------------------------------------------------------------------------- /results/armv7-32bit-gcc4.9.2.txt: -------------------------------------------------------------------------------- 1 | ./speedup_arm data/i386.txt data/words 1 2 | std::strstr ... reference result = 810807651, time = 7.318775 s 3 | std::string::find ... reference result = 810807651, time = 4.171311 s 4 | SWAR 32-bit (generic) ... reference result = 810807651, time = 2.450585 s 5 | ARM Neon 32 bit (v2) ... reference result = 810807651, time = 1.299383 s 6 | ./speedup_arm data/i386.txt data/words 1 7 | std::strstr ... reference result = 810807651, time = 7.329223 s 8 | std::string::find ... reference result = 810807651, time = 4.188313 s 9 | SWAR 32-bit (generic) ... reference result = 810807651, time = 2.461333 s 10 | ARM Neon 32 bit (v2) ... reference result = 810807651, time = 1.305622 s 11 | ./speedup_arm data/i386.txt data/words 1 12 | std::strstr ... reference result = 810807651, time = 7.304049 s 13 | std::string::find ... reference result = 810807651, time = 4.172608 s 14 | SWAR 32-bit (generic) ... reference result = 810807651, time = 2.451913 s 15 | ARM Neon 32 bit (v2) ... reference result = 810807651, time = 1.300619 s 16 | ./speedup_arm data/i386.txt data/words 1 17 | std::strstr ... reference result = 810807651, time = 7.307621 s 18 | std::string::find ... reference result = 810807651, time = 4.176439 s 19 | SWAR 32-bit (generic) ... reference result = 810807651, time = 2.451030 s 20 | ARM Neon 32 bit (v2) ... reference result = 810807651, time = 1.299274 s 21 | ./speedup_arm data/i386.txt data/words 1 22 | std::strstr ... reference result = 810807651, time = 7.313498 s 23 | std::string::find ... 
reference result = 810807651, time = 4.175714 s 24 | SWAR 32-bit (generic) ... reference result = 810807651, time = 2.451439 s 25 | ARM Neon 32 bit (v2) ... reference result = 810807651, time = 1.298613 s 26 | -------------------------------------------------------------------------------- /results/armv8-64bit-clang3.8.0.txt: -------------------------------------------------------------------------------- 1 | std::strstr ... reference result = 810807651, time = 3.457578 s 2 | std::string::find ... reference result = 810807651, time = 1.821379 s 3 | SWAR 64-bit (generic) ... reference result = 810807651, time = 0.463006 s 4 | SWAR 32-bit (generic) ... reference result = 810807651, time = 0.810749 s 5 | ARM Neon 32 bit (v2) ... reference result = 810807651, time = 0.407214 s 6 | AArch64 64 bit (v2) ... reference result = 810807651, time = 0.279203 s 7 | std::strstr ... reference result = 810807651, time = 3.381364 s 8 | std::string::find ... reference result = 810807651, time = 1.813678 s 9 | SWAR 64-bit (generic) ... reference result = 810807651, time = 0.462694 s 10 | SWAR 32-bit (generic) ... reference result = 810807651, time = 0.810882 s 11 | ARM Neon 32 bit (v2) ... reference result = 810807651, time = 0.406888 s 12 | AArch64 64 bit (v2) ... reference result = 810807651, time = 0.278970 s 13 | std::strstr ... reference result = 810807651, time = 4.118293 s 14 | std::string::find ... reference result = 810807651, time = 1.822696 s 15 | SWAR 64-bit (generic) ... reference result = 810807651, time = 0.463028 s 16 | SWAR 32-bit (generic) ... reference result = 810807651, time = 0.810933 s 17 | ARM Neon 32 bit (v2) ... reference result = 810807651, time = 0.407296 s 18 | AArch64 64 bit (v2) ... reference result = 810807651, time = 0.279606 s 19 | std::strstr ... reference result = 810807651, time = 3.375462 s 20 | std::string::find ... reference result = 810807651, time = 1.821449 s 21 | SWAR 64-bit (generic) ... 
reference result = 810807651, time = 0.462863 s 22 | SWAR 32-bit (generic) ... reference result = 810807651, time = 0.811320 s 23 | ARM Neon 32 bit (v2) ... reference result = 810807651, time = 0.407274 s 24 | AArch64 64 bit (v2) ... reference result = 810807651, time = 0.279285 s 25 | std::strstr ... reference result = 810807651, time = 3.378566 s 26 | std::string::find ... reference result = 810807651, time = 1.825054 s 27 | SWAR 64-bit (generic) ... reference result = 810807651, time = 0.462957 s 28 | SWAR 32-bit (generic) ... reference result = 810807651, time = 0.811188 s 29 | ARM Neon 32 bit (v2) ... reference result = 810807651, time = 0.407364 s 30 | AArch64 64 bit (v2) ... reference result = 810807651, time = 0.279490 s 31 | -------------------------------------------------------------------------------- /results/bulldozer-fx-8510-gcc4.8.4-sse.txt: -------------------------------------------------------------------------------- 1 | ./speedup data/i386.txt data/words 2 | std::strstr ... reference result = 8108076510, time = 9.390892 s 3 | SWAR 64-bit (generic) ... reference result = 8108076510, time = 2.938355 s 4 | SSE2 (generic) ... reference result = 8108076510, time = 0.788781 s 5 | SSE4.1 (MPSADBW) ... reference result = 8108076510, time = 1.989833 s 6 | SSE4.1 (MPSADBW unrolled) ... reference result = 8108076510, time = 2.060081 s 7 | SSE4.2 (PCMPESTRM) ... reference result = 8108076510, time = 2.006810 s 8 | ./speedup data/i386.txt data/words 9 | std::strstr ... reference result = 8108076510, time = 9.387153 s 10 | SWAR 64-bit (generic) ... reference result = 8108076510, time = 2.948608 s 11 | SSE2 (generic) ... reference result = 8108076510, time = 0.789325 s 12 | SSE4.1 (MPSADBW) ... reference result = 8108076510, time = 1.988635 s 13 | SSE4.1 (MPSADBW unrolled) ... reference result = 8108076510, time = 2.066327 s 14 | SSE4.2 (PCMPESTRM) ... 
reference result = 8108076510, time = 2.007233 s 15 | ./speedup data/i386.txt data/words 16 | std::strstr ... reference result = 8108076510, time = 9.377923 s 17 | SWAR 64-bit (generic) ... reference result = 8108076510, time = 2.967027 s 18 | SSE2 (generic) ... reference result = 8108076510, time = 0.788709 s 19 | SSE4.1 (MPSADBW) ... reference result = 8108076510, time = 1.989077 s 20 | SSE4.1 (MPSADBW unrolled) ... reference result = 8108076510, time = 2.065608 s 21 | SSE4.2 (PCMPESTRM) ... reference result = 8108076510, time = 2.007228 s 22 | 23 | -------------------------------------------------------------------------------- /results/cascadelake-Gold-5217-gcc-7.4.0-avx512bw.txt: -------------------------------------------------------------------------------- 1 | ./speedup_avx512bw data/i386.txt data/words 2 | scalar (naive) ... reference result = 8108076510, time = 4.095307 s 3 | std::strstr ... reference result = 8108076510, time = 0.492459 s 4 | SWAR 64-bit (generic) ... reference result = 8108076510, time = 1.243510 s 5 | SWAR 32-bit (generic) ... reference result = 8108076510, time = 2.349437 s 6 | SSE2 (generic) ... reference result = 8108076510, time = 0.443313 s 7 | SSE4.1 (MPSADBW) ... reference result = 8108076510, time = 0.583372 s 8 | SSE4.1 (MPSADBW unrolled) ... reference result = 8108076510, time = 0.822263 s 9 | SSE4.2 (PCMPESTRM) ... reference result = 8108076510, time = 1.311350 s 10 | SSE (naive) ... reference result = 8108076510, time = 1.757493 s 11 | AVX2 (MPSADBW) ... reference result = 8108076510, time = 0.531920 s 12 | AVX2 (generic) ... reference result = 8108076510, time = 0.338738 s 13 | AVX2 (naive) ... reference result = 8108076510, time = 1.013489 s 14 | AVX2-wide (naive) ... reference result = 8107771150, time = 0.480182 s 15 | AVX512F (MPSADBW-like) ... reference result = 8108076510, time = 0.634909 s 16 | AVX512F (generic) ... reference result = 8108076510, time = 0.281276 s 17 | AVX512BW (generic) ... 
reference result = 8108076510, time = 0.256798 s 18 | ./speedup_avx512bw data/i386.txt data/words 19 | scalar (naive) ... reference result = 8108076510, time = 4.089051 s 20 | std::strstr ... reference result = 8108076510, time = 0.492275 s 21 | SWAR 64-bit (generic) ... reference result = 8108076510, time = 1.243637 s 22 | SWAR 32-bit (generic) ... reference result = 8108076510, time = 2.343078 s 23 | SSE2 (generic) ... reference result = 8108076510, time = 0.443659 s 24 | SSE4.1 (MPSADBW) ... reference result = 8108076510, time = 0.584467 s 25 | SSE4.1 (MPSADBW unrolled) ... reference result = 8108076510, time = 0.822993 s 26 | SSE4.2 (PCMPESTRM) ... reference result = 8108076510, time = 1.313485 s 27 | SSE (naive) ... reference result = 8108076510, time = 1.760697 s 28 | AVX2 (MPSADBW) ... reference result = 8108076510, time = 0.531827 s 29 | AVX2 (generic) ... reference result = 8108076510, time = 0.338912 s 30 | AVX2 (naive) ... reference result = 8108076510, time = 1.012637 s 31 | AVX2-wide (naive) ... reference result = 8107771150, time = 0.478455 s 32 | AVX512F (MPSADBW-like) ... reference result = 8108076510, time = 0.636537 s 33 | AVX512F (generic) ... reference result = 8108076510, time = 0.279054 s 34 | AVX512BW (generic) ... reference result = 8108076510, time = 0.255777 s 35 | ./speedup_avx512bw data/i386.txt data/words 36 | scalar (naive) ... reference result = 8108076510, time = 4.092489 s 37 | std::strstr ... reference result = 8108076510, time = 0.489993 s 38 | SWAR 64-bit (generic) ... reference result = 8108076510, time = 1.241418 s 39 | SWAR 32-bit (generic) ... reference result = 8108076510, time = 2.346954 s 40 | SSE2 (generic) ... reference result = 8108076510, time = 0.442109 s 41 | SSE4.1 (MPSADBW) ... reference result = 8108076510, time = 0.583955 s 42 | SSE4.1 (MPSADBW unrolled) ... reference result = 8108076510, time = 0.822657 s 43 | SSE4.2 (PCMPESTRM) ... reference result = 8108076510, time = 1.312243 s 44 | SSE (naive) ... 
reference result = 8108076510, time = 1.757719 s 45 | AVX2 (MPSADBW) ... reference result = 8108076510, time = 0.532528 s 46 | AVX2 (generic) ... reference result = 8108076510, time = 0.338666 s 47 | AVX2 (naive) ... reference result = 8108076510, time = 1.013151 s 48 | AVX2-wide (naive) ... reference result = 8107771150, time = 0.477202 s 49 | AVX512F (MPSADBW-like) ... reference result = 8108076510, time = 0.634753 s 50 | AVX512F (generic) ... reference result = 8108076510, time = 0.280525 s 51 | AVX512BW (generic) ... reference result = 8108076510, time = 0.256838 s 52 | -------------------------------------------------------------------------------- /results/haswell-i7-4770-gcc5.4.1-avx2.txt: -------------------------------------------------------------------------------- 1 | ./speedup_avx2 data/i386.txt data/words 2 | std::strstr ... reference result = 8108076510, time = 0.528137 s 3 | SWAR 64-bit (generic) ... reference result = 8108076510, time = 1.605520 s 4 | SSE2 (generic) ... reference result = 8108076510, time = 0.554532 s 5 | SSE4.1 (MPSADBW) ... reference result = 8108076510, time = 0.897859 s 6 | SSE4.1 (MPSADBW unrolled) ... reference result = 8108076510, time = 0.996473 s 7 | SSE4.2 (PCMPESTRM) ... reference result = 8108076510, time = 1.559956 s 8 | AVX2 (MPSADBW) ... reference result = 8108076510, time = 0.615836 s 9 | AVX2 (generic) ... reference result = 8108076510, time = 0.386747 s 10 | ./speedup_avx2 data/i386.txt data/words 11 | std::strstr ... reference result = 8108076510, time = 0.527864 s 12 | SWAR 64-bit (generic) ... reference result = 8108076510, time = 1.577149 s 13 | SSE2 (generic) ... reference result = 8108076510, time = 0.554352 s 14 | SSE4.1 (MPSADBW) ... reference result = 8108076510, time = 0.897752 s 15 | SSE4.1 (MPSADBW unrolled) ... reference result = 8108076510, time = 0.996771 s 16 | SSE4.2 (PCMPESTRM) ... reference result = 8108076510, time = 1.560012 s 17 | AVX2 (MPSADBW) ... 
reference result = 8108076510, time = 0.615825 s 18 | AVX2 (generic) ... reference result = 8108076510, time = 0.386528 s 19 | ./speedup_avx2 data/i386.txt data/words 20 | std::strstr ... reference result = 8108076510, time = 0.528205 s 21 | SWAR 64-bit (generic) ... reference result = 8108076510, time = 1.591732 s 22 | SSE2 (generic) ... reference result = 8108076510, time = 0.554423 s 23 | SSE4.1 (MPSADBW) ... reference result = 8108076510, time = 0.897921 s 24 | SSE4.1 (MPSADBW unrolled) ... reference result = 8108076510, time = 0.996889 s 25 | SSE4.2 (PCMPESTRM) ... reference result = 8108076510, time = 1.559919 s 26 | AVX2 (MPSADBW) ... reference result = 8108076510, time = 0.615783 s 27 | AVX2 (generic) ... reference result = 8108076510, time = 0.386609 s 28 | -------------------------------------------------------------------------------- /results/knights-landing-7210-gcc5.3.0-avx512f.txt: -------------------------------------------------------------------------------- 1 | ./speedup_avx512 data/i386.txt data/words 2 | std::strstr ... reference result = 8108076510, time = 4.964439 s 3 | SWAR 64-bit (generic) ... reference result = 8108076510, time = 8.205818 s 4 | SSE2 (generic) ... reference result = 8108076510, time = 6.126381 s 5 | SSE4.1 (MPSADBW) ... reference result = 8108076510, time = 18.737857 s 6 | SSE4.1 (MPSADBW unrolled) ... reference result = 8108076510, time = 13.745691 s 7 | SSE4.2 (PCMPESTRM) ... reference result = 8108076510, time = 6.306659 s 8 | AVX2 (MPSADBW) ... reference result = 8108076510, time = 13.179747 s 9 | AVX2 (generic) ... reference result = 8108076510, time = 4.113571 s 10 | AVX512F (MPSADBW-like) ... reference result = 8108076510, time = 2.348848 s 11 | AVX512F (generic) ... reference result = 8108076510, time = 1.164081 s 12 | ./speedup_avx512 data/i386.txt data/words 13 | std::strstr ... reference result = 8108076510, time = 4.946063 s 14 | SWAR 64-bit (generic) ... 
reference result = 8108076510, time = 8.172884 s 15 | SSE2 (generic) ... reference result = 8108076510, time = 6.107860 s 16 | SSE4.1 (MPSADBW) ... reference result = 8108076510, time = 18.717146 s 17 | SSE4.1 (MPSADBW unrolled) ... reference result = 8108076510, time = 13.724856 s 18 | SSE4.2 (PCMPESTRM) ... reference result = 8108076510, time = 6.288685 s 19 | AVX2 (MPSADBW) ... reference result = 8108076510, time = 13.151361 s 20 | AVX2 (generic) ... reference result = 8108076510, time = 4.094781 s 21 | AVX512F (MPSADBW-like) ... reference result = 8108076510, time = 2.327864 s 22 | AVX512F (generic) ... reference result = 8108076510, time = 1.142747 s 23 | ./speedup_avx512 data/i386.txt data/words 24 | std::strstr ... reference result = 8108076510, time = 4.949234 s 25 | SWAR 64-bit (generic) ... reference result = 8108076510, time = 8.170751 s 26 | SSE2 (generic) ... reference result = 8108076510, time = 6.109035 s 27 | SSE4.1 (MPSADBW) ... reference result = 8108076510, time = 18.716665 s 28 | SSE4.1 (MPSADBW unrolled) ... reference result = 8108076510, time = 13.727568 s 29 | SSE4.2 (PCMPESTRM) ... reference result = 8108076510, time = 6.289994 s 30 | AVX2 (MPSADBW) ... reference result = 8108076510, time = 13.153943 s 31 | AVX2 (generic) ... reference result = 8108076510, time = 4.094941 s 32 | AVX512F (MPSADBW-like) ... reference result = 8108076510, time = 2.326156 s 33 | AVX512F (generic) ... 
reference result = 8108076510, time = 1.140567 s 34 | -------------------------------------------------------------------------------- /results/postprocess.py: -------------------------------------------------------------------------------- 1 | from collections import OrderedDict 2 | 3 | def load(file): 4 | D = OrderedDict() 5 | for line in file: 6 | if 'reference result' not in line: 7 | continue 8 | 9 | name, tail = line.split('...') 10 | name = name.strip() 11 | time = float(tail.split()[6]) 12 | 13 | if name not in D: 14 | D[name] = time 15 | else: 16 | D[name] = min(time, D[name]) 17 | 18 | return D 19 | 20 | 21 | def main(): 22 | import sys 23 | paths = sys.argv[1:] 24 | for path in paths: 25 | if len(paths) > 1: 26 | print path 27 | 28 | with open(path, 'rt') as f: 29 | for name, time in load(f).iteritems(): 30 | print '%-30s %10.5f' % (name, time) 31 | 32 | 33 | if __name__ == '__main__': 34 | main() 35 | -------------------------------------------------------------------------------- /results/skylake-i7-6700-gcc5.4.1-avx2.txt: -------------------------------------------------------------------------------- 1 | ./speedup_avx2 data/i386.txt data/words 2 | std::strstr ... reference result = 8108076510, time = 0.662049 s 3 | SWAR 64-bit (generic) ... reference result = 8108076510, time = 1.404260 s 4 | SSE2 (generic) ... reference result = 8108076510, time = 0.489281 s 5 | SSE4.1 (MPSADBW) ... reference result = 8108076510, time = 0.638782 s 6 | SSE4.1 (MPSADBW unrolled) ... reference result = 8108076510, time = 0.879433 s 7 | SSE4.2 (PCMPESTRM) ... reference result = 8108076510, time = 1.390802 s 8 | AVX2 (MPSADBW) ... reference result = 8108076510, time = 0.570455 s 9 | AVX2 (generic) ... reference result = 8108076510, time = 0.363694 s 10 | ./speedup_avx2 data/i386.txt data/words 11 | std::strstr ... reference result = 8108076510, time = 0.662266 s 12 | SWAR 64-bit (generic) ... reference result = 8108076510, time = 1.404036 s 13 | SSE2 (generic) ... 
reference result = 8108076510, time = 0.489313 s 14 | SSE4.1 (MPSADBW) ... reference result = 8108076510, time = 0.638926 s 15 | SSE4.1 (MPSADBW unrolled) ... reference result = 8108076510, time = 0.879193 s 16 | SSE4.2 (PCMPESTRM) ... reference result = 8108076510, time = 1.390626 s 17 | AVX2 (MPSADBW) ... reference result = 8108076510, time = 0.569980 s 18 | AVX2 (generic) ... reference result = 8108076510, time = 0.363876 s 19 | ./speedup_avx2 data/i386.txt data/words 20 | std::strstr ... reference result = 8108076510, time = 0.661478 s 21 | SWAR 64-bit (generic) ... reference result = 8108076510, time = 1.405280 s 22 | SSE2 (generic) ... reference result = 8108076510, time = 0.488631 s 23 | SSE4.1 (MPSADBW) ... reference result = 8108076510, time = 0.638753 s 24 | SSE4.1 (MPSADBW unrolled) ... reference result = 8108076510, time = 0.879345 s 25 | SSE4.2 (PCMPESTRM) ... reference result = 8108076510, time = 1.390670 s 26 | AVX2 (MPSADBW) ... reference result = 8108076510, time = 0.569808 s 27 | AVX2 (generic) ... reference result = 8108076510, time = 0.363091 s 28 | -------------------------------------------------------------------------------- /results/skylake-i9-7900-gcc-5.4.1-avx512bw.txt: -------------------------------------------------------------------------------- 1 | ./speedup_avx512bw data/i386.txt data/words 2 | naive scalar ... reference result = 8108076510, time = 4.872957 s 3 | std::strstr ... reference result = 8108076510, time = 0.401080 s 4 | SWAR 64-bit (generic) ... reference result = 8108076510, time = 1.237922 s 5 | SWAR 32-bit (generic) ... reference result = 8108076510, time = 2.044511 s 6 | SSE2 (generic) ... reference result = 8108076510, time = 0.385573 s 7 | SSE4.1 (MPSADBW) ... reference result = 8108076510, time = 0.580510 s 8 | SSE4.1 (MPSADBW unrolled) ... reference result = 8108076510, time = 0.674341 s 9 | SSE4.2 (PCMPESTRM) ... reference result = 8108076510, time = 1.058753 s 10 | SSE (naive) ... 
reference result = 8108076510, time = 1.709206 s 11 | AVX2 (MPSADBW) ... reference result = 8108076510, time = 0.444774 s 12 | AVX2 (generic) ... reference result = 8108076510, time = 0.274761 s 13 | AVX2 (naive) ... reference result = 8108076510, time = 0.918683 s 14 | AVX2 (naive unrolled) ... reference result = 8108076510, time = 0.463246 s 15 | AVX2-wide (naive) ... reference result = 8107771150, time = 0.441233 s 16 | AVX512F (MPSADBW-like) ... reference result = 8108076510, time = 0.507046 s 17 | AVX512F (generic) ... reference result = 8108076510, time = 0.262774 s 18 | AVX512BW (generic) ... reference result = 8108076510, time = 0.220457 s 19 | ./speedup_avx512bw data/i386.txt data/words 20 | naive scalar ... reference result = 8108076510, time = 4.816247 s 21 | std::strstr ... reference result = 8108076510, time = 0.398468 s 22 | SWAR 64-bit (generic) ... reference result = 8108076510, time = 1.239442 s 23 | SWAR 32-bit (generic) ... reference result = 8108076510, time = 2.050195 s 24 | SSE2 (generic) ... reference result = 8108076510, time = 0.384561 s 25 | SSE4.1 (MPSADBW) ... reference result = 8108076510, time = 0.582862 s 26 | SSE4.1 (MPSADBW unrolled) ... reference result = 8108076510, time = 0.675480 s 27 | SSE4.2 (PCMPESTRM) ... reference result = 8108076510, time = 1.061598 s 28 | SSE (naive) ... reference result = 8108076510, time = 1.676643 s 29 | AVX2 (MPSADBW) ... reference result = 8108076510, time = 0.439711 s 30 | AVX2 (generic) ... reference result = 8108076510, time = 1.638515 s 31 | AVX2 (naive) ... reference result = 8108076510, time = 0.984768 s 32 | AVX2 (naive unrolled) ... reference result = 8108076510, time = 0.494318 s 33 | AVX2-wide (naive) ... reference result = 8107771150, time = 0.479306 s 34 | AVX512F (MPSADBW-like) ... reference result = 8108076510, time = 0.553042 s 35 | AVX512F (generic) ... reference result = 8108076510, time = 0.290909 s 36 | AVX512BW (generic) ... 
reference result = 8108076510, time = 0.237055 s 37 | ./speedup_avx512bw data/i386.txt data/words 38 | naive scalar ... reference result = 8108076510, time = 6.406914 s 39 | std::strstr ... reference result = 8108076510, time = 0.401352 s 40 | SWAR 64-bit (generic) ... reference result = 8108076510, time = 1.237499 s 41 | SWAR 32-bit (generic) ... reference result = 8108076510, time = 2.043457 s 42 | SSE2 (generic) ... reference result = 8108076510, time = 0.385167 s 43 | SSE4.1 (MPSADBW) ... reference result = 8108076510, time = 0.581361 s 44 | SSE4.1 (MPSADBW unrolled) ... reference result = 8108076510, time = 0.675044 s 45 | SSE4.2 (PCMPESTRM) ... reference result = 8108076510, time = 1.059933 s 46 | SSE (naive) ... reference result = 8108076510, time = 1.671910 s 47 | AVX2 (MPSADBW) ... reference result = 8108076510, time = 0.444940 s 48 | AVX2 (generic) ... reference result = 8108076510, time = 0.276522 s 49 | AVX2 (naive) ... reference result = 8108076510, time = 0.921444 s 50 | AVX2 (naive unrolled) ... reference result = 8108076510, time = 0.464818 s 51 | AVX2-wide (naive) ... reference result = 8107771150, time = 0.442211 s 52 | AVX512F (MPSADBW-like) ... reference result = 8108076510, time = 0.511326 s 53 | AVX512F (generic) ... reference result = 8108076510, time = 0.265488 s 54 | AVX512BW (generic) ... reference result = 8108076510, time = 0.221329 s 55 | -------------------------------------------------------------------------------- /results/westmere-m540-gcc6.2.0-sse4.txt: -------------------------------------------------------------------------------- 1 | ./speedup data/i386.txt data/words 2 | std::strstr ... reference result = 8108076510, time = 0.832291 s 3 | SWAR 64-bit (generic) ... reference result = 8108076510, time = 2.498591 s 4 | SSE2 (generic) ... reference result = 8108076510, time = 0.745890 s 5 | SSE4.1 (MPSADBW) ... reference result = 8108076510, time = 1.450405 s 6 | SSE4.1 (MPSADBW unrolled) ... 
reference result = 8108076510, time = 1.238676 s 7 | SSE4.2 (PCMPESTRM) ... reference result = 8108076510, time = 1.699681 s 8 | ./speedup data/i386.txt data/words 9 | std::strstr ... reference result = 8108076510, time = 0.822457 s 10 | SWAR 64-bit (generic) ... reference result = 8108076510, time = 2.518604 s 11 | SSE2 (generic) ... reference result = 8108076510, time = 0.750936 s 12 | SSE4.1 (MPSADBW) ... reference result = 8108076510, time = 1.470000 s 13 | SSE4.1 (MPSADBW unrolled) ... reference result = 8108076510, time = 1.239929 s 14 | SSE4.2 (PCMPESTRM) ... reference result = 8108076510, time = 1.707411 s 15 | ./speedup data/i386.txt data/words 16 | std::strstr ... reference result = 8108076510, time = 0.827280 s 17 | SWAR 64-bit (generic) ... reference result = 8108076510, time = 2.535406 s 18 | SSE2 (generic) ... reference result = 8108076510, time = 0.747252 s 19 | SSE4.1 (MPSADBW) ... reference result = 8108076510, time = 1.456153 s 20 | SSE4.1 (MPSADBW unrolled) ... reference result = 8108076510, time = 1.238485 s 21 | SSE4.2 (PCMPESTRM) ... reference result = 8108076510, time = 1.711734 s 22 | -------------------------------------------------------------------------------- /scalar.cpp: -------------------------------------------------------------------------------- 1 | // Implementation by Daniel Lemire 2 | // https://github.com/WojciechMula/sse4-strstr/issues/2 3 | 4 | size_t strstr_naive(const char * hay, size_t size, const char *needle, size_t needlesize) { 5 | 6 | if (size == needlesize) { 7 | return memcmp(hay, needle, size) == 0 ? 
0 : std::string::npos; 8 | } 9 | 10 | const char first = needle[0]; 11 | const ssize_t maxpos = ssize_t(size) - ssize_t(needlesize) + 1; 12 | for(ssize_t i = 0; i < maxpos; i++) { 13 | if(hay[i] != first) { 14 | i++; 15 | while( i < maxpos && hay[i] != first ) i++; 16 | if ( i == maxpos ) break; 17 | } 18 | size_t j = 1; 19 | for( ; j < needlesize; ++j) 20 | if(hay[ i + j ] != needle[ j ] ) break; 21 | if( j == needlesize) return i; 22 | } 23 | return std::string::npos; 24 | } 25 | -------------------------------------------------------------------------------- /src/all.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "common.h" 4 | #include 5 | #include 6 | #include "fixed-memcmp.cpp" 7 | #include "scalar.cpp" 8 | #include "swar64-strstr-v2.cpp" 9 | #include "swar32-strstr-v2.cpp" 10 | #ifdef HAVE_SSE_INSTRUCTIONS 11 | # include 12 | # include "sse4-strstr.cpp" 13 | # include "sse4-strstr-unrolled.cpp" 14 | # include "sse4.2-strstr.cpp" 15 | # include "sse2-strstr.cpp" 16 | # include "sse-naive-strstr.cpp" 17 | # include "sse2-needle4.cpp" 18 | #endif 19 | #ifdef HAVE_AVX2_INSTRUCTIONS 20 | # include 21 | # include "avx2-strstr.cpp" 22 | # include "avx2-strstr-v2.cpp" 23 | # include "avx2-naive-strstr.cpp" 24 | # include "avx2-naive-strstr64.cpp" 25 | # include "avx2-naive-unrolled-strstr.cpp" 26 | #endif 27 | #ifdef HAVE_AVX512F_INSTRUCTIONS 28 | # include "avx512f-strstr.cpp" 29 | # include "avx512f-strstr-v2.cpp" 30 | #endif 31 | #ifdef HAVE_AVX512BW_INSTRUCTIONS 32 | # include "avx512bw-strstr-v2.cpp" 33 | # include "avx512bw-strstr-v3.cpp" 34 | #endif 35 | #ifdef HAVE_NEON_INSTRUCTIONS 36 | # include 37 | # include "neon-strstr-v2.cpp" 38 | #endif 39 | #ifdef HAVE_AARCH64_ARCHITECTURE 40 | # include "aarch64-strstr-v2.cpp" 41 | #endif 42 | 43 | -------------------------------------------------------------------------------- /src/all_procedures.cpp: 
-------------------------------------------------------------------------------- 1 | #include "all.h" 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | using str_find_fun = size_t (*)(const char*, size_t, const char*, size_t); 9 | 10 | struct Procedures { 11 | 12 | struct Item { 13 | str_find_fun proc; 14 | std::string name; 15 | char code; 16 | bool builtin; 17 | 18 | Item(str_find_fun proc_, const char* name_, char code_, bool builtin_ = false) 19 | : proc(proc_) 20 | , name(name_) 21 | , code(code_) 22 | , builtin(builtin_) {} 23 | }; 24 | 25 | std::vector procedures; 26 | 27 | const Item& operator[](char code) { 28 | auto pred = [code](const Item& item){return item.code == code;}; 29 | auto it = std::find_if(procedures.begin(), procedures.end(), pred); 30 | 31 | if (it == procedures.end()) { 32 | throw std::logic_error("can't find procedure with code '" + std::string(1, code) + "'"); 33 | } 34 | 35 | return *it; 36 | } 37 | }; 38 | 39 | size_t strstr_libc(const char* s, size_t, const char* needle, size_t) { 40 | const char* ptr = strstr(s, needle); 41 | if (ptr) { 42 | return ptr - s; 43 | } else { 44 | return std::string::npos; 45 | } 46 | } 47 | 48 | Procedures all_procedures() { 49 | 50 | Procedures db; 51 | 52 | db.procedures.emplace_back( 53 | strstr_naive, 54 | "scalar (naive)", 55 | 'a' 56 | ); 57 | 58 | db.procedures.emplace_back( 59 | strstr_libc, 60 | "std::strstr", 61 | 'b', 62 | true 63 | ); 64 | 65 | db.procedures.emplace_back( 66 | nullptr, 67 | "std::string::find", 68 | 'c', 69 | true 70 | ); 71 | 72 | #define REGISTER(code, name, procedure) \ 73 | { \ 74 | str_find_fun f = procedure; \ 75 | db.procedures.emplace_back(f, name, code); \ 76 | } 77 | 78 | REGISTER('d', "SWAR 64-bit (generic)", swar64_strstr_v2); 79 | REGISTER('e', "SWAR 32-bit (generic)", swar32_strstr_v2); 80 | 81 | #ifdef HAVE_SSE_INSTRUCTIONS 82 | REGISTER('f', "SSE2 (generic)", sse2_strstr_v2); 83 | REGISTER('g', "SSE4.1 (MPSADBW)", sse4_strstr); 84 | REGISTER('h', 
"SSE4.1 (MPSADBW unrolled)", sse4_strstr_unrolled); 85 | REGISTER('i', "SSE4.2 (PCMPESTRM)", sse42_strstr); 86 | REGISTER('j', "SSE (naive)", sse_naive_strstr); 87 | REGISTER('v', "SSE2 (4-byte needle)", sse2_strstr_needle4); 88 | REGISTER('w', "SSE2 (4-byte needle v2)", sse2_strstr_needle4_v2); 89 | #endif 90 | #ifdef HAVE_AVX2_INSTRUCTIONS 91 | REGISTER('k', "AVX2 (MPSADBW)", avx2_strstr); 92 | REGISTER('l', "AVX2 (generic)", avx2_strstr_v2); 93 | REGISTER('m', "AVX2 (naive)", avx2_naive_strstr); 94 | REGISTER('n', "AVX2 (naive unrolled)", avx2_naive_unrolled_strstr); 95 | REGISTER('o', "AVX2-wide (naive)", avx2_naive_strstr64); 96 | #endif 97 | 98 | #ifdef HAVE_AVX512F_INSTRUCTIONS 99 | REGISTER('p', "AVX512F (MPSADBW-like)", avx512f_strstr); 100 | REGISTER('q', "AVX512F (generic)", avx512f_strstr_v2); 101 | #endif 102 | 103 | #ifdef HAVE_AVX512BW_INSTRUCTIONS 104 | REGISTER('r', "AVX512BW (generic)", avx512bw_strstr_v2); 105 | REGISTER('s', "AVX512BW (masked)", avx512bw_strstr_v3); 106 | #endif 107 | 108 | #ifdef HAVE_NEON_INSTRUCTIONS 109 | REGISTER('t', "ARM Neon 32 bit (v2)", neon_strstr_v2); 110 | #endif 111 | 112 | #ifdef HAVE_AARCH64_ARCHITECTURE 113 | REGISTER('u', "AArch64 64 bit (v2)", aarch64_strstr_v2); 114 | #endif 115 | 116 | #undef REGISTER 117 | return db; 118 | } 119 | -------------------------------------------------------------------------------- /src/application_base.cpp: -------------------------------------------------------------------------------- 1 | class ApplicationBase { 2 | 3 | protected: 4 | std::string file; 5 | std::vector words; 6 | 7 | public: 8 | class Error final { 9 | public: 10 | const std::string message; 11 | 12 | public: 13 | Error(const std::string& msg) : message(msg) {} 14 | }; 15 | 16 | public: 17 | void prepare(const std::string& file_name, const std::string& words_name) { 18 | 19 | load_text(file_name); 20 | load_words(words_name); 21 | } 22 | 23 | private: 24 | void load_text(const std::string& path) { 25 | 26 | 
FILE* f = fopen(path.c_str(), "rt"); 27 | if (f == nullptr) { 28 | throw_errno(path); 29 | } 30 | 31 | fseek(f, -1, SEEK_END); 32 | const auto size = ftell(f); 33 | 34 | rewind(f); 35 | 36 | char* buffer = new char[size]; 37 | fread(buffer, size, 1, f); 38 | buffer[size] = 0; 39 | fclose(f); 40 | 41 | file = buffer; 42 | 43 | delete[] buffer; 44 | } 45 | 46 | 47 | void load_words(const std::string& path) { 48 | 49 | char buffer[1024]; 50 | 51 | FILE* f = fopen(path.c_str(), "rt"); 52 | if (f == nullptr) { 53 | throw_errno(path); 54 | } 55 | 56 | while (!feof(f)) { 57 | fgets(buffer, sizeof(buffer), f); 58 | 59 | const auto len = strlen(buffer); 60 | if (buffer[len - 1] == '\n') { 61 | buffer[len - 1] = 0; 62 | if (len == 1) // skip empty strings 63 | continue; 64 | } 65 | 66 | words.push_back(buffer); 67 | } 68 | 69 | fclose(f); 70 | } 71 | 72 | 73 | void throw_errno(const std::string& prefix) { 74 | 75 | const std::string msg = prefix + ": " + std::string(strerror(errno)); 76 | throw Error(msg); 77 | } 78 | }; 79 | 80 | -------------------------------------------------------------------------------- /src/benchmark.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | #include "all_procedures.cpp" 9 | 10 | // ------------------------------------------------------------------------ 11 | 12 | #include 13 | #include "benchmark.h" 14 | #include "application_base.cpp" 15 | 16 | class Application final: public ApplicationBase { 17 | 18 | Procedures db; 19 | 20 | public: 21 | enum class TestType { 22 | OptimisticCase, 23 | Random, 24 | WorstCase 25 | }; 26 | 27 | struct Parameters { 28 | size_t needle_position; 29 | size_t needle_size; 30 | size_t count; 31 | TestType test_type; 32 | std::string procedure_codes; 33 | }; 34 | 35 | public: 36 | Application(const Parameters& params) 37 | : db(all_procedures()) 38 | , parameters(params) { 39 | 40 | prepare(); 
41 | } 42 | 43 | bool operator()() { 44 | 45 | // strstr is treated as built-in function by GCC 46 | // it seems it's wiped out in benchmark 47 | const bool measure_stdstring = false; 48 | 49 | #if defined(HAVE_NEON_INSTRUCTIONS) && !defined(HAVE_AARCH64_ARCHITECTURE) 50 | // On Raspberry Pi it's terribly slow, but on Aarch64 51 | // the 64-bit procedure is pretty fast 52 | const bool measure_swar64 = false; 53 | #else 54 | const bool measure_swar64 = true; 55 | #endif 56 | 57 | if (is_enabled('a')) { 58 | 59 | auto find = [](const std::string& s, const std::string& neddle) -> size_t { 60 | 61 | return strstr_naive(s.data(), s.size(), neddle.data(), neddle.size()); 62 | }; 63 | 64 | measure(find, 'a'); 65 | } 66 | 67 | if (is_enabled('b')) { 68 | 69 | auto find = [](const std::string& s, const std::string& neddle) -> size_t { 70 | const char* res = strstr(s.data(), neddle.data()); 71 | 72 | if (res != nullptr) { 73 | return res - s.data(); 74 | } else { 75 | return std::string::npos; 76 | } 77 | }; 78 | 79 | measure(find, 'b'); 80 | } 81 | 82 | if (measure_stdstring && is_enabled('c')) { 83 | 84 | auto find = [](const std::string& s, const std::string& neddle) -> size_t { 85 | 86 | return s.find(neddle); 87 | }; 88 | 89 | measure(find, 'c'); 90 | } 91 | 92 | if (measure_swar64 && is_enabled('d')) { 93 | 94 | auto find = [](const std::string& s, const std::string& neddle) -> size_t { 95 | 96 | return swar64_strstr_v2(s, neddle); 97 | }; 98 | 99 | measure(find, 'd'); 100 | } 101 | 102 | if (is_enabled('e')) { 103 | 104 | auto find = [](const std::string& s, const std::string& neddle) -> size_t { 105 | 106 | return swar32_strstr_v2(s, neddle); 107 | }; 108 | 109 | measure(find, 'e'); 110 | } 111 | 112 | #ifdef HAVE_SSE_INSTRUCTIONS 113 | if (is_enabled('f')) { 114 | 115 | auto find = [](const std::string& s, const std::string& neddle) -> size_t { 116 | 117 | return sse2_strstr_v2(s, neddle); 118 | }; 119 | 120 | measure(find, 'f'); 121 | } 122 | 123 | if 
(is_enabled('g')) { 124 | 125 | auto find = [](const std::string& s, const std::string& neddle) -> size_t { 126 | 127 | return sse4_strstr(s, neddle); 128 | }; 129 | 130 | measure(find, 'g'); 131 | } 132 | 133 | if (is_enabled('h')) { 134 | 135 | auto find = [](const std::string& s, const std::string& neddle) -> size_t { 136 | 137 | return sse4_strstr_unrolled(s, neddle); 138 | }; 139 | 140 | measure(find, 'h'); 141 | } 142 | 143 | if (is_enabled('i')) { 144 | 145 | auto find = [](const std::string& s, const std::string& neddle) -> size_t { 146 | 147 | return sse42_strstr(s, neddle); 148 | }; 149 | 150 | measure(find, 'i'); 151 | } 152 | 153 | if (is_enabled('j')) { 154 | 155 | auto find = [](const std::string& s, const std::string& neddle) -> size_t { 156 | 157 | return sse_naive_strstr(s, neddle); 158 | }; 159 | 160 | measure(find, 'j'); 161 | } 162 | 163 | if (is_enabled('v')) { 164 | 165 | auto find = [](const std::string& s, const std::string& neddle) -> size_t { 166 | 167 | return sse2_strstr_needle4(s, neddle); 168 | }; 169 | 170 | measure(find, 'v'); 171 | } 172 | 173 | if (is_enabled('w')) { 174 | 175 | auto find = [](const std::string& s, const std::string& neddle) -> size_t { 176 | 177 | return sse2_strstr_needle4_v2(s, neddle); 178 | }; 179 | 180 | measure(find, 'w'); 181 | } 182 | #endif 183 | 184 | #ifdef HAVE_AVX2_INSTRUCTIONS 185 | if (is_enabled('k')) { 186 | 187 | auto find = [](const std::string& s, const std::string& neddle) -> size_t { 188 | 189 | return avx2_strstr(s, neddle); 190 | }; 191 | 192 | measure(find, 'k'); 193 | } 194 | 195 | if (is_enabled('l')) { 196 | 197 | auto find = [](const std::string& s, const std::string& neddle) -> size_t { 198 | 199 | return avx2_strstr_v2(s, neddle); 200 | }; 201 | 202 | measure(find, 'l'); 203 | } 204 | 205 | if (is_enabled('m')) { 206 | 207 | auto find = [](const std::string& s, const std::string& neddle) -> size_t { 208 | 209 | return avx2_naive_strstr(s, neddle); 210 | }; 211 | 212 | measure(find, 
'm'); 213 | } 214 | 215 | if (is_enabled('n')) { 216 | 217 | auto find = [](const std::string& s, const std::string& neddle) -> size_t { 218 | 219 | return avx2_naive_unrolled_strstr(s, neddle); 220 | }; 221 | 222 | measure(find, 'n'); 223 | } 224 | 225 | if (is_enabled('o')) { 226 | 227 | auto find = [](const std::string& s, const std::string& neddle) -> size_t { 228 | 229 | return avx2_naive_strstr64(s, neddle); 230 | }; 231 | 232 | measure(find, 'o'); 233 | } 234 | #endif 235 | 236 | #ifdef HAVE_AVX512F_INSTRUCTIONS 237 | if (is_enabled('p')) { 238 | 239 | auto find = [](const std::string& s, const std::string& neddle) -> size_t { 240 | 241 | return avx512f_strstr(s, neddle); 242 | }; 243 | 244 | measure(find, 'p'); 245 | } 246 | 247 | if (is_enabled('q')) { 248 | 249 | auto find = [](const std::string& s, const std::string& neddle) -> size_t { 250 | 251 | return avx512f_strstr_v2(s, neddle); 252 | }; 253 | 254 | measure(find, 'q'); 255 | } 256 | #endif 257 | 258 | #ifdef HAVE_AVX512BW_INSTRUCTIONS 259 | if (is_enabled('r')) { 260 | 261 | auto find = [](const std::string& s, const std::string& neddle) -> size_t { 262 | 263 | return avx512bw_strstr_v2(s, neddle); 264 | }; 265 | 266 | measure(find, 'r'); 267 | } 268 | 269 | if (is_enabled('u')) { 270 | 271 | auto find = [](const std::string& s, const std::string& neddle) -> size_t { 272 | 273 | return avx512bw_strstr_v3(s, neddle); 274 | }; 275 | 276 | measure(find, 'u'); 277 | } 278 | #endif 279 | 280 | #ifdef HAVE_NEON_INSTRUCTIONS 281 | if (is_enabled('s')) { 282 | 283 | auto find = [](const std::string& s, const std::string& neddle) -> size_t { 284 | 285 | return neon_strstr_v2(s, neddle); 286 | }; 287 | 288 | measure(find, 's'); 289 | } 290 | #endif 291 | 292 | #ifdef HAVE_AARCH64_ARCHITECTURE 293 | if (is_enabled('t')) { 294 | 295 | auto find = [](const std::string& s, const std::string& neddle) -> size_t { 296 | 297 | return aarch64_strstr_v2(s, neddle); 298 | }; 299 | 300 | measure(find, 't'); 301 | } 302 
| #endif 303 | 304 | return true; 305 | } 306 | 307 | 308 | static void print_help(const char* progname) { 309 | std::printf("%s needle-position needle-size iteration-count test-name [procedures]\n", progname); 310 | std::puts(""); 311 | std::puts("Parameters:"); 312 | std::puts(""); 313 | std::puts(" needle-position position of the needle"); 314 | std::puts(" needle-size length of the needle"); 315 | std::puts(" count how many times test is repeated"); 316 | std::puts(" test-name one of 'optimistic', 'random', 'worst'"); 317 | std::puts(" procedures procedure code(s), listed below [by default all will be tested]"); 318 | std::puts(""); 319 | std::puts("Test kinds"); 320 | std::puts(""); 321 | std::puts(" optimistic data before needle contains characters don't present in the needle"); 322 | std::puts(" random data before needle contains some random characters"); 323 | std::puts(" worst needle has form 'aaa...aaaXaaa...aaa', and data before is filled with the 'a'"); 324 | std::puts(""); 325 | std::puts("Following procedures are available:"); 326 | for (auto& item: all_procedures().procedures) { 327 | printf(" [%c] %s\n", item.code, item.name.c_str()); 328 | } 329 | } 330 | 331 | 332 | private: 333 | volatile size_t sink; 334 | 335 | template 336 | void measure(T_FIND find, char code) { 337 | 338 | BEST_TIME(/**/, 339 | sink = find(input, needle), 340 | db[code].name.c_str(), 341 | parameters.count, 342 | parameters.needle_position); 343 | } 344 | 345 | 346 | bool is_enabled(char proc) const { 347 | return (parameters.procedure_codes.empty()) 348 | || (parameters.procedure_codes.find(proc) != std::string::npos); 349 | } 350 | 351 | void prepare_needle() { 352 | 353 | needle.append(parameters.needle_size/2, 'a'); 354 | needle.append(1, 'X'); 355 | needle.append(parameters.needle_size - needle.size(), 'a'); 356 | } 357 | 358 | void prepare_input() { 359 | 360 | const size_t padding = 256; 361 | 362 | switch (parameters.test_type) { 363 | case TestType::OptimisticCase: 
364 | input.assign(parameters.needle_position, '_'); 365 | break; 366 | 367 | case TestType::WorstCase: 368 | input.assign(parameters.needle_position, 'a'); 369 | break; 370 | 371 | case TestType::Random: 372 | for (size_t i=0; i < parameters.needle_position; i++) { 373 | const char c = rand() % ('z' - 'a' + 1) + 'a'; 374 | input.push_back(c); 375 | } 376 | break; 377 | } 378 | 379 | input += needle; 380 | input.append(padding, '_'); // to make sure that memory after the needle is accessible 381 | } 382 | 383 | void prepare() { 384 | prepare_needle(); 385 | prepare_input(); 386 | } 387 | 388 | std::string needle; 389 | std::string input; 390 | Parameters parameters; 391 | }; 392 | 393 | 394 | bool parse(int argc, char* argv[], Application::Parameters& p) { 395 | 396 | if (argc < 5) { 397 | return false; 398 | } 399 | 400 | for (int i=1; i < argc; i++) { 401 | const std::string tmp = argv[i]; 402 | if (tmp == "-h" || tmp == "--help") 403 | return false; 404 | } 405 | 406 | p.needle_position = atoi(argv[1]); 407 | p.needle_size = atoi(argv[2]); 408 | p.count = atoi(argv[3]); 409 | 410 | if (p.needle_size < 3) { 411 | throw std::runtime_error("needle size must be greater than 2"); 412 | } 413 | 414 | if (p.count == 0) { 415 | throw std::runtime_error("count must be greater than 0"); 416 | } 417 | 418 | std::string tmp(argv[4]); 419 | if (tmp == "optimistic") { 420 | p.test_type = Application::TestType::OptimisticCase; 421 | } else if (tmp == "worst") { 422 | p.test_type = Application::TestType::WorstCase; 423 | } else if (tmp == "random") { 424 | p.test_type = Application::TestType::Random; 425 | } else { 426 | throw std::runtime_error("expected 'optimistic', 'worst' or 'random', got '" + tmp + "'"); 427 | } 428 | 429 | if (argc >= 6) { 430 | p.procedure_codes = argv[5]; 431 | } 432 | 433 | return true; 434 | } 435 | 436 | 437 | int main(int argc, char* argv[]) { 438 | 439 | try { 440 | 441 | Application::Parameters params; 442 | if (!parse(argc, argv, params)) { 443 
#ifndef _BENCHMARK_H_
#define _BENCHMARK_H_

#include <stdint.h>
#include <stdio.h>

/*
 * Cycle-accurate micro-benchmark helpers built on the x86-64 time stamp
 * counter.  The inline assembly below is only assembled where a macro is
 * expanded, and it names x86-64 registers.
 */

/* Reads the time stamp counter into `cycles`; CPUID is issued first. */
#define RDTSC_START(cycles)                                             \
    do {                                                                \
        uint32_t cyc_high, cyc_low;                                     \
        __asm volatile("cpuid\n"                                        \
                       "rdtsc\n"                                        \
                       "mov %%edx, %0\n"                                \
                       "mov %%eax, %1" :                                \
                       "=r" (cyc_high),                                 \
                       "=r"(cyc_low) :                                  \
                       : /* no read only */                             \
                       "%rax", "%rbx", "%rcx", "%rdx" /* clobbers */    \
                       );                                               \
        (cycles) = ((uint64_t)cyc_high << 32) | cyc_low;                \
    } while (0)

/* Reads the time stamp counter into `cycles` with RDTSCP, then issues CPUID. */
#define RDTSC_STOP(cycles)                                              \
    do {                                                                \
        uint32_t cyc_high, cyc_low;                                     \
        __asm volatile("rdtscp\n"                                       \
                       "mov %%edx, %0\n"                                \
                       "mov %%eax, %1\n"                                \
                       "cpuid" :                                        \
                       "=r"(cyc_high),                                  \
                       "=r"(cyc_low) :                                  \
                       /* no read only registers */ :                   \
                       "%rax", "%rbx", "%rcx", "%rdx" /* clobbers */    \
                       );                                               \
        (cycles) = ((uint64_t)cyc_high << 32) | cyc_low;                \
    } while (0)

/* Trivial non-inlined call timed to calibrate the measurement overhead. */
static __attribute__ ((noinline))
uint64_t rdtsc_overhead_func(uint64_t dummy) {
    return dummy;
}

/* Calibrated overhead in cycles; UINT64_MAX means "not calibrated yet". */
uint64_t global_rdtsc_overhead = (uint64_t) UINT64_MAX;

/*
 * Returns (stop - start - overhead), saturated at zero.
 *
 * A sample may come out shorter than the calibrated overhead; with plain
 * unsigned subtraction that wraps to a value near 2^64 and ruins the
 * accumulated average.
 */
static inline uint64_t rdtsc_elapsed(uint64_t start, uint64_t stop, uint64_t overhead) {
    const uint64_t raw = stop - start;
    return (raw > overhead) ? (raw - overhead) : 0;
}

/*
 * Times `test` `repeat` times and stores the smallest observed cycle count
 * in global_rdtsc_overhead.
 */
#define RDTSC_SET_OVERHEAD(test, repeat)                                        \
    do {                                                                        \
        uint64_t cycles_start, cycles_final, cycles_diff;                       \
        uint64_t min_diff = UINT64_MAX;                                         \
        for (size_t rdtsc_i = 0; rdtsc_i < (size_t)(repeat); rdtsc_i++) {       \
            __asm volatile("" ::: /* pretend to clobber */ "memory");           \
            RDTSC_START(cycles_start);                                          \
            test;                                                               \
            RDTSC_STOP(cycles_final);                                           \
            cycles_diff = (cycles_final - cycles_start);                        \
            if (cycles_diff < min_diff) min_diff = cycles_diff;                 \
        }                                                                       \
        global_rdtsc_overhead = min_diff;                                       \
        printf("rdtsc_overhead set to %llu\n",                                  \
               (unsigned long long)global_rdtsc_overhead);                      \
    } while (0)


/*
 * Prints the best and the average number of cycles per operation.
 *
 *   pre       - statement executed before every timed run (may be empty)
 *   test      - the statement being timed
 *   test_name - label printed in front of the result
 *   repeat    - how many times `test` is run
 *   size      - number of operations one run of `test` represents; the cycle
 *               counts are divided by it
 *
 * On first use the measurement overhead is calibrated; it is then subtracted
 * from every sample (saturating at zero).
 */
#define BEST_TIME(pre, test, test_name, repeat, size)                           \
    do {                                                                        \
        if (global_rdtsc_overhead == UINT64_MAX) {                              \
            RDTSC_SET_OVERHEAD(rdtsc_overhead_func(1), repeat);                 \
        }                                                                       \
        printf("%-30s\t: ", test_name); fflush(stdout);                         \
        uint64_t cycles_start, cycles_final, cycles_diff;                       \
        uint64_t min_diff = (uint64_t)-1;                                       \
        uint64_t sum_diff = 0;                                                  \
        for (size_t i = 0; i < (size_t)(repeat); i++) {                         \
            pre;                                                                \
            __asm volatile("" ::: /* pretend to clobber */ "memory");           \
            RDTSC_START(cycles_start);                                          \
            test;                                                               \
            RDTSC_STOP(cycles_final);                                           \
            cycles_diff = rdtsc_elapsed(cycles_start, cycles_final,             \
                                        global_rdtsc_overhead);                 \
            if (cycles_diff < min_diff) min_diff = cycles_diff;                 \
            sum_diff += cycles_diff;                                            \
        }                                                                       \
        uint64_t S = (size);                                                    \
        float cycle_per_op = (min_diff) / (double)S;                            \
        float avg_cycle_per_op = (sum_diff) / ((double)S * (repeat));           \
        printf(" %8.3f cycle/op (best) %8.3f cycle/op (avg)\n", cycle_per_op, avg_cycle_per_op); \
    } while (0)

#endif
#include 8 | 9 | #include "all_procedures.cpp" 10 | 11 | // ------------------------------------------------------------------------ 12 | 13 | #include 14 | #include "application_base.cpp" 15 | 16 | class Application final: public ApplicationBase { 17 | 18 | Procedures db; 19 | std::size_t count; 20 | const std::string procedure_codes; 21 | 22 | public: 23 | struct Parameters { 24 | std::string file_name; 25 | std::string words_name; 26 | size_t count = 10; 27 | std::string procedure_codes; 28 | }; 29 | 30 | public: 31 | Application(const Parameters& params) 32 | : db(all_procedures()) 33 | , count(params.count) 34 | , procedure_codes(params.procedure_codes) { 35 | 36 | prepare(params.file_name, params.words_name); 37 | } 38 | 39 | bool operator()() { 40 | 41 | #if defined(__GNUC__) && !defined(HAVE_NEON_INSTRUCTIONS) 42 | // GNU std::string::find was proven to be utterly slow, 43 | // don't waste our time on reconfirming that fact. 44 | // 45 | // (On Raspberry Pi it's fast, though) 46 | const bool measure_stdstring = false; 47 | #else 48 | const bool measure_stdstring = true; 49 | #endif 50 | #if defined(HAVE_NEON_INSTRUCTIONS) && !defined(HAVE_AARCH64_ARCHITECTURE) 51 | // On Raspberry Pi it's terribly slow, but on Aarch64 52 | // the 64-bit procedure is pretty fast 53 | const bool measure_swar64 = false; 54 | #else 55 | const bool measure_swar64 = true; 56 | #endif 57 | 58 | if (is_enabled('a')) { 59 | 60 | auto find = [](const std::string& s, const std::string& neddle) -> size_t { 61 | 62 | return strstr_naive(s.data(), s.size(), neddle.data(), neddle.size()); 63 | }; 64 | 65 | measure(find, 'a'); 66 | } 67 | 68 | if (is_enabled('b')) { 69 | 70 | auto find = [](const std::string& s, const std::string& neddle) -> size_t { 71 | const char* res = strstr(s.data(), neddle.data()); 72 | 73 | if (res != nullptr) { 74 | return res - s.data(); 75 | } else { 76 | return std::string::npos; 77 | } 78 | }; 79 | 80 | measure(find, 'b'); 81 | } 82 | 83 | if 
(measure_stdstring && is_enabled('c')) { 84 | 85 | auto find = [](const std::string& s, const std::string& neddle) -> size_t { 86 | 87 | return s.find(neddle); 88 | }; 89 | 90 | measure(find, 'c'); 91 | } 92 | 93 | if (measure_swar64 && is_enabled('d')) { 94 | 95 | auto find = [](const std::string& s, const std::string& neddle) -> size_t { 96 | 97 | return swar64_strstr_v2(s, neddle); 98 | }; 99 | 100 | measure(find, 'd'); 101 | } 102 | 103 | if (is_enabled('e')) { 104 | 105 | auto find = [](const std::string& s, const std::string& neddle) -> size_t { 106 | 107 | return swar32_strstr_v2(s, neddle); 108 | }; 109 | 110 | measure(find, 'e'); 111 | } 112 | 113 | #ifdef HAVE_SSE_INSTRUCTIONS 114 | if (is_enabled('f')) { 115 | 116 | auto find = [](const std::string& s, const std::string& neddle) -> size_t { 117 | 118 | return sse2_strstr_v2(s, neddle); 119 | }; 120 | 121 | measure(find, 'f'); 122 | } 123 | 124 | if (is_enabled('g')) { 125 | 126 | auto find = [](const std::string& s, const std::string& neddle) -> size_t { 127 | 128 | return sse4_strstr(s, neddle); 129 | }; 130 | 131 | measure(find, 'g'); 132 | } 133 | 134 | if (is_enabled('h')) { 135 | 136 | auto find = [](const std::string& s, const std::string& neddle) -> size_t { 137 | 138 | return sse4_strstr_unrolled(s, neddle); 139 | }; 140 | 141 | measure(find, 'h'); 142 | } 143 | 144 | if (is_enabled('i')) { 145 | 146 | auto find = [](const std::string& s, const std::string& neddle) -> size_t { 147 | 148 | return sse42_strstr(s, neddle); 149 | }; 150 | 151 | measure(find, 'i'); 152 | } 153 | 154 | if (is_enabled('j')) { 155 | 156 | auto find = [](const std::string& s, const std::string& neddle) -> size_t { 157 | 158 | return sse_naive_strstr(s, neddle); 159 | }; 160 | 161 | measure(find, 'j'); 162 | } 163 | #endif 164 | 165 | #ifdef HAVE_AVX2_INSTRUCTIONS 166 | if (is_enabled('k')) { 167 | 168 | auto find = [](const std::string& s, const std::string& neddle) -> size_t { 169 | 170 | return avx2_strstr(s, neddle); 
171 | }; 172 | 173 | measure(find, 'k'); 174 | } 175 | 176 | if (is_enabled('l')) { 177 | 178 | auto find = [](const std::string& s, const std::string& neddle) -> size_t { 179 | 180 | return avx2_strstr_v2(s, neddle); 181 | }; 182 | 183 | measure(find, 'l'); 184 | } 185 | 186 | if (is_enabled('m')) { 187 | 188 | auto find = [](const std::string& s, const std::string& neddle) -> size_t { 189 | 190 | return avx2_naive_strstr(s, neddle); 191 | }; 192 | 193 | measure(find, 'm'); 194 | } 195 | 196 | if (is_enabled('n')) { 197 | 198 | auto find = [](const std::string& s, const std::string& neddle) -> size_t { 199 | 200 | return avx2_naive_unrolled_strstr(s, neddle); 201 | }; 202 | 203 | measure(find, 'n'); 204 | } 205 | 206 | if (is_enabled('o')) { 207 | 208 | auto find = [](const std::string& s, const std::string& neddle) -> size_t { 209 | 210 | return avx2_naive_strstr64(s, neddle); 211 | }; 212 | 213 | measure(find, 'o'); 214 | } 215 | #endif 216 | 217 | #ifdef HAVE_AVX512F_INSTRUCTIONS 218 | if (is_enabled('p')) { 219 | 220 | auto find = [](const std::string& s, const std::string& neddle) -> size_t { 221 | 222 | return avx512f_strstr(s, neddle); 223 | }; 224 | 225 | measure(find, 'p'); 226 | } 227 | 228 | if (is_enabled('q')) { 229 | 230 | auto find = [](const std::string& s, const std::string& neddle) -> size_t { 231 | 232 | return avx512f_strstr_v2(s, neddle); 233 | }; 234 | 235 | measure(find, 'q'); 236 | } 237 | #endif 238 | 239 | #ifdef HAVE_AVX512BW_INSTRUCTIONS 240 | if (is_enabled('r')) { 241 | 242 | auto find = [](const std::string& s, const std::string& neddle) -> size_t { 243 | 244 | return avx512bw_strstr_v2(s, neddle); 245 | }; 246 | 247 | measure(find, 'r'); 248 | } 249 | #endif 250 | 251 | #ifdef HAVE_NEON_INSTRUCTIONS 252 | if (is_enabled('s')) { 253 | 254 | auto find = [](const std::string& s, const std::string& neddle) -> size_t { 255 | 256 | return neon_strstr_v2(s, neddle); 257 | }; 258 | 259 | measure(find, 's'); 260 | } 261 | #endif 262 | 263 | 
#ifdef HAVE_AARCH64_ARCHITECTURE 264 | if (is_enabled('t')) { 265 | 266 | auto find = [](const std::string& s, const std::string& neddle) -> size_t { 267 | 268 | return aarch64_strstr_v2(s, neddle); 269 | }; 270 | 271 | measure(find, 't'); 272 | } 273 | #endif 274 | 275 | return true; 276 | } 277 | 278 | 279 | static void print_help(const char* progname) { 280 | std::printf("%s file words [count] [procedure]\n", progname); 281 | std::puts(""); 282 | std::puts("Parameters:"); 283 | std::puts(""); 284 | std::puts(" file - arbitrary file"); 285 | std::puts(" words - list of words in separate lines"); 286 | std::puts(" count - repeat count (optional, default = 10)"); 287 | std::puts(" procedure - letter(s) from square brackets (by default all functions are checked)"); 288 | std::puts(""); 289 | std::puts("Following procedures ara available:"); 290 | for (auto& item: all_procedures().procedures) { 291 | printf(" [%c] %s\n", item.code, item.name.c_str()); 292 | } 293 | } 294 | 295 | 296 | private: 297 | template 298 | void measure(T_FIND find, char code) { 299 | 300 | printf("%-40s... 
", db[code].name.c_str()); 301 | fflush(stdout); 302 | 303 | size_t result = 0; 304 | 305 | const auto t1 = std::chrono::high_resolution_clock::now(); 306 | 307 | auto k = count; 308 | while (k != 0) { 309 | for (const auto& word: words) { 310 | result += find(file, word); 311 | } 312 | 313 | k--; 314 | } 315 | 316 | const auto t2 = std::chrono::high_resolution_clock::now(); 317 | const std::chrono::duration td = t2-t1; 318 | 319 | printf("reference result = %lu, time = %10.6f s\n", result, td.count()); 320 | } 321 | 322 | 323 | bool is_enabled(char proc) const { 324 | return (procedure_codes.empty()) 325 | || (procedure_codes.find(proc) != std::string::npos); 326 | } 327 | }; 328 | 329 | 330 | bool parse(int argc, char* argv[], Application::Parameters& p) { 331 | if (argc < 3) { 332 | return false; 333 | } 334 | 335 | for (int i=1; i < argc; i++) { 336 | const std::string tmp = argv[i]; 337 | if (tmp == "-h" || tmp == "--help") 338 | return false; 339 | } 340 | 341 | p.file_name = argv[1]; 342 | p.words_name = argv[2]; 343 | 344 | if (argc >= 4) { 345 | size_t tmp = atoi(argv[3]); 346 | if (tmp > 0) { 347 | p.count = tmp; 348 | } else { 349 | printf("repeat count '%s' invalid, keeping default %lu\n", argv[3], p.count); 350 | } 351 | } 352 | 353 | if (argc >= 5) { 354 | p.procedure_codes = argv[4]; 355 | } 356 | 357 | return true; 358 | } 359 | 360 | 361 | int main(int argc, char* argv[]) { 362 | 363 | try { 364 | 365 | Application::Parameters params; 366 | if (!parse(argc, argv, params)) { 367 | Application::print_help(argv[0]); 368 | return EXIT_FAILURE; 369 | } 370 | 371 | Application app(params); 372 | return app() ? 
EXIT_SUCCESS : EXIT_FAILURE; 373 | 374 | } catch (ApplicationBase::Error& err) { 375 | 376 | const auto msg = ansi::seq("Error: ", ansi::RED); 377 | printf("%s: %s\n", msg.data(), err.message.data()); 378 | 379 | return EXIT_FAILURE; 380 | } 381 | } 382 | -------------------------------------------------------------------------------- /src/unittests.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | #include "all.h" 9 | 10 | #include 11 | #include "all_procedures.cpp" 12 | 13 | bool test(const char* name, str_find_fun strstr_function) { 14 | 15 | std::printf("%s... ", name); 16 | std::fflush(stdout); 17 | 18 | for (size_t size = 1; size < 64; size++) { 19 | 20 | const std::string neddle = "$" + std::string(size, 'x') + "#"; 21 | 22 | for (size_t n = 0; n < 3*16; n++) { 23 | 24 | const std::string prefix(n, '.'); 25 | 26 | for (size_t k = 0; k < 3*16; k++) { 27 | // '.' * k + '$' + 'x' * size + '#' + '.' 
* k 28 | 29 | const std::string suffix(k, '.'); 30 | const std::string str = prefix + neddle + suffix; 31 | 32 | const auto result = strstr_function(str.data(), str.size(), neddle.data(), neddle.size()); 33 | 34 | if (result != n) { 35 | printf("%s\n", ansi::seq("FAILED", ansi::RED).c_str()); 36 | 37 | printf(" string = '%s' (length %lu)\n", str.data(), str.size()); 38 | printf(" neddle = '%s' (length %lu)\n", neddle.data(), neddle.size()); 39 | printf(" expected result = %lu, actual result = %lu\n", n, result); 40 | 41 | return false; 42 | } 43 | } 44 | } 45 | } 46 | 47 | const auto msg = ansi::seq("OK", ansi::GREEN); 48 | printf("%s\n", msg.c_str()); 49 | 50 | return true; 51 | } 52 | 53 | 54 | int main() { 55 | 56 | int ret = EXIT_SUCCESS; 57 | 58 | puts("running unit tests"); 59 | 60 | auto db = all_procedures(); 61 | for (auto& item: db.procedures) { 62 | if (item.builtin) { 63 | continue; 64 | } 65 | 66 | if (!test(item.name.c_str(), item.proc)) { 67 | ret = EXIT_FAILURE; 68 | } 69 | } 70 | 71 | return ret; 72 | } 73 | 74 | -------------------------------------------------------------------------------- /src/validate.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | // ------------------------------------------------------------------------ 9 | 10 | #include "all_procedures.cpp" 11 | 12 | // ------------------------------------------------------------------------ 13 | 14 | #include 15 | #include "application_base.cpp" 16 | 17 | 18 | class Application final: public ApplicationBase { 19 | 20 | public: 21 | Application(const std::string& file_name, const std::string& words_name) { 22 | prepare(file_name, words_name); 23 | } 24 | 25 | bool run() { 26 | const auto n = words.size(); 27 | 28 | auto db = all_procedures(); 29 | 30 | for (size_t i = 0; i < n; i++) { 31 | 32 | if (i % 100 == 0) { 33 | print_progress(i, n); 34 | } 35 | 36 | const 
// Rewrites the current terminal line with the validation progress:
// `pos` words done out of `n`.
void print_progress(size_t pos, size_t n) {

    // With an empty word list there is nothing to validate; report 100%
    // rather than dividing by zero (which would print NaN).
    const double percent = (n == 0) ? 100.0 : (100.0 * pos) / n;

    // %zu is the conversion for size_t arguments.
    printf("validating... %0.2f%% (%zu/%zu)\r", percent, pos, n);
    fflush(stdout);
}
EXIT_SUCCESS : EXIT_FAILURE; 98 | } catch (ApplicationBase::Error& err) { 99 | 100 | const auto msg = ansi::seq("Error: ", ansi::RED); 101 | printf("%s: %s\n", msg.data(), err.message.data()); 102 | 103 | return EXIT_FAILURE; 104 | } 105 | } else { 106 | Application::print_help(argv[0]); 107 | 108 | return EXIT_FAILURE; 109 | } 110 | } 111 | -------------------------------------------------------------------------------- /sse-naive-strstr.cpp: -------------------------------------------------------------------------------- 1 | // Method descibed in https://arxiv.org/pdf/1612.01506.pdf 2 | // 3 | // Implementation by Daniel Lemire 4 | // https://github.com/lemire/Code-used-on-Daniel-Lemire-s-blog/blob/master/extra/simd/substring/substring.c 5 | 6 | size_t FORCE_INLINE sse_naive_strstr_anysize(const char* s, size_t n, const char* needle, size_t k) { 7 | 8 | assert(k > 0); 9 | assert(n > 0); 10 | 11 | if (n == k) { 12 | return (memcmp(s, needle, k) == 0) ? 0 : std::string::npos; 13 | } 14 | 15 | for (size_t i = 0; i < n - k + 1; i += 16) { 16 | uint16_t found = 0xffff; 17 | for (size_t j = 0; (j < k) && (found != 0) ; ++j) { 18 | const __m128i textvector = _mm_loadu_si128((const __m128i *)(s + i + j)); 19 | const __m128i needlevector = _mm_set1_epi8(needle[j]); 20 | uint16_t bitmask = _mm_movemask_epi8(_mm_cmpeq_epi8(textvector, needlevector)); 21 | found = found & bitmask; 22 | } 23 | if (found != 0) { 24 | return i + __builtin_ctz(found); 25 | } 26 | } 27 | 28 | return std::string::npos; 29 | } 30 | 31 | 32 | // ------------------------------------------------------------------------ 33 | 34 | size_t sse_naive_strstr(const char* s, size_t n, const char* needle, size_t k) { 35 | 36 | size_t result = std::string::npos; 37 | 38 | if (n < k) { 39 | return result; 40 | } 41 | 42 | result = sse_naive_strstr_anysize(s, n, needle, k); 43 | 44 | if (result <= n - k) { 45 | return result; 46 | } else { 47 | return std::string::npos; 48 | } 49 | } 50 | 51 | // 
------------------------------------------------------------------------ 52 | 53 | size_t sse_naive_strstr(const std::string& s, const std::string& needle) { 54 | 55 | return sse_naive_strstr(s.data(), s.size(), needle.data(), needle.size()); 56 | } 57 | 58 | 59 | -------------------------------------------------------------------------------- /sse2-needle4.cpp: -------------------------------------------------------------------------------- 1 | size_t FORCE_INLINE sse2_needle4(const char* s, size_t n, const char* needle, size_t k) { 2 | 3 | uint32_t u32; 4 | memcpy(&u32, needle, sizeof(u32)); 5 | 6 | const __m128i v_needle = _mm_set1_epi32(u32); 7 | const __m128i shuffle = _mm_setr_epi8(0, 1, 2, 3, 8 | 1, 2, 3, 4, 9 | 2, 3, 4, 5, 10 | 3, 4, 5, 6); 11 | 12 | for (size_t i = 0; i < n - k + 1; i += 4) { 13 | // 1. load 7 bytes: 14 | // [abcd|efg?|????|????] 15 | uint64_t u64; 16 | memcpy(&u64, &s[i], sizeof(u64)); 17 | const __m128i t0 = _mm_cvtsi64x_si128(u64); 18 | 19 | // 2. make all possible 4-byte substrings 20 | // [abcd|bcde|cdef|defg] 21 | const __m128i t1 = _mm_shuffle_epi8(shuffle, t0); 22 | 23 | // 3. 
compare the 4-byte substrings with the needle 24 | const __m128i t2 = _mm_cmpeq_epi32(v_needle, t1); 25 | 26 | const int mask = _mm_movemask_ps((__m128)t2); 27 | if (mask != 0) { 28 | return i + __builtin_clz(mask); 29 | } 30 | } 31 | 32 | return std::string::npos; 33 | } 34 | 35 | 36 | // ------------------------------------------------------------------------ 37 | 38 | size_t sse2_strstr_needle4(const char* s, size_t n, const char* needle, size_t k) { 39 | 40 | if (k != 4) { 41 | return std::string::npos; 42 | } 43 | 44 | return sse2_needle4(s, n, needle, k); 45 | } 46 | 47 | // ------------------------------------------------------------------------ 48 | 49 | size_t sse2_strstr_needle4(const std::string& s, const std::string& needle) { 50 | 51 | return sse2_strstr_needle4(s.data(), s.size(), needle.data(), needle.size()); 52 | } 53 | 54 | 55 | size_t FORCE_INLINE sse2_needle4_v2(const char* s, size_t n, const char* needle, size_t k) { 56 | 57 | uint32_t u32; 58 | memcpy(&u32, needle, sizeof(u32)); 59 | 60 | const __m128i v_needle = _mm_set1_epi32(u32); 61 | const __m128i shuffle0 = _mm_setr_epi8(0, 1, 2, 3, 62 | 1, 2, 3, 4, 63 | 2, 3, 4, 5, 64 | 3, 4, 5, 6); 65 | const __m128i shuffle1 = _mm_setr_epi8(4, 5, 6, 7, 66 | 5, 6, 7, 8, 67 | 6, 7, 8, 9, 68 | 7, 8, 9, 10); 69 | 70 | for (size_t i = 0; i < n - k + 1; i += 8) { 71 | // 1. load 15 ytes: 72 | // [abcd|efgh|ijkl|????] 73 | const __m128i input = _mm_loadu_si128((const __m128i*)(s + i)); 74 | 75 | // 2a. make all possible 4-byte substrings 76 | // lo = [abcd|bcde|cdef|defg] 77 | const __m128i lo = _mm_shuffle_epi8(shuffle0, input); 78 | 79 | // hi = [efgh|fghi|ghij|hijk] 80 | const __m128i hi = _mm_shuffle_epi8(shuffle1, input); 81 | 82 | // 3. 
compare the 4-byte substrings with the needle 83 | const __m128i eq_lo = _mm_cmpeq_epi32(v_needle, lo); 84 | const __m128i eq_hi = _mm_cmpeq_epi32(v_needle, hi); 85 | 86 | // to perform single movemask in the main loop 87 | const __m128i t0 = _mm_or_si128(eq_lo, eq_hi); 88 | 89 | const int mask = _mm_movemask_ps((__m128)t0); 90 | if (mask != 0) { 91 | const int mask_lo = _mm_movemask_ps((__m128)eq_lo); 92 | if (mask_lo != 0) { 93 | return i + __builtin_clz(mask_lo); 94 | } else { 95 | return i + 4 + __builtin_clz(mask); 96 | } 97 | } 98 | } 99 | 100 | return std::string::npos; 101 | } 102 | 103 | 104 | // ------------------------------------------------------------------------ 105 | 106 | size_t sse2_strstr_needle4_v2(const char* s, size_t n, const char* needle, size_t k) { 107 | 108 | if (k != 4) { 109 | return std::string::npos; 110 | } 111 | 112 | return sse2_needle4_v2(s, n, needle, k); 113 | } 114 | 115 | // ------------------------------------------------------------------------ 116 | 117 | size_t sse2_strstr_needle4_v2(const std::string& s, const std::string& needle) { 118 | 119 | return sse2_strstr_needle4_v2(s.data(), s.size(), needle.data(), needle.size()); 120 | } 121 | 122 | 123 | -------------------------------------------------------------------------------- /sse2-strstr.cpp: -------------------------------------------------------------------------------- 1 | // implements scheme described in http://0x80.pl/articles/simd-friendly-karp-rabin.html 2 | 3 | size_t FORCE_INLINE sse2_strstr_anysize(const char* s, size_t n, const char* needle, size_t k) { 4 | 5 | assert(k > 0); 6 | assert(n > 0); 7 | 8 | const __m128i first = _mm_set1_epi8(needle[0]); 9 | const __m128i last = _mm_set1_epi8(needle[k - 1]); 10 | 11 | for (size_t i = 0; i < n; i += 16) { 12 | 13 | const __m128i block_first = _mm_loadu_si128(reinterpret_cast(s + i)); 14 | const __m128i block_last = _mm_loadu_si128(reinterpret_cast(s + i + k - 1)); 15 | 16 | const __m128i eq_first = 
_mm_cmpeq_epi8(first, block_first); 17 | const __m128i eq_last = _mm_cmpeq_epi8(last, block_last); 18 | 19 | uint16_t mask = _mm_movemask_epi8(_mm_and_si128(eq_first, eq_last)); 20 | 21 | while (mask != 0) { 22 | 23 | const auto bitpos = bits::get_first_bit_set(mask); 24 | 25 | if (memcmp(s + i + bitpos + 1, needle + 1, k - 2) == 0) { 26 | return i + bitpos; 27 | } 28 | 29 | mask = bits::clear_leftmost_set(mask); 30 | } 31 | } 32 | 33 | return std::string::npos; 34 | } 35 | 36 | // ------------------------------------------------------------------------ 37 | 38 | template 39 | size_t FORCE_INLINE sse2_strstr_memcmp(const char* s, size_t n, const char* needle, MEMCMP memcmp_fun) { 40 | 41 | assert(k > 0); 42 | assert(n > 0); 43 | 44 | const __m128i first = _mm_set1_epi8(needle[0]); 45 | const __m128i last = _mm_set1_epi8(needle[k - 1]); 46 | 47 | for (size_t i = 0; i < n; i += 16) { 48 | 49 | const __m128i block_first = _mm_loadu_si128(reinterpret_cast(s + i)); 50 | const __m128i block_last = _mm_loadu_si128(reinterpret_cast(s + i + k - 1)); 51 | 52 | const __m128i eq_first = _mm_cmpeq_epi8(first, block_first); 53 | const __m128i eq_last = _mm_cmpeq_epi8(last, block_last); 54 | 55 | uint32_t mask = _mm_movemask_epi8(_mm_and_si128(eq_first, eq_last)); 56 | 57 | while (mask != 0) { 58 | 59 | const auto bitpos = bits::get_first_bit_set(mask); 60 | 61 | if (memcmp_fun(s + i + bitpos + 1, needle + 1)) { 62 | return i + bitpos; 63 | } 64 | 65 | mask = bits::clear_leftmost_set(mask); 66 | } 67 | } 68 | 69 | return std::string::npos; 70 | } 71 | 72 | // ------------------------------------------------------------------------ 73 | 74 | size_t sse2_strstr_v2(const char* s, size_t n, const char* needle, size_t k) { 75 | 76 | size_t result = std::string::npos; 77 | 78 | if (n < k) { 79 | return result; 80 | } 81 | 82 | switch (k) { 83 | case 0: 84 | return 0; 85 | 86 | case 1: { 87 | const char* res = reinterpret_cast(strchr(s, needle[0])); 88 | 89 | return (res != nullptr) ? 
res - s : std::string::npos; 90 | } 91 | 92 | case 2: 93 | result = sse2_strstr_memcmp<2>(s, n, needle, always_true); 94 | break; 95 | 96 | case 3: 97 | result = sse2_strstr_memcmp<3>(s, n, needle, memcmp1); 98 | break; 99 | 100 | case 4: 101 | result = sse2_strstr_memcmp<4>(s, n, needle, memcmp2); 102 | break; 103 | 104 | case 5: 105 | result = sse2_strstr_memcmp<5>(s, n, needle, memcmp4); 106 | break; 107 | 108 | case 6: 109 | result = sse2_strstr_memcmp<6>(s, n, needle, memcmp4); 110 | break; 111 | 112 | case 7: 113 | result = sse2_strstr_memcmp<7>(s, n, needle, memcmp5); 114 | break; 115 | 116 | case 8: 117 | result = sse2_strstr_memcmp<8>(s, n, needle, memcmp6); 118 | break; 119 | 120 | case 9: 121 | result = sse2_strstr_memcmp<9>(s, n, needle, memcmp8); 122 | break; 123 | 124 | case 10: 125 | result = sse2_strstr_memcmp<10>(s, n, needle, memcmp8); 126 | break; 127 | 128 | case 11: 129 | result = sse2_strstr_memcmp<11>(s, n, needle, memcmp9); 130 | break; 131 | 132 | case 12: 133 | result = sse2_strstr_memcmp<12>(s, n, needle, memcmp10); 134 | break; 135 | 136 | default: 137 | result = sse2_strstr_anysize(s, n, needle, k); 138 | break; 139 | } 140 | 141 | if (result <= n - k) { 142 | return result; 143 | } else { 144 | return std::string::npos; 145 | } 146 | } 147 | 148 | // ------------------------------------------------------------------------ 149 | 150 | size_t sse2_strstr_v2(const std::string& s, const std::string& needle) { 151 | 152 | return sse2_strstr_v2(s.data(), s.size(), needle.data(), needle.size()); 153 | } 154 | 155 | 156 | -------------------------------------------------------------------------------- /sse4-strstr-unrolled.cpp: -------------------------------------------------------------------------------- 1 | // Note: it appears that these specialized functions do not help. 2 | // But I decided to left them, just in case. 
3 | 4 | // use functions/templates dealing with certain substring length 5 | //#define ENABLE_SSE4_LENGTH_SPECIALIZATIONS 6 | 7 | // When defined use sse4_strstr_unrolled_memcmp template, 8 | // otherwise use just sse4_strstr_unrolled_max20 and sse4_strstr_unrolled_max36 9 | //#define ENABLE_SSE4_MEMCMP_TEMPLATES 10 | 11 | size_t sse4_strstr_unrolled_anysize(const char* s, size_t n, const char* needle, size_t needle_size) { 12 | 13 | assert(needle_size > 4); 14 | assert(n > 0); 15 | 16 | const __m128i prefix = _mm_loadu_si128(reinterpret_cast(needle)); 17 | const __m128i zeros = _mm_setzero_si128(); 18 | 19 | __m128i prev = _mm_loadu_si128(reinterpret_cast(s)); 20 | __m128i curr; 21 | 22 | for (size_t i = 0; i < n; i += 16) { 23 | 24 | curr = _mm_loadu_si128(reinterpret_cast(s + i + 16)); 25 | 26 | const __m128i data0 = prev; 27 | const __m128i data1 = _mm_alignr_epi8(curr, prev, 8); 28 | const __m128i result0 = _mm_mpsadbw_epu8(data0, prefix, 0); 29 | const __m128i result1 = _mm_mpsadbw_epu8(data1, prefix, 0); 30 | prev = curr; 31 | 32 | const __m128i result = _mm_packus_epi16(result0, result1); 33 | const __m128i cmp = _mm_cmpeq_epi8(result, zeros); 34 | 35 | unsigned mask = _mm_movemask_epi8(cmp); 36 | 37 | while (mask != 0) { 38 | 39 | const auto bitpos = bits::get_first_bit_set(mask); 40 | 41 | if (memcmp(s + i + bitpos + 4, needle + 4, needle_size - 4) == 0) { 42 | return i + bitpos; 43 | } 44 | 45 | mask = bits::clear_leftmost_set(mask); 46 | } 47 | } 48 | 49 | return std::string::npos; 50 | } 51 | 52 | // ------------------------------------------------------------------------ 53 | 54 | template 55 | size_t sse4_strstr_unrolled_memcmp(const char* s, size_t n, const char* needle, MEMCMP memcmp_fun) { 56 | 57 | assert(k > 4); 58 | assert(n > 0); 59 | 60 | const __m128i prefix = _mm_loadu_si128(reinterpret_cast(needle)); 61 | const __m128i zeros = _mm_setzero_si128(); 62 | 63 | __m128i prev = _mm_loadu_si128(reinterpret_cast(s)); 64 | __m128i curr; 65 | 66 | 
for (size_t i = 0; i < n; i += 16) { 67 | 68 | curr = _mm_loadu_si128(reinterpret_cast(s + i + 16)); 69 | 70 | const __m128i data0 = prev; 71 | const __m128i data1 = _mm_alignr_epi8(curr, prev, 8); 72 | const __m128i result0 = _mm_mpsadbw_epu8(data0, prefix, 0); 73 | const __m128i result1 = _mm_mpsadbw_epu8(data1, prefix, 0); 74 | prev = curr; 75 | 76 | const __m128i result = _mm_packus_epi16(result0, result1); 77 | const __m128i cmp = _mm_cmpeq_epi8(result, zeros); 78 | 79 | unsigned mask = _mm_movemask_epi8(cmp); 80 | 81 | while (mask != 0) { 82 | 83 | const auto bitpos = bits::get_first_bit_set(mask); 84 | 85 | if (memcmp_fun(s + i + bitpos + 4, needle + 4)) { 86 | return i + bitpos; 87 | } 88 | 89 | mask = bits::clear_leftmost_set(mask); 90 | } 91 | } 92 | 93 | return std::string::npos; 94 | } 95 | 96 | // ------------------------------------------------------------------------ 97 | 98 | size_t sse4_strstr_unrolled_max20(const char* s, size_t n, const char* needle, size_t needle_size) { 99 | 100 | const __m128i zeros = _mm_setzero_si128(); 101 | const __m128i prefix = sse::load(needle); 102 | const __m128i suffix = sse::load(needle + 4); 103 | const __m128i suff_mask = sse::mask_lower_bytes(needle_size - 4); 104 | 105 | for (size_t i = 0; i < n; i += 8) { 106 | 107 | const __m128i data = sse::load(s + i); 108 | const __m128i result = _mm_mpsadbw_epu8(data, prefix, 0); 109 | 110 | const __m128i cmp = _mm_cmpeq_epi16(result, zeros); 111 | 112 | unsigned mask = _mm_movemask_epi8(cmp) & 0x5555; 113 | 114 | while (mask != 0) { 115 | 116 | const auto bitpos = bits::get_first_bit_set(mask)/2; 117 | 118 | const __m128i str = sse::load(s + i + bitpos + 4); 119 | const __m128i cmp = _mm_cmpeq_epi8(str, suffix); 120 | 121 | if (_mm_testc_si128(cmp, suff_mask)) { 122 | 123 | return i + bitpos; 124 | } 125 | 126 | mask = bits::clear_leftmost_set(mask); 127 | } 128 | } 129 | 130 | return std::string::npos; 131 | } 132 | 133 | // 
------------------------------------------------------------------------ 134 | 135 | size_t sse4_strstr_unrolled_max36(const char* s, size_t n, const char* needle, size_t needle_size) { 136 | 137 | const __m128i zeros = _mm_setzero_si128(); 138 | const __m128i prefix = sse::load(needle); 139 | const __m128i suffix1 = sse::load(needle + 4); 140 | const __m128i suffix2 = sse::load(needle + 16 + 4); 141 | const __m128i suff_mask = sse::mask_higher_bytes(needle_size - (16 + 4)); 142 | 143 | for (size_t i = 0; i < n; i += 8) { 144 | 145 | const __m128i data = sse::load(s + i); 146 | const __m128i result = _mm_mpsadbw_epu8(data, prefix, 0); 147 | 148 | const __m128i cmp = _mm_cmpeq_epi16(result, zeros); 149 | 150 | unsigned mask = _mm_movemask_epi8(cmp) & 0x5555; 151 | 152 | while (mask != 0) { 153 | 154 | const auto bitpos = bits::get_first_bit_set(mask)/2; 155 | 156 | const __m128i c1 = _mm_cmpeq_epi8(sse::load(s + i + bitpos + 4), suffix1); 157 | const __m128i c2 = _mm_cmpeq_epi8(sse::load(s + i + bitpos + 16 + 4), suffix2); 158 | 159 | const __m128i c3 = _mm_or_si128(c2, suff_mask); 160 | const __m128i tmp = _mm_and_si128(c1, c3); 161 | 162 | if (_mm_movemask_epi8(tmp) == 0xffff) { 163 | 164 | return i + bitpos; 165 | } 166 | 167 | mask = bits::clear_leftmost_set(mask); 168 | } 169 | } 170 | 171 | return std::string::npos; 172 | } 173 | 174 | // ------------------------------------------------------------------------ 175 | 176 | size_t sse4_strstr_unrolled_len3(const char* s, size_t n, const char* needle) { 177 | 178 | const __m128i prefix = _mm_loadu_si128(reinterpret_cast(needle)); 179 | const __m128i zeros = _mm_setzero_si128(); 180 | 181 | for (size_t i = 0; i < n; i += 8) { 182 | 183 | const __m128i data = _mm_loadu_si128(reinterpret_cast(s + i)); 184 | const __m128i lastbyte = _mm_cvtepu8_epi16(_mm_srli_si128(data, 3)); 185 | const __m128i result = _mm_mpsadbw_epu8(data, prefix, 0); 186 | 187 | const __m128i cmp = _mm_cmpeq_epi16(_mm_sub_epi16(result, 
lastbyte), zeros); 188 | 189 | unsigned mask = _mm_movemask_epi8(cmp) & 0x5555; 190 | 191 | if (mask != 0) { 192 | 193 | return i + bits::get_first_bit_set(mask)/2; 194 | } 195 | } 196 | 197 | return std::string::npos; 198 | } 199 | 200 | // ------------------------------------------------------------------------ 201 | 202 | size_t sse4_strstr_unrolled_len4(const char* s, size_t n, const char* needle) { 203 | 204 | const __m128i prefix = _mm_loadu_si128(reinterpret_cast(needle)); 205 | const __m128i zeros = _mm_setzero_si128(); 206 | 207 | for (size_t i = 0; i < n; i += 8) { 208 | 209 | const __m128i data = _mm_loadu_si128(reinterpret_cast(s + i)); 210 | const __m128i result = _mm_mpsadbw_epu8(data, prefix, 0); 211 | 212 | const __m128i cmp = _mm_cmpeq_epi16(result, zeros); 213 | 214 | unsigned mask = _mm_movemask_epi8(cmp); 215 | 216 | if (mask != 0) { 217 | 218 | return i + bits::get_first_bit_set(mask)/2; 219 | } 220 | } 221 | 222 | return std::string::npos; 223 | } 224 | 225 | // ------------------------------------------------------------------------ 226 | 227 | size_t sse4_strstr_unrolled(const char* s, size_t n, const char* needle, size_t needle_size) { 228 | 229 | size_t result = std::string::npos; 230 | 231 | if (n < needle_size) { 232 | return result; 233 | } 234 | 235 | switch (needle_size) { 236 | case 0: 237 | return 0; 238 | 239 | case 1: { 240 | const char* res = reinterpret_cast(strchr(s, needle[0])); 241 | 242 | return (res != nullptr) ? res - s : std::string::npos; 243 | } 244 | 245 | case 2: { 246 | const char* res = reinterpret_cast(strstr(s, needle)); 247 | 248 | return (res != nullptr) ? 
res - s : std::string::npos; 249 | } 250 | case 3: 251 | 252 | result = sse4_strstr_unrolled_len3(s, n, needle); 253 | break; 254 | 255 | case 4: 256 | result = sse4_strstr_unrolled_len4(s, n, needle); 257 | break; 258 | 259 | #ifdef ENABLE_SSE4_LENGTH_SPECIALIZATIONS 260 | #ifdef ENABLE_SSE4_MEMCMP_TEMPLATES 261 | case 5: 262 | result = sse4_strstr_unrolled_memcmp<5>(s, n, needle, memcmp1); 263 | break; 264 | 265 | case 6: 266 | result = sse4_strstr_unrolled_memcmp<6>(s, n, needle, memcmp2); 267 | break; 268 | 269 | case 7: 270 | result = sse4_strstr_unrolled_memcmp<7>(s, n, needle, memcmp3); 271 | break; 272 | 273 | case 8: 274 | result = sse4_strstr_unrolled_memcmp<8>(s, n, needle, memcmp4); 275 | break; 276 | 277 | case 9: 278 | result = sse4_strstr_unrolled_memcmp<9>(s, n, needle, memcmp5); 279 | break; 280 | 281 | case 10: 282 | result = sse4_strstr_unrolled_memcmp<10>(s, n, needle, memcmp6); 283 | break; 284 | 285 | case 11: 286 | result = sse4_strstr_unrolled_memcmp<11>(s, n, needle, memcmp7); 287 | break; 288 | 289 | case 12: 290 | result = sse4_strstr_unrolled_memcmp<12>(s, n, needle, memcmp8); 291 | break; 292 | 293 | case 13: 294 | result = sse4_strstr_unrolled_memcmp<13>(s, n, needle, memcmp9); 295 | break; 296 | 297 | case 14: 298 | result = sse4_strstr_unrolled_memcmp<14>(s, n, needle, memcmp10); 299 | break; 300 | #else 301 | case 5: case 6: case 7: case 8: 302 | case 9: case 10: case 11: case 12: 303 | case 13: case 14: /* 5 .. 
14 */ 304 | #endif // ENABLE_SSE4_MEMCMP_TEMPLATES 305 | case 15: case 16: case 17: case 18: case 19: 306 | case 20: /* 15..20 */ 307 | result = sse4_strstr_unrolled_max20(s, n, needle, needle_size); 308 | break; 309 | 310 | case 21: case 22: case 23: case 24: case 25: 311 | case 26: case 27: case 28: case 29: case 30: 312 | case 31: case 32: case 33: case 34: case 35: 313 | case 36: /* 21..36 */ 314 | result = sse4_strstr_unrolled_max36(s, n, needle, needle_size); 315 | break; 316 | #endif // ENABLE_SSE4_LENGTH_SPECIALIZATIONS 317 | default: 318 | result = sse4_strstr_unrolled_anysize(s, n, needle, needle_size); 319 | break; 320 | } 321 | 322 | 323 | if (result <= n - needle_size) { 324 | return result; 325 | } else { 326 | return std::string::npos; 327 | } 328 | } 329 | 330 | // -------------------------------------------------- 331 | 332 | size_t sse4_strstr_unrolled(const std::string& s, const std::string& needle) { 333 | 334 | return sse4_strstr_unrolled(s.data(), s.size(), needle.data(), needle.size()); 335 | } 336 | 337 | 338 | -------------------------------------------------------------------------------- /sse4-strstr.cpp: -------------------------------------------------------------------------------- 1 | size_t sse4_strstr_anysize(const char* s, size_t n, const char* needle, size_t needle_size) { 2 | 3 | assert(needle_size > 4); 4 | assert(n > 0); 5 | 6 | const __m128i prefix = _mm_loadu_si128(reinterpret_cast(needle)); 7 | const __m128i zeros = _mm_setzero_si128(); 8 | 9 | for (size_t i = 0; i < n; i += 8) { 10 | 11 | const __m128i data = _mm_loadu_si128(reinterpret_cast(s + i)); 12 | const __m128i result = _mm_mpsadbw_epu8(data, prefix, 0); 13 | 14 | const __m128i cmp = _mm_cmpeq_epi16(result, zeros); 15 | 16 | unsigned mask = _mm_movemask_epi8(cmp) & 0x5555; 17 | 18 | while (mask != 0) { 19 | 20 | const auto bitpos = bits::get_first_bit_set(mask)/2; 21 | 22 | if (memcmp(s + i + bitpos + 4, needle + 4, needle_size - 4) == 0) { 23 | return i + bitpos; 
24 | } 25 | 26 | mask = bits::clear_leftmost_set(mask); 27 | } 28 | } 29 | 30 | return std::string::npos; 31 | } 32 | 33 | // ------------------------------------------------------------------------ 34 | 35 | template 36 | size_t sse4_strstr_memcmp(const char* s, size_t n, const char* needle, MEMCMP memcmp_fun) { 37 | 38 | assert(k > 4); 39 | assert(n > 0); 40 | 41 | const __m128i prefix = _mm_loadu_si128(reinterpret_cast(needle)); 42 | const __m128i zeros = _mm_setzero_si128(); 43 | 44 | for (size_t i = 0; i < n; i += 8) { 45 | 46 | const __m128i data = _mm_loadu_si128(reinterpret_cast(s + i)); 47 | const __m128i result = _mm_mpsadbw_epu8(data, prefix, 0); 48 | 49 | const __m128i cmp = _mm_cmpeq_epi16(result, zeros); 50 | 51 | unsigned mask = _mm_movemask_epi8(cmp) & 0x5555; 52 | 53 | while (mask != 0) { 54 | 55 | const auto bitpos = bits::get_first_bit_set(mask)/2; 56 | 57 | if (memcmp_fun(s + i + bitpos + 4, needle + 4)) { 58 | return i + bitpos; 59 | } 60 | 61 | mask = bits::clear_leftmost_set(mask); 62 | } 63 | } 64 | 65 | return std::string::npos; 66 | } 67 | 68 | // ------------------------------------------------------------------------ 69 | 70 | size_t sse4_strstr_max20(const char* s, size_t n, const char* needle, size_t needle_size) { 71 | 72 | const __m128i zeros = _mm_setzero_si128(); 73 | const __m128i prefix = sse::load(needle); 74 | const __m128i suffix = sse::load(needle + 4); 75 | const __m128i suff_mask = sse::mask_lower_bytes(needle_size - 4); 76 | 77 | for (size_t i = 0; i < n; i += 8) { 78 | 79 | const __m128i data = sse::load(s + i); 80 | const __m128i result = _mm_mpsadbw_epu8(data, prefix, 0); 81 | 82 | const __m128i cmp = _mm_cmpeq_epi16(result, zeros); 83 | 84 | unsigned mask = _mm_movemask_epi8(cmp) & 0x5555; 85 | 86 | while (mask != 0) { 87 | 88 | const auto bitpos = bits::get_first_bit_set(mask)/2; 89 | 90 | const __m128i str = sse::load(s + i + bitpos + 4); 91 | const __m128i cmp = _mm_cmpeq_epi8(str, suffix); 92 | 93 | if 
(_mm_testc_si128(cmp, suff_mask)) { 94 | 95 | return i + bitpos; 96 | } 97 | 98 | mask = bits::clear_leftmost_set(mask); 99 | } 100 | } 101 | 102 | return std::string::npos; 103 | } 104 | 105 | // ------------------------------------------------------------------------ 106 | 107 | size_t sse4_strstr_max36(const char* s, size_t n, const char* needle, size_t needle_size) { 108 | 109 | const __m128i zeros = _mm_setzero_si128(); 110 | const __m128i prefix = sse::load(needle); 111 | const __m128i suffix1 = sse::load(needle + 4); 112 | const __m128i suffix2 = sse::load(needle + 16 + 4); 113 | const __m128i suff_mask = sse::mask_higher_bytes(needle_size - (16 + 4)); 114 | 115 | for (size_t i = 0; i < n; i += 8) { 116 | 117 | const __m128i data = sse::load(s + i); 118 | const __m128i result = _mm_mpsadbw_epu8(data, prefix, 0); 119 | 120 | const __m128i cmp = _mm_cmpeq_epi16(result, zeros); 121 | 122 | unsigned mask = _mm_movemask_epi8(cmp) & 0x5555; 123 | 124 | while (mask != 0) { 125 | 126 | const auto bitpos = bits::get_first_bit_set(mask)/2; 127 | 128 | const __m128i c1 = _mm_cmpeq_epi8(sse::load(s + i + bitpos + 4), suffix1); 129 | const __m128i c2 = _mm_cmpeq_epi8(sse::load(s + i + bitpos + 16 + 4), suffix2); 130 | 131 | const __m128i c3 = _mm_or_si128(c2, suff_mask); 132 | const __m128i tmp = _mm_and_si128(c1, c3); 133 | 134 | if (_mm_movemask_epi8(tmp) == 0xffff) { 135 | 136 | return i + bitpos; 137 | } 138 | 139 | mask = bits::clear_leftmost_set(mask); 140 | } 141 | } 142 | 143 | return std::string::npos; 144 | } 145 | 146 | // ------------------------------------------------------------------------ 147 | 148 | size_t sse4_strstr_len3(const char* s, size_t n, const char* needle) { 149 | 150 | const __m128i prefix = _mm_loadu_si128(reinterpret_cast(needle)); 151 | const __m128i zeros = _mm_setzero_si128(); 152 | 153 | for (size_t i = 0; i < n; i += 8) { 154 | 155 | const __m128i data = _mm_loadu_si128(reinterpret_cast(s + i)); 156 | const __m128i lastbyte = 
_mm_cvtepu8_epi16(_mm_srli_si128(data, 3)); 157 | const __m128i result = _mm_mpsadbw_epu8(data, prefix, 0); 158 | 159 | const __m128i cmp = _mm_cmpeq_epi16(_mm_sub_epi16(result, lastbyte), zeros); 160 | 161 | unsigned mask = _mm_movemask_epi8(cmp) & 0x5555; 162 | 163 | if (mask != 0) { 164 | 165 | return i + bits::get_first_bit_set(mask)/2; 166 | } 167 | } 168 | 169 | return std::string::npos; 170 | } 171 | 172 | // ------------------------------------------------------------------------ 173 | 174 | size_t sse4_strstr_len4(const char* s, size_t n, const char* needle) { 175 | 176 | const __m128i prefix = _mm_loadu_si128(reinterpret_cast(needle)); 177 | const __m128i zeros = _mm_setzero_si128(); 178 | 179 | for (size_t i = 0; i < n; i += 8) { 180 | 181 | const __m128i data = _mm_loadu_si128(reinterpret_cast(s + i)); 182 | const __m128i result = _mm_mpsadbw_epu8(data, prefix, 0); 183 | 184 | const __m128i cmp = _mm_cmpeq_epi16(result, zeros); 185 | 186 | unsigned mask = _mm_movemask_epi8(cmp); 187 | 188 | if (mask != 0) { 189 | 190 | return i + bits::get_first_bit_set(mask)/2; 191 | } 192 | } 193 | 194 | return std::string::npos; 195 | } 196 | 197 | // ------------------------------------------------------------------------ 198 | 199 | size_t sse4_strstr(const char* s, size_t n, const char* needle, size_t needle_size) { 200 | 201 | size_t result = std::string::npos; 202 | 203 | if (n < needle_size) { 204 | return result; 205 | } 206 | 207 | switch (needle_size) { 208 | case 0: 209 | return 0; 210 | 211 | case 1: { 212 | const char* res = reinterpret_cast(strchr(s, needle[0])); 213 | 214 | return (res != nullptr) ? res - s : std::string::npos; 215 | } 216 | case 2: { 217 | const char* res = reinterpret_cast(strstr(s, needle)); 218 | 219 | return (res != nullptr) ? 
res - s : std::string::npos; 220 | } 221 | case 3: 222 | 223 | result = sse4_strstr_len3(s, n, needle); 224 | break; 225 | 226 | case 4: 227 | result = sse4_strstr_len4(s, n, needle); 228 | break; 229 | 230 | #if 1 231 | case 5: 232 | result = sse4_strstr_memcmp<5>(s, n, needle, memcmp1); 233 | break; 234 | 235 | case 6: 236 | result = sse4_strstr_memcmp<6>(s, n, needle, memcmp2); 237 | break; 238 | 239 | case 7: 240 | result = sse4_strstr_memcmp<7>(s, n, needle, memcmp3); 241 | break; 242 | 243 | case 8: 244 | result = sse4_strstr_memcmp<8>(s, n, needle, memcmp4); 245 | break; 246 | 247 | case 9: 248 | result = sse4_strstr_memcmp<9>(s, n, needle, memcmp5); 249 | break; 250 | 251 | case 10: 252 | result = sse4_strstr_memcmp<10>(s, n, needle, memcmp6); 253 | break; 254 | 255 | case 11: 256 | result = sse4_strstr_memcmp<11>(s, n, needle, memcmp7); 257 | break; 258 | 259 | case 12: 260 | result = sse4_strstr_memcmp<12>(s, n, needle, memcmp8); 261 | break; 262 | 263 | case 13: 264 | result = sse4_strstr_memcmp<13>(s, n, needle, memcmp9); 265 | break; 266 | 267 | case 14: 268 | result = sse4_strstr_memcmp<14>(s, n, needle, memcmp10); 269 | break; 270 | #else 271 | case 5: case 6: case 7: case 8: 272 | case 9: case 10: case 11: case 12: 273 | case 13: case 14: /* 5 .. 
14 */ 274 | #endif 275 | case 15: case 16: case 17: case 18: case 19: 276 | case 20: /* 15..20 */ 277 | result = sse4_strstr_max20(s, n, needle, needle_size); 278 | break; 279 | 280 | case 21: case 22: case 23: case 24: case 25: 281 | case 26: case 27: case 28: case 29: case 30: 282 | case 31: case 32: case 33: case 34: case 35: 283 | case 36: /* 21..36 */ 284 | result = sse4_strstr_max36(s, n, needle, needle_size); 285 | break; 286 | 287 | default: 288 | result = sse4_strstr_anysize(s, n, needle, needle_size); 289 | break; 290 | } 291 | 292 | 293 | if (result <= n - needle_size) { 294 | return result; 295 | } else { 296 | return std::string::npos; 297 | } 298 | } 299 | 300 | // -------------------------------------------------- 301 | 302 | size_t sse4_strstr(const std::string& s, const std::string& needle) { 303 | 304 | return sse4_strstr(s.data(), s.size(), needle.data(), needle.size()); 305 | } 306 | 307 | -------------------------------------------------------------------------------- /sse4.2-strstr.cpp: -------------------------------------------------------------------------------- 1 | /* Usage of PCMPESTRM instruction from SSE 4.1 */ 2 | 3 | size_t FORCE_INLINE sse42_strstr_anysize(const char* s, size_t n, const char* needle, size_t k) { 4 | 5 | assert(k > 0); 6 | assert(n > 0); 7 | 8 | const __m128i N = _mm_loadu_si128((__m128i*)needle); 9 | 10 | for (size_t i = 0; i < n; i += 16) { 11 | 12 | const int mode = _SIDD_UBYTE_OPS 13 | | _SIDD_CMP_EQUAL_ORDERED 14 | | _SIDD_BIT_MASK; 15 | 16 | const __m128i D = _mm_loadu_si128((__m128i*)(s + i)); 17 | const __m128i res = _mm_cmpestrm(N, k, D, n - i, mode); 18 | uint64_t mask = _mm_cvtsi128_si64(res); 19 | 20 | while (mask != 0) { 21 | 22 | const auto bitpos = bits::get_first_bit_set(mask); 23 | 24 | // we know that at least the first character of needle matches 25 | if (memcmp(s + i + bitpos + 1, needle + 1, k - 1) == 0) { 26 | return i + bitpos; 27 | } 28 | 29 | mask = bits::clear_leftmost_set(mask); 30 | } 31 | 
} 32 | 33 | return std::string::npos; 34 | } 35 | 36 | 37 | template 38 | size_t FORCE_INLINE sse42_strstr_memcmp(const char* s, size_t n, const char* needle, MEMCMP memcmp_fun) { 39 | 40 | assert(k > 0); 41 | assert(n > 0); 42 | 43 | const __m128i N = _mm_loadu_si128((__m128i*)needle); 44 | 45 | for (size_t i = 0; i < n; i += 16) { 46 | 47 | const int mode = _SIDD_UBYTE_OPS 48 | | _SIDD_CMP_EQUAL_ORDERED 49 | | _SIDD_BIT_MASK; 50 | 51 | const __m128i D = _mm_loadu_si128((__m128i*)(s + i)); 52 | const __m128i res = _mm_cmpestrm(N, k, D, n - i, mode); 53 | uint64_t mask = _mm_cvtsi128_si64(res); 54 | 55 | while (mask != 0) { 56 | 57 | const auto bitpos = bits::get_first_bit_set(mask); 58 | 59 | if (memcmp_fun(s + i + bitpos + 1, needle + 1)) { 60 | return i + bitpos; 61 | } 62 | 63 | mask = bits::clear_leftmost_set(mask); 64 | } 65 | } 66 | 67 | return std::string::npos; 68 | } 69 | 70 | // ------------------------------------------------------------------------ 71 | 72 | size_t sse42_strstr(const char* s, size_t n, const char* needle, size_t k) { 73 | 74 | size_t result = std::string::npos; 75 | 76 | if (n < k) { 77 | return result; 78 | } 79 | 80 | switch (k) { 81 | case 0: 82 | return 0; 83 | 84 | case 1: { 85 | const char* res = reinterpret_cast(strchr(s, needle[0])); 86 | 87 | return (res != nullptr) ? 
res - s : std::string::npos; 88 | } 89 | 90 | /* Needles of 2..12 bytes use the fixed-length search; the helper passes s + pos + 1 and needle + 1 to the comparator, so memcmpN presumably compares the N = k - 1 bytes after the first character (defined elsewhere - confirm). */ case 2: 91 | result = sse42_strstr_memcmp<2>(s, n, needle, memcmp1); 92 | break; 93 | 94 | case 3: 95 | result = sse42_strstr_memcmp<3>(s, n, needle, memcmp2); 96 | break; 97 | 98 | case 4: 99 | result = sse42_strstr_memcmp<4>(s, n, needle, memcmp3); 100 | break; 101 | 102 | case 5: 103 | result = sse42_strstr_memcmp<5>(s, n, needle, memcmp4); 104 | break; 105 | 106 | case 6: 107 | result = sse42_strstr_memcmp<6>(s, n, needle, memcmp5); 108 | break; 109 | 110 | case 7: 111 | result = sse42_strstr_memcmp<7>(s, n, needle, memcmp6); 112 | break; 113 | 114 | case 8: 115 | result = sse42_strstr_memcmp<8>(s, n, needle, memcmp7); 116 | break; 117 | 118 | case 9: 119 | result = sse42_strstr_memcmp<9>(s, n, needle, memcmp8); 120 | break; 121 | 122 | case 10: 123 | result = sse42_strstr_memcmp<10>(s, n, needle, memcmp9); 124 | break; 125 | 126 | case 11: 127 | result = sse42_strstr_memcmp<11>(s, n, needle, memcmp10); 128 | break; 129 | 130 | case 12: 131 | result = sse42_strstr_memcmp<12>(s, n, needle, memcmp11); 132 | break; 133 | 134 | default: 135 | result = sse42_strstr_anysize(s, n, needle, k); 136 | break; 137 | } 138 | 139 | /* a hit counts only when the whole needle fits inside s; anything else, including npos, becomes npos */ if (result <= n - k) { 140 | return result; 141 | } else { 142 | return std::string::npos; 143 | } 144 | } 145 | 146 | // ------------------------------------------------------------------------ 147 | 148 | /* std::string overload: passes data() and size() of both arguments to the pointer+length sse42_strstr. */ size_t sse42_strstr(const std::string& s, const std::string& needle) { 149 | 150 | return sse42_strstr(s.data(), s.size(), needle.data(), needle.size()); 151 | } 152 | 153 | 154 | 155 | -------------------------------------------------------------------------------- /swar32-strstr-v2.cpp: -------------------------------------------------------------------------------- 1 | /* SWAR search on 32-bit words: first and last hold needle[0] and needle[k - 1] replicated into every byte; a zero byte in the combined XOR of the haystack words at offsets 0 and k - 1 marks a candidate whose k - 2 middle bytes are then checked with memcmp. Returns the match offset or std::string::npos. NOTE(review): the word loads go through reinterpret_cast on s and can extend past s + n - presumably callers guarantee readable padding; confirm. */ size_t FORCE_INLINE swar32_strstr_anysize(const char* s, size_t n, const char* needle, size_t k) { 2 | 3 | assert(k > 0); 4 | assert(n > 0); 5 | 6 | const uint32_t first = 0x01010101u * static_cast(needle[0]); 7 | const uint32_t
last = 0x01010101u * static_cast(needle[k - 1]); 8 | 9 | uint32_t* block_first = reinterpret_cast(const_cast(s)); 10 | uint32_t* block_last = reinterpret_cast(const_cast(s + k - 1)); 11 | 12 | // 2. sequence scan 13 | /* size_t index: an unsigned int counter would wrap before reaching n on haystacks of 4 GiB or more */ for (size_t i=0; i < n; i+=4, block_first++, block_last++) { 14 | // 0 bytes in eq indicate matching chars 15 | const uint32_t eq = (*block_first ^ first) | (*block_last ^ last); 16 | 17 | // 7th bit set if lower 7 bits are zero 18 | const uint32_t t0 = (~eq & 0x7f7f7f7fu) + 0x01010101u; 19 | // 7th bit set if 7th bit is zero 20 | const uint32_t t1 = (~eq & 0x80808080u); 21 | uint32_t zeros = t0 & t1; 22 | size_t j = 0; 23 | 24 | /* walk the byte lanes from the lowest up; bit 7 of the low byte flags a candidate at offset j */ while (zeros) { 25 | if (zeros & 0x80) { 26 | const char* substr = reinterpret_cast(block_first) + j + 1; 27 | if (memcmp(substr, needle + 1, k - 2) == 0) { 28 | return i + j; 29 | } 30 | } 31 | 32 | zeros >>= 8; 33 | j += 1; 34 | } 35 | } 36 | 37 | return std::string::npos; 38 | } 39 | 40 | 41 | /* Same scan for a needle length k supplied as the template argument; memcmp_fun(a, b) checks the bytes after the first character and returns true when they match. */ template 42 | size_t FORCE_INLINE swar32_strstr_memcmp(const char* s, size_t n, const char* needle, MEMCMP memcmp_fun) { 43 | 44 | assert(n > 0); 45 | 46 | const uint32_t first = 0x01010101u * static_cast(needle[0]); 47 | const uint32_t last = 0x01010101u * static_cast(needle[k - 1]); 48 | 49 | uint32_t* block_first = reinterpret_cast(const_cast(s)); 50 | uint32_t* block_last = reinterpret_cast(const_cast(s + k - 1)); 51 | 52 | // 2.
sequence scan 53 | /* size_t index: an unsigned int counter would wrap before reaching n on haystacks of 4 GiB or more */ for (size_t i=0; i < n; i+=4, block_first++, block_last++) { 54 | const uint32_t eq = (*block_first ^ first) | (*block_last ^ last); 55 | const uint32_t t0 = (~eq & 0x7f7f7f7fu) + 0x01010101u; 56 | const uint32_t t1 = (~eq & 0x80808080u); 57 | uint32_t zeros = t0 & t1; 58 | size_t j = 0; 59 | 60 | while (zeros) { 61 | if (zeros & 0x80) { 62 | const char* substr = reinterpret_cast(block_first) + j + 1; 63 | if (memcmp_fun(substr, needle + 1)) { 64 | return i + j; 65 | } 66 | } 67 | 68 | zeros >>= 8; 69 | j += 1; 70 | } 71 | } 72 | 73 | return std::string::npos; 74 | } 75 | 76 | // ------------------------------------------------------------------------ 77 | 78 | /* Finds the first occurrence of needle[0..k) in s[0..n): returns its offset, 0 for an empty needle, and std::string::npos when n < k or nothing matches. */ size_t swar32_strstr_v2(const char* s, size_t n, const char* needle, size_t k) { 79 | 80 | size_t result = std::string::npos; 81 | 82 | if (n < k) { 83 | return result; 84 | } 85 | 86 | switch (k) { 87 | case 0: 88 | return 0; 89 | 90 | case 1: { 91 | /* memchr scans exactly the n bytes of s; strchr ignored n, stopped at the first NUL and could run past an unterminated buffer */ const char* res = reinterpret_cast(memchr(s, needle[0], n)); 92 | 93 | return (res != nullptr) ? res - s : std::string::npos; 94 | } 95 | 96 | case 2: 97 | result = swar32_strstr_memcmp<2>(s, n, needle, always_true); 98 | break; 99 | 100 | case 3: 101 | result = swar32_strstr_memcmp<3>(s, n, needle, memcmp1); 102 | break; 103 | 104 | case 4: 105 | result = swar32_strstr_memcmp<4>(s, n, needle, memcmp2); 106 | break; 107 | 108 | case 5: 109 | // Note: use memcmp4 rather than memcmp3, as the last character 110 | // of needle is already proven to be equal 111 | result = swar32_strstr_memcmp<5>(s, n, needle, memcmp4); 112 | break; 113 | 114 | case 6: 115 | result = swar32_strstr_memcmp<6>(s, n, needle, memcmp4); 116 | break; 117 | 118 | case 7: 119 | result = swar32_strstr_memcmp<7>(s, n, needle, memcmp5); 120 | break; 121 | 122 | case 8: 123 | result = swar32_strstr_memcmp<8>(s, n, needle, memcmp6); 124 | break; 125 | 126 | case 9: 127 | // Note: use memcmp8 rather than memcmp7 for the same reason as above.
128 | result = swar32_strstr_memcmp<9>(s, n, needle, memcmp8); 129 | break; 130 | 131 | case 10: 132 | result = swar32_strstr_memcmp<10>(s, n, needle, memcmp8); 133 | break; 134 | 135 | case 11: 136 | result = swar32_strstr_memcmp<11>(s, n, needle, memcmp9); 137 | break; 138 | 139 | case 12: 140 | result = swar32_strstr_memcmp<12>(s, n, needle, memcmp10); 141 | break; 142 | 143 | default: 144 | result = swar32_strstr_anysize(s, n, needle, k); 145 | break; 146 | } 147 | 148 | /* a hit counts only when the whole needle fits inside s; anything else, including npos, becomes npos */ if (result <= n - k) { 149 | return result; 150 | } else { 151 | return std::string::npos; 152 | } 153 | } 154 | 155 | 156 | /* std::string overload: passes data() and size() of both arguments to the pointer+length swar32_strstr_v2. */ size_t swar32_strstr_v2(const std::string& s, const std::string& needle) { 157 | 158 | return swar32_strstr_v2(s.data(), s.size(), needle.data(), needle.size()); 159 | } 160 | -------------------------------------------------------------------------------- /swar64-strstr-v2.cpp: -------------------------------------------------------------------------------- 1 | /* SWAR search on 64-bit words: first and last hold needle[0] and needle[k - 1] replicated into every byte; a zero byte in the combined XOR of the haystack words at offsets 0 and k - 1 marks a candidate whose k - 2 middle bytes are then checked with memcmp. Returns the match offset or std::string::npos. NOTE(review): the word loads go through reinterpret_cast on s and can extend past s + n - presumably callers guarantee readable padding; confirm. */ size_t FORCE_INLINE swar64_strstr_anysize(const char* s, size_t n, const char* needle, size_t k) { 2 | 3 | assert(k > 0); 4 | assert(n > 0); 5 | 6 | const uint64_t first = 0x0101010101010101llu * static_cast(needle[0]); 7 | const uint64_t last = 0x0101010101010101llu * static_cast(needle[k - 1]); 8 | 9 | uint64_t* block_first = reinterpret_cast(const_cast(s)); 10 | uint64_t* block_last = reinterpret_cast(const_cast(s + k - 1)); 11 | 12 | // 2.
sequence scan 13 | /* size_t index: an unsigned int counter would wrap before reaching n on haystacks of 4 GiB or more */ for (size_t i=0; i < n; i+=8, block_first++, block_last++) { 14 | // 0 bytes in eq indicate matching chars 15 | const uint64_t eq = (*block_first ^ first) | (*block_last ^ last); 16 | 17 | // 7th bit set if lower 7 bits are zero 18 | const uint64_t t0 = (~eq & 0x7f7f7f7f7f7f7f7fllu) + 0x0101010101010101llu; 19 | // 7th bit set if 7th bit is zero 20 | const uint64_t t1 = (~eq & 0x8080808080808080llu); 21 | uint64_t zeros = t0 & t1; 22 | size_t j = 0; 23 | 24 | /* walk the byte lanes from the lowest up; bit 7 of the low byte flags a candidate at offset j */ while (zeros) { 25 | if (zeros & 0x80) { 26 | const char* substr = reinterpret_cast(block_first) + j + 1; 27 | if (memcmp(substr, needle + 1, k - 2) == 0) { 28 | return i + j; 29 | } 30 | } 31 | 32 | zeros >>= 8; 33 | j += 1; 34 | } 35 | } 36 | 37 | return std::string::npos; 38 | } 39 | 40 | 41 | /* Same scan for a needle length k supplied as the template argument; memcmp_fun(a, b) checks the bytes after the first character and returns true when they match. */ template 42 | size_t FORCE_INLINE swar64_strstr_memcmp(const char* s, size_t n, const char* needle, MEMCMP memcmp_fun) { 43 | 44 | assert(n > 0); 45 | 46 | const uint64_t first = 0x0101010101010101llu * static_cast(needle[0]); 47 | const uint64_t last = 0x0101010101010101llu * static_cast(needle[k - 1]); 48 | 49 | uint64_t* block_first = reinterpret_cast(const_cast(s)); 50 | uint64_t* block_last = reinterpret_cast(const_cast(s + k - 1)); 51 | 52 | // 2.
sequence scan 53 | /* size_t index: an unsigned int counter would wrap before reaching n on haystacks of 4 GiB or more */ for (size_t i=0; i < n; i+=8, block_first++, block_last++) { 54 | const uint64_t eq = (*block_first ^ first) | (*block_last ^ last); 55 | const uint64_t t0 = (~eq & 0x7f7f7f7f7f7f7f7fllu) + 0x0101010101010101llu; 56 | const uint64_t t1 = (~eq & 0x8080808080808080llu); 57 | uint64_t zeros = t0 & t1; 58 | size_t j = 0; 59 | 60 | while (zeros) { 61 | if (zeros & 0x80) { 62 | const char* substr = reinterpret_cast(block_first) + j + 1; 63 | if (memcmp_fun(substr, needle + 1)) { 64 | return i + j; 65 | } 66 | } 67 | 68 | zeros >>= 8; 69 | j += 1; 70 | } 71 | } 72 | 73 | return std::string::npos; 74 | } 75 | 76 | // ------------------------------------------------------------------------ 77 | 78 | /* Finds the first occurrence of needle[0..k) in s[0..n): returns its offset, 0 for an empty needle, and std::string::npos when n < k or nothing matches. */ size_t swar64_strstr_v2(const char* s, size_t n, const char* needle, size_t k) { 79 | 80 | size_t result = std::string::npos; 81 | 82 | if (n < k) { 83 | return result; 84 | } 85 | 86 | switch (k) { 87 | case 0: 88 | return 0; 89 | 90 | case 1: { 91 | /* memchr scans exactly the n bytes of s; strchr ignored n, stopped at the first NUL and could run past an unterminated buffer */ const char* res = reinterpret_cast(memchr(s, needle[0], n)); 92 | 93 | return (res != nullptr) ? res - s : std::string::npos; 94 | } 95 | 96 | case 2: 97 | result = swar64_strstr_memcmp<2>(s, n, needle, always_true); 98 | break; 99 | 100 | case 3: 101 | result = swar64_strstr_memcmp<3>(s, n, needle, memcmp1); 102 | break; 103 | 104 | case 4: 105 | result = swar64_strstr_memcmp<4>(s, n, needle, memcmp2); 106 | break; 107 | 108 | case 5: 109 | // Note: use memcmp4 rather than memcmp3, as the last character 110 | // of needle is already proven to be equal 111 | result = swar64_strstr_memcmp<5>(s, n, needle, memcmp4); 112 | break; 113 | 114 | case 6: 115 | result = swar64_strstr_memcmp<6>(s, n, needle, memcmp4); 116 | break; 117 | 118 | case 7: 119 | result = swar64_strstr_memcmp<7>(s, n, needle, memcmp5); 120 | break; 121 | 122 | case 8: 123 | result = swar64_strstr_memcmp<8>(s, n, needle, memcmp6); 124 | break; 125 | 126 | case 9: 127 | // Note: use memcmp8 rather than memcmp7 for the same reason as above.
128 | result = swar64_strstr_memcmp<9>(s, n, needle, memcmp8); 129 | break; 130 | 131 | case 10: 132 | result = swar64_strstr_memcmp<10>(s, n, needle, memcmp8); 133 | break; 134 | 135 | case 11: 136 | result = swar64_strstr_memcmp<11>(s, n, needle, memcmp9); 137 | break; 138 | 139 | case 12: 140 | result = swar64_strstr_memcmp<12>(s, n, needle, memcmp10); 141 | break; 142 | 143 | default: 144 | result = swar64_strstr_anysize(s, n, needle, k); 145 | break; 146 | } 147 | 148 | /* a hit counts only when the whole needle fits inside s; anything else, including npos, becomes npos */ if (result <= n - k) { 149 | return result; 150 | } else { 151 | return std::string::npos; 152 | } 153 | } 154 | 155 | 156 | /* std::string overload: passes data() and size() of both arguments to the pointer+length swar64_strstr_v2. */ size_t swar64_strstr_v2(const std::string& s, const std::string& needle) { 157 | 158 | return swar64_strstr_v2(s.data(), s.size(), needle.data(), needle.size()); 159 | } 160 | -------------------------------------------------------------------------------- /utils/ansi.cpp: -------------------------------------------------------------------------------- 1 | namespace ansi { 2 | 3 | /* numeric colour codes inserted into the escape sequence built by seq() */ const int RED = 31; 4 | const int GREEN = 32; 5 | const int WHITE = 37; 6 | 7 | /* Returns str wrapped as ESC[<color>m ... ESC[0m. */ std::string seq(const std::string& str, int color) { 8 | 9 | return "\033[" + std::to_string(color) + "m" + str + "\033[0m"; 10 | } 11 | 12 | } // namespace ansi 13 | 14 | -------------------------------------------------------------------------------- /utils/avx2.cpp: -------------------------------------------------------------------------------- 1 | namespace avx2 { 2 | 3 | /* overlays a 256-bit vector with byte and 16-bit word arrays so the dump helpers can print its lanes */ union proxy { 4 | __m256i vec; 5 | uint8_t u8[32]; 6 | uint16_t u16[16]; 7 | }; 8 | 9 | 10 | namespace dump { 11 | 12 | /* prints the 16 16-bit lanes as 4-digit hex values followed by a newline */ void epu16(const __m256i vec) { 13 | 14 | proxy p; 15 | p.vec = vec; 16 | 17 | for (int i=0; i < 16; i++) { 18 | printf("%04x ", p.u16[i]); 19 | } 20 | 21 | putchar('\n'); 22 | } 23 | 24 | /* prints the 32 bytes as 2-digit hex values between single quotes, followed by a newline */ void epu8(const __m256i vec) { 25 | 26 | proxy p; 27 | p.vec = vec; 28 | 29 | putchar('\''); 30 | for (int i=0; i < 32; i++) { 31 | printf("%02x ", p.u8[i]); 32 | } 33 | 34 | putchar('\''); 35 | putchar('\n'); 36 | } 37 | 38 | } // namespace
dump 39 | 40 | } // namespace avx2 41 | -------------------------------------------------------------------------------- /utils/avx512.cpp: -------------------------------------------------------------------------------- 1 | namespace avx512 { 2 | 3 | /* overlays a 512-bit vector with byte and 16-bit word arrays so the dump helpers can print its lanes */ union proxy { 4 | __m512i vec; 5 | uint8_t u8[64]; 6 | uint16_t u16[32]; 7 | }; 8 | 9 | 10 | namespace dump { 11 | 12 | /* prints the 32 16-bit lanes as 4-digit hex values followed by a newline */ void epu16(const __m512i vec) { 13 | 14 | proxy p; 15 | p.vec = vec; 16 | 17 | for (int i=0; i < 32; i++) { 18 | printf("%04x ", p.u16[i]); 19 | } 20 | 21 | putchar('\n'); 22 | } 23 | 24 | /* prints the 64 bytes as 2-digit hex values between single quotes, followed by a newline */ void epu8(const __m512i vec) { 25 | 26 | proxy p; 27 | p.vec = vec; 28 | 29 | putchar('\''); 30 | for (int i=0; i < 64; i++) { 31 | printf("%02x ", p.u8[i]); 32 | } 33 | 34 | putchar('\''); 35 | putchar('\n'); 36 | } 37 | 38 | } // namespace dump 39 | 40 | } // namespace avx512 41 | -------------------------------------------------------------------------------- /utils/bits.cpp: -------------------------------------------------------------------------------- 1 | 2 | namespace bits { 3 | 4 | /* Returns value with its lowest set bit cleared (value & (value - 1)); despite the name this is the least significant set bit, the same one get_first_bit_set reports. value must be non-zero. */ template 5 | T clear_leftmost_set(const T value) { 6 | 7 | assert(value != 0); 8 | 9 | return value & (value - 1); 10 | } 11 | 12 | 13 | /* Returns the index of the lowest set bit of a non-zero value, computed with the unsigned int builtin. */ template 14 | unsigned get_first_bit_set(const T value) { 15 | 16 | assert(value != 0); 17 | 18 | return __builtin_ctz(value); 19 | } 20 | 21 | 22 | /* 64-bit specialization: __builtin_ctzll takes unsigned long long, so all 64 bits are examined even where long is only 32 bits wide. */ template <> 23 | unsigned get_first_bit_set(const uint64_t value) { 24 | 25 | assert(value != 0); 26 | 27 | return __builtin_ctzll(value); 28 | } 29 | 30 | } // namespace bits 31 | -------------------------------------------------------------------------------- /utils/neon.cpp: -------------------------------------------------------------------------------- 1 | namespace neon { 2 | 3 | namespace dump { 4 | 5 | /* stores the 16 bytes of vec to a local array and prints them as 2-digit hex values between single quotes, followed by a newline */ void epu8(const uint8x16_t vec) { 6 | 7 | uint8_t p[16]; 8 | vst1q_u8(p, vec); 9 | 10 | putchar('\''); 11 | for (int i=0; i < 16; i++) { 12 | printf("%02x ", p[i]); 13 | } 14 | 15 | putchar('\''); 16 | putchar('\n'); 17 | } 18 | 19 | void
epu8(const uint8x8_t vec) { 20 | 21 | uint8_t p[8]; 22 | vst1_u8(p, vec); 23 | 24 | putchar('\''); 25 | for (int i=0; i < 8; i++) { 26 | printf("%02x ", p[i]); 27 | } 28 | 29 | putchar('\''); 30 | putchar('\n'); 31 | } 32 | 33 | } // namespace dump 34 | 35 | } // namespace neon 36 | -------------------------------------------------------------------------------- /utils/sse.cpp: -------------------------------------------------------------------------------- 1 | namespace sse { 2 | 3 | /* unaligned 128-bit load from ptr */ template 4 | __m128i load(T ptr) { 5 | 6 | return _mm_loadu_si128(reinterpret_cast(ptr)); 7 | } 8 | 9 | /* Returns a vector whose first n bytes are 0xff and whose remaining bytes are 0x00, read from a 32-byte table at offset 16 - n. */ __m128i mask_lower_bytes(size_t n) { 10 | 11 | // assert(n < 16) 12 | 13 | static const uint8_t mask[32] = { 14 | 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 15 | 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 16 | 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 17 | 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 18 | }; 19 | 20 | return load(mask + 16 - n); 21 | } 22 | 23 | /* Returns a vector whose first n bytes are 0x00 and whose remaining 16 - n bytes are 0xff, read from a 32-byte table at offset 16 - n. */ __m128i mask_higher_bytes(size_t n) { 24 | 25 | // assert(n < 16) 26 | 27 | static const uint8_t mask[32] = { 28 | 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 29 | 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 30 | 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 31 | 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 32 | }; 33 | 34 | return load(mask + 16 - n); 35 | } 36 | 37 | 38 | /* overlays a 128-bit vector with byte and 16-bit word arrays so the dump helpers can print its lanes */ union proxy { 39 | __m128i vec; 40 | uint8_t u8[16]; 41 | uint16_t u16[8]; 42 | }; 43 | 44 | 45 | namespace dump { 46 | 47 | /* prints the 8 16-bit lanes as 4-digit hex values followed by a newline */ void epu16(const __m128i vec) { 48 | 49 | proxy p; 50 | p.vec = vec; 51 | 52 | for (int i=0; i < 8; i++) { 53 | printf("%04x ", p.u16[i]); 54 | } 55 | 56 | putchar('\n'); 57 | } 58 | 59 | /* prints the 16 bytes as 2-digit hex values between single quotes, followed by a newline */ void epu8(const __m128i vec) { 60 | 61 | proxy p; 62 | p.vec = vec; 63 | 64 | putchar('\''); 65 | for (int i=0; i < 16; i++) { 66 | printf("%02x ", p.u8[i]); 67 | } 68 | 69 | putchar('\''); 70 | putchar('\n'); 71 | } 72 | 73 | } // namespace dump 74 | 75 | } // namespace sse 76 |
--------------------------------------------------------------------------------