├── swar.h ├── compiler.h ├── LICENSE ├── README.md ├── test ├── swar_bench.cpp └── swar_test.cpp ├── swar_fwd.h └── swar_inl.h /swar.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "swar_fwd.h" 3 | #include "swar_inl.h" 4 | -------------------------------------------------------------------------------- /compiler.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #if defined(__linux__) && defined(__GNUC__) 4 | #define CODE_SECTION __attribute__ ((section (".text#"))) 5 | #else 6 | #define CODE_SECTION 7 | #endif 8 | 9 | #if defined(__GNUC__) 10 | #define likely(X) __builtin_expect(!!(X), 1) 11 | #define unlikely(X) __builtin_expect(!!(X), 0) 12 | #else 13 | #define likely(X) X 14 | #define unlikely(X) X 15 | #endif 16 | 17 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 yb303 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # SWAR 2 | Low level, branch-free functions for number and string conversion and other utils. 3 | 4 | SWAR stands for SIMD within a register. This means treating parts of a uint64_t as individual uint8_t's, uint16_t's, or uint32_t's. 5 | 6 | This library is header-only. There is nothing to build. 7 | 8 | Most functions have different variants optimized for limited lengths (8 and 4), and printable input (ascii < 128), and more 9 | 10 | ### Language and build 11 | 12 | I'm used to C++17 so I used `if constexpr`, but the rest of the code is C++03 compatible, and can easily be converted to C.
13 | Include `swar.h` and build with -std=c++17
14 | For forward declarations only, include `swar_fwd.h` instead. 15 | 16 | ### Test and benchmark 17 | 18 | The test dir includes: 19 | - `swar_test.cpp` that is using google-test for unit testing. (**TODO** create a `build: passing` badge)
20 | - `swar_bench.cpp` that produces the numbers for the graph below.
21 | 22 | ### Performance 23 | 24 | Comparison of various atoi implementations (actually a-to-ull) on my home machine: 25 |
26 | Machine and env spec: i5-3470, Cygwin on Windows 10 64 bit, g++ 9.3.0 27 | 28 | The not-perfectly-straight lines, in SWAR's performance, are just measurement artifacts. In *swar8*, for example, it's the same instructions executed for every input, so there should be no difference. 29 | You can see how SWAR is faster than the naive impl and is fixed cost per word. The stock implementation is surprisingly slow. I don't know why as I didn't read its code yet.
30 | 31 | Functions with 8, or 4, suffix are branchless and faster (see *swar8* vs *swar*). Functions with longer input must have a branch per word.
32 | An SSE implementation can follow the same ideas as here for longer inputs. However, using SSE instructions may switch some processors to a different P-state, if the BIOS allows, and the switching itself can take a few hundred cycles. 33 | 34 | Branchless code is not always faster than branched code.
35 | Benchmarks are typically less impacted by branch mispredictions than real-world applications. This also applies to my benchmark. I did not take special care to pollute the branch-predictor (BP) caches before each function call, as this would make each call harder to measure.
36 | Loops may be fully predicted, especially if BP caches are all working for the benchmark. However, in a real world app, using branchless low level code means that BP caches have more room for the application logic so the app as a whole may become faster. The only way to know for sure is to test within the app. 37 | 38 | These performance characteristics are the same for strlen and similar functions. 39 | 40 | ### Functions 41 | All functions come in a few variants: 42 | * memchr and memrchr 43 | * strlen 44 | * atoi, htoi (hex string to int), atod 45 | * itoa 46 | * hasbyte - does word include a certain byte? 47 | 48 | ### Supported operating systems 49 | * Linux 50 | * Cygwin 51 | 52 | ### Supported programming languages 53 | * C++ 54 | 55 | ### Supported compilers 56 | * g++ 57 | 58 | ### Supported architectures 59 | * x86_64 60 | 61 | -------------------------------------------------------------------------------- /test/swar_bench.cpp: -------------------------------------------------------------------------------- 1 | #include "../swar.h" 2 | 3 | #include 4 | #include 5 | #include 6 | 7 | #include 8 | #include 9 | 10 | inline int64_t rdtsc() { 11 | union { 12 | struct { uint32_t lo, hi; }; 13 | int64_t ts; 14 | } u; 15 | 16 | asm volatile("rdtsc" : "=a"(u.lo), "=d"(u.hi) : : "memory"); 17 | return u.ts; 18 | } 19 | 20 | inline naive_atoull(const char* p, int n) { 21 | uint64_t ret = 0; 22 | for (int i = 0; i < n; i++) 23 | ret = ret * 10 + p[i] - '0'; 24 | return ret; 25 | } 26 | 27 | void acc(uint64_t& dst, uint64_t src) 28 | { 29 | if (dst == 0) 30 | dst = src; 31 | else if (src < dst) 32 | dst = src; 33 | } 34 | 35 | int main(int argc, char* argv[]) { 36 | (void)argc; 37 | (void)argv; 38 | 39 | int test_size = 10000; 40 | int test_repetitions = 10; 41 | 42 | for (int i = 1; i < argc; i++) { 43 | if (strcmp(argv[i], "-n") == 0) { 44 | test_size = atoi(argv[++i]); 45 | } 46 | else if (strcmp(argv[i], "-r") == 0) { 47 | test_repetitions = atoi(argv[++i]); 
48 | } 49 | } 50 | 51 | // Generate a long string of random numbers for atoi 52 | std::vector> v(21); 53 | std::mt19937_64 mt(rdtsc()); 54 | uint64_t mask = 1; 55 | for (int len = 1; len < 21; len++) { 56 | v[len].resize(test_size * (len + 1)); 57 | mask *= 10; 58 | int total_len = 0; 59 | char* buf = v[len].data(); 60 | for (int i = 1; i < test_size; i++) { 61 | sprintf(buf + total_len, "%0lu", mt() % mask); 62 | total_len += len + 1; 63 | } 64 | } 65 | 66 | uint64_t junk = 0; 67 | std::vector dt_no_op(21); 68 | std::vector dt_stock(21); 69 | std::vector dt_naive(21); 70 | std::vector dt_swar_(21); 71 | std::vector dt_swarX(21); 72 | std::vector dt_swar8(21); 73 | std::vector dt_swar4(21); 74 | 75 | for (int r = 0; r < test_repetitions; r++) { 76 | for (int len = 1; len < 21; len++) { 77 | char* buf = v[len].data(); 78 | 79 | // Test no-op 80 | uint64_t t0 = rdtsc(); 81 | int total_len = 0; 82 | for (int i = 0; i < test_size; i++ ) { 83 | junk += buf[total_len]; 84 | total_len += len + 1; 85 | } 86 | 87 | // Test stock atoull 88 | uint64_t t1 = rdtsc(); 89 | total_len = 0; 90 | for (int i = 0; i < test_size; i++ ) { 91 | junk += atoll(buf + total_len); 92 | total_len += len + 1; 93 | } 94 | 95 | // Test naive atoull 96 | uint64_t t2 = rdtsc(); 97 | total_len = 0; 98 | for (int i = 0; i < test_size; i++ ) { 99 | junk += naive_atoull(buf + total_len, len); 100 | total_len += len + 1; 101 | } 102 | 103 | // Test swar atou 104 | uint64_t t3 = rdtsc(); 105 | total_len = 0; 106 | for (int i = 0; i < test_size; i++ ) { 107 | junk += swar::atou(buf + total_len, len); 108 | total_len += len + 1; 109 | } 110 | 111 | // Test swar atou8 112 | uint64_t t4 = rdtsc(); 113 | int len8 = len <= 8 ? len : 8; 114 | total_len = 0; 115 | for (int i = 0; i < test_size; i++ ) { 116 | junk += swar::atou8(buf + total_len, len8); 117 | total_len += len + 1; 118 | } 119 | 120 | // Test swar atou4 121 | uint64_t t5 = rdtsc(); 122 | int len4 = len <= 4 ? 
len : 4; 123 | total_len = 0; 124 | for (int i = 0; i < test_size; i++ ) { 125 | junk += swar::atou4(buf + total_len, len4); 126 | total_len += len + 1; 127 | } 128 | 129 | uint64_t t6 = rdtsc(); 130 | acc(dt_no_op[len], t1 - t0); 131 | acc(dt_stock[len], t2 - t1); 132 | acc(dt_naive[len], t3 - t2); 133 | acc(dt_swar_[len], t4 - t3); 134 | acc(dt_swar8[len], t5 - t4); 135 | acc(dt_swar4[len], t6 - t5); 136 | } 137 | } 138 | 139 | printf("%d%c", uint32_t(junk) % 10, 8); 140 | printf("len %7s %7s %7s %7s %7s\n", 141 | "stock", "naive", "swar", "swar8", "swar4"); 142 | double f = 1.0 / test_size; 143 | for (int len = 1; len < 21; len++) { 144 | double tf_stock = (dt_stock[len] - dt_no_op[len]) * f; 145 | double tf_naive = (dt_naive[len] - dt_no_op[len]) * f; 146 | double tf_swar_ = (dt_swar_[len] - dt_no_op[len]) * f; 147 | double tf_swar8 = (dt_swar8[len] - dt_no_op[len]) * f; 148 | double tf_swar4 = (dt_swar4[len] - dt_no_op[len]) * f; 149 | printf("%3d %7.1f %7.1f %7.1f %7.1f %7.1f\n", 150 | len, tf_stock, tf_naive, tf_swar_, tf_swar8, tf_swar4); 151 | } 152 | 153 | return 0; 154 | } 155 | 156 | 157 | -------------------------------------------------------------------------------- /swar_fwd.h: -------------------------------------------------------------------------------- 1 | #include "compiler.h" 2 | 3 | #include 4 | #include // for memcpy, memset 5 | #include 6 | 7 | namespace swar { 8 | 9 | // 10 | // Function naming convention : 11 | // Length 12 | // 8 means input is 8 bytes 13 | // 4 means input is 4 bytes 14 | // None means input is any length 15 | // Prefix 16 | // p Printable. Ascii 0 to 127. 17 | // pmemchr vs memchr is slighly optimized for printable input 18 | // _ Mostly internal use 19 | // Suffix 20 | // k Haystack is known to contain needle 21 | // _nc Non-const, modifiable, input 22 | // These functions modify and restore the input. Not thread safe 23 | // 24 | // Performance 25 | // Functions with 8 suffix are branchless. 
Functions with longer input must 26 | // have a branch per word. This can be improved with SSE. 27 | // - SSE may switch some processors to a different P-state, if the BIOS allows, 28 | // and the switching itself can take a few hundred cycles 29 | // - Branchless code is not always faster than branched code. 30 | // Lab tests are typically less impacted by branch mispredictions than real 31 | // world applications. 32 | // Loops may be fully predicted, if BP caches are all working for the test. 33 | // However, in a real world app, using branchless low level code means that BP 34 | // caches have more room for the application logic so the app as a whole may 35 | // become faster. 36 | // The only way to know for sure is to test within the app. 37 | // 38 | 39 | // 40 | // Utils 41 | // 42 | 43 | // Cast char* to T using memcpy. memcpy is optimized away on x86 44 | template inline T cast(const char* src); 45 | 46 | // Get the uint _cast_ of a string of up to 8 chars 47 | inline uint64_t cast8(const char* s, uint32_t len); 48 | 49 | // Fill T with c's 50 | template inline T extend(char c); 51 | 52 | // swap bytes 53 | inline uint64_t bswap(uint64_t x); 54 | inline uint32_t bswap(uint32_t x); 55 | inline uint16_t bswap(uint16_t x); 56 | 57 | // 58 | // Find byte in word 59 | // 60 | 61 | // Check if word has zero byte 62 | inline bool haszero(uint64_t x); 63 | 64 | // Check if word has some byte 65 | inline bool hasbyte(uint64_t x, uint8_t c); 66 | 67 | // Find char in string. Support all options.
68 | template 69 | inline uint32_t _memchr8(const char* s, uint8_t c); 70 | 71 | // Find char in string and trim it 72 | template 73 | inline uint32_t _trim8(const char* s, uint8_t c); 74 | 75 | // Find char in printable (chars < 128) string of 8 chars 76 | inline uint32_t pmemchr8(const char* s, uint8_t c); 77 | 78 | // Find char in printable (chars < 128) string of 8 chars 79 | // * The string is known to contain the char 80 | inline uint32_t pmemchr8k(const char* s, uint8_t c); 81 | 82 | // Find char in binary string of 8 chars 83 | inline uint32_t memchr8(const char* s, uint8_t c); 84 | 85 | // Find char in binary string of 8 chars 86 | // * The string is known to contain the char 87 | inline uint32_t memchr8k(const char* s, uint8_t c); 88 | 89 | // 90 | // Strlen variants 91 | // 92 | 93 | // Find zero byte in binary string up to 8 chars 94 | inline uint32_t strlen8(const char* s); 95 | 96 | // Find zero byte in printable string up to 8 chars 97 | inline uint32_t pstrlen8(const char* s); 98 | 99 | // Find zero byte in binary string 100 | inline uint32_t strlen(const char* s); 101 | 102 | // Find zero byte in printable string 103 | inline uint32_t pstrlen(const char* s); 104 | 105 | // 106 | // Find byte in word - reverse 107 | // 108 | 109 | // Find char in printable (chars < 128) string of 8 chars 110 | inline uint32_t pmemrchr8(const char* s, uint8_t c); 111 | 112 | // Find char in printable (chars < 128) string of 8 chars 113 | // * The string is known to contain the char 114 | inline uint32_t pmemrchr8k(const char* s, uint8_t c); 115 | 116 | // Find char in binary string of 8 chars 117 | inline uint32_t memrchr8(const char* s, uint8_t c); 118 | 119 | // Find char in binary string of 8 chars 120 | // * The string is known to contain the char 121 | inline uint32_t memrchr8k(const char* s, uint8_t c); 122 | 123 | // 124 | // Find byte in const string. 
Like memchr 125 | // 126 | 127 | // Find char in const binary string 128 | template 129 | inline uint32_t _memchr(const char* s, uint32_t len, uint8_t c); 130 | 131 | // Find char in binary string 132 | inline uint32_t memchr(const char* s, uint32_t len, uint8_t c); 133 | 134 | // Find char in binary string. Char c is known to be in s + len 135 | inline uint32_t memchrk(const char* s, uint32_t len, uint8_t c); 136 | 137 | // Find char in printable string 138 | inline uint32_t pmemchr(const char* s, uint32_t len, uint8_t c); 139 | 140 | // Find char in printable string. Char c is known to be in s + len 141 | inline uint32_t pmemchrk(const char* s, uint32_t len, uint8_t c); 142 | 143 | // 144 | // Find byte, from end, in const string. Like memrchr 145 | // 146 | 147 | // Find char, in reverse, in const binary string 148 | template 149 | inline uint32_t _memrchr(const char* s, uint32_t len, uint8_t c); 150 | 151 | // Find char in binary string 152 | inline uint32_t memrchr(const char* s, uint32_t len, uint8_t c); 153 | 154 | // Find char in binary string. Char c is known to be in s + len 155 | inline uint32_t memrchrk(const char* s, uint32_t len, uint8_t c); 156 | 157 | // Find char in printable string 158 | inline uint32_t pmemrchr(const char* s, uint32_t len, uint8_t c); 159 | 160 | // Find char in printable string. 
Char c is known to be in s + len 161 | inline uint32_t pmemrchrk(const char* s, uint32_t len, uint8_t c); 162 | 163 | // 164 | // Find byte in NON-CONST string 165 | // 166 | 167 | template 168 | inline uint32_t _memchr_nc(char* s, uint32_t len, uint8_t c); 169 | 170 | // Find char in binary NON-CONST string 171 | inline uint32_t memchr_nc(char* s, uint32_t len, uint8_t c); 172 | 173 | // Find char in printable NON-CONST string 174 | inline uint32_t pmemchr_nc(char* s, uint32_t len, uint8_t c); 175 | 176 | //// string to int 177 | 178 | // Parse uint from string of up to 4 chars 179 | inline uint16_t atou4(const char* s, uint32_t len); 180 | 181 | // Parse uint from string of up to 8 chars 182 | inline uint32_t atou8(const char* s, uint32_t len); 183 | 184 | // Parse uint64_t from string of up to 20 chars 185 | // *** More than 20 char returns junk. 186 | inline uint64_t atou(const char* s, uint32_t len); 187 | 188 | // Parse _signed_ int from string of up to 20 chars. No spaces 189 | inline int64_t atoi(const char* s, uint32_t len); 190 | 191 | // Parse hex int from string of up to 8 chars 192 | inline uint32_t htou8(const char* s, uint32_t len); 193 | 194 | // Parse hex int from string of up to 16 chars 195 | inline uint64_t htou(const char* s, uint32_t len); 196 | 197 | //// int to string 198 | 199 | // *** p suffix means zero-padded 200 | 201 | // Convert uint, of less than 100, to %02u, as int 16 202 | inline uint16_t utoa2p(uint64_t x); 203 | 204 | // Convert uint, of less than 100, to %02u 205 | inline void utoa2p(uint64_t x, char* s); 206 | 207 | // Convert uint to %0u, N <= 8 208 | template 209 | inline uint64_t _utoap(uint64_t x, char* s); 210 | 211 | // Convert uint to %0u, N <= 20 212 | template 213 | inline char* utoap(uint64_t x, char* s); 214 | 215 | // Convert signed int 32 to string of up to 8 bytes. 216 | inline uint32_t itoa8(int32_t x, char* buf); 217 | 218 | // Convert signed int 64 to string. String buffer is at least 22 bytes. 
219 | // Returns length 220 | // *** this feels inefficient :( *** 221 | inline uint32_t itoa(int64_t x, char* buf); 222 | 223 | //// Double to string 224 | 225 | // Copy the sign from src to dst that is unsigned. 226 | // *** dst is up to 63 bit 227 | inline int64_t _copySign(int64_t src, uint64_t dst); 228 | 229 | // Parse double from string 230 | // *** More than 20 char integer part returns junk. 231 | // *** Too much decimal char will get lost to precision 232 | inline double atod(const char* s, uint32_t len); 233 | 234 | } // namespace swar 235 | -------------------------------------------------------------------------------- /test/swar_test.cpp: -------------------------------------------------------------------------------- 1 | #include "../swar.h" 2 | #include 3 | #include 4 | #include 5 | 6 | 7 | TEST(r8, memchr) { 8 | EXPECT_EQ(swar::memchr8("12345678=90", '='), -1); 9 | EXPECT_EQ(swar::memchr8("1234567=890", '='), 7); 10 | EXPECT_EQ(swar::memchr8("123456=7890", '='), 6); 11 | EXPECT_EQ(swar::memchr8("12345=67890", '='), 5); 12 | EXPECT_EQ(swar::memchr8("1234=567890", '='), 4); 13 | EXPECT_EQ(swar::memchr8("123=4567890", '='), 3); 14 | EXPECT_EQ(swar::memchr8("12=34567890", '='), 2); 15 | EXPECT_EQ(swar::memchr8("1=234567890", '='), 1); 16 | EXPECT_EQ(swar::memchr8("=1234567890", '='), 0); 17 | 18 | EXPECT_EQ(swar::memchr8("1234=", '='), 4); 19 | EXPECT_EQ(swar::memchr8("123=4", '='), 3); 20 | EXPECT_EQ(swar::memchr8("12=34", '='), 2); 21 | EXPECT_EQ(swar::memchr8("1=234", '='), 1); 22 | EXPECT_EQ(swar::memchr8("=1234", '='), 0); 23 | 24 | EXPECT_EQ(swar::memchr8("===", '='), 0); 25 | EXPECT_EQ(swar::memchr8("==", '='), 0); 26 | EXPECT_EQ(swar::memchr8("=", '='), 0); 27 | 28 | // 123456789 123456789 123456789 29 | EXPECT_EQ(swar::memchr("1234567890abcdefghij=", 20, '='), -1); 30 | EXPECT_EQ(swar::memchr("12345678=90abcdefghi", 20, '='), 8); 31 | EXPECT_EQ(swar::memchr("1234=567890abcdefghi", 20, '='), 4); 32 | 33 | char nc[24] = 
"1234567890abcdefghij=12"; 34 | EXPECT_EQ(swar::memchr_nc(nc, 20, '='), -1); 35 | EXPECT_EQ(swar::memchr_nc(nc, 21, '='), 20); 36 | EXPECT_EQ(swar::memchr_nc(nc, 23, '='), 20); 37 | 38 | } 39 | 40 | TEST(r8, cast8) { 41 | EXPECT_EQ(swar::cast8("1234567890", 0), 0); 42 | EXPECT_EQ(swar::cast8("1234567890", 1), 0x31ull); 43 | EXPECT_EQ(swar::cast8("1234567890", 2), 0x3231ull); 44 | EXPECT_EQ(swar::cast8("1234567890", 3), 0x333231ull); 45 | EXPECT_EQ(swar::cast8("1234567890", 4), 0x34333231ull); 46 | EXPECT_EQ(swar::cast8("1234567890", 5), 0x3534333231ull); 47 | EXPECT_EQ(swar::cast8("1234567890", 6), 0x363534333231ull); 48 | EXPECT_EQ(swar::cast8("1234567890", 7), 0x37363534333231ull); 49 | EXPECT_EQ(swar::cast8("1234567890", 8), 0x3837363534333231ull); 50 | } 51 | 52 | TEST(r8, atoi) { 53 | // short 54 | EXPECT_EQ(swar::atou4("1234567890", 0), 0); 55 | EXPECT_EQ(swar::atou4("1234567890", 1), 1); 56 | EXPECT_EQ(swar::atou4("1234567890", 2), 12); 57 | EXPECT_EQ(swar::atou4("1234567890", 3), 123); 58 | EXPECT_EQ(swar::atou4("1234567890", 4), 1234); 59 | 60 | EXPECT_EQ(swar::atou8("1234567890", 0), 0); 61 | EXPECT_EQ(swar::atou8("1234567890", 1), 1); 62 | EXPECT_EQ(swar::atou8("1234567890", 2), 12); 63 | EXPECT_EQ(swar::atou8("1234567890", 3), 123); 64 | EXPECT_EQ(swar::atou8("1234567890", 4), 1234); 65 | EXPECT_EQ(swar::atou8("1234567890", 5), 12345); 66 | EXPECT_EQ(swar::atou8("1234567890", 6), 123456); 67 | EXPECT_EQ(swar::atou8("1234567890", 7), 1234567); 68 | EXPECT_EQ(swar::atou8("1234567890", 8), 12345678); 69 | 70 | // long 71 | EXPECT_EQ(swar::atou("12345678901234567890", 0), 0ull); 72 | EXPECT_EQ(swar::atou("12345678901234567890", 1), 1ull); 73 | EXPECT_EQ(swar::atou("12345678901234567890", 2), 12ull); 74 | EXPECT_EQ(swar::atou("12345678901234567890", 3), 123ull); 75 | EXPECT_EQ(swar::atou("12345678901234567890", 4), 1234ull); 76 | EXPECT_EQ(swar::atou("12345678901234567890", 5), 12345ull); 77 | EXPECT_EQ(swar::atou("12345678901234567890", 6), 123456ull); 78 | 
EXPECT_EQ(swar::atou("12345678901234567890", 7), 1234567ull); 79 | EXPECT_EQ(swar::atou("12345678901234567890", 8), 12345678ull); 80 | EXPECT_EQ(swar::atou("12345678901234567890", 9), 123456789ull); 81 | EXPECT_EQ(swar::atou("12345678901234567890", 10), 1234567890ull); 82 | EXPECT_EQ(swar::atou("12345678901234567890", 11), 12345678901ull); 83 | EXPECT_EQ(swar::atou("12345678901234567890", 12), 123456789012ull); 84 | EXPECT_EQ(swar::atou("12345678901234567890", 13), 1234567890123ull); 85 | EXPECT_EQ(swar::atou("12345678901234567890", 14), 12345678901234ull); 86 | EXPECT_EQ(swar::atou("12345678901234567890", 15), 123456789012345ull); 87 | EXPECT_EQ(swar::atou("12345678901234567890", 16), 1234567890123456ull); 88 | EXPECT_EQ(swar::atou("12345678901234567890", 17), 12345678901234567ull); 89 | EXPECT_EQ(swar::atou("12345678901234567890", 18), 123456789012345678ull); 90 | EXPECT_EQ(swar::atou("12345678901234567890", 19), 1234567890123456789ull); 91 | EXPECT_EQ(swar::atou("12345678901234567890", 20), 12345678901234567890ull); 92 | 93 | // long signed w/o sign 94 | EXPECT_EQ(swar::atoi("12345678901234567890", 0), 0); 95 | EXPECT_EQ(swar::atoi("12345678901234567890", 1), 1ll); 96 | EXPECT_EQ(swar::atoi("12345678901234567890", 2), 12ll); 97 | EXPECT_EQ(swar::atoi("12345678901234567890", 3), 123ll); 98 | EXPECT_EQ(swar::atoi("12345678901234567890", 4), 1234ll); 99 | EXPECT_EQ(swar::atoi("12345678901234567890", 5), 12345ll); 100 | EXPECT_EQ(swar::atoi("12345678901234567890", 6), 123456ll); 101 | EXPECT_EQ(swar::atoi("12345678901234567890", 7), 1234567ll); 102 | EXPECT_EQ(swar::atoi("12345678901234567890", 8), 12345678ll); 103 | EXPECT_EQ(swar::atoi("12345678901234567890", 9), 123456789ll); 104 | EXPECT_EQ(swar::atoi("12345678901234567890", 10), 1234567890ll); 105 | EXPECT_EQ(swar::atoi("12345678901234567890", 11), 12345678901ll); 106 | EXPECT_EQ(swar::atoi("12345678901234567890", 12), 123456789012ll); 107 | EXPECT_EQ(swar::atoi("12345678901234567890", 13), 1234567890123ll); 
108 | EXPECT_EQ(swar::atoi("12345678901234567890", 14), 12345678901234ll); 109 | EXPECT_EQ(swar::atoi("12345678901234567890", 15), 123456789012345ll); 110 | EXPECT_EQ(swar::atoi("12345678901234567890", 16), 1234567890123456ll); 111 | EXPECT_EQ(swar::atoi("12345678901234567890", 17), 12345678901234567ll); 112 | EXPECT_EQ(swar::atoi("12345678901234567890", 18), 123456789012345678ll); 113 | EXPECT_EQ(swar::atoi("12345678901234567890", 19), 1234567890123456789ll); 114 | 115 | // long signed + 116 | EXPECT_EQ(swar::atoi("+12345678901234567890", 2), 1ll); 117 | EXPECT_EQ(swar::atoi("+12345678901234567890", 3), 12ll); 118 | EXPECT_EQ(swar::atoi("+12345678901234567890", 4), 123ll); 119 | EXPECT_EQ(swar::atoi("+12345678901234567890", 5), 1234ll); 120 | EXPECT_EQ(swar::atoi("+12345678901234567890", 6), 12345ll); 121 | EXPECT_EQ(swar::atoi("+12345678901234567890", 7), 123456ll); 122 | EXPECT_EQ(swar::atoi("+12345678901234567890", 8), 1234567ll); 123 | EXPECT_EQ(swar::atoi("+12345678901234567890", 9), 12345678ll); 124 | EXPECT_EQ(swar::atoi("+12345678901234567890", 10), 123456789ll); 125 | EXPECT_EQ(swar::atoi("+12345678901234567890", 11), 1234567890ll); 126 | EXPECT_EQ(swar::atoi("+12345678901234567890", 12), 12345678901ll); 127 | EXPECT_EQ(swar::atoi("+12345678901234567890", 13), 123456789012ll); 128 | EXPECT_EQ(swar::atoi("+12345678901234567890", 14), 1234567890123ll); 129 | EXPECT_EQ(swar::atoi("+12345678901234567890", 15), 12345678901234ll); 130 | EXPECT_EQ(swar::atoi("+12345678901234567890", 16), 123456789012345ll); 131 | EXPECT_EQ(swar::atoi("+12345678901234567890", 17), 1234567890123456ll); 132 | EXPECT_EQ(swar::atoi("+12345678901234567890", 18), 12345678901234567ll); 133 | EXPECT_EQ(swar::atoi("+12345678901234567890", 19), 123456789012345678ll); 134 | EXPECT_EQ(swar::atoi("+12345678901234567890", 20), 1234567890123456789ll); 135 | 136 | // long signed - 137 | EXPECT_EQ(swar::atoi("-12345678901234567890", 2), -1ll); 138 | EXPECT_EQ(swar::atoi("-12345678901234567890", 
3), -12ll); 139 | EXPECT_EQ(swar::atoi("-12345678901234567890", 4), -123ll); 140 | EXPECT_EQ(swar::atoi("-12345678901234567890", 5), -1234ll); 141 | EXPECT_EQ(swar::atoi("-12345678901234567890", 6), -12345ll); 142 | EXPECT_EQ(swar::atoi("-12345678901234567890", 7), -123456ll); 143 | EXPECT_EQ(swar::atoi("-12345678901234567890", 8), -1234567ll); 144 | EXPECT_EQ(swar::atoi("-12345678901234567890", 9), -12345678ll); 145 | EXPECT_EQ(swar::atoi("-12345678901234567890", 10), -123456789ll); 146 | EXPECT_EQ(swar::atoi("-12345678901234567890", 11), -1234567890ll); 147 | EXPECT_EQ(swar::atoi("-12345678901234567890", 12), -12345678901ll); 148 | EXPECT_EQ(swar::atoi("-12345678901234567890", 13), -123456789012ll); 149 | EXPECT_EQ(swar::atoi("-12345678901234567890", 14), -1234567890123ll); 150 | EXPECT_EQ(swar::atoi("-12345678901234567890", 15), -12345678901234ll); 151 | EXPECT_EQ(swar::atoi("-12345678901234567890", 16), -123456789012345ll); 152 | EXPECT_EQ(swar::atoi("-12345678901234567890", 17), -1234567890123456ll); 153 | EXPECT_EQ(swar::atoi("-12345678901234567890", 18), -12345678901234567ll); 154 | EXPECT_EQ(swar::atoi("-12345678901234567890", 19), -123456789012345678ll); 155 | EXPECT_EQ(swar::atoi("-12345678901234567890", 20), -1234567890123456789ll); 156 | } 157 | 158 | TEST(r8, htou) { 159 | EXPECT_EQ(swar::htou8("123456789abcdef0", 0), 0); 160 | EXPECT_EQ(swar::htou8("123456789abcdef0", 1), 0x1); 161 | EXPECT_EQ(swar::htou8("123456789abcdef0", 2), 0x12); 162 | EXPECT_EQ(swar::htou8("123456789abcdef0", 3), 0x123); 163 | EXPECT_EQ(swar::htou8("123456789abcdef0", 4), 0x1234); 164 | EXPECT_EQ(swar::htou8("123456789abcdef0", 5), 0x12345); 165 | EXPECT_EQ(swar::htou8("123456789abcdef0", 6), 0x123456); 166 | EXPECT_EQ(swar::htou8("123456789aBCDEf0", 7), 0x1234567); 167 | EXPECT_EQ(swar::htou8("123456789aBCDEf0", 8), 0x12345678); 168 | 169 | EXPECT_EQ(swar::htou("123456789abcdef0", 0), 0ull); 170 | EXPECT_EQ(swar::htou("123456789abcdef0", 1), 0x1ull); 171 | 
EXPECT_EQ(swar::htou("123456789abcdef0", 2), 0x12ull); 172 | EXPECT_EQ(swar::htou("123456789abcdef0", 3), 0x123ull); 173 | EXPECT_EQ(swar::htou("123456789abcdef0", 4), 0x1234ull); 174 | EXPECT_EQ(swar::htou("123456789abcdef0", 5), 0x12345ull); 175 | EXPECT_EQ(swar::htou("123456789abcdef0", 6), 0x123456ull); 176 | EXPECT_EQ(swar::htou("123456789aBCDEf0", 7), 0x1234567ull); 177 | EXPECT_EQ(swar::htou("123456789aBCDEf0", 8), 0x12345678ull); 178 | EXPECT_EQ(swar::htou("123456789abCDEF0", 9), 0x123456789ull); 179 | EXPECT_EQ(swar::htou("123456789abCDEF0", 10), 0x123456789aull); 180 | EXPECT_EQ(swar::htou("123456789abCDEF0", 11), 0x123456789abull); 181 | EXPECT_EQ(swar::htou("123456789abCDEF0", 12), 0x123456789abcull); 182 | EXPECT_EQ(swar::htou("123456789abCDEF0", 13), 0x123456789abcdull); 183 | EXPECT_EQ(swar::htou("123456789ABcdef0", 14), 0x123456789abcdeull); 184 | EXPECT_EQ(swar::htou("123456789ABcdef0", 15), 0x123456789abcdefull); 185 | EXPECT_EQ(swar::htou("123456789ABcdef0", 16), 0x123456789abcdef0ull); 186 | 187 | EXPECT_EQ(swar::htou8("abcdef..", 1), 0xa); 188 | EXPECT_EQ(swar::htou8("abcdef..", 2), 0xab); 189 | EXPECT_EQ(swar::htou8("abcdef..", 3), 0xabc); 190 | EXPECT_EQ(swar::htou8("abcdef..", 4), 0xabcd); 191 | EXPECT_EQ(swar::htou8("abcdef..", 5), 0xabcde); 192 | EXPECT_EQ(swar::htou8("abcdef..", 6), 0xabcdef); 193 | 194 | EXPECT_EQ(swar::htou8("ABCDEF..", 1), 0xa); 195 | EXPECT_EQ(swar::htou8("ABCDEF..", 2), 0xab); 196 | EXPECT_EQ(swar::htou8("ABCDEF..", 3), 0xabc); 197 | EXPECT_EQ(swar::htou8("ABCDEF..", 4), 0xabcd); 198 | EXPECT_EQ(swar::htou8("ABCDEF..", 5), 0xabcde); 199 | EXPECT_EQ(swar::htou8("ABCDEF..", 6), 0xabcdef); 200 | 201 | EXPECT_EQ(swar::htou8("abef0189", 5), 0xabef0); 202 | EXPECT_EQ(swar::htou8("abef0189", 6), 0xabef01); 203 | EXPECT_EQ(swar::htou8("abef0189", 7), 0xabef018); 204 | EXPECT_EQ(swar::htou8("abef0189", 8), 0xabef0189); 205 | 206 | EXPECT_EQ(swar::htou8("1234abef", 5), 0x1234a); 207 | EXPECT_EQ(swar::htou8("1234abef", 6), 
0x1234ab); 208 | EXPECT_EQ(swar::htou8("1234abef", 7), 0x1234abe); 209 | EXPECT_EQ(swar::htou8("1234abef", 8), 0x1234abef); 210 | } 211 | 212 | TEST(r8, itoa) { 213 | union { 214 | char test_buf[100]; 215 | uint16_t test16; 216 | }; 217 | char itoa_ret[100]; 218 | 219 | for (int i = -100000; i < 100000; i++) { 220 | sprintf(test_buf, "%d", i); 221 | swar::itoa(i, itoa_ret); 222 | EXPECT_STREQ(itoa_ret, test_buf); 223 | } 224 | 225 | for (int64_t i = std::numeric_limits::min(); 226 | i < std::numeric_limits::min(); i += 13371) { 227 | sprintf(test_buf, "%ld", i); 228 | swar::itoa(i, itoa_ret); 229 | EXPECT_STREQ(itoa_ret, test_buf); 230 | } 231 | 232 | EXPECT_STREQ(swar::utoap< 1>(0, itoa_ret), "0"); 233 | EXPECT_STREQ(swar::utoap< 2>(0, itoa_ret), "00"); 234 | EXPECT_STREQ(swar::utoap< 3>(0, itoa_ret), "000"); 235 | EXPECT_STREQ(swar::utoap< 4>(0, itoa_ret), "0000"); 236 | EXPECT_STREQ(swar::utoap< 5>(0, itoa_ret), "00000"); 237 | EXPECT_STREQ(swar::utoap< 6>(0, itoa_ret), "000000"); 238 | EXPECT_STREQ(swar::utoap< 7>(0, itoa_ret), "0000000"); 239 | EXPECT_STREQ(swar::utoap< 8>(0, itoa_ret), "00000000"); 240 | EXPECT_STREQ(swar::utoap< 9>(0, itoa_ret), "000000000"); 241 | EXPECT_STREQ(swar::utoap<10>(0, itoa_ret), "0000000000"); 242 | EXPECT_STREQ(swar::utoap<11>(0, itoa_ret), "00000000000"); 243 | EXPECT_STREQ(swar::utoap<12>(0, itoa_ret), "000000000000"); 244 | EXPECT_STREQ(swar::utoap<13>(0, itoa_ret), "0000000000000"); 245 | EXPECT_STREQ(swar::utoap<14>(0, itoa_ret), "00000000000000"); 246 | EXPECT_STREQ(swar::utoap<15>(0, itoa_ret), "000000000000000"); 247 | EXPECT_STREQ(swar::utoap<16>(0, itoa_ret), "0000000000000000"); 248 | EXPECT_STREQ(swar::utoap<17>(0, itoa_ret), "00000000000000000"); 249 | EXPECT_STREQ(swar::utoap<18>(0, itoa_ret), "000000000000000000"); 250 | EXPECT_STREQ(swar::utoap<19>(0, itoa_ret), "0000000000000000000"); 251 | 252 | EXPECT_STREQ(swar::utoap< 0>(7, itoa_ret), ""); 253 | EXPECT_STREQ(swar::utoap< 1>(7, itoa_ret), "7"); 254 | 
EXPECT_STREQ(swar::utoap< 2>(7, itoa_ret), "07"); 255 | EXPECT_STREQ(swar::utoap< 3>(7, itoa_ret), "007"); 256 | EXPECT_STREQ(swar::utoap< 4>(7, itoa_ret), "0007"); 257 | EXPECT_STREQ(swar::utoap< 5>(7, itoa_ret), "00007"); 258 | EXPECT_STREQ(swar::utoap< 6>(12345, itoa_ret), "012345"); 259 | EXPECT_STREQ(swar::utoap< 7>(12345, itoa_ret), "0012345"); 260 | EXPECT_STREQ(swar::utoap< 8>(12345, itoa_ret), "00012345"); 261 | EXPECT_STREQ(swar::utoap< 9>(12345, itoa_ret), "000012345"); 262 | EXPECT_STREQ(swar::utoap<10>(12345, itoa_ret), "0000012345"); 263 | EXPECT_STREQ(swar::utoap<11>(12345, itoa_ret), "00000012345"); 264 | EXPECT_STREQ(swar::utoap<12>(12345678901, itoa_ret), "012345678901"); 265 | EXPECT_STREQ(swar::utoap<13>(12345678901, itoa_ret), "0012345678901"); 266 | EXPECT_STREQ(swar::utoap<14>(12345678901, itoa_ret), "00012345678901"); 267 | EXPECT_STREQ(swar::utoap<15>(12345678901, itoa_ret), "000012345678901"); 268 | EXPECT_STREQ(swar::utoap<16>(12345678901, itoa_ret), "0000012345678901"); 269 | EXPECT_STREQ(swar::utoap<17>(123456789012345, itoa_ret), "00123456789012345"); 270 | EXPECT_STREQ(swar::utoap<18>(123456789012345, itoa_ret), "000123456789012345"); 271 | EXPECT_STREQ(swar::utoap<19>(123456789012345, itoa_ret), "0000123456789012345"); 272 | EXPECT_STREQ(swar::utoap<20>(123456789012345, itoa_ret), "00000123456789012345"); 273 | 274 | swar::itoa8(0, itoa_ret); EXPECT_STREQ(itoa_ret, "0"); 275 | swar::itoa8(1, itoa_ret); EXPECT_STREQ(itoa_ret, "1"); 276 | swar::itoa8(12, itoa_ret); EXPECT_STREQ(itoa_ret, "12"); 277 | swar::itoa8(123, itoa_ret); EXPECT_STREQ(itoa_ret, "123"); 278 | swar::itoa8(1234, itoa_ret); EXPECT_STREQ(itoa_ret, "1234"); 279 | swar::itoa8(12345, itoa_ret); EXPECT_STREQ(itoa_ret, "12345"); 280 | swar::itoa8(123456, itoa_ret); EXPECT_STREQ(itoa_ret, "123456"); 281 | swar::itoa8(1234567, itoa_ret); EXPECT_STREQ(itoa_ret, "1234567"); 282 | swar::itoa8(12345678, itoa_ret); EXPECT_STREQ(itoa_ret, "12345678"); 283 | } 284 | 285 | 286 | 
-------------------------------------------------------------------------------- /swar_inl.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "compiler.h" 4 | 5 | #include 6 | #include // for memcpy, memset 7 | #include 8 | 9 | // Function naming convention [prefix] [length] 10 | // - function 11 | 12 | namespace swar { 13 | 14 | // cast and fill utils 15 | template 16 | inline T cast(const char* src) { 17 | T ret; 18 | ::memcpy(&ret, src, sizeof(T)); 19 | return ret; 20 | } 21 | 22 | template 23 | inline T extend(char c) { 24 | T ret; 25 | ::memset(&ret, c, sizeof(T)); 26 | return ret; 27 | } 28 | 29 | // swap bytes 30 | inline uint64_t bswap(uint64_t x) { return __builtin_bswap64(x); } 31 | inline uint32_t bswap(uint32_t x) { return __builtin_bswap32(x); } 32 | inline uint16_t bswap(uint16_t x) { return __builtin_bswap16(x); } 33 | 34 | //// Find bytes 35 | 36 | inline bool haszero(uint64_t x) { 37 | uint64_t a = 0x7f7f7f7f7f7f7f7full; 38 | uint64_t l = 0x0101010101010101ull; 39 | return (x - l) & ~x & ~a; 40 | } 41 | 42 | // Check if word has some byte 43 | inline bool hasbyte(uint64_t x, uint8_t c) { 44 | return haszero(x ^ extend(c)); 45 | } 46 | 47 | // Find char in string. Support all options. 
48 | template 49 | inline uint32_t _memchr8(const char* s, uint8_t c) { 50 | // int 64 of all c's 51 | uint64_t m = extend(c); 52 | 53 | // int 64 of s 54 | uint64_t x = cast(s); 55 | 56 | // remove c's from string 57 | // so now we have to find first zero byte 58 | x ^= m; 59 | 60 | uint64_t a = 0x7f7f7f7f7f7f7f7full; 61 | 62 | // set the high bit in non-zero bytes 63 | if (Printable) { 64 | x += a; 65 | } 66 | else { 67 | x = ((x & a) + a) | x; 68 | } 69 | 70 | // flip to set the high bit in zero bytes, and clear other high bits 71 | x = ~x; 72 | 73 | // clear all bits except the high bit of the zero byte 74 | x &= ~a; 75 | 76 | // find the high bit, from right (little endian) 77 | if (Exists) { 78 | if (!Reverse) { 79 | // ctz returns 7, 15, 23, etc 80 | return __builtin_ctzll(x) / 8; 81 | } 82 | else { 83 | // clz returns 0, 8, 16, etc 84 | return 7 - __builtin_clzll(x) / 8; 85 | } 86 | } 87 | else { 88 | if (!Reverse) { 89 | // ffs returns + 1, so that's going to be 8, 16, 24, etc 90 | return (__builtin_ffsll(x) - 8) / 8; 91 | } 92 | else { 93 | // clz returns 0, 8, 16, 24, 32, 40, 48, 56 94 | // x == 0 returns 63 that we increment to 64 to return -1 95 | return 7 - (__builtin_clzll(x | 1) + 1) / 8; 96 | } 97 | } 98 | } 99 | 100 | // Find char in string and trim it 101 | template 102 | inline uint32_t _trim8(const char* s, uint8_t c) { 103 | // int 64 of all c's 104 | uint64_t m = extend(c); 105 | 106 | // int 64 of s 107 | uint64_t x = cast(s); 108 | uint64_t xo = x; 109 | 110 | // remove c's from string 111 | // so now we have to find first zero byte 112 | x ^= m; 113 | 114 | uint64_t a = 0x7f7f7f7f7f7f7f7full; 115 | 116 | // set the high bit in non-zero bytes 117 | if (Printable) { 118 | x += a; 119 | } 120 | else { 121 | x = ((x & a) + a) | x; 122 | } 123 | 124 | // flip to set the high bit in zero bytes, and clear other high bits 125 | x = ~x; 126 | 127 | // clear all bits except the high bit of the zero byte 128 | x &= ~a; 129 | 130 | // set all bits 
under the lowest high bit 131 | x &= x - 1u; 132 | 133 | if (!Exists) { 134 | x >>= 7; 135 | } 136 | else if (Printable) { 137 | // complete the killed high bit 138 | x <<= 1u; 139 | x |= 1u; 140 | } 141 | 142 | return xo & x; 143 | } 144 | 145 | // Find char in printable (chars < 128) string of 8 chars 146 | inline uint32_t pmemchr8(const char* s, uint8_t c) { 147 | return _memchr8(s, c); 148 | } 149 | 150 | // Find char in printable (chars < 128) string of 8 chars 151 | // * The string is known to contain the char 152 | inline uint32_t pmemchr8k(const char* s, uint8_t c) { 153 | return _memchr8(s, c); 154 | } 155 | 156 | // Find char in binary string of 8 chars 157 | inline uint32_t memchr8(const char* s, uint8_t c) { 158 | return _memchr8(s, c); 159 | } 160 | 161 | // Find char in binary string of 8 chars 162 | // * The string is known to contain the char 163 | inline uint32_t memchr8k(const char* s, uint8_t c) { 164 | return _memchr8(s, c); 165 | } 166 | 167 | // Find char in printable (chars < 128) string of 8 chars 168 | inline uint32_t pmemrchr8(const char* s, uint8_t c) { 169 | return _memchr8(s, c); 170 | } 171 | 172 | // Find char in printable (chars < 128) string of 8 chars 173 | // * The string is known to contain the char 174 | inline uint32_t pmemrchr8k(const char* s, uint8_t c) { 175 | return _memchr8(s, c); 176 | } 177 | 178 | // Find char in binary string of 8 chars 179 | inline uint32_t memrchr8(const char* s, uint8_t c) { 180 | return _memchr8(s, c); 181 | } 182 | 183 | // Find char in binary string of 8 chars 184 | // * The string is known to contain the char 185 | inline uint32_t memrchr8k(const char* s, uint8_t c) { 186 | return _memchr8(s, c); 187 | } 188 | 189 | // Find char in const binary string 190 | template 191 | inline uint32_t _memchr(const char* s, uint32_t len, uint8_t c) { 192 | const char* p = s; 193 | const char* end = s + len; 194 | 195 | // If shorter than 8 bytes, we have to mask away c bytes past len 196 | uint32_t partLen = 
(len & 7) ? (len & 7) : 8; 197 | uint64_t partMask = len < 8 ? ~0ull >> (64 - partLen * 8) : 0ull; 198 | uint64_t first = cast(p) & ~(partMask & extend(c)); 199 | 200 | // Check first 8 bytes 201 | if (hasbyte(first, c)) 202 | return _memchr8((char*)&first, c); 203 | 204 | // Advance to leave multiple of 8 bytes 205 | p += partLen; 206 | 207 | // Check words for that byte 208 | for (;;) { 209 | if (hasbyte(cast(p), c)) { 210 | return (p - s) + _memchr8(p, c); 211 | } 212 | p += 8; 213 | if (!Known) { 214 | if (p == end) 215 | return -1; 216 | } 217 | } 218 | } 219 | 220 | // Find char, in reverse, in const binary string 221 | template 222 | inline uint32_t _memrchr(const char* s, uint32_t len, uint8_t c) { 223 | const char* p = s + len; 224 | 225 | // If shorter than 8 bytes, we have to mask away c bytes past len 226 | uint32_t partLen = (len & 7) ? (len & 7) : 8; 227 | uint64_t partMask = len < 8 ? ~0ull >> (64 - partLen * 8) : 0ull; 228 | uint64_t first = cast(p) & ~(partMask & extend(c)); 229 | 230 | // Advance to leave multiple of 8 bytes 231 | p -= partLen; 232 | 233 | // Check first 8 bytes 234 | if (hasbyte(first, c)) { 235 | return (p - s) + _memchr8((char*)&first, c); 236 | } 237 | 238 | // Check words for that byte 239 | for (;;) { 240 | if (hasbyte(cast(p), c)) { 241 | return (p - s) + _memchr8(p, c); 242 | } 243 | p -= 8; 244 | if (!Known) { 245 | if (p == s) 246 | return -1; 247 | } 248 | } 249 | } 250 | 251 | // Find char in binary string 252 | inline uint32_t memchr(const char* s, uint32_t len, uint8_t c) { 253 | return _memchr(s, len, c); 254 | } 255 | 256 | // Find char in binary string. Char c is known to be in s + len 257 | inline uint32_t memchrk(const char* s, uint32_t len, uint8_t c) { 258 | return _memchr(s, len, c); 259 | } 260 | 261 | // Find char in printable string 262 | inline uint32_t pmemchr(const char* s, uint32_t len, uint8_t c) { 263 | return _memchr(s, len, c); 264 | } 265 | 266 | // Find char in printable string. 
Char c is known to be in s + len 267 | inline uint32_t pmemchrk(const char* s, uint32_t len, uint8_t c) { 268 | return _memchr(s, len, c); 269 | } 270 | 271 | // Find char in binary string 272 | inline uint32_t memrchr(const char* s, uint32_t len, uint8_t c) { 273 | return _memrchr(s, len, c); 274 | } 275 | 276 | // Find char in binary string. Char c is known to be in s + len 277 | inline uint32_t memrchrk(const char* s, uint32_t len, uint8_t c) { 278 | return _memrchr(s, len, c); 279 | } 280 | 281 | // Find char in printable string 282 | inline uint32_t pmemrchr(const char* s, uint32_t len, uint8_t c) { 283 | return _memrchr(s, len, c); 284 | } 285 | 286 | // Find char in printable string. Char c is known to be in s + len 287 | inline uint32_t pmemrchrk(const char* s, uint32_t len, uint8_t c) { 288 | return _memrchr(s, len, c); 289 | } 290 | 291 | // Find char in NON-CONST string 292 | template 293 | inline uint32_t _memchr_nc(char* s, uint32_t len, uint8_t c) { 294 | const char* p = s; 295 | 296 | // Replace back with sentinel so we don't have to check for length 297 | uint8_t back = s[len - 1]; 298 | s[len - 1] = c; 299 | 300 | // Check words for that byte. 301 | while (!hasbyte(cast(p), c)) { 302 | p += 8; 303 | } 304 | 305 | // Find the position 306 | p += _memchr8(p, c); 307 | 308 | // restore sentinel 309 | s[len - 1] = back; 310 | 311 | uint32_t ret = p - s; 312 | return (ret != len - 1 || back == c) ? 
ret : -1; 313 | } 314 | 315 | // Find char in binary NON-CONST string 316 | inline uint32_t memchr_nc(char* s, uint32_t len, uint8_t c) { 317 | return _memchr_nc(s, len, c); 318 | } 319 | 320 | // Find char in printable NON-CONST string 321 | inline uint32_t pmemchr_nc(char* s, uint32_t len, uint8_t c) { 322 | return _memchr_nc(s, len, c); 323 | } 324 | 325 | // Find zero byte in binary string up to 8 chars 326 | inline uint32_t strlen8(const char* s) { 327 | return memchr8(s, 0); 328 | } 329 | 330 | // Find zero byte in printable string up to 8 chars 331 | inline uint32_t pstrlen8(const char* s) { 332 | return pmemchr8(s, 0); 333 | } 334 | 335 | // Find zero byte in binary string 336 | inline uint32_t strlen(const char* s) { 337 | // check words for zero 338 | const char* p = s; 339 | while (!haszero(cast(p))) { 340 | p += 8; 341 | } 342 | 343 | return p - s + memchr8k(p, 8); 344 | } 345 | 346 | // Find zero byte in printable string 347 | inline uint32_t pstrlen(const char* s) { 348 | // check words for zero 349 | const char* p = s; 350 | while (!haszero(cast(p))) { 351 | p += 8; 352 | } 353 | 354 | return p - s + pmemchr8k(p, 8); 355 | } 356 | 357 | // Get the uint _cast_ of a string of up to 8 chars 358 | inline uint64_t cast8(const char* s, uint32_t len) { 359 | assert(len <= 8); 360 | 361 | // int 64 of s 362 | uint64_t x = cast(s); 363 | 364 | uint64_t mask = (1ull << (len * 8)) - 1; 365 | mask |= -(len == 8); // fill 1's if len == 8 366 | 367 | return x & mask; 368 | } 369 | 370 | //// string to int 371 | 372 | // Parse uint from string of up to 4 chars 373 | inline uint16_t atou4(const char* s, uint32_t len) { 374 | assert(len <= 4); 375 | 376 | // int 64 of s. "1234" --> 0x34333231 377 | uint32_t x = cast(s); 378 | 379 | // apply len. 
len of 2 --> 0x32310000 380 | x <<= 32 - len * 8; 381 | x &= -(uint32_t)(len > 0); 382 | 383 | // add ones and tens, in int8's, from 0x0[2]0[1] to 0x00[12] 384 | x = (x & 0x0f0f0f0fu) * ((1u << 8) * 10 + 1) >> 8; 385 | 386 | // add int16's, from 0x00[34]00[12] to 0x0000[1234] 387 | x = (x & 0x00ff00ffu) * ((1u << 16) * 100 + 1) >> 16; 388 | 389 | return x; 390 | } 391 | 392 | // Parse uint from string of up to 8 chars 393 | inline uint32_t atou8(const char* s, uint32_t len) { 394 | assert(len <= 8); 395 | 396 | // int 64 of s. "12345678" --> 0x3837363534333231 397 | uint64_t x = cast(s); 398 | 399 | // apply len. len of 2 --> 0x3231000000000000 400 | x <<= 64 - len * 8; 401 | x &= -(uint64_t)(len > 0); 402 | 403 | // add ones and tens, in int8's, from 0x0[2]0[1] to 0x00[12] 404 | x = (x & 0x0f0f0f0f0f0f0f0full) * ((1ull << 8) * 10 + 1) >> 8; 405 | 406 | // add int16's, from 0x00[34]00[12] to 0x0000[1234] 407 | x = (x & 0x00ff00ff00ff00ffull) * ((1ull << 16) * 100 + 1) >> 16; 408 | 409 | // add int32's, from 0x0000[1234]0000[5678] to 0x[12345678] 410 | x = (x & 0x0000ffff0000ffffull) * ((1ull << 32) * 10000 + 1) >> 32; 411 | 412 | return x; 413 | } 414 | 415 | // Parse uint64_t from string of up to 20 chars 416 | // *** More than 20 char returns junk. 417 | inline uint64_t atou(const char* s, uint32_t len) { 418 | assert(len <= 20); 419 | uint64_t x = 0; 420 | if (len > 8) { 421 | uint32_t lh = len % 8; 422 | x = atou8(s, lh); 423 | x *= 100000000; 424 | len -= lh; 425 | s += lh; 426 | if (len > 8) { 427 | x += atou8(s, 8); 428 | x *= 100000000; 429 | len -= 8; 430 | s += 8; 431 | } 432 | } 433 | return x + atou8(s, len); 434 | } 435 | 436 | // Parse _signed_ int from string of up to 20 chars. No spaces 437 | inline int64_t atoi(const char* s, uint32_t len) { 438 | bool neg = !!len & (*s == '-'); 439 | bool ls = !!len & (*s == '-' || *s == '+'); 440 | s += ls; 441 | 442 | int64_t x = atou(s, len - ls); 443 | 444 | return neg ? 
-x : x; 445 | } 446 | 447 | // Parse hex int from string of up to 8 chars 448 | inline uint32_t htou8(const char* s, uint32_t len) { 449 | assert(len <= 8); 450 | 451 | // int 64 of s. "12345678" --> 0x3837363534333231 452 | uint64_t x = cast(s); 453 | 454 | // apply len. len of 2 --> 0x3231000000000000 455 | x <<= 64 - len * 8; 456 | 457 | // handle length of 0. remove all bits if zero 458 | x &= -(uint64_t)(len > 0); 459 | 460 | // change a-f to to number. 0x41 --> 0x0a 461 | x += ((x & 0x4040404040404040ull) >> 6) * 9; 462 | 463 | // add ones and 0x10's, in int8's, from 0x0[f]0[1] to 0x00[1f] 464 | x = (x & 0x0f0f0f0f0f0f0f0full) * ((1ull << 12) + 1) >> 8; 465 | 466 | // add int16's, from 0x00[ed]00[1f] to 0x0000[1fed] 467 | x = (x & 0x00ff00ff00ff00ffull) * ((1ull << 24) + 1) >> 16; 468 | 469 | // add int32's, from 0x0000[1fed]0000[cba9] to 0x[1fedcba9] 470 | x = (x & 0x0000ffff0000ffffull) * ((1ull << 48) + 1) >> 32; 471 | 472 | return x; 473 | } 474 | 475 | // Parse hex int from string of up to 16 chars 476 | inline uint64_t htou(const char* s, uint32_t len) { 477 | assert(len <= 16); 478 | uint64_t x = 0; 479 | if (len > 8) { 480 | uint32_t lh = len - 8; 481 | x = htou8(s, lh); 482 | x <<= 32; 483 | len -= lh; 484 | s += lh; 485 | } 486 | return x + htou8(s, len); 487 | } 488 | 489 | //// int to string 490 | 491 | // *** p suffix means zero-padded 492 | 493 | // Convert uint, of less than 100, to %02u, as int 16 494 | inline uint16_t utoa2p(uint64_t x) { 495 | static const CODE_SECTION uint8_t pairs[50] = { // 0..49, little endian 496 | 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70, 0x80, 0x90, 497 | 0x01, 0x11, 0x21, 0x31, 0x41, 0x51, 0x61, 0x71, 0x81, 0x91, 498 | 0x02, 0x12, 0x22, 0x32, 0x42, 0x52, 0x62, 0x72, 0x82, 0x92, 499 | 0x03, 0x13, 0x23, 0x33, 0x43, 0x53, 0x63, 0x73, 0x83, 0x93, 500 | 0x04, 0x14, 0x24, 0x34, 0x44, 0x54, 0x64, 0x74, 0x84, 0x94, 501 | }; 502 | 503 | uint32_t b50 = -(uint32_t)(x >= 50); // x >= 50 ? 
~0 : 0; 504 | uint32_t x2 = x - (50u & b50); // x2 = x % 50; 505 | uint16_t t = pairs[x2] + (b50 & 5); // t = pairs[x % 50] + 5 in low nibble if x > 50 506 | 507 | // move upper nibble to next byte and add '00' 508 | return ((t | (t << 4)) & 0x0f0f) | 0x3030; 509 | } 510 | 511 | // Convert uint, of less than 100, to %02u 512 | inline void utoa2p(uint64_t x, char* s) { 513 | uint16_t t = utoa2p(x); 514 | memcpy(s, &t, sizeof(uint16_t)); 515 | } 516 | 517 | // Convert uint to %0u, N <= 8 518 | template 519 | inline uint64_t _utoap(uint64_t x, char* s) { 520 | static_assert(N <= 8); 521 | 522 | uint64_t tmp = utoa2p(x % 100); 523 | 524 | for (int i = 0; i < N - 2; i += 2) { 525 | x /= 100; 526 | tmp <<= 16; 527 | tmp |= utoa2p(x % 100); 528 | } 529 | 530 | tmp >>= (N & 1) * 8; 531 | memcpy(s, &tmp, 8); 532 | 533 | return x; 534 | } 535 | 536 | // Convert uint to %0u, N <= 20 537 | template 538 | inline char* utoap(uint64_t x, char* s) { 539 | if constexpr (N <= 8) { 540 | _utoap(x, s); 541 | } 542 | else if constexpr (N <= 16) { 543 | x = _utoap(x, s + 8); 544 | x /= (N & 1) ? 10 : 100; 545 | _utoap<8>(x, s); 546 | } 547 | else { 548 | x = _utoap(x, s + 16); 549 | x /= (N & 1) ? 10 : 100; 550 | x = _utoap<8>(x, s + 8); 551 | _utoap<8>(x / 100, s); 552 | } 553 | 554 | s[N] = '\0'; 555 | return s; 556 | } 557 | 558 | // Convert signed int 32 to string of up to 8 bytes. 
559 | inline uint32_t itoa8(int32_t x, char* buf) { 560 | // Handle negatives 561 | bool neg = x < 0; 562 | *buf = '-'; // Always write 563 | buf += neg; // But advance only if negative 564 | x = __builtin_abs(x); 565 | 566 | uint64_t tmp = 0; 567 | int n = 0; 568 | 569 | // Convert pairs of digits 570 | while (x >= 100) { 571 | n += 2; 572 | tmp <<= 16; 573 | tmp |= utoa2p(x % 100); 574 | x /= 100; 575 | } 576 | 577 | // Last pair - no need to divide any more 578 | n += 2; 579 | tmp <<= 16; 580 | tmp |= utoa2p(x); 581 | 582 | // If last pair is 0 we want to remove this "0" 583 | n -= x < 10; 584 | tmp >>= x < 10 ? 8 : 0; 585 | 586 | // Copy to provided buffer 587 | memcpy(buf, &tmp, 8); 588 | buf[n] = '\0'; 589 | 590 | return n + neg; 591 | } 592 | 593 | // Convert signed int 64 to string. String buffer is at least 22 bytes. 594 | // Returns length 595 | // *** this feels inefficient :( *** 596 | inline uint32_t itoa(int64_t x, char* buf) { 597 | // Handle negatives 598 | bool neg = x < 0; 599 | *buf = '-'; // Always write 600 | buf += neg; // But advance only if negative 601 | x = __builtin_abs(x); 602 | 603 | char tmp[20]; 604 | char* p = tmp + 20; 605 | 606 | while (x >= 100) { 607 | p -= 2; 608 | utoa2p(x % 100, p); 609 | x /= 100; 610 | } 611 | 612 | p -= 2; 613 | utoa2p(x, p); 614 | 615 | p += x < 10; 616 | 617 | uint32_t len = tmp + 20 - p; 618 | 619 | memcpy(buf, p, 20); 620 | buf[len] = '\0'; 621 | 622 | return len + neg; 623 | } 624 | 625 | //// Double to string 626 | 627 | // Copy the sign from src to dst that is unsigned. 628 | // *** dst is up to 63 bit 629 | inline int64_t _copySign(int64_t src, uint64_t dst) { 630 | // This is better than `src > 0 ? dst : -dst` that is using cmov 631 | uint64_t m = ~(src >> 63); // all 1s if >= 0 (opposite of abs) 632 | return (dst + m) ^ m; // flip if src negative 633 | } 634 | 635 | // Parse double from string 636 | // *** More than 20 char integer part returns junk. 
637 | // *** Too much decimal char will get lost to precision 638 | inline double atod(const char* s, uint32_t len) { 639 | // Get int part 640 | int ilen = pmemchr(s, len, '.'); 641 | 642 | // If no decimal dot, return the int 643 | if (ilen == -1) { 644 | return atoi(s, len); 645 | } 646 | 647 | // Do the int part 648 | int64_t ipart = atoi(s, ilen); 649 | s += ilen + 1; 650 | len -= ilen + 1; 651 | 652 | // Int of decimal part 653 | int64_t dpart = atou(s, len); 654 | 655 | // To add the two parts we need matching signs 656 | dpart = _copySign(ipart, dpart); 657 | 658 | // Array of 20 * 8 = 160 bytes 659 | static const CODE_SECTION double scales[20] = { 660 | 1e-1, 1e-2, 1e-3, 1e-4, 1e-5, 1e-6, 1e-7, 1e-8, 1e-9, 1e-10, 661 | 1e-11, 1e-12, 1e-13, 1e-14, 1e-15, 1e-16, 1e-17, 1e-18, 1e-19, 1e-20 }; 662 | 663 | return ipart + dpart * scales[len]; 664 | } 665 | 666 | } // namespace swar 667 | --------------------------------------------------------------------------------