├── bin └── .gitignore ├── obj └── .placeholder ├── scripts ├── table.py ├── hybrid-shift-back.py ├── stats-bytes-converted.py ├── stats-utilization.py ├── hybrid-unsigned.py ├── writer.py ├── cost.py ├── hybrid.py ├── hybrid-signed.py └── generator.py ├── experiments ├── hwevents │ ├── testcases.py │ ├── experiment.py │ ├── loader.py │ └── runner.py ├── speedup-comparison │ ├── loader.py │ └── report.py ├── spanmaskhistogram │ ├── hwevents_loader.py │ ├── report_writer.py │ ├── experiment.py │ ├── testcases.py │ ├── loader.py │ ├── runner.py │ ├── microbenchmark_loader.py │ └── report.py ├── microbenchmarks │ ├── results │ │ ├── skylake-i7-6700-gcc7.3.0.metadata │ │ └── westmere-i5-m540-gcc7.3.0.metadata │ ├── update_reports.sh │ ├── writer.py │ ├── experiment.py │ ├── testcases.py │ ├── loader.py │ ├── runner.py │ └── report.py ├── overalltests │ ├── results │ │ ├── skylake-i7-6700-gcc7.3.0.metadata │ │ └── westmere-i5-m540-gcc7.3.0.metadata │ ├── average_writer.py │ ├── loader.py │ ├── report_writer.py │ ├── experiment.py │ ├── testcases.py │ ├── runner.py │ ├── average.py │ └── report.py ├── utils.py ├── distribution.py ├── prettyprint.py └── README.rst ├── include ├── scalar │ ├── scalar-parse-common.h │ ├── scalar-parse-unsigned.h │ ├── std-parser-signed.h │ └── scalar-parse-signed.h ├── block_info.h ├── safe-convert.h ├── test │ ├── input_generator.h │ ├── time_utils.h │ ├── application.h │ ├── command_line.h │ ├── linux-perf-events.h │ └── benchmark.h ├── sse │ ├── sse-matcher.h │ ├── sse-matcher-stni.h │ ├── sse-parser-unsigned.h │ ├── sse-utils.h │ ├── sse-parser-statistics.h │ ├── sse-simplified-parser-signed.h │ ├── sse-parser-signed.h │ ├── sse-block-parser-unsigned.h │ ├── sse-block-parser-signed.h │ ├── sse-convert.h │ └── sse-parser-common.h ├── hybrid-parser.h ├── hybrid-parser-signed.h └── avx512 │ └── avx512-parser-signed.h ├── .gitignore ├── test ├── unittest │ ├── test-stni-matcher.cpp │ ├── verify_sse_unsigned_conversion.cpp │ ├── 
verify_sse_signed_overflow_detection.cpp │ ├── verify_sse_unsigned_parser.cpp │ └── verify_sse_signed_parser_validation.cpp ├── spanmaskhistogram.cpp ├── statistics.cpp ├── utils │ ├── command_line.cpp │ ├── input_generator.cpp │ └── application.cpp ├── compare-avx512.cpp ├── compare-unsigned.cpp ├── benchmark-hwevents.cpp ├── compare-signed.cpp ├── benchmark-cpuclocks.cpp └── benchmark.cpp ├── LICENSE ├── src ├── block_info.cpp └── sse-parser-statistics.cpp └── README.rst /bin/.gitignore: -------------------------------------------------------------------------------- 1 | [a-z]* 2 | -------------------------------------------------------------------------------- /obj/.placeholder: -------------------------------------------------------------------------------- 1 | # placeholder 2 | -------------------------------------------------------------------------------- /scripts/table.py: -------------------------------------------------------------------------------- 1 | ../experiments/table.py -------------------------------------------------------------------------------- /experiments/hwevents/testcases.py: -------------------------------------------------------------------------------- 1 | ../spanmaskhistogram/testcases.py -------------------------------------------------------------------------------- /experiments/speedup-comparison/loader.py: -------------------------------------------------------------------------------- 1 | ../microbenchmarks/loader.py -------------------------------------------------------------------------------- /experiments/spanmaskhistogram/hwevents_loader.py: -------------------------------------------------------------------------------- 1 | ../hwevents/loader.py -------------------------------------------------------------------------------- /experiments/microbenchmarks/results/skylake-i7-6700-gcc7.3.0.metadata: -------------------------------------------------------------------------------- 1 | OS: Ubuntu 2 | Compiler: gcc (GCC) 7.3.0 3 | 
// Returns true when character `c` occurs in the NUL-terminated string `set`.
//
// The terminating NUL is never treated as a member of the set, so
// contains(set, '\0') is always false.
//
// The function is defined in a header, so it is marked `inline`: without it
// every translation unit including this header emits its own external
// definition, which breaks the link as soon as two such units are combined.
inline bool contains(const char* set, char c) {
    // The set is only read, so no const_cast is needed to walk it.
    for (const char* s = set; *s != '\0'; ++s) {
        if (*s == c) {
            return true;
        }
    }

    return false;
}
def splitsorted(sequence, keyfun):
    """Split *sequence* into runs of consecutive items that share a key.

    Items are visited in order; a new group is opened every time
    ``keyfun(item)`` differs from the key of the previous item.  The input
    is not sorted here, so equal keys that are not adjacent end up in
    separate groups.

    Returns a list of lists (empty when *sequence* is empty).
    """
    # A private sentinel marks "no previous key yet".  Using None for that
    # purpose breaks when the first key is None itself: no group would be
    # opened and result[-1] would raise IndexError.
    unset = object()

    prev = unset
    result = []
    for item in sequence:
        val = keyfun(item)
        if prev is unset or val != prev:
            result.append([])
            prev = val

        result[-1].append(item)

    return result
def single_digit_distribution(digit, count = 8):
    """Return a list of *count* weights with a single 1 at position *digit*.

    *digit* is 1-based: ``single_digit_distribution(1)`` sets the first
    weight, ``single_digit_distribution(count)`` the last one.  All other
    weights are 0.

    Raises IndexError when *digit* is outside ``1..count``.
    """
    # Without this check digit <= 0 would be turned into a negative index
    # and silently mark a position counted from the end of the list.
    if not 1 <= digit <= count:
        raise IndexError("digit %r is outside the range 1..%r" % (digit, count))

    weights = [0] * count
    weights[digit - 1] = 1

    return weights
// Appends one decimal digit to `number`: number = number * 10 + (c - '0').
//
// `c` is expected to be a character in the range '0'..'9'; this is not
// verified here.
//
// Throws std::range_error when the multiplication or the addition does not
// fit in T.
template <typename T>
void mul10_add_digit(T& number, char c) {
    const T digit = static_cast<T>(c - '0');

    // number * 10 + 9 <= MAX  <=>  number <= (MAX - 9)/10
    if (number <= (std::numeric_limits<T>::max() - 9) / 10) {
        // no overflow is possible, use the faster path
        number = 10*number + digit;
    } else {
        // check for overflow; the type-generic builtins are used so the
        // template works for any integer type T, not only `unsigned int`
        if (__builtin_mul_overflow(number, 10, &number)) {
            throw std::range_error("unsigned overflow (1)");
        }

        if (__builtin_add_overflow(number, digit, &number)) {
            throw std::range_error("unsigned overflow (2)");
        }
    }
}
class Item(object):
    """One record of an overall-test result file (one input line)."""

    __slots__ = ("procedure",           # field 0, kept as text
                 "size",                # field 1, converted to int
                 "loops",               # field 2, converted to int
                 "distribution_name",   # field 3, kept as text
                 "num_distribution",    # field 4, kept as text
                 "sep_distribution",    # field 5, kept as text
                 "sign_distribution",   # field 6, kept as text
                 "time")                # field 7, converted to int


def load(file):
    """Parse semicolon-separated result lines into a list of Item objects.

    *file* is any iterable of text lines (an open file, a list of strings).
    Every non-blank line must have at least eight ';'-separated fields;
    fields 1, 2 and 7 must be integers.

    Blank lines are skipped, so a trailing empty line does not abort the
    load with an IndexError.

    Raises IndexError for a line with too few fields and ValueError for a
    numeric field that is not an integer.
    """
    items = []
    for line in file:
        line = line.rstrip('\r\n')
        if not line.strip():
            continue

        fields = line.split(';')

        item = Item()
        item.procedure = fields[0]
        item.size = int(fields[1])
        item.loops = int(fields[2])
        item.distribution_name = fields[3]
        item.num_distribution = fields[4]
        item.sep_distribution = fields[5]
        item.sign_distribution = fields[6]
        item.time = int(fields[7])

        items.append(item)

    return items
sign_distribution) 16 | result = r.run() 17 | print("%d;%s;%s;%s;%s;%s" % ( 18 | size, 19 | distribution_name, 20 | numbers_distribution, 21 | separators_distribution, 22 | sign_distribution, 23 | result 24 | )) 25 | 26 | 27 | if __name__ == '__main__': 28 | main() 29 | -------------------------------------------------------------------------------- /test/unittest/test-stni-matcher.cpp: -------------------------------------------------------------------------------- 1 | #include "sse/sse-matcher-stni.h" 2 | 3 | #include 4 | #include 5 | 6 | bool all_bytes_equal(__m128i a, __m128i b) { 7 | 8 | uint8_t tmpa[16]; 9 | uint8_t tmpb[16]; 10 | 11 | _mm_storeu_si128(reinterpret_cast<__m128i*>(tmpa), a); 12 | _mm_storeu_si128(reinterpret_cast<__m128i*>(tmpb), b); 13 | 14 | return memcmp(tmpa, tmpb, 16) == 0; 15 | } 16 | 17 | int main() { 18 | 19 | sse::STNIMatcher matcher(",.;"); 20 | 21 | const std::string s = ".123,45.;091;.;,"; 22 | assert(s.size() == 16); 23 | 24 | const __m128i input = _mm_loadu_si128(reinterpret_cast(s.c_str())); 25 | const __m128i ret = matcher.get_mask(input, _mm_setzero_si128()); 26 | 27 | const __m128i expected = _mm_setr_epi8(-1, 0, 0, 0, -1, 0, 0, -1, -1, 0, 0, 0, -1, -1, -1, -1); 28 | 29 | assert(all_bytes_equal(ret, expected)); 30 | } 31 | -------------------------------------------------------------------------------- /experiments/microbenchmarks/writer.py: -------------------------------------------------------------------------------- 1 | class RestWriter(object): 2 | def __init__(self, file, report): 3 | self.file = file 4 | self.report = report 5 | 6 | 7 | def write(self, restsection): 8 | 9 | assert len(restsection) >= 2 10 | 11 | for section, collection in self.report: 12 | self.write_header(section, restsection[0], 80) 13 | 14 | for subsection, table in collection: 15 | self.file.write('\n') 16 | self.file.write("**%s**\n" % subsection) 17 | self.file.write('\n') 18 | self.file.write(str(table)) 19 | self.file.write('\n') 20 | 21 | 22 
class RestWriter(object):
    """Writes a three-level report as reStructuredText sections.

    ``report`` is iterated as (separator, collection) pairs; each collection
    yields (distribution, collection) pairs, which in turn yield
    (parameters, table) pairs.  Tables are written with ``str(table)``.
    """

    def __init__(self, file, report):
        self.file = file
        self.report = report


    def write(self, restsection):
        """Write the whole report.

        ``restsection`` supplies the underline characters of the three
        heading levels (indices 0, 1 and 2).
        """

        assert len(restsection) >= 3

        out = self.file
        for separator, by_distribution in self.report:
            self.write_header(separator, restsection[0], 80)

            for distribution, by_parameters in by_distribution:
                self.write_header(distribution, restsection[1], 50)

                for parameters, table in by_parameters:
                    self.write_header(parameters, restsection[2], 40)
                    out.write('\n')
                    out.write(str(table))


    def write_header(self, title, char, width = 80):
        """Write two blank lines, the title and its underline.

        The underline repeats ``char`` at least ``width`` times and never
        fewer times than the title has characters.
        """
        emit = self.file.write

        emit('\n')
        emit('\n')
        emit("%s\n" % title)
        emit(char * max(len(title), width))
        emit('\n')
(!info.empty()) { 28 | printf("%s", info.c_str()); 29 | fflush(stdout); 30 | } 31 | 32 | const auto dt = measure_time(fun); 33 | 34 | if (!info.empty()) { 35 | printf("%lu us\n", dt); 36 | } 37 | 38 | return dt; 39 | } 40 | 41 | -------------------------------------------------------------------------------- /experiments/microbenchmarks/experiment.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os.path 3 | 4 | if __name__ == '__main__' and __package__ is None: 5 | sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) 6 | 7 | 8 | from testcases import testcases 9 | from runner import Runner, format_distribution 10 | 11 | 12 | def test(file): 13 | for item in testcases(): 14 | size, loops, distribution_name, numbers_distribution, separators_distribution, sign_distribution = item 15 | 16 | r = Runner(size, loops, numbers_distribution, separators_distribution, sign_distribution) 17 | clocks = r.run() 18 | 19 | file.write("%d;%d;%s;%s;%s;%s;%s\n" % ( 20 | size, 21 | loops, 22 | distribution_name, 23 | format_distribution(numbers_distribution), 24 | format_distribution(separators_distribution), 25 | format_distribution(sign_distribution), 26 | str(clocks))) 27 | 28 | 29 | def main(): 30 | test(sys.stdout) 31 | 32 | 33 | if __name__ == '__main__': 34 | main() 35 | -------------------------------------------------------------------------------- /experiments/overalltests/experiment.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os.path 3 | 4 | if __name__ == '__main__' and __package__ is None: 5 | sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) 6 | 7 | from testcases import testcases 8 | from runner import Runner, format_distribution 9 | 10 | 11 | def test(file): 12 | for item in testcases(): 13 | procedure, size, loops, distribution_name, numbers_distribution, separators_distribution, 
sign_distribution = item 14 | 15 | r = Runner(procedure, size, loops, numbers_distribution, separators_distribution, sign_distribution) 16 | time = r.run() 17 | 18 | file.write("%s;%d;%d;%s;%s;%s;%s;%d\n" % ( 19 | procedure, 20 | size, 21 | loops, 22 | distribution_name, 23 | format_distribution(numbers_distribution), 24 | format_distribution(separators_distribution), 25 | format_distribution(sign_distribution), 26 | time)) 27 | 28 | 29 | def main(): 30 | test(sys.stdout) 31 | 32 | 33 | if __name__ == '__main__': 34 | main() 35 | -------------------------------------------------------------------------------- /include/sse/sse-matcher.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | 6 | #include "immintrin.h" 7 | 8 | namespace sse { 9 | 10 | template 11 | class NaiveMatcher { 12 | __m128i letters[K + 1]; 13 | size_t n; 14 | 15 | public: 16 | NaiveMatcher(const char* s) { 17 | assert(s != nullptr); 18 | n = strlen(s); 19 | assert(n < K); 20 | 21 | for (size_t i=0; i < n + 1; i++) { 22 | letters[i] = _mm_set1_epi8(s[i]); 23 | } 24 | } 25 | 26 | NaiveMatcher(char c) : n(0) { 27 | letters[0] = _mm_set1_epi8(c); 28 | } 29 | 30 | public: 31 | __m128i get_mask(const __m128i& input, const __m128i& initial) const { 32 | __m128i result = initial; 33 | for (size_t i=0; i < n + 1; i++) { 34 | 35 | const __m128i mask = _mm_cmpeq_epi8(letters[i], input); 36 | result = _mm_or_si128(result, mask); 37 | } 38 | 39 | return result; 40 | } 41 | }; 42 | 43 | } // namespace sse 44 | -------------------------------------------------------------------------------- /include/sse/sse-matcher-stni.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | 7 | #include "immintrin.h" 8 | 9 | namespace sse { 10 | 11 | class STNIMatcher { 12 | __m128i set; 13 | size_t set_size; 14 | 15 | public: 16 | static bool 
can_handle(const char* s) { 17 | return (s != nullptr) 18 | && (strlen(s) > 0) 19 | && (strlen(s) <= 16); 20 | } 21 | 22 | public: 23 | STNIMatcher(const char* s) { 24 | assert(can_handle(s)); 25 | 26 | set_size = strlen(s); 27 | set = _mm_loadu_si128(reinterpret_cast(s)); 28 | } 29 | 30 | public: 31 | __m128i get_mask(const __m128i& input, const __m128i& initial) { 32 | 33 | const uint8_t mode = _SIDD_UBYTE_OPS 34 | | _SIDD_CMP_EQUAL_ANY 35 | | _SIDD_UNIT_MASK; 36 | 37 | return _mm_or_si128(initial, _mm_cmpestrm(set, set_size, input, 16, mode)); 38 | } 39 | }; 40 | 41 | } // namespace sse 42 | -------------------------------------------------------------------------------- /include/scalar/scalar-parse-unsigned.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | 7 | #include "safe-convert.h" 8 | #include "scalar-parse-common.h" 9 | 10 | namespace scalar { 11 | 12 | template 13 | void parse_unsigned(const char* data, size_t size, const char* separators, INSERTER output) { 14 | 15 | uint32_t result = 0; 16 | size_t digits = 0; 17 | 18 | for (size_t i=0; i < size; i++) { 19 | const char c = data[i]; 20 | if (c >= '0' && c <= '9') { 21 | mul10_add_digit(result, c); 22 | digits += 1; 23 | } else if (contains(separators, c)) { 24 | if (digits > 0) { 25 | *output++ = result; 26 | result = 0; 27 | digits = 0; 28 | } 29 | } else { 30 | throw std::runtime_error("Wrong character (scalar)"); 31 | } 32 | } 33 | 34 | if (digits > 0) { 35 | *output++ = result; 36 | } 37 | } 38 | 39 | } // namespace 40 | 41 | 42 | -------------------------------------------------------------------------------- /experiments/spanmaskhistogram/testcases.py: -------------------------------------------------------------------------------- 1 | from distribution import * 2 | 3 | def testcases(): 4 | sign_distribution = [1,1,1] 5 | 6 | for size in sizes: 7 | for separator_distribution in 
separator_distributions: 8 | for k in range(1, 8 + 1): 9 | numbers_distribution = single_digit_distribution(k) 10 | yield size, 'single', numbers_distribution, separator_distribution, sign_distribution 11 | 12 | for k in range(1, 8 + 1): 13 | numbers_distribution = uniform_distribution(k) 14 | yield size, 'uniform', numbers_distribution, separator_distribution, sign_distribution 15 | 16 | for k in range(1, 8 + 1): 17 | numbers_distribution = normal_distribution(k, 1.0) 18 | yield size, 'normal', numbers_distribution, separator_distribution, sign_distribution 19 | 20 | 21 | sizes = [ 22 | 1024, 23 | 4096, 24 | 65536, 25 | 102400, 26 | 1024000, 27 | 10240000, 28 | ] 29 | 30 | separator_distributions = [ 31 | [1], # single character 32 | [1,1,1,1,1,1] # from 1 to 6 separators 33 | ] 34 | 35 | -------------------------------------------------------------------------------- /experiments/microbenchmarks/testcases.py: -------------------------------------------------------------------------------- 1 | from distribution import * 2 | 3 | def testcases(): 4 | sign_distribution = [1,1,1] 5 | 6 | for size, loops in sizes: 7 | for separator_distribution in separator_distributions: 8 | for k in range(1, 8 + 1): 9 | numbers_distribution = single_digit_distribution(k) 10 | yield size, loops, 'single', numbers_distribution, separator_distribution, sign_distribution 11 | 12 | for k in range(1, 8 + 1): 13 | numbers_distribution = uniform_distribution(k) 14 | yield size, loops, 'uniform', numbers_distribution, separator_distribution, sign_distribution 15 | 16 | for k in range(1, 8 + 1): 17 | numbers_distribution = normal_distribution(k, 1.0) 18 | yield size, loops, 'normal', numbers_distribution, separator_distribution, sign_distribution 19 | 20 | 21 | sizes = [ 22 | (1024, 10000), 23 | (4096, 10000), 24 | (65536, 1000), 25 | (102400, 100), 26 | ] 27 | 28 | separator_distributions = [ 29 | [1], # single character 30 | [1,1,1,1,1,1] # from 1 to 6 separators 31 | ] 32 | 33 | 
class HWEvents(object):
    """Hardware event counters of a single run."""

    __slots__ = ('branches', 'branch_misses', 'cache_references', 'cache_misses')

    def get_branch_miss_ratio(self):
        """Return branch_misses / branches as a float."""
        return float(self.branch_misses) / self.branches

    def get_cache_miss_ratio(self):
        """Return cache_misses / cache_references as a float."""
        return float(self.cache_misses) / self.cache_references


class Item(object):
    """One record of a hwevents result file (one input line)."""

    __slots__ = ('size', 'distribution_name', 'numbers_distribution',
                 'separators_distribution', 'sign_distribution', 'hwevents')


def load(path):
    """Yield an Item for every non-blank line of the file at *path*.

    A line has six ';'-separated fields: size, distribution name, three
    distributions and the event counters.  The distributions and the
    counters are Python literals (lists or tuples of numbers); the counters
    are read by position: branches, branch misses, cache references, cache
    misses.

    Raises ValueError or SyntaxError when a field is not a plain literal,
    IndexError when a line has too few fields.
    """
    # Local import: the literals come from a file, so they are parsed with
    # ast.literal_eval, which accepts only literals and never executes code
    # (eval() would run any expression found in the file).
    import ast

    with open(path, 'rt') as f:
        for line in f:
            line = line.rstrip('\r\n')
            if not line.strip():
                # tolerate blank lines, e.g. at the end of the file
                continue

            F = line.split(';')

            item = Item()
            item.size = int(F[0])
            item.distribution_name = F[1]
            item.numbers_distribution = tuple(ast.literal_eval(F[2].strip()))
            item.separators_distribution = tuple(ast.literal_eval(F[3].strip()))
            item.sign_distribution = tuple(ast.literal_eval(F[4].strip()))

            counters = ast.literal_eval(F[5].strip())
            hwevents = HWEvents()
            hwevents.branches = counters[0]
            hwevents.branch_misses = counters[1]
            hwevents.cache_references = counters[2]
            hwevents.cache_misses = counters[3]
            item.hwevents = hwevents

            yield item
if __name__ == '__main__':
    # Prints a table telling, per element size, how many spans the patterns
    # produced by the generator hold and how well that fills a register.
    gen = Generator()

    # element size -> list with the number of spans of every pattern
    conversion = {}

    for bi in gen.run():
        conversion.setdefault(bi.element_size, []).append(len(bi.spans))

    table = Table()
    table.add_header(["element size", "occurances", "%", "avg", "max", "utilization"])

    def get_capacity(element_size):
        """Return the number of elements of the given size used as 100%
        utilization; sizes other than 1, 2, 4 and 8 count as a single
        element."""
        capacities = {1: 16, 2: 8, 4: 4, 8: 2}
        return capacities.get(element_size, 1)


    for element_size in sorted(conversion.keys()):
        # named `span_counts` rather than `list` so the builtin stays usable
        span_counts = conversion[element_size]
        n = len(span_counts)
        avg = sum(span_counts)/float(n)
        utilization = 100 * avg/get_capacity(element_size)

        table.add_row([
            '%d' % element_size,
            '%d' % n,
            # NOTE(review): 65536 is presumably the total number of
            # patterns yielded by gen.run() -- confirm against generator.py
            '%0.1f%%' % (100 * n/65536.0),
            '%0.2f' % avg,
            '%d' % max(span_counts),
            '%0.1f%%' % utilization,
        ])

    print(table)
yield procedure, size, loops, 'uniform', numbers_distribution, separator_distribution, sign_distribution 16 | 17 | for k in range(1, 8 + 1): 18 | numbers_distribution = normal_distribution(k, 1.0) 19 | yield procedure, size, loops, 'normal', numbers_distribution, separator_distribution, sign_distribution 20 | 21 | 22 | procedures = [ 23 | 'scalar', 24 | 'sse', 25 | 'sse-block' 26 | ] 27 | 28 | sizes = [ 29 | (1024, 10000), 30 | (4096, 10000), 31 | (102400, 1000), 32 | (1024000, 100), 33 | (10240000, 10), 34 | ] 35 | 36 | separator_distributions = [ 37 | [1], # single character 38 | [1,1,1,1,1,1] # from 1 to 6 separators 39 | ] 40 | 41 | -------------------------------------------------------------------------------- /test/spanmaskhistogram.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | #include "sse/sse-matcher.h" 7 | #include "sse/sse-parser-signed.h" 8 | 9 | #include "application.h" 10 | 11 | class App: public Application { 12 | 13 | using Vector = std::vector; 14 | 15 | public: 16 | App(int argc, char** argv) : Application(argc, argv) {} 17 | 18 | private: 19 | virtual bool custom_run() override; 20 | virtual void custom_init() override; 21 | }; 22 | 23 | void App::custom_init() { 24 | quiet = true; 25 | } 26 | 27 | bool App::custom_run() { 28 | const auto tmp = generate_signed(); 29 | 30 | const char* separators = ",; "; 31 | sse::NaiveMatcher<8> matcher(separators); 32 | std::vector result; 33 | sse::parser_signed(tmp.data(), tmp.size(), separators, std::move(matcher), std::back_inserter(result)); 34 | sse::stats.span_mask_histogram_to_csv(stdout); 35 | 36 | return true; 37 | } 38 | 39 | int main(int argc, char* argv[]) { 40 | 41 | try { 42 | App app(argc, argv); 43 | 44 | app.run(); 45 | #ifndef USE_STATISTICS 46 | puts("Program was not compiled with USE_STATISTICS"); 47 | #endif 48 | return EXIT_SUCCESS; 49 | 50 | } catch (std::exception& e) { 51 | 
printf("%s\n", e.what()); 52 | return EXIT_FAILURE; 53 | } catch (Application::Exit&) { 54 | return EXIT_SUCCESS; 55 | } 56 | } 57 | 58 | -------------------------------------------------------------------------------- /src/block_info.cpp: -------------------------------------------------------------------------------- 1 | #include "block_info.h" 2 | 3 | #include "block_info.inl" 4 | 5 | namespace { 6 | 7 | void as_array(FILE* f, const uint8_t data[16]) { 8 | fprintf(f, "{"); 9 | fprintf(f, "%02x", data[0]); 10 | for (int i=1; i < 16; i++) 11 | fprintf(f, ", %02x", data[i]); 12 | fprintf(f, "}\n"); 13 | } 14 | 15 | const char* to_string(Conversion c) { 16 | switch (c) { 17 | case Conversion::Scalar: 18 | return "Scalar"; 19 | 20 | case Conversion::SSE1Digit: 21 | return "SSE1Digit"; 22 | 23 | case Conversion::SSE2Digits: 24 | return "SSE2Digits"; 25 | 26 | case Conversion::SSE4Digits: 27 | return "SSE4Digits"; 28 | 29 | case Conversion::SSE8Digits: 30 | return "SSE8Digits"; 31 | 32 | default: 33 | return ""; 34 | } 35 | } 36 | 37 | } // namespace 38 | 39 | void BlockInfo::dump(FILE* f) const { 40 | fprintf(f, "first_skip : %d\n", first_skip); 41 | fprintf(f, "total_skip : %d\n", total_skip); 42 | fprintf(f, "element_count : %d\n", element_count); 43 | fprintf(f, "conversion : %s\n", to_string(conversion_routine)); 44 | fprintf(f, "invalid_sign_mask : %04x\n", invalid_sign_mask); 45 | fprintf(f, "shuffle_digits : "); as_array(f, shuffle_digits); 46 | fprintf(f, "shuffle_signs : "); as_array(f, shuffle_signs); 47 | } 48 | -------------------------------------------------------------------------------- /include/sse/sse-parser-unsigned.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | 6 | #include "scalar/scalar-parse-unsigned.h" 7 | #include "sse-utils.h" 8 | #include "sse-convert.h" 9 | #include "sse-parser-common.h" 10 | #include "sse-parser-statistics.h" 11 | #include 
"block_info.h" 12 | 13 | namespace sse { 14 | 15 | template 16 | void parser(const char* string, size_t size, const char* separators, MATCHER matcher, INSERTER output) { 17 | 18 | char* data = const_cast(string); 19 | char* end = data + size; 20 | 21 | while (data + 16 < end) { 22 | const __m128i input = _mm_loadu_si128(reinterpret_cast<__m128i*>(data)); 23 | const __m128i t0 = decimal_digits_mask(input); 24 | const uint16_t digit_mask = _mm_movemask_epi8(t0); 25 | const uint16_t valid_mask = _mm_movemask_epi8(matcher.get_mask(input, t0)); 26 | 27 | STATS_INC(loops); 28 | STATS_SPAN_MASK(digit_mask); 29 | 30 | if (valid_mask != 0xffff) { 31 | throw std::runtime_error("Wrong character"); 32 | } 33 | 34 | if (digit_mask == 0) { 35 | data += 16; 36 | continue; 37 | } 38 | 39 | const BlockInfo& bi = blocks[digit_mask]; 40 | data = detail::parse_unsigned(bi, input, data, end, output); 41 | 42 | } // for 43 | 44 | // process the tail 45 | scalar::parse_unsigned(data, string + size - data, separators, output); 46 | } 47 | 48 | } // namespace sse 49 | -------------------------------------------------------------------------------- /include/test/application.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | #include "command_line.h" 9 | 10 | class Application { 11 | 12 | public: 13 | class Exit {}; 14 | class ArgumentError: public std::logic_error { 15 | public: 16 | ArgumentError(const std::string& msg) : std::logic_error(msg) {} 17 | }; 18 | 19 | protected: 20 | CommandLine cmdline; 21 | 22 | protected: 23 | bool quiet; 24 | 25 | private: 26 | size_t size; 27 | size_t debug_size; 28 | size_t loop_count; 29 | struct { 30 | std::discrete_distribution<> numbers; 31 | std::discrete_distribution<> separators; 32 | std::discrete_distribution<> sign; 33 | } distribution; 34 | bool sign_nonnull; 35 | std::string separators_set; 36 | 37 | std::random_device rd; 38 | 
std::mt19937 random; 39 | 40 | public: 41 | bool run(); 42 | 43 | protected: 44 | Application(int argc, char* argv[]); 45 | 46 | virtual bool custom_run() = 0; 47 | virtual void custom_init(); 48 | 49 | bool has_signed_distribution() const { 50 | return sign_nonnull; 51 | } 52 | 53 | std::string get_separators_set() const { 54 | return separators_set; 55 | } 56 | std::string generate_unsigned(); 57 | std::string generate_signed(); 58 | 59 | public: 60 | size_t get_size() const { 61 | return size; 62 | } 63 | 64 | size_t get_loop_count() const { 65 | return loop_count; 66 | } 67 | 68 | protected: 69 | virtual void print_custom_help() const; 70 | 71 | private: 72 | void init(); 73 | void print_help() const; 74 | }; 75 | 76 | -------------------------------------------------------------------------------- /experiments/hwevents/runner.py: -------------------------------------------------------------------------------- 1 | import subprocess 2 | from os.path import exists 3 | 4 | EXECUTABLE = './bin/benchmark-hwevents' 5 | LOOPS = 3 6 | 7 | 8 | class Runner(object): 9 | def __init__(self, size, numbers_distribution, separators_distribution, sign_distribution): 10 | 11 | assert len(numbers_distribution) > 0 12 | assert len(separators_distribution) > 0 13 | assert len(sign_distribution) > 0 14 | 15 | self.size = size 16 | self.numbers_distribution = numbers_distribution 17 | self.separators_distribution = separators_distribution 18 | self.sign_distribution = sign_distribution 19 | 20 | def run(self): 21 | args = self.__prepare_arguments() 22 | proc = subprocess.Popen(args, stdout=subprocess.PIPE) 23 | res = proc.communicate()[0] 24 | ret = proc.wait() 25 | if ret != 0: 26 | print(args) 27 | print(res) 28 | raise RuntimeError("program failed") 29 | 30 | return self.__parse_output(res) 31 | 32 | 33 | def __prepare_arguments(self): 34 | return ( 35 | EXECUTABLE, 36 | '--size=%d' % self.size, 37 | '--num=%s' % format_distribution(self.numbers_distribution), 38 | '--sep=%s' % 
format_distribution(self.separators_distribution), 39 | '--sign=%s' % format_distribution(self.sign_distribution), 40 | '--loops=%d' % LOOPS, 41 | '--csv-output' 42 | ) 43 | 44 | 45 | def __parse_output(self, output): 46 | return list(map(int, (s.strip() for s in output.split(b',')))) 47 | 48 | 49 | def format_distribution(dist): 50 | return ','.join(map(str, dist)) 51 | -------------------------------------------------------------------------------- /experiments/spanmaskhistogram/loader.py: -------------------------------------------------------------------------------- 1 | from hwevents_loader import load as load_hwevents 2 | from microbenchmark_loader import load as load_cycles 3 | 4 | def load(spanmaskhistogram, hwevents, microbenchmark): 5 | 6 | microbenchmarks_dict = {} 7 | for item in load_cycles(microbenchmark): 8 | key = (item.size, item.num_distribution, item.sep_distribution, item.sign_distribution) 9 | microbenchmarks_dict[key] = item.cycles['SSE'] 10 | 11 | hwevents_dict = {} 12 | for item in load_hwevents(hwevents): 13 | key = (item.size, item.numbers_distribution, item.separators_distribution, item.sign_distribution) 14 | hwevents_dict[key] = item.hwevents 15 | 16 | for item in load_histogram(spanmaskhistogram): 17 | key = (item.size, item.numbers_distribution, item.separators_distribution, item.sign_distribution) 18 | item.hwevents = hwevents_dict[key] 19 | item.cycles = microbenchmarks_dict.get(key, None) 20 | 21 | yield item 22 | 23 | 24 | class Item(object): 25 | __slots__ = ('size', 'distribution_name', 'numbers_distribution', 26 | 'separators_distribution', 'sign_distribution', 27 | 'histogram', 'hwevents', 'cycles') 28 | 29 | def load_histogram(path): 30 | with open(path, 'rt') as f: 31 | for line in f: 32 | F = line.split(';') 33 | 34 | item = Item() 35 | item.size = int(F[0]) 36 | item.distribution_name = F[1] 37 | item.numbers_distribution = tuple(eval(F[2])) 38 | item.separators_distribution = tuple(eval(F[3])) 39 | item.sign_distribution 
= tuple(eval(F[4])) 40 | item.histogram = eval(F[5]) 41 | 42 | yield item 43 | 44 | -------------------------------------------------------------------------------- /include/test/command_line.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | 7 | class CommandLine { 8 | 9 | std::vector args; 10 | 11 | class OptionNotFound: public std::logic_error { 12 | public: 13 | OptionNotFound(const std::string& s) : std::logic_error(s) {} 14 | }; 15 | 16 | public: 17 | CommandLine(int argc, char** argv); 18 | 19 | public: 20 | bool empty() const { return args.size() == 1; } 21 | 22 | // like "-h", "--version" 23 | bool has_flag(const std::string& flag) const; 24 | 25 | // for "--name=value" returns "value" 26 | std::string get_value(const std::string& option) const; 27 | std::string get_value(const std::string& option, const std::string& defvalue) const; 28 | bool has_value(const std::string& option) const; 29 | 30 | template 31 | T parse_value(const std::string& option, CONVERSION conv) { 32 | try { 33 | return conv(get_value(option)); 34 | } catch (OptionNotFound&) { 35 | throw; 36 | } catch (std::exception& e) { 37 | const auto msg = "Wrong value of '" + option + "': " + std::string(e.what()); 38 | throw std::logic_error(msg); 39 | } 40 | } 41 | 42 | template 43 | T parse_value(const std::string& option, CONVERSION conv, const T& defvalue) { 44 | try { 45 | return conv(get_value(option)); 46 | } catch (OptionNotFound&) { 47 | return defvalue; 48 | } catch (std::exception& e) { 49 | const auto msg = "Wrong value of '" + option + "': " + std::string(e.what()); 50 | throw std::logic_error(msg); 51 | } 52 | } 53 | 54 | const std::string& get_program_name() const; 55 | }; 56 | -------------------------------------------------------------------------------- /experiments/spanmaskhistogram/runner.py: -------------------------------------------------------------------------------- 1 | 
import subprocess 2 | from os.path import exists 3 | 4 | EXECUTABLE = './bin/spanmaskhistogram' 5 | 6 | 7 | class Runner(object): 8 | def __init__(self, size, numbers_distribution, separators_distribution, sign_distribution): 9 | 10 | assert len(numbers_distribution) > 0 11 | assert len(separators_distribution) > 0 12 | assert len(sign_distribution) > 0 13 | 14 | self.size = size 15 | self.numbers_distribution = numbers_distribution 16 | self.separators_distribution = separators_distribution 17 | self.sign_distribution = sign_distribution 18 | 19 | def run(self): 20 | args = self.__prepare_arguments() 21 | proc = subprocess.Popen(args, stdout=subprocess.PIPE) 22 | res = proc.communicate()[0] 23 | ret = proc.wait() 24 | if ret != 0: 25 | print(args) 26 | print(res) 27 | raise RuntimeError("program failed") 28 | 29 | return self.__parse_output(res) 30 | 31 | 32 | def __prepare_arguments(self): 33 | return ( 34 | EXECUTABLE, 35 | '--size=%d' % self.size, 36 | '--num=%s' % format_distribution(self.numbers_distribution), 37 | '--sep=%s' % format_distribution(self.separators_distribution), 38 | '--sign=%s' % format_distribution(self.sign_distribution), 39 | ) 40 | 41 | 42 | def __parse_output(self, output): 43 | res = [] 44 | for line in output.splitlines(): 45 | F = line.split(b',') 46 | mask = int(F[0].strip(), 16) 47 | count = int(F[1].strip()) 48 | 49 | res.append((mask, count)) 50 | 51 | return res 52 | 53 | 54 | def format_distribution(dist): 55 | return ','.join(map(str, dist)) 56 | -------------------------------------------------------------------------------- /include/hybrid-parser.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include "sse/sse-utils.h" 7 | 8 | namespace scalar { 9 | 10 | template 11 | uint32_t convert(const char* s, uint32_t prev) { 12 | return convert(s + 1, prev * 10 + uint8_t(s[0]) - '0'); 13 | } 14 | 15 | template<> 16 | uint32_t 
convert<0>(const char* /*s*/, uint32_t prev) { 17 | return prev; 18 | } 19 | 20 | template 21 | uint32_t convert(const char* s) { 22 | return convert(s, 0); 23 | } 24 | 25 | } 26 | 27 | template 28 | void hybrid_parser(const char* string, size_t size, const char* separators, MATCHER matcher, INSERTER output) { 29 | char* data = const_cast(string); 30 | char* end = data + size; 31 | bool has_last = false; 32 | uint32_t val = 0; 33 | while (data + 16 < end) { 34 | const __m128i input = _mm_loadu_si128(reinterpret_cast<__m128i*>(data)); 35 | const __m128i t0 = sse::decimal_digits_mask(input); 36 | const uint16_t digit_mask = _mm_movemask_epi8(t0); 37 | const uint16_t valid_mask = _mm_movemask_epi8(matcher.get_mask(input, t0)); 38 | 39 | if (valid_mask != 0xffff) { 40 | throw std::runtime_error("Wrong character"); 41 | } 42 | 43 | if (digit_mask == 0) { 44 | data += 16; 45 | continue; 46 | } 47 | 48 | switch (digit_mask & 0xff) { 49 | #include "hybrid-parser-unsigned.inl" 50 | } 51 | 52 | data += 8; 53 | 54 | switch (digit_mask >> 8) { 55 | #include "hybrid-parser-unsigned.inl" 56 | } 57 | 58 | data += 8; 59 | } // for 60 | 61 | // process the tail 62 | scalar::parse_unsigned(data, string + size - data, separators, output); 63 | } 64 | -------------------------------------------------------------------------------- /experiments/prettyprint.py: -------------------------------------------------------------------------------- 1 | class Parameters(object): 2 | def __init__(self, weight, title): 3 | self.weight = weight 4 | self.title = title 5 | 6 | def get_num_distribution_parameters(distribution_name, num_distribution): 7 | 8 | if type(num_distribution) is str: 9 | distr = map(int, num_distribution.split(',')) 10 | else: 11 | distr = num_distribution 12 | 13 | def format_count(count, noun): 14 | if count == 1: 15 | return '%d %s' % (count, noun) 16 | else: 17 | return '%d %ss' % (count, noun) 18 | 19 | if distribution_name == 'single': 20 | def get_fixed(): 21 | return 
distr.index(1) + 1 22 | 23 | n = get_fixed() 24 | return Parameters(n, format_count(n, 'digit')) 25 | 26 | elif distribution_name == 'normal': 27 | def get_max(): 28 | return distr.index(max(distr)) + 1 29 | 30 | n = get_max() 31 | return Parameters(n, "max at %d digit" % n) 32 | 33 | elif distribution_name == 'uniform': 34 | n = len(distr) 35 | return Parameters(n, "1 .. %s" % format_count(n, 'digit')) 36 | 37 | assert False 38 | 39 | 40 | def get_distribution_title(distribution_name): 41 | if distribution_name == 'single': 42 | return 'Fixed length' 43 | elif distribution_name == 'normal': 44 | return 'Gaussian distribution' 45 | elif distribution_name == 'uniform': 46 | return 'Uniform distribution' 47 | 48 | assert False 49 | 50 | 51 | def get_separator_title(sep_distribution): 52 | if type(sep_distribution) is str: 53 | sep = sep_distribution.split(',') 54 | else: 55 | sep = sep_distribution 56 | 57 | if sep == ['1']: 58 | separator = 'single separator character' 59 | else: 60 | k = len(sep) 61 | separator = '1 .. 
%d separator characters' % k 62 | 63 | return separator 64 | 65 | -------------------------------------------------------------------------------- /include/sse/sse-utils.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | 6 | namespace sse { 7 | 8 | __m128i decimal_digits_mask(const __m128i input) { 9 | const __m128i ascii0 = _mm_set1_epi8('0'); 10 | const __m128i after_ascii9 = _mm_set1_epi8('9' + 1); 11 | 12 | const __m128i t0 = _mm_cmplt_epi8(input, ascii0); // t1 = (x < '0') 13 | const __m128i t1 = _mm_cmplt_epi8(input, after_ascii9); // t0 = (x <= '9') 14 | 15 | return _mm_andnot_si128(t0, t1); // x <= '9' and x >= '0' 16 | } 17 | 18 | __m128i sign_mask(const __m128i input) { 19 | const __m128i t0 = _mm_cmpeq_epi8(input, _mm_set1_epi8('+')); 20 | const __m128i t1 = _mm_cmpeq_epi8(input, _mm_set1_epi8('-')); 21 | 22 | return _mm_or_si128(t0, t1); 23 | } 24 | 25 | uint64_t compose_bitmask(const __m128i bytemask0, 26 | const __m128i bytemask1, 27 | const __m128i bytemask2, 28 | const __m128i bytemask3) { 29 | 30 | const uint64_t mask0 = _mm_movemask_epi8(bytemask0); 31 | const uint64_t mask1 = _mm_movemask_epi8(bytemask1); 32 | const uint64_t mask2 = _mm_movemask_epi8(bytemask2); 33 | const uint64_t mask3 = _mm_movemask_epi8(bytemask3); 34 | 35 | return mask0 36 | | (mask1 << (1*16)) 37 | | (mask2 << (2*16)) 38 | | (mask3 << (3*16)); 39 | } 40 | 41 | __m128i from_epu16(const uint16_t x, uint8_t one = 0xff) { 42 | 43 | uint8_t tmp[16]; 44 | memset(tmp, 0, sizeof(tmp)); 45 | 46 | int idx = 0; 47 | uint16_t val = x; 48 | while (val) { 49 | if (val & 0x0001) { 50 | tmp[idx] = one; 51 | } 52 | 53 | val >>= 1; 54 | idx += 1; 55 | } 56 | 57 | return _mm_loadu_si128((const __m128i*)tmp); 58 | } 59 | 60 | } 61 | -------------------------------------------------------------------------------- /experiments/microbenchmarks/loader.py: 
-------------------------------------------------------------------------------- 1 | import prettyprint 2 | 3 | class Item(object): 4 | __slots__ = ( 5 | "size", 6 | "iterations", 7 | "distribution_name", 8 | "num_distribution", 9 | "sep_distribution", 10 | "sign_distribution", 11 | "cycles", 12 | ) 13 | 14 | def get_num_distribution_title(self): 15 | return '%s (%s)' % (prettyprint.get_distribution_title(self.distribution_name), \ 16 | prettyprint.get_num_distribution_parameters(self.distribution_name, self.num_distribution).title) 17 | 18 | def get_num_distribution_weight(self): 19 | return prettyprint.get_num_distribution_parameters(self.distribution_name, self.num_distribution).weight 20 | 21 | def get_sep_distribution_title(self): 22 | return prettyprint.get_separator_title(self.sep_distribution) 23 | 24 | 25 | procedures = ( 26 | 'scalar', 27 | 'scalar (std)', 28 | 'scalar (hybrid)', 29 | 'SSE', 30 | 'SSE (block)', 31 | 'SSE (simplified)', 32 | ) 33 | 34 | reference_procedure = 'scalar' 35 | 36 | assert reference_procedure in procedures 37 | 38 | def load(path): 39 | with open(path, 'rt') as f: 40 | for item in load_file(f): 41 | yield item 42 | 43 | 44 | def load_file(file): 45 | 46 | for line in file: 47 | 48 | item = Item() 49 | F = line.split(';') 50 | 51 | item.size = int(F[0]) 52 | item.iterations = int(F[1]) 53 | item.distribution_name = F[2] 54 | 55 | def get_tuple(string): 56 | tmp = eval(string) 57 | if type(tmp) is int: 58 | return (1,) 59 | else: 60 | assert type(tmp) is tuple 61 | return tmp 62 | 63 | item.num_distribution = get_tuple(F[3]) 64 | item.sep_distribution = get_tuple(F[4]) 65 | item.sign_distribution = get_tuple(F[5]) 66 | item.cycles = eval(F[6]) 67 | 68 | yield item 69 | 70 | -------------------------------------------------------------------------------- /experiments/spanmaskhistogram/microbenchmark_loader.py: -------------------------------------------------------------------------------- 1 | import prettyprint 2 | 3 | class 
Item(object): 4 | __slots__ = ( 5 | "size", 6 | "iterations", 7 | "distribution_name", 8 | "num_distribution", 9 | "sep_distribution", 10 | "sign_distribution", 11 | "cycles", 12 | ) 13 | 14 | def get_num_distribution_title(self): 15 | return '%s (%s)' % (prettyprint.get_distribution_title(self.distribution_name), \ 16 | prettyprint.get_num_distribution_parameters(self.distribution_name, self.num_distribution).title) 17 | 18 | def get_num_distribution_weight(self): 19 | return prettyprint.get_num_distribution_parameters(self.distribution_name, self.num_distribution).weight 20 | 21 | def get_sep_distribution_title(self): 22 | return prettyprint.get_separator_title(self.sep_distribution) 23 | 24 | 25 | procedures = ( 26 | 'scalar', 27 | 'scalar (std)', 28 | 'scalar (hybrid)', 29 | 'SSE', 30 | 'SSE (block)', 31 | 'SSE (simplified)', 32 | ) 33 | 34 | reference_procedure = 'scalar' 35 | 36 | assert reference_procedure in procedures 37 | 38 | def load(path): 39 | with open(path, 'rt') as f: 40 | for item in load_file(f): 41 | yield item 42 | 43 | 44 | def load_file(file): 45 | 46 | for line in file: 47 | 48 | item = Item() 49 | F = line.split(';') 50 | 51 | item.size = int(F[0]) 52 | item.iterations = int(F[1]) 53 | item.distribution_name = F[2] 54 | 55 | def get_tuple(string): 56 | tmp = eval(string) 57 | if type(tmp) is int: 58 | return (1,) 59 | else: 60 | assert type(tmp) is tuple 61 | return tmp 62 | 63 | item.num_distribution = get_tuple(F[3]) 64 | item.sep_distribution = get_tuple(F[4]) 65 | item.sign_distribution = get_tuple(F[5]) 66 | item.cycles = eval(F[6]) 67 | 68 | yield item 69 | 70 | -------------------------------------------------------------------------------- /scripts/hybrid-unsigned.py: -------------------------------------------------------------------------------- 1 | from hybrid import GeneratorBase 2 | 3 | class GenerateUnsingedParser(GeneratorBase): 4 | 5 | def before(self): 6 | self.lines.append('case 0x%02x:' % self.number) 7 | 8 | def 
after(self): 9 | self.lines.append('break;') 10 | 11 | def empty(self): 12 | pass 13 | 14 | def full(self): 15 | l = self.lines 16 | 17 | l.append("if (has_last) {") 18 | l.append(" val = %s;" % self.expression(self.span, "val")) 19 | l.append("} else {") 20 | l.append(" val = %s;" % self.expression(self.span)) 21 | l.append("}") 22 | l.append("has_last = true;") 23 | 24 | def finalize_previous(self): 25 | l = self.lines 26 | 27 | l.append("if (has_last) {") 28 | l.append(" has_last = false;") 29 | l.append(" *output++ = val;") 30 | l.append("}") 31 | 32 | def first_continuation(self): 33 | l = self.lines 34 | 35 | l.append("if (has_last) {") 36 | l.append(" val = %s;" % self.expression(self.span, "val")) 37 | l.append(" has_last = false;") 38 | l.append("} else {") 39 | l.append(" val = %s;" % self.expression(self.span)) 40 | l.append("}") 41 | l.append("*output++ = val;") 42 | 43 | def whole(self): 44 | self.lines.append("*output++ = %s;" % self.expression(self.span)) 45 | 46 | def last(self): 47 | l = self.lines 48 | 49 | l.append("val = %s;" % self.expression(self.span)) 50 | l.append("has_last = true;") 51 | 52 | def expression(self, span, arg = None): 53 | result = "scalar::convert<%d>(data" % span.digits() 54 | if span.first != 0: 55 | result += ' + %d' % span.first; 56 | 57 | if arg is not None: 58 | result += ', %s' % arg 59 | 60 | result += ')' 61 | return result 62 | 63 | 64 | if __name__ == '__main__': 65 | gen = GenerateUnsingedParser() 66 | for line in gen.get(): 67 | print(line) 68 | 69 | -------------------------------------------------------------------------------- /test/statistics.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | #include "input_generator.h" 7 | #include "sse/sse-matcher.h" 8 | #include "sse/sse-parser-unsigned.h" 9 | #include "sse/sse-parser-signed.h" 10 | 11 | #include "application.h" 12 | 13 | class StatisticsApp: public Application 
{ 14 | 15 | using Vector = std::vector; 16 | 17 | public: 18 | StatisticsApp(int argc, char** argv) : Application(argc, argv) {} 19 | 20 | private: 21 | virtual bool custom_run() override; 22 | 23 | private: 24 | void run_unsigned(); 25 | void run_signed(); 26 | }; 27 | 28 | bool StatisticsApp::custom_run() { 29 | if (has_signed_distribution()) { 30 | run_signed(); 31 | } else { 32 | run_unsigned(); 33 | } 34 | 35 | return true; 36 | } 37 | 38 | void StatisticsApp::run_unsigned() { 39 | 40 | const auto tmp = generate_unsigned(); 41 | 42 | const char* separators = ",; "; 43 | sse::NaiveMatcher<8> matcher(separators); 44 | std::vector result; 45 | sse::parser(tmp.data(), tmp.size(), separators, std::move(matcher), std::back_inserter(result)); 46 | sse::stats.print(); 47 | } 48 | 49 | void StatisticsApp::run_signed() { 50 | 51 | const auto tmp = generate_signed(); 52 | 53 | const char* separators = ",; "; 54 | sse::NaiveMatcher<8> matcher(separators); 55 | std::vector result; 56 | sse::parser_signed(tmp.data(), tmp.size(), separators, std::move(matcher), std::back_inserter(result)); 57 | sse::stats.print(); 58 | } 59 | 60 | int main(int argc, char* argv[]) { 61 | 62 | try { 63 | StatisticsApp app(argc, argv); 64 | 65 | app.run(); 66 | #ifndef USE_STATISTICS 67 | puts("Program was not compiled with USE_STATISTICS"); 68 | #endif 69 | return EXIT_SUCCESS; 70 | 71 | } catch (std::exception& e) { 72 | printf("%s\n", e.what()); 73 | return EXIT_FAILURE; 74 | } catch (Application::Exit&) { 75 | return EXIT_SUCCESS; 76 | } 77 | } 78 | 79 | -------------------------------------------------------------------------------- /test/utils/command_line.cpp: -------------------------------------------------------------------------------- 1 | #include "command_line.h" 2 | 3 | #include 4 | #include 5 | #include 6 | 7 | CommandLine::CommandLine(int argc, char** argv) { 8 | for (int i=0; i < argc; i++) { 9 | args.emplace_back(argv[i]); 10 | } 11 | } 12 | 13 | bool 
CommandLine::has_flag(const std::string& flag) const { 14 | return std::find(args.begin(), args.end(), flag) != args.end(); 15 | } 16 | 17 | namespace { 18 | 19 | // is s2 prefix of s1 20 | bool is_prefix(const std::string s1, const std::string s2) { 21 | if (s2.size() > s1.size()) { 22 | return false; 23 | } 24 | 25 | if (s1.size() == s2.size()) { 26 | return s1 == s2; 27 | } 28 | 29 | return memcmp(s1.c_str(), s2.c_str(), s2.size()) == 0; 30 | } 31 | } 32 | 33 | std::string CommandLine::get_value(const std::string& option) const { 34 | 35 | for (size_t i=0; i < args.size(); i++) { 36 | if (args[i] == option) { 37 | try { 38 | return args.at(i + 1); 39 | } catch (std::out_of_range&) { 40 | throw OptionNotFound("Argument " + args[i] + " should be followed by a value"); 41 | } 42 | } 43 | } 44 | 45 | const auto long_option = option + "="; 46 | for (const auto& arg: args) { 47 | if (is_prefix(arg, long_option)) { 48 | return arg.substr(long_option.size()); 49 | } 50 | } 51 | 52 | throw OptionNotFound("Argument " + option + " not found"); 53 | } 54 | 55 | std::string CommandLine::get_value(const std::string& option, const std::string& defvalue) const { 56 | try { 57 | return get_value(option); 58 | } catch (std::logic_error&) { 59 | return defvalue; 60 | } 61 | } 62 | 63 | bool CommandLine::has_value(const std::string& option) const { 64 | try { 65 | get_value(option); 66 | return true; 67 | } catch (std::logic_error&) { 68 | return false; 69 | } 70 | } 71 | 72 | const std::string& CommandLine::get_program_name() const { 73 | return args[0]; 74 | } 75 | 76 | -------------------------------------------------------------------------------- /experiments/microbenchmarks/runner.py: -------------------------------------------------------------------------------- 1 | import subprocess 2 | 3 | EXECUTABLE = './bin/benchmark-cpuclocks' 4 | 5 | class Runner(object): 6 | def __init__(self, size, loops, numbers_distribution, separators_distribution, sign_distribution): 7 | 8 | 
assert len(numbers_distribution) > 0 9 | assert len(separators_distribution) > 0 10 | assert len(sign_distribution) > 0 11 | 12 | self.size = size 13 | self.loops = loops 14 | self.numbers_distribution = numbers_distribution 15 | self.separators_distribution = separators_distribution 16 | self.sign_distribution = sign_distribution 17 | 18 | def run(self): 19 | args = self.__prepare_arguments() 20 | proc = subprocess.Popen(args, stdout=subprocess.PIPE) 21 | res = proc.communicate()[0] 22 | ret = proc.wait() 23 | if ret != 0: 24 | print(args) 25 | print(res) 26 | raise RuntimeError("program failed") 27 | 28 | return self.__parse_output(res) 29 | 30 | 31 | def __prepare_arguments(self): 32 | return ( 33 | EXECUTABLE, 34 | '--size=%d' % self.size, 35 | '--loops=%d' % self.loops, 36 | '--num=%s' % format_distribution(self.numbers_distribution), 37 | '--sep=%s' % format_distribution(self.separators_distribution), 38 | '--sign=%s' % format_distribution(self.sign_distribution), 39 | ) 40 | 41 | 42 | def __parse_output(self, output): 43 | d = {} 44 | 45 | for line in output.splitlines(): 46 | if 'cycle/op' not in line: 47 | continue 48 | 49 | # line = "scalar : 14.503 cycle/op (best) 15.494 cycle/op (avg)" 50 | name, tmp = line.split(':') 51 | name = name.strip() 52 | tmp = tmp.split() 53 | best = float(tmp[0]) 54 | avg = float(tmp[3]) 55 | 56 | d[name] = (best, avg) 57 | 58 | return d 59 | 60 | 61 | def format_distribution(dist): 62 | return ','.join(map(str, dist)) 63 | -------------------------------------------------------------------------------- /include/test/linux-perf-events.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include // for syscall 4 | #include // for ioctl 5 | #include // for __NR_perf_event_open 6 | #include // for perf event constants 7 | 8 | #include // for errno 9 | #include // for memset 10 | #include 11 | 12 | 13 | template 14 | class LinuxEvents { 15 | 16 | int fd; 17 | 
import subprocess

EXECUTABLE = './bin/benchmark'


class Runner(object):
    """Run the benchmark executable for a single test configuration.

    The distributions are non-empty sequences; each one is passed to the
    executable as a comma-separated list (see ``format_distribution``).
    """

    def __init__(self, procedure, size, loops, numbers_distribution, separators_distribution, sign_distribution):

        assert len(numbers_distribution) > 0
        assert len(separators_distribution) > 0
        assert len(sign_distribution) > 0

        self.procedure = procedure
        self.size = size
        self.loops = loops
        self.numbers_distribution = numbers_distribution
        self.separators_distribution = separators_distribution
        self.sign_distribution = sign_distribution

    def run(self):
        """Execute the benchmark and return the value reported under ``time``.

        Raises RuntimeError when the executable exits with a non-zero code;
        the command line and the captured output are printed first.
        """
        args = self.__prepare_arguments()
        # universal_newlines=True yields text (not bytes) on Python 3 as well,
        # which is what __parse_output expects.
        proc = subprocess.Popen(args, stdout=subprocess.PIPE, universal_newlines=True)
        res = proc.communicate()[0]
        # communicate() already waited for the process to finish
        if proc.returncode != 0:
            print(args)
            print(res)
            raise RuntimeError("program failed")

        d = self.__parse_output(res)
        return d['time']

    def __prepare_arguments(self):
        """Build the command line (a tuple) for the benchmark executable."""
        return (
            EXECUTABLE,
            '--procedure=%s' % self.procedure,
            '--size=%d' % self.size,
            '--loops=%d' % self.loops,
            '--num=%s' % format_distribution(self.numbers_distribution),
            '--sep=%s' % format_distribution(self.separators_distribution),
            '--sign=%s' % format_distribution(self.sign_distribution),
        )

    def __parse_output(self, output):
        """Parse ``key: value`` lines into a dictionary.

        Lines without a colon are ignored. When the first word of the value
        is an integer, that integer is stored; otherwise the stripped text
        of the value is kept.
        """
        d = {}

        for line in output.splitlines():
            tmp = line.split(':')
            if len(tmp) < 2:
                continue

            key = tmp[0].strip()
            value = tmp[1].strip()
            d[key] = value

            try:
                d[key] = int(value.split()[0])
            except (IndexError, ValueError):
                # empty value or not a number: keep the text
                pass

        return d


def format_distribution(dist):
    """Return the items of ``dist`` joined with commas."""
    return ','.join(map(str, dist))
| with open(path, 'rt') as f: 17 | data = list(load(f)) 18 | 19 | bysize = lambda item: item.size 20 | data = groupby(data, bysize) 21 | self.report = [] 22 | for size in sorted(data): 23 | collection = data[size] 24 | sortby = lambda item: (item.distribution_name, item.num_distribution, item.sep_distribution) 25 | collection.sort(key=sortby) 26 | 27 | res = [] 28 | for item in collection: 29 | title = self.get_title(item) 30 | table = self.prepare_table(item) 31 | 32 | res.append((title, table)) 33 | 34 | self.report.append(( 35 | 'Input size %d bytes' % size, 36 | res 37 | )) 38 | 39 | def get(self): 40 | return self.report 41 | 42 | 43 | def get_title(self, item): 44 | return '%s --- %s' % (item.get_num_distribution_title(), item.get_sep_distribution_title()) 45 | 46 | 47 | def prepare_table(self, item): 48 | table = Table() 49 | table.add_header(["", ("cycles per one input byte", 2), "speed-up"]) 50 | table.add_header(["procedure", "min", "avg", "(min)"]) 51 | 52 | refmin, refavg = item.cycles[reference_procedure] 53 | 54 | for key in procedures: 55 | try: 56 | min, avg = item.cycles[key] 57 | except KeyError: 58 | continue 59 | 60 | table.add_row([ 61 | key, 62 | '%5.3f' % min, 63 | '%5.3f' % avg, 64 | '%0.2f' % (refmin/min) 65 | ]) 66 | 67 | return table 68 | 69 | 70 | def main(): 71 | report = Report(sys.argv[1]) 72 | writer = RestWriter(sys.stdout, report.get()) 73 | try: 74 | restsection = sys.argv[2] 75 | except IndexError: 76 | restsection = '-~' 77 | 78 | writer.write(restsection) 79 | 80 | 81 | if __name__ == '__main__': 82 | main() 83 | -------------------------------------------------------------------------------- /experiments/README.rst: -------------------------------------------------------------------------------- 1 | Scripts purpose 2 | -------------------------------------------------------------------------------- 3 | 4 | ``microbenchmarks/*.py`` --- run all implementations (scalar and vectorized) 5 | on rather small inputs and count how
many **CPU cycles** are needed to complete 6 | conversion. Subdirectory ``results`` contains produced files from some computers. 7 | 8 | It uses ``bin/benchmark-cpuclocks`` utility; makefile targets 9 | ``microbenchmarks.txt`` and ``microbenchmarks.rst``. 10 | 11 | -------------------------------------------------------------------------------- 12 | 13 | ``speedup-comparison/report.py`` --- from ``microbenchmarks.txt`` produces a 14 | summary speedup array (min/avg/max) for all methods and input 15 | size/distribution. Usage:: 16 | 17 | $ python speedup-comparison/report.py microbenchmarks/results/file.txt > file.rst 18 | 19 | The result of this script is shown in article__. 20 | 21 | __ http://0x80.pl/articles/simd-parsing-int-sequences.html#core-i7-results 22 | 23 | -------------------------------------------------------------------------------- 24 | 25 | ``overalltests/*.py`` --- run scalar and SSE implementations on small and huge 26 | input, measure **wall clock** time of algorithms. Subdirectory ``results`` contains 27 | produced files from some computers. 28 | 29 | It uses ``bin/benchmark`` utility; makefile targets ``overall.txt``, 30 | ``report-overall.rst`` (all numbers are shown), ``report-overall-short.rst`` 31 | (just min/avg/max speedup is shown). 32 | 33 | -------------------------------------------------------------------------------- 34 | 35 | ``hwevents/*.py`` --- runs SSE searches and counts hardware events: branch 36 | taken & misses and cache references & misses. 37 | 38 | It uses ``bin/benchmark-hwevents`` utility; makefile target: ``hwevents.txt``.
39 | 40 | -------------------------------------------------------------------------------- 41 | 42 | ``spanmaskhistogram/*.py`` --- for different input sizes and input data 43 | distributions it gets the following parameters of **SSE procedure**: 44 | 45 | * ``span_pattern`` statistics usage (runs ``bin/statistics``); 46 | * running time in CPU clocks (from ``measurements.txt``); 47 | * branch and cache events count (from ``hwevents.txt``). 48 | 49 | All these data are collated in a single table; makefile target 50 | ``spanmaskhistogram.rst``. 51 | 52 | The result of this script is shown in article__. 53 | 54 | __ http://0x80.pl/articles/simd-parsing-int-sequences.html#sse-conversion-runtime-analysis 55 | 56 | -------------------------------------------------------------------------------- /include/sse/sse-parser-statistics.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | 7 | #ifdef USE_STATISTICS 8 | #define STATS_ADD(__field__, val) sse::stats.__field__ += (val) 9 | #define STATS_INC(__field__) STATS_ADD(__field__, 1) 10 | #define STATS_SPAN_MASK(mask) sse::stats.span_masks_histogram[mask] += 1 11 | #else 12 | #define STATS_ADD(__field__, val) 13 | #define STATS_INC(__field__) 14 | #define STATS_SPAN_MASK(mask) 15 | #endif 16 | 17 | namespace sse { 18 | 19 | struct SSEStatistics { 20 | size_t scalar_conversions = 0; 21 | 22 | size_t digit1_calls = 0; 23 | size_t digit1_converted = 0; 24 | size_t digit2_calls = 0; 25 | size_t digit2_converted = 0; 26 | size_t digit3_calls = 0; 27 | size_t digit3_converted = 0; 28 | size_t digit4_calls = 0; 29 | size_t digit4_converted = 0; 30 | size_t digit8_calls = 0; 31 | size_t digit8_converted = 0; 32 | 33 | size_t get_all_converted() const { 34 | return scalar_conversions 35 | + get_SSE_converted(); 36 | } 37 | 38 | size_t get_SSE_converted() const { 39 | return digit1_converted 40 | + digit2_converted 41 | + digit3_converted 42 | +
digit4_converted 43 | + digit8_converted; 44 | } 45 | }; 46 | 47 | struct Statistics { 48 | 49 | std::map total_skip_histogram; 50 | std::map span_masks_histogram; 51 | 52 | size_t loops = 0; 53 | 54 | SSEStatistics unsigned_path; 55 | SSEStatistics signed_path; 56 | 57 | Statistics(); 58 | 59 | size_t get_all_converted() const { 60 | return unsigned_path.get_all_converted() 61 | + signed_path.get_all_converted(); 62 | } 63 | 64 | size_t get_scalar_conversions() const { 65 | return unsigned_path.scalar_conversions 66 | + signed_path.scalar_conversions; 67 | } 68 | 69 | size_t get_SSE_converted() const { 70 | return unsigned_path.get_SSE_converted() 71 | + signed_path.get_SSE_converted(); 72 | } 73 | 74 | void print(FILE* file) const; 75 | void span_mask_histogram_to_csv(FILE* file) const; 76 | 77 | void print() const { 78 | print(stdout); 79 | } 80 | 81 | void init(); 82 | }; 83 | 84 | extern Statistics stats; 85 | 86 | } // namespace sse 87 | 88 | -------------------------------------------------------------------------------- /test/utils/input_generator.cpp: -------------------------------------------------------------------------------- 1 | #include "input_generator.h" 2 | 3 | #include 4 | 5 | static const std::string numbers = "0123456789"; 6 | 7 | static 8 | std::string random_string(size_t n, const std::string& set) { 9 | 10 | std::string result(n, ' '); 11 | for (size_t i=0; i < n; i++) { 12 | result[i] = set[rand() % set.size()]; 13 | } 14 | 15 | return result; 16 | } 17 | 18 | std::string generate_unsigned(size_t size, 19 | const std::string& separators_set, 20 | std::mt19937 random, 21 | std::discrete_distribution<> num, 22 | std::discrete_distribution<> sep) { 23 | 24 | std::string result; 25 | 26 | while (true) { 27 | const size_t n = num(random) + 1; 28 | const size_t k = sep(random) + 1; 29 | 30 | const std::string number = random_string(n, numbers); 31 | const std::string sep = random_string(k, separators_set); 32 | 33 | if (result.size() + n + k < 
# Template of a single BlockInfo initializer; the leading C comment carries
# the block id and both estimated costs for human inspection.
ITEM_PATTERN = \
    "/* %(ID)04x %(SCALAR_COST)5.2f %(SSE_COST)5.2f */ {" \
    "%(FIRST_SKIP)s," \
    "%(TOTAL_SKIP)s," \
    "%(ELEMENT_COUNT)s," \
    "%(CONVERSION)s," \
    "0x%(INVALID_SIGN_MASK)04x," \
    "{%(SHUFFLE_DIGITS)s}," \
    "{%(SHUFFLE_SIGNS)s}" \
    "}"

# Template of the whole generated C++ header.
FILE_PATTERN = """
#pragma once

#include "block_info.h"

BlockInfo blocks[%(COUNT)d] = {
%(ITEMS)s
};
"""

from cost import scalar_cost, SSE_cost

class CPPWriter(object):
    """Render a sequence of block descriptions as a C++ ``blocks`` table."""

    def __init__(self, data):
        # iterable of block descriptions, rendered in iteration order
        self.data = data

    def save(self, path):
        """Write the generated C++ header to ``path``."""
        tmp = [self._render_item(item) for item in self.data]
        params = {
            'COUNT': len(tmp),
            'ITEMS': ',\n'.join(tmp),
        }

        text = FILE_PATTERN % params

        with open(path, 'wt') as f:
            f.write(text)


    def _render_item(self, block):
        """Return the C++ initializer text for a single block."""
        params = {
            'ID'                : block.id,
            'FIRST_SKIP'        : block.first_skip,
            'TOTAL_SKIP'        : block.total_skip,
            'ELEMENT_COUNT'     : len(block.spans),
            'CONVERSION'        : self.get_conversion_enum(block),
            'INVALID_SIGN_MASK' : block.get_invalid_sign_mask(),
            'SHUFFLE_DIGITS'    : self._make_c_array(block.shuffle_digits),
            'SHUFFLE_SIGNS'     : self._make_c_array(block.shuffle_signs),
            'SCALAR_COST'       : scalar_cost(block).value(),
            'SSE_COST'          : SSE_cost(block).value(),
        }

        return ITEM_PATTERN % params


    def _make_c_array(self, numbers):
        """Format ``numbers`` as comma-separated two-digit hex literals."""
        return ','.join('0x%02x' % x for x in numbers)

    def get_conversion_enum(self, block):
        """Map ``block.element_size`` to the name of a ``Conversion`` enumerator.

        Raises AssertionError for an element size outside 0, 1, 2, 4, 8, 16.
        """
        if block.element_size == 0:
            return 'Conversion::Empty'

        if block.element_size == 1:
            return 'Conversion::SSE1Digit'

        if block.element_size == 2:
            return 'Conversion::SSE2Digits'

        if block.element_size == 4:
            # blocks made only of three-digit spans get a dedicated routine
            if all(r.digits() == 3 for r in block.spans):
                return 'Conversion::SSE3Digits'
            else:
                return 'Conversion::SSE4Digits'

        if block.element_size == 8:
            return 'Conversion::SSE8Digits'

        if block.element_size == 16:
            return 'Conversion::Scalar'

        # Raised explicitly: a bare ``assert False`` is removed under ``-O``
        # and the method would then silently return None.
        raise AssertionError('unsupported element size: %r' % (block.element_size,))
override; 19 | 20 | private: 21 | void dump(const Vector& vec) const; 22 | bool compare(const Vector& expected, const Vector& result) const; 23 | 24 | }; 25 | 26 | 27 | bool CompareApp::custom_run() { 28 | 29 | const auto tmp = generate_signed(); 30 | 31 | Vector reference; 32 | Vector result; 33 | const char* separators = ";, "; 34 | scalar::parse_signed(tmp.data(), tmp.size(), separators, std::back_inserter(reference)); 35 | 36 | avx512::parser_signed(tmp.data(), tmp.size(), separators, std::back_inserter(result)); 37 | 38 | if (!compare(reference, result)) { 39 | puts(tmp.c_str()); 40 | puts(""); 41 | dump(reference); 42 | puts(""); 43 | dump(result); 44 | 45 | return false; 46 | } else { 47 | puts("All OK"); 48 | return true; 49 | } 50 | } 51 | 52 | void CompareApp::dump(const Vector& vec) const { 53 | printf("size = %lu: [", vec.size()); 54 | 55 | const size_t n = vec.size(); 56 | if (n) { 57 | printf("%d", vec[0]); 58 | } 59 | 60 | for (size_t i=1; i < n; i++) { 61 | printf(", %d", vec[i]); 62 | } 63 | 64 | printf("]\n"); 65 | } 66 | 67 | bool CompareApp::compare(const Vector& expected, const Vector& result) const { 68 | 69 | if (expected.size() != result.size()) { 70 | puts("different sizes"); 71 | return false; 72 | } 73 | 74 | const size_t n = expected.size(); 75 | for (size_t i=0; i < n; i++) { 76 | const auto e = expected[i]; 77 | const auto r = result[i]; 78 | 79 | if (e != r) { 80 | printf("error at #%lu: expected = %d, result = %d\n", i, e, r); 81 | return false; 82 | } 83 | } 84 | 85 | return true; 86 | } 87 | 88 | int main(int argc, char* argv[]) { 89 | 90 | try { 91 | CompareApp app(argc, argv); 92 | 93 | return app.run() ? 
EXIT_SUCCESS : EXIT_FAILURE; 94 | 95 | } catch (std::exception& e) { 96 | printf("%s\n", e.what()); 97 | return EXIT_FAILURE; 98 | } catch (Application::Exit&) { 99 | return EXIT_SUCCESS; 100 | } 101 | } 102 | 103 | -------------------------------------------------------------------------------- /include/sse/sse-simplified-parser-signed.h: -------------------------------------------------------------------------------- 1 | /* 2 | Nate's idea: simply consider all non-digit and non-sign characters as separators. 3 | */ 4 | #pragma once 5 | 6 | #include 7 | #include 8 | 9 | #include "scalar/scalar-parse-signed.h" 10 | #include "sse-utils.h" 11 | #include "sse-convert.h" 12 | #include "sse-parser-common.h" 13 | #include "sse-parser-statistics.h" 14 | #include "block_info.h" 15 | 16 | namespace sse_simplified { 17 | 18 | namespace detail { 19 | 20 | template 21 | char* process_chunk(char* data, char* end, const __m128i& input, INSERTER output) { 22 | 23 | const __m128i ascii_minus = _mm_set1_epi8('-'); 24 | const __m128i ascii_plus = _mm_set1_epi8('+'); 25 | 26 | const __m128i bytemask_digit = sse::decimal_digits_mask(input); 27 | 28 | const __m128i bytemask_plus = _mm_cmpeq_epi8(input, ascii_plus); 29 | const __m128i bytemask_minus = _mm_cmpeq_epi8(input, ascii_minus); 30 | const __m128i bytemask_sign = _mm_or_si128(bytemask_plus, bytemask_minus); 31 | const __m128i bytemask_span = _mm_or_si128(bytemask_digit, bytemask_sign); 32 | 33 | const uint16_t span_mask = _mm_movemask_epi8(bytemask_span); 34 | const uint16_t sign_mask = _mm_movemask_epi8(bytemask_sign); 35 | const BlockInfo& bi = blocks[span_mask]; 36 | if (sign_mask & bi.invalid_sign_mask) { 37 | throw std::runtime_error("'+' or '-' at invalid position"); 38 | } 39 | 40 | if (span_mask == 0) { 41 | return data + 16; 42 | } 43 | 44 | STATS_INC(loops); 45 | 46 | if (sign_mask == 0 || bi.conversion_routine == Conversion::SSE1Digit) { 47 | // unsigned path 48 | return sse::detail::parse_unsigned(bi, input, data, 
import sys
import os.path

if __name__ == '__main__' and __package__ is None:
    sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))

from loader import load_file, procedures as all_procedures, reference_procedure
from utils import groupby
from prettyprint import *
from table import Table

# every procedure except the one the speedups are measured against
procedures = [name for name in all_procedures if name != reference_procedure]


def calculate_speeups(item):
    """Return {procedure: speedup over the reference} for one measurement.

    Only the first element of each ``item.cycles`` entry is used.
    """
    baseline = float(item.cycles[reference_procedure][0])
    return dict((name, baseline/item.cycles[name][0]) for name in procedures)


def statistics(array):
    """Return the (min, average, max) triple of a non-empty sequence."""
    count = len(array)
    assert count > 0

    lowest = min(array)
    mean = sum(array)/count
    highest = max(array)
    return (lowest, mean, highest)


def calculate_speedup_statistics(collection):
    """Summarize speedups of all items sharing one size and distribution.

    Returns ``(size, distribution_name, {procedure: (min, avg, max)})``;
    size and name are read from the first item of ``collection``.
    """
    samples = dict((name, []) for name in procedures)

    for item in collection:
        current = calculate_speeups(item)
        for name in procedures:
            samples[name].append(current[name])

    first = collection[0]
    summary = {}
    for name in procedures:
        values = samples[name]
        if values:
            summary[name] = statistics(values)

    return (first.size, first.distribution_name, summary)


def main(path):
    """Load measurements from ``path`` and print the speedup summary table."""
    with open(path, 'rt') as f:
        keyfun = lambda item: (item.size, item.distribution_name)
        data = groupby(load_file(f), keyfun)

    # three header rows: overall title, procedure names, column labels
    top_header = [("", 3), ("speedup over %s procedure" % reference_procedure, len(procedures) * 3)]
    names_header = [("", 3)] + [(name, 3) for name in procedures]
    labels_header = ["size [B]", "distribution", "samples"]
    for _ in procedures:
        labels_header.extend(["min", "avg", "max"])

    table = Table()
    for header in (top_header, names_header, labels_header):
        table.add_header(header)

    for key in sorted(data):
        collection = data[key]
        size, name, stats = calculate_speedup_statistics(collection)

        row = [
            '%d' % size,
            get_distribution_title(name),
            '%d' % len(collection),
        ]
        for proc in procedures:
            entry = stats[proc]
            row.extend(['%0.2f' % entry[0], '%0.2f' % entry[1], '%0.2f' % entry[2]])

        table.add_row(row)

    print(table)

if __name__ == '__main__':
    main(sys.argv[1])
custom_run() override; 22 | 23 | private: 24 | void dump(const Vector& vec) const; 25 | bool compare(const Vector& expected, const Vector& result) const; 26 | 27 | }; 28 | 29 | bool CompareApp::custom_run() { 30 | 31 | const auto tmp = generate_unsigned(); 32 | 33 | Vector reference; 34 | Vector result; 35 | const char* separators = ";, "; 36 | scalar::parse_unsigned(tmp.data(), tmp.size(), separators, std::back_inserter(reference)); 37 | 38 | sse::NaiveMatcher<8> matcher(separators); 39 | sse::parser(tmp.data(), tmp.size(), separators, std::move(matcher), std::back_inserter(result)); 40 | 41 | if (!compare(reference, result)) { 42 | puts(tmp.c_str()); 43 | puts(""); 44 | dump(reference); 45 | puts(""); 46 | dump(result); 47 | 48 | return false; 49 | } else { 50 | puts("All OK"); 51 | return true; 52 | } 53 | } 54 | 55 | void CompareApp::dump(const Vector& vec) const { 56 | printf("size = %lu: [", vec.size()); 57 | 58 | const size_t n = vec.size(); 59 | if (n) { 60 | printf("%u", vec[0]); 61 | } 62 | 63 | for (size_t i=1; i < n; i++) { 64 | printf(", %u", vec[i]); 65 | } 66 | 67 | printf("]\n"); 68 | } 69 | 70 | bool CompareApp::compare(const Vector& expected, const Vector& result) const { 71 | 72 | if (expected.size() != result.size()) { 73 | puts("different sizes"); 74 | return false; 75 | } 76 | 77 | const size_t n = expected.size(); 78 | for (size_t i=0; i < n; i++) { 79 | const auto e = expected[i]; 80 | const auto r = result[i]; 81 | 82 | if (e != r) { 83 | printf("error at #%lu: expected = %u, result = %d\n", i, e, r); 84 | return false; 85 | } 86 | } 87 | 88 | return true; 89 | } 90 | 91 | int main(int argc, char* argv[]) { 92 | 93 | try { 94 | CompareApp app(argc, argv); 95 | 96 | return app.run() ? 
SPACE = '_'
DIGIT = 'd'


class Cost(object):
    """Counters of abstract operations needed to convert one block.

    All operation kinds carry the same weight (1.00); ``store`` counts the
    numbers produced and is used to express the cost per stored number.
    """

    def __init__(self):
        self.compare = 0
        self.multiplication = 0
        self.add_sub = 0
        self.bit_and = 0
        self.pack = 0
        self.movemask = 0
        self.load = 0

        self.store = 0

    def value(self):
        """Return the total cost divided by the number of stores.

        When nothing is stored the undivided total is returned.
        """
        c = 1.00 * self.compare + \
            1.00 * self.multiplication + \
            1.00 * self.add_sub + \
            1.00 * self.bit_and + \
            1.00 * self.pack + \
            1.00 * self.movemask + \
            1.00 * self.load

        if self.store > 0:
            return c/float(self.store)
        else:
            return c

    def __str__(self):
        return '%0.2f' % self.value()


def scalar_cost(bi):
    """Estimate the cost of converting ``bi.image`` character by character.

    ``bi.image`` is iterated as a sequence of characters; ``SPACE`` marks a
    separator and any other character is treated as a digit.
    """
    cost = Cost()

    prev = SPACE
    for c in bi.image:
        if c == SPACE:
            cost.compare += 1  # char in separators
            if prev == DIGIT:
                # end of digits span
                cost.store += 1
        else:
            # tmp = x - '0'
            cost.add_sub += 1
            # if x > 9 then invalid char
            pass
            # else
            #   result = 10 * result + tmp
            cost.multiplication += 1
            cost.add_sub += 1

        prev = c

    # unlike SIMD algorithm, the last range is considered
    if prev == DIGIT:
        cost.store += 1

    return cost


def SSE_cost(bi):
    """Estimate the cost of converting the block with the SSE procedure.

    The cost depends on ``bi.element_size`` and the number of ``bi.spans``;
    an element size other than 1, 2, 4 or 8 gets a prohibitively large cost.
    """
    cost = Cost()

    # simd code always validate whole input
    cost.compare += 3
    cost.bit_and += 3
    cost.movemask += 2

    if bi.element_size == 1:
        cost.add_sub += 1
        cost.load += len(bi.spans)

    elif bi.element_size == 2:
        cost.add_sub += 1
        cost.multiplication += 1
        cost.load += len(bi.spans)

    elif bi.element_size == 4:
        cost.add_sub += 1
        cost.multiplication += 2
        cost.pack += 1
        cost.load += len(bi.spans)

    elif bi.element_size == 8:
        cost.add_sub += 1
        cost.multiplication += 3
        cost.pack += 1
        cost.load += len(bi.spans)

    else:
        cost.compare = 100000

    cost.store = len(bi.spans)

    return cost


def is_profitable(bi):
    """Return True when the SSE estimate is lower than the scalar one.

    A block without any spans is never profitable.
    """
    if len(bi.spans) == 0:
        return False

    scalar = scalar_cost(bi)
    sse = SSE_cost(bi)

    # Cost exposes value(); it has no cost() method
    return sse.value() < scalar.value()
uint16_t span_mask = _mm_movemask_epi8(bytemask_span); 38 | STATS_SPAN_MASK(span_mask); 39 | const BlockInfo& bi = blocks[span_mask]; 40 | if (sign_mask & bi.invalid_sign_mask) { 41 | throw std::runtime_error("'+' or '-' at invalid position"); 42 | } 43 | 44 | if (span_mask == 0) { 45 | return data + 16; 46 | } 47 | 48 | STATS_INC(loops); 49 | 50 | if (sign_mask == 0 || bi.conversion_routine == Conversion::SSE1Digit) { 51 | // unsigned path 52 | return detail::parse_unsigned(bi, input, data, end, output); 53 | } else { 54 | return detail::parse_signed(bi, input, data, end, output); 55 | } 56 | } 57 | 58 | } 59 | 60 | template 61 | void parser_signed(const char* string, size_t size, const char* separators, MATCHER matcher, INSERTER output) { 62 | 63 | char* data = const_cast(string); 64 | char* end = data + size; 65 | 66 | while (data + 16 < end) { 67 | const __m128i input = _mm_loadu_si128(reinterpret_cast<__m128i*>(data)); 68 | data = detail::process_chunk(data, end, input, matcher, output); 69 | 70 | } // for 71 | 72 | // process the tail 73 | scalar::parse_signed(data, string + size - data, separators, output); 74 | } 75 | 76 | } // namespace sse 77 | -------------------------------------------------------------------------------- /test/benchmark-hwevents.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | #include "linux-perf-events.h" 8 | #include "sse/sse-matcher.h" 9 | #include "sse/sse-parser-signed.h" 10 | #include "sse/sse-simplified-parser-signed.h" 11 | 12 | #include "application.h" 13 | 14 | class BenchmarkApp: public Application { 15 | 16 | using SignedVector = std::vector; 17 | 18 | public: 19 | BenchmarkApp(int argc, char** argv) : Application(argc, argv) {} 20 | 21 | private: 22 | virtual bool custom_run() override; 23 | virtual void custom_init() override; 24 | 25 | private: 26 | std::string tmp; 27 | bool csv; 28 | 29 | SignedVector result; 
30 | }; 31 | 32 | void BenchmarkApp::custom_init() { 33 | csv = cmdline.has_flag("--csv-output"); 34 | quiet = csv; 35 | } 36 | 37 | bool BenchmarkApp::custom_run() { 38 | 39 | if (!csv) { 40 | printf("Input size: %lu, loops: %lu\n", get_size(), get_loop_count()); 41 | } 42 | 43 | tmp = generate_signed(); 44 | 45 | const char* separators = ";, "; 46 | auto k = get_loop_count(); 47 | 48 | LinuxEvents ev_branches(PERF_COUNT_HW_BRANCH_INSTRUCTIONS); 49 | LinuxEvents ev_branch_misses(PERF_COUNT_HW_BRANCH_MISSES); 50 | LinuxEvents ev_cache_references(PERF_COUNT_HW_CACHE_REFERENCES); 51 | LinuxEvents ev_cache_misses(PERF_COUNT_HW_CACHE_MISSES); 52 | 53 | ev_branches.start(); 54 | ev_branch_misses.start(); 55 | ev_cache_references.start(); 56 | ev_cache_misses.start(); 57 | while (k--) { 58 | result.clear(); 59 | sse::NaiveMatcher<8> matcher(separators); 60 | sse::parser_signed(tmp.data(), tmp.size(), separators, 61 | std::move(matcher), std::back_inserter(result)); 62 | } 63 | 64 | const auto branches = ev_branches.end(); 65 | const auto branch_misses = ev_branch_misses.end(); 66 | const auto cache_references = ev_cache_references.end(); 67 | const auto cache_misses = ev_cache_misses.end(); 68 | 69 | if (csv) { 70 | printf("%lu, %lu, %lu, %lu\n", branches, branch_misses, cache_references, cache_misses); 71 | } else { 72 | printf("branches: %lu\n", branches); 73 | printf("branch misses: %lu\n", branch_misses); 74 | printf("branch miss ratio: %0.2f%%\n", (100.0 * branch_misses) / branches); 75 | printf("cache references: %lu\n", cache_references); 76 | printf("cache misses: %lu\n", cache_misses); 77 | printf("cache miss ratio: %0.2f%%\n", (100.0 * cache_misses) / cache_references); 78 | } 79 | 80 | return true; 81 | } 82 | 83 | 84 | int main(int argc, char* argv[]) { 85 | 86 | try { 87 | BenchmarkApp app(argc, argv); 88 | 89 | return app.run() ? 
EXIT_SUCCESS : EXIT_FAILURE; 90 | 91 | } catch (std::exception& e) { 92 | printf("%s\n", e.what()); 93 | return EXIT_FAILURE; 94 | } catch (Application::Exit&) { 95 | return EXIT_SUCCESS; 96 | } 97 | } 98 | 99 | -------------------------------------------------------------------------------- /include/sse/sse-block-parser-unsigned.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | 6 | #include "scalar/scalar-parse-unsigned.h" 7 | #include "sse-utils.h" 8 | #include "sse-convert.h" 9 | #include "sse-parser-common.h" 10 | #include "sse-parser-statistics.h" 11 | #include "block_info.h" 12 | 13 | namespace sse { 14 | 15 | template 16 | void parser_block_unsigned(const char* string, size_t size, const char* separators, MATCHER matcher, INSERTER output) { 17 | 18 | char* data = const_cast(string); 19 | char* end = data + size; 20 | while (data + 16*4 < end) { 21 | const __m128i input0 = _mm_loadu_si128(reinterpret_cast<__m128i*>(data + 0*16)); 22 | const __m128i input1 = _mm_loadu_si128(reinterpret_cast<__m128i*>(data + 1*16)); 23 | const __m128i input2 = _mm_loadu_si128(reinterpret_cast<__m128i*>(data + 2*16)); 24 | const __m128i input3 = _mm_loadu_si128(reinterpret_cast<__m128i*>(data + 3*16)); 25 | const __m128i t0 = decimal_digits_mask(input0); 26 | const uint64_t digit_mask0 = _mm_movemask_epi8(t0); 27 | const uint64_t valid_mask0 = _mm_movemask_epi8(matcher.get_mask(input0, t0)); 28 | const __m128i t1 = decimal_digits_mask(input1); 29 | const uint64_t digit_mask1 = _mm_movemask_epi8(t1); 30 | const uint64_t valid_mask1 = _mm_movemask_epi8(matcher.get_mask(input1, t1)); 31 | const __m128i t2 = decimal_digits_mask(input2); 32 | const uint64_t digit_mask2 = _mm_movemask_epi8(t2); 33 | const uint64_t valid_mask2 = _mm_movemask_epi8(matcher.get_mask(input2, t2)); 34 | const __m128i t3 = decimal_digits_mask(input3); 35 | const uint64_t digit_mask3 = _mm_movemask_epi8(t3); 36 | const 
uint64_t valid_mask3 = _mm_movemask_epi8(matcher.get_mask(input3, t3)); 37 | 38 | STATS_INC(loops); 39 | 40 | if ((valid_mask0 & valid_mask1 & valid_mask2 & valid_mask3) != 0xffff) { 41 | throw std::runtime_error("Wrong character"); 42 | } 43 | 44 | const uint64_t digit_mask = digit_mask0 45 | | (digit_mask1 << (1*16)) 46 | | (digit_mask2 << (2*16)) 47 | | (digit_mask3 << (3*16)); 48 | 49 | if (digit_mask == 0) { 50 | data += 16*4; 51 | continue; 52 | } 53 | 54 | __m128i input = input0; 55 | uint64_t mask = digit_mask; 56 | char* loopend = data + 3*16; 57 | while (data < loopend) { 58 | char* prevdata = data; 59 | const BlockInfo& bi = blocks[mask & 0xffff]; 60 | data = detail::parse_unsigned(bi, input, data, end, output); 61 | if (data == end) { 62 | break; 63 | } 64 | 65 | const int shift = data - prevdata; 66 | mask >>= shift; 67 | input = _mm_loadu_si128(reinterpret_cast<__m128i*>(data)); 68 | } 69 | 70 | } // for 71 | 72 | // process the tail 73 | scalar::parse_unsigned(data, string + size - data, separators, output); 74 | } 75 | 76 | } // namespace sse 77 | -------------------------------------------------------------------------------- /include/hybrid-parser-signed.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include "sse/sse-utils.h" 7 | #include "scalar/scalar-parse-signed.h" 8 | 9 | namespace hybrid_signed { 10 | 11 | template 12 | uint32_t convert(const char* s, uint32_t prev) { 13 | int8_t digit = int8_t(s[0]) - '0'; 14 | if (digit < 0) { 15 | throw std::runtime_error("'+' or '-' on a wrong position"); 16 | } 17 | return convert(s + 1, prev * 10 + digit); 18 | } 19 | 20 | template<> 21 | uint32_t convert<0>(const char* /*s*/, uint32_t prev) { 22 | return prev; 23 | } 24 | 25 | template 26 | int32_t convert(const char* s) { 27 | if (s[0] == '+') 28 | return convert(s + 1, 0); 29 | else if (s[0] == '-') // TODO: check range 30 | return 
-static_cast(convert(s + 1, 0)); 31 | else 32 | return convert(s, 0); 33 | } 34 | 35 | } 36 | 37 | template 38 | void parser_hybrid_signed(const char* string, size_t size, const char* separators, MATCHER matcher, INSERTER output) { 39 | 40 | #include "hybrid-shift-back.inl" 41 | 42 | const __m128i ascii_plus = _mm_set1_epi8('+'); 43 | const __m128i ascii_minus = _mm_set1_epi8('-'); 44 | 45 | char* data = const_cast(string); 46 | char* end = data + size; 47 | 48 | enum Previous { 49 | none, 50 | has_sign, 51 | has_value 52 | }; 53 | 54 | Previous prev = none; 55 | bool negative = false; 56 | int32_t val = 0; 57 | 58 | uint16_t span_mask = 0; 59 | while (data + 16 < end) { 60 | const __m128i input = _mm_loadu_si128(reinterpret_cast<__m128i*>(data)); 61 | const __m128i bytemask_digit = sse::decimal_digits_mask(input); 62 | const __m128i bytemask_plus = _mm_cmpeq_epi8(input, ascii_plus); 63 | const __m128i bytemask_minus = _mm_cmpeq_epi8(input, ascii_minus); 64 | const __m128i bytemask_sign = _mm_or_si128(bytemask_minus, bytemask_plus); 65 | const __m128i bytemask_span = _mm_or_si128(bytemask_digit, bytemask_sign); 66 | const uint16_t valid_mask = _mm_movemask_epi8(matcher.get_mask(input, bytemask_span)); 67 | 68 | if (valid_mask != 0xffff) { 69 | throw std::runtime_error("Wrong character"); 70 | } 71 | 72 | span_mask = _mm_movemask_epi8(bytemask_span); 73 | if (span_mask == 0) { 74 | data += 16; 75 | continue; 76 | } 77 | 78 | switch (span_mask & 0xff) { 79 | #include "hybrid-parser-signed.inl" 80 | } 81 | 82 | data += 8; 83 | 84 | switch (span_mask >> 8) { 85 | #include "hybrid-parser-signed.inl" 86 | } 87 | 88 | data += 8; 89 | } // for 90 | 91 | // Shift back if the last span in the last wasn't saved yet 92 | // XXX: If there is a really long sequence of digits (more 93 | // than 16) then this fixup will not help. 
class Report(object):
    """Min/avg/max speed-up tables of the SSE parsers over the scalar one.

    The measurements read from *path* are grouped by the pair
    (separator distribution, distribution name); one table is built per
    group.
    """

    def __init__(self, path):
        """Load measurements from *path* and build one table per group."""
        with open(path, 'rt') as f:
            self.raw_data = load(f)

        bysep = lambda item: (item.sep_distribution, item.distribution_name)

        self.report = []
        # .items() works on both Python 2 and 3 and matches report.py
        for (sep, distribution_name), collection in groupby(self.raw_data, bysep).items():
            self.report.append((
                get_separator_title(sep),
                get_distribution_title(distribution_name),
                self.prepare_table(collection),
            ))

    def get(self):
        """Return a list of (separator title, distribution title, table)."""
        return self.report

    @staticmethod
    def _time_of(items, procedure):
        """Return the time of the first item measured for *procedure*.

        Raises KeyError when *items* has no entry for *procedure*.
        """
        for item in items:
            if item.procedure == procedure:
                return item.time

        raise KeyError("Procedure '%s' not found" % procedure)

    @staticmethod
    def _stats(numbers):
        """Return (min, average, max) of a non-empty list of numbers."""
        return min(numbers), sum(numbers) / len(numbers), max(numbers)

    def prepare_table(self, procedures):
        """Build a table with one row per (size, loops) pair.

        Speed-ups of every number distribution sharing the same input size
        and loop count are pooled before min/avg/max are computed.
        """
        keyfun = lambda item: (item.size, item.loops, item.num_distribution)

        data = {}
        for (size, loops, _), items in groupby(procedures, keyfun).items():
            t0 = self._time_of(items, "scalar")
            t1 = self._time_of(items, "sse")
            t2 = self._time_of(items, "sse-block")

            if t0 < 10 and t1 < 10 and t2 < 10:
                # don't fool people when all measurements are single-digit numbers
                continue

            if not t1 or not t2:
                # a zero reading has no meaningful speed-up; skip the sample
                # rather than abort the whole report with ZeroDivisionError
                continue

            sse, sse_block = data.setdefault((size, loops), ([], []))
            sse.append(float(t0) / t1)
            sse_block.append(float(t0) / t2)

        t = Table()
        t.add_header([("input", 2), ("SSE speed-up", 3), ("SSE block speed-up", 3)])
        t.add_header(["size [B]", "loops", "min", "avg", "max", "min", "avg", "max"])

        # rows are ordered by input size
        for key in sorted(data, key=lambda k: k[0]):
            size, loops = key
            sse = self._stats(data[key][0])
            sse_block = self._stats(data[key][1])

            t.add_row([
                '{:,}'.format(size),
                '%d' % loops,

                '%0.2f' % sse[0],
                '%0.2f' % sse[1],
                '%0.2f' % sse[2],

                '%0.2f' % sse_block[0],
                '%0.2f' % sse_block[1],
                '%0.2f' % sse_block[2],
            ])

        return t
generate_input(x); 34 | 35 | const BlockInfo& b = blocks[x]; 36 | const __m128i shuffle_digits = _mm_loadu_si128((const __m128i*)b.shuffle_digits); 37 | const __m128i shuffled = _mm_shuffle_epi8(input, shuffle_digits); 38 | 39 | using namespace sse; 40 | 41 | if (b.conversion_routine == Conversion::SSE1Digit) { 42 | convert_1digit(shuffled, b.element_count, &result[0]); 43 | } else if (b.conversion_routine == Conversion::SSE2Digits) { 44 | convert_2digits(shuffled, b.element_count, &result[0]); 45 | } else if (b.conversion_routine == Conversion::SSE3Digits) { 46 | convert_3digits(shuffled, b.element_count, &result[0]); 47 | } else if (b.conversion_routine == Conversion::SSE4Digits) { 48 | convert_4digits(shuffled, b.element_count, &result[0]); 49 | } else if (b.conversion_routine == Conversion::SSE8Digits) { 50 | convert_8digits(shuffled, b.element_count, &result[0]); 51 | } else { 52 | unsupported += 1; 53 | continue; 54 | } 55 | 56 | if (!compare(b.element_count)) { 57 | return false; 58 | } 59 | } // for 60 | 61 | printf("All OK (%d cases will never be supported by SIMD code)\n", unsupported); 62 | return true; 63 | } 64 | 65 | private: 66 | void generate_input(uint16_t x) { 67 | 68 | int k = 0; 69 | for (int i=0; i < 16; i++) { 70 | if (x & (1 << i)) { 71 | buffer[i] = (k % 10) + '0'; 72 | k += 1; 73 | } else { 74 | buffer[i] = '_'; 75 | } 76 | } 77 | 78 | input = _mm_loadu_si128((const __m128i*)buffer); 79 | 80 | std::fill(result.begin(), result.end(), -1); 81 | 82 | reference.clear(); 83 | scalar::parse_unsigned(buffer, 16, "_", std::back_inserter(reference)); 84 | } 85 | 86 | bool compare(size_t n) const { 87 | for (size_t i=0; i < n; i++) { 88 | if (result[i] != reference[i]) { 89 | printf("mismatch at %lu: expected=%u, result=%u\n", i, reference[i], result[i]); 90 | printf("reference = "); dump(reference, n); 91 | printf("result = "); dump(result, n); 92 | return false; 93 | } 94 | } 95 | 96 | return true; 97 | } 98 | 99 | void dump(const std::vector& vec, 
size_t n) const { 100 | for (size_t i=0; i < n; i++) { 101 | if (i > 0) printf(", "); 102 | printf("%u", vec[i]); 103 | } 104 | putchar('\n'); 105 | } 106 | 107 | }; 108 | 109 | 110 | int main() { 111 | 112 | puts("Verify SSE unsigned converters for valid inputs"); 113 | Verify verify; 114 | if (!verify.run()) { 115 | return EXIT_FAILURE; 116 | } 117 | 118 | return EXIT_SUCCESS; 119 | } 120 | 121 | -------------------------------------------------------------------------------- /include/scalar/scalar-parse-signed.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | #include "safe-convert.h" 9 | #include "scalar-parse-common.h" 10 | 11 | namespace scalar { 12 | 13 | template 14 | void parse_signed(const char* data, size_t size, const char* separators, INSERTER output) { 15 | 16 | enum State { 17 | Separator, 18 | Plus, 19 | Minus, 20 | Digit 21 | }; 22 | 23 | State state = Separator; 24 | State prev = Separator; 25 | bool negative = false; 26 | uint32_t number = 0; 27 | 28 | for (size_t i=0; i < size; i++) { 29 | const char c = data[i]; 30 | if (c == '+') { 31 | state = Plus; 32 | } else if (c == '-') { 33 | state = Minus; 34 | } else if (c >= '0' && c <= '9') { 35 | state = Digit; 36 | } else if (contains(separators, c)) { 37 | state = Separator; 38 | } else { 39 | throw std::runtime_error("Wrong character (scalar)"); 40 | } 41 | 42 | switch (state) { 43 | case Plus: 44 | if (prev != Separator) { 45 | throw std::runtime_error("Invalid syntax ('+' follows a non-separator character)"); 46 | } 47 | number = 0; 48 | negative = false; 49 | break; 50 | 51 | case Minus: 52 | if (prev != Separator) { 53 | throw std::runtime_error("Invalid syntax ('-' follows a non-separator character)"); 54 | } 55 | number = 0; 56 | negative = true; 57 | break; 58 | 59 | case Digit: 60 | if (prev == Separator) { 61 | number = c - '0'; 62 | negative = false; 63 | } else { 64 | 
mul10_add_digit(number, c); 65 | } 66 | break; 67 | 68 | case Separator: 69 | if (prev == Digit) { 70 | if (negative) { 71 | const int64_t tmp = std::numeric_limits::max(); 72 | const uint32_t absmin = -tmp; 73 | if (number > absmin) { 74 | throw std::range_error("signed overflow"); 75 | } 76 | *output = -number; 77 | } else { 78 | if (number > std::numeric_limits::max()) { 79 | throw std::range_error("signed overflow"); 80 | } 81 | 82 | *output = number; 83 | } 84 | } else if (prev != Separator) { 85 | throw std::runtime_error("Invalid syntax ('-' or '+' not followed by any digit)"); 86 | } 87 | break; 88 | } // switch 89 | 90 | prev = state; 91 | } // for 92 | 93 | if (state == Separator) { 94 | if (prev == Digit) { 95 | if (negative) { 96 | *output = -number; 97 | } else { 98 | *output = number; 99 | } 100 | } else if (prev != Separator) { 101 | throw std::runtime_error("Invalid syntax ('-' or '+' not followed by any digit)"); 102 | } 103 | } 104 | } 105 | 106 | } // namespace 107 | 108 | -------------------------------------------------------------------------------- /scripts/hybrid.py: -------------------------------------------------------------------------------- 1 | class DigitsSpan(object): 2 | # range: [first, last] - include the both ends 3 | def __init__(self, first, last): 4 | assert first <= last 5 | 6 | self.first = first 7 | self.last = last 8 | 9 | assert self.digits() <= 8 10 | 11 | def digits(self): 12 | return self.last - self.first + 1 13 | 14 | def __str__(self): 15 | return "<%d, %d>" % (self.first, self.last) 16 | 17 | __repr__ = __str__ 18 | 19 | 20 | DIGIT = 'd' 21 | SPACE = '_' 22 | 23 | class Parser(object): 24 | def __init__(self, number): 25 | assert number >= 0 26 | assert number < 256 27 | 28 | self.number = number 29 | self.image = self.__convert_to_string(number) 30 | 31 | 32 | def get_ranges(self): 33 | prev = SPACE 34 | start = None 35 | ranges = [] 36 | for i, c in enumerate(self.image): 37 | if c == prev: 38 | continue 39 | 40 
def tokenize(number):
    """Yield (kind, span) tokens describing the 8-bit span mask *number*.

    Bit i of *number* marks byte i as part of a digit span.  The all-clear
    mask yields a single (EMPTY, []) and the all-set mask a single
    (FULL, span).  Any other mask yields an optional
    (FINALIZE_PREVIOUS, None) when byte 0 is not in a span, followed by one
    token per span: FIRST_CONTINUATION for a span starting at byte 0, LAST
    for a span ending at byte 7, WHOLE otherwise.
    """
    spans = Parser(number).get_ranges()

    if number == 0x00:
        assert len(spans) == 0
        yield (EMPTY, [])
        return

    if number == 0xff:
        assert len(spans) == 1
        yield (FULL, spans[0])
        return

    if spans[0].first > 0:
        yield (FINALIZE_PREVIOUS, None)

    for span in spans:
        if span.first == 0:
            kind = FIRST_CONTINUATION
        elif span.last == 7:
            kind = LAST
        else:
            kind = WHOLE

        yield (kind, span)
class GenerateSingedParser(GeneratorBase):
    """Emit the C++ ``case`` bodies of the signed hybrid parser.

    One ``case 0xNN: ... break;`` is produced per 8-bit span mask.  The
    emitted code uses the variables ``data``, ``output``, ``prev``
    (``none`` / ``has_sign`` / ``has_value``), ``negative`` and ``val``
    declared by ``parser_hybrid_signed``.  While ``prev == has_value``,
    ``val`` holds the magnitude of the number and ``negative`` its sign.

    NOTE(review): ``parser_hybrid_signed`` skips the generated switch when
    all 16 bytes are separators; that fast path presumably needs the same
    finalisation as ``empty()`` -- confirm.
    """

    def before(self):
        self.lines.append('case 0x%02x:' % self.number)

    def after(self):
        self.lines.append('break;')

    def empty(self):
        # Eight separators terminate a number (or expose a sole sign) left
        # pending by the previous half, exactly like a leading separator.
        self.finalize_previous()

    def full(self):
        l = self.lines

        l.append("if (prev == has_value) {")
        l.append("    val = %s;" % self.expression(self.span, "val"))
        l.append("} else if (prev == has_sign) {")
        # only a sign was seen so far: start from zero, not from the
        # magnitude of an earlier, already stored number
        l.append("    val = %s;" % self.expression(self.span, "0"))
        l.append("} else {")
        l.extend("    " + line for line in self.start_number(self.span))
        l.append("}")
        l.append("prev = has_value;")

    def finalize_previous(self):
        l = self.lines

        l.append("if (prev == has_value) {")
        l.append("    *output++ = (negative) ? -val : val;")
        l.append("    prev = none;")
        l.append("} else if (prev == has_sign) {")
        # there was a sole sign at the end of the previous block
        l.append('    throw std::runtime_error("wrong syntax");')
        l.append("}")

    def first_continuation(self):
        l = self.lines

        l.append("if (prev == has_value) {")
        l.append("    val = %s;" % self.expression(self.span, "val"))
        l.append("    *output++ = (negative) ? -val : val;")
        l.append("    prev = none;")
        l.append("} else if (prev == has_sign) {")
        l.append("    val = %s;" % self.expression(self.span, "0"))
        l.append("    *output++ = (negative) ? -val : val;")
        l.append("    prev = none;")
        l.append("} else {")
        l.append("    *output++ = %s;" % self.expression(self.span))
        l.append("}")

    def whole(self):
        self.lines.append("*output++ = %s;" % self.expression(self.span))

    def last(self):
        l = self.lines
        span = self.span

        if span.digits() == 1:
            # just one character
            l.append("if (data[%d] == '+') {" % span.first)
            l.append("    prev = has_sign;")
            l.append("    negative = false;")
            l.append("} else if (data[%d] == '-') {" % span.first)
            l.append("    prev = has_sign;")
            l.append("    negative = true;")
            l.append("} else {")
            l.append("    val = %s;" % self.expression(span, "0"))
            l.append("    prev = has_value;")
            l.append("    negative = false;")
            l.append("}")

        else:
            l.extend(self.start_number(span))
            l.append("prev = has_value;")

    def start_number(self, span):
        """Return C++ lines that begin a new number occupying *span*.

        The emitted code stores the magnitude in ``val`` and the sign in
        ``negative``.  The digits after a sign go through the two-argument
        ``convert``, which throws on a second sign character.
        """
        magnitude = self.invocation(span.digits() - 1, span.first + 1, "0")

        return [
            "if (data[%d] == '+') {" % span.first,
            "    val = %s;" % magnitude,
            "    negative = false;",
            "} else if (data[%d] == '-') {" % span.first,
            "    val = %s;" % magnitude,
            "    negative = true;",
            "} else {",
            "    val = %s;" % self.expression(span, "0"),
            "    negative = false;",
            "}",
        ]

    def invocation(self, digits, offset, arg = None):
        """Return a ``hybrid_signed::convert<digits>(data + offset[, arg])`` call."""
        result = "hybrid_signed::convert<%d>(data" % digits
        if offset != 0:
            result += ' + %d' % offset

        if arg is not None:
            result += ', %s' % arg

        result += ')'
        return result

    def expression(self, span, arg = None):
        """Return the conversion call covering the whole *span*."""
        return self.invocation(span.digits(), span.first, arg)
-------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | #include "scalar/scalar-parse-unsigned.h" 7 | #include "sse/sse-matcher.h" 8 | #include "sse/sse-parser-signed.h" 9 | #include "sse/sse-block-parser-signed.h" 10 | 11 | #include "application.h" 12 | 13 | class CompareApp: public Application { 14 | 15 | using Vector = std::vector; 16 | 17 | private: 18 | const std::string separators; 19 | std::string input_string; 20 | Vector reference; 21 | Vector result; 22 | 23 | public: 24 | CompareApp(int argc, char** argv) 25 | : Application(argc, argv) 26 | , separators(";, ") {} 27 | 28 | private: 29 | virtual bool custom_run() override; 30 | 31 | private: 32 | void run_sse_parser() { 33 | 34 | sse::NaiveMatcher<8> matcher(separators.c_str()); 35 | result.clear(); 36 | sse::parser_signed( 37 | input_string.data(), 38 | input_string.size(), 39 | separators.c_str(), 40 | std::move(matcher), 41 | std::back_inserter(reference)); 42 | } 43 | 44 | void run_sse_block_parser() { 45 | 46 | sse::NaiveMatcher<8> matcher(separators.c_str()); 47 | result.clear(); 48 | sse::parser_block_signed( 49 | input_string.data(), 50 | input_string.size(), 51 | separators.c_str(), 52 | std::move(matcher), 53 | std::back_inserter(result)); 54 | } 55 | 56 | private: 57 | void dump(const Vector& vec) const; 58 | bool compare(const Vector& expected, const Vector& result) const; 59 | 60 | }; 61 | 62 | bool CompareApp::custom_run() { 63 | 64 | input_string = generate_signed(); 65 | scalar::parse_signed(input_string.data(), 66 | input_string.size(), 67 | separators.c_str(), 68 | std::back_inserter(reference)); 69 | 70 | puts("Checking SSE parser"); 71 | run_sse_parser(); 72 | if (!compare(reference, result)) { 73 | puts(input_string.c_str()); 74 | puts(""); 75 | dump(reference); 76 | puts(""); 77 | dump(result); 78 | 79 | return false; 80 | } 81 | 82 | puts("Checking SSE block parser"); 83 | run_sse_block_parser(); 84 
| if (!compare(reference, result)) { 85 | puts(input_string.c_str()); 86 | puts(""); 87 | dump(reference); 88 | puts(""); 89 | dump(result); 90 | 91 | return false; 92 | } 93 | 94 | 95 | puts("All OK"); 96 | return true; 97 | } 98 | 99 | void CompareApp::dump(const Vector& vec) const { 100 | printf("size = %lu: [", vec.size()); 101 | 102 | const size_t n = vec.size(); 103 | if (n) { 104 | printf("%d", vec[0]); 105 | } 106 | 107 | for (size_t i=1; i < n; i++) { 108 | printf(", %d", vec[i]); 109 | } 110 | 111 | printf("]\n"); 112 | } 113 | 114 | bool CompareApp::compare(const Vector& expected, const Vector& result) const { 115 | 116 | if (expected.size() != result.size()) { 117 | puts("different sizes"); 118 | return false; 119 | } 120 | 121 | const size_t n = expected.size(); 122 | for (size_t i=0; i < n; i++) { 123 | const auto e = expected[i]; 124 | const auto r = result[i]; 125 | 126 | if (e != r) { 127 | printf("error at #%lu: expected = %d, result = %d\n", i, e, r); 128 | return false; 129 | } 130 | } 131 | 132 | return true; 133 | } 134 | 135 | int main(int argc, char* argv[]) { 136 | 137 | try { 138 | CompareApp app(argc, argv); 139 | 140 | return app.run() ? 
EXIT_SUCCESS : EXIT_FAILURE; 141 | 142 | } catch (std::exception& e) { 143 | printf("%s\n", e.what()); 144 | return EXIT_FAILURE; 145 | } catch (Application::Exit&) { 146 | return EXIT_SUCCESS; 147 | } 148 | } 149 | 150 | -------------------------------------------------------------------------------- /test/benchmark-cpuclocks.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | #include "benchmark.h" 8 | #include "scalar/std-parser-signed.h" 9 | #include "hybrid-parser-signed.h" 10 | #include "sse/sse-matcher.h" 11 | #include "sse/sse-parser-signed.h" 12 | #include "sse/sse-block-parser-signed.h" 13 | #include "sse/sse-simplified-parser-signed.h" 14 | 15 | #include "application.h" 16 | 17 | class BenchmarkApp: public Application { 18 | 19 | using SignedVector = std::vector; 20 | 21 | public: 22 | BenchmarkApp(int argc, char** argv) : Application(argc, argv) {} 23 | 24 | private: 25 | virtual bool custom_run() override; 26 | 27 | private: 28 | std::string tmp; 29 | 30 | struct ResultSigned { 31 | SignedVector reference; 32 | SignedVector SSE; 33 | SignedVector SSEblock; 34 | SignedVector std_scalar; 35 | SignedVector SSEsimplified; 36 | SignedVector hybrid; 37 | } result_signed; 38 | }; 39 | 40 | bool BenchmarkApp::custom_run() { 41 | 42 | printf("Input size: %lu, loops: %lu\n", get_size(), get_loop_count()); 43 | 44 | tmp = generate_signed(); 45 | 46 | const char* separators = ";, "; 47 | 48 | const auto repeat = get_loop_count(); 49 | const auto size = tmp.size(); 50 | 51 | BEST_TIME( 52 | // pre: 53 | result_signed.reference.clear(), 54 | 55 | // test: 56 | scalar::parse_signed(tmp.data(), tmp.size(), separators, 57 | std::back_inserter(result_signed.reference)), 58 | "scalar", 59 | repeat, 60 | size 61 | ); 62 | 63 | BEST_TIME( 64 | // pre: 65 | result_signed.SSE.clear(); 66 | sse::NaiveMatcher<8> matcher(separators);, 67 | 68 | // test: 69 | 
sse::parser_signed(tmp.data(), tmp.size(), separators, 70 | std::move(matcher), std::back_inserter(result_signed.SSE)), 71 | "SSE", 72 | repeat, 73 | size 74 | ); 75 | 76 | BEST_TIME( 77 | // pre: 78 | result_signed.SSEblock.clear(); 79 | sse::NaiveMatcher<8> matcher(separators);, 80 | 81 | // test: 82 | sse::parser_block_signed( 83 | tmp.data(), tmp.size(), 84 | separators, 85 | std::move(matcher), std::back_inserter(result_signed.SSEblock));, 86 | 87 | "SSE (block)", 88 | repeat, 89 | size 90 | ); 91 | 92 | BEST_TIME( 93 | // pre: 94 | result_signed.std_scalar.clear();, 95 | 96 | // test: 97 | scalar::cstd::parse_signed( 98 | tmp.data(), tmp.size(), 99 | separators, 100 | std::back_inserter(result_signed.std_scalar));, 101 | 102 | "scalar (std)", 103 | repeat, 104 | size 105 | ); 106 | 107 | BEST_TIME( 108 | // pre: 109 | result_signed.SSEsimplified.clear();, 110 | 111 | // result: 112 | sse_simplified::parse_signed( 113 | tmp.data(), tmp.size(), 114 | separators, 115 | std::back_inserter(result_signed.SSEsimplified));, 116 | 117 | "SSE (simplified)", 118 | repeat, 119 | size 120 | ); 121 | 122 | BEST_TIME( 123 | // pre: 124 | result_signed.hybrid.clear(); 125 | sse::NaiveMatcher<8> matcher(separators);, 126 | 127 | // test: 128 | parser_hybrid_signed(tmp.data(), tmp.size(), separators, 129 | std::move(matcher), std::back_inserter(result_signed.hybrid)), 130 | "scalar (hybrid)", 131 | repeat, 132 | size 133 | ); 134 | 135 | return true; 136 | } 137 | 138 | 139 | int main(int argc, char* argv[]) { 140 | 141 | try { 142 | BenchmarkApp app(argc, argv); 143 | 144 | return app.run() ? 
EXIT_SUCCESS : EXIT_FAILURE; 145 | 146 | } catch (std::exception& e) { 147 | printf("%s\n", e.what()); 148 | return EXIT_FAILURE; 149 | } catch (Application::Exit&) { 150 | return EXIT_SUCCESS; 151 | } 152 | } 153 | 154 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | ================================================================================ 2 | Parsing series of integers with SIMD 3 | ================================================================================ 4 | 5 | Sample programs for article `Parsing series of integers with SIMD`__ 6 | 7 | __ http://0x80.pl/articles/simd-parsing-int-sequences.html 8 | 9 | Parsers extract integer numbers from strings. A number can be prepended by a 10 | sign character. The numbers are separated by arbitrary sequences of separator 11 | chars. All other characters are invalid; the parsers detect them and raise 12 | an exception. 13 | 14 | This repository contains: 15 | 16 | * scalar reference implementation; 17 | * two variants of SSE parsers; there are also separate variants designed 18 | solely for parsing unsigned numbers; 19 | * scalar hybrid that combines ideas from SIMD parsing with scalar 20 | conversion procedures. 21 | 22 | Requires: C++11 compiler (tested with GCC 7.3) and Python 2.7. 23 | 24 | 25 | Usage 26 | -------------------------------------------------------------------------------- 27 | 28 | Type ``make`` to build all programs. 29 | 30 | Type ``make run-unittests`` to build all unit tests and then run them. 31 | Some tests are time-consuming; be patient. 32 | 33 | Type ``make microbenchmarks.rst`` to run microbenchmarks. 34 | 35 | Type ``make report-overall.rst`` to run performance benchmarks. 36 | 37 | Type ``make spanmaskhistogram.rst`` to produce a runtime analysis report 38 | for the SSE implementation. 
39 | 40 | 41 | Programs 42 | -------------------------------------------------------------------------------- 43 | 44 | There are several programs available in the ``bin`` subdirectory. 45 | 46 | * ``benchmark`` --- test performance of given procedure 47 | * ``benchmark-cpuclocks`` --- measure performance of all procedures; display 48 | CPU clocks 49 | * ``benchmark-all`` --- compare performance of different procedures 50 | * ``compare-signed`` and ``compare-unsigned`` --- are used to 51 | validate that the parsers produce the same results as the reference 52 | parser 53 | * ``compare-avx512`` --- the same as above, but tests only 54 | AVX512BW implementation 55 | * ``statistics`` --- gather execution statistics from SSE parsers 56 | 57 | Apart from these programs, there are several ``verify_*`` executables 58 | that run various unit tests; they are invoked by ``make run-tests``. 59 | 60 | Common arguments 61 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 62 | 63 | All the programs generate random data which is then parsed; the 64 | following command line arguments can be used to control data 65 | characteristics: 66 | 67 | --size=NUMBER input size (in bytes) 68 | --loops=NUMBER how many times a test must be repeated [default: 1] 69 | --seed=NUMBER seed for random number generator [default: 0] 70 | --num=DISTRIBUTION distribution of lengths of numbers 71 | --sep=DISTRIBUTION distribution of lengths of gaps between numbers [default: '1'] 72 | --separators=string list of separator characters [default: ",; "] 73 | --sign=DISTRIBUTION distribution of sign in front of number [default: '1'] 74 | 75 | ``DISTRIBUTION`` is a list of weights separated with commas, which defines 76 | distribution of items. 77 | 78 | In case of ``--num`` it's the count of decimal digits in a random number. 
class Report(object):
    """Nested timing report of the scalar, SSE and SSE-block parsers.

    Measurements read from *path* are grouped by separator distribution,
    then by distribution name, then by number distribution; one table is
    built for every innermost group.
    """

    def __init__(self, path):
        """Load measurements from *path* and build the report structure."""
        with open(path, 'rt') as f:
            self.raw_data = load(f)

        # group by separators distribution
        bysep = lambda item: item.sep_distribution

        self.report = []
        for sep, collection in groupby(self.raw_data, bysep).items():
            self.report.append((
                get_separator_title(sep),
                self.split_by_distribution(collection),
            ))

    def get(self):
        """Return a list of (separator title, per-distribution list)."""
        return self.report

    def split_by_distribution(self, collection):
        """Return a list of (distribution title, per-parameter list)."""
        bynum = lambda item: (item.distribution_name)

        result = []
        for distribution_name, group in groupby(collection, bynum).items():
            result.append((
                get_distribution_title(distribution_name),
                self.split_by_parameters(distribution_name, group),
            ))

        return result

    def split_by_parameters(self, distribution_name, collection):
        """Return a list of (title, table), ordered by parameter weight."""
        byparam = lambda item: item.num_distribution

        result = []
        for key, group in groupby(collection, byparam).items():
            table = self.prepare_table(group)
            ret = get_num_distribution_parameters(distribution_name, key)
            result.append((
                ret.title,
                table,
                ret.weight
            ))

        # the weight is needed only for ordering, it is dropped afterwards
        result.sort(key=lambda row: row[-1])

        return [item[:2] for item in result]

    @staticmethod
    def _time_of(items, procedure):
        """Return the time of the first item measured for *procedure*.

        Raises KeyError when *items* has no entry for *procedure*.
        """
        for item in items:
            if item.procedure == procedure:
                return item.time

        raise KeyError("Procedure '%s' not found" % procedure)

    @staticmethod
    def _speedup(reference, time):
        """Format reference/time, or '---' when *time* is zero."""
        if not time:
            # a zero reading cannot be turned into a ratio
            return '---'

        return '%0.2f' % (float(reference) / time)

    def prepare_table(self, procedures):
        """Build a table of times and speed-ups, one row per (size, loops)."""
        keyfun = lambda item: (item.size, item.loops)

        data = []
        for (size, loops), items in groupby(procedures, keyfun).items():
            data.append((
                size,
                loops,
                self._time_of(items, "scalar"),
                self._time_of(items, "sse"),
                self._time_of(items, "sse-block"),
            ))

        data.sort(key=lambda row: row[0]) # sort by size

        t = Table()
        t.add_header([("input", 2), "scalar", ("SSE", 2), ("SSE block", 2)])
        t.add_header(["size [B]", "loops", "time [us]", "time [us]", "speed-up", "time [us]", "speed-up"])

        for size, loops, t0, t1, t2 in data:
            if t0 < 10 and t1 < 10 and t2 < 10:
                # don't fool people when all measurements are single-digit numbers
                speedup_sse = '---'
                speedup_sse_block = '---'
            else:
                speedup_sse = self._speedup(t0, t1)
                speedup_sse_block = self._speedup(t0, t2)

            t.add_row([
                '{:,}'.format(size),
                '%d' % loops,
                '%d' % t0,
                '%d' % t1,
                speedup_sse,
                '%d' % t2,
                speedup_sse_block,
            ])

        return t
// Feeds the parser a series of numbers that are expected to be out of range
// and checks, via assume_overflow(), that each one is rejected.
//
// For every case the member `value` is set, its decimal text is stored in
// `image`, the text is printed, and assume_overflow() is run on it.
//
// NOTE(review): the template argument of std::numeric_limits is missing in
// this listing; presumably it is the (narrower) element type of `result`,
// so that max()+1 and min()-1 still fit in the int64_t `value` -- confirm.
void check_overflow() {

    // one above the largest representable value
    value = std::numeric_limits::max();
    value += 1;
    image = std::to_string(value);

    printf("'%s' should overflow\n", image.c_str());
    assume_overflow();

    // same number with an explicit '+' sign
    image = '+' + image;
    printf("'%s' should overflow\n", image.c_str());
    assume_overflow();

    // 10-digit positive number
    value = 9999999999l;
    image = std::to_string(value);

    printf("'%s' should overflow\n", image.c_str());
    assume_overflow();

    // 15-digit positive number
    value = 100000000000000l;
    image = std::to_string(value);

    printf("'%s' should overflow\n", image.c_str());
    assume_overflow();

    // one below the smallest representable value
    value = std::numeric_limits::min();
    value -= 1;
    image = std::to_string(value);

    printf("'%s' should overflow\n", image.c_str());
    assume_overflow();

    // 10-digit negative number
    value = -9999999999l;
    image = std::to_string(value);

    printf("'%s' should overflow\n", image.c_str());
    assume_overflow();

    // 15-digit negative number
    value = -100000000000000l;
    image = std::to_string(value);

    printf("'%s' should overflow\n", image.c_str());
    assume_overflow();
}
class Report(object):
    """Collect span-mask statistics and render one table per input size.

    Items are fed in with ``add()``; ``get()`` returns a list of
    ``(title, Table)`` pairs, one per run of consecutive items that share
    the same ``size``.
    """

    def __init__(self):
        self.report = None      # cached result of get()
        self.tmp = []           # list of (size, [statistics tuples])
        self.prev_size = None   # size of the most recently added item

    def add(self, item):
        """Append ``item`` to the group of its input size.

        A new group is started whenever ``item.size`` differs from the size
        of the previously added item, so items are expected to arrive
        ordered by size.
        """
        if item.size != self.prev_size:
            self.prev_size = item.size
            self.tmp.append((item.size, []))

        title = '%s, %s' % (
            get_num_distribution_parameters(item.distribution_name, item.numbers_distribution).title,
            get_separator_title(item.separators_distribution))

        self.tmp[-1][1].append((item.distribution_name, title, item.histogram, item.hwevents, item.cycles))

    def get(self):
        """Return the list of ``(title, table)`` pairs, building it once."""
        if self.report is None:
            self.report = []
            for size, statistics in self.tmp:
                title = 'Input size {:,d} bytes'.format(size)
                self.report.append((title, self.prepare_table(statistics)))

        return self.report

    def prepare_table(self, stats):
        """Build the table for one input size.

        ``stats`` is a list of ``(distribution_name, title, histogram,
        hwevents, cycles)`` tuples as stored by ``add()``.  ``cycles`` may be
        ``None``, in which case both "cycles per byte" cells stay empty.
        """
        t = Table()
        t.add_header(["parameters", ("distinct span masks count", 5), ("cycles per byte", 2), ("branches", 3), ("cache references", 3)])
        t.add_header(["", "< 25%", "< 50%", "< 75%", "< 95%", "100%", "min", "avg", "taken", "mispredicted", "ratio", "count", "missed", "ratio"])

        # The thresholds match the five "distinct span masks count" columns.
        weights = [0.25, 0.50, 0.75, 0.95, 1.00]

        splitted = splitsorted(stats, lambda item: item[0])

        for subarray in splitted:
            # One full-width row with the distribution title, then its data rows.
            title = get_distribution_title(subarray[0][0])
            t.add_row([(title, 14)])

            for _name, parameters, histogram, hwevents, cycles in subarray:

                row = [parameters]

                # histogram
                counts = self.process_histogram(histogram, weights)
                for w in weights:
                    row.append('%d' % counts[w])

                # cycles
                if cycles is None:
                    row.append('')
                    row.append('')
                else:
                    row.append('%0.3f' % cycles[0])
                    row.append('%0.3f' % cycles[1])

                # hwevents
                row.append('%d' % hwevents.branches)
                row.append('%d' % hwevents.branch_misses)
                row.append('%0.2f%%' % (100.0 * hwevents.get_branch_miss_ratio()))
                row.append('%d' % hwevents.cache_references)
                row.append('%d' % hwevents.cache_misses)
                row.append('%0.2f%%' % (100.0 * hwevents.get_cache_miss_ratio()))

                t.add_row(row)

        return t

    def process_histogram(self, list, weights):
        """Count how many histogram entries fit below each cumulative share.

        ``list`` is a sequence of ``(mask, count)`` pairs (the parameter name
        is kept for backward compatibility).  The entries are ordered by
        ascending count; for every weight ``w`` the result holds the number
        of leading entries whose cumulative count divided by the total count
        is ``<= w``.

        The input sequence is left untouched.  When the total count is zero
        no share can be computed and every weight maps to 0.

        Returns a dict ``{weight: number_of_entries}``.
        Raises ``ValueError`` if ``weights`` is empty.
        """
        if len(weights) == 0:
            raise ValueError("weights must not be empty")

        get_count = lambda item: item[1]

        # sorted() is stable like list.sort() but does not modify the
        # caller's histogram.
        ordered = sorted(list, key=get_count)
        total = sum(get_count(item) for item in ordered)

        result = dict((w, 0) for w in weights)
        if total == 0:
            return result

        cumulative = 0
        for k, (_mask, count) in enumerate(ordered):
            cumulative += count
            share = cumulative / float(total)
            for w in result:
                if share <= w:
                    result[w] = k + 1

        return result
INSERTER output) { 55 | 56 | char* data = const_cast(string); 57 | char* end = data + size; 58 | 59 | uint8_t classes_lookup[128]; 60 | prepare_lookup(separators, classes_lookup); 61 | 62 | const __m512i class_lo = _mm512_loadu_si512(reinterpret_cast<__m512i*>(&classes_lookup[0])); 63 | const __m512i class_hi = _mm512_loadu_si512(reinterpret_cast<__m512i*>(&classes_lookup[64])); 64 | while (data + 64 < end) { 65 | const __m512i input = _mm512_loadu_si512(reinterpret_cast<__m512i*>(data)); 66 | 67 | const __m512i classes = _mm512_permutex2var_epi8(class_lo, input, class_hi); 68 | 69 | if (_mm512_test_epi8_mask(classes, classes) != uint64_t(-1)) { 70 | throw std::logic_error("invalid character"); 71 | } 72 | 73 | uint64_t span_mask64 = _mm512_movepi8_mask(classes); 74 | uint64_t sign_mask64 = _mm512_test_epi8_mask(classes, _mm512_set1_epi8(int8_t(0x40))); 75 | 76 | char* bufend = data + 64; 77 | while (data + 16 <= bufend) { 78 | const uint16_t span_mask = span_mask64 & 0xffff; 79 | const uint16_t sign_mask = sign_mask64 & 0xffff; 80 | 81 | const BlockInfo& bi = blocks[span_mask]; 82 | if (sign_mask & bi.invalid_sign_mask) { 83 | throw std::runtime_error("'+' or '-' at invalid position"); 84 | } 85 | 86 | const __m128i chunk = _mm_loadu_si128(reinterpret_cast<__m128i*>(data)); 87 | 88 | const __m128i shuffle_digits = _mm_loadu_si128((const __m128i*)bi.shuffle_digits); 89 | const __m128i shuffle_signs = _mm_loadu_si128((const __m128i*)bi.shuffle_signs); 90 | 91 | const __m128i shuffled = _mm_shuffle_epi8(chunk, shuffle_digits); 92 | const __m128i negate_mask = _mm_cmpeq_epi8(_mm_shuffle_epi8(chunk, shuffle_signs), _mm_set1_epi8('-')); 93 | if (bi.conversion_routine == Conversion::SSE1Digit) { 94 | 95 | sse::convert_1digit(shuffled, bi.element_count, output); 96 | 97 | } else if (bi.conversion_routine == Conversion::SSE2Digits) { 98 | 99 | sse::convert_2digits_signed(shuffled, negate_mask, bi.element_count, output); 100 | 101 | } else if (bi.conversion_routine == 
Conversion::SSE4Digits) { 102 | 103 | sse::convert_4digits_signed(shuffled, negate_mask, bi.element_count, output); 104 | 105 | } else if (bi.conversion_routine == Conversion::SSE8Digits) { 106 | 107 | sse::convert_8digits_signed(shuffled, negate_mask, bi.element_count, output); 108 | 109 | } else { 110 | 111 | printf("case %04x not handled yet\n", span_mask); 112 | assert(false); 113 | } 114 | 115 | data += bi.total_skip; 116 | 117 | span_mask64 >>= bi.total_skip; 118 | sign_mask64 >>= bi.total_skip; 119 | } 120 | 121 | } // for 122 | 123 | // process the tail 124 | scalar::parse_signed(data, string + size - data, separators, output); 125 | } 126 | 127 | } // namespace sse 128 | -------------------------------------------------------------------------------- /test/benchmark.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | #include "time_utils.h" 8 | #include "scalar/scalar-parse-unsigned.h" 9 | #include "sse/sse-matcher.h" 10 | #include "sse/sse-parser-unsigned.h" 11 | #include "sse/sse-block-parser-unsigned.h" 12 | #include "sse/sse-parser-signed.h" 13 | #include "sse/sse-block-parser-signed.h" 14 | 15 | #include "application.h" 16 | 17 | class BenchmarkApp: public Application { 18 | 19 | using Vector = std::vector; 20 | 21 | enum class Procedure { 22 | Scalar, 23 | SSE, 24 | SSEBlock 25 | }; 26 | 27 | std::string procedure_name; 28 | Procedure procedure; 29 | 30 | public: 31 | BenchmarkApp(int argc, char** argv); 32 | 33 | protected: 34 | virtual bool custom_run() override; 35 | virtual void custom_init() override; 36 | virtual void print_custom_help() const override; 37 | 38 | private: 39 | Vector result; 40 | std::string tmp; 41 | 42 | private: 43 | template 44 | uint64_t sum(const T& vec) const { 45 | return std::accumulate(vec.begin(), vec.end(), 0); 46 | } 47 | 48 | template 49 | Clock::time_point::rep measure_time(FUN fun) { 50 | 51 | 
Clock::time_point::rep min = 0; 52 | for (size_t i=0; i < get_loop_count(); i++) { 53 | result.clear(); 54 | const auto t1 = Clock::now(); 55 | fun(); 56 | const auto t2 = Clock::now(); 57 | 58 | const auto dt = elapsed(t1, t2); 59 | if (i == 0) { 60 | min = dt; 61 | } else { 62 | min = std::min(dt, min); 63 | } 64 | } 65 | 66 | return min; 67 | } 68 | 69 | }; 70 | 71 | BenchmarkApp::BenchmarkApp(int argc, char** argv) : Application(argc, argv) {} 72 | 73 | 74 | void BenchmarkApp::custom_init() { 75 | procedure_name = cmdline.get_value("--procedure", ""); 76 | if (procedure_name.empty()) { 77 | throw ArgumentError("Procedure name must not be empty"); 78 | } 79 | 80 | if (procedure_name == "scalar") { 81 | procedure = Procedure::Scalar; 82 | } else if (procedure_name == "sse") { 83 | procedure = Procedure::SSE; 84 | } else if (procedure_name == "sse-block") { 85 | procedure = Procedure::SSEBlock; 86 | } else { 87 | throw ArgumentError("Unknown procedure name. It must be: 'scalar', 'sse', 'sse-block'"); 88 | } 89 | } 90 | 91 | 92 | void BenchmarkApp::print_custom_help() const { 93 | puts("--procedure=NAME where name is 'scalar', sse' or 'sse-block'"); 94 | } 95 | 96 | 97 | bool BenchmarkApp::custom_run() { 98 | 99 | tmp = generate_signed(); 100 | 101 | Clock::time_point::rep time; 102 | 103 | switch (procedure) { 104 | case Procedure::Scalar: 105 | time = measure_time([this] { 106 | scalar::parse_signed(tmp.data(), tmp.size(), get_separators_set().c_str(), 107 | std::back_inserter(result)); 108 | }); 109 | break; 110 | 111 | case Procedure::SSE: 112 | time = measure_time([this] { 113 | sse::NaiveMatcher<8> matcher(get_separators_set().c_str()); 114 | sse::parser_signed( 115 | tmp.data(), 116 | tmp.size(), 117 | get_separators_set().c_str(), 118 | std::move(matcher), 119 | std::back_inserter(result)); 120 | }); 121 | break; 122 | 123 | case Procedure::SSEBlock: 124 | time = measure_time([this] { 125 | sse::NaiveMatcher<8> matcher(get_separators_set().c_str()); 126 | 
sse::parser_block_signed( 127 | tmp.data(), 128 | tmp.size(), 129 | get_separators_set().c_str(), 130 | std::move(matcher), 131 | std::back_inserter(result)); 132 | }); 133 | break; 134 | 135 | default: 136 | __builtin_unreachable(); 137 | time = 0; 138 | assert(false); 139 | break; 140 | } 141 | 142 | printf("input size : %lu\n", get_size()); 143 | printf("loops : %lu\n", get_loop_count()); 144 | printf("procedure : %s\n", procedure_name.c_str()); 145 | printf("time : %ld us\n", time); 146 | // this prevents compiler from optimizing out the benchmark loop 147 | printf("reference results: %lu\n", sum(result)); 148 | 149 | return true; 150 | } 151 | 152 | 153 | int main(int argc, char* argv[]) { 154 | 155 | try { 156 | BenchmarkApp app(argc, argv); 157 | app.run(); 158 | 159 | return EXIT_SUCCESS; 160 | 161 | } catch (std::exception& e) { 162 | printf("%s\n", e.what()); 163 | return EXIT_FAILURE; 164 | } catch (Application::Exit&) { 165 | return EXIT_SUCCESS; 166 | } 167 | } 168 | 169 | -------------------------------------------------------------------------------- /include/sse/sse-block-parser-signed.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | 6 | #include "scalar/scalar-parse-signed.h" 7 | #include "sse-utils.h" 8 | #include "sse-convert.h" 9 | #include "sse-parser-common.h" 10 | #include "sse-parser-statistics.h" 11 | #include "block_info.h" 12 | 13 | namespace sse { 14 | 15 | namespace detail { 16 | 17 | struct result_type { 18 | uint64_t span_mask; 19 | uint64_t sign_mask; 20 | }; 21 | 22 | template 23 | result_type prepare_masks(char* data, MATCHER matcher) { 24 | const __m128i input0 = _mm_loadu_si128(reinterpret_cast<__m128i*>(data + 0*16)); 25 | const __m128i input1 = _mm_loadu_si128(reinterpret_cast<__m128i*>(data + 1*16)); 26 | const __m128i input2 = _mm_loadu_si128(reinterpret_cast<__m128i*>(data + 2*16)); 27 | const __m128i input3 = 
// Parses signed decimal numbers from `string` (of `size` bytes), working on
// 64-byte blocks and handing whatever remains to the scalar parser.
//
//   string, size - input buffer
//   separators   - separator characters; only forwarded to the scalar tail
//   matcher      - passed to detail::prepare_masks for character validation
//   output       - receives the converted numbers
//
// Throws std::runtime_error when a sign character sits at a position that
// the block descriptor marks as invalid; detail::prepare_masks throws
// std::runtime_error("Wrong character") for bytes it does not accept.
//
// NOTE(review): the template parameter list and the const_cast target type
// are missing in this listing.
template
void parser_block_signed(
    const char* string,
    size_t size,
    const char* separators,
    MATCHER matcher,
    INSERTER output) {


    char* data = const_cast(string);
    char* end = data + size;

    // Block loop: runs only while more than 64 bytes are left.
    while (data + 4*16 < end) {
        // 64-bit span (digit or sign) and sign bitmasks for the block
        // starting at `data`.
        detail::result_type res = detail::prepare_masks(data, matcher);

        // A 16-byte window may start anywhere in the first 48 bytes of the
        // block, so its 16 mask bits always lie inside the 64-bit masks.
        char* loopend = data + 3*16;
        while (data < loopend) {
            const uint16_t span_mask = res.span_mask & 0xffff;
            if (span_mask == 0) {
                // no digits or signs in this window: skip all 16 bytes
                res.span_mask >>= 16;
                res.sign_mask >>= 16;
                data += 16;

                continue;

            }

            // `blocks` is indexed by the 16-bit span mask of the window.
            const BlockInfo& bi = blocks[span_mask];
            const uint16_t sign_mask = res.sign_mask & 0xffff;
            if (sign_mask & bi.invalid_sign_mask) {
                throw std::runtime_error("'+' or '-' at invalid position");
            }

            const __m128i input = _mm_loadu_si128(reinterpret_cast<__m128i*>(data));
            char* prevdata = data;
            // Windows without any sign character take the unsigned path.
            // Both helpers return the new read position; presumably they
            // are declared in sse-parser-common.h -- confirm.
            if (sign_mask == 0) {
                data = detail::parse_unsigned(bi, input, data, end, output);
            } else {
                data = detail::parse_signed(bi, input, data, end, output);
            }

            if (data == end) {
                break;
            }

            // Drop the mask bits of the bytes just consumed so that bit 0
            // corresponds to `data` again.
            const int shift = data - prevdata;
            res.span_mask >>= shift;
            res.sign_mask >>= shift;
        } // inner while

    } // while

    // process the tail
    scalar::parse_signed(data, string + size - data, separators, output);
}
// Writes the statistics of one group of vector conversions to `file`.
//
//   file      - destination stream; all output of this function goes here
//   title     - label of the group
//   calls     - number of times the conversion routine was invoked
//   converted - number of values converted by those invocations
//
// When both counters are zero only "<title>: none" is written.  The
// conversions-per-call ratio is replaced by "-" when `calls` is zero.
void print_sse_statistics(FILE* file, const char* title, size_t calls, size_t converted) {

    fprintf(file, "* %s:", title);
    if (calls == 0 && converted == 0) {
        // must go to `file`, not stdout, to stay on the title's line
        fprintf(file, " none\n");
        return;
    }

    fprintf(file, "\n");

    fprintf(file, " - calls: %8lu\n", calls);
    fprintf(file, " - converted nums: %8lu\n", converted);
    fprintf(file, " - conversion/call: ");
    if (calls > 0) {
        fprintf(file, "%11.2f", converted/double(calls));
    } else {
        fprintf(file, "-");
    }
    fprintf(file, "\n");
}
get_SSE_converted(), perc_total); 90 | fprintf(file, " - by unsinged routines : %8lu\n", unsigned_path.get_SSE_converted()); 91 | fprintf(file, " - by singed routines : %8lu\n", signed_path.get_SSE_converted()); 92 | print_skip_histogram(file, *this); 93 | 94 | print_sse_statistics(file, "1-digit vector conversions (unsigned)", unsigned_path.digit1_calls, unsigned_path.digit1_converted); 95 | print_sse_statistics(file, "2-digit vector conversions (unsigned)", unsigned_path.digit2_calls, unsigned_path.digit2_converted); 96 | print_sse_statistics(file, "2-digit vector conversions (signed)", signed_path.digit2_calls, signed_path.digit2_converted); 97 | print_sse_statistics(file, "3-digit vector conversions (unsigned)", unsigned_path.digit3_calls, unsigned_path.digit3_converted); 98 | print_sse_statistics(file, "3-digit vector conversions (signed)", signed_path.digit3_calls, signed_path.digit3_converted); 99 | print_sse_statistics(file, "4-digit vector conversions (unsigned)", unsigned_path.digit4_calls, unsigned_path.digit4_converted); 100 | print_sse_statistics(file, "4-digit vector conversions (signed)", signed_path.digit4_calls, signed_path.digit4_converted); 101 | print_sse_statistics(file, "8-digit vector conversions (unsigned)", unsigned_path.digit8_calls, unsigned_path.digit8_converted); 102 | print_sse_statistics(file, "8-digit vector conversions (signed)", signed_path.digit8_calls, signed_path.digit8_converted); 103 | 104 | print_span_mask_histogram(file, *this); 105 | } 106 | 107 | sse::Statistics::Statistics() { 108 | for (int i=0; i <= 16; i++) 109 | total_skip_histogram[i] = 0; 110 | } 111 | 112 | void sse::Statistics::span_mask_histogram_to_csv(FILE* file) const { 113 | assert(file != nullptr); 114 | 115 | for (const auto& item: stats.span_masks_histogram) { 116 | const uint16_t mask = item.first; 117 | const size_t count = item.second; 118 | fprintf(file, "%02x, %lu\n", mask, count); 119 | } 120 | } 121 | 
/*
 * Measures `test` and prints the best and the average number of cycles per
 * operation.
 *
 *   pre       - statement executed before every measurement (not timed)
 *   test      - statement being measured
 *   test_name - label printed in front of the result
 *   repeat    - number of measurements
 *   size      - number of operations represented by one run of `test`
 *
 * The rdtsc overhead is calibrated on first use and subtracted from every
 * measurement.  A measurement shorter than the calibrated overhead counts
 * as 0 cycles; a plain unsigned subtraction would wrap around and distort
 * the average.
 */
#define BEST_TIME(pre, test, test_name, repeat, size)                         \
  do {                                                                        \
    if (global_rdtsc_overhead == UINT64_MAX) {                                \
      RDTSC_SET_OVERHEAD(rdtsc_overhead_func(1), repeat);                     \
    }                                                                         \
    printf("%-30s\t: ", (test_name)); fflush(stdout);                         \
    uint64_t cycles_start, cycles_final, cycles_diff;                         \
    uint64_t min_diff = UINT64_MAX;                                           \
    uint64_t sum_diff = 0;                                                    \
    for (size_t i = 0; i < (repeat); i++) {                                   \
      pre;                                                                    \
      __asm volatile("" ::: /* pretend to clobber */ "memory");               \
      RDTSC_START(cycles_start);                                              \
      test;                                                                   \
      RDTSC_STOP(cycles_final);                                               \
      cycles_diff = (cycles_final - cycles_start);                            \
      cycles_diff = (cycles_diff > global_rdtsc_overhead)                     \
                  ? (cycles_diff - global_rdtsc_overhead) : 0;                \
      if (cycles_diff < min_diff) min_diff = cycles_diff;                     \
      sum_diff += cycles_diff;                                                \
    }                                                                         \
    uint64_t S = (size);                                                      \
    float cycle_per_op = (min_diff) / (double)S;                              \
    float avg_cycle_per_op = (sum_diff) / ((double)S * (repeat));             \
    printf(" %8.3f cycle/op (best) %8.3f cycle/op (avg)\n", cycle_per_op, avg_cycle_per_op); \
  } while (0)
true; 32 | } catch (std::exception& e) { 33 | printf("failed: %s\n", e.what()); 34 | dump(); 35 | return false; 36 | } 37 | } 38 | 39 | private: 40 | void do_run() { 41 | { 42 | printf("test 1... "); fflush(stdout); 43 | size_t cases = 0; 44 | for (int i=8; i <= 8; i++) { 45 | cases += verify1number(i); 46 | } 47 | printf(" %lu cases chcecked\n", cases); 48 | } 49 | 50 | { 51 | printf("test 2... "); fflush(stdout); 52 | size_t cases = 0; 53 | for (int i=1; i <= 8; i++) { 54 | for (int j=1; j <= 8; j++) { 55 | cases += verify2numbers(i, j); 56 | } 57 | } 58 | printf(" %lu cases chcecked\n", cases); 59 | } 60 | } 61 | 62 | private: 63 | size_t verify1number(const size_t digits) { 64 | assert(digits > 0); 65 | assert(digits <= 8); 66 | 67 | size_t cases = 0; 68 | uint32_t reference = test_number(digits); 69 | for (size_t i=0; i < SIZE; i++) { 70 | clear(); 71 | if (!put_number(i, digits)) break; 72 | cases += 1; 73 | 74 | sse::NaiveMatcher<8> matcher('_'); 75 | result.clear(); 76 | sse::parser(buffer, SIZE, separators, matcher, std::back_inserter(result)); 77 | 78 | if (result.size() != 1) { 79 | throw std::logic_error("size must be 1"); 80 | } 81 | 82 | if (result[0] != reference) { 83 | printf("result = %u, expected = %u\n", result[0], reference); 84 | throw std::logic_error("wrong value"); 85 | } 86 | } 87 | 88 | return cases; 89 | } 90 | 91 | size_t verify2numbers(int digits1, int digits2) { 92 | assert(digits1 > 0); 93 | assert(digits1 <= 8); 94 | assert(digits2 > 0); 95 | assert(digits2 <= 8); 96 | 97 | size_t cases = 0; 98 | 99 | const uint32_t reference1 = test_number(digits1); 100 | const uint32_t reference2 = test_number(digits2); 101 | 102 | for (int i = 0; i < SIZE; i++) { 103 | for (int j = i; j < SIZE; j++) { 104 | clear(); 105 | if (!put_number(i, digits1)) continue; 106 | if (!put_number(j, digits2)) continue; 107 | cases += 1; 108 | 109 | sse::NaiveMatcher<8> matcher('_'); 110 | result.clear(); 111 | sse::parser(buffer, SIZE, separators, matcher, 
// Returns the number whose decimal digits are 1, 2, 3, ... (wrapping to 0
// after 9), `digits` digits long.  This is the value put_number() writes
// into the buffer.  Returns 0 when `digits` is not positive.
uint64_t test_number(int digits) {
    uint64_t value = 0;
    int next_digit = 1;

    for (int remaining = digits; remaining > 0; --remaining) {
        value = value * 10 + next_digit;
        next_digit = (next_digit + 1) % 10;
    }

    return value;
}
199 | 200 | return EXIT_SUCCESS; 201 | } 202 | 203 | -------------------------------------------------------------------------------- /test/utils/application.cpp: -------------------------------------------------------------------------------- 1 | #include "application.h" 2 | 3 | #include "input_generator.h" 4 | #include "time_utils.h" 5 | 6 | #include 7 | #include 8 | 9 | namespace { 10 | 11 | std::vector parse_array(const std::string& str) { 12 | char* c; 13 | const char* s = str.c_str(); 14 | 15 | std::vector result; 16 | while (true) { 17 | const long tmp = strtol(s, &c, 10); 18 | if (*c == ',') { 19 | result.push_back(tmp); 20 | s = c + 1; 21 | } else if (*c == '\0') { 22 | if (c != s) { 23 | result.push_back(tmp); 24 | } 25 | break; 26 | } else { 27 | throw std::logic_error("Invalid character '" + std::string(1, *c) + "' in string \"" + str + "\""); 28 | } 29 | } 30 | 31 | if (result.empty()) { 32 | throw std::logic_error("Expected at least one number"); 33 | } 34 | 35 | return result; 36 | } 37 | 38 | std::string parse_separators(const std::string& s) { 39 | std::set set; 40 | static const std::string reserved_chars{"0123456789+-"}; 41 | 42 | for (char c: s) { 43 | set.insert(c); 44 | } 45 | 46 | const bool empty = set.empty(); 47 | const bool too_large = set.size() > 16; 48 | bool invalid_chars = false; 49 | for (char c: reserved_chars) { 50 | if (set.count(c)) { 51 | invalid_chars = true; 52 | break; 53 | } 54 | } 55 | 56 | if (empty || too_large || invalid_chars) { 57 | throw Application::ArgumentError 58 | ("Separators must be a non empty, up to 16 chars set; " 59 | "forbidden chars are: '0'..'9', '+' and '-'."); 60 | } 61 | 62 | return std::string{set.begin(), set.end()}; 63 | } 64 | 65 | } // namespace unnamed 66 | 67 | 68 | Application::Application(int argc, char* argv[]) 69 | : cmdline(argc, argv) 70 | , quiet(false) 71 | , rd() 72 | , random(rd()) {} 73 | 74 | 75 | bool Application::run() { 76 | init(); 77 | custom_init(); 78 | return custom_run(); 
79 | } 80 | 81 | 82 | void Application::init() { 83 | 84 | if (cmdline.empty() || cmdline.has_flag("-h") || cmdline.has_flag("--help")) { 85 | print_help(); 86 | throw Application::Exit(); 87 | } 88 | 89 | auto to_int = [](const std::string& val) { 90 | return std::stol(val); 91 | }; 92 | 93 | size = cmdline.parse_value("--size", to_int); 94 | debug_size = cmdline.parse_value("--debug", to_int, 0); 95 | loop_count = cmdline.parse_value("--loops", to_int, 1); 96 | separators_set = cmdline.parse_value("--separators", parse_separators, ",; "); 97 | 98 | const auto seed = cmdline.parse_value("--seed", to_int, 0); 99 | random.seed(seed); 100 | 101 | { 102 | const auto arr = cmdline.parse_value>("--num", parse_array); 103 | distribution.numbers = std::discrete_distribution<>(arr.begin(), arr.end()); 104 | } 105 | { 106 | const auto arr = cmdline.parse_value>("--sep", parse_array, {1}); 107 | distribution.separators = std::discrete_distribution<>(arr.begin(), arr.end()); 108 | } 109 | 110 | if (cmdline.has_value("--sign")) { 111 | const auto arr = cmdline.parse_value>("--sign", parse_array, {}); 112 | if (arr.size() != 3) { 113 | throw std::logic_error("--sign expects exactly three-item distribution, like --sign=5,2,1"); 114 | } 115 | distribution.sign = std::discrete_distribution<>(arr.begin(), arr.end()); 116 | sign_nonnull = true; 117 | } else { 118 | sign_nonnull = false; 119 | } 120 | } 121 | 122 | std::string Application::generate_unsigned() { 123 | 124 | std::string tmp; 125 | 126 | const std::string msg = (quiet) ? 
"" : "generating random unsigned numbers "; 127 | measure_time(msg, [&tmp, this]{ 128 | tmp = ::generate_unsigned( 129 | size, 130 | get_separators_set(), 131 | random, 132 | distribution.numbers, 133 | distribution.separators); 134 | }); 135 | assert(tmp.size() == size); 136 | 137 | if (!quiet && debug_size > 0) { 138 | printf("first %lu bytes of the data:\n", debug_size); 139 | fwrite(tmp.data(), debug_size, 1, stdout); 140 | putchar('\n'); 141 | } 142 | 143 | return tmp; 144 | } 145 | 146 | std::string Application::generate_signed() { 147 | 148 | std::string tmp; 149 | 150 | const std::string msg = (quiet) ? "" : "generating random signed numbers "; 151 | measure_time(msg, [&tmp, this]{ 152 | tmp = ::generate_signed( 153 | size, 154 | get_separators_set(), 155 | random, 156 | distribution.numbers, 157 | distribution.separators, 158 | distribution.sign); 159 | }); 160 | assert(tmp.size() == size); 161 | 162 | if (!quiet && debug_size > 0) { 163 | printf("first %lu bytes of the data:\n", debug_size); 164 | fwrite(tmp.data(), debug_size, 1, stdout); 165 | putchar('\n'); 166 | } 167 | 168 | return tmp; 169 | } 170 | 171 | void Application::print_help() const { 172 | printf("Usage: %s [OPTIONS]\n", cmdline.get_program_name().c_str()); 173 | puts(""); 174 | puts("options are"); 175 | puts(""); 176 | puts("--size=NUMBER input size (in bytes)"); 177 | puts("--loops=NUMBER how many times a test must be repeated [default: 1]"); 178 | puts("--seed=NUMBER seed for random number generator [default: 0]"); 179 | puts("--num=DISTRIBUTION distribution of lengths of numbers"); 180 | puts("--sep=DISTRIBUTION distribution of lengths of gaps between numbers [default: '1']"); 181 | puts("--separators=string list of separator characters [default: \",; \"]"); 182 | puts("--sign=DISTRIBUTION distribution of sign in front of number [default: '1']"); 183 | puts("--debug=K prints K first bytes of generated input [default: 0]"); 184 | puts(""); 185 | puts("Distribution is given as a list of 
class DigitsSpan(object):
    """A run of digit positions first..last; both ends belong to the span."""

    def __init__(self, first, last):
        assert first <= last

        self.first = first
        self.last = last

        # a span never exceeds the 16 positions of one chunk
        assert self.digits() <= 16

    def digits(self):
        """Return how many positions the span covers."""
        return 1 + (self.last - self.first)

    def simd_size(self):
        """Return the element size (1, 2, 4, 8 or 16) used for this span."""
        count = self.digits()
        if count in (1, 2):
            return count

        for size in (4, 8):
            if count <= size:
                return size

        return 16

    def __repr__(self):
        return "<%d,%d>" % (self.first, self.last)

    __str__ = __repr__
class Optimizer(object):
    """Pick the element size that converts the most leading spans at once.

    `spans` is a sequence of objects exposing a ``digits()`` method, in the
    order in which they appear in the chunk.
    """

    def __init__(self, spans):
        self.spans = spans

    def get_best(self, element_sizes=None):
        """Return ``(element_size, spans)`` for the best packing, or None.

        Every candidate element size is tried in the given order and the one
        that packs the largest number of leading spans wins; on a tie the
        earlier candidate is kept.  ``element_sizes`` defaults to the
        module-level ``SIMD_ELEMENT_SIZES``.  None is returned when not even
        the first span fits any candidate.
        """
        if element_sizes is None:
            element_sizes = SIMD_ELEMENT_SIZES

        best = None
        for element_size in element_sizes:
            packed = self.__pack(element_size)
            if packed is None:
                continue

            # strict '>' keeps the earlier (smaller) size among equal results
            if best is None or len(packed) > len(best[1]):
                best = (element_size, packed)

        return best

    def __pack(self, element_size, vector_size=16):
        """Return the leading spans that fit ``element_size``, or None.

        Packing stops at the first span that is too wide, or as soon as the
        vector holds ``vector_size // element_size`` spans.
        """
        # floor division: the number of elements in a vector is an integer
        max_count = vector_size // element_size

        result = []
        for span in self.spans:
            if span.digits() > element_size:
                break

            result.append(span)
            if len(result) == max_count:
                break

        if result:
            return result

        return None
def get_invalid_sign_mask(self):
    """Return a 16-bit mask of positions where a sign is not accepted.

    A bit is cleared for the first position of every span in
    ``self.all_spans`` that is longer than one character, since only the
    first character of a span might be '+' or '-'.  Bit 15 is additionally
    cleared when the last span is a single character, which might also be a
    sign.  The mask is stored already negated so that no negation is needed
    at run time.
    """
    allowed = 0
    for span in self.all_spans:
        if span.digits() > 1:
            allowed |= 1 << span.first

    if self.all_spans and self.all_spans[-1].digits() == 1:
        allowed |= 0x8000

    return 0xffff & ~allowed
def main(path):
    """Build the block descriptions for all 2**16 masks and save them.

    The descriptions produced by ``Generator.run()`` are collected into a
    list and handed to the writer, which is asked to save them to ``path``.
    """
    gen = Generator()
    data = list(gen.run())

    writer = Writer(data)
    writer.save(path)


if __name__ == '__main__':
    # sys.argv[0] is the script name, so the output path is sys.argv[1];
    # without this check a missing argument would end in an IndexError
    if len(sys.argv) < 2:
        print("Usage: script output-path")
        sys.exit(1)

    main(sys.argv[1])
// Converts 8-character decimal fields held in `input` into 32-bit values and
// writes the first `count` of them to `output`.
//
// Each number occupies 8 consecutive bytes of `input`, most significant
// digit first.  Bytes below '0' become zero because of the saturating
// subtraction; bytes above '9' are not detected here.
//
// Only the two low 32-bit lanes of the result are distinct (the upper two
// repeat them, see t3), so `count` values above 2 yield repeated numbers.
//
// NOTE(review): the template parameter list (presumably
// `<typename INSERTER>`) looks lost in this copy of the file -- confirm
// against the repository.
template
void convert_8digits(const __m128i& input, int count, INSERTER output) {

    const __m128i ascii0 = _mm_set1_epi8('0');
    const __m128i mul_1_10 = _mm_setr_epi8(10, 1, 10, 1, 10, 1, 10, 1, 10, 1, 10, 1, 10, 1, 10, 1);
    const __m128i mul_1_100 = _mm_setr_epi16(100, 1, 100, 1, 100, 1, 100, 1);
    const __m128i mul_1_10000 = _mm_setr_epi16(10000, 1, 10000, 1, 10000, 1, 10000, 1);

    // =--------------

    // ASCII -> digit values (unsigned saturation: anything below '0' gives 0)
    const __m128i t0 = _mm_subs_epu8(input, ascii0);
    // pairs of bytes -> 16-bit two-digit values: 10 * d0 + d1
    const __m128i t1 = _mm_maddubs_epi16(t0, mul_1_10);
    // pairs of 16-bit values -> 32-bit four-digit values: 100 * hi + lo
    const __m128i t2 = _mm_madd_epi16(t1, mul_1_100);
    // narrow the four-digit values back to 16 bits; both halves of t3 hold
    // the same four values because t2 is passed twice
    const __m128i t3 = _mm_packus_epi32(t2, t2);
    // pairs of four-digit values -> 32-bit eight-digit values: 10000 * hi + lo
    const __m128i t4 = _mm_madd_epi16(t3, mul_1_10000);

    // aligned scratch buffer, as required by _mm_store_si128
    uint32_t tmp[4] SSE_ALIGN;

    _mm_store_si128((__m128i*)tmp, t4);
    for (int i=0; i < count; i++)
        *output++ = tmp[i];
}
// Converts signed decimal fields of up to three digits held in `input` into
// 16-bit values and writes the first `count` of them to `output`.
//
// Each number occupies 4 consecutive bytes of `input`; the first byte of a
// field has weight 0 and the remaining three carry the weights 100, 10, 1.
// Bytes below '0' (this includes '+' and '-') become zero because of the
// saturating subtraction.
//
// `negate_mask` selects the fields to negate.  It is assumed to hold 0xff in
// every byte of a negative field and 0x00 elsewhere, which is what
// parse_signed() builds with _mm_cmpeq_epi8 -- TODO confirm for other callers.
//
// Only four results are distinct: the horizontal add places them in the low
// half of the vector and repeats them in the high half.
//
// NOTE(review): the template parameter list (presumably
// `<typename INSERTER>`) looks lost in this copy of the file -- confirm
// against the repository.
template
void convert_3digits_signed(const __m128i& input, const __m128i& negate_mask, int count, INSERTER output) {

    const __m128i ascii0 = _mm_set1_epi8('0');
    const __m128i mul_all = _mm_setr_epi8(0, 100, 10, 1, 0, 100, 10, 1, 0, 100, 10, 1, 0, 100, 10, 1);

    // (x ^ m) - m negates x where m is all ones and keeps it where m is zero:
    // the sign is applied to the weights rather than to the result
    const __m128i s0 = _mm_xor_si128(mul_all, negate_mask);
    const __m128i s1 = _mm_sub_epi8(s0, negate_mask);

    // =--------------

    // ASCII -> digit values (unsigned saturation: anything below '0' gives 0)
    const __m128i t0 = _mm_subs_epu8(input, ascii0);
    // unsigned digit * signed weight, adjacent products summed into 16 bits:
    // (0 * b0 + 100 * b1) and (10 * b2 + 1 * b3), negated where requested
    const __m128i t1 = _mm_maddubs_epi16(t0, s1);
    // add the two partial sums of every field
    const __m128i t2 = _mm_hadd_epi16(t1, t1);

    // aligned scratch buffer, as required by _mm_store_si128
    int16_t tmp[8] SSE_ALIGN;

    _mm_store_si128((__m128i*)tmp, t2);
    for (int i=0; i < count; i++)
        *output++ = tmp[i];
}
// Converts signed decimal fields of up to eight characters held in `input`
// into 32-bit values and writes the first `count` of them to `output`.
//
// Each number occupies 8 consecutive bytes of `input`, most significant
// digit first.  Bytes below '0' (this includes '+' and '-') become zero
// because of the saturating subtraction.
//
// `negate_mask` selects the fields to negate.  It is assumed to be all ones
// across a negative field and zero elsewhere, which is what parse_signed()
// builds with _mm_cmpeq_epi8 -- TODO confirm for other callers.
//
// Only the two low 32-bit lanes of the result are distinct (the upper two
// repeat them, see t3), so `count` values above 2 yield repeated numbers.
//
// NOTE(review): the template parameter list (presumably
// `<typename INSERTER>`) looks lost in this copy of the file -- confirm
// against the repository.
template
void convert_8digits_signed(const __m128i& input, const __m128i& negate_mask, int count, INSERTER output) {

    const __m128i ascii0 = _mm_set1_epi8('0');
    const __m128i mul_1_10 = _mm_setr_epi8(10, 1, 10, 1, 10, 1, 10, 1, 10, 1, 10, 1, 10, 1, 10, 1);
    const __m128i mul_1_100 = _mm_setr_epi16(100, 1, 100, 1, 100, 1, 100, 1);
    const __m128i mul_1_10000 = _mm_setr_epi16(10000, 1, 10000, 1, 10000, 1, 10000, 1);

    // =--------------

    // ASCII -> digit values (unsigned saturation: anything below '0' gives 0)
    const __m128i t0 = _mm_subs_epu8(input, ascii0);
    // pairs of bytes -> 16-bit two-digit values: 10 * d0 + d1
    const __m128i t1 = _mm_maddubs_epi16(t0, mul_1_10);
    // pairs of 16-bit values -> 32-bit four-digit values: 100 * hi + lo
    const __m128i t2 = _mm_madd_epi16(t1, mul_1_100);

    // (x ^ m) - m negates the four-digit halves of the selected fields; it
    // is done before the last multiply-add so both halves carry the sign
    const __m128i s0 = _mm_xor_si128(t2, negate_mask);
    const __m128i s1 = _mm_sub_epi32(s0, negate_mask);

    // narrow to signed 16 bits; both halves of t3 hold the same four values
    // because s1 is passed twice
    const __m128i t3 = _mm_packs_epi32(s1, s1);
    // pairs of four-digit values -> 32-bit eight-digit values: 10000 * hi + lo
    const __m128i t4 = _mm_madd_epi16(t3, mul_1_10000);

    // aligned scratch buffer, as required by _mm_store_si128
    int32_t tmp[4] SSE_ALIGN;

    _mm_store_si128((__m128i*)tmp, t4);
    for (int i=0; i < count; i++)
        *output++ = tmp[i];
}
input_pattern[i] = Separator; 39 | } 40 | 41 | input_string[16] = 0; 42 | render(); 43 | } 44 | 45 | public: 46 | bool run() { 47 | puts("Full validation of invalid input detection in SSE approach"); 48 | puts("Warning: this might take a few minutes on a decent computer"); 49 | const auto ret = validate(); 50 | if (ret) { 51 | puts("OK"); 52 | } else { 53 | puts("!!!FAILED!!!"); 54 | } 55 | 56 | return ret; 57 | } 58 | 59 | private: 60 | bool validate() { 61 | long id = 0; 62 | 63 | do { 64 | prepare(); 65 | 66 | if (id % 1000000 == 0) { 67 | printf("%ld %s\n", id, input_string); 68 | } 69 | 70 | Result expected; 71 | if (is_valid()) { 72 | if (will_overflow()) 73 | expected = Result::OverflowException; 74 | else 75 | expected = Result::NoException; 76 | } else { 77 | expected = Result::OtherException; 78 | } 79 | 80 | const Result result = SSE_validate_algorithm(); 81 | 82 | if (result != expected) { 83 | printf("failed for %ld: %s\n", id, input_string); 84 | return false; 85 | } 86 | id += 1; 87 | } while (!increment()); 88 | 89 | return true; 90 | } 91 | 92 | Result SSE_validate_algorithm() { 93 | std::vector sink; 94 | try { 95 | sse::NaiveMatcher<1> matcher(SEP); 96 | sse::detail::process_chunk( 97 | input_string, 98 | input_string + 16, 99 | input, 100 | matcher, 101 | std::back_inserter(sink) 102 | ); 103 | return Result::NoException; 104 | } catch (std::range_error&) { 105 | return Result::OverflowException; 106 | } catch (std::exception&) { 107 | return Result::OtherException; 108 | } 109 | } 110 | 111 | void prepare() { 112 | render(); 113 | input = _mm_loadu_si128(reinterpret_cast(input_string)); 114 | } 115 | 116 | void render() { 117 | int j = 0; 118 | for (int i=0; i < 16; i++) { 119 | switch (input_pattern[i]) { 120 | case Separator: 121 | input_string[i] = SEP; 122 | break; 123 | 124 | case Digit: 125 | input_string[i] = DIGIT; 126 | break; 127 | 128 | case Sign: 129 | input_string[i] = (j++ % 2) ? 
PLUS : MINUS; 130 | break; 131 | 132 | case Invalid: 133 | input_string[i] = INVALID; 134 | break; 135 | } 136 | } 137 | } 138 | 139 | Class next(Class x) { 140 | switch (x) { 141 | case Separator: 142 | return Digit; 143 | 144 | case Digit: 145 | return Sign; 146 | 147 | case Sign: 148 | return Invalid; 149 | 150 | case Invalid: 151 | return Separator; 152 | 153 | default: 154 | assert(false); 155 | return Separator; 156 | } 157 | } 158 | 159 | bool increment() { 160 | int i = 0; 161 | do { 162 | const auto n = next(input_pattern[i]); 163 | input_pattern[i] = n; 164 | if (n != Separator) { 165 | return false; 166 | } 167 | 168 | i += 1; 169 | 170 | } while (i < 16); 171 | 172 | return true; 173 | } 174 | 175 | 176 | bool is_valid() { 177 | Class prev = Separator; 178 | for (int i=0; i < 16; i++) { 179 | switch (input_pattern[i]) { 180 | case Separator: 181 | if (prev == Sign) { // a solitary sign, like "__+_" 182 | return false; 183 | } 184 | break; 185 | 186 | case Digit: 187 | // a digit can follow anything 188 | break; 189 | 190 | case Sign: 191 | if (prev != Separator) { 192 | return false; 193 | } 194 | break; 195 | 196 | case Invalid: 197 | return false; 198 | } // switch 199 | prev = input_pattern[i]; 200 | } 201 | 202 | return true; 203 | } 204 | 205 | uint16_t get_span_pattern() const { 206 | // assume is_valid() == true 207 | uint16_t result = 0; 208 | uint16_t bit = 1; 209 | for (int i=0; i < 16; i++, bit <<= 1) { 210 | switch (input_pattern[i]) { 211 | case Separator: 212 | break; 213 | 214 | case Digit: 215 | case Sign: 216 | result |= bit; 217 | break; 218 | 219 | default: 220 | assert(false); 221 | return 0; 222 | 223 | } // switch 224 | } 225 | 226 | return result; 227 | } 228 | 229 | bool will_overflow() const { 230 | // assume is_valid() == true 231 | const BlockInfo& b = blocks[get_span_pattern()]; 232 | if (b.conversion_routine != Conversion::Scalar) { 233 | // only scalar code might cause overflow error 234 | return false; 235 | } 236 | 237 | 
uint32_t result = 0; 238 | bool negative = false; 239 | for (int i=b.first_skip; i < 16; i++) { 240 | switch (input_pattern[i]) { 241 | case Separator: 242 | result = 0; 243 | negative = false; 244 | // scalar code converts just the first span 245 | return false; 246 | 247 | case Digit: 248 | try { 249 | mul10_add_digit(result, DIGIT); 250 | } catch (std::range_error& e) { 251 | return true; 252 | } 253 | break; 254 | 255 | case Sign: 256 | negative = true; // (MINUS == ((j++ % 2) ? PLUS : MINUS)); 257 | result = 0; 258 | break; 259 | 260 | default: 261 | assert(false); 262 | } // switch 263 | } 264 | 265 | return false; 266 | } 267 | 268 | }; 269 | 270 | 271 | int main() { 272 | Test test; 273 | 274 | return test.run() ? EXIT_SUCCESS : EXIT_FAILURE; 275 | } 276 | -------------------------------------------------------------------------------- /include/sse/sse-parser-common.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | #include "safe-convert.h" 9 | #include "sse-utils.h" 10 | #include "sse-convert.h" 11 | #include "sse-parser-statistics.h" 12 | #include "block_info.h" 13 | 14 | namespace sse { 15 | 16 | namespace detail { 17 | 18 | template 19 | char* parse_unsigned(const BlockInfo& bi, const __m128i input, char* data, char* end, INSERTER output) { 20 | 21 | const __m128i shuffle_digits = _mm_loadu_si128((const __m128i*)bi.shuffle_digits); 22 | const __m128i shuffled = _mm_shuffle_epi8(input, shuffle_digits); 23 | 24 | if (bi.conversion_routine == Conversion::SSE1Digit) { 25 | 26 | convert_1digit(shuffled, bi.element_count, output); 27 | 28 | STATS_INC(unsigned_path.digit1_calls); 29 | STATS_ADD(unsigned_path.digit1_converted, bi.element_count); 30 | 31 | } else if (bi.conversion_routine == Conversion::SSE2Digits) { 32 | 33 | convert_2digits(shuffled, bi.element_count, output); 34 | 35 | STATS_INC(unsigned_path.digit2_calls); 36 | 
STATS_ADD(unsigned_path.digit2_converted, bi.element_count); 37 | 38 | } else if (bi.conversion_routine == Conversion::SSE3Digits) { 39 | 40 | convert_3digits(shuffled, bi.element_count, output); 41 | 42 | STATS_INC(unsigned_path.digit3_calls); 43 | STATS_ADD(unsigned_path.digit3_converted, bi.element_count); 44 | 45 | } else if (bi.conversion_routine == Conversion::SSE4Digits) { 46 | 47 | convert_4digits(shuffled, bi.element_count, output); 48 | 49 | STATS_INC(unsigned_path.digit4_calls); 50 | STATS_ADD(unsigned_path.digit4_converted, bi.element_count); 51 | 52 | } else if (bi.conversion_routine == Conversion::SSE8Digits) { 53 | 54 | convert_8digits(shuffled, bi.element_count, output); 55 | 56 | STATS_INC(unsigned_path.digit8_calls); 57 | STATS_ADD(unsigned_path.digit8_converted, bi.element_count); 58 | 59 | } else { 60 | uint32_t result = 0; 61 | bool converted = false; 62 | 63 | data += bi.first_skip; 64 | while (data < end && *data >= '0' && *data <= '9') { 65 | mul10_add_digit(result, *data); 66 | data += 1; 67 | converted = true; 68 | } 69 | 70 | if (converted) { 71 | if (result > std::numeric_limits::max()) { 72 | throw std::range_error("unsigned 32-bit overflow"); 73 | } 74 | *output++ = result; 75 | } 76 | 77 | STATS_INC(unsigned_path.scalar_conversions); 78 | 79 | return data; 80 | } 81 | 82 | #ifdef USE_STATISTICS 83 | stats.total_skip_histogram[bi.total_skip] += 1; 84 | #endif 85 | 86 | return data + bi.total_skip; 87 | } 88 | 89 | template 90 | char* parse_signed( 91 | const BlockInfo& bi, 92 | const __m128i input, 93 | char* data, 94 | char* end, 95 | INSERTER output 96 | ) { 97 | const __m128i ascii_minus = _mm_set1_epi8('-'); 98 | 99 | const __m128i shuffle_digits = _mm_loadu_si128((const __m128i*)bi.shuffle_digits); 100 | const __m128i shuffle_signs = _mm_loadu_si128((const __m128i*)bi.shuffle_signs); 101 | 102 | // Note: there is not need to mask '+' or '-' in the input prior 103 | // shuffling. 
This is possible because ASCII codes of '+' and '-' 104 | // are smaller than '0' (43 < 48 && 45 < 48). These character will 105 | // be zeroed during subtraction of '0'. 106 | const __m128i shuffled = _mm_shuffle_epi8(input, shuffle_digits); 107 | const __m128i shuffled_signs = _mm_shuffle_epi8(input, shuffle_signs); 108 | const __m128i negate_mask = _mm_cmpeq_epi8(shuffled_signs, ascii_minus); 109 | 110 | if (bi.conversion_routine == Conversion::SSE1Digit) { 111 | 112 | convert_1digit(shuffled, bi.element_count, output); 113 | 114 | } else if (bi.conversion_routine == Conversion::SSE2Digits) { 115 | 116 | convert_2digits_signed(shuffled, negate_mask, bi.element_count, output); 117 | 118 | STATS_INC(signed_path.digit2_calls); 119 | STATS_ADD(signed_path.digit2_converted, bi.element_count); 120 | 121 | } else if (bi.conversion_routine == Conversion::SSE3Digits) { 122 | 123 | convert_3digits_signed(shuffled, negate_mask, bi.element_count, output); 124 | 125 | STATS_INC(signed_path.digit3_calls); 126 | STATS_ADD(signed_path.digit3_converted, bi.element_count); 127 | 128 | } else if (bi.conversion_routine == Conversion::SSE4Digits) { 129 | 130 | convert_4digits_signed(shuffled, negate_mask, bi.element_count, output); 131 | 132 | STATS_INC(signed_path.digit4_calls); 133 | STATS_ADD(signed_path.digit4_converted, bi.element_count); 134 | 135 | } else if (bi.conversion_routine == Conversion::SSE8Digits) { 136 | 137 | convert_8digits_signed(shuffled, negate_mask, bi.element_count, output); 138 | 139 | STATS_INC(signed_path.digit8_calls); 140 | STATS_ADD(signed_path.digit8_converted, bi.element_count); 141 | 142 | } else { 143 | bool converted = false; 144 | uint32_t result; 145 | bool negative; 146 | 147 | data += bi.first_skip; 148 | 149 | if (*data == '+') { 150 | data++; 151 | negative = false; 152 | result = 0; 153 | } else if (*data == '-') { 154 | data++; 155 | negative = true; 156 | result = 0; 157 | } else { 158 | result = *data++ - '0'; 159 | negative = false; 160 
| converted = true; 161 | } 162 | 163 | while (data < end && *data >= '0' && *data <= '9') { 164 | mul10_add_digit(result, *data); 165 | data += 1; 166 | converted = true; 167 | } 168 | 169 | if (converted) { 170 | if (negative) { 171 | const int64_t tmp = std::numeric_limits::min(); 172 | const uint32_t absmin = -tmp; 173 | if (result > absmin) { 174 | throw std::range_error("signed 32-bit overflow"); 175 | } 176 | *output++ = -result; 177 | } else { 178 | const uint32_t max = std::numeric_limits::max(); 179 | if (result > max) { 180 | throw std::range_error("signed 32-bit overflow"); 181 | } 182 | *output++ = result; 183 | } 184 | } 185 | 186 | STATS_INC(signed_path.scalar_conversions); 187 | 188 | return data; 189 | } 190 | 191 | #ifdef USE_STATISTICS 192 | stats.total_skip_histogram[bi.total_skip] += 1; 193 | #endif 194 | 195 | return data + bi.total_skip; 196 | } 197 | 198 | } // namespace detail 199 | 200 | } // namespace sse 201 | --------------------------------------------------------------------------------