├── bin
    └── .gitignore
├── obj
    └── .placeholder
├── scripts
    ├── table.py
    ├── hybrid-shift-back.py
    ├── stats-bytes-converted.py
    ├── stats-utilization.py
    ├── hybrid-unsigned.py
    ├── writer.py
    ├── cost.py
    ├── hybrid.py
    ├── hybrid-signed.py
    └── generator.py
├── experiments
    ├── hwevents
    │   ├── testcases.py
    │   ├── experiment.py
    │   ├── loader.py
    │   └── runner.py
    ├── speedup-comparison
    │   ├── loader.py
    │   └── report.py
    ├── spanmaskhistogram
    │   ├── hwevents_loader.py
    │   ├── report_writer.py
    │   ├── experiment.py
    │   ├── testcases.py
    │   ├── loader.py
    │   ├── runner.py
    │   ├── microbenchmark_loader.py
    │   └── report.py
    ├── microbenchmarks
    │   ├── results
    │   │   ├── skylake-i7-6700-gcc7.3.0.metadata
    │   │   └── westmere-i5-m540-gcc7.3.0.metadata
    │   ├── update_reports.sh
    │   ├── writer.py
    │   ├── experiment.py
    │   ├── testcases.py
    │   ├── loader.py
    │   ├── runner.py
    │   └── report.py
    ├── overalltests
    │   ├── results
    │   │   ├── skylake-i7-6700-gcc7.3.0.metadata
    │   │   └── westmere-i5-m540-gcc7.3.0.metadata
    │   ├── average_writer.py
    │   ├── loader.py
    │   ├── report_writer.py
    │   ├── experiment.py
    │   ├── testcases.py
    │   ├── runner.py
    │   ├── average.py
    │   └── report.py
    ├── utils.py
    ├── distribution.py
    ├── prettyprint.py
    └── README.rst
├── include
    ├── scalar
    │   ├── scalar-parse-common.h
    │   ├── scalar-parse-unsigned.h
    │   ├── std-parser-signed.h
    │   └── scalar-parse-signed.h
    ├── block_info.h
    ├── safe-convert.h
    ├── test
    │   ├── input_generator.h
    │   ├── time_utils.h
    │   ├── application.h
    │   ├── command_line.h
    │   ├── linux-perf-events.h
    │   └── benchmark.h
    ├── sse
    │   ├── sse-matcher.h
    │   ├── sse-matcher-stni.h
    │   ├── sse-parser-unsigned.h
    │   ├── sse-utils.h
    │   ├── sse-parser-statistics.h
    │   ├── sse-simplified-parser-signed.h
    │   ├── sse-parser-signed.h
    │   ├── sse-block-parser-unsigned.h
    │   ├── sse-block-parser-signed.h
    │   ├── sse-convert.h
    │   └── sse-parser-common.h
    ├── hybrid-parser.h
    ├── hybrid-parser-signed.h
    └── avx512
    │   └── avx512-parser-signed.h
├── .gitignore
├── test
    ├── unittest
    │   ├── test-stni-matcher.cpp
    │   ├── verify_sse_unsigned_conversion.cpp
    │   ├── verify_sse_signed_overflow_detection.cpp
    │   ├── verify_sse_unsigned_parser.cpp
    │   └── verify_sse_signed_parser_validation.cpp
    ├── spanmaskhistogram.cpp
    ├── statistics.cpp
    ├── utils
    │   ├── command_line.cpp
    │   ├── input_generator.cpp
    │   └── application.cpp
    ├── compare-avx512.cpp
    ├── compare-unsigned.cpp
    ├── benchmark-hwevents.cpp
    ├── compare-signed.cpp
    ├── benchmark-cpuclocks.cpp
    └── benchmark.cpp
├── LICENSE
├── src
    ├── block_info.cpp
    └── sse-parser-statistics.cpp
└── README.rst


/bin/.gitignore:
--------------------------------------------------------------------------------
1 | [a-z]*
2 | 


--------------------------------------------------------------------------------
/obj/.placeholder:
--------------------------------------------------------------------------------
1 | # placeholder
2 | 


--------------------------------------------------------------------------------
/scripts/table.py:
--------------------------------------------------------------------------------
1 | ../experiments/table.py


--------------------------------------------------------------------------------
/experiments/hwevents/testcases.py:
--------------------------------------------------------------------------------
1 | ../spanmaskhistogram/testcases.py


--------------------------------------------------------------------------------
/experiments/speedup-comparison/loader.py:
--------------------------------------------------------------------------------
1 | ../microbenchmarks/loader.py


--------------------------------------------------------------------------------
/experiments/spanmaskhistogram/hwevents_loader.py:
--------------------------------------------------------------------------------
1 | ../hwevents/loader.py


--------------------------------------------------------------------------------
/experiments/microbenchmarks/results/skylake-i7-6700-gcc7.3.0.metadata:
--------------------------------------------------------------------------------
1 | OS: Ubuntu
2 | Compiler: gcc (GCC) 7.3.0
3 | CPU: Intel(R) Core(TM) i7-6700 CPU @ 3.40GHz
4 | 


--------------------------------------------------------------------------------
/experiments/overalltests/results/skylake-i7-6700-gcc7.3.0.metadata:
--------------------------------------------------------------------------------
1 | OS: Ubuntu
2 | Compiler: gcc (GCC) 7.3.0
3 | CPU: Intel(R) Core(TM) i7-6700 CPU @ 3.40GHz
4 | 


--------------------------------------------------------------------------------
/experiments/microbenchmarks/results/westmere-i5-m540-gcc7.3.0.metadata:
--------------------------------------------------------------------------------
1 | OS: Linux
2 | Compiler: gcc (Debian 7.3.0-11) 7.3.0
3 | CPU: Intel(R) Core(TM) i5 CPU M 540  @ 2.53GHz
4 | 


--------------------------------------------------------------------------------
/experiments/overalltests/results/westmere-i5-m540-gcc7.3.0.metadata:
--------------------------------------------------------------------------------
1 | OS: Linux
2 | Compiler: gcc (Debian 7.3.0-11) 7.3.0
3 | CPU: Intel(R) Core(TM) i5 CPU M 540  @ 2.53GHz
4 | 


--------------------------------------------------------------------------------
/experiments/microbenchmarks/update_reports.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/sh
 2 | 
 3 | # Regenerate the .rst report for every raw result file in results/.
 4 | for f in results/*.txt
 5 | do
 6 |     # With no matching files the shell leaves the pattern unexpanded; skip it.
 7 |     [ -e "$f" ] || continue
 8 | 
 9 |     SRC=$f
10 |     TRG="${SRC%.txt}.rst"
11 |     echo "(re)creating $TRG"
12 |     # Quote both paths so names with spaces or glob characters survive.
13 |     python report.py "$SRC" "#*" > "$TRG"
14 | done
15 | 


--------------------------------------------------------------------------------
/include/scalar/scalar-parse-common.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | namespace scalar {
 4 | 
 5 |     // Returns true when character `c` occurs in the NUL-terminated string `set`.
 6 |     // The terminating NUL itself is never matched.
 7 |     //
 8 |     // Declared `inline` because this header defines the function: without it,
 9 |     // including the header from two translation units breaks the one-definition
10 |     // rule and fails at link time.
11 |     inline bool contains(const char* set, char c) {
12 |         for (const char* s = set; *s != '\0'; ++s) {
13 |             if (*s == c) {
14 |                 return true;
15 |             }
16 |         }
17 | 
18 |         return false;
19 |     }
20 | 
21 | }
22 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | # a helper file not included in this repo
 2 | sse-dump.h
 3 | 
 4 | # auto-generated files
 5 | *.inl
 6 | 
 7 | # the rest
 8 | *.o
 9 | *.pyc
10 | 
11 | # overall reports
12 | report-*.rst
13 | overall.txt
14 | 
15 | # microbenchmarks
16 | microbenchmarks.txt
17 | microbenchmarks.rst
18 | 
19 | # span mask histogram
20 | spanmaskhistogram.txt
21 | spanmaskhistogram.rst
22 | 
23 | # hwevents
24 | hwevents.txt
25 | 
26 | perf.data
27 | perf.data.old
28 | 
29 | 


--------------------------------------------------------------------------------
/experiments/utils.py:
--------------------------------------------------------------------------------
 1 | def groupby(sequence, keyfun):
 2 |     d = {}
 3 |     for item in sequence:
 4 |         key = keyfun(item)
 5 |         if key not in d:
 6 |             d[key] = [item]
 7 |         else:
 8 |             d[key].append(item)
 9 | 
10 |     return d
11 | 
12 | def splitsorted(sequence, keyfun):
13 |     prev = None
14 |     result = []
15 |     for item in sequence:
16 |         val = keyfun(item)
17 |         if val != prev:
18 |             result.append([])
19 |             prev = val
20 | 
21 |         result[-1].append(item)
22 | 
23 |     return result
24 | 


--------------------------------------------------------------------------------
/scripts/hybrid-shift-back.py:
--------------------------------------------------------------------------------
 1 | from hybrid import Parser
 2 | 
 3 | def get_shifts():
 4 |     for number in range(2**8):
 5 |         parser = Parser(number)
 6 |         ranges = parser.get_ranges()
 7 |         if len(ranges) == 0:
 8 |             yield 0
 9 |             continue
10 |         
11 |         last = ranges[-1]
12 |         if last.last == 7:
13 |             yield last.digits()
14 |         else:
15 |             yield 0
16 | 
17 | 
18 | def main():
19 |     shifts = list(get_shifts())
20 |     shifts = map(str, shifts)
21 |     print("static int shift[256] = {%s};" % (','.join(shifts)))
22 | 
23 | 
24 | if __name__ == '__main__':
25 |     main()
26 | 


--------------------------------------------------------------------------------
/include/block_info.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include <cstdint>
 4 | #include <cstdio>
 5 | 
 6 | #include <immintrin.h>
 7 | 
 8 | // Tag identifying a conversion routine; stored per entry in
 9 | // BlockInfo::conversion_routine.
10 | // NOTE(review): the enumerator names suggest SSE paths for 1/2/3/4/8-digit
11 | // numbers plus a scalar fallback -- confirm against the code that reads them.
12 | enum class Conversion: uint8_t {
13 |     Empty,
14 |     SSE1Digit,
15 |     SSE2Digits,
16 |     SSE3Digits,
17 |     SSE4Digits,
18 |     SSE8Digits,
19 |     Scalar
20 | };
21 | 
22 | // One entry of the `blocks` table declared below.
23 | // NOTE(review): the meaning of the individual fields is not visible in this
24 | // header -- see the code that fills the table.
25 | struct BlockInfo {
26 |     uint8_t     first_skip;
27 |     uint8_t     total_skip;
28 |     uint8_t     element_count;
29 |     Conversion  conversion_routine;
30 |     uint16_t    invalid_sign_mask;
31 |     uint8_t     shuffle_digits[16];
32 |     uint8_t     shuffle_signs[16];
33 | 
34 |     void dump(FILE* file) const;
35 | };
36 | 
37 | // Defined in another translation unit; 65536 == 2^16 entries.
38 | extern BlockInfo blocks[65536];
39 | 
40 | 


--------------------------------------------------------------------------------
/experiments/distribution.py:
--------------------------------------------------------------------------------
 1 | import math
 2 | 
 3 | 
 4 | def single_digit_distribution(digit, count = 8):
 5 |     tmp = [0] * count
 6 |     tmp[digit - 1] = 1
 7 | 
 8 |     return tmp
 9 | 
10 | 
11 | def uniform_distribution(count):
12 |     return [1] * count
13 | 
14 | 
15 | def normal_distribution(mu, sigma, count = 8):
16 |     tmp = [0] * count
17 | 
18 |     def gauss(x):
19 |         c = 1.0/(sigma * math.sqrt(2 * math.pi))
20 |         return c * math.exp(-(x - mu)**2/(2 * sigma**2))
21 | 
22 |     for x in range(0, count):
23 |         tmp[x] = gauss(float(x + 1))
24 | 
25 |     m = max(tmp)
26 | 
27 |     for x in range(0, count):
28 |         tmp[x] = int(1000 * tmp[x]/m)
29 | 
30 |     return tmp
31 | 
32 | 
33 | 


--------------------------------------------------------------------------------
/experiments/spanmaskhistogram/report_writer.py:
--------------------------------------------------------------------------------
 1 | class RestWriter(object):
 2 |     def __init__(self, file, report):
 3 |         self.file   = file
 4 |         self.report = report
 5 | 
 6 | 
 7 |     def write(self, restsection):
 8 | 
 9 |         assert len(restsection) >= 1
10 | 
11 |         for title, table in self.report:
12 |             self.write_header(title, restsection[0], 80)
13 |             self.file.write('\n')
14 |             self.file.write(str(table))
15 | 
16 | 
17 |     def write_header(self, title, char, width = 80):
18 |         f = self.file
19 |         f.write('\n')
20 |         f.write('\n')
21 |         f.write("%s\n" % title)
22 |         f.write(char * max(len(title), width))
23 |         f.write('\n')
24 | 
25 | 


--------------------------------------------------------------------------------
/include/safe-convert.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include <limits>
 4 | #include <stdexcept>
 5 | 
 6 | // Appends the decimal digit `c` ('0'..'9') to `number`, i.e. performs
 7 | // number = number * 10 + (c - '0'), for an unsigned integer type T.
 8 | //
 9 | // Throws std::range_error when the result does not fit in T; `number`
10 | // is left unchanged in that case.
11 | template <typename T>
12 | void mul10_add_digit(T& number, char c) {
13 |     const T digit = static_cast<T>(c - '0');
14 | 
15 |     // number * 10 + 9 <= MAX  <=>  number <= (MAX - 9)/10
16 |     if (number <= (std::numeric_limits<T>::max() - 9) / 10) {
17 |         // no overflow is possible, use the faster path
18 |         number = static_cast<T>(10 * number + digit);
19 |     } else {
20 |         // Use the type-generic builtins: the __builtin_u*_overflow variants
21 |         // only accept `unsigned int` and so break for any other T.
22 |         T scaled;
23 |         if (__builtin_mul_overflow(number, static_cast<T>(10), &scaled)) {
24 |             throw std::range_error("unsigned overflow (1)");
25 |         }
26 | 
27 |         T sum;
28 |         if (__builtin_add_overflow(scaled, digit, &sum)) {
29 |             throw std::range_error("unsigned overflow (2)");
30 |         }
31 | 
32 |         number = sum;
33 |     }
34 | }
35 | 


--------------------------------------------------------------------------------
/scripts/stats-bytes-converted.py:
--------------------------------------------------------------------------------
 1 | from generator import Generator
 2 | from table import Table
 3 | 
 4 | if __name__ == '__main__':
 5 |     gen = Generator()
 6 | 
 7 |     freq = {}
 8 |     for bi in gen.run():
 9 |         k = bi.total_skip
10 | 
11 |         freq[k] = freq.get(k, 0) + 1
12 | 
13 |     table = Table()
14 |     table.add_header(["bytes processed", "patterns", "%", "cumulative %"])
15 | 
16 |     cumulative = 0
17 |     for total_skip in sorted(freq.keys()):
18 |         count = freq[total_skip]
19 |         cumulative += count
20 | 
21 |         table.add_row([
22 |             '%d' % total_skip,
23 |             '%d' % count,
24 |             '%0.2f%%' % (100 * count/65536.0),
25 |             '%0.2f%%' % (100 * cumulative/65536.0),
26 |         ])
27 | 
28 |     print(table)
29 | 


--------------------------------------------------------------------------------
/include/test/input_generator.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include <cstddef>
 4 | #include <string>
 5 | #include <random>
 6 | 
 7 | std::string generate_unsigned(size_t size,
 8 |                               const std::string& separators_set,
 9 |                               std::mt19937 random,
10 |                               std::discrete_distribution<> numbers,
11 |                               std::discrete_distribution<> separators);
12 | 
13 | std::string generate_signed(size_t size,
14 |                             const std::string& separators_set,
15 |                             std::mt19937 random,
16 |                             std::discrete_distribution<> numbers,
17 |                             std::discrete_distribution<> separators,
18 |                             std::discrete_distribution<> sign);
19 | 


--------------------------------------------------------------------------------
/experiments/overalltests/average_writer.py:
--------------------------------------------------------------------------------
 1 | import sys
 2 | 
 3 | class RestWriter(object):
 4 |     def __init__(self, file, report):
 5 |         self.file   = file
 6 |         self.report = report
 7 | 
 8 | 
 9 |     def write(self, restsection):
10 | 
11 |         assert len(restsection) >= 1
12 | 
13 |         for separator, distribution, table in self.report:
14 |             header = '%s (%s)' % (distribution, separator)
15 |             self.write_header(header, restsection[0], 80)
16 |             self.file.write('\n')
17 |             self.file.write(str(table))
18 | 
19 | 
20 |     def write_header(self, title, char, width = 80):
21 |         f = self.file
22 |         f.write('\n')
23 |         f.write('\n')
24 |         f.write("%s\n" % title)
25 |         f.write(char * max(len(title), width))
26 |         f.write('\n')
27 | 
28 | 


--------------------------------------------------------------------------------
/experiments/overalltests/loader.py:
--------------------------------------------------------------------------------
 1 | class Item(object):
 2 |     __slots__ = ("procedure",
 3 |                  "size",
 4 |                  "loops",
 5 |                  "distribution_name",
 6 |                  "num_distribution",
 7 |                  "sep_distribution",
 8 |                  "sign_distribution",
 9 |                  "time")
10 | 
11 | 
12 | def load(file):
13 |     L = []
14 |     for line in file:   
15 |         F = line.split(';')
16 | 
17 |         L.append(Item())
18 |         item = L[-1]
19 | 
20 |         item.procedure          = F[0]
21 |         item.size               = int(F[1])
22 |         item.loops              = int(F[2])
23 |         item.distribution_name  = F[3]
24 |         item.num_distribution   = F[4]
25 |         item.sep_distribution   = F[5]
26 |         item.sign_distribution  = F[6]
27 |         item.time               = int(F[7])
28 | 
29 |     return L
30 | 
31 | 


--------------------------------------------------------------------------------
/experiments/hwevents/experiment.py:
--------------------------------------------------------------------------------
 1 | import sys
 2 | import os.path
 3 | 
 4 | if __name__ == '__main__' and __package__ is None:
 5 |     sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
 6 | 
 7 | from testcases import testcases
 8 | from runner import Runner, format_distribution
 9 | 
10 | def main():
11 | 
12 |     for item in testcases():
13 |         size, distribution_name, numbers_distribution, separators_distribution, sign_distribution = item
14 | 
15 |         r = Runner(size, numbers_distribution, separators_distribution, sign_distribution)
16 |         result = r.run()
17 |         print("%d;%s;%s;%s;%s;%s" % (
18 |             size,
19 |             distribution_name,
20 |             numbers_distribution,
21 |             separators_distribution,
22 |             sign_distribution,
23 |             result
24 |         ))
25 | 
26 | 
27 | if __name__ == '__main__':
28 |     main()
29 | 


--------------------------------------------------------------------------------
/experiments/spanmaskhistogram/experiment.py:
--------------------------------------------------------------------------------
 1 | import sys
 2 | import os.path
 3 | 
 4 | if __name__ == '__main__' and __package__ is None:
 5 |     sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
 6 | 
 7 | from testcases import testcases
 8 | from runner import Runner, format_distribution
 9 | 
10 | def main():
11 | 
12 |     for item in testcases():
13 |         size, distribution_name, numbers_distribution, separators_distribution, sign_distribution = item
14 | 
15 |         r = Runner(size, numbers_distribution, separators_distribution, sign_distribution)
16 |         result = r.run()
17 |         print("%d;%s;%s;%s;%s;%s" % (
18 |             size,
19 |             distribution_name,
20 |             numbers_distribution,
21 |             separators_distribution,
22 |             sign_distribution,
23 |             result
24 |         ))
25 | 
26 | 
27 | if __name__ == '__main__':
28 |     main()
29 | 


--------------------------------------------------------------------------------
/test/unittest/test-stni-matcher.cpp:
--------------------------------------------------------------------------------
 1 | #include "sse/sse-matcher-stni.h"
 2 | 
 3 | #include <string>
 4 | #include <cassert>
 5 | 
 6 | // True when all 16 bytes of `a` and `b` are equal.
 7 | bool all_bytes_equal(__m128i a, __m128i b) {
 8 | 
 9 |     // The per-byte comparison yields 0xff for equal bytes; the 16 sign bits
10 |     // collected by movemask are all set only if every byte matched.
11 |     const __m128i same = _mm_cmpeq_epi8(a, b);
12 | 
13 |     return _mm_movemask_epi8(same) == 0xffff;
14 | }
15 | 
16 | // Checks the mask STNIMatcher produces for one 16-byte input.
17 | int main() {
18 | 
19 |     const std::string text = ".123,45.;091;.;,";
20 |     assert(text.size() == 16);
21 | 
22 |     sse::STNIMatcher matcher(",.;");
23 | 
24 |     const __m128i* chunk = reinterpret_cast<const __m128i*>(text.c_str());
25 |     const __m128i actual = matcher.get_mask(_mm_loadu_si128(chunk), _mm_setzero_si128());
26 | 
27 |     // -1 (all bits set) at each position of `text` holding ',', '.' or ';'
28 |     const __m128i expected = _mm_setr_epi8(-1, 0, 0, 0, -1, 0, 0, -1, -1, 0, 0, 0, -1, -1, -1, -1);
29 | 
30 |     assert(all_bytes_equal(actual, expected));
31 | }
32 | 


--------------------------------------------------------------------------------
/experiments/microbenchmarks/writer.py:
--------------------------------------------------------------------------------
 1 | class RestWriter(object):
 2 |     def __init__(self, file, report):
 3 |         self.file   = file
 4 |         self.report = report
 5 | 
 6 | 
 7 |     def write(self, restsection):
 8 | 
 9 |         assert len(restsection) >= 2
10 | 
11 |         for section, collection in self.report:
12 |             self.write_header(section, restsection[0], 80)
13 | 
14 |             for subsection, table in collection:
15 |                 self.file.write('\n')
16 |                 self.file.write("**%s**\n" % subsection)
17 |                 self.file.write('\n')
18 |                 self.file.write(str(table))
19 |                 self.file.write('\n')
20 | 
21 | 
22 |     def write_header(self, title, char, width):
23 |         f = self.file
24 |         f.write('\n')
25 |         f.write('\n')
26 |         f.write("%s\n" % title)
27 |         f.write(char * max(len(title), width))
28 |         f.write('\n')
29 | 
30 | 


--------------------------------------------------------------------------------
/experiments/overalltests/report_writer.py:
--------------------------------------------------------------------------------
 1 | class RestWriter(object):
 2 |     def __init__(self, file, report):
 3 |         self.file   = file
 4 |         self.report = report
 5 | 
 6 | 
 7 |     def write(self, restsection):
 8 | 
 9 |         assert len(restsection) >= 3
10 | 
11 |         for separator, collection1 in self.report:
12 |             self.write_header(separator, restsection[0], 80)
13 |             for distribution, collection2 in collection1:
14 |                 self.write_header(distribution, restsection[1], 50)
15 |                 for parameters, table in collection2:
16 |                     self.write_header(parameters, restsection[2], 40)
17 |                     self.file.write('\n')
18 |                     self.file.write(str(table))
19 | 
20 | 
21 |     def write_header(self, title, char, width = 80):
22 |         f = self.file
23 |         f.write('\n')
24 |         f.write('\n')
25 |         f.write("%s\n" % title)
26 |         f.write(char * max(len(title), width))
27 |         f.write('\n')
28 | 
29 | 


--------------------------------------------------------------------------------
/include/test/time_utils.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include <chrono>
 4 | #include <string>
 5 | #include <cstdio>
 6 | 
 7 | using Clock = std::chrono::high_resolution_clock;
 8 | 
 9 | // Returns the time between `t1` and `t2` as a count of UNIT
10 | // (microseconds by default).
11 | template <typename UNIT = std::chrono::microseconds>
12 | Clock::time_point::rep elapsed(const Clock::time_point& t1, const Clock::time_point& t2) {
13 |     return std::chrono::duration_cast<UNIT>(t2 - t1).count();
14 | }
15 | 
16 | // Calls `fun()` once and returns how long the call took, in microseconds.
17 | template <typename FUN>
18 | Clock::time_point::rep measure_time(FUN fun) {
19 | 
20 |     const auto t1 = Clock::now();
21 |     fun();
22 |     const auto t2 = Clock::now();
23 | 
24 |     return elapsed(t1, t2);
25 | }
26 | 
27 | // Same as measure_time(fun); additionally, when `info` is not empty, prints
28 | // `info` before the call and "<time> us" after it on stdout.
29 | template <typename FUN>
30 | Clock::time_point::rep measure_time(const std::string& info, FUN fun) {
31 | 
32 |     if (!info.empty()) {
33 |         printf("%s", info.c_str());
34 |         fflush(stdout);
35 |     }
36 | 
37 |     const auto dt = measure_time(fun);
38 | 
39 |     if (!info.empty()) {
40 |         // The clock's rep is a signed type whose width is platform-dependent;
41 |         // widen it explicitly so that it matches the format specifier.
42 |         printf("%lld us\n", static_cast<long long>(dt));
43 |     }
44 | 
45 |     return dt;
46 | }
47 | 
48 | 


--------------------------------------------------------------------------------
/experiments/microbenchmarks/experiment.py:
--------------------------------------------------------------------------------
 1 | import sys
 2 | import os.path
 3 | 
 4 | if __name__ == '__main__' and __package__ is None:
 5 |     sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
 6 | 
 7 | 
 8 | from testcases import testcases
 9 | from runner import Runner, format_distribution
10 | 
11 | 
12 | def test(file):
13 |     for item in testcases():
14 |         size, loops, distribution_name, numbers_distribution, separators_distribution, sign_distribution = item
15 | 
16 |         r = Runner(size, loops, numbers_distribution, separators_distribution, sign_distribution)
17 |         clocks = r.run()
18 | 
19 |         file.write("%d;%d;%s;%s;%s;%s;%s\n" % (
20 |             size,
21 |             loops,
22 |             distribution_name,
23 |             format_distribution(numbers_distribution),
24 |             format_distribution(separators_distribution),
25 |             format_distribution(sign_distribution),
26 |             str(clocks)))
27 | 
28 | 
29 | def main():
30 |     test(sys.stdout)
31 | 
32 | 
33 | if __name__ == '__main__':
34 |     main()
35 | 


--------------------------------------------------------------------------------
/experiments/overalltests/experiment.py:
--------------------------------------------------------------------------------
 1 | import sys
 2 | import os.path
 3 | 
 4 | if __name__ == '__main__' and __package__ is None:
 5 |     sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
 6 | 
 7 | from testcases import testcases
 8 | from runner import Runner, format_distribution
 9 | 
10 | 
11 | def test(file):
12 |     for item in testcases():
13 |         procedure, size, loops, distribution_name, numbers_distribution, separators_distribution, sign_distribution = item
14 | 
15 |         r = Runner(procedure, size, loops, numbers_distribution, separators_distribution, sign_distribution)
16 |         time = r.run()
17 | 
18 |         file.write("%s;%d;%d;%s;%s;%s;%s;%d\n" % (
19 |             procedure,
20 |             size,
21 |             loops,
22 |             distribution_name,
23 |             format_distribution(numbers_distribution),
24 |             format_distribution(separators_distribution),
25 |             format_distribution(sign_distribution),
26 |             time))
27 | 
28 | 
29 | def main():
30 |     test(sys.stdout)
31 | 
32 | 
33 | if __name__ == '__main__':
34 |     main()
35 | 


--------------------------------------------------------------------------------
/include/sse/sse-matcher.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include <cassert>
 4 | #include <cstring>
 5 | 
 6 | #include "immintrin.h"
 7 | 
 8 | namespace sse {
 9 | 
10 |     // Marks the bytes of a 16-byte vector that equal one of a few characters,
11 |     // using one byte-wise comparison per character.
12 |     //
13 |     // K bounds the length of the string accepted by the constructor (asserted
14 |     // to be shorter than K).
15 |     template<int K>
16 |     class NaiveMatcher {
17 |         __m128i letters[K + 1];
18 |         size_t n;   // index of the last used entry of `letters`
19 | 
20 |     public:
21 |         // Builds the matcher from the characters of the C string `s`.
22 |         //
23 |         // NOTE(review): entries 0..n are filled with n == strlen(s), so the
24 |         // terminating NUL is stored as well and get_mask() also flags zero
25 |         // bytes -- confirm that this is intended.
26 |         NaiveMatcher(const char* s) {
27 |             assert(s != nullptr);
28 |             n = strlen(s);
29 |             assert(n < K);
30 | 
31 |             const char* src = s;
32 |             for (__m128i* slot = letters; slot != letters + n + 1; ++slot, ++src) {
33 |                 *slot = _mm_set1_epi8(*src);
34 |             }
35 |         }
36 | 
37 |         // Builds a matcher for the single character `c`.
38 |         NaiveMatcher(char c) : n(0) {
39 |             letters[0] = _mm_set1_epi8(c);
40 |         }
41 | 
42 |     public:
43 |         // Returns `initial` OR-ed with a mask holding 0xff in every byte of
44 |         // `input` that equals one of the stored characters.
45 |         __m128i get_mask(const __m128i& input, const __m128i& initial) const {
46 |             __m128i mask = initial;
47 |             const __m128i* const last = letters + n;
48 |             for (const __m128i* letter = letters; letter <= last; ++letter) {
49 |                 mask = _mm_or_si128(mask, _mm_cmpeq_epi8(*letter, input));
50 |             }
51 | 
52 |             return mask;
53 |         }
54 |     };
55 | 
56 | } // namespace sse
57 | 


--------------------------------------------------------------------------------
/include/sse/sse-matcher-stni.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include <cstdint>
 4 | #include <cassert>
 5 | #include <cstring>
 6 | 
 7 | #include "immintrin.h"
 8 | 
 9 | namespace sse {
10 | 
11 |     // Marks the bytes of a 16-byte vector that belong to a set of up to 16
12 |     // characters, using the SSE4.2 string comparison _mm_cmpestrm.
13 |     class STNIMatcher {
14 |         __m128i set;
15 |         size_t  set_size;
16 | 
17 |     public:
18 |         // True when `s` is a non-null string of 1 to 16 characters.
19 |         static bool can_handle(const char* s) {
20 |             if (s == nullptr) {
21 |                 return false;
22 |             }
23 | 
24 |             const size_t len = strlen(s);
25 |             return (len > 0) && (len <= 16);
26 |         }
27 | 
28 |     public:
29 |         // Builds the matcher from the characters of `s`; `s` must satisfy
30 |         // can_handle() (checked with assert).
31 |         STNIMatcher(const char* s) {
32 |             assert(can_handle(s));
33 | 
34 |             set_size = strlen(s);
35 | 
36 |             // Copy through a zero-padded buffer: loading 16 bytes straight
37 |             // from `s` would read past the end of a shorter string.
38 |             char padded[16] = {0};
39 |             memcpy(padded, s, set_size < sizeof(padded) ? set_size : sizeof(padded));
40 |             set = _mm_loadu_si128(reinterpret_cast<const __m128i*>(padded));
41 |         }
42 | 
43 |     public:
44 |         // Returns `initial` OR-ed with a mask holding 0xff in every byte of
45 |         // `input` that equals one of the first `set_size` characters of the set.
46 |         __m128i get_mask(const __m128i& input, const __m128i& initial) const {
47 | 
48 |             const uint8_t mode = _SIDD_UBYTE_OPS
49 |                                | _SIDD_CMP_EQUAL_ANY
50 |                                | _SIDD_UNIT_MASK;
51 | 
52 |             return _mm_or_si128(initial, _mm_cmpestrm(set, set_size, input, 16, mode));
53 |         }
54 |     };
55 | 
56 | } // namespace sse
57 | 


--------------------------------------------------------------------------------
/include/scalar/scalar-parse-unsigned.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include <cstdint>
 4 | #include <cstring>
 5 | #include <stdexcept>
 6 | 
 7 | #include "safe-convert.h"
 8 | #include "scalar-parse-common.h"
 9 | 
10 | namespace scalar {
11 | 
12 |     // Parses unsigned decimal numbers from data[0..size) and writes each one
13 |     // (as uint32_t) through `output`.
14 |     //
15 |     // Numbers are runs of '0'..'9'; characters from `separators` end the
16 |     // current number and are otherwise skipped.  Any other character raises
17 |     // std::runtime_error; whatever mul10_add_digit throws propagates unchanged.
18 |     template <typename INSERTER>
19 |     void parse_unsigned(const char* data, size_t size, const char* separators, INSERTER output) {
20 | 
21 |         uint32_t value = 0;
22 |         bool pending = false;   // true while `value` holds at least one digit
23 | 
24 |         const char* const end = data + size;
25 |         for (const char* p = data; p != end; ++p) {
26 |             const char c = *p;
27 | 
28 |             if (!(c < '0' || c > '9')) {
29 |                 mul10_add_digit(value, c);
30 |                 pending = true;
31 |                 continue;
32 |             }
33 | 
34 |             if (!contains(separators, c)) {
35 |                 throw std::runtime_error("Wrong character (scalar)");
36 |             }
37 | 
38 |             if (pending) {
39 |                 *output++ = value;
40 |                 value = 0;
41 |                 pending = false;
42 |             }
43 |         }
44 | 
45 |         if (pending) {
46 |             *output++ = value;
47 |         }
48 |     }
49 | 
50 | } // namespace
51 | 
52 | 
53 | 


--------------------------------------------------------------------------------
/experiments/spanmaskhistogram/testcases.py:
--------------------------------------------------------------------------------
 1 | from distribution import *
 2 | 
 3 | def testcases():
 4 |     sign_distribution = [1,1,1]
 5 | 
 6 |     for size in sizes:
 7 |         for separator_distribution in separator_distributions:
 8 |             for k in range(1, 8 + 1):
 9 |                 numbers_distribution = single_digit_distribution(k)
10 |                 yield size, 'single', numbers_distribution, separator_distribution, sign_distribution
11 | 
12 |             for k in range(1, 8 + 1):
13 |                 numbers_distribution = uniform_distribution(k)
14 |                 yield size, 'uniform', numbers_distribution, separator_distribution, sign_distribution
15 | 
16 |             for k in range(1, 8 + 1):
17 |                 numbers_distribution = normal_distribution(k, 1.0)
18 |                 yield size, 'normal', numbers_distribution, separator_distribution, sign_distribution
19 | 
20 | 
21 | sizes = [
22 |     1024,
23 |     4096,
24 |     65536,
25 |     102400,
26 |     1024000,
27 |     10240000,
28 | ]
29 | 
30 | separator_distributions = [
31 |     [1],            # single character
32 |     [1,1,1,1,1,1]   # from 1 to 6 separators
33 | ]
34 | 
35 | 


--------------------------------------------------------------------------------
/experiments/microbenchmarks/testcases.py:
--------------------------------------------------------------------------------
 1 | from distribution import *
 2 | 
 3 | def testcases():
 4 |     sign_distribution = [1,1,1]
 5 | 
 6 |     for size, loops in sizes:
 7 |         for separator_distribution in separator_distributions:
 8 |             for k in range(1, 8 + 1):
 9 |                 numbers_distribution = single_digit_distribution(k)
10 |                 yield size, loops, 'single', numbers_distribution, separator_distribution, sign_distribution
11 | 
12 |             for k in range(1, 8 + 1):
13 |                 numbers_distribution = uniform_distribution(k)
14 |                 yield size, loops, 'uniform', numbers_distribution, separator_distribution, sign_distribution
15 | 
16 |             for k in range(1, 8 + 1):
17 |                 numbers_distribution = normal_distribution(k, 1.0)
18 |                 yield size, loops, 'normal', numbers_distribution, separator_distribution, sign_distribution
19 | 
20 | 
# (input size, loop count) pairs for which test cases are generated.
sizes = [(1024, 10000), (4096, 10000), (65536, 1000), (102400, 100)]

# Separator distributions tried for every size.
separator_distributions = [
    [1],        # single character
    [1] * 6,    # from 1 to 6 separators
]
32 | 
33 | 


--------------------------------------------------------------------------------
/include/scalar/std-parser-signed.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
#include <cerrno>
#include <climits>
#include <cstdlib>
#include <cstring>
#include <stdexcept>
 7 | 
 8 | namespace scalar {
 9 | 
10 |     namespace cstd {
11 | 
12 |         template <typename INSERTER>
13 |         void parse_signed(const char* data, size_t size, const char* separators, INSERTER output) {
14 | 
15 |             char* ptr = const_cast<char*>(data);
16 |             char* end = ptr + size;
17 | 
18 |             char* endptr;
19 |             while (true) {
20 |                 ptr += strspn(ptr, separators);
21 |                 if (ptr == end) {
22 |                     break;
23 |                 }
24 | 
25 |                 errno = 0;
26 |                 const long val = std::strtol(ptr, &endptr, 10);
27 | 
28 |                 // the following check comes from "man 3 strtol"
29 |                 if ((errno == ERANGE && (val == LONG_MAX || val == LONG_MIN)) || (errno != 0 && val == 0)) {
30 |                     throw std::logic_error("invalid input");
31 |                 }
32 | 
33 |                 if (endptr == ptr) {
34 |                     throw std::logic_error("no digits");
35 |                 }
36 | 
37 |                 ptr = endptr;
38 |                 *output++ = val;
39 |             }
40 |         }
41 |     }
42 | }
43 | 


--------------------------------------------------------------------------------
/experiments/hwevents/loader.py:
--------------------------------------------------------------------------------
 1 | class HWEvents(object):
 2 |     __slots__ = ('branches', 'branch_misses', 'cache_references', 'cache_misses')
 3 | 
 4 |     def get_branch_miss_ratio(self):
 5 |         return float(self.branch_misses) / self.branches
 6 | 
 7 |     def get_cache_miss_ratio(self):
 8 |         return float(self.cache_misses) / self.cache_references
 9 | 
10 | 
class Item(object):
    """One record of a hardware-events results file (filled in by load())."""

    __slots__ = (
        'size',
        'distribution_name',
        'numbers_distribution',
        'separators_distribution',
        'sign_distribution',
        'hwevents',
    )
14 | 
def load(path):
    """Yield one Item per non-blank line of the results file at `path`.

    Each line holds six ';'-separated fields: size, distribution name,
    numbers distribution, separators distribution, sign distribution and a
    sequence of four counters (branches, branch misses, cache references,
    cache misses).

    NOTE(review): the fields are decoded with eval(), which executes
    arbitrary expressions; only load files produced by the experiment
    scripts.
    """
    with open(path, 'rt') as f:
        for line in f:
            if not line.strip():
                # tolerate blank lines (e.g. a trailing newline at EOF)
                continue

            F = line.split(';')

            item = Item()
            item.size                    = int(F[0])
            item.distribution_name       = F[1]
            item.numbers_distribution    = tuple(eval(F[2]))
            item.separators_distribution = tuple(eval(F[3]))
            item.sign_distribution       = tuple(eval(F[4]))

            counters = eval(F[5])
            hwevents = HWEvents()
            hwevents.branches         = counters[0]
            hwevents.branch_misses    = counters[1]
            hwevents.cache_references = counters[2]
            hwevents.cache_misses     = counters[3]
            item.hwevents = hwevents

            yield item
36 | 
37 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | Copyright (c) 2018, Wojciech Muła
 2 | All rights reserved.
 3 | 
 4 | Redistribution and use in source and binary forms, with or without
 5 | modification, are permitted provided that the following conditions are
 6 | met:
 7 | 
 8 | 1. Redistributions of source code must retain the above copyright
 9 |    notice, this list of conditions and the following disclaimer.
10 | 
11 | 2. Redistributions in binary form must reproduce the above copyright
12 |    notice, this list of conditions and the following disclaimer in the
13 |    documentation and/or other materials provided with the distribution.
14 | 
15 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
16 | IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
17 | TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
18 | PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
19 | HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
20 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
21 | TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22 | PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
23 | LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
24 | NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 | 


--------------------------------------------------------------------------------
/scripts/stats-utilization.py:
--------------------------------------------------------------------------------
 1 | from generator import Generator
 2 | from table import Table
 3 | 
 4 | if __name__ == '__main__':
 5 |     gen = Generator()
 6 | 
 7 |     conversion = {}
 8 | 
 9 |     for bi in gen.run():
10 |         size = bi.element_size
11 |         k    = len(bi.spans)
12 | 
13 |         if size not in conversion:
14 |             conversion[size] = []
15 | 
16 |         conversion[size].append(k)
17 | 
18 |     table = Table()
19 |     table.add_header(["element size", "occurances", "%", "avg", "max", "utilization"])
20 | 
21 |     def get_capacity(element_size):
22 |         if element_size == 1:
23 |             return 16
24 |         elif element_size == 2:
25 |             return 8
26 |         elif element_size == 4:
27 |             return 4
28 |         elif element_size == 8:
29 |             return 2
30 |         else:
31 |             return 1
32 | 
33 | 
34 |     for element_size in sorted(conversion.keys()):
35 |         list = conversion[element_size]
36 |         n = len(list)
37 |         avg = sum(list)/float(n)
38 |         utilization = 100 * avg/get_capacity(element_size)
39 | 
40 |         table.add_row([
41 |             '%d' % element_size,
42 |             '%d' % n,
43 |             '%0.1f%%' % (100 * n/65536.0),
44 |             '%0.2f' % avg,
45 |             '%d' % max(list),
46 |             '%0.1f%%' % utilization,
47 |         ])
48 | 
49 |     print(table)
50 | 


--------------------------------------------------------------------------------
/experiments/overalltests/testcases.py:
--------------------------------------------------------------------------------
 1 | from distribution import *
 2 | 
 3 | def testcases():
 4 |     sign_distribution = [1,1,1]
 5 | 
 6 |     for procedure in procedures:
 7 |         for size, loops in sizes:
 8 |             for separator_distribution in separator_distributions:
 9 |                 for k in range(1, 8 + 1):
10 |                     numbers_distribution = single_digit_distribution(k)
11 |                     yield procedure, size, loops, 'single', numbers_distribution, separator_distribution, sign_distribution
12 | 
13 |                 for k in range(1, 8 + 1):
14 |                     numbers_distribution = uniform_distribution(k)
15 |                     yield procedure, size, loops, 'uniform', numbers_distribution, separator_distribution, sign_distribution
16 | 
17 |                 for k in range(1, 8 + 1):
18 |                     numbers_distribution = normal_distribution(k, 1.0)
19 |                     yield procedure, size, loops, 'normal', numbers_distribution, separator_distribution, sign_distribution
20 | 
21 | 
# Names of the procedures for which test cases are generated.
procedures = ['scalar', 'sse', 'sse-block']

# (input size, loop count) pairs for which test cases are generated.
sizes = [
    (1024,     10000),
    (4096,     10000),
    (102400,    1000),
    (1024000,    100),
    (10240000,    10),
]

# Separator distributions tried for every size.
separator_distributions = [
    [1],        # single character
    [1] * 6,    # from 1 to 6 separators
]
40 | 
41 | 


--------------------------------------------------------------------------------
/test/spanmaskhistogram.cpp:
--------------------------------------------------------------------------------
 1 | #include <vector>
 2 | #include <iterator>
 3 | #include <cstdio>
 4 | #include <cstdlib>
 5 | 
 6 | #include "sse/sse-matcher.h"
 7 | #include "sse/sse-parser-signed.h"
 8 | 
 9 | #include "application.h"
10 | 
11 | class App: public Application {
12 | 
13 |     using Vector = std::vector<uint32_t>;
14 | 
15 | public:
16 |     App(int argc, char** argv) : Application(argc, argv) {}
17 |     
18 | private:
19 |     virtual bool custom_run() override;
20 |     virtual void custom_init() override;
21 | };
22 | 
23 | void App::custom_init() {
24 |     quiet = true;
25 | }
26 | 
27 | bool App::custom_run() {
28 |     const auto tmp = generate_signed();
29 | 
30 |     const char* separators = ",; ";
31 |     sse::NaiveMatcher<8> matcher(separators);
32 |     std::vector<int32_t> result;
33 |     sse::parser_signed(tmp.data(), tmp.size(), separators, std::move(matcher), std::back_inserter(result));
34 |     sse::stats.span_mask_histogram_to_csv(stdout);
35 | 
36 |     return true;
37 | }
38 | 
39 | int main(int argc, char* argv[]) {
40 | 
41 |     try {
42 |         App app(argc, argv);
43 | 
44 |         app.run();
45 | #ifndef USE_STATISTICS
46 |         puts("Program was not compiled with USE_STATISTICS");
47 | #endif
48 |         return EXIT_SUCCESS;
49 | 
50 |     } catch (std::exception& e) {
51 |         printf("%s\n", e.what());
52 |         return EXIT_FAILURE;
53 |     } catch (Application::Exit&) {
54 |         return EXIT_SUCCESS;
55 |     }
56 | }
57 | 
58 | 


--------------------------------------------------------------------------------
/src/block_info.cpp:
--------------------------------------------------------------------------------
 1 | #include "block_info.h"
 2 | 
 3 | #include "block_info.inl"
 4 | 
 5 | namespace {
 6 |     
 7 |     void as_array(FILE* f, const uint8_t data[16]) {
 8 |         fprintf(f, "{");
 9 |         fprintf(f, "%02x", data[0]);
10 |         for (int i=1; i < 16; i++)
11 |             fprintf(f, ", %02x", data[i]);
12 |         fprintf(f, "}\n");
13 |     }
14 | 
15 |     const char* to_string(Conversion c) {
16 |         switch (c) {
17 |             case Conversion::Scalar:
18 |                 return "Scalar";
19 | 
20 |             case Conversion::SSE1Digit:
21 |                 return "SSE1Digit";
22 | 
23 |             case Conversion::SSE2Digits:
24 |                 return "SSE2Digits";
25 | 
26 |             case Conversion::SSE4Digits:
27 |                 return "SSE4Digits";
28 | 
29 |             case Conversion::SSE8Digits:
30 |                 return "SSE8Digits";
31 | 
32 |             default:
33 |                 return "<unknown>";
34 |         }
35 |     }
36 | 
37 | } // namespace
38 |  
39 | void BlockInfo::dump(FILE* f) const {
40 |     fprintf(f, "first_skip          : %d\n", first_skip);
41 |     fprintf(f, "total_skip          : %d\n", total_skip);
42 |     fprintf(f, "element_count       : %d\n", element_count);
43 |     fprintf(f, "conversion          : %s\n", to_string(conversion_routine));
44 |     fprintf(f, "invalid_sign_mask   : %04x\n", invalid_sign_mask);
45 |     fprintf(f, "shuffle_digits      : "); as_array(f, shuffle_digits);
46 |     fprintf(f, "shuffle_signs       : "); as_array(f, shuffle_signs);
47 | }
48 | 


--------------------------------------------------------------------------------
/include/sse/sse-parser-unsigned.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include <vector>
 4 | #include <cassert>
 5 | 
 6 | #include "scalar/scalar-parse-unsigned.h"
 7 | #include "sse-utils.h"
 8 | #include "sse-convert.h"
 9 | #include "sse-parser-common.h"
10 | #include "sse-parser-statistics.h"
11 | #include "block_info.h"
12 | 
13 | namespace sse {
14 | 
15 |     template <typename MATCHER, typename INSERTER>
16 |     void parser(const char* string, size_t size, const char* separators, MATCHER matcher, INSERTER output) {
17 | 
18 |         char* data = const_cast<char*>(string);
19 |         char* end  = data + size;
20 | 
21 |         while (data + 16 < end) {
22 |             const __m128i  input = _mm_loadu_si128(reinterpret_cast<__m128i*>(data));
23 |             const __m128i  t0 = decimal_digits_mask(input);
24 |             const uint16_t digit_mask = _mm_movemask_epi8(t0);
25 |             const uint16_t valid_mask = _mm_movemask_epi8(matcher.get_mask(input, t0));
26 | 
27 |             STATS_INC(loops);
28 |             STATS_SPAN_MASK(digit_mask);
29 | 
30 |             if (valid_mask != 0xffff) {
31 |                 throw std::runtime_error("Wrong character");
32 |             }
33 | 
34 |             if (digit_mask == 0) {
35 |                 data += 16;
36 |                 continue;
37 |             }
38 | 
39 |             const BlockInfo& bi = blocks[digit_mask];
40 |             data = detail::parse_unsigned(bi, input, data, end, output);
41 | 
42 |         } // for
43 | 
44 |         // process the tail
45 |         scalar::parse_unsigned(data, string + size - data, separators, output);
46 |     }
47 | 
48 | } // namespace sse
49 | 


--------------------------------------------------------------------------------
/include/test/application.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include <cstddef>
 4 | #include <map>
 5 | #include <random>
 6 | #include <string>
 7 | 
 8 | #include "command_line.h"
 9 | 
10 | class Application {
11 | 
12 | public:
13 |     class Exit {};
14 |     class ArgumentError: public std::logic_error {
15 |     public:
16 |         ArgumentError(const std::string& msg) : std::logic_error(msg) {}
17 |     };
18 | 
19 | protected:
20 |     CommandLine cmdline;
21 | 
22 | protected:
23 |     bool quiet;
24 | 
25 | private:
26 |     size_t size;
27 |     size_t debug_size;
28 |     size_t loop_count;
29 |     struct {
30 |         std::discrete_distribution<> numbers;
31 |         std::discrete_distribution<> separators;
32 |         std::discrete_distribution<> sign;
33 |     } distribution;
34 |     bool sign_nonnull;
35 |     std::string separators_set;
36 | 
37 |     std::random_device rd;
38 |     std::mt19937 random;
39 | 
40 | public:
41 |     bool run();
42 | 
43 | protected:
44 |     Application(int argc, char* argv[]);
45 | 
46 |     virtual bool custom_run() = 0;
47 |     virtual void custom_init();
48 | 
49 |     bool has_signed_distribution() const {
50 |         return sign_nonnull;
51 |     }
52 | 
53 |     std::string get_separators_set() const {
54 |         return separators_set;
55 |     }
56 |     std::string generate_unsigned();
57 |     std::string generate_signed();
58 | 
59 | public:
60 |     size_t get_size() const {
61 |         return size;
62 |     }
63 | 
64 |     size_t get_loop_count() const {
65 |         return loop_count;
66 |     }
67 | 
68 | protected:
69 |     virtual void print_custom_help() const;
70 | 
71 | private:
72 |     void init();
73 |     void print_help() const;
74 | };
75 | 
76 | 


--------------------------------------------------------------------------------
/experiments/hwevents/runner.py:
--------------------------------------------------------------------------------
 1 | import subprocess
 2 | from os.path import exists
 3 | 
 4 | EXECUTABLE = './bin/benchmark-hwevents'
 5 | LOOPS = 3
 6 | 
 7 | 
 8 | class Runner(object):
 9 |     def __init__(self, size, numbers_distribution, separators_distribution, sign_distribution):
10 | 
11 |         assert len(numbers_distribution) > 0
12 |         assert len(separators_distribution) > 0
13 |         assert len(sign_distribution) > 0
14 | 
15 |         self.size   = size
16 |         self.numbers_distribution    = numbers_distribution
17 |         self.separators_distribution = separators_distribution
18 |         self.sign_distribution       = sign_distribution
19 | 
20 |     def run(self):
21 |         args = self.__prepare_arguments()
22 |         proc = subprocess.Popen(args, stdout=subprocess.PIPE)
23 |         res  = proc.communicate()[0]
24 |         ret  = proc.wait()
25 |         if ret != 0:
26 |             print(args)
27 |             print(res)
28 |             raise RuntimeError("program failed")
29 | 
30 |         return self.__parse_output(res)
31 | 
32 | 
33 |     def __prepare_arguments(self):
34 |         return (
35 |             EXECUTABLE,
36 |             '--size=%d'         % self.size,
37 |             '--num=%s'          % format_distribution(self.numbers_distribution),
38 |             '--sep=%s'          % format_distribution(self.separators_distribution),
39 |             '--sign=%s'         % format_distribution(self.sign_distribution),
40 |             '--loops=%d'        % LOOPS,
41 |             '--csv-output'
42 |         )
43 | 
44 | 
45 |     def __parse_output(self, output):
46 |         return list(map(int, (s.strip() for s in output.split(b','))))
47 | 
48 | 
def format_distribution(dist):
    """Return the items of `dist` converted with str() and joined by ','."""
    return ','.join(str(weight) for weight in dist)
51 | 


--------------------------------------------------------------------------------
/experiments/spanmaskhistogram/loader.py:
--------------------------------------------------------------------------------
 1 | from hwevents_loader import load as load_hwevents
 2 | from microbenchmark_loader import load as load_cycles
 3 | 
 4 | def load(spanmaskhistogram, hwevents, microbenchmark):
 5 | 
 6 |     microbenchmarks_dict = {}
 7 |     for item in load_cycles(microbenchmark):
 8 |         key = (item.size, item.num_distribution, item.sep_distribution, item.sign_distribution)
 9 |         microbenchmarks_dict[key] = item.cycles['SSE']
10 | 
11 |     hwevents_dict = {}
12 |     for item in load_hwevents(hwevents):
13 |         key = (item.size, item.numbers_distribution, item.separators_distribution, item.sign_distribution)
14 |         hwevents_dict[key] = item.hwevents
15 |     
16 |     for item in load_histogram(spanmaskhistogram):
17 |         key = (item.size, item.numbers_distribution, item.separators_distribution, item.sign_distribution)
18 |         item.hwevents = hwevents_dict[key]
19 |         item.cycles   = microbenchmarks_dict.get(key, None)
20 | 
21 |         yield item
22 | 
23 | 
class Item(object):
    """One histogram record; load() also fills `hwevents` and `cycles`."""

    __slots__ = (
        'size',
        'distribution_name',
        'numbers_distribution',
        'separators_distribution',
        'sign_distribution',
        'histogram',
        'hwevents',
        'cycles',
    )
28 | 
def load_histogram(path):
    """Yield one Item per non-blank line of the histogram file at `path`.

    Each line holds six ';'-separated fields: size, distribution name,
    numbers distribution, separators distribution, sign distribution and
    the histogram itself.

    NOTE(review): the fields are decoded with eval(), which executes
    arbitrary expressions; only load files produced by the experiment
    scripts.
    """
    with open(path, 'rt') as f:
        for line in f:
            if not line.strip():
                # tolerate blank lines (e.g. a trailing newline at EOF)
                continue

            F = line.split(';')

            item = Item()
            item.size                    = int(F[0])
            item.distribution_name       = F[1]
            item.numbers_distribution    = tuple(eval(F[2]))
            item.separators_distribution = tuple(eval(F[3]))
            item.sign_distribution       = tuple(eval(F[4]))
            item.histogram               = eval(F[5])

            yield item
43 | 
44 | 


--------------------------------------------------------------------------------
/include/test/command_line.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include <string>
 4 | #include <vector>
 5 | #include <stdexcept>
 6 | 
 7 | class CommandLine {
 8 | 
 9 |     std::vector<std::string> args;
10 | 
11 |     class OptionNotFound: public std::logic_error {
12 |     public:
13 |         OptionNotFound(const std::string& s) : std::logic_error(s) {}
14 |     };
15 | 
16 | public:
17 |     CommandLine(int argc, char** argv);
18 | 
19 | public:
20 |     bool empty() const { return args.size() == 1; }
21 | 
22 |     // like "-h", "--version"
23 |     bool has_flag(const std::string& flag) const;
24 | 
25 |     // for "--name=value" returns "value"
26 |     std::string get_value(const std::string& option) const;
27 |     std::string get_value(const std::string& option, const std::string& defvalue) const;
28 |     bool has_value(const std::string& option) const;
29 | 
30 |     template <typename T, typename CONVERSION>
31 |     T parse_value(const std::string& option, CONVERSION conv) {
32 |         try {
33 |             return conv(get_value(option));
34 |         } catch (OptionNotFound&) {
35 |             throw;
36 |         } catch (std::exception& e) {
37 |             const auto msg = "Wrong value of '" + option + "': " + std::string(e.what());
38 |             throw std::logic_error(msg);
39 |         }
40 |     }
41 | 
42 |     template <typename T, typename CONVERSION>
43 |     T parse_value(const std::string& option, CONVERSION conv, const T& defvalue) {
44 |         try {
45 |             return conv(get_value(option));
46 |         } catch (OptionNotFound&) {
47 |             return defvalue;
48 |         } catch (std::exception& e) {
49 |             const auto msg = "Wrong value of '" + option + "': " + std::string(e.what());
50 |             throw std::logic_error(msg);
51 |         }
52 |     }
53 | 
54 |     const std::string& get_program_name() const;
55 | };
56 | 


--------------------------------------------------------------------------------
/experiments/spanmaskhistogram/runner.py:
--------------------------------------------------------------------------------
 1 | import subprocess
 2 | from os.path import exists
 3 | 
 4 | EXECUTABLE = './bin/spanmaskhistogram'
 5 | 
 6 | 
 7 | class Runner(object):
 8 |     def __init__(self, size, numbers_distribution, separators_distribution, sign_distribution):
 9 | 
10 |         assert len(numbers_distribution) > 0
11 |         assert len(separators_distribution) > 0
12 |         assert len(sign_distribution) > 0
13 | 
14 |         self.size   = size
15 |         self.numbers_distribution    = numbers_distribution
16 |         self.separators_distribution = separators_distribution
17 |         self.sign_distribution       = sign_distribution
18 | 
19 |     def run(self):
20 |         args = self.__prepare_arguments()
21 |         proc = subprocess.Popen(args, stdout=subprocess.PIPE)
22 |         res  = proc.communicate()[0]
23 |         ret  = proc.wait()
24 |         if ret != 0:
25 |             print(args)
26 |             print(res)
27 |             raise RuntimeError("program failed")
28 | 
29 |         return self.__parse_output(res)
30 | 
31 | 
32 |     def __prepare_arguments(self):
33 |         return (
34 |             EXECUTABLE,
35 |             '--size=%d'         % self.size,
36 |             '--num=%s'          % format_distribution(self.numbers_distribution),
37 |             '--sep=%s'          % format_distribution(self.separators_distribution),
38 |             '--sign=%s'         % format_distribution(self.sign_distribution),
39 |         )
40 | 
41 | 
42 |     def __parse_output(self, output):
43 |         res = []
44 |         for line in output.splitlines():
45 |             F = line.split(b',')
46 |             mask  = int(F[0].strip(), 16)
47 |             count = int(F[1].strip())
48 | 
49 |             res.append((mask, count))
50 | 
51 |         return res
52 | 
53 | 
def format_distribution(dist):
    """Return the items of `dist` converted with str() and joined by ','."""
    return ','.join(str(weight) for weight in dist)
56 | 


--------------------------------------------------------------------------------
/include/hybrid-parser.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include <cstddef>
 4 | #include <cstdint>
 5 | #include <immintrin.h>
 6 | #include "sse/sse-utils.h"
 7 | 
 8 | namespace scalar {
 9 | 
10 |     template<int N>
11 |     uint32_t convert(const char* s, uint32_t prev) {
12 |         return convert<N - 1>(s + 1, prev * 10 + uint8_t(s[0]) - '0');
13 |     }
14 | 
15 |     template<>
16 |     uint32_t convert<0>(const char* /*s*/, uint32_t prev) {
17 |         return prev;
18 |     }
19 | 
20 |     template<int N>
21 |     uint32_t convert(const char* s) {
22 |         return convert<N>(s, 0);
23 |     }
24 | 
25 | }
26 | 
// Parse unsigned numbers from `string` (`size` bytes).
//
// While more than 16 bytes remain, a 16-byte chunk is loaded and classified:
// `matcher.get_mask` must accept every byte, otherwise
// std::runtime_error("Wrong character") is thrown.  A chunk without digits is
// skipped.  Otherwise the lower and then the upper 8 bits of the digit mask
// select a case from the textually included "hybrid-parser-unsigned.inl",
// and `data` advances by 8 bytes after each half.  The remaining tail is
// parsed by scalar::parse_unsigned.
//
// NOTE(review): the cases in the .inl file are not visible here; presumably
// they store converted numbers through `output` -- confirm.
template <typename MATCHER, typename INSERTER>
void hybrid_parser(const char* string, size_t size, const char* separators, MATCHER matcher, INSERTER output) {
    char* data = const_cast<char*>(string);
    char* end  = data + size;
    // Not referenced by the code visible in this function; presumably state
    // used by the included .inl cases -- TODO confirm.
    bool has_last = false;
    uint32_t val = 0;
    while (data + 16 < end) {
        const __m128i  input = _mm_loadu_si128(reinterpret_cast<__m128i*>(data));
        const __m128i  t0 = sse::decimal_digits_mask(input);
        const uint16_t digit_mask = _mm_movemask_epi8(t0);
        const uint16_t valid_mask = _mm_movemask_epi8(matcher.get_mask(input, t0));

        // every one of the 16 bytes has to be accepted by the matcher
        if (valid_mask != 0xffff) {
            throw std::runtime_error("Wrong character");
        }

        // no digits in this chunk: skip it as a whole
        if (digit_mask == 0) {
            data += 16;
            continue;
        }

        // lower half: bytes 0..7 of the chunk
        switch (digit_mask & 0xff) {
            #include "hybrid-parser-unsigned.inl"
        }

        data += 8;

        // upper half: bytes 8..15 of the chunk
        switch (digit_mask >> 8) {
            #include "hybrid-parser-unsigned.inl"
        }

        data += 8;
    } // while

    // process the tail
    scalar::parse_unsigned(data, string + size - data, separators, output);
}
64 | 


--------------------------------------------------------------------------------
/experiments/prettyprint.py:
--------------------------------------------------------------------------------
 1 | class Parameters(object):
 2 |     def __init__(self, weight, title):
 3 |         self.weight = weight
 4 |         self.title  = title
 5 | 
 6 | def get_num_distribution_parameters(distribution_name, num_distribution):
 7 | 
 8 |     if type(num_distribution) is str:
 9 |         distr = map(int, num_distribution.split(','))
10 |     else:
11 |         distr = num_distribution
12 | 
13 |     def format_count(count, noun):
14 |         if count == 1:
15 |             return '%d %s' % (count, noun)
16 |         else:
17 |             return '%d %ss' % (count, noun)
18 | 
19 |     if distribution_name == 'single':
20 |         def get_fixed():
21 |             return distr.index(1) + 1
22 | 
23 |         n = get_fixed()
24 |         return Parameters(n, format_count(n, 'digit'))
25 | 
26 |     elif distribution_name == 'normal':
27 |         def get_max():
28 |             return distr.index(max(distr)) + 1
29 | 
30 |         n = get_max()
31 |         return Parameters(n, "max at %d digit" % n)
32 | 
33 |     elif distribution_name == 'uniform':
34 |         n = len(distr)
35 |         return Parameters(n, "1 .. %s" % format_count(n, 'digit'))
36 |     
37 |     assert False
38 | 
39 | 
def get_distribution_title(distribution_name):
    """Return the display title of a distribution name.

    Known names are 'single', 'normal' and 'uniform'; any other value fails
    with AssertionError.
    """
    known_titles = (
        ('single',  'Fixed length'),
        ('normal',  'Gaussian distribution'),
        ('uniform', 'Uniform distribution'),
    )

    for name, title in known_titles:
        if distribution_name == name:
            return title

    assert False
49 | 
50 | 
def get_separator_title(sep_distribution):
    """Return a display title for a separators distribution.

    sep_distribution -- the weights, either as a comma-separated string
                        ("1,1,1") or as a sequence

    A distribution consisting of the single weight 1 gives
    'single separator character'; anything else gives
    '1 .. <k> separator characters', k being the number of weights.
    """
    if type(sep_distribution) is str:
        sep = sep_distribution.split(',')
    else:
        sep = sep_distribution

    # Compare via str() so the string form "1" and the sequence forms (1,)
    # and [1] are all recognized; comparing against ['1'] directly never
    # matches a tuple or a list of ints.
    if len(sep) == 1 and str(sep[0]).strip() == '1':
        separator = 'single separator character'
    else:
        k = len(sep)
        separator = '1 .. %d separator characters' % k

    return separator
64 | 
65 | 


--------------------------------------------------------------------------------
/include/sse/sse-utils.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include <cstring>
 4 | #include <immintrin.h>
 5 | 
 6 | namespace sse {
 7 | 
 8 |     __m128i decimal_digits_mask(const __m128i input) {
 9 |         const __m128i ascii0 = _mm_set1_epi8('0');
10 |         const __m128i after_ascii9 = _mm_set1_epi8('9' + 1);
11 | 
12 |         const __m128i t0 = _mm_cmplt_epi8(input, ascii0); // t1 = (x < '0')
13 |         const __m128i t1 = _mm_cmplt_epi8(input, after_ascii9); // t0 = (x <= '9')
14 | 
15 |         return _mm_andnot_si128(t0, t1); // x <= '9' and x >= '0'
16 |     }
17 | 
18 |     __m128i sign_mask(const __m128i input) {
19 |         const __m128i t0 = _mm_cmpeq_epi8(input, _mm_set1_epi8('+'));
20 |         const __m128i t1 = _mm_cmpeq_epi8(input, _mm_set1_epi8('-'));
21 | 
22 |         return _mm_or_si128(t0, t1);
23 |     }
24 | 
25 |     uint64_t compose_bitmask(const __m128i bytemask0,
26 |                              const __m128i bytemask1,
27 |                              const __m128i bytemask2,
28 |                              const __m128i bytemask3) {
29 | 
30 |         const uint64_t mask0 = _mm_movemask_epi8(bytemask0);
31 |         const uint64_t mask1 = _mm_movemask_epi8(bytemask1);
32 |         const uint64_t mask2 = _mm_movemask_epi8(bytemask2);
33 |         const uint64_t mask3 = _mm_movemask_epi8(bytemask3);
34 | 
35 |         return mask0 
36 |              | (mask1 << (1*16))
37 |              | (mask2 << (2*16))
38 |              | (mask3 << (3*16));
39 |     }
40 | 
41 |     __m128i from_epu16(const uint16_t x, uint8_t one = 0xff) {
42 | 
43 |         uint8_t tmp[16];
44 |         memset(tmp, 0, sizeof(tmp));
45 | 
46 |         int idx = 0;
47 |         uint16_t val = x;
48 |         while (val) {
49 |             if (val & 0x0001) {
50 |                 tmp[idx] = one;
51 |             }
52 | 
53 |             val >>= 1;
54 |             idx += 1;
55 |         }
56 | 
57 |         return _mm_loadu_si128((const __m128i*)tmp);
58 |     }
59 | 
60 | }
61 | 


--------------------------------------------------------------------------------
/experiments/microbenchmarks/loader.py:
--------------------------------------------------------------------------------
 1 | import prettyprint
 2 | 
 3 | class Item(object):
 4 |     __slots__ = (
 5 |         "size",
 6 |         "iterations",
 7 |         "distribution_name",
 8 |         "num_distribution",
 9 |         "sep_distribution",
10 |         "sign_distribution",
11 |         "cycles",
12 |     )
13 | 
14 |     def get_num_distribution_title(self):
15 |         return '%s (%s)' % (prettyprint.get_distribution_title(self.distribution_name), \
16 |                prettyprint.get_num_distribution_parameters(self.distribution_name, self.num_distribution).title)
17 | 
18 |     def get_num_distribution_weight(self):
19 |         return prettyprint.get_num_distribution_parameters(self.distribution_name, self.num_distribution).weight
20 | 
21 |     def get_sep_distribution_title(self):
22 |         return prettyprint.get_separator_title(self.sep_distribution)
23 | 
24 | 
# Names of the known procedures.
procedures = ('scalar', 'scalar (std)', 'scalar (hybrid)', 'SSE', 'SSE (block)', 'SSE (simplified)')

# Procedure used as the reference; it has to be one of `procedures`.
reference_procedure = 'scalar'
assert reference_procedure in procedures
37 | 
def load(path):
    """Lazily yield every record that ``load_file`` parses from the text file at ``path``."""
    source = open(path, 'rt')
    try:
        for record in load_file(source):
            yield record
    finally:
        # Mirrors a ``with`` block: the file is closed when the generator
        # finishes, fails, or is closed early.
        source.close()
42 | 
43 | 
def load_file(file):
    """Parse semicolon-separated benchmark records and yield one ``Item`` per line.

    Fields, in order: size; iterations; distribution name; number
    distribution; separator distribution; sign distribution; cycles.
    The last four are Python literals.

    Blank lines are skipped; they used to fail with ValueError on ``int('')``.

    file -- any iterable of text lines (an open file, a list of strings, ...)
    """

    def get_tuple(string):
        # NOTE(review): eval() executes arbitrary expressions. That is only
        # acceptable while the results file comes from the project's own
        # benchmark scripts; never feed this loader an untrusted file.
        tmp = eval(string)
        if type(tmp) is int:
            # A bare integer is replaced by the one-element tuple (1,).
            return (1,)
        else:
            assert type(tmp) is tuple
            return tmp

    for line in file:

        if not line.strip():
            continue

        F = line.split(';')

        item = Item()
        item.size = int(F[0])
        item.iterations = int(F[1])
        item.distribution_name = F[2]
        item.num_distribution  = get_tuple(F[3])
        item.sep_distribution  = get_tuple(F[4])
        item.sign_distribution = get_tuple(F[5])
        # NOTE(review): same eval() caveat as in get_tuple.
        item.cycles = eval(F[6])

        yield item
69 | 
70 | 


--------------------------------------------------------------------------------
/experiments/spanmaskhistogram/microbenchmark_loader.py:
--------------------------------------------------------------------------------
 1 | import prettyprint
 2 | 
 3 | class Item(object):
 4 |     __slots__ = (
 5 |         "size",
 6 |         "iterations",
 7 |         "distribution_name",
 8 |         "num_distribution",
 9 |         "sep_distribution",
10 |         "sign_distribution",
11 |         "cycles",
12 |     )
13 | 
14 |     def get_num_distribution_title(self):
15 |         return '%s (%s)' % (prettyprint.get_distribution_title(self.distribution_name), \
16 |                prettyprint.get_num_distribution_parameters(self.distribution_name, self.num_distribution).title)
17 | 
18 |     def get_num_distribution_weight(self):
19 |         return prettyprint.get_num_distribution_parameters(self.distribution_name, self.num_distribution).weight
20 | 
21 |     def get_sep_distribution_title(self):
22 |         return prettyprint.get_separator_title(self.sep_distribution)
23 | 
24 | 
# Procedure that must be present in `procedures` (checked by the assert below).
reference_procedure = 'scalar'

# Known procedure names, in a fixed order.
procedures = (
    reference_procedure,
    'scalar (std)', 'scalar (hybrid)',
    'SSE', 'SSE (block)', 'SSE (simplified)',
)

assert reference_procedure in procedures
37 | 
def load(path):
    """Lazily yield every record that ``load_file`` parses from the text file at ``path``."""
    source = open(path, 'rt')
    try:
        for record in load_file(source):
            yield record
    finally:
        # Mirrors a ``with`` block: the file is closed when the generator
        # finishes, fails, or is closed early.
        source.close()
42 | 
43 | 
def load_file(file):
    """Parse semicolon-separated benchmark records and yield one ``Item`` per line.

    Fields, in order: size; iterations; distribution name; number
    distribution; separator distribution; sign distribution; cycles.
    The last four are Python literals.

    Blank lines are skipped; they used to fail with ValueError on ``int('')``.

    file -- any iterable of text lines (an open file, a list of strings, ...)
    """

    def get_tuple(string):
        # NOTE(review): eval() executes arbitrary expressions. That is only
        # acceptable while the results file comes from the project's own
        # benchmark scripts; never feed this loader an untrusted file.
        tmp = eval(string)
        if type(tmp) is int:
            # A bare integer is replaced by the one-element tuple (1,).
            return (1,)
        else:
            assert type(tmp) is tuple
            return tmp

    for line in file:

        if not line.strip():
            continue

        F = line.split(';')

        item = Item()
        item.size = int(F[0])
        item.iterations = int(F[1])
        item.distribution_name = F[2]
        item.num_distribution  = get_tuple(F[3])
        item.sep_distribution  = get_tuple(F[4])
        item.sign_distribution = get_tuple(F[5])
        # NOTE(review): same eval() caveat as in get_tuple.
        item.cycles = eval(F[6])

        yield item
69 | 
70 | 


--------------------------------------------------------------------------------
/scripts/hybrid-unsigned.py:
--------------------------------------------------------------------------------
 1 | from hybrid import GeneratorBase
 2 | 
 3 | class GenerateUnsingedParser(GeneratorBase):
 4 | 
 5 |     def before(self):
 6 |         self.lines.append('case 0x%02x:' % self.number)
 7 | 
 8 |     def after(self):
 9 |         self.lines.append('break;')
10 | 
11 |     def empty(self):
12 |         pass
13 | 
14 |     def full(self):
15 |         l = self.lines
16 | 
17 |         l.append("if (has_last) {")
18 |         l.append("   val = %s;" % self.expression(self.span, "val"))
19 |         l.append("} else {")
20 |         l.append("   val = %s;" % self.expression(self.span))
21 |         l.append("}")
22 |         l.append("has_last = true;")
23 | 
24 |     def finalize_previous(self):
25 |         l = self.lines
26 | 
27 |         l.append("if (has_last) {")
28 |         l.append("   has_last = false;")
29 |         l.append("    *output++ = val;")
30 |         l.append("}")
31 | 
32 |     def first_continuation(self):
33 |         l = self.lines
34 | 
35 |         l.append("if (has_last) {")
36 |         l.append("   val = %s;" % self.expression(self.span, "val"))
37 |         l.append("   has_last = false;")
38 |         l.append("} else {")
39 |         l.append("   val = %s;" % self.expression(self.span))
40 |         l.append("}")
41 |         l.append("*output++ = val;")
42 | 
43 |     def whole(self):
44 |         self.lines.append("*output++ = %s;" % self.expression(self.span))
45 | 
46 |     def last(self):
47 |         l = self.lines
48 | 
49 |         l.append("val = %s;" % self.expression(self.span))
50 |         l.append("has_last = true;")
51 | 
52 |     def expression(self, span, arg = None):
53 |         result = "scalar::convert<%d>(data" % span.digits()
54 |         if span.first != 0:
55 |             result += ' + %d' % span.first;
56 | 
57 |         if arg is not None:
58 |             result += ', %s' % arg
59 | 
60 |         result += ')'
61 |         return result
62 | 
63 | 
if __name__ == '__main__':
    # Print the generated lines, one per output row.
    for line in GenerateUnsingedParser().get():
        print(line)
68 | 
69 | 


--------------------------------------------------------------------------------
/test/statistics.cpp:
--------------------------------------------------------------------------------
 1 | #include <vector>
 2 | #include <iterator>
 3 | #include <cstdio>
 4 | #include <cstdlib>
 5 | 
 6 | #include "input_generator.h"
 7 | #include "sse/sse-matcher.h"
 8 | #include "sse/sse-parser-unsigned.h"
 9 | #include "sse/sse-parser-signed.h"
10 | 
11 | #include "application.h"
12 | 
13 | class StatisticsApp: public Application {
14 | 
15 |     using Vector = std::vector<uint32_t>;
16 | 
17 | public:
18 |     StatisticsApp(int argc, char** argv) : Application(argc, argv) {}
19 |     
20 | private:
21 |     virtual bool custom_run() override;
22 | 
23 | private:
24 |     void run_unsigned();
25 |     void run_signed();
26 | };
27 | 
28 | bool StatisticsApp::custom_run() {
29 |     if (has_signed_distribution()) {
30 |         run_signed();
31 |     } else {
32 |         run_unsigned();
33 |     }
34 | 
35 |     return true;
36 | }
37 | 
38 | void StatisticsApp::run_unsigned() {
39 | 
40 |     const auto tmp = generate_unsigned();
41 | 
42 |     const char* separators = ",; ";
43 |     sse::NaiveMatcher<8> matcher(separators);
44 |     std::vector<uint32_t> result;
45 |     sse::parser(tmp.data(), tmp.size(), separators, std::move(matcher), std::back_inserter(result));
46 |     sse::stats.print();
47 | }
48 | 
49 | void StatisticsApp::run_signed() {
50 | 
51 |     const auto tmp = generate_signed();
52 | 
53 |     const char* separators = ",; ";
54 |     sse::NaiveMatcher<8> matcher(separators);
55 |     std::vector<int32_t> result;
56 |     sse::parser_signed(tmp.data(), tmp.size(), separators, std::move(matcher), std::back_inserter(result));
57 |     sse::stats.print();
58 | }
59 | 
60 | int main(int argc, char* argv[]) {
61 | 
62 |     try {
63 |         StatisticsApp app(argc, argv);
64 | 
65 |         app.run();
66 | #ifndef USE_STATISTICS
67 |         puts("Program was not compiled with USE_STATISTICS");
68 | #endif
69 |         return EXIT_SUCCESS;
70 | 
71 |     } catch (std::exception& e) {
72 |         printf("%s\n", e.what());
73 |         return EXIT_FAILURE;
74 |     } catch (Application::Exit&) {
75 |         return EXIT_SUCCESS;
76 |     }
77 | }
78 | 
79 | 


--------------------------------------------------------------------------------
/test/utils/command_line.cpp:
--------------------------------------------------------------------------------
 1 | #include "command_line.h"
 2 | 
 3 | #include <algorithm>
 4 | #include <stdexcept>
 5 | #include <cstring>
 6 | 
 7 | CommandLine::CommandLine(int argc, char** argv) {
 8 |     for (int i=0; i < argc; i++) {
 9 |         args.emplace_back(argv[i]);
10 |     }
11 | }
12 | 
13 | bool CommandLine::has_flag(const std::string& flag) const {
14 |     return std::find(args.begin(), args.end(), flag) != args.end();
15 | }
16 | 
namespace {

    // Returns true when s2 is a prefix of s1 (an empty s2 is a prefix of
    // any string; equal strings are prefixes of each other).
    //
    // Both strings are taken by const reference; the earlier by-value
    // signature copied them on every call.
    bool is_prefix(const std::string& s1, const std::string& s2) {
        if (s2.size() > s1.size()) {
            return false;
        }

        // Compare only the first s2.size() characters of s1.
        return s1.compare(0, s2.size(), s2) == 0;
    }
}
32 | 
33 | std::string CommandLine::get_value(const std::string& option) const {
34 | 
35 |     for (size_t i=0; i < args.size(); i++) {
36 |         if (args[i] == option) {
37 |             try {
38 |                 return args.at(i + 1);
39 |             } catch (std::out_of_range&) {
40 |                 throw OptionNotFound("Argument " + args[i] + " should be followed by a value");
41 |             }
42 |         }
43 |     }
44 | 
45 |     const auto long_option = option + "=";
46 |     for (const auto& arg: args) {
47 |         if (is_prefix(arg, long_option)) {
48 |             return arg.substr(long_option.size());
49 |         }
50 |     }
51 | 
52 |     throw OptionNotFound("Argument " + option + " not found");
53 | }
54 | 
55 | std::string CommandLine::get_value(const std::string& option, const std::string& defvalue) const {
56 |     try {
57 |         return get_value(option);
58 |     } catch (std::logic_error&) {
59 |         return defvalue;
60 |     }
61 | }
62 | 
63 | bool CommandLine::has_value(const std::string& option) const {
64 |     try {
65 |         get_value(option);
66 |         return true;
67 |     } catch (std::logic_error&) {
68 |         return false;
69 |     }
70 | }
71 | 
72 | const std::string& CommandLine::get_program_name() const {
73 |     return args[0];
74 | }
75 | 
76 | 


--------------------------------------------------------------------------------
/experiments/microbenchmarks/runner.py:
--------------------------------------------------------------------------------
 1 | import subprocess
 2 | 
 3 | EXECUTABLE = './bin/benchmark-cpuclocks'
 4 | 
 5 | class Runner(object):
 6 |     def __init__(self, size, loops, numbers_distribution, separators_distribution, sign_distribution):
 7 | 
 8 |         assert len(numbers_distribution) > 0
 9 |         assert len(separators_distribution) > 0
10 |         assert len(sign_distribution) > 0
11 | 
12 |         self.size      = size
13 |         self.loops     = loops
14 |         self.numbers_distribution    = numbers_distribution
15 |         self.separators_distribution = separators_distribution
16 |         self.sign_distribution       = sign_distribution
17 | 
18 |     def run(self):
19 |         args = self.__prepare_arguments()
20 |         proc = subprocess.Popen(args, stdout=subprocess.PIPE)
21 |         res  = proc.communicate()[0]
22 |         ret  = proc.wait()
23 |         if ret != 0:
24 |             print(args)
25 |             print(res)
26 |             raise RuntimeError("program failed")
27 | 
28 |         return self.__parse_output(res)
29 | 
30 | 
31 |     def __prepare_arguments(self):
32 |         return (
33 |             EXECUTABLE,
34 |             '--size=%d'         % self.size,
35 |             '--loops=%d'        % self.loops,
36 |             '--num=%s'          % format_distribution(self.numbers_distribution),
37 |             '--sep=%s'          % format_distribution(self.separators_distribution),
38 |             '--sign=%s'         % format_distribution(self.sign_distribution),
39 |         )
40 | 
41 | 
42 |     def __parse_output(self, output):
43 |         d = {}
44 | 
45 |         for line in output.splitlines():
46 |             if 'cycle/op' not in line:
47 |                 continue
48 | 
49 |             # line = "scalar  :    14.503 cycle/op (best)   15.494 cycle/op (avg)"
50 |             name, tmp = line.split(':')
51 |             name = name.strip()
52 |             tmp  = tmp.split()
53 |             best = float(tmp[0])
54 |             avg  = float(tmp[3])
55 | 
56 |             d[name] = (best, avg)
57 | 
58 |         return d
59 | 
60 | 
def format_distribution(dist):
    """Render the values of ``dist`` as a comma-separated string, e.g. (1, 2) -> '1,2'."""
    return ','.join(str(value) for value in dist)
63 | 


--------------------------------------------------------------------------------
/include/test/linux-perf-events.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include <unistd.h>             // for syscall
 4 | #include <sys/ioctl.h>          // for ioctl
 5 | #include <asm/unistd.h>         // for __NR_perf_event_open
 6 | #include <linux/perf_event.h>   // for perf event constants
 7 | 
 8 | #include <cerrno>               // for errno
 9 | #include <cstring>              // for memset
10 | #include <stdexcept>
11 | 
12 | 
// Thin wrapper around one Linux perf event counter.
//
// The constructor opens a disabled counter of type TYPE with the given
// config for the calling process (pid 0) on any CPU (cpu -1), with kernel and
// hypervisor excluded. start() resets and enables it; end() disables it and
// returns the counter value. Every failing system call is reported as
// std::runtime_error carrying the strerror() text.
//
// NOTE(review): the class owns `fd` yet is implicitly copyable; a copy would
// close the same descriptor twice. Do not copy instances.
template <int TYPE = PERF_TYPE_HARDWARE>
class LinuxEvents {

    int fd;
    perf_event_attr attribs;

public:
    LinuxEvents(int config) : fd(0) {
        memset(&attribs, 0, sizeof(attribs));
        attribs.type        = TYPE;
        attribs.size        = sizeof(attribs);
        attribs.config      = config;
        attribs.disabled        = 1;
        attribs.exclude_kernel  = 1;
        attribs.exclude_hv      = 1;

        int pid = 0;    // the current process
        int cpu = -1;   // all CPUs
        int group = -1; // no group
        unsigned long flags = 0;
        fd = syscall(__NR_perf_event_open, &attribs, pid, cpu, group, flags);
        if (fd == -1) {
            report_error("perf_event_open");
        }
    }

    ~LinuxEvents() {
        close(fd);
    }

    // Resets the counter to zero and enables it.
    void start() {
        if (ioctl(fd, PERF_EVENT_IOC_RESET, 0) == -1) {
            report_error("ioctl(PERF_EVENT_IOC_RESET)");
        }

        if (ioctl(fd, PERF_EVENT_IOC_ENABLE, 0) == -1) {
            report_error("ioctl(PERF_EVENT_IOC_ENABLE)");
        }
    }

    // Disables the counter and returns its value.
    unsigned long end() {
        if (ioctl(fd, PERF_EVENT_IOC_DISABLE, 0) == -1) {
            report_error("ioctl(PERF_EVENT_IOC_DISABLE)");
        }

        // With read_format == 0 (attribs is zero-filled) the kernel returns a
        // single 64-bit value. Reading into `unsigned long` supplies only
        // 4 bytes on ILP32 targets, so use a type that is at least 64 bits.
        unsigned long long value = 0;
        const ssize_t got = read(fd, &value, sizeof(value));
        if (got == -1) {
            report_error("read");
        }

        // A short read would leave `value` incomplete; errno is not set in
        // that case, so report it separately.
        if (got != static_cast<ssize_t>(sizeof(value))) {
            throw std::runtime_error("read: unexpected size of perf counter data");
        }

        // The return type is kept for existing callers.
        return static_cast<unsigned long>(value);
    }

private:
    void report_error(const std::string& context) {
        // Save errno first: building the message may allocate memory.
        const int error = errno;
        throw std::runtime_error(context + ": " + std::string(strerror(error)));
    }

};
72 | 
73 | 


--------------------------------------------------------------------------------
/experiments/overalltests/runner.py:
--------------------------------------------------------------------------------
 1 | import subprocess
 2 | 
 3 | EXECUTABLE = './bin/benchmark'
 4 | 
 5 | 
 6 | class Runner(object):
 7 |     def __init__(self, procedure, size, loops, numbers_distribution, separators_distribution, sign_distribution):
 8 | 
 9 |         assert len(numbers_distribution) > 0
10 |         assert len(separators_distribution) > 0
11 |         assert len(sign_distribution) > 0
12 | 
13 |         self.procedure = procedure
14 |         self.size      = size
15 |         self.loops     = loops
16 |         self.numbers_distribution    = numbers_distribution
17 |         self.separators_distribution = separators_distribution
18 |         self.sign_distribution       = sign_distribution
19 | 
20 |     def run(self):
21 |         args = self.__prepare_arguments()
22 |         proc = subprocess.Popen(args, stdout=subprocess.PIPE)
23 |         res  = proc.communicate()[0]
24 |         ret  = proc.wait()
25 |         if ret != 0:
26 |             print(args)
27 |             print res
28 |             raise RuntimeError("program failed")
29 | 
30 |         d = self.__parse_output(res)
31 |         return d['time']
32 | 
33 | 
34 |     def __prepare_arguments(self):
35 |         return (
36 |             EXECUTABLE,
37 |             '--procedure=%s'    % self.procedure,
38 |             '--size=%d'         % self.size,
39 |             '--loops=%d'        % self.loops,
40 |             '--num=%s'          % format_distribution(self.numbers_distribution),
41 |             '--sep=%s'          % format_distribution(self.separators_distribution),
42 |             '--sign=%s'         % format_distribution(self.sign_distribution),
43 |         )
44 | 
45 | 
46 |     def __parse_output(self, output):
47 |         d = {}
48 | 
49 |         for line in output.splitlines():
50 |             tmp = line.split(':')
51 |             try:
52 |                 key   = tmp[0].strip()
53 |                 value = tmp[1].strip()
54 | 
55 |                 d[key] = value
56 |             except IndexError:
57 |                 continue
58 | 
59 |             try:
60 |                 value = int(value.split()[0])
61 |                 d[key] = value
62 |             except ValueError:
63 |                 pass
64 | 
65 |         return d
66 | 
67 | 
def format_distribution(dist):
    """Render the values of ``dist`` as a comma-separated string, e.g. (1, 2) -> '1,2'."""
    return ','.join(str(value) for value in dist)
70 | 


--------------------------------------------------------------------------------
/experiments/microbenchmarks/report.py:
--------------------------------------------------------------------------------
 1 | import sys
 2 | import os.path
 3 | 
 4 | if __name__ == '__main__' and __package__ is None:
 5 |     sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
 6 | 
 7 | from table import Table
 8 | from loader import load_file as load, procedures, reference_procedure
 9 | from utils import groupby
10 | from writer import RestWriter
11 | from prettyprint import *
12 | 
13 | 
class Report(object):
    """Builds a report from a microbenchmark results file.

    ``self.report`` is a list of ``(section title, [(title, table), ...])``
    pairs, one per input size in ascending order.
    """

    def __init__(self, path):
        with open(path, 'rt') as f:
            items = list(load(f))

        by_size = groupby(items, lambda item: item.size)

        self.report = []
        for size in sorted(by_size):
            group = by_size[size]
            group.sort(key=lambda item: (item.distribution_name,
                                         item.num_distribution,
                                         item.sep_distribution))

            sections = [(self.get_title(item), self.prepare_table(item)) for item in group]
            self.report.append(('Input size %d bytes' % size, sections))

    def get(self):
        """Return the report built by the constructor."""
        return self.report


    def get_title(self, item):
        """Return '<number distribution title> --- <separator distribution title>'."""
        numbers    = item.get_num_distribution_title()
        separators = item.get_sep_distribution_title()
        return '%s --- %s' % (numbers, separators)


    def prepare_table(self, item):
        """Return a Table with one row per known procedure measured in ``item.cycles``.

        Each row holds the name, the min and avg values, and the speed-up of
        the min value relative to the reference procedure.
        """
        table = Table()
        table.add_header(["", ("cycles per one input byte", 2), "speed-up"])
        table.add_header(["procedure", "min", "avg", "(min)"])

        reference_best = item.cycles[reference_procedure][0]

        for name in procedures:
            if name not in item.cycles:
                # procedure not measured for this item
                continue

            best, average = item.cycles[name]
            speedup = reference_best / best

            table.add_row([
                name,
                '%5.3f' % best,
                '%5.3f' % average,
                '%0.2f' % speedup,
            ])

        return table
68 | 
69 | 
def main():
    """Write the report for the results file ``sys.argv[1]`` to stdout.

    The optional ``sys.argv[2]`` is passed to the writer; it defaults to '-~'.
    """
    writer = RestWriter(sys.stdout, Report(sys.argv[1]).get())

    restsection = sys.argv[2] if len(sys.argv) > 2 else '-~'
    writer.write(restsection)


if __name__ == '__main__':
    main()
83 | 


--------------------------------------------------------------------------------
/experiments/README.rst:
--------------------------------------------------------------------------------
 1 | Scripts purpose
 2 | --------------------------------------------------------------------------------
 3 | 
 4 | ``microbenchmarks/*.py`` --- run all implementations (scalar and vectorized)
 5 | on rather small inputs and count how many **CPU cycles** are needed to complete
 6 | conversion. Subdirectory ``results`` contains produced files from some computers.
 7 | 
 8 | It uses ``bin/benchmark-cpuclocks`` utility; makefile targets
 9 | ``microbenchmarks.txt`` and ``microbenchmarks.rst``.
10 | 
11 | --------------------------------------------------------------------------------
12 | 
13 | ``speedup-comparison/report.py`` --- from ``microbenchmarks.txt`` produces a
14 | summary speedup array (min/avg/max) for all methods and input
15 | size/distribution. Usage::
16 | 
17 |     $ python speedup-comparison/report.py microbenchmarks/results/file.txt > file.rst
18 | 
19 | The result of this script is shown in article__.
20 | 
21 | __ http://0x80.pl/articles/simd-parsing-int-sequences.html#core-i7-results
22 | 
23 | --------------------------------------------------------------------------------
24 | 
25 | ``overalltests/*.py`` --- run scalar and SSE implementations on small and huge
26 | input, measure **wall clock** of algorithms.  Subdirectory ``results`` contains
27 | produced files from some computers.
28 | 
29 | It uses ``bin/benchmark`` utility; makefile targets ``overall.txt``,
30 | ``report-overall.rst`` (all numbers are shown), ``report-overall-short.rst``
31 | (just min/avg/max speedup is shown).
32 | 
33 | --------------------------------------------------------------------------------
34 | 
35 | ``hwevents/*.py`` --- runs SSE searches  and counts hardware events: branch
36 | taken & misses and cache references & misses.
37 | 
38 | It uses ``bin/benchmark-hwevents`` utility; makefile target: ``hwevents.txt``.
39 | 
40 | --------------------------------------------------------------------------------
41 | 
42 | ``spanmaskhistogram/*.py`` --- for different input sizes and input data
43 | distributions it gets the following parameters of the **SSE procedure**:
44 | 
45 | * ``span_pattern`` statistics usage  (runs ``bin/statistics``);
46 | * running time in CPU clocks (from ``measurements.txt``);
47 | * branch and cache event counts (from ``hwevents.txt``).
48 | 
49 | All this data is collated in a single table; makefile target
50 | ``spanmaskhistogram.rst``.
51 | 
52 | The result of this script is shown in article__.
53 | 
54 | __ http://0x80.pl/articles/simd-parsing-int-sequences.html#sse-conversion-runtime-analysis
55 | 
56 | 


--------------------------------------------------------------------------------
/include/sse/sse-parser-statistics.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include <cstdio>
 4 | #include <cstdint>
 5 | #include <map>
 6 | 
 7 | #ifdef USE_STATISTICS
 8 | #define STATS_ADD(__field__, val) sse::stats.__field__ += (val)
 9 | #define STATS_INC(__field__) STATS_ADD(__field__, 1)
10 | #define STATS_SPAN_MASK(mask) sse::stats.span_masks_histogram[mask] += 1
11 | #else
12 | #define STATS_ADD(__field__, val)
13 | #define STATS_INC(__field__)
14 | #define STATS_SPAN_MASK(mask)
15 | #endif
16 | 
namespace sse {

    // Conversion counters for a single parsing path.
    struct SSEStatistics {
        size_t scalar_conversions = 0;

        size_t digit1_calls = 0;
        size_t digit1_converted = 0;
        size_t digit2_calls = 0;
        size_t digit2_converted = 0;
        size_t digit3_calls = 0;
        size_t digit3_converted = 0;
        size_t digit4_calls = 0;
        size_t digit4_converted = 0;
        size_t digit8_calls = 0;
        size_t digit8_converted = 0;

        // Sum of the five digitN_converted counters.
        size_t get_SSE_converted() const {
            size_t total = digit1_converted;
            total += digit2_converted;
            total += digit3_converted;
            total += digit4_converted;
            total += digit8_converted;
            return total;
        }

        // Scalar conversions plus get_SSE_converted().
        size_t get_all_converted() const {
            return get_SSE_converted() + scalar_conversions;
        }
    };

    // Histograms plus separate counters for the unsigned and signed paths.
    struct Statistics {

        std::map<int, size_t> total_skip_histogram;
        std::map<uint16_t, size_t> span_masks_histogram;

        size_t loops = 0;

        SSEStatistics unsigned_path;
        SSEStatistics signed_path;

        Statistics();

        // The three getters below add up the values of both paths.
        size_t get_all_converted() const {
            const size_t from_unsigned = unsigned_path.get_all_converted();
            const size_t from_signed   = signed_path.get_all_converted();
            return from_unsigned + from_signed;
        }

        size_t get_scalar_conversions() const {
            const size_t from_unsigned = unsigned_path.scalar_conversions;
            const size_t from_signed   = signed_path.scalar_conversions;
            return from_unsigned + from_signed;
        }

        size_t get_SSE_converted() const {
            const size_t from_unsigned = unsigned_path.get_SSE_converted();
            const size_t from_signed   = signed_path.get_SSE_converted();
            return from_unsigned + from_signed;
        }

        void print(FILE* file) const;
        void span_mask_histogram_to_csv(FILE* file) const;

        // Convenience overload: prints to stdout.
        void print() const {
            print(stdout);
        }

        void init();
    };

    extern Statistics stats;

} // namespace sse
87 | 
88 | 


--------------------------------------------------------------------------------
/test/utils/input_generator.cpp:
--------------------------------------------------------------------------------
 1 | #include "input_generator.h"
 2 | 
 3 | #include <cassert>
 4 | 
 5 | static const std::string numbers = "0123456789";
 6 | 
 7 | static
 8 | std::string random_string(size_t n, const std::string& set) {
 9 |     
10 |     std::string result(n, ' ');
11 |     for (size_t i=0; i < n; i++) {
12 |         result[i] = set[rand() % set.size()];
13 |     }
14 | 
15 |     return result;
16 | }
17 | 
18 | std::string generate_unsigned(size_t size,
19 |                               const std::string& separators_set,
20 |                               std::mt19937 random,
21 |                               std::discrete_distribution<> num,
22 |                               std::discrete_distribution<> sep) {
23 | 
24 |     std::string result;
25 | 
26 |     while (true) {
27 |         const size_t n = num(random) + 1;
28 |         const size_t k = sep(random) + 1;
29 | 
30 |         const std::string number = random_string(n, numbers);
31 |         const std::string sep    = random_string(k, separators_set);
32 | 
33 |         if (result.size() + n + k < size) {
34 |             result += number;
35 |             result += sep;
36 |         } else {
37 |             result += random_string(size - result.size(), separators_set);
38 |             return result;
39 |         }
40 |     }
41 | }
42 | 
43 | std::string generate_signed(size_t size,
44 |                             const std::string& separators_set,
45 |                             std::mt19937 random,
46 |                             std::discrete_distribution<> num,
47 |                             std::discrete_distribution<> sep,
48 |                             std::discrete_distribution<> sign) {
49 | 
50 |     std::string result;
51 | 
52 |     while (true) {
53 |         const size_t n = num(random) + 1;
54 |         const size_t k = sep(random) + 1;
55 |         const size_t s = sign(random) % 3;
56 | 
57 |         const std::string number = random_string(n, numbers);
58 |         const std::string sep    = random_string(k, separators_set);
59 | 
60 |         if (result.size() + n + k + s < size) {
61 |             switch (s) {
62 |                 case 0:
63 |                     break;
64 | 
65 |                 case 1:
66 |                     result += '-';
67 |                     break;
68 | 
69 |                 case 2:
70 |                     result += '+';
71 |                     break;
72 |             }
73 |             result += number;
74 |             result += sep;
75 |         } else {
76 |             result += random_string(size - result.size(), separators_set);
77 |             return result;
78 |         }
79 |     }
80 | }
81 | 


--------------------------------------------------------------------------------
/scripts/writer.py:
--------------------------------------------------------------------------------
 1 | ITEM_PATTERN = \
 2 |     "/* %(ID)04x %(SCALAR_COST)5.2f %(SSE_COST)5.2f */ {" \
 3 |     "%(FIRST_SKIP)s," \
 4 |     "%(TOTAL_SKIP)s," \
 5 |     "%(ELEMENT_COUNT)s," \
 6 |     "%(CONVERSION)s," \
 7 |     "0x%(INVALID_SIGN_MASK)04x," \
 8 |     "{%(SHUFFLE_DIGITS)s}," \
 9 |     "{%(SHUFFLE_SIGNS)s}" \
10 |     "}"
11 | 
12 | FILE_PATTERN = """
13 | #pragma once
14 | 
15 | #include "block_info.h"
16 | 
17 | BlockInfo blocks[%(COUNT)d] = {
18 | %(ITEMS)s
19 | };
20 | """
21 | 
22 | from cost import scalar_cost, SSE_cost
23 | 
class CPPWriter(object):
    """Renders block descriptions into a C++ header defining ``BlockInfo blocks[]``.

    data -- iterable of block objects; each must provide the attributes read
            in ``_render_item`` and ``get_conversion_enum``.
    """

    # element_size values that map to exactly one enum name; size 4 is
    # handled separately because it depends on the spans.
    _FIXED_CONVERSIONS = {
        0:  'Conversion::Empty',
        1:  'Conversion::SSE1Digit',
        2:  'Conversion::SSE2Digits',
        8:  'Conversion::SSE8Digits',
        16: 'Conversion::Scalar',
    }

    def __init__(self, data):
        self.data = data

    def save(self, path):
        """Write the rendered header to ``path`` (the file is overwritten)."""
        tmp = [self._render_item(item) for item in self.data]
        params = {
            'COUNT': len(tmp),
            'ITEMS': ',\n'.join(tmp),
        }

        text = FILE_PATTERN % params

        with open(path, 'wt') as f:
            f.write(text)


    def _render_item(self, block):
        """Return the C++ initializer text for a single block."""
        params = {
            'ID'                : block.id,
            'FIRST_SKIP'        : block.first_skip,
            'TOTAL_SKIP'        : block.total_skip,
            'ELEMENT_COUNT'     : len(block.spans),
            'CONVERSION'        : self.get_conversion_enum(block),
            'INVALID_SIGN_MASK' : block.get_invalid_sign_mask(),
            'SHUFFLE_DIGITS'    : self._make_c_array(block.shuffle_digits),
            'SHUFFLE_SIGNS'     : self._make_c_array(block.shuffle_signs),
            'SCALAR_COST'       : scalar_cost(block).value(),
            'SSE_COST'          : SSE_cost(block).value(),
        }

        return ITEM_PATTERN % params


    def _make_c_array(self, numbers):
        """Format numbers as comma-separated two-digit hex literals."""
        return ','.join('0x%02x' % x for x in numbers)

    def get_conversion_enum(self, block):
        """Return the ``Conversion::...`` enum name for ``block.element_size``.

        Size 4 yields SSE3Digits when every span has exactly 3 digits and
        SSE4Digits otherwise.

        Raises AssertionError for an unsupported element size. It is raised
        explicitly so the check survives ``python -O``, where the former
        ``assert False`` was stripped and the method returned None.
        """
        size = block.element_size

        if size == 4:
            # A leftover debug ``print(block)`` used to sit in this branch
            # and wrote to stdout during generation; it has been removed.
            if all(r.digits() == 3 for r in block.spans):
                return 'Conversion::SSE3Digits'
            else:
                return 'Conversion::SSE4Digits'

        name = self._FIXED_CONVERSIONS.get(size)
        if name is None:
            raise AssertionError("unsupported element size: %r" % (size,))

        return name
86 | 


--------------------------------------------------------------------------------
/test/compare-avx512.cpp:
--------------------------------------------------------------------------------
  1 | #include <vector>
  2 | #include <iterator>
  3 | #include <cstdio>
  4 | #include <cstdlib>
  5 | 
  6 | #include "input_generator.h"
  7 | #include "avx512/avx512-parser-signed.h"
  8 | 
  9 | #include "application.h"
 10 | 
 11 | // Compares the output of the AVX512 signed parser with the scalar parser.
 12 | class CompareApp: public Application {
 13 |     using Vector = std::vector<int32_t>; // parsed numbers
 14 | 
 15 | public:
 16 |     CompareApp(int argc, char** argv) : Application(argc, argv) {}
 17 | 
 18 |     // Runs a single comparison; returns true when both outputs match.
 19 |     bool custom_run() override;
 20 | 
 21 | private:
 22 |     void dump(const Vector& vec) const;
 23 |     bool compare(const Vector& expected, const Vector& result) const;
 24 | };
 25 | 
 26 | 
 27 | // Runs the scalar and the AVX512 parser on one generated input and compares them.
 28 | bool CompareApp::custom_run() {
 29 |     const char* separators = ";, ";
 30 |     const auto input = generate_signed();
 31 | 
 32 |     Vector expected;
 33 |     scalar::parse_signed(input.data(), input.size(), separators, std::back_inserter(expected));
 34 | 
 35 |     Vector actual;
 36 |     avx512::parser_signed(input.data(), input.size(), separators, std::back_inserter(actual));
 37 | 
 38 |     if (compare(expected, actual)) {
 39 |         puts("All OK");
 40 |         return true;
 41 |     }
 42 | 
 43 |     // mismatch: print the input followed by both outputs
 44 |     puts(input.c_str());
 45 |     puts("");
 46 |     dump(expected);
 47 |     puts("");
 48 |     dump(actual);
 49 |     return false;
 50 | }
 51 | 
 52 | // Prints the vector as "size = N: [a, b, ...]" followed by a newline.
 53 | void CompareApp::dump(const Vector& vec) const {
 54 |     // %zu is the conversion specifier for size_t (%lu is only right where
 55 |     // size_t happens to be unsigned long)
 56 |     printf("size = %zu: [", vec.size());
 57 | 
 58 |     const char* prefix = "";
 59 |     for (const auto value: vec) {
 60 |         printf("%s%d", prefix, value);
 61 |         prefix = ", ";
 62 |     }
 63 | 
 64 |     printf("]\n");
 65 | }
 66 | 
 67 | // Returns true when both vectors have the same size and equal elements;
 68 | // otherwise prints what differs and returns false.
 69 | bool CompareApp::compare(const Vector& expected, const Vector& result) const {
 70 |     if (expected.size() != result.size()) {
 71 |         puts("different sizes");
 72 |         return false;
 73 |     }
 74 | 
 75 |     for (size_t i=0; i < expected.size(); i++) {
 76 |         const auto e = expected[i];
 77 |         const auto r = result[i];
 78 |         if (e == r) {
 79 |             continue;
 80 |         }
 81 |         // %zu matches size_t; %lu does so only where size_t is unsigned long
 82 |         printf("error at #%zu: expected = %d, result = %d\n", i, e, r);
 83 |         return false;
 84 |     }
 85 |     return true;
 86 | }
 87 | 
 88 | int main(int argc, char* argv[]) { // exit status reflects the result of app.run()
 89 |     int status = EXIT_FAILURE;
 90 |     try {
 91 |         CompareApp app(argc, argv);
 92 |         if (app.run()) {
 93 |             status = EXIT_SUCCESS;
 94 |         }
 95 |     } catch (const std::exception& err) {
 96 |         printf("%s\n", err.what());
 97 |     } catch (const Application::Exit&) {
 98 |         status = EXIT_SUCCESS; // Application::Exit counts as success
 99 |     }
100 |     return status;
101 | }
102 | 
103 | 


--------------------------------------------------------------------------------
/include/sse/sse-simplified-parser-signed.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |     Nate's idea: simply consider all non-digit and non-sign characters as separators.
 3 | */
 4 | #pragma once
 5 | 
 6 | #include <vector>
 7 | #include <cassert>
 8 | 
 9 | #include "scalar/scalar-parse-signed.h"
10 | #include "sse-utils.h"
11 | #include "sse-convert.h"
12 | #include "sse-parser-common.h"
13 | #include "sse-parser-statistics.h"
14 | #include "block_info.h"
15 | 
16 | namespace sse_simplified {
17 | 
18 |     namespace detail {
19 | 
20 |         template <typename INSERTER>
21 |         char* process_chunk(char* data, char* end, const __m128i& input, INSERTER output) {
22 |             // Processes the 16-byte chunk `input`; returns the position to continue from.
23 |             const __m128i ascii_minus = _mm_set1_epi8('-');
24 |             const __m128i ascii_plus  = _mm_set1_epi8('+');
25 | 
26 |             const __m128i bytemask_digit = sse::decimal_digits_mask(input);
27 |             // a "span" byte is a digit or a sign; every other byte acts as a separator
28 |             const __m128i bytemask_plus  = _mm_cmpeq_epi8(input, ascii_plus);
29 |             const __m128i bytemask_minus = _mm_cmpeq_epi8(input, ascii_minus);
30 |             const __m128i bytemask_sign  = _mm_or_si128(bytemask_plus, bytemask_minus);
31 |             const __m128i bytemask_span  = _mm_or_si128(bytemask_digit, bytemask_sign);
32 |             // bit masks: one bit per byte of the chunk
33 |             const uint16_t span_mask = _mm_movemask_epi8(bytemask_span);
34 |             const uint16_t sign_mask = _mm_movemask_epi8(bytemask_sign);
35 |             const BlockInfo& bi      = blocks[span_mask]; // table entry selected by the span pattern
36 |             if (sign_mask & bi.invalid_sign_mask) {
37 |                 throw std::runtime_error("'+' or '-' at invalid position");
38 |             }
39 | 
40 |             if (span_mask == 0) {
41 |                 return data + 16; // no digit or sign in the chunk: skip all 16 bytes
42 |             }
43 | 
44 |             STATS_INC(loops);
45 | 
46 |             if (sign_mask == 0 || bi.conversion_routine == Conversion::SSE1Digit) {
47 |                 // unsigned path
48 |                 return sse::detail::parse_unsigned(bi, input, data, end, output);
49 |             } else {
50 |                 return sse::detail::parse_signed(bi, input, data, end, output);
51 |             }
52 |         }
53 | 
54 |     }
55 | 
56 |     // SSE parser of signed numbers; `separators` are used only by the scalar tail.
57 |     template <typename INSERTER>
58 |     void parse_signed(const char* string, size_t size, const char* separators, INSERTER output) {
59 |         char* data = const_cast<char*>(string);
60 |         char* end  = data + size;
61 | 
62 |         // Compare distances: forming `data + 16` is undefined behaviour when
63 |         // fewer than 16 bytes are left.
64 |         while (end - data > 16) {
65 |             const __m128i input = _mm_loadu_si128(reinterpret_cast<__m128i*>(data));
66 |             data = detail::process_chunk(data, end, input, output);
67 |         }
68 |         // process the tail
69 |         scalar::parse_signed(data, string + size - data, separators, output);
70 |     }
71 | 
72 | } // namespace sse_simplified
73 | 


--------------------------------------------------------------------------------
/experiments/speedup-comparison/report.py:
--------------------------------------------------------------------------------
 1 | import sys
 2 | import os.path
 3 | 
 4 | if __name__ == '__main__' and __package__ is None:
 5 |     sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
 6 | 
 7 | from loader import load_file, procedures as all_procedures, reference_procedure
 8 | from utils import groupby
 9 | from prettyprint import *
10 | from table import Table
11 | 
12 | procedures = [proc for proc in all_procedures if proc != reference_procedure]  # reported procedures: all but the reference one
13 | 
14 | def calculate_speeups(item):
15 |     speedup = {}
16 |     ref = float(item.cycles[reference_procedure][0])
17 |     for proc in procedures:
18 |         val = item.cycles[proc][0]
19 |         speedup[proc] = ref/val
20 | 
21 |     return speedup
22 | 
23 | 
24 | def statistics(array):
25 |     n = len(array)
26 |     assert n > 0
27 |     return (min(array), sum(array)/n, max(array))
28 | 
29 | 
30 | def calculate_speedup_statistics(collection):
31 |     speedups = {}
32 |     for proc in procedures:
33 |         speedups[proc] = []
34 | 
35 |     for item in collection:
36 |         speedup = calculate_speeups(item)
37 |         for proc in procedures:
38 |             speedups[proc].append(speedup[proc])
39 | 
40 |     size = collection[0].size
41 |     name = collection[0].distribution_name
42 |     result = {}
43 |     for proc in procedures:
44 |         array = speedups[proc]
45 |         if len(array) == 0:
46 |             continue
47 | 
48 |         result[proc] = statistics(array)
49 | 
50 |     return (size, name, result)
51 | 
52 | 
53 | def main(path):
54 |     with open(path, 'rt') as f:
55 |         keyfun = lambda item: (item.size, item.distribution_name)
56 |         data = groupby(load_file(f), keyfun)
57 | 
58 |     header1 = [("", 3), ("speedup over %s procedure" % reference_procedure, len(procedures) * 3)]
59 |     header2 = [("", 3)]
60 |     header3 = ["size [B]", "distribution", "samples"]
61 |     for proc in procedures:
62 |         header2.append((proc, 3))
63 |         header3.extend(["min", "avg", "max"])
64 |  
65 |     table = Table()
66 |     table.add_header(header1)
67 |     table.add_header(header2)
68 |     table.add_header(header3)
69 | 
70 |     for key in sorted(data):
71 |         collection = data[key]
72 |         size, name, stats = calculate_speedup_statistics(collection)
73 | 
74 |         row = []
75 |         row.append('%d' % size)
76 |         row.append(get_distribution_title(name))
77 |         row.append('%d' % len(collection))
78 | 
79 |         for proc in procedures:
80 |             row.append('%0.2f' % stats[proc][0])
81 |             row.append('%0.2f' % stats[proc][1])
82 |             row.append('%0.2f' % stats[proc][2])
83 | 
84 |         table.add_row(row)
85 | 
86 |     
87 |     print(table)
88 | 
89 | if __name__ == '__main__':  # run as a script: the first argument is the path handed to main()
90 |     main(sys.argv[1])
91 | 


--------------------------------------------------------------------------------
/test/compare-unsigned.cpp:
--------------------------------------------------------------------------------
  1 | #include <vector>
  2 | #include <iterator>
  3 | #include <cstdio>
  4 | #include <cstdlib>
  5 | 
  6 | #include "input_generator.h"
  7 | #include "scalar/scalar-parse-unsigned.h"
  8 | #include "sse/sse-matcher.h"
  9 | #include "sse/sse-parser-unsigned.h"
 10 | 
 11 | #include "application.h"
 12 | 
 13 | // Compares the output of the SSE unsigned parser with the scalar parser.
 14 | class CompareApp: public Application {
 15 |     using Vector = std::vector<uint32_t>; // parsed numbers
 16 | 
 17 | public:
 18 |     CompareApp(int argc, char** argv) : Application(argc, argv) {}
 19 | 
 20 | private:
 21 |     // Runs a single comparison; returns true when both outputs match.
 22 |     bool custom_run() override;
 23 | 
 24 |     // helpers used by custom_run()
 25 |     void dump(const Vector& vec) const;
 26 |     bool compare(const Vector& expected, const Vector& result) const;
 27 | };
 28 | 
 29 | // Runs the scalar and the SSE parser on one generated input and compares them.
 30 | bool CompareApp::custom_run() {
 31 |     const char* separators = ";, ";
 32 |     const auto input = generate_unsigned();
 33 | 
 34 |     Vector expected;
 35 |     scalar::parse_unsigned(input.data(), input.size(), separators, std::back_inserter(expected));
 36 | 
 37 |     Vector actual;
 38 |     sse::NaiveMatcher<8> matcher(separators);
 39 |     sse::parser(input.data(), input.size(), separators, std::move(matcher), std::back_inserter(actual));
 40 | 
 41 |     if (compare(expected, actual)) {
 42 |         puts("All OK");
 43 |         return true;
 44 |     }
 45 | 
 46 |     // mismatch: print the input followed by both outputs
 47 |     puts(input.c_str());
 48 |     puts("");
 49 |     dump(expected);
 50 |     puts("");
 51 |     dump(actual);
 52 |     return false;
 53 | }
 54 | 
 55 | // Prints the vector as "size = N: [a, b, ...]" followed by a newline.
 56 | void CompareApp::dump(const Vector& vec) const {
 57 |     // %zu is the conversion specifier for size_t (%lu is only right where
 58 |     // size_t happens to be unsigned long)
 59 |     printf("size = %zu: [", vec.size());
 60 | 
 61 |     const char* prefix = "";
 62 |     for (const auto value: vec) {
 63 |         printf("%s%u", prefix, value);
 64 |         prefix = ", ";
 65 |     }
 66 | 
 67 |     printf("]\n");
 68 | }
 69 | 
 70 | // Returns true when both vectors have the same size and equal elements;
 71 | // otherwise prints what differs and returns false.
 72 | bool CompareApp::compare(const Vector& expected, const Vector& result) const {
 73 |     if (expected.size() != result.size()) {
 74 |         puts("different sizes");
 75 |         return false;
 76 |     }
 77 | 
 78 |     for (size_t i=0; i < expected.size(); i++) {
 79 |         const auto e = expected[i];
 80 |         const auto r = result[i];
 81 |         if (e == r) {
 82 |             continue;
 83 |         }
 84 |         // both values are unsigned (%u, not %d); %zu is the specifier for size_t
 85 |         printf("error at #%zu: expected = %u, result = %u\n", i, e, r);
 86 |         return false;
 87 |     }
 88 |     return true;
 89 | }
 90 | 
 91 | int main(int argc, char* argv[]) { // exit status reflects the result of app.run()
 92 |     int status = EXIT_FAILURE;
 93 |     try {
 94 |         CompareApp app(argc, argv);
 95 |         if (app.run()) {
 96 |             status = EXIT_SUCCESS;
 97 |         }
 98 |     } catch (const std::exception& err) {
 99 |         printf("%s\n", err.what());
100 |     } catch (const Application::Exit&) {
101 |         status = EXIT_SUCCESS; // Application::Exit counts as success
102 |     }
103 |     return status;
104 | }
105 | 
106 | 


--------------------------------------------------------------------------------
/scripts/cost.py:
--------------------------------------------------------------------------------
  1 | SPACE = '_'
  2 | DIGIT = 'd'
  3 | 
  4 | class Cost(object):
  5 |     def __init__(self):
  6 |         self.compare = 0
  7 |         self.multiplication = 0
  8 |         self.add_sub = 0
  9 |         self.bit_and = 0
 10 |         self.pack = 0
 11 |         self.movemask = 0
 12 |         self.load = 0
 13 | 
 14 |         self.store = 0
 15 | 
 16 |     def value(self):
 17 |         c = 1.00 * self.compare + \
 18 |             1.00 * self.multiplication + \
 19 |             1.00 * self.add_sub + \
 20 |             1.00 * self.bit_and + \
 21 |             1.00 * self.pack + \
 22 |             1.00 * self.movemask + \
 23 |             1.00 * self.load
 24 |         
 25 |         if self.store > 0:
 26 |             return c/float(self.store)
 27 |         else:
 28 |             return c
 29 |             
 30 |     def __str__(self):
 31 |         return '%0.2f' % self.value()
 32 | 
 33 | def scalar_cost(bi):
 34 |     cost = Cost()
 35 | 
 36 |     prev = SPACE
 37 |     for c in bi.image:
 38 |         if c == SPACE:
 39 |             cost.compare += 1 # char in separators
 40 |             if prev == DIGIT:
 41 |                 # end of digits span
 42 |                 cost.store += 1
 43 |         else:
 44 |             # tmp = x - '0'
 45 |             cost.add_sub += 1
 46 |             # if x > 9 then invalid char
 47 |             pass
 48 |             # else
 49 |             # result = 10 * result + tmp
 50 |             cost.multiplication += 1
 51 |             cost.add_sub += 1
 52 |             
 53 |         prev = c
 54 | 
 55 |     # unlike SIMD algorithm, the last range is considered
 56 |     if prev == DIGIT:
 57 |         cost.store += 1
 58 | 
 59 |     return cost
 60 | 
 61 | 
 62 | def SSE_cost(bi):
 63 |     cost = Cost()
 64 | 
 65 |     # simd code always validate whole input
 66 |     cost.compare  += 3
 67 |     cost.bit_and  += 3
 68 |     cost.movemask += 2
 69 | 
 70 |     if bi.element_size == 1:
 71 |         cost.add_sub += 1
 72 |         cost.load += len(bi.spans)
 73 | 
 74 |     elif bi.element_size == 2:
 75 |         cost.add_sub += 1
 76 |         cost.multiplication += 1
 77 |         cost.load += len(bi.spans)
 78 | 
 79 |     elif bi.element_size == 4:
 80 |         cost.add_sub += 1
 81 |         cost.multiplication += 2
 82 |         cost.pack += 1
 83 |         cost.load += len(bi.spans)
 84 | 
 85 |     elif bi.element_size == 8:
 86 |         cost.add_sub += 1
 87 |         cost.multiplication += 3
 88 |         cost.pack += 1
 89 |         cost.load += len(bi.spans)
 90 | 
 91 |     else:
 92 |         cost.compare = 100000
 93 | 
 94 |     cost.store = len(bi.spans)
 95 | 
 96 |     return cost
 97 | 
 98 | 
 99 | def is_profitable(bi):
100 |     if len(bi.spans) == 0:
101 |         return False
102 | 
103 |     scalar = scalar_cost(bi)
104 |     sse    = SSE_cost(bi)
105 | 
106 |     return sse.cost() < scalar.cost()
107 | 
108 | 


--------------------------------------------------------------------------------
/include/sse/sse-parser-signed.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include <vector>
 4 | #include <cassert>
 5 | 
 6 | #include "scalar/scalar-parse-signed.h"
 7 | #include "sse-utils.h"
 8 | #include "sse-convert.h"
 9 | #include "sse-parser-common.h"
10 | #include "sse-parser-statistics.h"
11 | #include "block_info.h"
12 | 
13 | namespace sse {
14 | 
15 |     namespace detail {
16 | 
17 |         template <typename MATCHER, typename INSERTER>
18 |         char* process_chunk(char* data, char* end, const __m128i& input, MATCHER matcher, INSERTER output) {
19 |             // Processes the 16-byte chunk `input`; returns the position to continue from.
20 |             const __m128i ascii_minus = _mm_set1_epi8('-');
21 |             const __m128i ascii_plus  = _mm_set1_epi8('+');
22 | 
23 |             const __m128i bytemask_digit = decimal_digits_mask(input);
24 |             // a "span" byte is a digit or a sign
25 |             const __m128i bytemask_plus  = _mm_cmpeq_epi8(input, ascii_plus);
26 |             const __m128i bytemask_minus = _mm_cmpeq_epi8(input, ascii_minus);
27 |             const __m128i bytemask_sign  = _mm_or_si128(bytemask_plus, bytemask_minus);
28 |             const __m128i bytemask_span  = _mm_or_si128(bytemask_digit, bytemask_sign);
29 |             // NOTE(review): the matcher presumably adds the separator bytes to the span bytes - confirm
30 |             const uint16_t valid_mask    = _mm_movemask_epi8(matcher.get_mask(input, bytemask_span));
31 |             // all 16 bytes have to be accepted
32 |             if (valid_mask != 0xffff) {
33 |                 throw std::runtime_error("Wrong character");
34 |             }
35 |             // bit masks: one bit per byte of the chunk
36 |             const uint16_t sign_mask = _mm_movemask_epi8(bytemask_sign);
37 |             const uint16_t span_mask = _mm_movemask_epi8(bytemask_span);
38 |             STATS_SPAN_MASK(span_mask);
39 |             const BlockInfo& bi      = blocks[span_mask]; // table entry selected by the span pattern
40 |             if (sign_mask & bi.invalid_sign_mask) {
41 |                 throw std::runtime_error("'+' or '-' at invalid position");
42 |             }
43 | 
44 |             if (span_mask == 0) {
45 |                 return data + 16; // no digit or sign in the chunk: skip all 16 bytes
46 |             }
47 | 
48 |             STATS_INC(loops);
49 | 
50 |             if (sign_mask == 0 || bi.conversion_routine == Conversion::SSE1Digit) {
51 |                 // unsigned path
52 |                 return detail::parse_unsigned(bi, input, data, end, output);
53 |             } else {
54 |                 return detail::parse_signed(bi, input, data, end, output);
55 |             }
56 |         }
57 | 
58 |     }
59 | 
60 |     // SSE parser of signed numbers: 16-byte chunks are validated with `matcher`
61 |     // and converted by process_chunk, the tail is left to the scalar parser.
62 |     template <typename MATCHER, typename INSERTER>
63 |     void parser_signed(const char* string, size_t size, const char* separators, MATCHER matcher, INSERTER output) {
64 |         char* data = const_cast<char*>(string);
65 |         char* end  = data + size;
66 | 
67 |         // Compare distances: forming `data + 16` is undefined behaviour when
68 |         // fewer than 16 bytes are left.
69 |         while (end - data > 16) {
70 |             const __m128i input = _mm_loadu_si128(reinterpret_cast<__m128i*>(data));
71 |             data = detail::process_chunk(data, end, input, matcher, output);
72 |         }
73 |         scalar::parse_signed(data, string + size - data, separators, output); // the tail
74 |     }
75 | 
76 | } // namespace sse
77 | 


--------------------------------------------------------------------------------
/test/benchmark-hwevents.cpp:
--------------------------------------------------------------------------------
 1 | #include <vector>
 2 | #include <iterator>
 3 | #include <numeric>
 4 | #include <cstdio>
 5 | #include <cstdlib>
 6 | 
 7 | #include "linux-perf-events.h"
 8 | #include "sse/sse-matcher.h"
 9 | #include "sse/sse-parser-signed.h"
10 | #include "sse/sse-simplified-parser-signed.h"
11 | 
12 | #include "application.h"
13 | 
14 | // Measures hardware events (branches, cache accesses) of the SSE signed parser.
15 | class BenchmarkApp: public Application {
16 |     using SignedVector = std::vector<int32_t>;
17 | 
18 | public:
19 |     BenchmarkApp(int argc, char** argv) : Application(argc, argv) {}
20 | 
21 | private:
22 |     // hooks overridden from Application
23 |     bool custom_run() override;
24 |     void custom_init() override;
25 | 
26 |     std::string tmp;     // generated input
27 |     bool csv;            // print the results as a CSV row; set in custom_init()
28 | 
29 |     SignedVector result; // output of the parser
30 | };
31 | 
32 | void BenchmarkApp::custom_init() {
33 |     // the CSV mode also sets `quiet`
34 |     quiet = csv = cmdline.has_flag("--csv-output");
35 | }
36 | 
37 | bool BenchmarkApp::custom_run() {
38 |     // Runs the SSE signed parser get_loop_count() times and prints four hardware counters.
39 |     if (!csv) {
40 |         printf("Input size: %lu, loops: %lu\n", get_size(), get_loop_count());
41 |     }
42 | 
43 |     tmp = generate_signed();
44 | 
45 |     const char* separators = ";, ";
46 |     auto k = get_loop_count();
47 |     // one counter object per measured hardware event
48 |     LinuxEvents<PERF_TYPE_HARDWARE> ev_branches(PERF_COUNT_HW_BRANCH_INSTRUCTIONS);
49 |     LinuxEvents<PERF_TYPE_HARDWARE> ev_branch_misses(PERF_COUNT_HW_BRANCH_MISSES);
50 |     LinuxEvents<PERF_TYPE_HARDWARE> ev_cache_references(PERF_COUNT_HW_CACHE_REFERENCES);
51 |     LinuxEvents<PERF_TYPE_HARDWARE> ev_cache_misses(PERF_COUNT_HW_CACHE_MISSES);
52 |     // all counters are started right before the measured loop ...
53 |     ev_branches.start();
54 |     ev_branch_misses.start();
55 |     ev_cache_references.start();
56 |     ev_cache_misses.start();
57 |     while (k--) {
58 |         result.clear();
59 |         sse::NaiveMatcher<8> matcher(separators);
60 |         sse::parser_signed(tmp.data(), tmp.size(), separators,
61 |                            std::move(matcher), std::back_inserter(result));
62 |     }
63 |     // ... and read right after it
64 |     const auto branches         = ev_branches.end();
65 |     const auto branch_misses    = ev_branch_misses.end();
66 |     const auto cache_references = ev_cache_references.end();
67 |     const auto cache_misses     = ev_cache_misses.end();
68 |     // CSV columns: branches, branch misses, cache references, cache misses
69 |     if (csv) {
70 |         printf("%lu, %lu, %lu, %lu\n", branches, branch_misses, cache_references, cache_misses);
71 |     } else {
72 |         printf("branches:           %lu\n", branches);
73 |         printf("branch misses:      %lu\n", branch_misses);
74 |         printf("branch miss ratio:  %0.2f%%\n", (100.0 * branch_misses) / branches);
75 |         printf("cache references:   %lu\n", cache_references);
76 |         printf("cache misses:       %lu\n", cache_misses);
77 |         printf("cache miss ratio:   %0.2f%%\n", (100.0 * cache_misses) / cache_references);
78 |     }
79 | 
80 |     return true;
81 | }
82 | 
83 | 
84 | int main(int argc, char* argv[]) { // exit status reflects the result of app.run()
85 |     int status = EXIT_FAILURE;
86 |     try {
87 |         BenchmarkApp app(argc, argv);
88 |         if (app.run()) {
89 |             status = EXIT_SUCCESS;
90 |         }
91 |     } catch (const std::exception& err) {
92 |         printf("%s\n", err.what());
93 |     } catch (const Application::Exit&) {
94 |         status = EXIT_SUCCESS; // Application::Exit counts as success
95 |     }
96 |     return status;
97 | }
98 | 
99 | 


--------------------------------------------------------------------------------
/include/sse/sse-block-parser-unsigned.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include <vector>
 4 | #include <cassert>
 5 | 
 6 | #include "scalar/scalar-parse-unsigned.h"
 7 | #include "sse-utils.h"
 8 | #include "sse-convert.h"
 9 | #include "sse-parser-common.h"
10 | #include "sse-parser-statistics.h"
11 | #include "block_info.h"
12 | 
13 | namespace sse {
14 | 
15 |     template <typename MATCHER, typename INSERTER>
16 |     void parser_block_unsigned(const char* string, size_t size, const char* separators, MATCHER matcher, INSERTER output) {
17 |         // Parses unsigned numbers; the input is validated in blocks of four 16-byte chunks.
18 |         char* data = const_cast<char*>(string);
19 |         char* end  = data + size;
20 |         while (data + 16*4 < end) { // NOTE(review): `data + 16*4` is formed even when fewer than 64 bytes are left - confirm
21 |             const __m128i  input0 = _mm_loadu_si128(reinterpret_cast<__m128i*>(data + 0*16));
22 |             const __m128i  input1 = _mm_loadu_si128(reinterpret_cast<__m128i*>(data + 1*16));
23 |             const __m128i  input2 = _mm_loadu_si128(reinterpret_cast<__m128i*>(data + 2*16));
24 |             const __m128i  input3 = _mm_loadu_si128(reinterpret_cast<__m128i*>(data + 3*16));
25 |             const __m128i  t0 = decimal_digits_mask(input0);
26 |             const uint64_t digit_mask0 = _mm_movemask_epi8(t0);
27 |             const uint64_t valid_mask0 = _mm_movemask_epi8(matcher.get_mask(input0, t0));
28 |             const __m128i  t1 = decimal_digits_mask(input1);
29 |             const uint64_t digit_mask1 = _mm_movemask_epi8(t1);
30 |             const uint64_t valid_mask1 = _mm_movemask_epi8(matcher.get_mask(input1, t1));
31 |             const __m128i  t2 = decimal_digits_mask(input2);
32 |             const uint64_t digit_mask2 = _mm_movemask_epi8(t2);
33 |             const uint64_t valid_mask2 = _mm_movemask_epi8(matcher.get_mask(input2, t2));
34 |             const __m128i  t3 = decimal_digits_mask(input3);
35 |             const uint64_t digit_mask3 = _mm_movemask_epi8(t3);
36 |             const uint64_t valid_mask3 = _mm_movemask_epi8(matcher.get_mask(input3, t3));
37 | 
38 |             STATS_INC(loops);
39 |             // every byte of all four chunks has to be accepted by the matcher
40 |             if ((valid_mask0 & valid_mask1 & valid_mask2 & valid_mask3) != 0xffff) {
41 |                 throw std::runtime_error("Wrong character");
42 |             }
43 |             // 64-bit mask of digits: bit i corresponds to data[i]
44 |             const uint64_t digit_mask = digit_mask0
45 |                                      | (digit_mask1 << (1*16))
46 |                                      | (digit_mask2 << (2*16))
47 |                                      | (digit_mask3 << (3*16));
48 | 
49 |             if (digit_mask == 0) {
50 |                 data += 16*4; // no digits at all: skip the whole block
51 |                 continue;
52 |             }
53 |             // conversion is driven by the lowest 16 bits of the (shifted) digit mask
54 |             __m128i input = input0;
55 |             uint64_t mask = digit_mask;
56 |             char* loopend = data + 3*16; // stop once the position enters the last 16 bytes of the block
57 |             while (data < loopend) {
58 |                 char* prevdata = data;
59 |                 const BlockInfo& bi = blocks[mask & 0xffff];
60 |                 data = detail::parse_unsigned(bi, input, data, end, output);
61 |                 if (data == end) {
62 |                     break;
63 |                 }
64 |                 // drop the mask bits of the consumed bytes and reload from the new position
65 |                 const int shift = data - prevdata;
66 |                 mask >>= shift;
67 |                 input = _mm_loadu_si128(reinterpret_cast<__m128i*>(data));
68 |             }
69 | 
70 |         } // while
71 | 
72 |         // process the tail
73 |         scalar::parse_unsigned(data, string + size - data, separators, output);
74 |     }
75 | 
76 | } // namespace sse
77 | 


--------------------------------------------------------------------------------
/include/hybrid-parser-signed.h:
--------------------------------------------------------------------------------
  1 | #pragma once
  2 | 
  3 | #include <cstddef>
  4 | #include <cstdint>
  5 | #include <immintrin.h>
  6 | #include "sse/sse-utils.h"
  7 | #include "scalar/scalar-parse-signed.h"
  8 | 
  9 | namespace hybrid_signed {
 10 |     // Appends N characters of `s`, read as decimal digits, to `prev`; throws on a character below '0'.
 11 |     template<int N>
 12 |     uint32_t convert(const char* s, uint32_t prev) {
 13 |         int8_t digit = int8_t(s[0]) - '0';
 14 |         if (digit < 0) {
 15 |             throw std::runtime_error("'+' or '-' on a wrong position");
 16 |         }
 17 |         return convert<N - 1>(s + 1, prev * 10 + digit);
 18 |     }
 19 | 
 20 |     // End of recursion; `inline` because a full specialization defined in a header is not implicitly inline.
 21 |     template<>
 22 |     inline uint32_t convert<0>(const char* /*s*/, uint32_t prev) {
 23 |         return prev;
 24 |     }
 25 | 
 26 |     // Converts N characters: an optional leading sign followed by digits.
 27 |     template<int N>
 28 |     int32_t convert(const char* s) {
 29 |         if (s[0] == '+')
 30 |             return convert<N - 1>(s + 1, 0);
 31 |         if (s[0] == '-') // TODO: check range
 32 |             return -static_cast<int32_t>(convert<N - 1>(s + 1, 0));
 33 |         return convert<N>(s, 0);
 34 |     }
 35 | }
 36 | 
 37 | template <typename MATCHER, typename INSERTER>
 38 | void parser_hybrid_signed(const char* string, size_t size, const char* separators, MATCHER matcher, INSERTER output) {
 39 |     // NOTE(review): the include below presumably defines the `shift` table used after the loop - confirm
 40 |     #include "hybrid-shift-back.inl"
 41 | 
 42 |     const __m128i ascii_plus  = _mm_set1_epi8('+');
 43 |     const __m128i ascii_minus = _mm_set1_epi8('-');
 44 | 
 45 |     char* data = const_cast<char*>(string);
 46 |     char* end  = data + size;
 47 | 
 48 |     enum Previous {
 49 |         none,
 50 |         has_sign,
 51 |         has_value
 52 |     };
 53 |     // NOTE(review): prev, negative and val are presumably updated by the included switch cases - confirm
 54 |     Previous prev = none;
 55 |     bool    negative = false;
 56 |     int32_t val = 0;
 57 | 
 58 |     uint16_t span_mask = 0;
 59 |     while (data + 16 < end) {
 60 |         const __m128i  input = _mm_loadu_si128(reinterpret_cast<__m128i*>(data));
 61 |         const __m128i  bytemask_digit = sse::decimal_digits_mask(input);
 62 |         const __m128i  bytemask_plus  = _mm_cmpeq_epi8(input, ascii_plus);
 63 |         const __m128i  bytemask_minus = _mm_cmpeq_epi8(input, ascii_minus);
 64 |         const __m128i  bytemask_sign  = _mm_or_si128(bytemask_minus, bytemask_plus);
 65 |         const __m128i  bytemask_span  = _mm_or_si128(bytemask_digit, bytemask_sign);
 66 |         const uint16_t valid_mask = _mm_movemask_epi8(matcher.get_mask(input, bytemask_span));
 67 |         // all 16 bytes have to be accepted by the matcher
 68 |         if (valid_mask != 0xffff) {
 69 |             throw std::runtime_error("Wrong character");
 70 |         }
 71 | 
 72 |         span_mask = _mm_movemask_epi8(bytemask_span);
 73 |         if (span_mask == 0) {
 74 |             data += 16; // no digit or sign in the chunk
 75 |             continue;
 76 |         }
 77 |         // the lower 8 bytes of the chunk
 78 |         switch (span_mask & 0xff) {
 79 |             #include "hybrid-parser-signed.inl"
 80 |         }
 81 | 
 82 |         data += 8;
 83 |         // the upper 8 bytes of the chunk
 84 |         switch (span_mask >> 8) {
 85 |             #include "hybrid-parser-signed.inl"
 86 |         }
 87 | 
 88 |         data += 8;
 89 |     } // while
 90 | 
 91 |     // Shift back if the last span of the last chunk wasn't saved yet
 92 |     // XXX: If there is a really long sequence of digits (more
 93 |     //      than 16) then this fixup will not help.
 94 |     if (prev != none) {
 95 |         data -= shift[span_mask >> 8];
 96 |     }
 97 | 
 98 |     // process the tail
 99 |     scalar::parse_signed(data, string + size - data, separators, output);
100 | }
101 | 


--------------------------------------------------------------------------------
/experiments/overalltests/average.py:
--------------------------------------------------------------------------------
  1 | import sys
  2 | import os.path
  3 | 
  4 | if __name__ == '__main__' and __package__ is None:
  5 |     sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
  6 | 
  7 | from loader import load
  8 | from utils import groupby
  9 | from average_writer import RestWriter
 10 | from table import Table
 11 | from prettyprint import *
 12 | 
 13 | class Report(object):
 14 |     def __init__(self, path):
 15 |         with open(path, 'rt') as f:
 16 |             self.raw_data = load(f)
 17 | 
 18 |         bydistribution = lambda item: item.distribution_name
 19 | 
 20 |         bysep = lambda item: (item.sep_distribution, item.distribution_name)
 21 | 
 22 |         self.report = []
 23 |         for (sep, distribution_name), collection in groupby(self.raw_data, bysep).iteritems():
 24 |             ret = self.prepare_table(collection)
 25 |             self.report.append((
 26 |                 get_separator_title(sep),
 27 |                 get_distribution_title(distribution_name),
 28 |                 ret
 29 |             ))
 30 | 
 31 |     def get(self):
 32 |         return self.report
 33 | 
 34 | 
 35 |     def prepare_table(self, procedures):
 36 | 
 37 |         keyfun = lambda item: (item.size, item.loops, item.num_distribution)
 38 |         tmp = groupby(procedures, keyfun)
 39 | 
 40 |         data = {}
 41 |         for (size, loops, _), items in tmp.iteritems():
 42 |             def get_time(procedure):
 43 |                 for item in items:
 44 |                     if item.procedure == procedure:
 45 |                         return item.time
 46 | 
 47 |                 raise KeyError("Procedure '%s' not found" % procedure)
 48 | 
 49 |             t0 = get_time("scalar")
 50 |             t1 = get_time("sse")
 51 |             t2 = get_time("sse-block")
 52 | 
 53 |             if t0 < 10 and t1 < 10 and t2 < 10:
 54 |                 # don't fool people when all measurements are single-digit numbers
 55 |                 continue
 56 | 
 57 |             speedup_sse = float(t0)/t1
 58 |             speedup_sse_block = float(t0)/t2
 59 | 
 60 |             key = (size, loops)
 61 |             if key not in data:
 62 |                 data[key] = [[], []]
 63 |             
 64 |             data[key][0].append(speedup_sse)
 65 |             data[key][1].append(speedup_sse_block)
 66 | 
 67 |         t = Table()
 68 |         t.add_header([("input", 2), ("SSE speed-up", 3), ("SSE block speed-up", 3)])
 69 |         t.add_header(["size [B]", "loops", "min", "avg", "max", "min", "avg", "max"])
 70 | 
 71 |         def stats(numbers):
 72 |             s = sum(numbers)
 73 |             n = len(numbers)
 74 |             return min(numbers), s/n, max(numbers)
 75 | 
 76 |         for size, loops in sorted(data, key=lambda t: t[0]):
 77 |             
 78 |             key = size, loops
 79 | 
 80 |             sse = stats(data[key][0])
 81 |             sse_block = stats(data[key][1])
 82 | 
 83 |             t.add_row([
 84 |                 '{:,}'.format(size),
 85 |                 '%d' % loops,
 86 | 
 87 |                 '%0.2f' % sse[0],
 88 |                 '%0.2f' % sse[1],
 89 |                 '%0.2f' % sse[2],
 90 | 
 91 |                 '%0.2f' % sse_block[0],
 92 |                 '%0.2f' % sse_block[1],
 93 |                 '%0.2f' % sse_block[2],
 94 |             ])
 95 |         
 96 |         return t
 97 | 
 98 | 
 99 | if __name__ == '__main__':
100 |     rep = Report(sys.argv[1])
101 |     wrt = RestWriter(sys.stdout, rep.get())
102 |     try:
103 |         sep = sys.argv[2]
104 |     except IndexError:
105 |         sep = '~'
106 |         
107 |     wrt.write(sep)
108 | 


--------------------------------------------------------------------------------
/test/unittest/verify_sse_unsigned_conversion.cpp:
--------------------------------------------------------------------------------
  1 | #include <vector>
  2 | #include <iterator>
  3 | #include <algorithm>
  4 | #include <cstdio>
  5 | #include <cstdlib>
  6 | #include <cctype>
  7 | 
  8 | #include "block_info.h"
  9 | #include "scalar/scalar-parse-unsigned.h"
 10 | #include "scalar/scalar-parse-signed.h"
 11 | #include "sse/sse-convert.h"
 12 | #include "sse/sse-matcher.h"
 13 | 
 14 | class Verify {
 15 | 
 16 |     char buffer[17];
 17 |     __m128i input;
 18 | 
 19 |     std::vector<uint32_t> result;
 20 |     std::vector<uint32_t> reference;
 21 | 
 22 | public:
 23 |     Verify() {
 24 |         memset(buffer, 0, sizeof(buffer));
 25 |         result.resize(16);
 26 |     }
 27 | 
 28 |     bool run() {
 29 | 
 30 |         unsigned unsupported = 0;
 31 | 
 32 |         for (int x=0; x < 65536; x++) {
 33 |             generate_input(x);
 34 | 
 35 |             const BlockInfo& b = blocks[x];
 36 |             const __m128i shuffle_digits = _mm_loadu_si128((const __m128i*)b.shuffle_digits);
 37 |             const __m128i shuffled = _mm_shuffle_epi8(input, shuffle_digits);
 38 | 
 39 |             using namespace sse;
 40 | 
 41 |             if (b.conversion_routine == Conversion::SSE1Digit) {
 42 |                 convert_1digit(shuffled, b.element_count, &result[0]);
 43 |             } else if (b.conversion_routine == Conversion::SSE2Digits) {
 44 |                 convert_2digits(shuffled, b.element_count, &result[0]);
 45 |             } else if (b.conversion_routine == Conversion::SSE3Digits) {
 46 |                 convert_3digits(shuffled, b.element_count, &result[0]);
 47 |             } else if (b.conversion_routine == Conversion::SSE4Digits) {
 48 |                 convert_4digits(shuffled, b.element_count, &result[0]);
 49 |             } else if (b.conversion_routine == Conversion::SSE8Digits) {
 50 |                 convert_8digits(shuffled, b.element_count, &result[0]);
 51 |             } else {
 52 |                 unsupported += 1;
 53 |                 continue;
 54 |             }
 55 | 
 56 |             if (!compare(b.element_count)) {
 57 |                 return false;
 58 |             }
 59 |         } // for
 60 | 
 61 |         printf("All OK (%d cases will never be supported by SIMD code)\n", unsupported);
 62 |         return true;
 63 |     }
 64 | 
 65 | private:
 66 |     void generate_input(uint16_t x) {
 67 | 
 68 |         int k = 0;
 69 |         for (int i=0; i < 16; i++) {
 70 |             if (x & (1 << i)) {
 71 |                 buffer[i] = (k % 10) + '0';
 72 |                 k += 1;
 73 |             } else {
 74 |                 buffer[i] = '_';
 75 |             }
 76 |         }
 77 | 
 78 |         input = _mm_loadu_si128((const __m128i*)buffer);
 79 | 
 80 |         std::fill(result.begin(), result.end(), -1);
 81 | 
 82 |         reference.clear();
 83 |         scalar::parse_unsigned(buffer, 16, "_", std::back_inserter(reference));
 84 |     }
 85 | 
 86 |     bool compare(size_t n) const {
 87 |         for (size_t i=0; i < n; i++) {
 88 |             if (result[i] != reference[i]) {
 89 |                 printf("mismatch at %lu: expected=%u, result=%u\n", i, reference[i], result[i]);
 90 |                 printf("reference = "); dump(reference, n);
 91 |                 printf("result =    "); dump(result, n);
 92 |                 return false;
 93 |             }
 94 |         }
 95 | 
 96 |         return true;
 97 |     }
 98 | 
 99 |     void dump(const std::vector<uint32_t>& vec, size_t n) const {
100 |         for (size_t i=0; i < n; i++) {
101 |             if (i > 0) printf(", ");
102 |             printf("%u", vec[i]);
103 |         }
104 |         putchar('\n');
105 |     }
106 | 
107 | };
108 | 
109 | 
110 | int main() {
111 | 
112 |     puts("Verify SSE unsigned converters for valid inputs");
113 |     Verify verify;
114 |     if (!verify.run()) {
115 |         return EXIT_FAILURE;
116 |     }
117 | 
118 |     return EXIT_SUCCESS;
119 | }
120 | 
121 | 


--------------------------------------------------------------------------------
/include/scalar/scalar-parse-signed.h:
--------------------------------------------------------------------------------
  1 | #pragma once
  2 | 
  3 | #include <cstdint>
  4 | #include <cstring>
  5 | #include <stdexcept>
  6 | #include <limits>
  7 | 
  8 | #include "safe-convert.h"
  9 | #include "scalar-parse-common.h"
 10 | 
 11 | namespace scalar {
 12 | 
 13 |     template <typename INSERTER>
 14 |     void parse_signed(const char* data, size_t size, const char* separators, INSERTER output) {
 15 | 
 16 |         enum State {
 17 |             Separator,
 18 |             Plus,
 19 |             Minus,
 20 |             Digit
 21 |         };
 22 | 
 23 |         State state = Separator;
 24 |         State prev = Separator;
 25 |         bool negative = false;
 26 |         uint32_t number = 0;
 27 | 
 28 |         for (size_t i=0; i < size; i++) {
 29 |             const char c = data[i];
 30 |             if (c == '+') {
 31 |                 state = Plus;
 32 |             } else if (c == '-') {
 33 |                 state = Minus;
 34 |             } else if (c >= '0' && c <= '9') {
 35 |                 state = Digit;
 36 |             } else if (contains(separators, c)) {
 37 |                 state = Separator;
 38 |             } else {
 39 |                 throw std::runtime_error("Wrong character (scalar)");
 40 |             }
 41 | 
 42 |             switch (state) {
 43 |                 case Plus:
 44 |                     if (prev != Separator) {
 45 |                         throw std::runtime_error("Invalid syntax ('+' follows a non-separator character)");
 46 |                     }
 47 |                     number = 0;
 48 |                     negative = false;
 49 |                     break;
 50 | 
 51 |                 case Minus:
 52 |                     if (prev != Separator) {
 53 |                         throw std::runtime_error("Invalid syntax ('-' follows a non-separator character)");
 54 |                     }
 55 |                     number = 0;
 56 |                     negative = true;
 57 |                     break;
 58 | 
 59 |                 case Digit:
 60 |                     if (prev == Separator) {
 61 |                         number = c - '0';
 62 |                         negative = false;
 63 |                     } else {
 64 |                         mul10_add_digit(number, c);
 65 |                     }
 66 |                     break;
 67 | 
 68 |                 case Separator:
 69 |                     if (prev == Digit) {
 70 |                         if (negative) {
 71 |                             const int64_t tmp = std::numeric_limits<int32_t>::max();
 72 |                             const uint32_t absmin = -tmp;
 73 |                             if (number > absmin) {
 74 |                                 throw std::range_error("signed overflow");
 75 |                             }
 76 |                             *output = -number;
 77 |                         } else {
 78 |                             if (number > std::numeric_limits<int32_t>::max()) {
 79 |                                 throw std::range_error("signed overflow");
 80 |                             }
 81 | 
 82 |                             *output = number;
 83 |                         }
 84 |                     } else if (prev != Separator) {
 85 |                         throw std::runtime_error("Invalid syntax ('-' or '+' not followed by any digit)");
 86 |                     }
 87 |                     break;
 88 |             } // switch
 89 | 
 90 |             prev = state;
 91 |         } // for
 92 | 
 93 |         if (state == Separator) {
 94 |             if (prev == Digit) {
 95 |                 if (negative) {
 96 |                     *output = -number;
 97 |                 } else {
 98 |                     *output = number;
 99 |                 }
100 |             } else if (prev != Separator) {
101 |                 throw std::runtime_error("Invalid syntax ('-' or '+' not followed by any digit)");
102 |             }
103 |         }
104 |     }
105 | 
106 | } // namespace
107 | 
108 | 


--------------------------------------------------------------------------------
/scripts/hybrid.py:
--------------------------------------------------------------------------------
  1 | class DigitsSpan(object):
  2 |     # range: [first, last] - include the both ends
  3 |     def __init__(self, first, last):
  4 |         assert first <= last
  5 | 
  6 |         self.first = first
  7 |         self.last  = last
  8 | 
  9 |         assert self.digits() <= 8
 10 | 
 11 |     def digits(self):
 12 |         return self.last - self.first + 1
 13 | 
 14 |     def __str__(self):
 15 |         return "<%d, %d>" % (self.first, self.last)
 16 | 
 17 |     __repr__ = __str__
 18 | 
 19 | 
# Characters of the textual image built by Parser: one character per bit
# of the 8-bit mask (set bit -> DIGIT, clear bit -> SPACE).
DIGIT = 'd'
SPACE = '_'
 22 | 
 23 | class Parser(object):
 24 |     def __init__(self, number):
 25 |         assert number >= 0
 26 |         assert number < 256
 27 | 
 28 |         self.number = number
 29 |         self.image = self.__convert_to_string(number)
 30 | 
 31 | 
 32 |     def get_ranges(self):
 33 |         prev  = SPACE
 34 |         start = None
 35 |         ranges = []
 36 |         for i, c in enumerate(self.image):
 37 |             if c == prev:
 38 |                 continue
 39 | 
 40 |             if c == DIGIT: # transition
 41 |                 start = i
 42 |             else:
 43 |                 ranges.append(DigitsSpan(start, i - 1))
 44 |                 start = None
 45 | 
 46 |             prev = c
 47 | 
 48 |         if start is not None:
 49 |             ranges.append(DigitsSpan(start, 7))
 50 | 
 51 |         return ranges
 52 | 
 53 | 
 54 |     def __convert_to_string(self, x):
 55 |         s = ''
 56 |         for i in range(8):
 57 |             if x & (1 << i):
 58 |                 s += DIGIT
 59 |             else:
 60 |                 s += SPACE
 61 | 
 62 |         return s
 63 | 
 64 | 
# Kinds of items yielded by tokenize()
EMPTY                   = 1     # mask 0x00: no span at all
FULL                    = 2     # mask 0xff: a single span covering all 8 positions
FINALIZE_PREVIOUS       = 3     # the first span does not start at position 0
FIRST_CONTINUATION      = 4     # a span starting at position 0
WHOLE                   = 5     # a span touching neither position 0 nor 7
LAST                    = 6     # a span ending at position 7 (not starting at 0)
 71 | 
 72 | def tokenize(number):
 73 |     parser = Parser(number)
 74 |     ranges = parser.get_ranges()
 75 |     if number == 0x00:
 76 |         assert len(ranges) == 0
 77 |         yield (EMPTY, [])
 78 | 
 79 |     elif number == 0xff:
 80 |         assert len(ranges) == 1
 81 |         yield (FULL, ranges[0])
 82 | 
 83 |     else:
 84 |         if ranges[0].first > 0:
 85 |             yield (FINALIZE_PREVIOUS, None)
 86 |         for r in ranges:
 87 |             if r.first == 0:
 88 |                 yield (FIRST_CONTINUATION, r)
 89 |             elif r.last == 7:
 90 |                 yield (LAST, r)
 91 |             else:
 92 |                 yield (WHOLE, r)
 93 | 
 94 | 
 95 | class GeneratorBase(object):
 96 |     def __init__(self):
 97 |         self.span   = None
 98 |         self.number = None
 99 | 
100 |     def get(self):
101 | 
102 |         self.begin()
103 | 
104 |         self.lines = []
105 |         for number in range(2**8):
106 |             self.number = number
107 |             self.before()
108 |             for (kind, span) in tokenize(number):
109 |                 self.span = span
110 | 
111 |                 if kind == EMPTY:
112 |                     self.empty()
113 |                 elif kind == FULL:
114 |                     self.full()
115 |                 elif kind == FINALIZE_PREVIOUS:
116 |                     self.finalize_previous()
117 |                 elif kind == FIRST_CONTINUATION:
118 |                     self.first_continuation()
119 |                 elif kind == WHOLE:
120 |                     self.whole()
121 |                 elif kind == LAST:
122 |                     self.last()
123 |                 else:
124 |                     assert False
125 | 
126 |             #for
127 |             self.after()
128 |         #for
129 | 
130 |         self.end()
131 | 
132 |         return self.lines
133 | 
134 | 
135 |     def begin(self):
136 |         pass
137 | 
138 | 
139 |     def end(self):
140 |         pass
141 | 
142 | 
143 |     def before(self):
144 |         pass
145 | 
146 | 
147 |     def after(self):
148 |         pass
149 | 
150 | 


--------------------------------------------------------------------------------
/scripts/hybrid-signed.py:
--------------------------------------------------------------------------------
  1 | from hybrid import GeneratorBase
  2 | 
  3 | class GenerateSingedParser(GeneratorBase):
  4 | 
  5 |     def before(self):
  6 |         self.lines.append('case 0x%02x:' % self.number)
  7 | 
  8 |     def after(self):
  9 |         self.lines.append('break;')
 10 | 
 11 |     def empty(self):
 12 |         pass
 13 | 
 14 |     def full(self):
 15 |         l = self.lines
 16 | 
 17 |         l.append("if (prev != none) {")
 18 |         l.append("   val = %s;" % self.expression(self.span, "val"))
 19 |         l.append("} else {")
 20 |         l.append("   val = %s;" % self.expression(self.span))
 21 |         l.append("}")
 22 |         l.append("prev = has_value;")
 23 | 
 24 |     def finalize_previous(self):
 25 |         l = self.lines
 26 | 
 27 |         l.append("if (prev == has_value) {")
 28 |         l.append("    *output++ = (negative) ? -val : val;")
 29 |         l.append("    prev = none;")
 30 |         l.append("} else if (prev == has_sign) {")
 31 |                       # there was a sole sign at the end of the previous block
 32 |         l.append('    throw std::runtime_error("wrong syntax");')
 33 |         l.append("}")
 34 | 
 35 |     def first_continuation(self):
 36 |         l = self.lines
 37 | 
 38 |         l.append("if (prev == has_value) {")
 39 |         l.append("   val = %s;" % self.expression(self.span, "val"))
 40 |         l.append("   *output++ = (negative) ? -val : val;")
 41 |         l.append("   prev = none;")
 42 |         l.append("} else if (prev == has_sign) {")
 43 |         l.append("   val = %s;" % self.expression(self.span, "0"))
 44 |         l.append("   *output++ = (negative) ? -val : val;")
 45 |         l.append("   prev = none;")
 46 |         l.append("} else {")
 47 |         l.append("   *output++ = %s;" % self.expression(self.span))
 48 |         l.append("}")
 49 | 
 50 |     def whole(self):
 51 |         self.lines.append("*output++ = %s;" % self.expression(self.span))
 52 | 
 53 |     def last(self):
 54 |         l = self.lines
 55 |         span = self.span
 56 | 
 57 |         if span.digits() == 1:
 58 |             # just one character
 59 |             l.append("if (data[%d] == '+') {" % span.first)
 60 |             l.append("  prev = has_sign;")
 61 |             l.append("  negative = false;")
 62 |             l.append("} else if (data[%d] == '-') {" % span.first)
 63 |             l.append("  prev = has_sign;")
 64 |             l.append("  negative = true;")
 65 |             l.append("} else {")
 66 |             l.append("  val = %s;" % self.expression(span, "0"))
 67 |             l.append("  prev = has_value;")
 68 |             l.append("  negative = false;")
 69 |             l.append("}")
 70 |             
 71 |         else:
 72 |             l.append("if (data[%d] == '+') {" % span.first)
 73 |             l.append("  val = %s;" % self.invocation(span.digits() - 1, span.first + 1))
 74 |             l.append("  negative = false;")
 75 |             l.append("} else if (data[%d] == '-') {" % span.first)
 76 |             l.append("  val = %s;" % self.invocation(span.digits() - 1, span.first + 1))
 77 |             l.append("  negative = true;")
 78 |             l.append("} else {")
 79 |             l.append("  val = %s;" % self.expression(span, "0"))
 80 |             l.append("  negative = false;")
 81 |             l.append("}")
 82 |             l.append("prev = has_value;")
 83 | 
 84 |     
 85 |     def invocation(self, digits, offset, arg = None):
 86 |         result = "hybrid_signed::convert<%d>(data" % digits
 87 |         if offset != 0:
 88 |             result += ' + %d' % offset
 89 |         
 90 |         if arg is not None:
 91 |             result += ', %s' % arg
 92 | 
 93 |         result += ')'
 94 |         return result
 95 | 
 96 | 
 97 |     def expression(self, span, arg = None):
 98 |         return self.invocation(span.digits(), span.first, arg)
 99 | 
100 | 
if __name__ == '__main__':
    # print the generated code, one line per row
    for text in GenerateSingedParser().get():
        print(text)
105 | 
106 | 


--------------------------------------------------------------------------------
/test/compare-signed.cpp:
--------------------------------------------------------------------------------
  1 | #include <vector>
  2 | #include <iterator>
  3 | #include <cstdio>
  4 | #include <cstdlib>
  5 | 
  6 | #include "scalar/scalar-parse-unsigned.h"
  7 | #include "sse/sse-matcher.h"
  8 | #include "sse/sse-parser-signed.h"
  9 | #include "sse/sse-block-parser-signed.h"
 10 | 
 11 | #include "application.h"
 12 | 
 13 | class CompareApp: public Application {
 14 | 
 15 |     using Vector = std::vector<int32_t>;
 16 | 
 17 | private:
 18 |     const std::string separators;
 19 |     std::string input_string;
 20 |     Vector reference;
 21 |     Vector result;
 22 | 
 23 | public:
 24 |     CompareApp(int argc, char** argv)
 25 |         : Application(argc, argv)
 26 |         , separators(";, ") {}
 27 | 
 28 | private:
 29 |     virtual bool custom_run() override;
 30 | 
 31 | private:
 32 |     void run_sse_parser() {
 33 | 
 34 |         sse::NaiveMatcher<8> matcher(separators.c_str());
 35 |         result.clear();
 36 |         sse::parser_signed(
 37 |             input_string.data(),
 38 |             input_string.size(),
 39 |             separators.c_str(),
 40 |             std::move(matcher),
 41 |             std::back_inserter(reference));
 42 |     }
 43 | 
 44 |     void run_sse_block_parser() {
 45 | 
 46 |         sse::NaiveMatcher<8> matcher(separators.c_str());
 47 |         result.clear();
 48 |         sse::parser_block_signed(
 49 |             input_string.data(),
 50 |             input_string.size(),
 51 |             separators.c_str(),
 52 |             std::move(matcher),
 53 |             std::back_inserter(result));
 54 |     }
 55 | 
 56 | private:
 57 |     void dump(const Vector& vec) const;
 58 |     bool compare(const Vector& expected, const Vector& result) const;
 59 | 
 60 | };
 61 | 
 62 | bool CompareApp::custom_run() {
 63 | 
 64 |     input_string = generate_signed();
 65 |     scalar::parse_signed(input_string.data(),
 66 |                          input_string.size(),
 67 |                          separators.c_str(),
 68 |                          std::back_inserter(reference));
 69 | 
 70 |     puts("Checking SSE parser");
 71 |     run_sse_parser();
 72 |     if (!compare(reference, result)) {
 73 |         puts(input_string.c_str());
 74 |         puts("");
 75 |         dump(reference);
 76 |         puts("");
 77 |         dump(result);
 78 | 
 79 |         return false;
 80 |     }
 81 | 
 82 |     puts("Checking SSE block parser");
 83 |     run_sse_block_parser();
 84 |     if (!compare(reference, result)) {
 85 |         puts(input_string.c_str());
 86 |         puts("");
 87 |         dump(reference);
 88 |         puts("");
 89 |         dump(result);
 90 | 
 91 |         return false;
 92 |     }
 93 | 
 94 | 
 95 |     puts("All OK");
 96 |     return true;
 97 | }
 98 | 
 99 | void CompareApp::dump(const Vector& vec) const {
100 |     printf("size = %lu: [", vec.size());
101 | 
102 |     const size_t n = vec.size();
103 |     if (n) {
104 |         printf("%d", vec[0]);
105 |     }
106 | 
107 |     for (size_t i=1; i < n; i++) {
108 |         printf(", %d", vec[i]);
109 |     }
110 | 
111 |     printf("]\n");
112 | }
113 | 
114 | bool CompareApp::compare(const Vector& expected, const Vector& result) const {
115 | 
116 |     if (expected.size() != result.size()) {
117 |         puts("different sizes");
118 |         return false;
119 |     }
120 | 
121 |     const size_t n = expected.size();
122 |     for (size_t i=0; i < n; i++) {
123 |         const auto e = expected[i];
124 |         const auto r = result[i];
125 | 
126 |         if (e != r) {
127 |             printf("error at #%lu: expected = %d, result = %d\n", i, e, r);
128 |             return false;
129 |         }
130 |     }
131 | 
132 |     return true;
133 | }
134 | 
135 | int main(int argc, char* argv[]) {
136 | 
137 |     try {
138 |         CompareApp app(argc, argv);
139 | 
140 |         return app.run() ? EXIT_SUCCESS : EXIT_FAILURE;
141 | 
142 |     } catch (std::exception& e) {
143 |         printf("%s\n", e.what());
144 |         return EXIT_FAILURE;
145 |     } catch (Application::Exit&) {
146 |         return EXIT_SUCCESS;
147 |     }
148 | }
149 | 
150 | 


--------------------------------------------------------------------------------
/test/benchmark-cpuclocks.cpp:
--------------------------------------------------------------------------------
  1 | #include <vector>
  2 | #include <iterator>
  3 | #include <numeric>
  4 | #include <cstdio>
  5 | #include <cstdlib>
  6 | 
  7 | #include "benchmark.h"
  8 | #include "scalar/std-parser-signed.h"
  9 | #include "hybrid-parser-signed.h"
 10 | #include "sse/sse-matcher.h"
 11 | #include "sse/sse-parser-signed.h"
 12 | #include "sse/sse-block-parser-signed.h"
 13 | #include "sse/sse-simplified-parser-signed.h"
 14 | 
 15 | #include "application.h"
 16 | 
// Benchmarks all parsers of signed numbers on the same generated input.
class BenchmarkApp: public Application {

    using SignedVector = std::vector<int32_t>;

public:
    BenchmarkApp(int argc, char** argv) : Application(argc, argv) {}

private:
    virtual bool custom_run() override;

private:
    // the generated input text
    std::string tmp;

    // output of each benchmarked procedure
    struct ResultSigned {
        SignedVector reference;
        SignedVector SSE;
        SignedVector SSEblock;
        SignedVector std_scalar;
        SignedVector SSEsimplified;
        SignedVector hybrid;
    } result_signed;
};
 39 | 
// Generates the input once and measures every parser on it.
//
// Each BEST_TIME invocation receives, in order: the preparation code
// ("pre"), the measured code ("test"), a label, the number of repetitions
// and the input size. The macro comes from benchmark.h and is not visible
// here; its arguments have to be kept exactly as written.
//
// Always returns true.
bool BenchmarkApp::custom_run() {

    printf("Input size: %lu, loops: %lu\n", get_size(), get_loop_count());

    tmp = generate_signed();

    const char* separators = ";, ";

    const auto repeat = get_loop_count();
    const auto size   = tmp.size();

    BEST_TIME(
        // pre:
        result_signed.reference.clear(),

        // test:
        scalar::parse_signed(tmp.data(), tmp.size(), separators,
                             std::back_inserter(result_signed.reference)),
        "scalar",
        repeat,
        size
    );

    BEST_TIME(
        // pre:
        result_signed.SSE.clear();
        sse::NaiveMatcher<8> matcher(separators);,

        // test:
        sse::parser_signed(tmp.data(), tmp.size(), separators,
                           std::move(matcher), std::back_inserter(result_signed.SSE)),
        "SSE",
        repeat,
        size
    );

    BEST_TIME(
        // pre:
        result_signed.SSEblock.clear();
        sse::NaiveMatcher<8> matcher(separators);,

        // test:
        sse::parser_block_signed(
            tmp.data(), tmp.size(),
            separators,
            std::move(matcher), std::back_inserter(result_signed.SSEblock));,

        "SSE (block)",
        repeat,
        size
    );

    BEST_TIME(
        // pre:
        result_signed.std_scalar.clear();,

        // test:
        scalar::cstd::parse_signed(
            tmp.data(), tmp.size(),
            separators,
            std::back_inserter(result_signed.std_scalar));,

        "scalar (std)",
        repeat,
        size
    );

    BEST_TIME(
        // pre:
        result_signed.SSEsimplified.clear();,

        // test:
        sse_simplified::parse_signed(
            tmp.data(), tmp.size(),
            separators,
            std::back_inserter(result_signed.SSEsimplified));,

        "SSE (simplified)",
        repeat,
        size
    );

    BEST_TIME(
        // pre:
        result_signed.hybrid.clear();
        sse::NaiveMatcher<8> matcher(separators);,

        // test:
        parser_hybrid_signed(tmp.data(), tmp.size(), separators,
                             std::move(matcher), std::back_inserter(result_signed.hybrid)),
        "scalar (hybrid)",
        repeat,
        size
    );

    return true;
}
137 | 
138 | 
139 | int main(int argc, char* argv[]) {
140 | 
141 |     try {
142 |         BenchmarkApp app(argc, argv);
143 | 
144 |         return app.run() ? EXIT_SUCCESS : EXIT_FAILURE;
145 | 
146 |     } catch (std::exception& e) {
147 |         printf("%s\n", e.what());
148 |         return EXIT_FAILURE;
149 |     } catch (Application::Exit&) {
150 |         return EXIT_SUCCESS;
151 |     }
152 | }
153 | 
154 | 


--------------------------------------------------------------------------------
/README.rst:
--------------------------------------------------------------------------------
  1 | ================================================================================
  2 |                 Parsing series of integers with SIMD
  3 | ================================================================================
  4 | 
  5 | Sample programs for article `Parsing series of integers with SIMD`__
  6 | 
  7 | __ http://0x80.pl/articles/simd-parsing-int-sequences.html
  8 | 
  9 | Parsers extract integer numbers from strings. A number can be preceded by a
 10 | sign character. The numbers are separated by arbitrary sequences of separator
 11 | chars. All other characters are invalid; the parsers detect them and raise an
 12 | exception.
 13 | 
 14 | This repository contains:
 15 | 
 16 | * scalar reference implementation;
 17 | * two variants of SSE parsers; there are also separate variants designed
 18 |   solely for parsing unsigned numbers;
 19 | * scalar hybrid that combines ideas from SIMD parsing with scalar
 20 |   conversion procedures.
 21 | 
 22 | Requires: C++11 compiler (tested with GCC 7.3) and Python 2.7.
 23 | 
 24 | 
 25 | Usage
 26 | --------------------------------------------------------------------------------
 27 | 
 28 | Type ``make`` to build all programs.
 29 | 
 30 | Type ``make run-unittests`` to build all unit tests and then run them.
 31 | Some tests are time consuming, be patient.
 32 | 
 33 | Type ``make microbenchmarks.rst`` to run microbenchmarks.
 34 | 
 35 | Type ``make report-overall.rst`` to run performance benchmarks.
 36 | 
 37 | Type ``make spanmaskhistogram.rst`` to produce runtime analysis report
 38 | for SSE implementation.
 39 | 
 40 | 
 41 | Programs
 42 | --------------------------------------------------------------------------------
 43 | 
 44 | There are several programs available in ``bin`` subdirectory.
 45 | 
 46 | * ``benchmark`` --- test performance of given procedure
 47 | * ``benchmark-cpuclocks`` --- measure performance of all procedures; display
 48 |   CPU clocks
 49 | * ``benchmark-all`` --- compare performance of different procedures
 50 | * ``compare-signed`` and ``compare-unsigned`` --- are used to
 51 |   validate if parsers produce the same results as the reference
 52 |   parser
 53 | * ``compare-avx512`` --- the same as above, but tests only
 54 |   AVX512BW implementation
 55 | * ``statistics`` --- gather execution statistics from SSE parsers
 56 | 
 57 | Apart from these programs, there are several ``verify_*`` executables
 58 | that run various unit tests; they are invoked by ``make run-tests``.
 59 | 
 60 | Common arguments
 61 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 62 | 
 63 | All the programs generate random data, which is then parsed. The
 64 | following command line arguments can be used to control data
 65 | characteristics:
 66 | 
 67 |     --size=NUMBER         input size (in bytes)
 68 |     --loops=NUMBER        how many times a test must be repeated [default: 1]
 69 |     --seed=NUMBER         seed for random number generator [default: 0]
 70 |     --num=DISTRIBUTION    distribution of lengths of numbers
 71 |     --sep=DISTRIBUTION    distribution of lengths of gaps between numbers [default: '1']
 72 |     --separators=string   list of separator characters [default: ",; "]
 73 |     --sign=DISTRIBUTION   distribution of sign in front of number [default: '1']
 74 | 
 75 | ``DISTRIBUTION`` is a list of weights separated with commas, which defines
 76 | distribution of items.
 77 | 
 78 | In case of ``--num`` it's the count of decimal digits in a random number.
 79 | For instance ``--num=1,1,1,1`` will produce one-, two-, three- or four-digit
 80 | numbers with the same probability; ``--num=0,0,0,1,5,1`` will produce four-,
 81 | five- or six-digit numbers, but five-digit numbers with probability 5/7.
 82 | 
 83 | In case of ``--sep`` it is the distribution of numbers of separator characters
 84 | between the generated numbers. The default ``--sep=1`` means there's always
 85 | exactly one character; ``--sep=0,0,1,1,1`` would put from 2 to 4 separator chars.
 86 | 
 87 | The ``--sign`` defines distribution of set: no-character-sign, '+' and '-'.
 88 | Thus the default ``--sign=1`` forces just unsigned numbers; ``--sign=0,0,1``
 89 | will force all numbers negative.
 90 | 
 91 | 
 92 | TODO
 93 | --------------------------------------------------------------------------------
 94 | 
 95 | * Complete AVX512 implementation to handle scalar fallback.
 96 | 
 97 | 
 98 | License
 99 | --------------------------------------------------------------------------------
100 | 
101 | BSD
102 | 


--------------------------------------------------------------------------------
/experiments/overalltests/report.py:
--------------------------------------------------------------------------------
  1 | import sys
  2 | import os.path
  3 | 
# When the file is run directly as a script (not as a part of a package),
# append the parent directory to the module search path; presumably the
# modules imported below (e.g. utils, prettyprint) are located there.
if __name__ == '__main__' and __package__ is None:
    sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
  6 | 
  7 | from table import Table
  8 | from loader import load
  9 | from utils import groupby
 10 | from report_writer import RestWriter
 11 | from prettyprint import *
 12 | 
 13 | 
 14 | class Report(object):
 15 | 
 16 |     def __init__(self, path):
 17 |         with open(path, 'rt') as f:
 18 |             self.raw_data = load(f)
 19 | 
 20 |         # group by separators distribution
 21 |         bysep = lambda item: item.sep_distribution
 22 | 
 23 |         self.report = []
 24 |         for sep, collection in groupby(self.raw_data, bysep).items():
 25 |             ret = self.split_by_distribution(collection)
 26 |             self.report.append((
 27 |                 get_separator_title(sep),
 28 |                 ret
 29 |             ))
 30 | 
 31 | 
 32 |     def get(self):
 33 |         return self.report
 34 | 
 35 | 
 36 |     def split_by_distribution(self, collection):
 37 |         result = []
 38 | 
 39 |         bynum = lambda item: (item.distribution_name)
 40 |         tmp = groupby(collection, bynum)
 41 |         for distribution_name, collection in tmp.items():
 42 |             res = self.split_by_parameters(distribution_name, collection)
 43 |             result.append((
 44 |                 get_distribution_title(distribution_name),
 45 |                 res
 46 |             ))
 47 | 
 48 |         return result
 49 |  
 50 | 
 51 |     def split_by_parameters(self, distribution_name, collection):
 52 |         byparam = lambda item: item.num_distribution
 53 | 
 54 |         result = []
 55 |         for key, collection in groupby(collection, byparam).items():
 56 |             table = self.prepare_table(collection)
 57 |             ret   = get_num_distribution_parameters(distribution_name, key)
 58 |             result.append((
 59 |                 ret.title,
 60 |                 table,
 61 |                 ret.weight
 62 |             ))
 63 | 
 64 |         result.sort(key=lambda row: row[-1])
 65 | 
 66 |         return [item[:2] for item in result]
 67 | 
 68 | 
 69 |     def prepare_table(self, procedures):
 70 | 
 71 |         keyfun = lambda item: (item.size, item.loops)
 72 |         tmp = groupby(procedures, keyfun)
 73 | 
 74 |         data = []
 75 |         for (size, loops), items in tmp.items():
 76 |             def get_time(procedure):
 77 |                 for item in items:
 78 |                     if item.procedure == procedure:
 79 |                         return item.time
 80 | 
 81 |                 raise KeyError("Procedure '%s' not found" % procedure)
 82 | 
 83 |             data.append((
 84 |                 size,
 85 |                 loops,
 86 |                 get_time("scalar"),
 87 |                 get_time("sse"),
 88 |                 get_time("sse-block"),
 89 |             ))
 90 | 
 91 |         data.sort(key=lambda t: t[0]) # sort by size
 92 | 
 93 |         t = Table()
 94 |         t.add_header([("input", 2), "scalar", ("SSE", 2), ("SSE block", 2)])
 95 |         t.add_header(["size [B]", "loops", "time [us]", "time [us]", "speed-up", "time [us]", "speed-up"])
 96 | 
 97 |         for item in data:
 98 |             t0 = item[2]
 99 |             t1 = item[3]
100 |             t2 = item[4]
101 |             if t0 < 10 and t1 < 10 and t2 < 10:
102 |                 # don't fool people when all measurements are single-digit numbers
103 |                 speedup_sse = '---'
104 |                 speedup_sse_block = '---'
105 |             else:
106 |                 speedup_sse = '%0.2f' % (float(t0)/t1)
107 |                 speedup_sse_block = '%0.2f' % (float(t0)/t2)
108 | 
109 |             t.add_row([
110 |                 '{:,}'.format(item[0]),
111 |                 '%d' % item[1],
112 |                 '%d' % item[2],
113 |                 '%d' % item[3],
114 |                 speedup_sse,
115 |                 '%d' % item[4],
116 |                 speedup_sse_block,
117 |             ])
118 |         
119 |         return t
120 | 
121 | 
122 | 
123 | 
124 | def main():
125 |     report = Report(sys.argv[1])
126 |     writer = RestWriter(sys.stdout, report.get())
127 |     try:
128 |         restsection = sys.argv[2]
129 |     except IndexError:
130 |         restsection = "-~#"
131 | 
132 |     writer.write(restsection)
133 | 
134 | if __name__ == '__main__':
135 |     main()
136 | 


--------------------------------------------------------------------------------
/test/unittest/verify_sse_signed_overflow_detection.cpp:
--------------------------------------------------------------------------------
  1 | #include <iterator>
  2 | #include <cstdio>
  3 | #include <limits>
  4 | 
  5 | #include "sse/sse-parser-signed.h"
  6 | #include "sse/sse-matcher.h"
  7 | 
  8 | 
  9 | const char SEPARATOR = '_';
 10 | 
 11 | class Verify {
 12 |     // Drives sse::parser_signed with a single number placed at every offset
 13 |     // 0..31 of a SEPARATOR-filled buffer and checks how overflow is reported.
 14 | 
 15 |     using Vector = std::vector<int32_t>;
 16 |     Vector result;          // numbers produced by the last convert() call
 17 | 
 18 |     std::string input;      // text handed to the parser
 19 |     std::string image;      // decimal image of the number under test
 20 |     size_t      size;       // total length of `input`
 21 |     size_t      position;   // offset of `image` within `input`
 22 |     int64_t     value;      // number `image` was produced from
 23 | 
 24 | public:
 25 |     Verify() : size(64) {}
 26 | 
 27 |     // Runs the enabled checks; a failed check trips an assert rather than
 28 |     // changing the returned value.
 29 |     bool run() {
 30 |         //check_not_overflow();
 31 |         check_overflow();
 32 | 
 33 |         printf("All OK\n");
 34 |         return true;
 35 |     }
 36 | 
 37 | private:
 38 |     // Sets the number under test together with its textual image.
 39 |     void use_value(int64_t number) {
 40 |         value = number;
 41 |         image = std::to_string(value);
 42 |     }
 43 | 
 44 |     // Announces the current image and requires that it parses correctly.
 45 |     void expect_fit() {
 46 |         printf("'%s' should not overflow\n", image.c_str());
 47 |         assume_not_overflow();
 48 |     }
 49 | 
 50 |     // Announces the current image and requires that parsing reports overflow.
 51 |     void expect_overflow() {
 52 |         printf("'%s' should overflow\n", image.c_str());
 53 |         assume_overflow();
 54 |     }
 55 | 
 56 |     // Extreme int32_t values, also with an explicit plus sign.
 57 |     void check_not_overflow() {
 58 |         use_value(std::numeric_limits<int32_t>::max());
 59 |         expect_fit();
 60 | 
 61 |         image.insert(image.begin(), '+');
 62 |         expect_fit();
 63 | 
 64 |         use_value(std::numeric_limits<int32_t>::min());
 65 |         expect_fit();
 66 |     }
 67 | 
 68 |     // Parses the image at every offset; exactly `value` must be produced.
 69 |     void assume_not_overflow() {
 70 |         position = 0;
 71 |         while (position < 32) {
 72 |             prepare_input();
 73 |             convert();
 74 | 
 75 |             assert(result.size() == 1);
 76 |             assert(result[0] == value);
 77 | 
 78 |             position++;
 79 |         }
 80 |     }
 81 | 
 82 |     // Values just outside and far outside of the int32_t range.
 83 |     void check_overflow() {
 84 |         use_value(int64_t(std::numeric_limits<int32_t>::max()) + 1);
 85 |         expect_overflow();
 86 | 
 87 |         image.insert(image.begin(), '+');
 88 |         expect_overflow();
 89 | 
 90 |         const int64_t out_of_range[] = {
 91 |             9999999999l,
 92 |             100000000000000l,
 93 |             int64_t(std::numeric_limits<int32_t>::min()) - 1,
 94 |             -9999999999l,
 95 |             -100000000000000l
 96 |         };
 97 | 
 98 |         for (const int64_t number: out_of_range) {
 99 |             use_value(number);
100 |             expect_overflow();
101 |         }
102 |     }
103 | 
104 |     // Parses the image at every offset; each attempt must end with
105 |     // std::range_error, any other outcome trips an assert.
106 |     void assume_overflow() {
107 |         position = 0;
108 |         while (position < 32) {
109 |             prepare_input();
110 |             try {
111 |                 convert();
112 |                 assert(false && "must fail");
113 |             } catch (std::range_error& e) {
114 |                 // nothing may have been emitted before the error was raised
115 |                 assert(result.size() == 0);
116 |             } catch (...) {
117 |                 assert(false && "unexpected exception");
118 |             }
119 | 
120 |             position++;
121 |         }
122 |     }
123 | 
124 |     // Builds `input`: `position` separators, the image, then separators
125 |     // up to `size` characters in total.
126 |     void prepare_input() {
127 |         input.assign(position, SEPARATOR);
128 |         input.append(image);
129 |         input.append(size - input.size(), SEPARATOR);
130 |     }
131 | 
132 |     // Parses `input` into `result`, with SEPARATOR as the only separator.
133 |     void convert() {
134 |         const char separators[] = {SEPARATOR, '\0'};
135 | 
136 |         result.clear();
137 |         sse::NaiveMatcher<8> matcher(separators);
138 |         sse::parser_signed(
139 |             input.data(),
140 |             input.size(),
141 |             separators,
142 |             std::move(matcher),
143 |             std::back_inserter(result));
144 |     }
145 | };
146 | 
147 | 
148 | int main() {
149 |     // The exit status mirrors the value returned by Verify::run().
150 |     puts("Verify if sse::signed_parser detects overflows");
151 | 
152 |     Verify verify;
153 |     const bool passed = verify.run();
154 | 
155 |     return passed ? EXIT_SUCCESS : EXIT_FAILURE;
156 | }
157 | 


--------------------------------------------------------------------------------
/experiments/spanmaskhistogram/report.py:
--------------------------------------------------------------------------------
  1 | import sys
  2 | import os.path
  3 | 
  4 | if __name__ == '__main__' and __package__ is None:
  5 |     sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
  6 | 
  7 | from table import Table
  8 | from utils import splitsorted
  9 | from prettyprint import *
 10 | from loader import load
 11 | from report_writer import RestWriter
 12 | 
 13 | 
 14 | class Report(object):
 15 | 
 16 |     def __init__(self):
 17 |         self.report = None
 18 |         self.tmp = []
 19 |         self.prev_size = None
 20 | 
 21 |     def add(self, item):
 22 |         if item.size != self.prev_size:
 23 |             self.prev_size = item.size
 24 |             self.tmp.append((item.size, []))
 25 | 
 26 |         title = '%s, %s' % (
 27 |             get_num_distribution_parameters(item.distribution_name, item.numbers_distribution).title,
 28 |             get_separator_title(item.separators_distribution))
 29 | 
 30 |         self.tmp[-1][1].append((item.distribution_name, title, item.histogram, item.hwevents, item.cycles))
 31 | 
 32 | 
 33 |     def get(self):
 34 |         if self.report is None:
 35 |             self.report = []
 36 |             for size, statistics in self.tmp:
 37 |                 title = 'Input size {:,d} bytes'.format(size)
 38 |                 self.report.append((title, self.prepare_table(statistics)))
 39 | 
 40 |         return self.report
 41 | 
 42 | 
 43 |     def prepare_table(self, stats):
 44 | 
 45 |         t = Table()
 46 |         t.add_header(["parameters", ("distinct span masks count", 5), ("cycles per byte", 2), ("branches", 3), ("cache references", 3)])
 47 |         t.add_header(["", "< 25%", "< 50%", "< 75%", "< 95%", "100%", "min", "avg", "taken", "mispredicted", "ratio", "count", "missed", "ratio"])
 48 | 
 49 |         splitted = splitsorted(stats, lambda item: item[0])
 50 | 
 51 |         for subarray in splitted:
 52 |             distribution_name = subarray[0][0]
 53 |             title = get_distribution_title(distribution_name)
 54 |             t.add_row([(title, 14)])
 55 | 
 56 |             for distribution_name, parameters, histogram, hwevents, cycles in subarray:
 57 | 
 58 |                 row = [parameters]
 59 | 
 60 |                 # histogram
 61 |                 weights = [0.25, 0.50, 0.75, 0.95, 1.00]
 62 |                 tmp = self.process_histogram(histogram, weights)
 63 |                 for w in weights:
 64 |                     row.append('%d' % tmp[w])
 65 | 
 66 |                 # cycles
 67 |                 if cycles is None:
 68 |                     row.append('')
 69 |                     row.append('')
 70 |                 else:
 71 |                     row.append('%0.3f' % cycles[0])
 72 |                     row.append('%0.3f' % cycles[1])
 73 | 
 74 |                 # hwevents
 75 |                 row.append('%d' % hwevents.branches)
 76 |                 row.append('%d' % hwevents.branch_misses)
 77 |                 row.append('%0.2f%%' % (100.0 * hwevents.get_branch_miss_ratio()))
 78 |                 row.append('%d' % hwevents.cache_references)
 79 |                 row.append('%d' % hwevents.cache_misses)
 80 |                 row.append('%0.2f%%' % (100.0 * hwevents.get_cache_miss_ratio()))
 81 | 
 82 |                 t.add_row(row)
 83 | 
 84 |         return t
 85 | 
 86 | 
 87 |     def process_histogram(self, list, weights):
 88 | 
 89 |         assert(len(weights) > 0)
 90 | 
 91 |         get_count = lambda item: item[1]
 92 |         list.sort(key=get_count)
 93 |         total = sum(get_count(item) for item in list)
 94 | 
 95 |         result = {}
 96 |         for w in weights:
 97 |             result[w] = 0
 98 | 
 99 |         cumulative = 0
100 |         for k, (mask, count) in enumerate(list):
101 |             cumulative += count
102 |             proc = cumulative/float(total)
103 |             for w in result:
104 |                 if proc <= w:
105 |                     result[w] = k + 1
106 | 
107 |         return result
108 | 
109 | 
110 | def main():
111 |     report = Report()
112 | 
113 |     spanmaskhistogram   = sys.argv[1]
114 |     hwevents            = sys.argv[2]
115 |     microbenchmarks     = sys.argv[3]
116 |     output              = sys.argv[4]
117 |     restseparator       = sys.argv[5]
118 | 
119 |     for item in load(spanmaskhistogram, hwevents, microbenchmarks):
120 |         report.add(item)
121 | 
122 |     data = report.get()
123 | 
124 |     with open(output, 'wt') as f:
125 |         writer = RestWriter(f, data)
126 |         writer.write(restseparator)
127 | 
128 | 
129 | if __name__ == '__main__':
130 |     main()
131 | 
132 | 


--------------------------------------------------------------------------------
/include/avx512/avx512-parser-signed.h:
--------------------------------------------------------------------------------
  1 | #pragma once
  2 | 
  3 | #include <cassert>
  4 | #include <cstring>
  5 | 
  6 | #include <immintrin.h>
  7 | 
  8 | #include "scalar/scalar-parse-signed.h"
  9 | #include "sse/sse-parser-signed.h"
 10 | 
 11 | #include "block_info.h"
 12 | 
 13 | namespace avx512 {
 14 |     // Character classes stored in the table filled by prepare_lookup().
 15 |     enum Lookup: uint8_t {
 16 |         // bit 7 is set for digits and signs, bit 6 additionally for signs
 17 |         DIGIT   = 0x80,
 18 |         SIGN    = 0xc0,
 19 |         VALID   = 1,
 20 |         INVALID = 0
 21 |     };
 22 | 
 23 |     inline void prepare_lookup(const char* separators, uint8_t result[128]) {
 24 |         // Fills the 128-entry class table; `separators` is a NUL-terminated list.
 25 |         uint8_t* c = (uint8_t*)(separators);
 26 |         memset(result, INVALID, 128);
 27 | 
 28 |         for (int i='0'; i <= '9'; i++) {
 29 |             result[i] = DIGIT;
 30 |         }
 31 | 
 32 |         result['-'] = SIGN;
 33 |         result['+'] = SIGN;
 34 | 
 35 |         while (*c) {
 36 |             uint8_t x = *c++;
 37 |             if (x & 0x80) {
 38 |                 throw std::logic_error("extended ASCII is not supported");
 39 |             }
 40 | 
 41 |             switch (x) {
 42 |                 case '0': case '1': case '2':
 43 |                 case '3': case '4': case '5':
 44 |                 case '6': case '7': case '8':
 45 |                 case '9': case '+': case '-':
 46 |                     throw std::logic_error("digits and sign chars are reserved");
 47 |             }
 48 | 
 49 |             result[x] = VALID;
 50 |         }
 51 |     }
 52 | 
 53 |     template <typename INSERTER>
 54 |     void parser_signed(const char* string, size_t size, const char* separators, INSERTER output) {
 55 |         // Converts 64-byte blocks with SIMD code and hands the remaining tail to the scalar parser.
 56 |         char* data = const_cast<char*>(string);
 57 |         char* end  = data + size;
 58 | 
 59 |         uint8_t classes_lookup[128];
 60 |         prepare_lookup(separators, classes_lookup);
 61 | 
 62 |         const __m512i class_lo = _mm512_loadu_si512(reinterpret_cast<__m512i*>(&classes_lookup[0]));
 63 |         const __m512i class_hi = _mm512_loadu_si512(reinterpret_cast<__m512i*>(&classes_lookup[64]));
 64 |         while (data + 64 < end) {
 65 |             const __m512i input = _mm512_loadu_si512(reinterpret_cast<__m512i*>(data));
 66 |             const bool non_ascii = _mm512_movepi8_mask(input) != 0;
 67 |             const __m512i classes = _mm512_permutex2var_epi8(class_lo, input, class_hi);
 68 |             // the permutation ignores bit 7 of each index, so bytes >= 0x80 have to be rejected explicitly
 69 |             if (non_ascii || _mm512_test_epi8_mask(classes, classes) != uint64_t(-1)) {
 70 |                 throw std::logic_error("invalid character");
 71 |             }
 72 |             // one bit per input byte: DIGIT/SIGN (bit 7) form the span mask, SIGN (bit 6) the sign mask
 73 |             uint64_t span_mask64 = _mm512_movepi8_mask(classes);
 74 |             uint64_t sign_mask64 = _mm512_test_epi8_mask(classes, _mm512_set1_epi8(int8_t(0x40)));
 75 | 
 76 |             char* bufend = data + 64;
 77 |             while (data + 16 <= bufend) {
 78 |                 const uint16_t span_mask = span_mask64 & 0xffff;
 79 |                 const uint16_t sign_mask = sign_mask64 & 0xffff;
 80 |                 // the block descriptor is selected directly by the 16-bit span mask
 81 |                 const BlockInfo& bi = blocks[span_mask];
 82 |                 if (sign_mask & bi.invalid_sign_mask) {
 83 |                     throw std::runtime_error("'+' or '-' at invalid position");
 84 |                 }
 85 | 
 86 |                 const __m128i chunk = _mm_loadu_si128(reinterpret_cast<__m128i*>(data));
 87 | 
 88 |                 const __m128i shuffle_digits = _mm_loadu_si128((const __m128i*)bi.shuffle_digits);
 89 |                 const __m128i shuffle_signs  = _mm_loadu_si128((const __m128i*)bi.shuffle_signs);
 90 | 
 91 |                 const __m128i shuffled    = _mm_shuffle_epi8(chunk, shuffle_digits);
 92 |                 const __m128i negate_mask = _mm_cmpeq_epi8(_mm_shuffle_epi8(chunk, shuffle_signs), _mm_set1_epi8('-'));
 93 |                 if (bi.conversion_routine == Conversion::SSE1Digit) {
 94 | 
 95 |                     sse::convert_1digit(shuffled, bi.element_count, output);
 96 | 
 97 |                 } else if (bi.conversion_routine == Conversion::SSE2Digits) {
 98 | 
 99 |                     sse::convert_2digits_signed(shuffled, negate_mask, bi.element_count, output);
100 | 
101 |                 } else if (bi.conversion_routine == Conversion::SSE4Digits) {
102 | 
103 |                     sse::convert_4digits_signed(shuffled, negate_mask, bi.element_count, output);
104 | 
105 |                 } else if (bi.conversion_routine == Conversion::SSE8Digits) {
106 | 
107 |                     sse::convert_8digits_signed(shuffled, negate_mask, bi.element_count, output);
108 | 
109 |                 } else {
110 |                     // NOTE: with assertions disabled execution continues past this point
111 |                     printf("case %04x not handled yet\n", span_mask);
112 |                     assert(false);
113 |                 }
114 | 
115 |                 data += bi.total_skip;
116 |                 // drop the bits of the consumed bytes so both masks stay aligned with `data`
117 |                 span_mask64 >>= bi.total_skip;
118 |                 sign_mask64 >>= bi.total_skip;
119 |             }
120 | 
121 |         } // while
122 | 
123 |         // process the tail
124 |         scalar::parse_signed(data, string + size - data, separators, output);
125 |     }
126 | 
127 | } // namespace avx512
128 | 


--------------------------------------------------------------------------------
/test/benchmark.cpp:
--------------------------------------------------------------------------------
  1 | #include <vector>
  2 | #include <iterator>
  3 | #include <numeric>
  4 | #include <cstdio>
  5 | #include <cstdlib>
  6 | 
  7 | #include "time_utils.h"
  8 | #include "scalar/scalar-parse-unsigned.h"
  9 | #include "sse/sse-matcher.h"
 10 | #include "sse/sse-parser-unsigned.h"
 11 | #include "sse/sse-block-parser-unsigned.h"
 12 | #include "sse/sse-parser-signed.h"
 13 | #include "sse/sse-block-parser-signed.h"
 14 | 
 15 | #include "application.h"
 16 | 
 17 | class BenchmarkApp: public Application {
 18 |     // Times one of the signed-number parsers on generated input.
 19 |     using Vector = std::vector<int32_t>;
 20 | 
 21 |     enum class Procedure {
 22 |         Scalar,
 23 |         SSE,
 24 |         SSEBlock
 25 |     };
 26 |     // both members are set by custom_init() from the --procedure argument
 27 |     std::string procedure_name;
 28 |     Procedure procedure;
 29 | 
 30 | public:
 31 |     BenchmarkApp(int argc, char** argv);
 32 | 
 33 | protected:
 34 |     virtual bool custom_run() override;
 35 |     virtual void custom_init() override;
 36 |     virtual void print_custom_help() const override;
 37 | 
 38 | private:
 39 |     Vector result;      // output of the parser being measured
 40 |     std::string tmp;    // input text produced in custom_run()
 41 | 
 42 |     // Sum of all elements, accumulated in 64 bits so the total cannot overflow int.
 43 |     template <typename T>
 44 |     uint64_t sum(const T& vec) const {
 45 |         return std::accumulate(vec.begin(), vec.end(), uint64_t(0));
 46 |     }
 47 | 
 48 |     template <typename FUN>
 49 |     Clock::time_point::rep measure_time(FUN fun) {
 50 |         // Returns the shortest of get_loop_count() runs of `fun` (0 when there are no runs).
 51 |         Clock::time_point::rep min = 0;
 52 |         for (size_t i=0; i < get_loop_count(); i++) {
 53 |             result.clear();
 54 |             const auto t1 = Clock::now();
 55 |             fun();
 56 |             const auto t2 = Clock::now();
 57 | 
 58 |             const auto dt = elapsed(t1, t2);
 59 |             if (i == 0) {
 60 |                 min = dt;
 61 |             } else {
 62 |                 min = std::min(dt, min);
 63 |             }
 64 |         }
 65 | 
 66 |         return min;
 67 |     }
 68 | 
 69 | };
 70 | 
 71 | BenchmarkApp::BenchmarkApp(int argc, char** argv)
 72 |     : Application(argc, argv) {} // arguments are forwarded to the base class untouched
 73 | 
 74 | void BenchmarkApp::custom_init() {
 75 |     // Translates the value of --procedure into the Procedure enum.
 76 |     procedure_name = cmdline.get_value("--procedure", "");
 77 |     if (procedure_name.empty())
 78 |         throw ArgumentError("Procedure name must not be empty");
 79 | 
 80 |     const bool is_scalar = (procedure_name == "scalar");
 81 |     const bool is_sse    = (procedure_name == "sse");
 82 |     const bool is_block  = (procedure_name == "sse-block");
 83 |     if (!is_scalar && !is_sse && !is_block) {
 84 |         throw ArgumentError("Unknown procedure name. It must be: 'scalar', 'sse', 'sse-block'");
 85 |     }
 86 | 
 87 |     procedure = is_scalar ? Procedure::Scalar
 88 |                           : (is_sse ? Procedure::SSE : Procedure::SSEBlock);
 89 | }
 90 | 
 91 | // Prints the usage line for the option read by custom_init().
 92 | void BenchmarkApp::print_custom_help() const {
 93 |     puts("--procedure=NAME where name is 'scalar', 'sse' or 'sse-block'");
 94 | }
 95 | 
 96 | 
 97 | bool BenchmarkApp::custom_run() {
 98 |     // Generates signed input, times the selected parser and prints the outcome.
 99 |     tmp = generate_signed();
100 | 
101 |     Clock::time_point::rep time = 0;
102 | 
103 |     switch (procedure) {
104 |         case Procedure::Scalar:
105 |             time = measure_time([this] {
106 |                     scalar::parse_signed(tmp.data(), tmp.size(), get_separators_set().c_str(),
107 |                                          std::back_inserter(result));
108 |                 });
109 |             break;
110 | 
111 |         case Procedure::SSE:
112 |             time = measure_time([this] {
113 |                     sse::NaiveMatcher<8> matcher(get_separators_set().c_str());
114 |                     sse::parser_signed(
115 |                         tmp.data(),
116 |                         tmp.size(),
117 |                         get_separators_set().c_str(),
118 |                         std::move(matcher),
119 |                         std::back_inserter(result));
120 |                 });
121 |             break;
122 | 
123 |         case Procedure::SSEBlock:
124 |             time = measure_time([this] {
125 |                     sse::NaiveMatcher<8> matcher(get_separators_set().c_str());
126 |                     sse::parser_block_signed(
127 |                         tmp.data(),
128 |                         tmp.size(),
129 |                         get_separators_set().c_str(),
130 |                         std::move(matcher),
131 |                         std::back_inserter(result));
132 |                 });
133 |             break;
134 | 
135 |         default:
136 |             // custom_init() stores only the three values handled above, so this
137 |             // branch cannot be taken; the assert reports a violation in debug builds
138 |             assert(false && "unhandled procedure");
139 |             __builtin_unreachable();
140 |     }
141 | 
142 |     printf("input size : %lu\n", get_size());
143 |     printf("loops      : %lu\n", get_loop_count());
144 |     printf("procedure  : %s\n", procedure_name.c_str());
145 |     printf("time       : %ld us\n", time);
146 |     // this prevents compiler from optimizing out the benchmark loop
147 |     printf("reference results: %lu\n", sum(result));
148 | 
149 |     return true;
150 | }
151 | 
152 | 
153 | int main(int argc, char* argv[]) {
154 |     // std::exception is reported and fails the run; Application::Exit ends it successfully.
155 |     int status = EXIT_SUCCESS;
156 |     try {
157 |         BenchmarkApp app(argc, argv);
158 |         app.run();
159 |     } catch (std::exception& e) {
160 |         printf("%s\n", e.what());
161 |         status = EXIT_FAILURE;
162 |     } catch (Application::Exit&) {
163 |         status = EXIT_SUCCESS;
164 |     }
165 | 
166 |     return status;
167 | }
168 | 
169 | 


--------------------------------------------------------------------------------
/include/sse/sse-block-parser-signed.h:
--------------------------------------------------------------------------------
  1 | #pragma once
  2 | 
  3 | #include <vector>
  4 | #include <cassert>
  5 | 
  6 | #include "scalar/scalar-parse-signed.h"
  7 | #include "sse-utils.h"
  8 | #include "sse-convert.h"
  9 | #include "sse-parser-common.h"
 10 | #include "sse-parser-statistics.h"
 11 | #include "block_info.h"
 12 | 
 13 | namespace sse {
 14 |     
 15 |     namespace detail {
 16 |         // Pair of 64-bit masks returned by prepare_masks().
 17 |         struct result_type {
 18 |             uint64_t span_mask;
 19 |             uint64_t sign_mask;
 20 |         };
 21 |         // Classifies the 64 bytes at `data`; throws std::runtime_error when the validity mask is not all-ones.
 22 |         template <typename MATCHER>
 23 |         result_type prepare_masks(char* data, MATCHER matcher) {
 24 |             const __m128i input0 = _mm_loadu_si128(reinterpret_cast<__m128i*>(data + 0*16));
 25 |             const __m128i input1 = _mm_loadu_si128(reinterpret_cast<__m128i*>(data + 1*16));
 26 |             const __m128i input2 = _mm_loadu_si128(reinterpret_cast<__m128i*>(data + 2*16));
 27 |             const __m128i input3 = _mm_loadu_si128(reinterpret_cast<__m128i*>(data + 3*16));
 28 | 
 29 |             const __m128i bytemask_digit0 = decimal_digits_mask(input0);
 30 |             const __m128i bytemask_digit1 = decimal_digits_mask(input1);
 31 |             const __m128i bytemask_digit2 = decimal_digits_mask(input2);
 32 |             const __m128i bytemask_digit3 = decimal_digits_mask(input3);
 33 | 
 34 |             const __m128i bytemask_sign0  = sign_mask(input0);
 35 |             const __m128i bytemask_sign1  = sign_mask(input1);
 36 |             const __m128i bytemask_sign2  = sign_mask(input2);
 37 |             const __m128i bytemask_sign3  = sign_mask(input3);
 38 |             // a span byte is either a digit or a sign
 39 |             const __m128i bytemask_span0  = _mm_or_si128(bytemask_digit0, bytemask_sign0);
 40 |             const __m128i bytemask_span1  = _mm_or_si128(bytemask_digit1, bytemask_sign1);
 41 |             const __m128i bytemask_span2  = _mm_or_si128(bytemask_digit2, bytemask_sign2);
 42 |             const __m128i bytemask_span3  = _mm_or_si128(bytemask_digit3, bytemask_sign3);
 43 |             // AND of the matcher's masks for all four chunks (get_mask() is defined by MATCHER)
 44 |             const __m128i bytemask_valid = _mm_and_si128(matcher.get_mask(input0, bytemask_span0),
 45 |                                            _mm_and_si128(matcher.get_mask(input1, bytemask_span1),
 46 |                                            _mm_and_si128(matcher.get_mask(input2, bytemask_span2),
 47 |                                                          matcher.get_mask(input3, bytemask_span3))));
 48 | 
 49 |             if (_mm_movemask_epi8(bytemask_valid) != 0xffff) {
 50 |                 throw std::runtime_error("Wrong character");
 51 |             }
 52 |             // NOTE(review): compose_bitmask presumably packs the four byte masks into one bit per input byte - confirm
 53 |             result_type res;
 54 |             res.sign_mask = compose_bitmask(bytemask_sign0,
 55 |                                             bytemask_sign1,
 56 |                                             bytemask_sign2,
 57 |                                             bytemask_sign3);
 58 | 
 59 |             res.span_mask = compose_bitmask(bytemask_span0,
 60 |                                             bytemask_span1,
 61 |                                             bytemask_span2,
 62 |                                             bytemask_span3);
 63 |             return res;
 64 |         }
 65 | 
 66 |     } // namespace detail
 67 | 
 68 |     template <typename MATCHER, typename INSERTER>
 69 |     void parser_block_signed(
 70 |         const char* string,
 71 |         size_t size,
 72 |         const char* separators,
 73 |         MATCHER matcher,
 74 |         INSERTER output) {
 75 |         // Main loop works on 64-byte blocks; whatever remains is parsed by the scalar routine.
 76 | 
 77 |         char* data = const_cast<char*>(string);
 78 |         char* end  = data + size;
 79 |         // strict '<': a block is taken only while more than 64 bytes remain
 80 |         while (data + 4*16 < end) {
 81 |             detail::result_type res = detail::prepare_masks(data, matcher);
 82 |             // only positions within the first 48 bytes start a step, so a 16-bit window of each mask is available
 83 |             char* loopend = data + 3*16;
 84 |             while (data < loopend) {
 85 |                 const uint16_t span_mask = res.span_mask & 0xffff;
 86 |                 if (span_mask == 0) {
 87 |                     res.span_mask >>= 16;
 88 |                     res.sign_mask >>= 16;
 89 |                     data += 16;
 90 |                     // no digit or sign in these 16 bytes, nothing to convert
 91 |                     continue;
 92 | 
 93 |                 }
 94 |                 // the block descriptor is selected directly by the 16-bit span mask
 95 |                 const BlockInfo& bi = blocks[span_mask];
 96 |                 const uint16_t sign_mask = res.sign_mask & 0xffff;
 97 |                 if (sign_mask & bi.invalid_sign_mask) {
 98 |                     throw std::runtime_error("'+' or '-' at invalid position");
 99 |                 }
100 |                 // windows without any sign character go through the unsigned routine
101 |                 const __m128i input = _mm_loadu_si128(reinterpret_cast<__m128i*>(data));
102 |                 char* prevdata = data;
103 |                 if (sign_mask == 0) {
104 |                     data = detail::parse_unsigned(bi, input, data, end, output);
105 |                 } else {
106 |                     data = detail::parse_signed(bi, input, data, end, output);
107 |                 }
108 | 
109 |                 if (data == end) {
110 |                     break;
111 |                 }
112 |                 // drop the bits of the bytes just consumed so the masks stay aligned with `data`
113 |                 const int shift = data - prevdata;
114 |                 res.span_mask >>= shift;
115 |                 res.sign_mask >>= shift;
116 |             } // inner while
117 |             
118 |         } // while
119 | 
120 |         // process the tail
121 |         scalar::parse_signed(data, string + size - data, separators, output);
122 |     }
123 | 
124 | } // namespace sse
125 | 


--------------------------------------------------------------------------------
/src/sse-parser-statistics.cpp:
--------------------------------------------------------------------------------
  1 | #include "sse/sse-parser-statistics.h"
  2 | #include <vector>
  3 | #include <algorithm>
  4 | #include <cassert>
  5 | 
  6 | sse::Statistics sse::stats; // a global object
  7 | 
  8 | namespace {
  9 |     // Prints every skip length with its count and its share of all counts.
 10 |     void print_skip_histogram(FILE* file, const sse::Statistics& stats) {
 11 |         size_t sum = 0;
 12 |         for (const auto& item: stats.total_skip_histogram) {
 13 |             sum += item.second;
 14 |         }
 15 | 
 16 |         for (const auto& item: stats.total_skip_histogram) {
 17 |             const int    skip  = item.first;
 18 |             const size_t count = item.second;
 19 | 
 20 |             fprintf(file, "* process %2d byte(s): %5lu (%5.2f%%)\n", skip, count, 100.0*count/sum);
 21 |         }
 22 |     }
 23 | 
 24 |     struct span_histogram_entry {
 25 |         uint16_t mask;
 26 |         size_t   count;
 27 | 
 28 |         span_histogram_entry(uint16_t mask_, size_t count_)
 29 |             : mask(mask_)
 30 |             , count(count_) {}
 31 |     };
 32 | 
 33 |     using span_histogram = std::vector<span_histogram_entry>;
 34 |     // Prints the span masks ordered by ascending count, with individual and cumulative shares.
 35 |     void print_span_mask_histogram(FILE* file, const sse::Statistics& stats) {
 36 |         size_t sum = 0;
 37 |         span_histogram histogram;
 38 |         for (const auto& item: stats.span_masks_histogram) {
 39 |             sum += item.second;
 40 |             histogram.emplace_back(item.first, item.second);
 41 |         }
 42 | 
 43 |         std::sort(histogram.begin(), histogram.end(),
 44 |                   [](const span_histogram_entry& a, const span_histogram_entry& b){return a.count < b.count;});
 45 |         // the header goes to `file` like every other line of the report
 46 |         fprintf(file, "Span mask histogram (%lu entries)\n", stats.span_masks_histogram.size());
 47 |         size_t cumulative = 0;
 48 |         size_t id = 0;
 49 |         for (const auto& item: histogram) {
 50 |             cumulative += item.count;
 51 |             fprintf(file, "%5lu 0x%02x: %5lu (%5.2f%%; cumulative %5.2f%%)\n",
 52 |                     id++,
 53 |                     item.mask, item.count,
 54 |                     100.0*item.count/sum,
 55 |                     100.0*cumulative/sum);
 56 |         }
 57 |     }
 58 |     // Prints call and conversion counters of one routine, or "none" when both are zero.
 59 |     void print_sse_statistics(FILE* file, const char* title, size_t calls, size_t converted) {
 60 | 
 61 |         fprintf(file, "* %s:", title);
 62 |         if (calls == 0 && converted == 0) {
 63 |             fprintf(file, " none\n");
 64 |             return;
 65 |         } else {
 66 |             fprintf(file, "\n");
 67 |         }
 68 | 
 69 |         fprintf(file, "  - calls:           %8lu\n", calls);
 70 |         fprintf(file, "  - converted nums:  %8lu\n", converted);
 71 |         fprintf(file, "  - conversion/call: ");
 72 |         if (calls > 0) {
 73 |             fprintf(file, "%11.2f", converted/double(calls));
 74 |         } else {
 75 |             fprintf(file, "-");
 76 |         }
 77 |         fprintf(file, "\n");
 78 |     }
 79 | 
 80 | }
 81 | 
 82 | void sse::Statistics::print(FILE* file) const {
 83 |     fprintf(file, "SSE parser statistics\n");
 84 |     fprintf(file, "loops                   : %8lu\n", loops);
 85 |     fprintf(file, "total numbers converted : %8lu\n", get_all_converted());
 86 |     fprintf(file, "scalar conversions      : %8lu\n", get_scalar_conversions());
 87 | 
 88 |     const double perc_total = 100.0*get_SSE_converted()/get_all_converted();
 89 |     fprintf(file, "all converted by SSE    : %8lu (%0.2f%%)\n", get_SSE_converted(), perc_total);
 90 |     fprintf(file, " - by unsinged routines : %8lu\n", unsigned_path.get_SSE_converted());
 91 |     fprintf(file, " - by singed routines   : %8lu\n", signed_path.get_SSE_converted());
 92 |     print_skip_histogram(file, *this);
 93 | 
 94 |     print_sse_statistics(file, "1-digit vector conversions (unsigned)", unsigned_path.digit1_calls, unsigned_path.digit1_converted);
 95 |     print_sse_statistics(file, "2-digit vector conversions (unsigned)", unsigned_path.digit2_calls, unsigned_path.digit2_converted);
 96 |     print_sse_statistics(file, "2-digit vector conversions (signed)",   signed_path.digit2_calls,   signed_path.digit2_converted);
 97 |     print_sse_statistics(file, "3-digit vector conversions (unsigned)", unsigned_path.digit3_calls, unsigned_path.digit3_converted);
 98 |     print_sse_statistics(file, "3-digit vector conversions (signed)",   signed_path.digit3_calls,   signed_path.digit3_converted);
 99 |     print_sse_statistics(file, "4-digit vector conversions (unsigned)", unsigned_path.digit4_calls, unsigned_path.digit4_converted);
100 |     print_sse_statistics(file, "4-digit vector conversions (signed)",   signed_path.digit4_calls,   signed_path.digit4_converted);
101 |     print_sse_statistics(file, "8-digit vector conversions (unsigned)", unsigned_path.digit8_calls, unsigned_path.digit8_converted);
102 |     print_sse_statistics(file, "8-digit vector conversions (signed)",   signed_path.digit8_calls,   signed_path.digit8_converted);
103 | 
104 |     print_span_mask_histogram(file, *this);
105 | }
106 | 
107 | sse::Statistics::Statistics() {
108 |     for (int i=0; i <= 16; i++)
109 |         total_skip_histogram[i] = 0;
110 | }
111 | 
112 | void sse::Statistics::span_mask_histogram_to_csv(FILE* file) const {
113 |     assert(file != nullptr);
114 | 
115 |     for (const auto& item: stats.span_masks_histogram) {
116 |         const uint16_t mask  = item.first;
117 |         const size_t   count = item.second;
118 |         fprintf(file, "%02x, %lu\n", mask, count);
119 |     }
120 | }
121 | 


--------------------------------------------------------------------------------
/include/test/benchmark.h:
--------------------------------------------------------------------------------
 1 | #ifndef _BENCHMARK_H_
 2 | #define _BENCHMARK_H_
 3 | 
 4 | #include <stdint.h>
 5 | #define RDTSC_START(cycles)                                             \
 6 |     do {                                                                \
 7 |         uint32_t cyc_high, cyc_low;                                     \
 8 |         __asm volatile("cpuid\n"                                        \
 9 |                        "rdtsc\n"                                        \
10 |                        "mov %%edx, %0\n"                                \
11 |                        "mov %%eax, %1" :                                \
12 |                        "=r" (cyc_high),                                 \
13 |                        "=r"(cyc_low) :                                  \
14 |                        : /* no read only */                             \
15 |                        "%rax", "%rbx", "%rcx", "%rdx" /* clobbers */    \
16 |                        );                                               \
17 |         (cycles) = ((uint64_t)cyc_high << 32) | cyc_low;                \
18 |     } while (0)
19 | 
20 | #define RDTSC_STOP(cycles)                                              \
21 |     do {                                                                \
22 |         uint32_t cyc_high, cyc_low;                                     \
23 |         __asm volatile("rdtscp\n"                                       \
24 |                        "mov %%edx, %0\n"                                \
25 |                        "mov %%eax, %1\n"                                \
26 |                        "cpuid" :                                        \
27 |                        "=r"(cyc_high),                                  \
28 |                        "=r"(cyc_low) :                                  \
29 |                        /* no read only registers */ :                   \
30 |                        "%rax", "%rbx", "%rcx", "%rdx" /* clobbers */    \
31 |                        );                                               \
32 |         (cycles) = ((uint64_t)cyc_high << 32) | cyc_low;                \
33 |     } while (0)
34 | 
35 | static __attribute__ ((noinline))
36 | uint64_t rdtsc_overhead_func(uint64_t dummy) {
37 |     return dummy;
38 | }
39 | 
40 | uint64_t global_rdtsc_overhead = (uint64_t) UINT64_MAX;
41 | 
42 | #define RDTSC_SET_OVERHEAD(test, repeat)                            \
43 |   do {                                                              \
44 |     uint64_t cycles_start, cycles_final, cycles_diff;               \
45 |     uint64_t min_diff = UINT64_MAX;                                 \
46 |     for (unsigned i = 0; i < repeat; i++) {                         \
47 |       __asm volatile("" ::: /* pretend to clobber */ "memory");     \
48 |       RDTSC_START(cycles_start);                                    \
49 |       test;                                                         \
50 |       RDTSC_STOP(cycles_final);                                     \
51 |       cycles_diff = (cycles_final - cycles_start);                  \
52 |       if (cycles_diff < min_diff) min_diff = cycles_diff;           \
53 |     }                                                               \
54 |     global_rdtsc_overhead = min_diff;                               \
55 |     printf("rdtsc_overhead set to %d\n", (int)global_rdtsc_overhead);     \
56 |   } while (0)                                                       \
57 | 
58 | 
59 | /*
60 |  * Prints the best number of operations per cycle where
61 |  * test is the function call, answer is the expected answer generated by
62 |  * test, repeat is the number of times we should repeat and size is the
63 |  * number of operations represented by test.
64 |  */
65 | #define BEST_TIME(pre, test, test_name, repeat, size)                   \
66 |     do {                                                                \
67 |         if (global_rdtsc_overhead == UINT64_MAX) {                      \
68 |            RDTSC_SET_OVERHEAD(rdtsc_overhead_func(1), repeat);          \
69 |         }                                                               \
70 |         printf("%-30s\t: ", test_name); fflush(stdout);                 \
71 |         uint64_t cycles_start, cycles_final, cycles_diff;               \
72 |         uint64_t min_diff = (uint64_t)-1;                               \
73 |         uint64_t sum_diff = 0;                                          \
74 |         for (size_t i = 0; i < repeat; i++) {                           \
75 |             pre;                                                        \
76 |             __asm volatile("" ::: /* pretend to clobber */ "memory");   \
77 |             RDTSC_START(cycles_start);                                  \
78 |             test;                                                       \
79 |             RDTSC_STOP(cycles_final);                                   \
80 |             cycles_diff = (cycles_final - cycles_start - global_rdtsc_overhead); \
81 |             if (cycles_diff < min_diff) min_diff = cycles_diff;         \
82 |             sum_diff += cycles_diff;                                    \
83 |         }                                                               \
84 |         uint64_t S = size;                                              \
85 |         float cycle_per_op = (min_diff) / (double)S;                    \
86 |         float avg_cycle_per_op = (sum_diff) / ((double)S * repeat);     \
87 |         printf(" %8.3f cycle/op (best) %8.3f cycle/op (avg)\n", cycle_per_op, avg_cycle_per_op); \
88 |  } while (0)
89 | 
90 | #endif
91 | 


--------------------------------------------------------------------------------
/test/unittest/verify_sse_unsigned_parser.cpp:
--------------------------------------------------------------------------------
  1 | #include <vector>
  2 | #include <iterator>
  3 | #include <algorithm>
  4 | #include <cstdio>
  5 | #include <cstdlib>
  6 | #include <cctype>
  7 | 
  8 | #include "block_info.h"
  9 | #include "scalar/scalar-parse-unsigned.h"
 10 | #include "scalar/scalar-parse-signed.h"
 11 | #include "sse/sse-convert.h"
 12 | #include "sse/sse-matcher.h"
 13 | #include "sse/sse-parser-unsigned.h"
 14 | 
 15 | 
 16 | class VerifyUnsignedParser {
 17 | 
 18 |     static const char* separators;
 19 | 
 20 |     static const int SIZE = 16 * 4;
 21 |     char buffer[SIZE + 1];
 22 |     std::vector<uint32_t> result;
 23 | 
 24 | public:
 25 |     VerifyUnsignedParser() {}
 26 | 
 27 |     bool run() {
 28 |         try {
 29 |             do_run();
 30 |             puts("All OK");
 31 |             return true;
 32 |         } catch (std::exception& e) {
 33 |             printf("failed: %s\n", e.what());
 34 |             dump();
 35 |             return false;
 36 |         }
 37 |     }
 38 | 
 39 | private:
 40 |     void do_run() {
 41 |         {
 42 |             printf("test 1... "); fflush(stdout);
 43 |             size_t cases = 0;
 44 |             for (int i=8; i <= 8; i++) {
 45 |                 cases += verify1number(i);
 46 |             }
 47 |             printf(" %lu cases chcecked\n", cases);
 48 |         }
 49 | 
 50 |         {
 51 |             printf("test 2... "); fflush(stdout);
 52 |             size_t cases = 0;
 53 |             for (int i=1; i <= 8; i++) {
 54 |                 for (int j=1; j <= 8; j++) {
 55 |                     cases += verify2numbers(i, j);
 56 |                 }
 57 |             }
 58 |             printf(" %lu cases chcecked\n", cases);
 59 |         }
 60 |     }
 61 |     
 62 | private:
 63 |     size_t verify1number(const size_t digits) {
 64 |         assert(digits > 0);
 65 |         assert(digits <= 8);
 66 | 
 67 |         size_t cases = 0;
 68 |         uint32_t reference = test_number(digits);
 69 |         for (size_t i=0; i < SIZE; i++) {
 70 |             clear();
 71 |             if (!put_number(i, digits)) break;
 72 |             cases += 1;
 73 | 
 74 |             sse::NaiveMatcher<8> matcher('_');
 75 |             result.clear();
 76 |             sse::parser(buffer, SIZE, separators, matcher, std::back_inserter(result));
 77 | 
 78 |             if (result.size() != 1) {
 79 |                 throw std::logic_error("size must be 1");
 80 |             }
 81 | 
 82 |             if (result[0] != reference) {
 83 |                 printf("result = %u, expected = %u\n", result[0], reference);
 84 |                 throw std::logic_error("wrong value");
 85 |             }
 86 |         }
 87 | 
 88 |         return cases;
 89 |     }
 90 | 
 91 |     size_t verify2numbers(int digits1, int digits2) {
 92 |         assert(digits1 > 0);
 93 |         assert(digits1 <= 8);
 94 |         assert(digits2 > 0);
 95 |         assert(digits2 <= 8);
 96 | 
 97 |         size_t cases = 0;
 98 | 
 99 |         const uint32_t reference1 = test_number(digits1);
100 |         const uint32_t reference2 = test_number(digits2);
101 | 
102 |         for (int i = 0; i < SIZE; i++) {
103 |             for (int j = i; j < SIZE; j++) {
104 |                 clear();
105 |                 if (!put_number(i, digits1)) continue;
106 |                 if (!put_number(j, digits2)) continue;
107 |                 cases += 1;
108 | 
109 |                 sse::NaiveMatcher<8> matcher('_');
110 |                 result.clear();
111 |                 sse::parser(buffer, SIZE, separators, matcher, std::back_inserter(result));
112 | 
113 |                 if (result.size() != 2) {
114 |                     throw std::logic_error("size must be 2");
115 |                 }
116 | 
117 |                 const bool e1 = (result[0] == reference1 && result[1] == reference2);
118 |                 const bool e2 = (result[0] == reference2 && result[1] == reference1);
119 |                 if (!(e1 || e2)) {
120 |                     printf("result = %u, %u, expected = %u, %u\n", result[0], result[1], reference1, reference2);
121 |                     throw std::logic_error("invalid value");
122 |                 }
123 |             }
124 |         }
125 | 
126 |         return cases;
127 |     }
128 | 
129 |     bool put_number(int offset, int digits) {
130 |         for (int i=0; i < digits; i++) {
131 |             if (offset + i >= SIZE) {
132 |                 return false;
133 |             }
134 | 
135 |             if (!is_free(offset + i)) return false;
136 |         }
137 | 
138 |         if (!is_free(offset - 1)) return false;
139 |         if (!is_free(offset + digits)) return false;
140 | 
141 |         for (int i=0; i < digits; i++) {
142 |             buffer[offset + i] = ((i + 1) % 10) + '0';
143 |         }
144 |         return true;
145 |     }
146 | 
147 |     bool is_free(int index) const {
148 |         if (index < 0) return true;
149 |         if (index >= SIZE) return true;
150 | 
151 |         return buffer[index] == '_';
152 |     }
153 | 
154 |     uint64_t test_number(int digits) {
155 |         uint64_t x = 0;
156 |         for (int i=0; i < digits; i++) {
157 |             x = 10 * x + (i + 1) % 10;
158 |         }
159 | 
160 |         return x;
161 |     }
162 | 
163 |     void clear() {
164 |         memset(buffer, '_', SIZE);
165 |         buffer[SIZE] = 0;
166 |     }
167 | 
168 |     void dump() {
169 |         puts(buffer);
170 |     }
171 | 
172 |     void dump(const std::vector<uint32_t>& vec) {
173 |         printf("size = %lu: [", vec.size());
174 | 
175 |         const size_t n = vec.size();
176 |         if (n) {
177 |             printf("%u", vec[0]);
178 |         }
179 | 
180 |         for (size_t i=1; i < n; i++) {
181 |             printf(", %u", vec[i]);
182 |         }
183 | 
184 |         printf("]\n");
185 |     }
186 | 
187 | };
188 | 
189 | const char* VerifyUnsignedParser::separators = "_";
190 | 
191 | 
192 | int main() {
193 | 
194 |     puts("Verify SSE unsigned parser");
195 |     VerifyUnsignedParser verify;
196 |     if (!verify.run()) {
197 |        return EXIT_FAILURE;
198 |     }
199 | 
200 |     return EXIT_SUCCESS;
201 | }
202 | 
203 | 


--------------------------------------------------------------------------------
/test/utils/application.cpp:
--------------------------------------------------------------------------------
  1 | #include "application.h"
  2 | 
  3 | #include "input_generator.h"
  4 | #include "time_utils.h"
  5 | 
  6 | #include <set>
  7 | #include <cassert>
  8 | 
  9 | namespace {
 10 | 
 11 |     std::vector<long> parse_array(const std::string& str) {
 12 |         char* c;
 13 |         const char* s = str.c_str();
 14 | 
 15 |         std::vector<long> result;
 16 |         while (true) {
 17 |             const long tmp = strtol(s, &c, 10);
 18 |             if (*c == ',') {
 19 |                 result.push_back(tmp);
 20 |                 s = c + 1;
 21 |             } else if (*c == '\0') {
 22 |                 if (c != s) {
 23 |                     result.push_back(tmp);
 24 |                 }
 25 |                 break;
 26 |             } else {
 27 |                 throw std::logic_error("Invalid character '" + std::string(1, *c) + "' in string \"" + str + "\"");
 28 |             }
 29 |         }
 30 | 
 31 |         if (result.empty()) {
 32 |             throw std::logic_error("Expected at least one number");
 33 |         }
 34 | 
 35 |         return result;
 36 |     }
 37 | 
 38 |     std::string parse_separators(const std::string& s) {
 39 |         std::set<char> set;
 40 |         static const std::string reserved_chars{"0123456789+-"};
 41 | 
 42 |         for (char c: s) {
 43 |             set.insert(c);
 44 |         }
 45 | 
 46 |         const bool empty     = set.empty();
 47 |         const bool too_large = set.size() > 16;
 48 |         bool invalid_chars = false;
 49 |         for (char c: reserved_chars) {
 50 |             if (set.count(c)) {
 51 |                 invalid_chars = true;
 52 |                 break;
 53 |             }
 54 |         }
 55 | 
 56 |         if (empty || too_large || invalid_chars) {
 57 |             throw Application::ArgumentError
 58 |                     ("Separators must be a non empty, up to 16 chars set; "
 59 |                      "forbidden chars are: '0'..'9', '+' and '-'.");
 60 |         }
 61 | 
 62 |         return std::string{set.begin(), set.end()};
 63 |     }
 64 | 
 65 | } // namespace unnamed
 66 | 
 67 | 
 68 | Application::Application(int argc, char* argv[])
 69 |     : cmdline(argc, argv)
 70 |     , quiet(false)
 71 |     , rd()
 72 |     , random(rd()) {}
 73 | 
 74 | 
 75 | bool Application::run() {
 76 |     init();
 77 |     custom_init();
 78 |     return custom_run();
 79 | }
 80 | 
 81 | 
 82 | void Application::init() {
 83 | 
 84 |     if (cmdline.empty() || cmdline.has_flag("-h") || cmdline.has_flag("--help")) {
 85 |         print_help();
 86 |         throw Application::Exit();
 87 |     }
 88 | 
 89 |     auto to_int = [](const std::string& val) {
 90 |         return std::stol(val);
 91 |     };
 92 | 
 93 |     size            = cmdline.parse_value<size_t>("--size", to_int);
 94 |     debug_size      = cmdline.parse_value<size_t>("--debug", to_int, 0);
 95 |     loop_count      = cmdline.parse_value<size_t>("--loops", to_int, 1);
 96 |     separators_set  = cmdline.parse_value<std::string>("--separators", parse_separators, ",; ");
 97 | 
 98 |     const auto seed = cmdline.parse_value("--seed", to_int, 0);
 99 |     random.seed(seed);
100 | 
101 |     {
102 |         const auto arr = cmdline.parse_value<std::vector<long>>("--num", parse_array);
103 |         distribution.numbers = std::discrete_distribution<>(arr.begin(), arr.end());
104 |     }
105 |     {
106 |         const auto arr = cmdline.parse_value<std::vector<long>>("--sep", parse_array, {1});
107 |         distribution.separators = std::discrete_distribution<>(arr.begin(), arr.end());
108 |     }
109 | 
110 |     if (cmdline.has_value("--sign")) {
111 |         const auto arr = cmdline.parse_value<std::vector<long>>("--sign", parse_array, {});
112 |         if (arr.size() != 3) {
113 |             throw std::logic_error("--sign expects exactly three-item distribution, like --sign=5,2,1");
114 |         }
115 |         distribution.sign = std::discrete_distribution<>(arr.begin(), arr.end());
116 |         sign_nonnull = true;
117 |     } else {
118 |         sign_nonnull = false;
119 |     }
120 | }
121 | 
122 | std::string Application::generate_unsigned() {
123 | 
124 |     std::string tmp;
125 | 
126 |     const std::string msg = (quiet) ? "" : "generating random unsigned numbers ";
127 |     measure_time(msg, [&tmp, this]{
128 |         tmp = ::generate_unsigned(
129 |                     size,
130 |                     get_separators_set(),
131 |                     random,
132 |                     distribution.numbers,
133 |                     distribution.separators);
134 |     });
135 |     assert(tmp.size() == size);
136 | 
137 |     if (!quiet && debug_size > 0) {
138 |         printf("first %lu bytes of the data:\n", debug_size);
139 |         fwrite(tmp.data(), debug_size, 1, stdout);
140 |         putchar('\n');
141 |     }
142 | 
143 |     return tmp;
144 | }
145 | 
146 | std::string Application::generate_signed() {
147 | 
148 |     std::string tmp;
149 | 
150 |     const std::string msg = (quiet) ? "" : "generating random signed numbers ";
151 |     measure_time(msg, [&tmp, this]{
152 |         tmp = ::generate_signed(
153 |                     size,
154 |                     get_separators_set(),
155 |                     random,
156 |                     distribution.numbers,
157 |                     distribution.separators,
158 |                     distribution.sign);
159 |     });
160 |     assert(tmp.size() == size);
161 | 
162 |     if (!quiet && debug_size > 0) {
163 |         printf("first %lu bytes of the data:\n", debug_size);
164 |         fwrite(tmp.data(), debug_size, 1, stdout);
165 |         putchar('\n');
166 |     }
167 | 
168 |     return tmp;
169 | }
170 | 
171 | void Application::print_help() const {
172 |     printf("Usage: %s [OPTIONS]\n", cmdline.get_program_name().c_str());
173 |     puts("");
174 |     puts("options are");
175 |     puts("");
176 |     puts("--size=NUMBER         input size (in bytes)");
177 |     puts("--loops=NUMBER        how many times a test must be repeated [default: 1]");
178 |     puts("--seed=NUMBER         seed for random number generator [default: 0]");
179 |     puts("--num=DISTRIBUTION    distribution of lengths of numbers");
180 |     puts("--sep=DISTRIBUTION    distribution of lengths of gaps between numbers [default: '1']");
181 |     puts("--separators=string   list of separator characters [default: \",; \"]");
182 |     puts("--sign=DISTRIBUTION   distribution of sign in front of number [default: '1']");
183 |     puts("--debug=K             prints K first bytes of generated input [default: 0]");
184 |     puts("");
185 |     puts("Distribution is given as a list of comma-separated values.");
186 |     puts("For --num and --sep the list length is unbound, for --sign it");
187 |     puts("must have exactly three items.");
188 | 
189 |     puts("");
190 |     print_custom_help();
191 | }
192 | 
193 | 
194 | void Application::custom_init() {
195 |     // do nothing
196 | }
197 | 
198 | 
199 | void Application::print_custom_help() const {
200 |     // do nothing
201 | }
202 | 


--------------------------------------------------------------------------------
/scripts/generator.py:
--------------------------------------------------------------------------------
  1 | import sys
  2 | from writer import CPPWriter as Writer
  3 | 
  4 | SIMD_ELEMENT_SIZES = [1, 2, 4, 8, 16]
  5 | 
  6 | class DigitsSpan(object):
  7 |     # range: [first, last] - include the both ends
  8 |     def __init__(self, first, last):
  9 |         assert first <= last
 10 | 
 11 |         self.first = first
 12 |         self.last  = last
 13 | 
 14 |         assert self.digits() <= 16
 15 | 
 16 | 
 17 |     def digits(self):
 18 |         return self.last - self.first + 1
 19 | 
 20 | 
 21 |     def simd_size(self):
 22 |         d = self.digits()
 23 |         if d == 1:
 24 |             return 1
 25 |         if d == 2:
 26 |             return 2
 27 |         if d <= 4:
 28 |             return 4
 29 |         if d <= 8:
 30 |             return 8
 31 |         return 16;
 32 | 
 33 |     def __str__(self):
 34 |         return "<%d,%d>" % (self.first, self.last)
 35 | 
 36 |     __repr__ = __str__
 37 | 
 38 | DIGIT = 'd'
 39 | SPACE = '_'
 40 | 
 41 | class Parser(object):
 42 |     def __init__(self, number):
 43 |         assert number >= 0
 44 |         assert number < 65536
 45 | 
 46 |         self.number = number
 47 |         self.image = self.__convert_to_string(number)
 48 | 
 49 | 
 50 |     def get_spans(self):
 51 |         prev  = SPACE
 52 |         start = None
 53 |         spans = []
 54 |         for i, c in enumerate(self.image):
 55 |             if c == prev:
 56 |                 continue
 57 | 
 58 |             if c == DIGIT: # transition
 59 |                 start = i
 60 |             else:
 61 |                 # Note: a digits span which not ends within the chunk
 62 |                 #       doesn't appear in the result (we don't know how to parse it)
 63 |                 spans.append(DigitsSpan(start, i - 1))
 64 |                 start = None
 65 | 
 66 |             prev = c
 67 | 
 68 |         if start is not None:
 69 |             incomplete = [DigitsSpan(start, 15)]
 70 |         else:
 71 |             incomplete = []
 72 | 
 73 |         return (spans, incomplete)
 74 | 
 75 | 
 76 |     def __convert_to_string(self, x):
 77 |         s = ''
 78 |         for i in range(16):
 79 |             if x & (1 << i):
 80 |                 s += DIGIT
 81 |             else:
 82 |                 s += SPACE
 83 | 
 84 |         return s
 85 | 
 86 | 
 87 | class Optimizer(object):
 88 |     def __init__(self, spans):
 89 |         self.spans = spans
 90 | 
 91 | 
 92 |     def get_best(self):
 93 |         best = None
 94 |         best_size = None
 95 |         for element_size in SIMD_ELEMENT_SIZES:
 96 | 
 97 |             res = self.__pack(element_size)
 98 |             if res is None:
 99 |                 continue
100 | 
101 |             if best is None or len(res) > len(best):
102 |                 best = res;
103 |                 best_size = element_size
104 | 
105 |         if best is not None:
106 |             return (best_size, best)
107 |         else:
108 |             return None
109 | 
110 | 
111 |     def __pack(self, element_size, vector_size = 16):
112 |         max_size = vector_size / element_size
113 |         result = []
114 |         for r in self.spans:
115 |             if r.digits() <= element_size:
116 |                 result.append(r)
117 |                 if len(result) == max_size:
118 |                     break
119 |             else:
120 |                 break
121 | 
122 |         if len(result) > 0:
123 |             return result
124 |         else:
125 |             return None
126 | 
127 | 
class BlockInfo(object):
    """Data gathered for a single 16-bit span mask.

    ``spans`` are the spans selected for conversion, ``all_spans`` every
    span of the mask; ``shuffle_digits`` and ``shuffle_signs`` are 16-item
    index lists in which 0x80 marks an unused slot.
    """

    __slots__ = ("id", "image", "first_skip", "total_skip",
                 "spans", "all_spans", "element_size", "shuffle_digits",
                 "shuffle_signs")

    def __init__(self, number):
        self.id              = number
        self.first_skip      = 0
        self.total_skip      = 0
        self.spans           = []
        self.all_spans       = []
        self.element_size    = 0
        self.shuffle_digits  = []
        self.shuffle_signs   = []

    def build_pshubf_masks(self):
        """Rebuild both shuffle lists from ``spans`` and ``element_size``."""
        self.build_shuffle_digit()
        self.build_shuffle_signs()

    def build_shuffle_digit(self):
        """Map every span onto its element, aligned to the element's end."""
        mask = self.shuffle_digits = [0x80] * 16
        for element, span in enumerate(self.spans):
            # shorter spans leave the leading slots of the element unused
            start = (element + 1) * self.element_size - span.digits()
            for offset, source in enumerate(range(span.first, span.last + 1)):
                mask[start + offset] = source

    def build_shuffle_signs(self):
        """Fill every slot of an element with the index of its span's first char."""
        mask = self.shuffle_signs = [0x80] * 16
        width = self.element_size
        for element, span in enumerate(self.spans):
            base = element * width
            for slot in range(base, base + width):
                mask[slot] = span.first

    def get_invalid_sign_mask(self):
        """Return a 16-bit mask of the positions where a sign is not allowed."""
        allowed = 0
        for span in self.all_spans:
            # only the first character of a multi-char span may be '+' or '-'
            if span.digits() > 1:
                allowed |= 1 << span.first

        # a one-char last span might be a sign as well
        if self.all_spans and self.all_spans[-1].digits() == 1:
            allowed |= 1 << 15

        # stored negated, so no negation is needed at runtime
        return ~allowed & 0xffff

    def __str__(self):
        fields = (
            self.id,
            self.first_skip,
            self.total_skip,
            self.element_size,
            self.spans,
        )

        return "<BlockInfo#%04x {first_skip=%d, total_skip=%d, " \
               "element_size=%d, spans=%s}>" % fields

196 | 
197 | 
class Generator(object):
    """Produces a BlockInfo for each of the 65536 possible 16-bit masks."""

    def run(self):
        """Yield the BlockInfo of every mask, in increasing mask order."""
        for number in range(1 << 16):
            yield self.__get_structure(number)


    def __get_structure(self, number):
        parser = Parser(number)
        complete, incomplete = parser.get_spans()
        best = Optimizer(complete).get_best()

        block = BlockInfo(number)
        block.image = parser.image
        block.all_spans = complete + incomplete

        if best is not None:
            element_size, items = best

            block.first_skip   = items[0].first
            block.spans        = items
            block.element_size = element_size

            # skip the packed spans and any separators right behind them
            image = parser.image
            skip = items[-1].last + 1
            while skip < len(image) and image[skip] == '_':
                skip += 1
            block.total_skip = skip
        elif number != 0:
            # there are digits at the end of chunk
            block.first_skip = parser.image.index(DIGIT)

        block.build_pshubf_masks()

        return block

236 | 
237 | 
def main(path):
    """Build the data for all masks and let the writer save it to ``path``."""
    blocks = list(Generator().run())
    Writer(blocks).save(path)

244 | 
if __name__ == '__main__':
    # sys.argv[0] is the script itself, so the output path is the second
    # item; the former check (< 1) could never fire and a missing argument
    # ended in an IndexError.
    if len(sys.argv) < 2:
        print("Usage: script output-path")
        sys.exit(1)

    main(sys.argv[1])
251 | 


--------------------------------------------------------------------------------
/include/sse/sse-convert.h:
--------------------------------------------------------------------------------
  1 | #pragma once
  2 | 
  3 | #include <cstdint>
  4 | #include <immintrin.h>
  5 | 
  6 | namespace sse {
  7 | 
  8 | #define SSE_ALIGN __attribute__ ((aligned (16)))
  9 | 
 10 |     template <typename INSERTER>
 11 |     void convert_1digit(const __m128i& input, int count, INSERTER output) {
 12 |         const __m128i ascii0 = _mm_set1_epi8('0');
 13 | 
 14 |         const __m128i t0 = _mm_subs_epu8(input, ascii0);
 15 | 
 16 |         uint8_t tmp[16] SSE_ALIGN;
 17 | 
 18 |         _mm_store_si128((__m128i*)tmp, t0);
 19 |         for (int i=0; i < count; i++)
 20 |             *output++ = tmp[i];
 21 |     }
 22 | 
 23 |     template <typename INSERTER>
 24 |     void convert_2digits(const __m128i& input, int count, INSERTER output) {
 25 |         const __m128i ascii0   = _mm_set1_epi8('0');
 26 |         const __m128i mul_1_10 = _mm_setr_epi8(10, 1, 10, 1, 10, 1, 10, 1, 10, 1, 10, 1, 10, 1, 10, 1);
 27 | 
 28 |         const __m128i t0 = _mm_subs_epu8(input, ascii0);
 29 |         const __m128i t1 = _mm_maddubs_epi16(t0, mul_1_10);
 30 | 
 31 |         uint16_t tmp[8] SSE_ALIGN;
 32 | 
 33 |         _mm_store_si128((__m128i*)tmp, t1);
 34 |         for (int i=0; i < count; i++)
 35 |             *output++ = tmp[i];
 36 |     }
 37 | 
 38 |     template <typename INSERTER>
 39 |     void convert_3digits(const __m128i& input, int count, INSERTER output) {
 40 | 
 41 |         const __m128i ascii0    = _mm_set1_epi8('0');
 42 |         const __m128i mul_all   = _mm_setr_epi8(0, 100, 10, 1, 0, 100, 10, 1, 0, 100, 10, 1, 0, 100, 10, 1);
 43 | 
 44 |         // =--------------
 45 | 
 46 |         const __m128i t0 = _mm_subs_epu8(input, ascii0);
 47 |         const __m128i t1 = _mm_maddubs_epi16(t0, mul_all);
 48 |         const __m128i t2 = _mm_hadd_epi16(t1, t1);
 49 | 
 50 |         uint16_t tmp[8] SSE_ALIGN;
 51 | 
 52 |         _mm_store_si128((__m128i*)tmp, t2);
 53 |         for (int i=0; i < count; i++)
 54 |             *output++ = tmp[i];
 55 |     }
 56 | 
 57 |     template <typename INSERTER>
 58 |     void convert_4digits(const __m128i& input, int count, INSERTER output) {
 59 | 
 60 |         const __m128i ascii0      = _mm_set1_epi8('0');
 61 |         const __m128i mul_1_10    = _mm_setr_epi8(10, 1, 10, 1, 10, 1, 10, 1, 10, 1, 10, 1, 10, 1, 10, 1);
 62 |         const __m128i mul_1_100   = _mm_setr_epi16(100, 1, 100, 1, 100, 1, 100, 1); 
 63 | 
 64 |         // =--------------
 65 | 
 66 |         const __m128i t0 = _mm_subs_epu8(input, ascii0);
 67 |         const __m128i t1 = _mm_maddubs_epi16(t0, mul_1_10);
 68 |         const __m128i t2 = _mm_madd_epi16(t1, mul_1_100);
 69 | 
 70 |         uint32_t tmp[4] SSE_ALIGN;
 71 | 
 72 |         _mm_store_si128((__m128i*)tmp, t2);
 73 |         for (int i=0; i < count; i++)
 74 |             *output++ = tmp[i];
 75 |     }
 76 | 
 77 |     template <typename INSERTER>
 78 |     void convert_8digits(const __m128i& input, int count, INSERTER output) {
 79 | 
 80 |         const __m128i ascii0      = _mm_set1_epi8('0');
 81 |         const __m128i mul_1_10    = _mm_setr_epi8(10, 1, 10, 1, 10, 1, 10, 1, 10, 1, 10, 1, 10, 1, 10, 1);
 82 |         const __m128i mul_1_100   = _mm_setr_epi16(100, 1, 100, 1, 100, 1, 100, 1); 
 83 |         const __m128i mul_1_10000 = _mm_setr_epi16(10000, 1, 10000, 1, 10000, 1, 10000, 1);
 84 | 
 85 |         // =--------------
 86 | 
 87 |         const __m128i t0 = _mm_subs_epu8(input, ascii0);
 88 |         const __m128i t1 = _mm_maddubs_epi16(t0, mul_1_10);
 89 |         const __m128i t2 = _mm_madd_epi16(t1, mul_1_100);
 90 |         const __m128i t3 = _mm_packus_epi32(t2, t2);
 91 |         const __m128i t4 = _mm_madd_epi16(t3, mul_1_10000);
 92 | 
 93 |         uint32_t tmp[4] SSE_ALIGN;
 94 | 
 95 |         _mm_store_si128((__m128i*)tmp, t4);
 96 |         for (int i=0; i < count; i++)
 97 |             *output++ = tmp[i];
 98 |     }
 99 | 
100 |     template <typename INSERTER>
101 |     void convert_2digits_signed(const __m128i& input, const __m128i& negate_mask, int count, INSERTER output) {
102 |         const __m128i ascii0   = _mm_set1_epi8('0');
103 |         const __m128i mul_1_10 = _mm_setr_epi8(10, 1, 10, 1, 10, 1, 10, 1, 10, 1, 10, 1, 10, 1, 10, 1);
104 | 
105 |         const __m128i t0 = _mm_subs_epu8(input, ascii0);
106 |         const __m128i t1 = _mm_maddubs_epi16(t0, mul_1_10);
107 | 
108 |         const __m128i s0 = _mm_xor_si128(t1, negate_mask);
109 |         const __m128i s1 = _mm_sub_epi16(s0, negate_mask);
110 | 
111 |         int16_t tmp[8] SSE_ALIGN;
112 | 
113 |         _mm_store_si128((__m128i*)tmp, s1);
114 |         for (int i=0; i < count; i++)
115 |             *output++ = tmp[i];
116 |     }
117 | 
118 |     template <typename INSERTER>
119 |     void convert_3digits_signed(const __m128i& input, const __m128i& negate_mask, int count, INSERTER output) {
120 | 
121 |         const __m128i ascii0    = _mm_set1_epi8('0');
122 |         const __m128i mul_all   = _mm_setr_epi8(0, 100, 10, 1, 0, 100, 10, 1, 0, 100, 10, 1, 0, 100, 10, 1);
123 | 
124 |         const __m128i s0 = _mm_xor_si128(mul_all, negate_mask);
125 |         const __m128i s1 = _mm_sub_epi8(s0, negate_mask);
126 | 
127 |         // =--------------
128 | 
129 |         const __m128i t0 = _mm_subs_epu8(input, ascii0);
130 |         const __m128i t1 = _mm_maddubs_epi16(t0, s1);
131 |         const __m128i t2 = _mm_hadd_epi16(t1, t1);
132 | 
133 |         int16_t tmp[8] SSE_ALIGN;
134 | 
135 |         _mm_store_si128((__m128i*)tmp, t2);
136 |         for (int i=0; i < count; i++)
137 |             *output++ = tmp[i];
138 |     }
139 | 
140 |     template <typename INSERTER>
141 |     void convert_4digits_signed(const __m128i& input, const __m128i& negate_mask, int count, INSERTER output) {
142 | 
143 |         const __m128i ascii0      = _mm_set1_epi8('0');
144 |         const __m128i mul_1_10    = _mm_setr_epi8(10, 1, 10, 1, 10, 1, 10, 1, 10, 1, 10, 1, 10, 1, 10, 1);
145 |         const __m128i mul_1_100   = _mm_setr_epi16(100, 1, 100, 1, 100, 1, 100, 1); 
146 | 
147 |         // =--------------
148 | 
149 |         const __m128i t0 = _mm_subs_epu8(input, ascii0);
150 |         const __m128i t1 = _mm_maddubs_epi16(t0, mul_1_10);
151 |         const __m128i t2 = _mm_madd_epi16(t1, mul_1_100);
152 | 
153 |         const __m128i s0 = _mm_xor_si128(t2, negate_mask);
154 |         const __m128i s1 = _mm_sub_epi32(s0, negate_mask);
155 | 
156 |         int32_t tmp[4] SSE_ALIGN;
157 | 
158 |         _mm_store_si128((__m128i*)tmp, s1);
159 |         for (int i=0; i < count; i++)
160 |             *output++ = tmp[i];
161 |     }
162 | 
163 |     template <typename INSERTER>
164 |     void convert_8digits_signed(const __m128i& input, const __m128i& negate_mask, int count, INSERTER output) {
165 | 
166 |         const __m128i ascii0      = _mm_set1_epi8('0');
167 |         const __m128i mul_1_10    = _mm_setr_epi8(10, 1, 10, 1, 10, 1, 10, 1, 10, 1, 10, 1, 10, 1, 10, 1);
168 |         const __m128i mul_1_100   = _mm_setr_epi16(100, 1, 100, 1, 100, 1, 100, 1); 
169 |         const __m128i mul_1_10000 = _mm_setr_epi16(10000, 1, 10000, 1, 10000, 1, 10000, 1);
170 | 
171 |         // =--------------
172 | 
173 |         const __m128i t0 = _mm_subs_epu8(input, ascii0);
174 |         const __m128i t1 = _mm_maddubs_epi16(t0, mul_1_10);
175 |         const __m128i t2 = _mm_madd_epi16(t1, mul_1_100);
176 | 
177 |         const __m128i s0 = _mm_xor_si128(t2, negate_mask);
178 |         const __m128i s1 = _mm_sub_epi32(s0, negate_mask);
179 |         
180 |         const __m128i t3 = _mm_packs_epi32(s1, s1);
181 |         const __m128i t4 = _mm_madd_epi16(t3, mul_1_10000);
182 | 
183 |         int32_t tmp[4] SSE_ALIGN;
184 | 
185 |         _mm_store_si128((__m128i*)tmp, t4);
186 |         for (int i=0; i < count; i++)
187 |             *output++ = tmp[i];
188 |     }
189 | 
190 | #undef SSE_ALIGN
191 | 
192 | }
193 | 
194 | 


--------------------------------------------------------------------------------
/test/unittest/verify_sse_signed_parser_validation.cpp:
--------------------------------------------------------------------------------
  1 | #include <cstdlib>
  2 | #include <cstdio>
  3 | #include <cassert>
  4 | #include <stdexcept>
  5 | 
  6 | #include "block_info.h"
  7 | #include "sse/sse-matcher.h"
  8 | #include "sse/sse-parser-signed.h"
  9 | 
// Characters used to render the 16-byte test input; each one stands for a
// whole class of input bytes (see Test::render).
const char PLUS     = '+';  // sign, rendered for every second Sign position
const char MINUS    = '-';  // sign, rendered for the remaining Sign positions
const char SEP      = '_';  // separator; also passed to the matcher in the test
const char DIGIT    = '1';  // the only digit the test ever places in the input
const char INVALID  = 'x';  // a byte that is neither separator, digit nor sign
 15 | 
// Exhaustive check of sse::detail::process_chunk on a single 16-byte chunk.
//
// Every one of the 16 positions cycles through four character classes
// (separator, digit, sign, invalid).  For each combination the outcome
// predicted by is_valid()/will_overflow() is compared with what
// process_chunk actually does (returns normally, throws std::range_error,
// or throws another std::exception).
class Test {

    // Character class of a single input position.
    enum Class {
        Separator,
        Digit,
        Sign,
        Invalid
    };

    Class input_pattern[16];    // current combination of classes
    char input_string[17];      // input_pattern rendered as text, NUL-terminated
    __m128i input;              // input_string loaded into a vector (see prepare)

    // Observable outcome of one process_chunk call.
    enum class Result {
        NoException,
        OverflowException,
        OtherException
    };

public:
    // Starts from the all-separators pattern.
    Test() {
        for (int i=0; i < 16; i++) {
            input_pattern[i] = Separator;
        }
        
        input_string[16] = 0;
        render();
    }

public:
    // Runs the whole validation, prints the verdict and returns true on success.
    bool run() {
        puts("Full validation of invalid input detection in SSE approach");
        puts("Warning: this might take a few minutes on a decent computer");
        const auto ret = validate();
        if (ret) {
            puts("OK");
        } else {
            puts("!!!FAILED!!!");
        }

        return ret;
    }

private:
    // Walks through all patterns; stops and returns false at the first
    // pattern whose actual outcome differs from the expected one.
    bool validate() {
        long id = 0;
        
        do {
            prepare();

            // progress report once per million patterns
            if (id % 1000000 == 0) {
                printf("%ld %s\n", id, input_string);
            }

            Result expected;
            if (is_valid()) {
                if (will_overflow())
                    expected = Result::OverflowException;
                else
                    expected = Result::NoException;
            } else {
                expected = Result::OtherException;
            }

            const Result result = SSE_validate_algorithm();

            if (result != expected) {
                printf("failed for %ld: %s\n", id, input_string);
                return false;
            }
            id += 1;
        } while (!increment()); // increment() returns true after the last pattern

        return true;
    }

    // Feeds the current chunk to process_chunk and classifies what happened.
    // std::range_error is reported as overflow; any other std::exception as
    // OtherException.  Parsed values are collected in a throw-away vector.
    Result SSE_validate_algorithm() {
        std::vector<int32_t> sink;
        try {
            sse::NaiveMatcher<1> matcher(SEP);
            sse::detail::process_chunk(
                input_string,
                input_string + 16,
                input,
                matcher,
                std::back_inserter(sink)
            );
            return Result::NoException;
        } catch (std::range_error&) {
            return Result::OverflowException;
        } catch (std::exception&) {
            return Result::OtherException;
        }
    }

    // Refreshes input_string from input_pattern and loads it into `input`.
    void prepare() {
        render();
        input = _mm_loadu_si128(reinterpret_cast<const __m128i*>(input_string));
    }

    // Translates input_pattern into characters.  Signs alternate: the first
    // sign in the chunk is MINUS, the second PLUS, and so on.
    void render() {
        int j = 0;
        for (int i=0; i < 16; i++) {
            switch (input_pattern[i]) {
                case Separator:
                    input_string[i] = SEP;
                    break;

                case Digit:
                    input_string[i] = DIGIT;
                    break;

                case Sign:
                    input_string[i] = (j++ % 2) ? PLUS : MINUS;
                    break;

                case Invalid:
                    input_string[i] = INVALID;
                    break;
            }
        }
    }

    // Successor of a class in the cycle
    // Separator -> Digit -> Sign -> Invalid -> Separator.
    Class next(Class x) {
        switch (x) {
            case Separator:
                return Digit;

            case Digit:
                return Sign;

            case Sign:
                return Invalid;

            case Invalid:
                return Separator;

            default:
                assert(false);
                return Separator;
        }
    }

    // Advances input_pattern like a 16-digit base-4 counter with position 0
    // as the least significant digit.  Returns true only when every position
    // wrapped around to Separator, i.e. all patterns have been visited.
    bool increment() {
        int i = 0;
        do {
            const auto n = next(input_pattern[i]);
            input_pattern[i] = n;
            if (n != Separator) {
                // no carry into the next position
                return false;
            }

            i += 1;

        } while (i < 16);

        return true;
    }


    // Reference validity check: a pattern is invalid when it contains an
    // Invalid position, a sign that does not directly follow a separator
    // (or the chunk start), or a sign directly followed by a separator.
    bool is_valid() {
        Class prev = Separator;
        for (int i=0; i < 16; i++) {
            switch (input_pattern[i]) {
                case Separator:
                    if (prev == Sign) { // a solitary sign, like "__+_"
                        return false;
                    }
                    break;

                case Digit:
                    // a digit can follow anything
                    break;

                case Sign:
                    if (prev != Separator) {
                        return false;
                    }
                    break;

                case Invalid:
                    return false;
            } // switch
            prev = input_pattern[i];
        }

        return true;
    }

    // Builds a 16-bit mask with bit i set when position i holds a digit or
    // a sign; the result is used as an index into `blocks`.
    uint16_t get_span_pattern() const {
        // assume is_valid() == true
        uint16_t result = 0;
        uint16_t bit = 1;
        for (int i=0; i < 16; i++, bit <<= 1) {
            switch (input_pattern[i]) {
                case Separator:
                    break;

                case Digit:
                case Sign:
                    result |= bit;
                    break;

                default:
                    assert(false);
                    return 0;

            } // switch
        }

        return result;
    }

    // Predicts whether parsing the current chunk ends with an overflow error.
    // Only patterns whose block descriptor selects Conversion::Scalar are
    // considered; for those the digits of the first span (starting at
    // b.first_skip) are accumulated with mul10_add_digit, and a thrown
    // std::range_error means overflow.
    bool will_overflow() const {
        // assume is_valid() == true
        const BlockInfo& b = blocks[get_span_pattern()];
        if (b.conversion_routine != Conversion::Scalar) {
            // only scalar code might cause overflow error
            return false;
        }

        uint32_t result = 0;
        bool negative = false; // NOTE(review): written below but never read
        for (int i=b.first_skip; i < 16; i++) {
            switch (input_pattern[i]) {
                case Separator:
                    result = 0;
                    negative = false;
                    // scalar code converts just the first span
                    return false;

                case Digit:
                    try {
                        mul10_add_digit(result, DIGIT);
                    } catch (std::range_error& e) {
                        return true;
                    }
                    break;

                case Sign:
                    negative = true; // (MINUS == ((j++ % 2) ? PLUS : MINUS));
                    result = 0;
                    break;

                default:
                    assert(false);
            } // switch
        }

        return false;
    }

};
269 | 
270 | 
271 | int main() {
272 |     Test test;
273 | 
274 |     return test.run() ? EXIT_SUCCESS : EXIT_FAILURE;
275 | }
276 | 


--------------------------------------------------------------------------------
/include/sse/sse-parser-common.h:
--------------------------------------------------------------------------------
  1 | #pragma once
  2 | 
  3 | #include <vector>
  4 | #include <cassert>
  5 | #include <limits>
  6 | #include <stdexcept>
  7 | 
  8 | #include "safe-convert.h"
  9 | #include "sse-utils.h"
 10 | #include "sse-convert.h"
 11 | #include "sse-parser-statistics.h"
 12 | #include "block_info.h"
 13 | 
 14 | namespace sse {
 15 | 
 16 |     namespace detail {
 17 | 
 18 |         template <typename INSERTER>
 19 |         char* parse_unsigned(const BlockInfo& bi, const __m128i input, char* data, char* end, INSERTER output) {
 20 | 
 21 |             const __m128i shuffle_digits = _mm_loadu_si128((const __m128i*)bi.shuffle_digits);
 22 |             const __m128i shuffled = _mm_shuffle_epi8(input, shuffle_digits);
 23 | 
 24 |             if (bi.conversion_routine == Conversion::SSE1Digit) {
 25 | 
 26 |                 convert_1digit(shuffled, bi.element_count, output);
 27 | 
 28 |                 STATS_INC(unsigned_path.digit1_calls);
 29 |                 STATS_ADD(unsigned_path.digit1_converted, bi.element_count);
 30 | 
 31 |             } else if (bi.conversion_routine == Conversion::SSE2Digits) {
 32 | 
 33 |                 convert_2digits(shuffled, bi.element_count, output);
 34 | 
 35 |                 STATS_INC(unsigned_path.digit2_calls);
 36 |                 STATS_ADD(unsigned_path.digit2_converted, bi.element_count);
 37 | 
 38 |             } else if (bi.conversion_routine == Conversion::SSE3Digits) {
 39 | 
 40 |                 convert_3digits(shuffled, bi.element_count, output);
 41 | 
 42 |                 STATS_INC(unsigned_path.digit3_calls);
 43 |                 STATS_ADD(unsigned_path.digit3_converted, bi.element_count);
 44 | 
 45 |             } else if (bi.conversion_routine == Conversion::SSE4Digits) {
 46 | 
 47 |                 convert_4digits(shuffled, bi.element_count, output);
 48 | 
 49 |                 STATS_INC(unsigned_path.digit4_calls);
 50 |                 STATS_ADD(unsigned_path.digit4_converted, bi.element_count);
 51 | 
 52 |             } else if (bi.conversion_routine == Conversion::SSE8Digits) {
 53 | 
 54 |                 convert_8digits(shuffled, bi.element_count, output);
 55 | 
 56 |                 STATS_INC(unsigned_path.digit8_calls);
 57 |                 STATS_ADD(unsigned_path.digit8_converted, bi.element_count);
 58 | 
 59 |             } else {
 60 |                 uint32_t result = 0;
 61 |                 bool converted = false;
 62 | 
 63 |                 data += bi.first_skip;
 64 |                 while (data < end && *data >= '0' && *data <= '9') {
 65 |                     mul10_add_digit(result, *data);
 66 |                     data += 1;
 67 |                     converted = true;
 68 |                 }
 69 | 
 70 |                 if (converted) {
 71 |                     if (result > std::numeric_limits<int32_t>::max()) {
 72 |                         throw std::range_error("unsigned 32-bit overflow");
 73 |                     }
 74 |                     *output++ = result;
 75 |                 }
 76 | 
 77 |                 STATS_INC(unsigned_path.scalar_conversions);
 78 | 
 79 |                 return data;
 80 |             }
 81 | 
 82 | #ifdef USE_STATISTICS
 83 |             stats.total_skip_histogram[bi.total_skip] += 1;
 84 | #endif
 85 | 
 86 |             return data + bi.total_skip;
 87 |         }
 88 | 
 89 |         template <typename INSERTER>
 90 |         char* parse_signed(
 91 |             const BlockInfo& bi,
 92 |             const __m128i input,
 93 |             char* data,
 94 |             char* end,
 95 |             INSERTER output
 96 |         ) {
 97 |             const __m128i ascii_minus = _mm_set1_epi8('-');
 98 | 
 99 |             const __m128i shuffle_digits = _mm_loadu_si128((const __m128i*)bi.shuffle_digits);
100 |             const __m128i shuffle_signs  = _mm_loadu_si128((const __m128i*)bi.shuffle_signs);
101 | 
102 |             // Note: there is not need to mask '+' or '-' in the input prior
103 |             // shuffling. This is possible because ASCII codes of '+' and '-'
104 |             // are smaller than '0' (43 < 48 && 45 < 48). These character will
105 |             // be zeroed during subtraction of '0'.
106 |             const __m128i shuffled       = _mm_shuffle_epi8(input, shuffle_digits);
107 |             const __m128i shuffled_signs = _mm_shuffle_epi8(input, shuffle_signs);
108 |             const __m128i negate_mask    = _mm_cmpeq_epi8(shuffled_signs, ascii_minus);
109 | 
110 |             if (bi.conversion_routine == Conversion::SSE1Digit) {
111 | 
112 |                 convert_1digit(shuffled, bi.element_count, output);
113 | 
114 |             } else if (bi.conversion_routine == Conversion::SSE2Digits) {
115 | 
116 |                 convert_2digits_signed(shuffled, negate_mask, bi.element_count, output);
117 | 
118 |                 STATS_INC(signed_path.digit2_calls);
119 |                 STATS_ADD(signed_path.digit2_converted, bi.element_count);
120 | 
121 |             } else if (bi.conversion_routine == Conversion::SSE3Digits) {
122 | 
123 |                 convert_3digits_signed(shuffled, negate_mask, bi.element_count, output);
124 | 
125 |                 STATS_INC(signed_path.digit3_calls);
126 |                 STATS_ADD(signed_path.digit3_converted, bi.element_count);
127 | 
128 |             } else if (bi.conversion_routine == Conversion::SSE4Digits) {
129 | 
130 |                 convert_4digits_signed(shuffled, negate_mask, bi.element_count, output);
131 | 
132 |                 STATS_INC(signed_path.digit4_calls);
133 |                 STATS_ADD(signed_path.digit4_converted, bi.element_count);
134 | 
135 |             } else if (bi.conversion_routine == Conversion::SSE8Digits) {
136 | 
137 |                 convert_8digits_signed(shuffled, negate_mask, bi.element_count, output);
138 | 
139 |                 STATS_INC(signed_path.digit8_calls);
140 |                 STATS_ADD(signed_path.digit8_converted, bi.element_count);
141 | 
142 |             } else {
143 |                 bool converted = false;
144 |                 uint32_t result;
145 |                 bool negative;
146 | 
147 |                 data += bi.first_skip;
148 | 
149 |                 if (*data == '+') {
150 |                     data++;
151 |                     negative = false;
152 |                     result = 0;
153 |                 } else if (*data == '-') {
154 |                     data++;
155 |                     negative = true;
156 |                     result = 0;
157 |                 } else {
158 |                     result = *data++ - '0';
159 |                     negative = false;
160 |                     converted = true;
161 |                 }
162 | 
163 |                 while (data < end && *data >= '0' && *data <= '9') {
164 |                     mul10_add_digit(result, *data);
165 |                     data += 1;
166 |                     converted = true;
167 |                 }
168 | 
169 |                 if (converted) {
170 |                     if (negative) {
171 |                         const int64_t tmp = std::numeric_limits<int32_t>::min();
172 |                         const uint32_t absmin = -tmp;
173 |                         if (result > absmin) {
174 |                             throw std::range_error("signed 32-bit overflow");
175 |                         }
176 |                         *output++ = -result;
177 |                     } else {
178 |                         const uint32_t max = std::numeric_limits<int32_t>::max();
179 |                         if (result > max) {
180 |                             throw std::range_error("signed 32-bit overflow");
181 |                         }
182 |                         *output++ = result;
183 |                     }
184 |                 }
185 | 
186 |                 STATS_INC(signed_path.scalar_conversions);
187 | 
188 |                 return data;
189 |             }
190 | 
191 | #ifdef USE_STATISTICS
192 |             stats.total_skip_histogram[bi.total_skip] += 1;
193 | #endif
194 | 
195 |             return data + bi.total_skip;
196 |         }
197 | 
198 |     } // namespace detail
199 | 
200 | } // namespace sse
201 | 


--------------------------------------------------------------------------------