├── .gitignore ├── src ├── itoa-benchmark │ ├── result │ │ ├── makefile │ │ ├── corei7920@2.67_win64_vc2013_u32toa_sequential_time.png │ │ └── corei7920@2.67_win64_vc2013_u32toa_sequential_timedigit.png │ ├── null.cpp │ ├── amartin.cpp │ ├── vc.cpp │ ├── fmt.cpp │ ├── sprintf.cpp │ ├── to_string.cpp │ ├── ostrstream.cpp │ ├── ostringstream.cpp │ ├── digitslut.h │ ├── itoa_ljust.h │ ├── mwilson.cpp │ ├── license.txt │ ├── count.cpp │ ├── naive.cpp │ ├── CMakeLists.txt │ ├── folly.cpp │ ├── timer.h │ ├── itoa_jeaiii_bind.cpp │ ├── test.h │ ├── resultfilename.h │ ├── lut.cpp │ ├── branchlut2.cpp │ ├── countlut.cpp │ ├── countdecimaldigit.h │ ├── itoa_jeaiii.cpp │ ├── tmueller.cpp │ ├── readme.md │ ├── unnamed.cpp │ ├── itoa_ljust_impl.h │ ├── branchlut.cpp │ ├── msinttypes │ │ ├── inttypes.h │ │ └── stdint.h │ ├── main.cpp │ ├── unrolledlut.cpp │ └── sse2.cpp ├── digits10 │ ├── digits10.cc │ ├── digits10-benchmark.cc │ ├── digits10-test.cc │ └── digits10.h ├── find-pow10-benchmark.cc ├── vararg-benchmark.cc ├── file-benchmark.cc ├── concat-benchmark.cc ├── locale-benchmark.cc ├── u2985907.h ├── tinyformat-test.cc ├── itostr.cc └── int-benchmark.cc ├── .gitmodules ├── .clang-format ├── README.rst ├── CMakeLists.txt ├── bloat-test.py └── variadic-test.py /.gitignore: -------------------------------------------------------------------------------- 1 | CMakeFiles 2 | -------------------------------------------------------------------------------- /src/itoa-benchmark/result/makefile: -------------------------------------------------------------------------------- 1 | %.html: %.csv template.php 2 | php template.php $< > $@ 3 | 4 | CSVFILES = $(basename $(wildcard *.csv)) 5 | all: $(addsuffix .html, $(CSVFILES)) 6 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "fmt"] 2 | path = fmt 3 | url = https://github.com/fmtlib/fmt.git 4 | 
[submodule "benchmark"] 5 | path = benchmark 6 | url = https://github.com/google/benchmark.git 7 | -------------------------------------------------------------------------------- /src/itoa-benchmark/result/corei7920@2.67_win64_vc2013_u32toa_sequential_time.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fmtlib/format-benchmark/HEAD/src/itoa-benchmark/result/corei7920@2.67_win64_vc2013_u32toa_sequential_time.png -------------------------------------------------------------------------------- /src/itoa-benchmark/result/corei7920@2.67_win64_vc2013_u32toa_sequential_timedigit.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fmtlib/format-benchmark/HEAD/src/itoa-benchmark/result/corei7920@2.67_win64_vc2013_u32toa_sequential_timedigit.png -------------------------------------------------------------------------------- /.clang-format: -------------------------------------------------------------------------------- 1 | # Run manually to reformat a file: 2 | # clang-format -i --style=file <file> 3 | Language: Cpp 4 | BasedOnStyle: Google 5 | IndentPPDirectives: AfterHash 6 | IndentCaseLabels: false 7 | AlwaysBreakTemplateDeclarations: false 8 | DerivePointerAlignment: false 9 | -------------------------------------------------------------------------------- /src/itoa-benchmark/null.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include "test.h" 3 | 4 | void u32toa_null(uint32_t, char*) { 5 | } 6 | 7 | void i32toa_null(int32_t, char*) { 8 | } 9 | 10 | void u64toa_null(uint64_t, char*) { 11 | } 12 | 13 | void i64toa_null(int64_t, char*) { 14 | } 15 | 16 | REGISTER_TEST(null); 17 | -------------------------------------------------------------------------------- /src/itoa-benchmark/amartin.cpp: -------------------------------------------------------------------------------- 1 | #include 
"itoa_ljust_impl.h" 2 | #include "test.h" 3 | 4 | void u32toa_amartin(uint32_t v, char* out) { (void)to_dec(out,v); } 5 | void u64toa_amartin(uint64_t v, char* out) { (void)to_dec(out,v); } 6 | void i32toa_amartin( int32_t v, char* out) { (void)to_dec(out,v); } 7 | void i64toa_amartin( int64_t v, char* out) { (void)to_dec(out,v); } 8 | 9 | REGISTER_TEST(amartin); 10 | -------------------------------------------------------------------------------- /src/itoa-benchmark/vc.cpp: -------------------------------------------------------------------------------- 1 | #ifdef _MSC_VER 2 | 3 | #include 4 | #include 5 | #include "test.h" 6 | 7 | void u32toa_vc(uint32_t value, char* buffer) { 8 | _ui64toa(value, buffer, 10); // No 32-bit unsigned version. 9 | } 10 | 11 | void i32toa_vc(int32_t value, char* buffer) { 12 | _itoa(value, buffer, 10); 13 | } 14 | 15 | void u64toa_vc(uint64_t value, char* buffer) { 16 | _ui64toa(value, buffer, 10); 17 | } 18 | 19 | void i64toa_vc(int64_t value, char* buffer) { 20 | _i64toa(value, buffer, 10); 21 | } 22 | 23 | REGISTER_TEST(vc); 24 | 25 | #endif 26 | -------------------------------------------------------------------------------- /src/itoa-benchmark/fmt.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include "test.h" 4 | 5 | void u32toa_fmt(uint32_t value, char* buffer) { 6 | *fmt::format_to(buffer, FMT_COMPILE("{}"), value) = '\0'; 7 | } 8 | 9 | void i32toa_fmt(int32_t value, char* buffer) { 10 | *fmt::format_to(buffer, FMT_COMPILE("{}"), value) = '\0'; 11 | } 12 | 13 | void u64toa_fmt(uint64_t value, char* buffer) { 14 | *fmt::format_to(buffer, FMT_COMPILE("{}"), value) = '\0'; 15 | } 16 | 17 | void i64toa_fmt(int64_t value, char* buffer) { 18 | *fmt::format_to(buffer, FMT_COMPILE("{}"), value) = '\0'; 19 | } 20 | 21 | REGISTER_TEST(fmt); 22 | -------------------------------------------------------------------------------- /src/itoa-benchmark/sprintf.cpp: 
-------------------------------------------------------------------------------- 1 | #ifdef _MSC_VER 2 | #include "msinttypes/inttypes.h" 3 | #else 4 | #include 5 | #endif 6 | #include 7 | #include "test.h" 8 | 9 | void u32toa_sprintf(uint32_t value, char* buffer) { 10 | sprintf(buffer, "%u", value); 11 | } 12 | 13 | void i32toa_sprintf(int32_t value, char* buffer) { 14 | sprintf(buffer, "%d", value); 15 | } 16 | 17 | void u64toa_sprintf(uint64_t value, char* buffer) { 18 | sprintf(buffer, "%" PRIu64, value); 19 | } 20 | 21 | void i64toa_sprintf(int64_t value, char* buffer) { 22 | sprintf(buffer, "%" PRIi64, value); 23 | } 24 | 25 | REGISTER_TEST(sprintf); 26 | -------------------------------------------------------------------------------- /src/digits10/digits10.cc: -------------------------------------------------------------------------------- 1 | #include "digits10.h" 2 | #include 3 | 4 | using std::uint32_t; 5 | 6 | const uint32_t powers_of_10_u32[] = { 7 | 0, 8 | 10, 9 | 100, 10 | 1000, 11 | 10000, 12 | 100000, 13 | 1000000, 14 | 10000000, 15 | 100000000, 16 | 1000000000 17 | }; 18 | 19 | std::vector generate_numbers(int num_digits) { 20 | // Use fixed seed to generate identical sequences. 
21 | std::mt19937 gen(0); 22 | std::uniform_int_distribution 23 | dis(min_number(num_digits), max_number(num_digits)); 24 | std::vector result; 25 | int count = 100; 26 | result.reserve(count); 27 | for (int i = 0; i < count; ++i) 28 | result.push_back(dis(gen)); 29 | return result; 30 | } 31 | -------------------------------------------------------------------------------- /src/itoa-benchmark/to_string.cpp: -------------------------------------------------------------------------------- 1 | #if __cplusplus >= 201103L || _MSC_VER >= 1700 2 | 3 | #ifdef _MSC_VER 4 | #include "msinttypes/inttypes.h" 5 | #else 6 | #include 7 | #endif 8 | #include 9 | #include "test.h" 10 | 11 | void u32toa_to_string(uint32_t value, char* buffer) { 12 | strcpy(buffer, std::to_string(value).c_str()); 13 | } 14 | 15 | void i32toa_to_string(int32_t value, char* buffer) { 16 | strcpy(buffer, std::to_string(value).c_str()); 17 | } 18 | 19 | void u64toa_to_string(uint64_t value, char* buffer) { 20 | strcpy(buffer, std::to_string(value).c_str()); 21 | } 22 | 23 | void i64toa_to_string(int64_t value, char* buffer) { 24 | strcpy(buffer, std::to_string(value).c_str()); 25 | } 26 | 27 | #if RUN_CPPITOA 28 | REGISTER_TEST(to_string); 29 | #endif 30 | 31 | #endif 32 | -------------------------------------------------------------------------------- /src/itoa-benchmark/ostrstream.cpp: -------------------------------------------------------------------------------- 1 | #ifdef _MSC_VER 2 | #include "msinttypes/inttypes.h" 3 | #else 4 | #include 5 | #endif 6 | #include 7 | #include "test.h" 8 | 9 | void u32toa_ostrstream(uint32_t value, char* buffer) { 10 | std::ostrstream oss(buffer, 11); 11 | oss << value << std::ends; 12 | } 13 | 14 | void i32toa_ostrstream(int32_t value, char* buffer) { 15 | std::ostrstream oss(buffer, 12); 16 | oss << value << std::ends; 17 | } 18 | 19 | void u64toa_ostrstream(uint64_t value, char* buffer) { 20 | std::ostrstream oss(buffer, 21); 21 | oss << value << std::ends; 22 | 
} 23 | 24 | void i64toa_ostrstream(int64_t value, char* buffer) { 25 | std::ostrstream oss(buffer, 22); 26 | oss << value << std::ends; 27 | } 28 | 29 | #if RUN_CPPITOA 30 | REGISTER_TEST(ostrstream); 31 | #endif 32 | -------------------------------------------------------------------------------- /src/itoa-benchmark/ostringstream.cpp: -------------------------------------------------------------------------------- 1 | #ifdef _MSC_VER 2 | #include "msinttypes/inttypes.h" 3 | #else 4 | #include 5 | #endif 6 | #include 7 | #include "test.h" 8 | 9 | void u32toa_ostringstream(uint32_t value, char* buffer) { 10 | std::ostringstream oss; 11 | oss << value; 12 | strcpy(buffer, oss.str().c_str()); 13 | } 14 | 15 | void i32toa_ostringstream(int32_t value, char* buffer) { 16 | std::ostringstream oss; 17 | oss << value; 18 | strcpy(buffer, oss.str().c_str()); 19 | } 20 | 21 | void u64toa_ostringstream(uint64_t value, char* buffer) { 22 | std::ostringstream oss; 23 | oss << value; 24 | strcpy(buffer, oss.str().c_str()); 25 | } 26 | 27 | void i64toa_ostringstream(int64_t value, char* buffer) { 28 | std::ostringstream oss; 29 | oss << value; 30 | strcpy(buffer, oss.str().c_str()); 31 | } 32 | 33 | #if RUN_CPPITOA 34 | REGISTER_TEST(ostringstream); 35 | #endif 36 | -------------------------------------------------------------------------------- /src/itoa-benchmark/digitslut.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | const char gDigitsLut[200] = { 4 | '0','0','0','1','0','2','0','3','0','4','0','5','0','6','0','7','0','8','0','9', 5 | '1','0','1','1','1','2','1','3','1','4','1','5','1','6','1','7','1','8','1','9', 6 | '2','0','2','1','2','2','2','3','2','4','2','5','2','6','2','7','2','8','2','9', 7 | '3','0','3','1','3','2','3','3','3','4','3','5','3','6','3','7','3','8','3','9', 8 | '4','0','4','1','4','2','4','3','4','4','4','5','4','6','4','7','4','8','4','9', 9 | 
'5','0','5','1','5','2','5','3','5','4','5','5','5','6','5','7','5','8','5','9', 10 | '6','0','6','1','6','2','6','3','6','4','6','5','6','6','6','7','6','8','6','9', 11 | '7','0','7','1','7','2','7','3','7','4','7','5','7','6','7','7','7','8','7','9', 12 | '8','0','8','1','8','2','8','3','8','4','8','5','8','6','8','7','8','8','8','9', 13 | '9','0','9','1','9','2','9','3','9','4','9','5','9','6','9','7','9','8','9','9' 14 | }; 15 | -------------------------------------------------------------------------------- /src/itoa-benchmark/itoa_ljust.h: -------------------------------------------------------------------------------- 1 | #ifndef ITOA_LJUST_H 2 | #define ITOA_LJUST_H 3 | 4 | //=== itoa_ljust.h - Fast integer to ascii conversion --*- C++ -*-// 5 | // 6 | // Fast and simple integer to ASCII conversion: 7 | // 8 | // - 32 and 64-bit integers 9 | // - signed and unsigned 10 | // - user supplied buffer must be large enough for all decimal digits 11 | // in value plus minus sign if negative 12 | // - left-justified 13 | // - NUL terminated 14 | // - return value is pointer to NUL terminator 15 | // 16 | // Copyright (c) 2016 Arturo Martin-de-Nicolas 17 | // arturomdn@gmail.com 18 | // https://github.com/amdn/itoa_ljust/ 19 | //===----------------------------------------------------------------------===// 20 | 21 | #include 22 | 23 | namespace itoa_ljust { 24 | 25 | char* itoa(uint32_t u, char* buffer); 26 | char* itoa( int32_t i, char* buffer); 27 | char* itoa(uint64_t u, char* buffer); 28 | char* itoa( int64_t i, char* buffer); 29 | 30 | } 31 | 32 | #endif // ITOA_LJUST_H 33 | -------------------------------------------------------------------------------- /src/itoa-benchmark/mwilson.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include "test.h" 4 | 5 | const char digits[] = "9876543210123456789"; 6 | const char* zero = digits + 9; 7 | 8 | // Efficient Integer to String Conversions, by Matthew 
Wilson. 9 | template 10 | size_t convert(char buf[], T value) 11 | { 12 | T i = value; 13 | char* p = buf; 14 | 15 | do { 16 | int lsd = static_cast(i % 10); 17 | i /= 10; 18 | *p++ = zero[lsd]; 19 | } while (i != 0); 20 | 21 | if (value < 0) { 22 | *p++ = '-'; 23 | } 24 | *p = '\0'; 25 | std::reverse(buf, p); 26 | 27 | return p - buf; 28 | } 29 | 30 | void u32toa_mwilson(uint32_t value, char* buffer) { 31 | convert(buffer, value); 32 | } 33 | 34 | void i32toa_mwilson(int32_t value, char* buffer) { 35 | convert(buffer, value); 36 | } 37 | 38 | void u64toa_mwilson(uint64_t value, char* buffer) { 39 | convert(buffer, value); 40 | } 41 | 42 | void i64toa_mwilson(int64_t value, char* buffer) { 43 | convert(buffer, value); 44 | } 45 | 46 | REGISTER_TEST(mwilson); 47 | -------------------------------------------------------------------------------- /src/itoa-benchmark/license.txt: -------------------------------------------------------------------------------- 1 | Copyright (C) 2014 Milo Yip 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies of the Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in 11 | all copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 19 | THE SOFTWARE. -------------------------------------------------------------------------------- /src/itoa-benchmark/count.cpp: -------------------------------------------------------------------------------- 1 | #include "countdecimaldigit.h" 2 | #include "test.h" 3 | 4 | // Variant with an additional pass that first counts the decimal digits, then writes them backwards from the end 5 | 6 | void u32toa_count(uint32_t value, char* buffer) { 7 | unsigned digit = CountDecimalDigit32(value); 8 | buffer += digit; 9 | *buffer = '\0'; 10 | 11 | do { 12 | *--buffer = char(value % 10) + '0'; 13 | value /= 10; 14 | } while (value > 0); 15 | } 16 | 17 | void i32toa_count(int32_t value, char* buffer) { 18 | uint32_t u = static_cast(value); 19 | if (value < 0) { 20 | *buffer++ = '-'; 21 | u = ~u + 1; 22 | } 23 | u32toa_count(u, buffer); 24 | } 25 | 26 | void u64toa_count(uint64_t value, char* buffer) { 27 | unsigned digit = CountDecimalDigit64(value); 28 | buffer += digit; 29 | *buffer = '\0'; 30 | 31 | do { 32 | *--buffer = char(value % 10) + '0'; 33 | value /= 10; 34 | } while (value > 0); 35 | } 36 | 37 | void i64toa_count(int64_t value, char* buffer) { 38 | uint64_t u = static_cast(value); 39 | if (value < 0) { 40 | *buffer++ = '-'; 41 | u = ~u + 1; 42 | } 43 | u64toa_count(u, buffer); 44 | } 45 | 46 | REGISTER_TEST(count); 47 | -------------------------------------------------------------------------------- /src/itoa-benchmark/naive.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include "test.h" 3 | 4 | void u32toa_naive(uint32_t value, char* buffer) { 5 | char temp[10]; 6 | char *p = temp; 7 | do { 8 | *p++ = char(value % 10) + '0'; 9 | value /= 10; 10 | } while (value > 0); 11 | 12 | do { 13 | *buffer++ = *--p; 14 | } while (p 
!= temp); 15 | 16 | *buffer = '\0'; 17 | } 18 | 19 | void i32toa_naive(int32_t value, char* buffer) { 20 | uint32_t u = static_cast(value); 21 | if (value < 0) { 22 | *buffer++ = '-'; 23 | u = ~u + 1; 24 | } 25 | u32toa_naive(u, buffer); 26 | } 27 | 28 | void u64toa_naive(uint64_t value, char* buffer) { 29 | char temp[20]; 30 | char *p = temp; 31 | do { 32 | *p++ = char(value % 10) + '0'; 33 | value /= 10; 34 | } while (value > 0); 35 | 36 | do { 37 | *buffer++ = *--p; 38 | } while (p != temp); 39 | 40 | *buffer = '\0'; 41 | } 42 | 43 | void i64toa_naive(int64_t value, char* buffer) { 44 | uint64_t u = static_cast(value); 45 | if (value < 0) { 46 | *buffer++ = '-'; 47 | u = ~u + 1; 48 | } 49 | u64toa_naive(u, buffer); 50 | } 51 | 52 | REGISTER_TEST(naive); 53 | -------------------------------------------------------------------------------- /src/itoa-benchmark/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | 2 | add_executable( 3 | itoa-benchmark 4 | amartin.cpp 5 | branchlut.cpp 6 | branchlut2.cpp 7 | count.cpp 8 | countdecimaldigit.h 9 | countlut.cpp 10 | digitslut.h 11 | folly.cpp 12 | fmt.cpp 13 | itoa_jeaiii.cpp 14 | itoa_jeaiii_bind.cpp 15 | itoa_ljust.h 16 | itoa_ljust_impl.h 17 | lut.cpp 18 | main.cpp 19 | msinttypes 20 | mwilson.cpp 21 | naive.cpp 22 | null.cpp 23 | ostringstream.cpp 24 | ostrstream.cpp 25 | resultfilename.h 26 | sprintf.cpp 27 | sse2.cpp 28 | test.h 29 | timer.h 30 | tmueller.cpp 31 | to_string.cpp 32 | unnamed.cpp 33 | unrolledlut.cpp 34 | vc.cpp) 35 | target_link_libraries(itoa-benchmark fmt) 36 | 37 | if (APPLE) 38 | execute_process(COMMAND sysctl -n machdep.cpu.brand_string 39 | OUTPUT_VARIABLE out) 40 | elseif (UNIX) 41 | file(READ /proc/cpuinfo out) 42 | string(REGEX MATCH "(model name[^\n]*)" out "${out}") 43 | endif () 44 | 45 | string(REGEX REPLACE 46 | "(model name.*: )|[ |\n]+|(Intel\$R\$)|\$TM\$|\$R\$|CPU" "" 47 | out "${out}") 48 | if (out) 49 | 
target_compile_definitions(itoa-benchmark PRIVATE MACHINE="${out}") 50 | endif () 51 | -------------------------------------------------------------------------------- /src/itoa-benchmark/folly.cpp: -------------------------------------------------------------------------------- 1 | #ifdef HAS_FOLLY 2 | 3 | #include "test.h" 4 | #include 5 | 6 | using namespace folly; 7 | 8 | // Refer to the code from Folly/Conv.h 9 | // template 10 | // typename std::enable_if< 11 | // std::is_integral::value && std::is_signed::value && 12 | // IsSomeString::value && sizeof(Src) >= 4>::type 13 | // toAppend(Src value, Tgt * result) 14 | 15 | void u32toa_folly(uint32_t value, char* buffer) { 16 | buffer[uint64ToBufferUnsafe(value, buffer)] = '\0'; 17 | } 18 | 19 | void i32toa_folly(int32_t value, char* buffer) { 20 | if (value < 0) { 21 | *buffer++ = '-'; 22 | buffer[uint64ToBufferUnsafe(-uint64_t(value), buffer)] = '\0'; 23 | } 24 | else 25 | buffer[uint64ToBufferUnsafe(value, buffer)] = '\0'; 26 | } 27 | 28 | void u64toa_folly(uint64_t value, char* buffer) { 29 | buffer[uint64ToBufferUnsafe(value, buffer)] = '\0'; 30 | } 31 | 32 | void i64toa_folly(int64_t value, char* buffer) { 33 | if (value < 0) { 34 | *buffer++ = '-'; 35 | buffer[uint64ToBufferUnsafe(-uint64_t(value), buffer)] = '\0'; 36 | } 37 | else 38 | buffer[uint64ToBufferUnsafe(value, buffer)] = '\0'; 39 | } 40 | 41 | REGISTER_TEST(folly); 42 | 43 | #endif 44 | -------------------------------------------------------------------------------- /src/itoa-benchmark/timer.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #ifdef _WIN32 4 | 5 | #define WIN32_LEAN_AND_MEAN 6 | #include 7 | 8 | class Timer { 9 | public: 10 | Timer() : start_(), end_() { 11 | } 12 | 13 | void Start() { 14 | QueryPerformanceCounter(&start_); 15 | } 16 | 17 | void Stop() { 18 | QueryPerformanceCounter(&end_); 19 | } 20 | 21 | double GetElapsedMilliseconds() { 22 | LARGE_INTEGER freq; 23 | 
QueryPerformanceFrequency(&freq); 24 | return (end_.QuadPart - start_.QuadPart) * 1000.0 / freq.QuadPart; 25 | } 26 | 27 | private: 28 | LARGE_INTEGER start_; 29 | LARGE_INTEGER end_; 30 | }; 31 | 32 | // Undefine the min/max macros so they cannot clash with other uses of those names 33 | #undef min 34 | #undef max 35 | 36 | #else 37 | 38 | #include 39 | 40 | class Timer { 41 | public: 42 | Timer() : start_(), end_() { 43 | } 44 | 45 | void Start() { 46 | gettimeofday(&start_, NULL); 47 | } 48 | 49 | void Stop() { 50 | gettimeofday(&end_, NULL); 51 | } 52 | 53 | double GetElapsedMilliseconds() { 54 | return (end_.tv_sec - start_.tv_sec) * 1000.0 55 | + (end_.tv_usec - start_.tv_usec) / 1000.0; 56 | } 57 | 58 | private: 59 | struct timeval start_; 60 | struct timeval end_; 61 | }; 62 | 63 | #endif 64 | -------------------------------------------------------------------------------- /src/digits10/digits10-benchmark.cc: -------------------------------------------------------------------------------- 1 | #include "digits10.h" 2 | 3 | static void num_digits(benchmark::internal::Benchmark* b) { 4 | for (int i = 1; i <= 10; ++i) b->Arg(i); 5 | } 6 | 7 | static void fmt64(benchmark::State& state) { 8 | run_benchmark(state, digits10_fmt64); 9 | } 10 | BENCHMARK(fmt64)->Apply(num_digits); 11 | 12 | static void jk_jeon(benchmark::State& state) { 13 | run_benchmark(state, digits10_jk_jeon); 14 | } 15 | BENCHMARK(jk_jeon)->Apply(num_digits); 16 | 17 | static void willets(benchmark::State& state) { 18 | run_benchmark(state, digits10_willets); 19 | } 20 | BENCHMARK(willets)->Apply(num_digits); 21 | 22 | static void clz_zverovich(benchmark::State& state) { 23 | run_benchmark(state, digits10_clz_zverovich); 24 | } 25 | BENCHMARK(clz_zverovich)->Apply(num_digits); 26 | 27 | static void grisu(benchmark::State& state) { 28 | run_benchmark(state, digits10_grisu); 29 | } 30 | BENCHMARK(grisu)->Apply(num_digits); 31 | 32 | static void naive(benchmark::State& state) { 33 | run_benchmark(state, digits10_naive); 34 | } 35 | 
BENCHMARK(naive)->Apply(num_digits); 36 | 37 | static void unroll4(benchmark::State& state) { 38 | run_benchmark(state, digits10_unroll4); 39 | } 40 | BENCHMARK(unroll4)->Apply(num_digits); 41 | 42 | static void clz(benchmark::State& state) { run_benchmark(state, digits10_clz); } 43 | BENCHMARK(clz)->Apply(num_digits); 44 | 45 | BENCHMARK_MAIN(); 46 | -------------------------------------------------------------------------------- /src/itoa-benchmark/itoa_jeaiii_bind.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | MIT License 3 | 4 | Copyright(c) 2017 James Edward Anhalt III (jeaiii) 5 | https://github.com/jeaiii/itoa 6 | 7 | Permission is hereby granted, free of charge, to any person obtaining a copy 8 | of this software and associated documentation files(the "Software"), to deal 9 | in the Software without restriction, including without limitation the rights 10 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 11 | copies of the Software, and to permit persons to whom the Software is 12 | furnished to do so, subject to the following conditions : 13 | 14 | The above copyright notice and this permission notice shall be included in all 15 | copies or substantial portions of the Software. 16 | 17 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.IN NO EVENT SHALL THE 20 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 21 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 22 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 | SOFTWARE. 
24 | */ 25 | 26 | #include "test.h" 27 | 28 | void u32toa_jeaiii(uint32_t i, char* b); 29 | void i32toa_jeaiii(int32_t i, char* b); 30 | void u64toa_jeaiii(uint64_t i, char* b); 31 | void i64toa_jeaiii(int64_t i, char* b); 32 | 33 | REGISTER_TEST(jeaiii); 34 | -------------------------------------------------------------------------------- /src/find-pow10-benchmark.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | auto generate_random_data() { 7 | std::random_device rd; 8 | std::mt19937 rng(rd()); 9 | std::uniform_int_distribution uni(-300, 300); 10 | std::vector v; 11 | std::generate_n(std::back_inserter(v), 100'000, [&] { return uni(rng); }); 12 | return v; 13 | } 14 | 15 | auto data = generate_random_data(); 16 | 17 | void find_pow10_ceil(benchmark::State &s) { 18 | size_t result = 0; 19 | while (s.KeepRunning()) { 20 | for (auto i: data) { 21 | const double one_over_log2_10 = 0.30102999566398114; // 1 / log2(10) 22 | int index = static_cast( 23 | std::ceil((i + 64 - 1) * one_over_log2_10)); 24 | result += index; 25 | } 26 | } 27 | benchmark::DoNotOptimize(result); 28 | } 29 | BENCHMARK(find_pow10_ceil); 30 | 31 | void find_pow10_int(benchmark::State &s) { 32 | size_t result = 0; 33 | while (s.KeepRunning()) { 34 | for (auto i: data) { 35 | constexpr std::uint64_t log10_2_up_to_32 = 0x4d104d42; 36 | auto index = int( 37 | // For arithmetic-shift 38 | std::int64_t( 39 | // Calculate 0x0.4d104d42 * exp * 2^32 40 | std::uint64_t(i + 64 - 1) * log10_2_up_to_32 41 | // To perform ceiling 42 | + ((std::uint64_t(1) << 32) - 1) 43 | ) >> 32 44 | ); 45 | result += index; 46 | } 47 | } 48 | benchmark::DoNotOptimize(result); 49 | } 50 | BENCHMARK(find_pow10_int); 51 | 52 | BENCHMARK_MAIN(); 53 | -------------------------------------------------------------------------------- /src/itoa-benchmark/test.h: 
-------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | 7 | struct Test; 8 | typedef std::vector TestList; 9 | class TestManager { 10 | public: 11 | static TestManager& Instance() { 12 | static TestManager singleton; 13 | return singleton; 14 | } 15 | 16 | void AddTest(const Test* test) { 17 | mTests.push_back(test); 18 | } 19 | 20 | const TestList& GetTests() const { 21 | return mTests; 22 | } 23 | 24 | TestList& GetTests() { 25 | return mTests; 26 | } 27 | 28 | private: 29 | TestList mTests; 30 | }; 31 | 32 | struct Test { 33 | Test( 34 | const char* fname, 35 | void (*u32toa)(uint32_t, char*), 36 | void (*i32toa)(int32_t, char*), 37 | void (*u64toa)(uint64_t, char*), 38 | void (*i64toa)(int64_t, char*)) 39 | : 40 | fname(fname), 41 | u32toa(u32toa), 42 | i32toa(i32toa), 43 | u64toa(u64toa), 44 | i64toa(i64toa) 45 | { 46 | TestManager::Instance().AddTest(this); 47 | } 48 | 49 | bool operator<(const Test& rhs) const { 50 | return strcmp(fname, rhs.fname) < 0; 51 | } 52 | 53 | const char* fname; 54 | void (*u32toa)(uint32_t, char*); 55 | void (*i32toa)(int32_t, char*); 56 | void (*u64toa)(uint64_t, char*); 57 | void (*i64toa)(int64_t, char*); 58 | }; 59 | 60 | 61 | #define STRINGIFY(x) #x 62 | #define REGISTER_TEST(f) static Test gRegister##f(STRINGIFY(f), u32toa##_##f, i32toa##_##f, u64toa##_##f, i64toa##_##f) 63 | 64 | -------------------------------------------------------------------------------- /src/itoa-benchmark/resultfilename.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #ifndef MACHINE 4 | #define MACHINE "unknown" 5 | #endif 6 | 7 | #if defined(_WIN64) 8 | # define OS "win64" 9 | #elif defined(_WIN32) 10 | # define OS "win32" 11 | #elif defined(__CYGWIN__) && defined(__x86_64) 12 | # define OS "cygwin64" 13 | #elif defined(__CYGWIN__) 14 | # define OS "cygwin32" 15 | #elif defined(__APPLE__) 16 | # 
include "TargetConditionals.h" 17 | # if TARGET_OS_IPHONE 18 | # ifdef __LP64__ 19 | # define OS "ios64" 20 | # else 21 | # define OS "ios32" 22 | # endif 23 | # elif TARGET_OS_MAC 24 | # ifdef __amd64__ 25 | # define OS "mac64" 26 | # else 27 | # define OS "mac32" 28 | # endif 29 | # endif 30 | #elif defined(__linux) 31 | # ifdef __LP64__ 32 | # define OS "linux64" 33 | # else 34 | # define OS "linux32" 35 | # endif 36 | #endif 37 | 38 | #ifndef OS 39 | #define OS "unknown" 40 | #endif 41 | 42 | #define STR_HELPER(x) #x 43 | #define STR(x) STR_HELPER(x) 44 | 45 | #if defined(_MSC_VER) 46 | # if _MSC_VER >= 1800 47 | # define COMPILER "vc2013" 48 | # elif _MSC_VER >= 1700 49 | # define COMPILER "vc2012" 50 | # elif _MSC_VER >= 1600 51 | # define COMPILER "vc2010" 52 | # elif _MSC_VER >= 1500 53 | # define COMPILER "vc2008" 54 | # elif _MSC_VER >= 1400 55 | # define COMPILER "vc2005" 56 | # else 57 | # define COMPILER "vc" 58 | # endif 59 | #elif defined(__clang__) 60 | # define COMPILER "clang" STR(__clang_major__) "." STR(__clang_minor__) 61 | #elif defined(__GNUC__) 62 | # define COMPILER "gcc" STR(__GNUC__) "." 
STR(__GNUC_MINOR__) 63 | #else 64 | # define COMPILER "Unknown" 65 | #endif 66 | 67 | #define RESULT_FILENAME MACHINE "_" OS "_" COMPILER ".csv" 68 | -------------------------------------------------------------------------------- /src/digits10/digits10-test.cc: -------------------------------------------------------------------------------- 1 | #include "digits10.h" 2 | 3 | #include 4 | 5 | using std::uint32_t; 6 | 7 | void test_digits10(uint32_t (*digits10)(uint32_t n)) { 8 | for (uint32_t i = 0; i < 10; ++i) EXPECT_EQ(1u, digits10(i)); 9 | for (uint32_t i = 1, n = 1, end = std::numeric_limits::max() / 10; 10 | n <= end; ++i) { 11 | n *= 10; 12 | EXPECT_EQ(i, digits10(n - 1)); 13 | EXPECT_EQ(i + 1, digits10(n)); 14 | } 15 | } 16 | 17 | TEST(Digits10Test, Digits10) { 18 | test_digits10(digits10_naive); 19 | test_digits10(digits10_unroll4); 20 | test_digits10(digits10_clz); 21 | test_digits10(digits10_clz_zverovich); 22 | } 23 | 24 | TEST(Digits10Test, MinNumber) { 25 | EXPECT_EQ(0, min_number(1)); 26 | EXPECT_EQ(10, min_number(2)); 27 | EXPECT_EQ(100, min_number(3)); 28 | EXPECT_EQ(1000000000, min_number(10)); 29 | EXPECT_THROW(min_number(0), std::out_of_range); 30 | EXPECT_THROW(min_number(11), std::out_of_range); 31 | } 32 | 33 | TEST(Digits10Test, MaxNumber) { 34 | EXPECT_EQ(9, max_number(1)); 35 | EXPECT_EQ(99, max_number(2)); 36 | EXPECT_EQ(999, max_number(3)); 37 | EXPECT_EQ(999999999, max_number(9)); 38 | EXPECT_EQ(std::numeric_limits::max(), max_number(10)); 39 | EXPECT_THROW(max_number(0), std::out_of_range); 40 | EXPECT_THROW(max_number(11), std::out_of_range); 41 | } 42 | 43 | TEST(Digits10Test, GenerateNumbers) { 44 | const std::size_t size = 100; 45 | auto n1 = generate_numbers(3); 46 | auto n2 = generate_numbers(3); 47 | EXPECT_EQ(size, n1.size()); 48 | for (std::size_t i = 0; i < size; ++i) { 49 | EXPECT_EQ(3, digits10_naive(n1[i])); 50 | EXPECT_EQ(n1[i], n2[i]); 51 | } 52 | } 53 | 54 | int main(int argc, char** argv) { 55 | 
::testing::InitGoogleTest(&argc, argv); 56 | return RUN_ALL_TESTS(); 57 | } 58 | -------------------------------------------------------------------------------- /src/vararg-benchmark.cc: -------------------------------------------------------------------------------- 1 | // Benchmark varargs overhead. 2 | 3 | #include 4 | #include "benchmark/benchmark.h" 5 | #include "fmt/format.h" 6 | 7 | int __attribute__((noinline)) test_vprintf(const char *f, std::va_list) { 8 | benchmark::DoNotOptimize(f); 9 | return 0; 10 | } 11 | 12 | int test_printf(const char *format, ...) { 13 | std::va_list args; 14 | va_start(args, format); 15 | int result = test_vprintf(format, args); 16 | va_end(args); 17 | return result; 18 | } 19 | 20 | void varargs(benchmark::State& state) { 21 | while (state.KeepRunning()) 22 | test_printf("%d", 42); 23 | } 24 | 25 | BENCHMARK(varargs); 26 | 27 | void __attribute__((noinline)) test_vprint(const char *f, fmt::format_args) { 28 | benchmark::DoNotOptimize(f); 29 | } 30 | 31 | template 32 | inline void test_print(const char *format, const Args & ... 
args) { 33 | test_vprint(format, fmt::make_format_args(args...)); 34 | } 35 | 36 | void fmt_variadic(benchmark::State &state) { 37 | while (state.KeepRunning()) 38 | test_print("{}", 42); 39 | } 40 | 41 | BENCHMARK(fmt_variadic); 42 | 43 | void test_sprintf(benchmark::State &state) { 44 | char buffer[64]; 45 | while (state.KeepRunning()) 46 | std::sprintf(buffer, "%d", 42); 47 | } 48 | 49 | BENCHMARK(test_sprintf); 50 | 51 | void test_format(benchmark::State &state) { 52 | while (state.KeepRunning()) 53 | fmt::format("{}", 42); 54 | } 55 | 56 | BENCHMARK(test_format); 57 | 58 | void test_sprintf_pos(benchmark::State &state) { 59 | char buffer[64]; 60 | while (state.KeepRunning()) 61 | std::sprintf(buffer, "%1$d", 42); 62 | } 63 | 64 | BENCHMARK(test_sprintf_pos); 65 | 66 | void test_format_pos(benchmark::State &state) { 67 | while (state.KeepRunning()) 68 | fmt::format("{0}", 42); 69 | } 70 | 71 | BENCHMARK(test_format_pos); 72 | 73 | BENCHMARK_MAIN(); 74 | -------------------------------------------------------------------------------- /src/itoa-benchmark/lut.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include "digitslut.h" 3 | #include "test.h" 4 | 5 | // Use lookup table of two digits 6 | 7 | void u32toa_lut(uint32_t value, char* buffer) { 8 | char temp[10]; 9 | char* p = temp; 10 | 11 | while (value >= 100) { 12 | const unsigned i = (value % 100) << 1; 13 | value /= 100; 14 | *p++ = gDigitsLut[i + 1]; 15 | *p++ = gDigitsLut[i]; 16 | } 17 | 18 | if (value < 10) 19 | *p++ = char(value) + '0'; 20 | else { 21 | const unsigned i = value << 1; 22 | *p++ = gDigitsLut[i + 1]; 23 | *p++ = gDigitsLut[i]; 24 | } 25 | 26 | do { 27 | *buffer++ = *--p; 28 | } while (p != temp); 29 | 30 | *buffer = '\0'; 31 | } 32 | 33 | void i32toa_lut(int32_t value, char* buffer) { 34 | uint32_t u = static_cast(value); 35 | if (value < 0) { 36 | *buffer++ = '-'; 37 | u = ~u + 1; 38 | } 39 | u32toa_lut(u, buffer); 40 | } 41 | 42 | 
void u64toa_lut(uint64_t value, char* buffer) { 43 | char temp[20]; 44 | char* p = temp; 45 | 46 | while (value >= 100) { 47 | const unsigned i = static_cast(value % 100) << 1; 48 | value /= 100; 49 | *p++ = gDigitsLut[i + 1]; 50 | *p++ = gDigitsLut[i]; 51 | } 52 | 53 | if (value < 10) 54 | *p++ = char(value) + '0'; 55 | else { 56 | const unsigned i = static_cast(value) << 1; 57 | *p++ = gDigitsLut[i + 1]; 58 | *p++ = gDigitsLut[i]; 59 | } 60 | 61 | do { 62 | *buffer++ = *--p; 63 | } while (p != temp); 64 | 65 | *buffer = '\0'; 66 | } 67 | 68 | void i64toa_lut(int64_t value, char* buffer) { 69 | uint64_t u = static_cast(value); 70 | if (value < 0) { 71 | *buffer++ = '-'; 72 | u = ~u + 1; 73 | } 74 | u64toa_lut(u, buffer); 75 | } 76 | 77 | REGISTER_TEST(lut); 78 | -------------------------------------------------------------------------------- /src/file-benchmark.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | #include 7 | 8 | auto test_data = "test data"; 9 | auto num_iters = 1'000'000; 10 | 11 | const char* removed(benchmark::State& state, const char* path) { 12 | state.PauseTiming(); 13 | std::remove(path); 14 | state.ResumeTiming(); 15 | return path; 16 | } 17 | 18 | void fprintf(benchmark::State& state) { 19 | for (auto s : state) { 20 | auto f = fopen(removed(state, "/tmp/fprintf-test"), "wb"); 21 | for (int i = 0; i < num_iters; ++i) fprintf(f, "%s\n", test_data); 22 | fclose(f); 23 | } 24 | } 25 | BENCHMARK(fprintf); 26 | 27 | void std_ofstream(benchmark::State& state) { 28 | for (auto s : state) { 29 | auto os = 30 | std::ofstream(removed(state, "/tmp/ofstream-test"), std::ios::binary); 31 | for (int i = 0; i < num_iters; ++i) os << test_data << '\n'; 32 | } 33 | } 34 | BENCHMARK(std_ofstream); 35 | 36 | /*void fmt_print_compile(benchmark::State& state) { 37 | for (auto s : state) { 38 | auto f = fmt::output_file(removed(state, "/tmp/fmt-compile-test"), 39 | 
fmt::buffer_size=state.range(0)); 40 | for (int i = 0; i < num_iters; ++i) 41 | f.print(FMT_COMPILE("{}\n"), test_data); 42 | } 43 | } 44 | BENCHMARK(fmt_print_compile)->RangeMultiplier(2)->Range(BUFSIZ, 1 << 20);*/ 45 | 46 | void fmt_print_runtime(benchmark::State& state) { 47 | for (auto s : state) { 48 | auto f = fmt::output_file(removed(state, "/tmp/fmt-runtime-test"), 49 | fmt::buffer_size = state.range(0)); 50 | for (int i = 0; i < num_iters; ++i) f.print("{}\n", test_data); 51 | } 52 | } 53 | BENCHMARK(fmt_print_runtime)->RangeMultiplier(2)->Range(BUFSIZ, 1 << 20); 54 | 55 | /*void fmt_print_compile_default(benchmark::State& state) { 56 | for (auto s : state) { 57 | auto f = fmt::output_file(removed(state, "/tmp/fmt-compile-default-test")); 58 | for (int i = 0; i < num_iters; ++i) f.print(FMT_COMPILE("{}\n"), test_data); 59 | } 60 | } 61 | BENCHMARK(fmt_print_compile_default);*/ 62 | 63 | BENCHMARK_MAIN(); 64 | -------------------------------------------------------------------------------- /src/itoa-benchmark/branchlut2.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include "digitslut.h" 3 | #include "test.h" 4 | 5 | 6 | #define BEGIN2(n) \ 7 | do { \ 8 | int t = (n); \ 9 | if(t < 10) *p++ = '0' + t; \ 10 | else { \ 11 | t *= 2; \ 12 | *p++ = gDigitsLut[t]; \ 13 | *p++ = gDigitsLut[t + 1]; \ 14 | } \ 15 | } while(0) 16 | #define MIDDLE2(n) \ 17 | do { \ 18 | int t = (n) * 2; \ 19 | *p++ = gDigitsLut[t]; \ 20 | *p++ = gDigitsLut[t + 1]; \ 21 | } while(0) 22 | #define BEGIN4(n) \ 23 | do { \ 24 | int t4 = (n); \ 25 | if(t4 < 100) BEGIN2(t4); \ 26 | else { BEGIN2(t4 / 100); MIDDLE2(t4 % 100); } \ 27 | } while(0) 28 | #define MIDDLE4(n) \ 29 | do { \ 30 | int t4 = (n); \ 31 | MIDDLE2(t4 / 100); MIDDLE2(t4 % 100); \ 32 | } while(0) 33 | #define BEGIN8(n) \ 34 | do { \ 35 | uint32_t t8 = (n); \ 36 | if(t8 < 10000) BEGIN4(t8); \ 37 | else { BEGIN4(t8 / 10000); MIDDLE4(t8 % 10000); } \ 38 | } while(0) 39 | 
#define MIDDLE8(n) \ 40 | do { \ 41 | uint32_t t8 = (n); \ 42 | MIDDLE4(t8 / 10000); MIDDLE4(t8 % 10000); \ 43 | } while(0) 44 | #define MIDDLE16(n) \ 45 | do { \ 46 | uint64_t t16 = (n); \ 47 | MIDDLE8(t16 / 100000000); MIDDLE8(t16 % 100000000); \ 48 | } while(0) 49 | 50 | void u32toa_branchlut2(uint32_t x, char* p) { 51 | 52 | if(x < 100000000) BEGIN8(x); 53 | else { BEGIN2(x / 100000000); MIDDLE8(x % 100000000); } 54 | *p = 0; 55 | 56 | } 57 | void i32toa_branchlut2(int32_t x, char* p) { 58 | 59 | uint64_t t; 60 | if(x >= 0) t = x; 61 | else *p++ = '-', t = -uint32_t(x); 62 | u32toa_branchlut2(t, p); 63 | 64 | } 65 | void u64toa_branchlut2(uint64_t x, char* p) { 66 | 67 | if(x < 100000000) BEGIN8(x); 68 | else if(x < 10000000000000000) { BEGIN8(x / 100000000); MIDDLE8(x % 100000000); } 69 | else { BEGIN4(x / 10000000000000000); MIDDLE16(x % 10000000000000000); } 70 | *p = 0; 71 | 72 | } 73 | void i64toa_branchlut2(int64_t x, char* p) { 74 | 75 | uint64_t t; 76 | if(x >= 0) t = x; 77 | else *p++ = '-', t = -uint64_t(x); 78 | u64toa_branchlut2(t, p); 79 | 80 | } 81 | 82 | 83 | REGISTER_TEST(branchlut2); 84 | -------------------------------------------------------------------------------- /src/itoa-benchmark/countlut.cpp: -------------------------------------------------------------------------------- 1 | #include "countdecimaldigit.h" 2 | #include "digitslut.h" 3 | #include "test.h" 4 | 5 | // Adds an extra pass that counts the number of decimal digits first. 6 | // Uses the two-digit lookup table gDigitsLut. 7 | 8 | void u32toa_countlut(uint32_t value, char* buffer) { 9 | unsigned digit = CountDecimalDigit32(value); 10 | buffer += digit; 11 | *buffer = '\0'; 12 | 13 | while (value >= 100) { 14 | const unsigned i = (value % 100) << 1; 15 | value /= 100; 16 | *--buffer = gDigitsLut[i + 1]; 17 | *--buffer = gDigitsLut[i]; 18 | } 19 | 20 | if (value < 10) { 21 | *--buffer = char(value) + '0'; 22 | } 23 | else { 24 | const unsigned i = value << 1; 25 | *--buffer = gDigitsLut[i + 1]; 26 | *--buffer = 
gDigitsLut[i]; 27 | } 28 | } 29 | 30 | void i32toa_countlut(int32_t value, char* buffer) { 31 | uint32_t u = static_cast(value); 32 | if (value < 0) { 33 | *buffer++ = '-'; 34 | u = ~u + 1; 35 | } 36 | u32toa_countlut(u, buffer); 37 | } 38 | 39 | void u64toa_countlut(uint64_t value, char* buffer) { 40 | unsigned digit = CountDecimalDigit64(value); 41 | buffer += digit; 42 | *buffer = '\0'; 43 | 44 | while (value >= 100000000) { 45 | const uint32_t a = static_cast(value % 100000000); 46 | value /= 100000000; 47 | 48 | const uint32_t b = a / 10000; 49 | const uint32_t c = a % 10000; 50 | 51 | const uint32_t b1 = (b / 100) << 1; 52 | const uint32_t b2 = (b % 100) << 1; 53 | const uint32_t c1 = (c / 100) << 1; 54 | const uint32_t c2 = (c % 100) << 1; 55 | 56 | buffer -= 8; 57 | 58 | buffer[0] = gDigitsLut[b1]; 59 | buffer[1] = gDigitsLut[b1 + 1]; 60 | buffer[2] = gDigitsLut[b2]; 61 | buffer[3] = gDigitsLut[b2 + 1]; 62 | buffer[4] = gDigitsLut[c1]; 63 | buffer[5] = gDigitsLut[c1 + 1]; 64 | buffer[6] = gDigitsLut[c2]; 65 | buffer[7] = gDigitsLut[c2 + 1]; 66 | } 67 | 68 | uint32_t value32 = static_cast(value); 69 | while (value32 >= 100) { 70 | const unsigned i = static_cast(value32 % 100) << 1; 71 | value32 /= 100; 72 | *--buffer = gDigitsLut[i + 1]; 73 | *--buffer = gDigitsLut[i]; 74 | } 75 | 76 | if (value32 < 10) { 77 | *--buffer = char(value32) + '0'; 78 | } 79 | else { 80 | const unsigned i = static_cast(value32) << 1; 81 | *--buffer = gDigitsLut[i + 1]; 82 | *--buffer = gDigitsLut[i]; 83 | } 84 | } 85 | 86 | void i64toa_countlut(int64_t value, char* buffer) { 87 | uint64_t u = static_cast(value); 88 | if (value < 0) { 89 | *buffer++ = '-'; 90 | u = ~u + 1; 91 | } 92 | u64toa_countlut(u, buffer); 93 | } 94 | 95 | REGISTER_TEST(countlut); 96 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | A collection of formatting benchmarks 2 | 
===================================== 3 | 4 | * Speed, compile time and code bloat tests from 5 | `tinyformat <https://github.com/c42f/tinyformat>`__. 6 | * ``int-benchmark``: decimal integer to string conversion benchmark from Boost Karma 7 | * ``itoa-benchmark``: decimal integer to string conversion benchmark by Milo Yip. See `<src/itoa-benchmark/readme.md>`__. 8 | 9 | Building and running ``int-benchmark``: 10 | 11 | .. code:: 12 | 13 | cmake . 14 | make 15 | ./int-benchmark 16 | 17 | Alternatively (requires ``ninja``):: 18 | 19 | cmake -G Ninja . 20 | ninja int-benchmark 21 | ./int-benchmark 22 | 23 | Sample results on macOS with clang and libc++: 24 | 25 | .. code:: 26 | 27 | 2020-06-25 18:35:12 28 | Running ./int-benchmark 29 | Run on (8 X 2800 MHz CPU s) 30 | CPU Caches: 31 | L1 Data 32K (x4) 32 | L1 Instruction 32K (x4) 33 | L2 Unified 262K (x4) 34 | L3 Unified 8388K (x1) 35 | Load Average: 3.05, 2.27, 2.05 36 | -------------------------------------------------------------------------------- 37 | Benchmark Time CPU Iterations UserCounters... 38 | -------------------------------------------------------------------------------- 39 | sprintf 62015560 ns 61939300 ns 10 items_per_second=16.1448M/s 40 | std_ostringstream 153347073 ns 153167000 ns 4 items_per_second=6.52882M/s 41 | std_to_string 15058924 ns 15052045 ns 44 items_per_second=66.4362M/s 42 | std_to_chars 10995660 ns 10991238 ns 63 items_per_second=90.9816M/s 43 | fmt_to_string 11909200 ns 11905632 ns 57 items_per_second=83.9939M/s 44 | fmt_format_runtime 17071781 ns 17062878 ns 41 items_per_second=58.6068M/s 45 | fmt_format_compile 11897635 ns 11893517 ns 58 items_per_second=84.0794M/s 46 | fmt_format_to_runtime 13539784 ns 13534137 ns 51 items_per_second=73.8872M/s 47 | fmt_format_to_compile 8941199 ns 8937675 ns 77 items_per_second=111.886M/s 48 | fmt_format_int 8721323 ns 8718253 ns 79 items_per_second=114.702M/s 49 | boost_lexical_cast 29685237 ns 29668455 ns 22 items_per_second=33.7058M/s 50 | boost_format 315875676 ns 315739000 ns 2 items_per_second=3.16717M/s 
51 | boost_karma_generate 12138668 ns 12134518 ns 56 items_per_second=82.4095M/s 52 | voigt_itostr 20177849 ns 19994243 ns 37 items_per_second=50.0144M/s 53 | u2985907 9445288 ns 9427174 ns 69 items_per_second=106.076M/s 54 | decimal_from 13009338 ns 12952232 ns 56 items_per_second=77.2068M/s 55 | stout_ltoa 39336210 ns 39325000 ns 18 items_per_second=25.4291M/s 56 | 57 | 58 | -------------------------------------------------------------------------------- /src/concat-benchmark.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #include 5 | 6 | std::string str1 = "label"; 7 | std::string str2 = "data1"; 8 | std::string str3 = "data2"; 9 | std::string str4 = "data3"; 10 | std::string str5 = "delim"; 11 | 12 | void naive(benchmark::State& state) { 13 | benchmark::ClobberMemory(); 14 | for (auto _ : state) { 15 | std::string output = "Result: " + str1 + ": (" + str2 + ',' + str3 + ',' + 16 | str4 + ',' + str5 + ')'; 17 | benchmark::DoNotOptimize(output.data()); 18 | } 19 | } 20 | BENCHMARK(naive); 21 | 22 | void append(benchmark::State& state) { 23 | benchmark::ClobberMemory(); 24 | for (auto _ : state) { 25 | std::string output = "Result: "; 26 | output += str1; 27 | output += ": ("; 28 | output += str2; 29 | output += ','; 30 | output += str3; 31 | output += ','; 32 | output += str4; 33 | output += ','; 34 | output += str5; 35 | output += ')'; 36 | benchmark::DoNotOptimize(output.data()); 37 | } 38 | } 39 | BENCHMARK(append); 40 | 41 | void appendWithReserve(benchmark::State& state) { 42 | benchmark::ClobberMemory(); 43 | for (auto _ : state) { 44 | std::string output = "Result: "; 45 | output.reserve(str1.length() + str2.length() + str3.length() + 46 | str4.length() + str5.length() + 16); 47 | output += str1; 48 | output += ": ("; 49 | output += str2; 50 | output += ','; 51 | output += str3; 52 | output += ','; 53 | output += str4; 54 | output += ','; 55 | output += str5; 56 | output += ')'; 57 | 
benchmark::DoNotOptimize(output.data()); 58 | } 59 | } 60 | BENCHMARK(appendWithReserve); 61 | 62 | void format_compile(benchmark::State& state) { 63 | benchmark::ClobberMemory(); 64 | for (auto _ : state) { 65 | auto output = fmt::format(FMT_COMPILE("Result: {}: ({},{},{},{})"), str1, 66 | str2, str3, str4, str5); 67 | benchmark::DoNotOptimize(output.data()); 68 | } 69 | } 70 | BENCHMARK(format_compile); 71 | 72 | void format_runtime(benchmark::State& state) { 73 | benchmark::ClobberMemory(); 74 | for (auto _ : state) { 75 | auto output = 76 | fmt::format("Result: {}: ({},{},{},{})", str1, str2, str3, str4, str5); 77 | benchmark::DoNotOptimize(output.data()); 78 | } 79 | } 80 | BENCHMARK(format_runtime); 81 | 82 | void format_to(benchmark::State& state) { 83 | benchmark::ClobberMemory(); 84 | for (auto _ : state) { 85 | fmt::memory_buffer output; 86 | fmt::format_to(std::back_inserter(output), "Result: {}: ({},{},{},{})", str1, str2, str3, str4, 87 | str5); 88 | benchmark::DoNotOptimize(output.data()); 89 | } 90 | } 91 | BENCHMARK(format_to); 92 | 93 | void nullop(benchmark::State& state) { 94 | for (auto _ : state) { 95 | benchmark::ClobberMemory(); 96 | } 97 | } 98 | BENCHMARK(nullop); 99 | 100 | BENCHMARK_MAIN(); 101 | -------------------------------------------------------------------------------- /src/itoa-benchmark/countdecimaldigit.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #ifdef _MSC_VER 5 | #include "intrin.h" 6 | #endif 7 | 8 | inline unsigned CountDecimalDigit32(uint32_t n) { 9 | #if defined(_MSC_VER) || defined(__GNUC__) 10 | static const uint32_t powers_of_10[] = { 11 | 0, 12 | 10, 13 | 100, 14 | 1000, 15 | 10000, 16 | 100000, 17 | 1000000, 18 | 10000000, 19 | 100000000, 20 | 1000000000 21 | }; 22 | 23 | #ifdef _MSC_VER 24 | unsigned long i = 0; 25 | _BitScanReverse(&i, n | 1); 26 | uint32_t t = (i + 1) * 1233 >> 12; 27 | #elif __GNUC__ 28 | uint32_t t = (32 - 
__builtin_clz(n | 1)) * 1233 >> 12; 29 | #endif 30 | return t - (n < powers_of_10[t]) + 1; 31 | #else 32 | // Simple pure C++ implementation 33 | if (n < 10) return 1; 34 | if (n < 100) return 2; 35 | if (n < 1000) return 3; 36 | if (n < 10000) return 4; 37 | if (n < 100000) return 5; 38 | if (n < 1000000) return 6; 39 | if (n < 10000000) return 7; 40 | if (n < 100000000) return 8; 41 | if (n < 1000000000) return 9; 42 | return 10; 43 | #endif 44 | } 45 | 46 | inline unsigned CountDecimalDigit64(uint64_t n) { 47 | #if defined(_MSC_VER) || defined(__GNUC__) 48 | static const uint64_t powers_of_10[] = { 49 | 0, 50 | 10, 51 | 100, 52 | 1000, 53 | 10000, 54 | 100000, 55 | 1000000, 56 | 10000000, 57 | 100000000, 58 | 1000000000, 59 | 10000000000, 60 | 100000000000, 61 | 1000000000000, 62 | 10000000000000, 63 | 100000000000000, 64 | 1000000000000000, 65 | 10000000000000000, 66 | 100000000000000000, 67 | 1000000000000000000, 68 | 10000000000000000000U 69 | }; 70 | 71 | #if __GNUC__ 72 | uint32_t t = (64 - __builtin_clzll(n | 1)) * 1233 >> 12; 73 | #elif _M_IX86 74 | unsigned long i = 0; 75 | uint64_t m = n | 1; 76 | if (_BitScanReverse(&i, m >> 32)) 77 | i += 32; 78 | else 79 | _BitScanReverse(&i, m & 0xFFFFFFFF); 80 | uint32_t t = (i + 1) * 1233 >> 12; 81 | #elif _M_X64 82 | unsigned long i = 0; 83 | _BitScanReverse64(&i, n | 1); 84 | uint32_t t = (i + 1) * 1233 >> 12; 85 | #endif 86 | return t - (n < powers_of_10[t]) + 1; 87 | #else 88 | // Simple pure C++ implementation 89 | if (n < 10) return 1; 90 | if (n < 100) return 2; 91 | if (n < 1000) return 3; 92 | if (n < 10000) return 4; 93 | if (n < 100000) return 5; 94 | if (n < 1000000) return 6; 95 | if (n < 10000000) return 7; 96 | if (n < 100000000) return 8; 97 | if (n < 1000000000) return 9; 98 | if (n < 10000000000) return 10; 99 | if (n < 100000000000) return 11; 100 | if (n < 1000000000000) return 12; 101 | if (n < 10000000000000) return 13; 102 | if (n < 100000000000000) return 14; 103 | if (n < 1000000000000000) 
return 15; 104 | if (n < 10000000000000000) return 16; 105 | if (n < 100000000000000000) return 17; 106 | if (n < 1000000000000000000) return 18; 107 | if (n < 10000000000000000000) return 19; 108 | return 20; 109 | #endif 110 | } 111 | -------------------------------------------------------------------------------- /src/locale-benchmark.cc: -------------------------------------------------------------------------------- 1 | // A locale-aware integer formatting benchmark. 2 | // 3 | // Copyright (c) 2019 - present, Victor Zverovich 4 | // All rights reserved. 5 | 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | 16 | #include 17 | #include 18 | 19 | struct separate_thousands : std::numpunct { 20 | char do_thousands_sep() const { return ','; } 21 | std::string do_grouping() const { return "\3"; } 22 | }; 23 | 24 | struct Data { 25 | std::vector values; 26 | size_t total_length; 27 | 28 | auto begin() const { return values.begin(); } 29 | auto end() const { return values.end(); } 30 | 31 | // Prints the number of values by digit count, e.g. 
32 | // 1 27263 33 | // 2 247132 34 | // 3 450601 35 | // 4 246986 36 | // 5 25188 37 | // 6 2537 38 | // 7 251 39 | // 8 39 40 | // 9 2 41 | // 10 1 42 | void print_digit_counts() const { 43 | int counts[11] = {}; 44 | for (auto value : values) 45 | ++counts[fmt::format_int(value).size()]; 46 | fmt::print("The number of values by digit count:\n"); 47 | for (int i = 1; i < 11; ++i) 48 | fmt::print("{:2} {:6}\n", i, counts[i]); 49 | } 50 | 51 | Data() : values(1'000'000) { 52 | // Same data as in Boost Karma int generator test: 53 | // https://www.boost.org/doc/libs/1_63_0/libs/spirit/workbench/karma/int_generator.cpp 54 | std::srand(0); 55 | std::generate(values.begin(), values.end(), []() { 56 | int scale = std::rand() / 100 + 1; 57 | return (std::rand() * std::rand()) / scale; 58 | }); 59 | std::ostringstream os; 60 | os.imbue(std::locale(std::locale(), new separate_thousands())); 61 | total_length = 62 | std::accumulate(begin(), end(), size_t(), [&](size_t lhs, int rhs) { 63 | os.str(std::string()); 64 | os << rhs; 65 | return lhs + os.str().size(); 66 | }); 67 | print_digit_counts(); 68 | } 69 | } data; 70 | 71 | void finalize(benchmark::State& state, size_t result) { 72 | auto expected = state.iterations() * data.total_length; 73 | if (result != expected) { 74 | throw std::logic_error( 75 | fmt::format("invalid length: {} != {}", result, expected)); 76 | } 77 | state.SetItemsProcessed(state.iterations() * data.values.size()); 78 | benchmark::DoNotOptimize(result); 79 | } 80 | 81 | void ostringstream(benchmark::State& state) { 82 | size_t result = 0; 83 | std::ostringstream os; 84 | os.imbue(std::locale(std::locale(), new separate_thousands())); 85 | while (state.KeepRunning()) { 86 | for (auto value : data) { 87 | os.str(std::string()); 88 | os << value; 89 | result += os.str().size(); 90 | } 91 | } 92 | finalize(state, result); 93 | } 94 | BENCHMARK(ostringstream); 95 | 96 | void format_locale(benchmark::State& state) { 97 | size_t result = 0; 98 | auto loc = 
std::locale(std::locale(), new separate_thousands()); 99 | while (state.KeepRunning()) { 100 | for (auto value : data) result += fmt::format(loc, "{:L}", value).size(); 101 | } 102 | finalize(state, result); 103 | } 104 | BENCHMARK(format_locale); 105 | 106 | BENCHMARK_MAIN(); 107 | -------------------------------------------------------------------------------- /src/itoa-benchmark/itoa_jeaiii.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | MIT License 3 | 4 | Copyright (c) 2017 James Edward Anhalt III (jeaiii) 5 | https://github.com/jeaiii/itoa 6 | 7 | Permission is hereby granted, free of charge, to any person obtaining a copy 8 | of this software and associated documentation files (the "Software"), to deal 9 | in the Software without restriction, including without limitation the rights 10 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 11 | copies of the Software, and to permit persons to whom the Software is 12 | furnished to do so, subject to the following conditions: 13 | 14 | The above copyright notice and this permission notice shall be included in all 15 | copies or substantial portions of the Software. 16 | 17 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 20 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 21 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 22 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 | SOFTWARE. 
24 | */ 25 | 26 | #include 27 | 28 | #define A(N) t = (1ULL << (32 + N / 5 * N * 53 / 16)) / uint32_t(1e##N) + 1 - N / 9, t *= u, t >>= N / 5 * N * 53 / 16, t += N / 5 * 4 29 | 30 | #if 0 31 | // 1 char at a time 32 | 33 | #define D(N) b[N] = char(t >> 32) + '0' 34 | #define E t = 10ULL * uint32_t(t) 35 | 36 | #define L0 b[0] = char(u) + '0' 37 | #define L1 A(1), D(0), E, D(1) 38 | #define L2 A(2), D(0), E, D(1), E, D(2) 39 | #define L3 A(3), D(0), E, D(1), E, D(2), E, D(3) 40 | #define L4 A(4), D(0), E, D(1), E, D(2), E, D(3), E, D(4) 41 | #define L5 A(5), D(0), E, D(1), E, D(2), E, D(3), E, D(4), E, D(5) 42 | #define L6 A(6), D(0), E, D(1), E, D(2), E, D(3), E, D(4), E, D(5), E, D(6) 43 | #define L7 A(7), D(0), E, D(1), E, D(2), E, D(3), E, D(4), E, D(5), E, D(6), E, D(7) 44 | #define L8 A(8), D(0), E, D(1), E, D(2), E, D(3), E, D(4), E, D(5), E, D(6), E, D(7), E, D(8) 45 | #define L9 A(9), D(0), E, D(1), E, D(2), E, D(3), E, D(4), E, D(5), E, D(6), E, D(7), E, D(8), E, D(9) 46 | 47 | #else 48 | // 2 chars at a time, little endian only, unaligned 2 byte writes 49 | 50 | static const uint16_t s_100s[] = { 51 | '00', '10', '20', '30', '40', '50', '60', '70', '80', '90', 52 | '01', '11', '21', '31', '41', '51', '61', '71', '81', '91', 53 | '02', '12', '22', '32', '42', '52', '62', '72', '82', '92', 54 | '03', '13', '23', '33', '43', '53', '63', '73', '83', '93', 55 | '04', '14', '24', '34', '44', '54', '64', '74', '84', '94', 56 | '05', '15', '25', '35', '45', '55', '65', '75', '85', '95', 57 | '06', '16', '26', '36', '46', '56', '66', '76', '86', '96', 58 | '07', '17', '27', '37', '47', '57', '67', '77', '87', '97', 59 | '08', '18', '28', '38', '48', '58', '68', '78', '88', '98', 60 | '09', '19', '29', '39', '49', '59', '69', '79', '89', '99', 61 | }; 62 | 63 | #define W(N, I) *(uint16_t*)&b[N] = s_100s[I] 64 | #define Q(N) b[N] = char((10ULL * uint32_t(t)) >> 32) + '0' 65 | #define D(N) W(N, t >> 32) 66 | #define E t = 100ULL * uint32_t(t) 67 | 68 | #define L0 
b[0] = char(u) + '0' 69 | #define L1 W(0, u) 70 | #define L2 A(1), D(0), Q(2) 71 | #define L3 A(2), D(0), E, D(2) 72 | #define L4 A(3), D(0), E, D(2), Q(4) 73 | #define L5 A(4), D(0), E, D(2), E, D(4) 74 | #define L6 A(5), D(0), E, D(2), E, D(4), Q(6) 75 | #define L7 A(6), D(0), E, D(2), E, D(4), E, D(6) 76 | #define L8 A(7), D(0), E, D(2), E, D(4), E, D(6), Q(8) 77 | #define L9 A(8), D(0), E, D(2), E, D(4), E, D(6), E, D(8) 78 | 79 | #endif 80 | 81 | #define LN(N) (L##N, b += N + 1) 82 | #define LZ(N) (L##N, b[N + 1] = '\0') 83 | #define LG(F) (u<100 ? u<10 ? F(0) : F(1) : u<1000000 ? u<10000 ? u<1000 ? F(2) : F(3) : u<100000 ? F(4) : F(5) : u<100000000 ? u<10000000 ? F(6) : F(7) : u<1000000000 ? F(8) : F(9)) 84 | 85 | void u32toa_jeaiii(uint32_t u, char* b) 86 | { 87 | uint64_t t; 88 | LG(LZ); 89 | } 90 | 91 | void i32toa_jeaiii(int32_t i, char* b) 92 | { 93 | uint32_t u = i < 0 ? *b++ = '-', 0 - uint32_t(i) : i; 94 | uint64_t t; 95 | LG(LZ); 96 | } 97 | 98 | void u64toa_jeaiii(uint64_t n, char* b) 99 | { 100 | uint32_t u; 101 | uint64_t t; 102 | 103 | if (uint32_t(n >> 32) == 0) 104 | return u = uint32_t(n), (void)LG(LZ); 105 | 106 | uint64_t a = n / 100000000; 107 | 108 | if (uint32_t(a >> 32) == 0) 109 | { 110 | u = uint32_t(a); 111 | LG(LN); 112 | } 113 | else 114 | { 115 | u = uint32_t(a / 100000000); 116 | LG(LN); 117 | u = a % 100000000; 118 | LN(7); 119 | } 120 | 121 | u = n % 100000000; 122 | LZ(7); 123 | } 124 | 125 | void i64toa_jeaiii(int64_t i, char* b) 126 | { 127 | uint64_t n = i < 0 ? 
*b++ = '-', 0 - uint64_t(i) : i; 128 | u64toa_jeaiii(n, b); 129 | } 130 | -------------------------------------------------------------------------------- /src/itoa-benchmark/tmueller.cpp: -------------------------------------------------------------------------------- 1 | // #include "itoa_ljust_impl.h" 2 | #include "test.h" 3 | 4 | #if defined(__GNUC__) || defined(__clang__) 5 | #define t_likely(expr) __builtin_expect(expr, 1) 6 | #define t_unlikely(expr) __builtin_expect(expr, 0) 7 | #else 8 | #define t_likely(expr) (expr) 9 | #define t_unlikely(expr) (expr) 10 | #endif 11 | 12 | #define t_inline __attribute__((always_inline)) 13 | 14 | static const char DIGITS[] = 15 | "0001020304050607080910111213141516171819" 16 | "2021222324252627282930313233343536373839" 17 | "4041424344454647484950515253545556575859" 18 | "6061626364656667686970717273747576777879" 19 | "8081828384858687888990919293949596979899"; 20 | 21 | t_inline void uint32toa_tmueller(uint64_t x, char* out) { 22 | char* str = out; 23 | if (t_likely(x < 100000)) { 24 | if (t_likely(x) < 1000) { 25 | if (t_likely(x) < 10) { 26 | *str++ = (char) ('0' + x); 27 | *str = 0; 28 | return; 29 | } 30 | uint32_t inc = 0; 31 | x = (x * ((0xffffUL / 100UL) + 1)); 32 | uint32_t d; 33 | d = (x >> 16); *str = (char) ('0' | d); inc |= -d; str += inc >> 31; 34 | x = (x & 0xffffUL) * 10; 35 | d = (x >> 16); *str = (char) ('0' | d); inc |= -d; str += inc >> 31; 36 | x = (x & 0xffffUL) * 10; 37 | *str++ = (char) ('0' + (x >> 16)); 38 | *str = 0; 39 | } else { 40 | uint32_t inc = 0; 41 | x = (x * ((0xffffffffUL / 10000UL) + 1)); 42 | uint32_t d; 43 | d = (x >> 32); *str = (char) ('0' | d); inc |= -d; str += inc >> 31; 44 | x = (x & 0xffffffffUL) * 100; 45 | memcpy(str, DIGITS + (x >> 32) * 2, 2); str += 2; 46 | x = (x & 0xffffffffUL) * 100; 47 | memcpy(str, DIGITS + (x >> 32) * 2, 2); str += 2; 48 | *str = 0; 49 | } 50 | } else { 51 | if (t_likely(x < 10000000)) { 52 | uint32_t inc = 0; 53 | x = (x * ((0xfffffffffffUL / 
1000000) + 1)); 54 | uint32_t d; 55 | d = (x >> 44); *str = (char) ('0' | d); inc |= -d; str += inc >> 31; 56 | x = (x & 0xfffffffffffUL) * 100; 57 | memcpy(str, DIGITS + (x >> 44) * 2, 2); str += 2; 58 | x = (x & 0xfffffffffffUL) * 100; 59 | memcpy(str, DIGITS + (x >> 44) * 2, 2); str += 2; 60 | x = (x & 0xfffffffffffUL) * 100; 61 | memcpy(str, DIGITS + (x >> 44) * 2, 2); str += 2; 62 | *str = 0; 63 | } else { 64 | uint32_t inc = 0; 65 | x = (((x * 2305843009L) >> 29) + 4); 66 | uint32_t d; 67 | d = (x >> 32); *str = (char) ('0' | d); inc |= -d; str += inc >> 31; 68 | x = (x & 0xffffffffUL) * 10; 69 | d = (x >> 32); *str = (char) ('0' | d); inc |= -d; str += inc >> 31; 70 | x = (x & 0xffffffffUL) * 100; 71 | memcpy(str, DIGITS + (x >> 32) * 2, 2); str += 2; 72 | x = (x & 0xffffffffUL) * 100; 73 | memcpy(str, DIGITS + (x >> 32) * 2, 2); str += 2; 74 | x = (x & 0xffffffffUL) * 100; 75 | memcpy(str, DIGITS + (x >> 32) * 2, 2); str += 2; 76 | x = (x & 0xffffffffUL) * 100; 77 | memcpy(str, DIGITS + (x >> 32) * 2, 2); str += 2; 78 | *str = 0; 79 | } 80 | } 81 | } 82 | 83 | void u32toa_tmueller(uint32_t v, char* out) { 84 | uint32toa_tmueller(v, out); 85 | } 86 | 87 | void i32toa_tmueller( int32_t v, char* out) { 88 | // branchless (from amartin) 89 | *out = '-'; 90 | uint32_t mask = v < 0 ? 
~(int32_t) 0 : 0; 91 | uint32_t u = ((2 * (uint32_t)(v)) & ~mask) - v; 92 | out += mask & 1; 93 | uint64_t x = u; 94 | uint32toa_tmueller(x, out); 95 | } 96 | 97 | static const uint64_t POW_10[] = {1, 10, 100, 1000, 10000, 100000, 1000000, 10000000, 98 | 100000000, 1000000000, 10000000000ULL, 100000000000ULL, 1000000000000ULL, 10000000000000ULL, 99 | 100000000000000ULL, 1000000000000000ULL, 10000000000000000ULL, 100000000000000000ULL, 100 | 1000000000000000000ULL, 10000000000000000000ULL }; 101 | 102 | void u64toa_tmueller(uint64_t v, char* out) { 103 | if(v < 10) { 104 | *out++ = '0' + v; 105 | *out = 0; 106 | return; 107 | } 108 | int zeros = 64 - __builtin_clzl(v); 109 | int len = (1233 * zeros) >> 12; 110 | uint64_t p10 = POW_10[len]; 111 | if (v >= p10) { 112 | len++; 113 | } 114 | out += len; 115 | *out = 0; 116 | while (v >= 100) { 117 | uint64_t d100 = v / 100; 118 | uint64_t index = v - d100 * 100; 119 | v = d100; 120 | out -= 2; 121 | memcpy(out, DIGITS + index * 2, 2); 122 | } 123 | if (v < 10) { 124 | *--out = '0' + v; 125 | return; 126 | } 127 | out -= 2; 128 | memcpy(out, DIGITS + v * 2, 2); 129 | } 130 | 131 | void i64toa_tmueller( int64_t v, char* out) { 132 | // branchless (from amartin) 133 | *out = '-'; 134 | uint64_t mask = v < 0 ? ~(int64_t) 0 : 0; 135 | uint64_t u = ((2 * (uint64_t)(v)) & ~mask) - v; 136 | out += mask & 1; 137 | u64toa_tmueller(u, out); 138 | } 139 | 140 | REGISTER_TEST(tmueller); 141 | -------------------------------------------------------------------------------- /src/u2985907.h: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | // The integer to string conversion method by StackOverflow user 4 | // https://stackoverflow.com/users/2985907/user2985907 sometimes incorrectly 5 | // attributed to jiaendu: https://stackoverflow.com/a/19944488/471164 6 | // Includes fixes from 7 | // https://gist.github.com/cpei-avalara/8aedf14f5618852be2cff4de267d497c. 
namespace so_u2985907 {

// Converts `value` to decimal ASCII using precomputed digit-string tables.
//
// The digits are written to `str` starting at str[0]; no terminating NUL is
// appended. Returns the number of characters written (1..10). `str` must have
// room for at least that many characters (10 covers every 32-bit value).
inline int ufast_utoa10(unsigned int value, char* str)
{
// JOIN* expand to comma-separated lists of adjacent string literals, which
// the compiler concatenates into fixed-width decimal strings:
//   JOIN(N)  -> N"0" .. N"9"          (10 entries)
//   JOIN2(N) -> N"00" .. N"99"        (100 entries)
//   JOIN3(N) -> N"000" .. N"999"      (1000 entries)
//   JOIN4    -> "0000" .. "9999"      (10000 entries)
//   JOIN6    -> "0" .. "999" without leading zeros (1000 entries)
#define JOIN(N) \
    N "0", N "1", N "2", N "3", N "4", N "5", N "6", N "7", N "8", N "9"

#define JOIN2(N) \
    JOIN(N "0"), JOIN(N "1"), JOIN(N "2"), JOIN(N "3"), JOIN(N "4"), \
    JOIN(N "5"), JOIN(N "6"), JOIN(N "7"), JOIN(N "8"), JOIN(N "9")

#define JOIN3(N) \
    JOIN2(N "0"), JOIN2(N "1"), JOIN2(N "2"), JOIN2(N "3"), JOIN2(N "4"), \
    JOIN2(N "5"), JOIN2(N "6"), JOIN2(N "7"), JOIN2(N "8"), JOIN2(N "9")

#define JOIN4 \
    JOIN3("0"), JOIN3("1"), JOIN3("2"), JOIN3("3"), JOIN3("4"), \
    JOIN3("5"), JOIN3("6"), JOIN3("7"), JOIN3("8"), JOIN3("9")

#define JOIN5(N) \
    JOIN(N), JOIN(N "1"), JOIN(N "2"), JOIN(N "3"), JOIN(N "4"), \
    JOIN(N "5"), JOIN(N "6"), JOIN(N "7"), JOIN(N "8"), JOIN(N "9")

#define JOIN6 \
    JOIN5(""), JOIN2("1"), JOIN2("2"), JOIN2("3"), JOIN2("4"), \
    JOIN2("5"), JOIN2("6"), JOIN2("7"), JOIN2("8"), JOIN2("9")

// F(N) is the number of decimal digits of N for 0 <= N <= 999.
#define F(N) ((N) >= 100 ? 3 : (N) >= 10 ? 2 : 1)

#define F10(N) \
    F(N), F(N + 1), F(N + 2), F(N + 3), F(N + 4), \
    F(N + 5), F(N + 6), F(N + 7), F(N + 8), F(N + 9)

#define F100(N) \
    F10(N), F10(N + 10), F10(N + 20), F10(N + 30), \
    F10(N + 40), F10(N + 50), F10(N + 60), F10(N + 70), \
    F10(N + 80), F10(N + 90)

    // offsets[i] == number of digits of i, for i in [0, 999].
    static const short offsets[] = {
        F100(  0), F100(100), F100(200), F100(300), F100(400),
        F100(500), F100(600), F100(700), F100(800), F100(900)
    };

    static const char table1[][4] = { JOIN ("") };  // "0" .. "9"
    static const char table2[][4] = { JOIN2("") };  // "00" .. "99"
    static const char table3[][4] = { JOIN3("") };  // "000" .. "999"
    static const char table4[][8] = { JOIN4 };      // "0000" .. "9999"
    static const char table5[][4] = { JOIN6 };      // "0" .. "999", unpadded

// Every helper macro is local to this function; undefine all of them so that
// nothing leaks into translation units that include this header.
#undef JOIN
#undef JOIN2
#undef JOIN3
#undef JOIN4
#undef JOIN5
#undef JOIN6
#undef F
#undef F10
#undef F100

    char *wstr;
#if (_WIN64 || __x86_64__ || __ppc64__)
    uint64_t remains[2];
#else
    uint32_t remains[2];
#endif
    unsigned int v2;

    if (value >= 100000000)
    {
        // 9 or 10 digits: split into <1-2 digits><4 digits><4 digits>.
#if (_WIN64 || __x86_64__ || __ppc64__)
        // Division by a constant done as multiply + shift on 64-bit targets.
        remains[0] = (((uint64_t)value * (uint64_t)3518437209) >> 45);  // value / 10^4
        remains[1] = (((uint64_t)value * (uint64_t)2882303762) >> 58);  // value / 10^8
#else
        remains[0] = value / 10000;
        remains[1] = value / 100000000;
#endif
        v2 = remains[1];                                  // leading 1-2 digits
        remains[1] = remains[0] - remains[1] * 10000;     // middle 4 digits
        remains[0] = value - remains[0] * 10000;          // last 4 digits
        if (v2 >= 10)
        {
            memcpy(str,table5[v2],2);
            str += 2;
            memcpy(str,table4[remains[1]],4);
            str += 4;
            memcpy(str,table4[remains[0]],4);
            return 10;
        }
        else
        {
            *str = static_cast<char>(v2 + '0');
            str += 1;
            memcpy(str,table4[remains[1]],4);
            str += 4;
            memcpy(str,table4[remains[0]],4);
            return 9;
        }
    }
    else if (value >= 10000)
    {
        // 5 to 8 digits: split into <1-4 digits><4 digits>.
#if (_WIN64 || __x86_64__ || __ppc64__)
        v2 = (((uint64_t)value * (uint64_t)3518437209 ) >> 45);  // value / 10^4
#else
        v2 = value / 10000;
#endif
        remains[0] = value - v2 * 10000;
        if (v2 >= 1000)
        {
            memcpy(str,table4[v2],4);
            str += 4;
            memcpy(str,table4[remains[0]],4);
            return 8;
        }
        else
        {
            // Copy the whole 4-byte table entry (digits plus NUL padding),
            // then advance only by the real digit count so the low four
            // digits overwrite the padding.
            wstr = str;
            memcpy(wstr,table5[v2],4);
            wstr += offsets[v2];
            memcpy(wstr,table4[remains[0]],4);
            wstr += 4;
            return static_cast<int>(wstr - str);
        }
    }
    else
    {
        // 1 to 4 digits: a single table lookup.
        if (value >= 1000)
        {
            memcpy(str,table4[value],4);
            return 4;
        }
        else if (value >= 100)
        {
            memcpy(str,table3[value],3);
            return 3;
        }
        else if (value >= 10)
        {
            memcpy(str,table2[value],2);
            return 2;
        }
        else
        {
            *str = table1[value][0];
            return 1;
        }
    }
}

// Signed counterpart of ufast_utoa10: writes an optional '-' followed by the
// digits of |value| to `str` (no terminating NUL) and returns the number of
// characters written (1..11).
//
// Declared inline because it is defined in a header; without it, including
// the header from two translation units produces duplicate definitions.
inline int ufast_itoa10(int value, char* str) {
  if (value < 0) {
    *(str++) = '-';
    // Negate in unsigned arithmetic: `-value` is signed overflow (undefined
    // behavior) for INT_MIN, while 0u - (unsigned)value is well defined and
    // yields the magnitude for every negative input.
    return ufast_utoa10(0u - static_cast<unsigned int>(value), str) + 1;
  }
  return ufast_utoa10(static_cast<unsigned int>(value), str);
}
} // namespace so_u2985907
32 | #define TINYFORMAT_ERROR(reason) \ 33 | throw std::runtime_error(reason); 34 | 35 | #include "tinyformat.h" 36 | #include 37 | 38 | #if 0 39 | // Compare result of tfm::format() to C's sprintf(). 40 | template 41 | void compareSprintf(const Args&... args) 42 | { 43 | std::string tfmResult = tfm::format(args...); 44 | char sprintfResult[200]; 45 | sprintf(sprintfResult, args...); 46 | if(tfmResult != sprintfResult) 47 | { 48 | std::cout << tfmResult << std::endl; 49 | std::cout << sprintfResult << std::endl; 50 | assert(0 && "results didn't match, see above."); 51 | } 52 | } 53 | #endif 54 | 55 | #define EXPECT_ERROR(expression) \ 56 | { \ 57 | try { expression; assert(0 && "expected exception in " \ 58 | #expression); } \ 59 | catch(std::runtime_error&) {} \ 60 | } 61 | 62 | #define CHECK_EQUAL(a, b) \ 63 | if(!((a) == (b))) \ 64 | { \ 65 | std::cout << "test failed, line " << __LINE__ << "\n"; \ 66 | std::cout << (a) << " != " << (b) << "\n"; \ 67 | std::cout << "[" #a ", " #b "]\n"; \ 68 | ++nfailed; \ 69 | } 70 | 71 | #ifdef FMT_PROFILE 72 | # include 73 | // Make sure the profiler library is linked in. 74 | static int profiling_enabled = ProfilingIsEnabledForAllThreads(); 75 | #endif 76 | 77 | #ifdef SPEED_TEST 78 | void speedTest(const std::string& which) 79 | { 80 | // Following is required so that we're not limited by per-character 81 | // buffering. 82 | std::ios_base::sync_with_stdio(false); 83 | const long maxIter = 2000000L; 84 | if(which == "printf") 85 | { 86 | // libc version 87 | for(long i = 0; i < maxIter; ++i) 88 | printf("%0.10f:%04d:%+g:%s:%p:%c:%%\n", 89 | 1.234, 42, 3.13, "str", (void*)1000, (int)'X'); 90 | } 91 | else if(which == "iostreams") 92 | { 93 | // Std iostreams version. What a mess!! 
94 | for(long i = 0; i < maxIter; ++i) 95 | std::cout << std::setprecision(10) << std::fixed << 1.234 << ':' 96 | << std::resetiosflags(std::ios::floatfield) 97 | << std::setw(4) << std::setfill('0') << 42 << std::setfill(' ') << ':' 98 | << std::setiosflags(std::ios::showpos) << 3.13 << std::resetiosflags(std::ios::showpos) << ':' 99 | << "str" << ':' 100 | << (void*)1000 << ':' 101 | << 'X' << ":%\n"; 102 | } 103 | else if(which == "tinyformat") 104 | { 105 | // tinyformat version. 106 | for(long i = 0; i < maxIter; ++i) 107 | tfm::printf("%0.10f:%04d:%+g:%s:%p:%c:%%\n", 108 | 1.234, 42, 3.13, "str", (void*)1000, (int)'X'); 109 | } 110 | #ifdef HAVE_FORMAT 111 | else if(which == "format") 112 | { 113 | // fmt version. 114 | for(long i = 0; i < maxIter; ++i) 115 | fmt::print("{:.10f}:{:04}:{:+}:{}:{}:{}:%\n", 116 | 1.234, 42, 3.13, "str", (void*)1000, 'X'); 117 | } 118 | else if(which == "fmt::compile") 119 | { 120 | // fmt version (compiled). 121 | for(long i = 0; i < maxIter; ++i) 122 | { 123 | char buf[100]; 124 | //fmt::memory_buffer buf; 125 | auto finished_at = fmt::format_to( 126 | buf, FMT_COMPILE("{:.10f}:{:04}:{:+}:{}:{}:{}:%\n"), 127 | 1.234, 42, 3.13, "str", (void*)1000, 'X'); 128 | *finished_at = '\0'; 129 | std::puts(buf); 130 | } 131 | } 132 | #endif 133 | else if(which == "folly") 134 | { 135 | #ifdef HAVE_FOLLY 136 | // folly::format version 137 | for(long i = 0; i < maxIter; ++i) 138 | std::cout << folly::format("{:.10f}:{:04}:{:+}:{}:{}:{}:%\n", 139 | 1.234, 42, 3.13, "str", (void*)1000, 'X'); 140 | #else 141 | fprintf(stderr, "folly is not available\n"); 142 | #endif 143 | } 144 | else if(which == "boost") 145 | { 146 | #ifdef HAVE_BOOST 147 | // boost::format version 148 | for(long i = 0; i < maxIter; ++i) 149 | std::cout << boost::format("%0.10f:%04d:%+g:%s:%p:%c:%%\n") 150 | % 1.234 % 42 % 3.13 % "str" % (void*)1000 % (int)'X'; 151 | #else 152 | fprintf(stderr, "boost is not available\n"); 153 | #endif 154 | } 155 | else if(which == 
"stb_sprintf") 156 | { 157 | char buf[100]; 158 | // stb_sprintf version 159 | for(long i = 0; i < maxIter; ++i) { 160 | stbsp_sprintf(buf, "%0.10f:%04d:%+g:%s:%p:%c:%%\n", 161 | 1.234, 42, 3.13, "str", (void*)1000, (int)'X'); 162 | fputs(buf, stdout); 163 | } 164 | } 165 | else 166 | { 167 | assert(0 && "speed test for which version?"); 168 | } 169 | } 170 | #endif 171 | 172 | 173 | int main(int argc, char* argv[]) 174 | { 175 | #ifdef SPEED_TEST 176 | if(argc >= 2) 177 | speedTest(argv[1]); 178 | return 0; 179 | #else 180 | return unitTests(); 181 | #endif 182 | } 183 | -------------------------------------------------------------------------------- /src/itoa-benchmark/readme.md: -------------------------------------------------------------------------------- 1 | # itoa Benchmark 2 | 3 | Copyright(c) 2014-2016 Milo Yip (miloyip@gmail.com) 4 | 5 | ## Introduction 6 | 7 | This benchmark evaluates the performance of conversion from 32-bit/64-bit integer to ASCII string in decimal. The function prototypes are: 8 | 9 | ~~~~~~~~cpp 10 | void u32toa(uint32_t value, char* buffer); 11 | void i32toa(int32_t value, char* buffer); 12 | void u64toa(uint64_t value, char* buffer); 13 | void i64toa(int64_t value, char* buffer); 14 | ~~~~~~~~ 15 | 16 | Note that `itoa()` is *not* a standard function in C and C++, but provided by some compilers. 17 | 18 | ## Procedure 19 | 20 | Firstly the program verifies the correctness of implementations. 21 | 22 | Then, two cases for benchmark are carried out: 23 | 24 | 1. **Sequential**: Converts consecutive values in same number of decimal digits. 25 | 26 | For `u32toa()`, the tested values are { 1, 2, ..., 9 }, {10, 11, ..., 99 }, ... { 4000000000, 4000000001, ..., 4294967296}, i.e., groups of 1 to 10 decimal digits. 27 | 28 | For signed versions, use alternate signs, e.g. { 1, -2, 3, -4, ... 9 }. 29 | 30 | For 64-bit integer, there are groups of 1 to 20 decimal digits. 31 | 32 | 2. 
**Random**: Converts the shuffled sequence of the first case. 33 | 34 | Each digit group is run 100000 times. The minimum time duration over 10 trials is reported. 35 | 36 | ## Build and Run 37 | 38 | 1. Obtain [premake5](http://industriousone.com/premake/download). 39 | 2. make 40 | 41 | ## Results 42 | 43 | The following are `sequential` results measured on a PC (Core i7 920 @2.67 GHz), where `u32toa()` is compiled by Visual C++ 2013 and run on Windows 64-bit. The speedup is based on `sprintf()`. 44 | 45 | |Function |Time (ns)|Speedup| 46 | |---------|--------:|------:| 47 | |sprintf | 194.225| 1.00x| 48 | |vc | 61.522| 3.16x| 49 | |naive | 26.743| 7.26x| 50 | |count | 20.552| 9.45x| 51 | |lut | 17.810| 10.91x| 52 | |countlut | 9.926| 19.57x| 53 | |branchlut| 8.430| 23.04x| 54 | |sse2 | 7.614| 25.51x| 55 | |null | 2.230| 87.09x| 56 | 57 | ![corei7920@2.67_win64_vc2013_u32toa_sequential_time](result/corei7920@2.67_win64_vc2013_u32toa_sequential_time.png) 58 | 59 | ![corei7920@2.67_win64_vc2013_u32toa_sequential_timedigit](result/corei7920@2.67_win64_vc2013_u32toa_sequential_timedigit.png) 60 | 61 | Note that the `null` implementation does nothing. It measures the overhead of looping and function calls. 62 | 63 | Since the C++ standard library implementations (`ostringstream`, `ostrstream`, `to_string`) are slow, they are turned off by default. Users can re-enable them by defining the `RUN_CPPITOA` macro. 64 | 65 | Some results of various configurations are located at `itoa-benchmark/result`. 
They can be accessed online, with interactivity provided by [Google Charts](https://developers.google.com/chart/): 66 | 67 | * [corei7920@2.67_win32_vc2013](http://rawgit.com/miloyip/itoa-benchmark/master/result/corei7920@2.67_win32_vc2013.html) 68 | * [corei7920@2.67_win64_vc2013](http://rawgit.com/miloyip/itoa-benchmark/master/result/corei7920@2.67_win64_vc2013.html) 69 | * [corei7920@2.67_cygwin32_gcc4.8](http://rawgit.com/miloyip/itoa-benchmark/master/result/corei7920@2.67_cygwin32_gcc4.8.html) 70 | * [corei7920@2.67_cygwin64_gcc4.8](http://rawgit.com/miloyip/itoa-benchmark/master/result/corei7920@2.67_cygwin64_gcc4.8.html) 71 | 72 | ## Implementations 73 | 74 | Function | Description 75 | --------------|----------- 76 | ostringstream | `std::ostringstream` in C++ standard library. 77 | ostrstream | `std::ostrstream` in C++ standard library. 78 | to_string | `std::to_string()` in C++11 standard library. 79 | sprintf | `sprintf()` in C standard library 80 | vc | Visual C++'s `_itoa()`, `_i64toa()`, `_ui64toa()` 81 | naive | Compute division/modulo of 10 for each digit, store digits in temp array and copy to buffer in reverse order. 82 | unnamed | Compute division/modulo of 10 for each digit, store directly in buffer 83 | count | Count number of decimal digits first, using technique from [1]. 84 | lut | Uses lookup table (LUT) of digit pairs for division/modulo of 100. Mentioned in [2] 85 | countlut | Combines count and lut. 86 | branchlut | Use branching to divide-and-conquer the range of value, make computation more parallel. 87 | sse2 | Based on branchlut scheme, use SSE2 SIMD instructions to convert 8 digits in parallel. The algorithm is designed by Wojciech Muła [3]. (Experiment shows it is useful for values equal to or more than 9 digits) 88 | null | Do nothing. 89 | 90 | ## FAQ 91 | 92 | 1. How to add an implementation? 93 | 94 | You may clone an existing implementation file (e.g. `naive.cpp`). And then modify it. 
Re-run `premake` to add it to the project or makefile. Note that it is automatically registered with the benchmark by the macro `REGISTER_TEST(name)`. 95 | 96 | Pull requests for new implementations are welcome. 97 | 98 | 2. Why not convert integers to `std::string`? 99 | 100 | It may introduce heap allocation, which is a big overhead. Users can easily wrap these low-level functions to return `std::string`, if needed. 101 | 102 | 3. Why are fast `itoa()` functions needed? 103 | 104 | Integer-to-string conversion is a very common operation when writing data in text format. The standard approaches, `sprintf()`, `std::stringstream` and `std::to_string(int)` (C++11), often perform poorly. The author of this benchmark wanted to optimize the "naive" implementation in [RapidJSON](https://github.com/miloyip/rapidjson/issues/31), and therefore created this project. 105 | 106 | ## References 107 | 108 | [1] Anderson, [Bit Twiddling Hacks](https://graphics.stanford.edu/~seander/bithacks.html#IntegerLog10), 1997. 109 | 110 | [2] Alexandrescu, [Three Optimization Tips for C++](http://www.slideshare.net/andreialexandrescu1/three-optimization-tips-for-c-15708507), 2012. 111 | 112 | [3] Muła, [SSE: conversion integers to decimal representation](http://wm.ite.pl/articles/sse-itoa.html), 2011. 113 | 114 | ## Related Benchmarks and Discussions 115 | 116 | * [The String Formatters of Manor Farm](http://www.gotw.ca/publications/mill19.htm) by Herb Sutter, 2001. 117 | * [C++ itoa benchmark](https://github.com/localvoid/cxx-benchmark-itoa) by [localvoid](https://github.com/localvoid) 118 | * [Stackoverflow: C++ performance challenge: integer to std::string conversion](http://stackoverflow.com/questions/4351371/c-performance-challenge-integer-to-stdstring-conversion) 119 | -------------------------------------------------------------------------------- /src/itostr.cc: -------------------------------------------------------------------------------- 1 | /* Copyright Richard Benjamin Voigt */ 2 | /* All rights reserved. 
// itostr_impl specialization whose cvt() handles values of at most three
// decimal digits (thresholds 100 and 200 below), producing the decimal text
// without division.
// NOTE(review): the template parameter/argument lists are not visible in this
// view of the file; `T` and `is_signed` are presumably template parameters --
// confirm against the original source.
template struct itostr_impl {
  // Returns the decimal representation of `val` as a std::string.
  static std::string cvt(T val) {
    // Scratch string of 5 characters; shrunk to the real length at the end.
    std::string retval(5, '\0');
    int i = 0;    // next write position in retval
    char ch = 0;  // digit character being assembled; 0 means "nothing yet"

    // Hundreds digit, found by comparison and subtraction.
    if (is_signed) {
      if (val < 0) {
        retval[i] = '-';
        ++i;
        // Remove the hundreds while still negative, before negating.
        if (val <= -100) {
          ch = '1';
          val += 100;
        }
        val = -val;
      } else if (val >= 100) {
        ch |= '1';
        val -= 100;
      }
    } else {
      if (val >= 200) {
        ch |= '2';
        val -= 200;
      } else if (val >= 100) {
        ch |= '1';
        val -= 100;
      }
    }
    if (ch) {
      retval[i] = ch;
      ++i;
      // A hundreds digit was emitted, so the tens digit must be written even
      // when it is zero: seed it with '0' (non-zero as a char).
      ch = '0';
    }

    // Tens digit, assembled from binary weights 8/4/2/1 OR-ed into the
    // character: OR-ing digit characters combines their low bits while the
    // common '0' prefix bits stay the same (e.g. '8' | '1' == '9').
    if (val >= 80) {
      ch |= '8';
      val -= 80;
    } else if (val >= 40) {
      ch |= '4';
      val -= 40;
    }
    if (val >= 20) {
      ch |= '2';
      val -= 20;
    }
    if (val >= 10) {
      ch |= '1';
      val -= 10;
    }
    // ch is still 0 only when there is neither a hundreds nor a tens digit.
    if (ch) {
      retval[i] = ch;
      ++i;
    }

    // Units digit is always written.
    retval[i] = '0' + val;
    retval.resize(i + 1);

    return retval;
  }
};
// itostr_impl specialization whose cvt() handles values of up to ten decimal
// digits (thresholds up to 4000000000U below). Digits are produced two at a
// time through digit_pair_table.
// NOTE(review): the template parameter/argument lists are not visible in this
// view of the file; `T` and `is_signed` are presumably template parameters --
// confirm against the original source.
template struct itostr_impl {
  // Returns the decimal representation of `val` as a std::string.
  static std::string cvt(T val) {
    // buf[0] is kept free for a possible '-'; digits start at buf[1].
    char buf[11], ch = 0;
    char* start = buf + 1;
    char* p = start;
    bool neg = val < 0;
    int digit;

    // Leading (tenth) digit, found by comparison and subtraction. For
    // negative input the 2e9 part is removed before negating.
    if (is_signed) {
      if (neg) {
        if (val <= -2000000000) {
          ch = '2';
          val += 2000000000;
        }
        val = -val;
      } else if (val >= 2000000000) {
        ch = '2';
        val -= 2000000000;
      }
    } else {
      if (val >= 4000000000U) {
        ch |= '4';
        val -= 4000000000U;
      } else if (val >= 2000000000) {
        ch |= '2';
        val -= 2000000000;
      }
    }
    // OR-ing in '1' adds one to the digit chosen above ('2' | '1' == '3').
    if (val >= 1000000000) {
      ch |= '1';
      val -= 1000000000;
    }

    if (ch) {
      // Ten-digit value: emit the leading digit, then fall through all four
      // digit pairs below.
      *p = ch;
      ++p;
      ch = '0';
    } else if (val < 1000) {
      // Short values jump straight to the first pair (or single digit) they
      // need; the labels d10, d1000, ... are generated by DO_PAIR below.
      if (val < 10) goto d1;
      if (val < 1000) goto d10;
    } else {
      if (val < 100000) goto d1000;
      if (val < 10000000) goto d100000;
    }

// Defines label d<n>, then writes the two-digit group val / n from
// digit_pair_table and removes it from val.
#define DO_PAIR(n)                          \
  d##n : digit = val / n;                   \
  *(p++) = digit_pair_table[digit * 2];     \
  *(p++) = digit_pair_table[digit * 2 + 1]; \
  val -= n * digit;

    DO_PAIR(10000000);
    DO_PAIR(100000);
    DO_PAIR(1000);
    DO_PAIR(10);

  d1:
    // Final units digit.
    *p = '0' | val;

    // The first pair written may have a leading zero (e.g. "05"); skip it.
    if (p > start && *start == '0') ++start;

    // Uses the slot reserved at buf[0] when nothing was skipped.
    if (is_signed && neg) *--start = '-';

    return std::string(start, p + 1 - start);
  }
};
// Writes the decimal representation of `value` to `buffer` as a
// NUL-terminated string.
//
// Each digit is computed independently with one division and one modulo and
// stored directly at its final position. `buffer` must have room for 12
// characters (sign, 10 digits, terminator).
void i32toa_unnamed(int32_t value, char* buffer) {
    // The most negative value has no positive counterpart, so `-value` below
    // would overflow; emit its text directly (12 bytes including the NUL).
    if (value == std::numeric_limits<int32_t>::min()) {
        memcpy(buffer, "-2147483648\0", 12);
        return;
    }

    if (value < 0) {
        *buffer++ = '-';
        value = -value;
    }

    // One guarded store per decimal position, most significant first; a
    // position is skipped while the value is smaller than its weight, so no
    // leading zeros are produced.
    if (1000000000L <= value) { *buffer++ = (char)((value / 1000000000L) % 10 + '0'); }
    if ( 100000000L <= value) { *buffer++ = (char)((value /  100000000L) % 10 + '0'); }
    if (  10000000L <= value) { *buffer++ = (char)((value /   10000000L) % 10 + '0'); }
    if (   1000000L <= value) { *buffer++ = (char)((value /    1000000L) % 10 + '0'); }
    if (    100000L <= value) { *buffer++ = (char)((value /     100000L) % 10 + '0'); }
    if (     10000L <= value) { *buffer++ = (char)((value /      10000L) % 10 + '0'); }
    if (      1000L <= value) { *buffer++ = (char)((value /       1000L) % 10 + '0'); }
    if (       100L <= value) { *buffer++ = (char)((value /        100L) % 10 + '0'); }
    if (        10L <= value) { *buffer++ = (char)((value /         10L) % 10 + '0'); }

    // The units digit is always written, which also covers value == 0.
    *buffer++ = (char)(value % 10 + '0');
    *buffer = '\0';
}
100000000000000ULL) % 10 + '0'); } 57 | if ( 10000000000000ULL <= value) { *buffer++ = (char)((value / 10000000000000ULL) % 10 + '0'); } 58 | if ( 1000000000000ULL <= value) { *buffer++ = (char)((value / 1000000000000ULL) % 10 + '0'); } 59 | if ( 100000000000ULL <= value) { *buffer++ = (char)((value / 100000000000ULL) % 10 + '0'); } 60 | if ( 10000000000ULL <= value) { *buffer++ = (char)((value / 10000000000ULL) % 10 + '0'); } 61 | if ( 1000000000ULL <= value) { *buffer++ = (char)((value / 1000000000ULL) % 10 + '0'); } 62 | if ( 100000000ULL <= value) { *buffer++ = (char)((value / 100000000ULL) % 10 + '0'); } 63 | if ( 10000000ULL <= value) { *buffer++ = (char)((value / 10000000ULL) % 10 + '0'); } 64 | if ( 1000000ULL <= value) { *buffer++ = (char)((value / 1000000ULL) % 10 + '0'); } 65 | if ( 100000ULL <= value) { *buffer++ = (char)((value / 100000ULL) % 10 + '0'); } 66 | if ( 10000ULL <= value) { *buffer++ = (char)((value / 10000ULL) % 10 + '0'); } 67 | if ( 1000ULL <= value) { *buffer++ = (char)((value / 1000ULL) % 10 + '0'); } 68 | if ( 100ULL <= value) { *buffer++ = (char)((value / 100ULL) % 10 + '0'); } 69 | if ( 10ULL <= value) { *buffer++ = (char)((value / 10ULL) % 10 + '0'); } 70 | 71 | *buffer++ = (char)(value % 10 + '0'); 72 | *buffer = '\0'; 73 | } 74 | 75 | void i64toa_unnamed(int64_t value, char* buffer) { 76 | if (value == std::numeric_limits::min()) { 77 | memcpy(buffer, "-9223372036854775808\0", 21); 78 | return; 79 | } 80 | 81 | if (value >= 0 && (value >> 32) == 0) { 82 | u32toa_unnamed(static_cast(value), buffer); 83 | return; 84 | } 85 | 86 | if (value < 0) { 87 | *buffer++ = '-'; 88 | value = -value; 89 | 90 | if ((value >> 32) == 0) { 91 | u32toa_unnamed(static_cast(value), buffer); 92 | return; 93 | } 94 | } 95 | 96 | if (1000000000000000000LL <= value) { *buffer++ = (char)((value / 1000000000000000000LL) % 10 + '0'); } 97 | if ( 100000000000000000LL <= value) { *buffer++ = (char)((value / 100000000000000000LL) % 10 + '0'); } 98 | if ( 
10000000000000000LL <= value) { *buffer++ = (char)((value / 10000000000000000LL) % 10 + '0'); } 99 | if ( 1000000000000000LL <= value) { *buffer++ = (char)((value / 1000000000000000LL) % 10 + '0'); } 100 | if ( 100000000000000LL <= value) { *buffer++ = (char)((value / 100000000000000LL) % 10 + '0'); } 101 | if ( 10000000000000LL <= value) { *buffer++ = (char)((value / 10000000000000LL) % 10 + '0'); } 102 | if ( 1000000000000LL <= value) { *buffer++ = (char)((value / 1000000000000LL) % 10 + '0'); } 103 | if ( 100000000000LL <= value) { *buffer++ = (char)((value / 100000000000LL) % 10 + '0'); } 104 | if ( 10000000000LL <= value) { *buffer++ = (char)((value / 10000000000LL) % 10 + '0'); } 105 | if ( 1000000000LL <= value) { *buffer++ = (char)((value / 1000000000LL) % 10 + '0'); } 106 | if ( 100000000LL <= value) { *buffer++ = (char)((value / 100000000LL) % 10 + '0'); } 107 | if ( 10000000LL <= value) { *buffer++ = (char)((value / 10000000LL) % 10 + '0'); } 108 | if ( 1000000LL <= value) { *buffer++ = (char)((value / 1000000LL) % 10 + '0'); } 109 | if ( 100000LL <= value) { *buffer++ = (char)((value / 100000LL) % 10 + '0'); } 110 | if ( 10000LL <= value) { *buffer++ = (char)((value / 10000LL) % 10 + '0'); } 111 | if ( 1000LL <= value) { *buffer++ = (char)((value / 1000LL) % 10 + '0'); } 112 | if ( 100LL <= value) { *buffer++ = (char)((value / 100LL) % 10 + '0'); } 113 | if ( 10LL <= value) { *buffer++ = (char)((value / 10LL) % 10 + '0'); } 114 | 115 | *buffer++ = (char)(value % 10 + '0'); 116 | *buffer = '\0'; 117 | } 118 | 119 | REGISTER_TEST(unnamed); 120 | -------------------------------------------------------------------------------- /src/itoa-benchmark/itoa_ljust_impl.h: -------------------------------------------------------------------------------- 1 | #ifndef ITOA_LJUST_IMPL_H 2 | #define ITOA_LJUST_IMPL_H 3 | 4 | /*===----------------------------------------------------------------------===* 5 | * itoa_ljust_impl.h - Fast integer to ascii decimal conversion 6 
| * 7 | * This file is meant to be included in only one .cpp file/compilation unit. 8 | * Uses recursive function templates, compile with -O3 for best performance. 9 | * 10 | * The implementation strives to perform well with random input values 11 | * where CPU branch prediction becomes ineffective: 12 | * 13 | * 1 ) reduce the number of conditional branches used to determine 14 | * the number of digits and use uninterrupted sequence of 15 | * instructions to generate multiple digits, this was inspired by 16 | * the implementation of FastUInt32ToBufferLeft in 17 | * https://github.com/google/protobuf/blob/master/ 18 | * src/google/protobuf/stubs/strutil.cc 19 | * 20 | * 2 ) avoid branches altogether by allowing overwriting of characters 21 | * in the output buffer when the difference is only one character 22 | * a) minus sign 23 | * b) alignment to even # digits 24 | * 25 | * 3 ) use hints to the compiler to indicate which conditional branches 26 | * are likely to be taken so the compiler arranges the likely 27 | * case to be the fallthrough, branch not taken 28 | * 29 | * Other Performance considerations 30 | * 31 | * 4 ) use a lookup table to convert binary numbers 0..99 into 2 characters 32 | * This technique is used by all fast implementations. 
33 | * See Andrei Alexandrescu's engineering notes 34 | * https://www.facebook.com/notes/facebook-engineering/ 35 | * three-optimization-tips-for-c/10151361643253920/ 36 | * 37 | * 5 ) use memcpy to store 2 digits at a time - most compilers treat 38 | * memcpy as a builtin/intrinsic, this lets the compiler 39 | * generate a 2-byte store instruction in platforms that support 40 | * unaligned access 41 | * 42 | * 6 ) use explicit multiplicative inverse to perform integer division 43 | * See Terje Mathisen's algoritm in Agner Fog's 44 | * http://www.agner.org/optimize/optimizing_assembly.pdf 45 | * By exploiting knowledge of the restricted domain of the dividend 46 | * the multiplicative inverse factor is smaller and can fit in the 47 | * immediate operand of x86 multiply instructions, resulting in 48 | * fewer instructions 49 | * 50 | * 7 ) inline the recursive call to FastUInt64ToBufferLeft in the original 51 | * Google Protocol Buffers, as in itoa-benchmark/src/unrolledlut.cpp 52 | * by nyronium@genthree.io 53 | * 54 | * Correctness considerations 55 | * 56 | * 8 ) Avoids unary minus of signed types - undefined behavior if value 57 | * is INT_MIN in platforms using two's complement representation 58 | * 59 | *===----------------------------------------------------------------------===* 60 | * 61 | * The MIT License (MIT) 62 | * 63 | * Copyright (c) 2016-2017 Arturo Martin-de-Nicolas 64 | * arturomdn@gmail.com 65 | * https://github.com/amdn/itoa_ljust/ 66 | * 67 | * Permission is hereby granted, free of charge, to any person obtaining a copy 68 | * of this software and associated documentation files (the "Software"), to deal 69 | * in the Software without restriction, including without limitation the rights 70 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 71 | * copies of the Software, and to permit persons to whom the Software is 72 | * furnished to do so, subject to the following conditions: 73 | * 74 | * The above copyright notice 
and this permission notice shall be included 75 | * in all copies or substantial portions of the Software. 76 | * 77 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 78 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 79 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 80 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 81 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 82 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 83 | * SOFTWARE. 84 | *===----------------------------------------------------------------------===*/ 85 | 86 | #include 87 | #include 88 | #include 89 | #include // memcpy 90 | #include 91 | 92 | #include "itoa_ljust.h" 93 | 94 | #if defined(__GNUC__) || defined(__clang__) 95 | #define likely(expr) __builtin_expect(static_cast(expr), 1) /* Note 3 */ 96 | #define unlikely(expr) __builtin_expect(static_cast(expr), 0) 97 | #else 98 | #define likely(expr) (expr) 99 | #define unlikely(expr) (expr) 100 | #endif 101 | 102 | namespace { 103 | 104 | using u32 = uint32_t; 105 | using u64 = uint64_t; 106 | 107 | struct Digits { /* Note 4 */ 108 | std::array A; 109 | Digits() { 110 | int i,d,n=0; 111 | std::generate(A.begin(), A.end(), [&] () { 112 | return i=n++,d=i/2, '0' + (i%2 ? d%10 : d/10); 113 | }); 114 | } 115 | char const* operator()(u32 d) const { return &A[2*d]; } 116 | } const digits; 117 | 118 | constexpr u32 p10(u32 e) { return e ? 
10*p10(e-1) : 1; } 119 | 120 | template u32 q10(u32 u); /* Note 6 */ 121 | template<> u32 q10<0>(u32 u) { return u; } 122 | template<> u32 q10<2>(u32 u) { return ((u64)u * 5243U) >> 19; } // u < 10^4 123 | template<> u32 q10<4>(u32 u) { return ((1+(u64)u) * 858993U) >> 33; } // u < 10^6 124 | template<> u32 q10<6>(u32 u) { return ((1+(u64)u)* 8796093U) >> 43; } // u < 10^8 125 | template<> u32 q10<8>(u32 u) { return ((u64)u * 1441151881U) >> 57; } // u < 2^32 126 | template u64 q10(u64 u) { return u / p10(E); } 127 | 128 | template struct QR { 129 | T const q; 130 | T const r; 131 | explicit QR( U u ) : q(q10(u)), r(u - q * p10(E)) {} 132 | }; 133 | 134 | template 135 | typename std::enable_if::type cvt(char* out, u32 u) { 136 | QR d{u}; 137 | memcpy(out, digits(d.q), 2); /* Note 5 */ 138 | return cvt(out+2, d.r); 139 | } 140 | template<> char* cvt<0>(char* out, u32) { *out = '\0'; return out; } 141 | template 142 | typename std::enable_if::type cvt(char* out, u32 u) { 143 | QR d{u}; 144 | char const* src = digits(d.q); 145 | *out = *src++; 146 | out += d.q > 9; /* Note 2b */ 147 | *out++ = *src; 148 | return cvt(out, d.r); 149 | } 150 | 151 | char* to_dec(char* out, u32 u) { /* Note 1 */ 152 | if (unlikely(u >= p10(8))) return cvt<9>(out, u); 153 | else if (likely(u < p10(2))) return cvt<1>(out, u); 154 | else if (likely(u < p10(4))) return cvt<3>(out, u); 155 | else if (likely(u < p10(6))) return cvt<5>(out, u); 156 | else return cvt<7>(out, u); 157 | } 158 | 159 | char* to_dec(char* out, u64 u) { /* Note 7 */ 160 | u32 low = static_cast(u); 161 | if (likely(low == u)) return to_dec(out, low); 162 | QR mid{u}; 163 | u32 mid32 = static_cast(mid.q); 164 | if (likely(mid32 == mid.q)) { 165 | out = to_dec(out, mid32); 166 | return cvt<8>(out, static_cast(mid.r)); 167 | } else { 168 | QR d{mid.q}; 169 | out = d.q < p10(2) ? 
cvt<1>(out, d.q) : cvt<3>(out, d.q); 170 | out = cvt<8>(out, d.r); 171 | return cvt<8>(out, static_cast(mid.r)); 172 | } 173 | } 174 | 175 | template::value, 177 | typename std::make_unsigned::type>::type> 178 | char* to_dec(char* out, T v) { 179 | U mask = v < 0 ? ~(U)(0) : 0; /* Note 2a */ 180 | U u = ((2 * (U)(v)) & ~mask) - (U)(v); /* Note 8 */ 181 | *out = '-'; 182 | return to_dec(out + (mask&1), u); 183 | } 184 | } // anonymous namespace 185 | 186 | #endif // ITOA_LJUST_IMPL.H 187 | -------------------------------------------------------------------------------- /src/digits10/digits10.h: -------------------------------------------------------------------------------- 1 | #ifndef DIGITS10_H_ 2 | #define DIGITS10_H_ 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | #include "benchmark/benchmark.h" 11 | 12 | #define FMT_POWERS_OF_10(factor) \ 13 | factor * 10, (factor)*100, (factor)*1000, (factor)*10000, (factor)*100000, \ 14 | (factor)*1000000, (factor)*10000000, (factor)*100000000, \ 15 | (factor)*1000000000 16 | 17 | // It is a separate function rather than a part of count_digits to workaround 18 | // the lack of static constexpr in constexpr functions. 19 | inline auto digits10_fmt64(uint64_t n) -> int { 20 | // https://github.com/fmtlib/format-benchmark/blob/master/digits10 21 | // Maps bsr(n) to ceil(log10(pow(2, bsr(n) + 1) - 1)). 
22 | static constexpr uint16_t bsr2log10[] = { 23 | 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 24 | 6, 6, 6, 7, 7, 7, 7, 8, 8, 8, 9, 9, 9, 10, 10, 10, 25 | 10, 11, 11, 11, 12, 12, 12, 13, 13, 13, 13, 14, 14, 14, 15, 15, 26 | 15, 16, 16, 16, 16, 17, 17, 17, 18, 18, 18, 19, 19, 19, 19, 20}; 27 | auto t = bsr2log10[__builtin_clzll(n | 1) ^ 63]; 28 | static constexpr const uint64_t zero_or_powers_of_10[] = { 29 | 0, 0, FMT_POWERS_OF_10(1U), FMT_POWERS_OF_10(1000000000ULL), 30 | 10000000000000000000ULL}; 31 | return t - (n < zero_or_powers_of_10[t]); 32 | } 33 | 34 | constexpr int floor_log10_pow2(int e) noexcept { return (e * 1262611) >> 22; } 35 | 36 | constexpr int ceil_log10_pow2(int e) noexcept { 37 | return e == 0 ? 0 : floor_log10_pow2(e) + 1; 38 | } 39 | 40 | struct digit_count_table_holder_t { 41 | std::uint64_t entry[64]; 42 | }; 43 | 44 | constexpr digit_count_table_holder_t generate_digit_count_table() { 45 | digit_count_table_holder_t table{{}}; 46 | constexpr std::uint64_t pow10[] = {1ull, 47 | 10ull, 48 | 100ull, 49 | 1000ull, 50 | 1'0000ull, 51 | 10'0000ull, 52 | 100'0000ull, 53 | 1000'0000ull, 54 | 1'0000'0000ull, 55 | 10'0000'0000ull, 56 | 100'0000'0000ull, 57 | 1000'0000'0000ull, 58 | 1'0000'0000'0000ull, 59 | 10'0000'0000'0000ull, 60 | 100'0000'0000'0000ull, 61 | 1000'0000'0000'0000ull, 62 | 1'0000'0000'0000'0000ull, 63 | 10'0000'0000'0000'0000ull, 64 | 100'0000'0000'0000'0000ull, 65 | 1000'0000'0000'0000'0000ull}; 66 | 67 | for (int i = 0; i < 64; ++i) { 68 | auto const ub = std::uint64_t(ceil_log10_pow2(i)); 69 | assert(ub <= 19); 70 | table.entry[i] = ((ub + 1) << 52) - (pow10[ub] >> (i / 4)); 71 | } 72 | 73 | return table; 74 | } 75 | 76 | constexpr inline auto digit_count_table = generate_digit_count_table(); 77 | 78 | inline int floor_log2(std::uint64_t n) { return 63 ^ __builtin_clzll(n); } 79 | 80 | inline int digits10_jk_jeon(std::uint64_t n) { 81 | auto clz = floor_log2(n); 82 | return int((digit_count_table.entry[clz] + (n >> (clz / 
4))) >> 52); 83 | } 84 | 85 | // It is a separate function rather than a part of count_digits to workaround 86 | // the lack of static constexpr in constexpr functions. 87 | inline uint64_t count_digits_inc(int n) { 88 | // An optimization by Kendall Willets from https://bit.ly/3uOIQrB. 89 | // This increments the upper 32 bits (log10(T) - 1) when >= T is added. 90 | #define FMT_INC(T) (((sizeof(#T) - 1ull) << 32) - T) 91 | static constexpr uint64_t table[] = { 92 | FMT_INC(0), FMT_INC(0), FMT_INC(0), // 8 93 | FMT_INC(10), FMT_INC(10), FMT_INC(10), // 64 94 | FMT_INC(100), FMT_INC(100), FMT_INC(100), // 512 95 | FMT_INC(1000), FMT_INC(1000), FMT_INC(1000), // 4096 96 | FMT_INC(10000), FMT_INC(10000), FMT_INC(10000), // 32k 97 | FMT_INC(100000), FMT_INC(100000), FMT_INC(100000), // 256k 98 | FMT_INC(1000000), FMT_INC(1000000), FMT_INC(1000000), // 2048k 99 | FMT_INC(10000000), FMT_INC(10000000), FMT_INC(10000000), // 16M 100 | FMT_INC(100000000), FMT_INC(100000000), FMT_INC(100000000), // 128M 101 | FMT_INC(1000000000), FMT_INC(1000000000), FMT_INC(1000000000), // 1024M 102 | FMT_INC(1000000000), FMT_INC(1000000000) // 4B 103 | }; 104 | return table[n]; 105 | } 106 | 107 | inline auto digits10_willets(std::uint32_t n) -> int { 108 | auto inc = count_digits_inc(__builtin_clz(n | 1) ^ 31); 109 | return static_cast((n + inc) >> 32); 110 | } 111 | 112 | inline std::uint32_t digits10_naive(std::uint32_t n) { 113 | std::uint32_t result = 0; 114 | do { 115 | ++result; 116 | n /= 10; 117 | } while (n); 118 | return result; 119 | } 120 | 121 | inline std::uint32_t digits10_unroll4(std::uint32_t n) { 122 | std::uint32_t result = 1; 123 | for (;;) { 124 | if (n < 10) return result; 125 | if (n < 100) return result + 1; 126 | if (n < 1000) return result + 2; 127 | if (n < 10000) return result + 3; 128 | n /= 10000u; 129 | result += 4; 130 | } 131 | } 132 | 133 | extern const uint32_t powers_of_10_u32[]; 134 | 135 | inline std::uint32_t digits10_clz(std::uint32_t n) { 136 | 
std::uint32_t t = (32 - __builtin_clz(n | 1)) * 1233 >> 12; 137 | return t - (n < powers_of_10_u32[t]) + 1; 138 | } 139 | 140 | // Maps bsr(n) to ceil(log10(pow(2, bsr(n) + 1) - 1)). 141 | // This is a function instead of an array to workaround a bug in GCC10 (#1810). 142 | inline uint16_t bsr2log10(int bsr) { 143 | constexpr uint16_t data[] = { 144 | 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 145 | 6, 6, 6, 7, 7, 7, 7, 8, 8, 8, 9, 9, 9, 10, 10, 10, 146 | 10, 11, 11, 11, 12, 12, 12, 13, 13, 13, 13, 14, 14, 14, 15, 15, 147 | 15, 16, 16, 16, 16, 17, 17, 17, 18, 18, 18, 19, 19, 19, 19, 20}; 148 | return data[bsr]; 149 | } 150 | 151 | static const uint32_t powers_of_10_u32_z[] = { 152 | 0, 0, 10, 100, 1000, 10000, 153 | 100000, 1000000, 10000000, 100000000, 1000000000}; 154 | 155 | // My version of digits10_clz that converts clz to bsr and uses two lookup 156 | // tables. 157 | inline std::uint32_t digits10_clz_zverovich(std::uint32_t n) { 158 | auto t = bsr2log10(__builtin_clz(n | 1) ^ 31); 159 | return t - (n < powers_of_10_u32_z[t]); 160 | } 161 | 162 | inline int digits10_grisu(uint32_t n) { 163 | if (n < 10) return 1; 164 | if (n < 100) return 2; 165 | if (n < 1000) return 3; 166 | if (n < 10000) return 4; 167 | if (n < 100000) return 5; 168 | if (n < 1000000) return 6; 169 | if (n < 10000000) return 7; 170 | if (n < 100000000) return 8; 171 | if (n < 1000000000) return 9; 172 | return 10; 173 | } 174 | 175 | // Return minimum number with the specified number of digits. 176 | inline std::uint32_t min_number(unsigned num_digits) { 177 | if (num_digits == 0 || num_digits > 10) 178 | throw std::out_of_range("num_digits is out of range"); 179 | return num_digits == 1 ? 0 : std::pow(10, num_digits - 1); 180 | } 181 | 182 | inline std::uint32_t max_number(unsigned num_digits) { 183 | if (num_digits == 0 || num_digits > 10) 184 | throw std::out_of_range("num_digits is out of range"); 185 | return num_digits == 10 ? 
std::numeric_limits::max() 186 | : std::pow(10, num_digits) - 1; 187 | } 188 | 189 | // Generate 100 numbers with specified number of digits. 190 | std::vector generate_numbers(int num_digits); 191 | 192 | template void run_benchmark(benchmark::State& state, F digits10) { 193 | int num_digits = state.range(); 194 | auto numbers = generate_numbers(num_digits); 195 | bool valid = true; 196 | while (state.KeepRunning()) { 197 | for (auto n : numbers) valid &= (digits10(n) == num_digits); 198 | } 199 | if (!valid) throw std::logic_error("invalid result"); 200 | } 201 | 202 | #endif // DIGITS10_H_ 203 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.8) 2 | 3 | # Joins arguments and places the results in ${result_var}. 4 | function(join result_var) 5 | set(result ) 6 | foreach (arg ${ARGN}) 7 | set(result "${result}${arg}") 8 | endforeach () 9 | set(${result_var} "${result}" PARENT_SCOPE) 10 | endfunction() 11 | 12 | # Sets a cache variable with a docstring joined from multiple arguments: 13 | # set( ... CACHE ...) 14 | # This allows splitting a long docstring for readability. 15 | function(set_verbose) 16 | # cmake_parse_arguments is broken in CMake 3.4 (cannot parse CACHE) so use 17 | # list instead. 18 | list(GET ARGN 0 var) 19 | list(REMOVE_AT ARGN 0) 20 | list(GET ARGN 0 val) 21 | list(REMOVE_AT ARGN 0) 22 | list(REMOVE_AT ARGN 0) 23 | list(GET ARGN 0 type) 24 | list(REMOVE_AT ARGN 0) 25 | join(doc ${ARGN}) 26 | set(${var} ${val} CACHE ${type} ${doc}) 27 | endfunction() 28 | 29 | # Set the default CMAKE_BUILD_TYPE to Release. 30 | # This should be done before the project command since the latter can set 31 | # CMAKE_BUILD_TYPE itself (it does so for nmake). 
# Default to a Release build when the user did not choose a build type.
if (NOT CMAKE_BUILD_TYPE)
  set_verbose(CMAKE_BUILD_TYPE Release CACHE STRING
              "Choose the type of build, options are: None(CMAKE_CXX_FLAGS or "
              "CMAKE_C_FLAGS used) Debug Release RelWithDebInfo MinSizeRel.")
endif ()

project(FORMAT_BENCHMARKS)

set(CMAKE_MACOSX_RPATH ON)
set(CMAKE_CXX_STANDARD 17)

include(CheckCXXCompilerFlag)

# Detect whether the host has an Intel CPU and use the result as the default
# value of the INTEL option. Detection is skipped when INTEL is already defined
# (e.g. passed as -DINTEL=ON/OFF on the command line).
if (NOT DEFINED INTEL)
  set(_IS_INTEL_HOST OFF)
  if (APPLE)
    execute_process(COMMAND sysctl -n machdep.cpu.brand_string
                    OUTPUT_VARIABLE out)
    # if(MATCHES) is a search, so no trailing ".*" is needed.
    if (out MATCHES "Intel")
      set(_IS_INTEL_HOST ON)
    endif ()
  elseif (EXISTS /proc/cpuinfo)
    # file(READ) is a fatal error for a missing file, so only read
    # /proc/cpuinfo on hosts that provide it; elsewhere (e.g. Windows) the
    # default stays OFF.
    file(READ /proc/cpuinfo out)
    if (out MATCHES "GenuineIntel")
      set(_IS_INTEL_HOST ON)
    endif ()
  endif ()
  option(INTEL "Enable Intel JCC bug mitigation." ${_IS_INTEL_HOST})
endif ()

# Checks whether the C++ compiler accepts the flags stored in
# ${test_flags_var_name}_FLAGS and, if so, appends them to the list variable
# named by append_to_var in the caller's scope.
#
# The result of the check is stored by check_cxx_compiler_flag in
# ${test_flags_var_name}_WORKS, which callers may inspect afterwards.
function(check_flags_and_append_on_success test_flags_var_name append_to_var)
  check_cxx_compiler_flag("${${test_flags_var_name}_FLAGS}"
                          ${test_flags_var_name}_WORKS)
  if (${test_flags_var_name}_WORKS)
    list(APPEND ${append_to_var} ${${test_flags_var_name}_FLAGS})
    # list(APPEND) only modified the function-local copy; publish it.
    set(${append_to_var} ${${append_to_var}} PARENT_SCOPE)
  endif ()
endfunction()

# Work around a JCC bug in Intel CPUs:
# https://www.intel.com/content/dam/support/us/en/documents/processors/
# mitigations-jump-conditional-code-erratum.pdf
# to get more reliable benchmark results.
#
# Ideally we should use -mbranches-within-32B-boundaries but it's not widely
# available so at least align loops/functions as fallback to prevent unrelated
# code changes from affecting benchmark results.
# Candidate alignment flags. Prefer the assembler-level 32-byte branch
# alignment; if the compiler rejects it, fall back to aligning all blocks
# and functions.
set(ALIGN_ALL_BLOCKS_FLAGS -mllvm -align-all-blocks=5)
set(ALIGN_32B_BOUNDARIES_FLAGS -Wa,-mbranches-within-32B-boundaries)
set(ALIGN_FUNCTION_FLAGS -falign-functions=32)
check_flags_and_append_on_success(ALIGN_32B_BOUNDARIES ALIGN_OPTIONS)
if (NOT ALIGN_32B_BOUNDARIES_WORKS)
  check_flags_and_append_on_success(ALIGN_ALL_BLOCKS ALIGN_OPTIONS)
  check_flags_and_append_on_success(ALIGN_FUNCTION ALIGN_OPTIONS)
endif ()

message(STATUS "Align options: ${ALIGN_OPTIONS}")
# These are compiler options, not preprocessor definitions, so use
# add_compile_options rather than add_definitions.
add_compile_options(${ALIGN_OPTIONS})

# Use shared libraries to make comparison with IOStreams and printf
# fair as these use shared libraries too (libstdc++ and libc).
set(BUILD_SHARED_LIBS ON CACHE BOOL
    "Build shared library instead of static one")
set(FMT_TEST TRUE CACHE BOOL "Enable fmt tests")
add_subdirectory(fmt)

find_package(Boost CONFIG)

# Optional Folly installation prefix. It is a cache variable so that it can be
# overridden with -DFOLLY_DIR=<prefix>; the default keeps the previously
# hard-coded location. The HINTS keyword belongs to find_path/find_library,
# not to the value of this variable.
set(FOLLY_DIR /Users/viz/homebrew/Cellar/folly/2022.11.14.00 CACHE PATH
    "Folly installation prefix")
find_path(FOLLY_INCLUDE_DIR folly/Format.h HINTS ${FOLLY_DIR}/include)
find_library(FOLLY_LIB folly HINTS ${FOLLY_DIR}/lib)
if (FOLLY_INCLUDE_DIR AND FOLLY_LIB)
  set(EXTRA_LIBS ${FOLLY_LIB})
endif ()

# Optional gperftools CPU profiler.
find_library(PROFILER_LIB profiler)
find_path(PROFILER_INCLUDE_DIR gperftools/profiler.h)
if (PROFILER_LIB AND PROFILER_INCLUDE_DIR)
  include_directories(${PROFILER_INCLUDE_DIR})
  set(HAVE_PROFILER TRUE)
endif ()

add_executable(tinyformat_speed_test src/tinyformat-test.cc)
target_link_libraries(tinyformat_speed_test fmt ${EXTRA_LIBS})
if (HAVE_PROFILER)
  target_link_libraries(tinyformat_speed_test ${PROFILER_LIB})
  set(PROFILE_DEFS ";FMT_PROFILE")
endif ()
if (TARGET Boost::boost)
  target_link_libraries(tinyformat_speed_test Boost::boost)
endif ()
set_target_properties(tinyformat_speed_test PROPERTIES COMPILE_DEFINITIONS
  "SPEED_TEST;HAVE_FORMAT;_SCL_SECURE_NO_WARNINGS;${PROFILE_DEFS}")
if (CPP11_FLAG)
  set_target_properties(tinyformat_speed_test
    PROPERTIES COMPILE_FLAGS ${CPP11_FLAG})
endif ()

# speed-test: run tinyformat_speed_test once per formatting method and print
# how long each run took.
if (WIN32)
  add_custom_target(speed-test
    COMMAND @echo running speed tests...
    COMMAND cd ${CMAKE_CFG_INTDIR}
    COMMAND @echo printf timings: start %time%
    COMMAND .\\tinyformat_speed_test.exe printf >NUL
    COMMAND @echo stop %time%
    COMMAND @echo iostreams timings: start %time%
    COMMAND .\\tinyformat_speed_test.exe iostreams >NUL
    COMMAND @echo stop %time%
    COMMAND @echo format timings: start %time%
    COMMAND .\\tinyformat_speed_test.exe format >NUL
    COMMAND @echo stop %time%
    COMMAND @echo tinyformat timings: start %time%
    COMMAND .\\tinyformat_speed_test.exe tinyformat >NUL
    COMMAND @echo stop %time%
    COMMAND @echo boost timings: start %time%
    COMMAND .\\tinyformat_speed_test.exe boost >NUL
    COMMAND @echo stop %time%
    COMMAND @echo stb_sprintf timings: start %time%
    COMMAND .\\tinyformat_speed_test.exe stb_sprintf >NUL
    COMMAND @echo stop %time%
    DEPENDS tinyformat_speed_test)
else()
  add_custom_target(speed-test
    COMMAND @echo running speed tests...
    COMMAND @echo printf timings:
    COMMAND @time -p ./tinyformat_speed_test printf > /dev/null
    COMMAND @echo iostreams timings:
    COMMAND @time -p ./tinyformat_speed_test iostreams > /dev/null
    COMMAND @echo format timings:
    COMMAND @time -p ./tinyformat_speed_test format > /dev/null
    COMMAND @echo fmt::compile timings:
    COMMAND @time -p ./tinyformat_speed_test fmt::compile > /dev/null
    COMMAND @echo tinyformat timings:
    COMMAND @time -p ./tinyformat_speed_test tinyformat > /dev/null
    COMMAND @echo boost timings:
    COMMAND @time -p ./tinyformat_speed_test boost > /dev/null
    COMMAND @echo folly timings:
    COMMAND @time -p ./tinyformat_speed_test folly > /dev/null
    COMMAND @echo stb_sprintf timings:
    COMMAND @time -p ./tinyformat_speed_test stb_sprintf > /dev/null
    DEPENDS tinyformat_speed_test)
endif ()

# Build one -I<dir> argument per Boost include directory. When Boost was not
# found the list stays empty, so the test scripts never receive a bare "-I".
set(BOOST_INCLUDE_ARGS)
foreach (boost_include_dir ${Boost_INCLUDE_DIRS})
  list(APPEND BOOST_INCLUDE_ARGS -I${boost_include_dir})
endforeach ()

add_custom_target(bloat-test
  COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/bloat-test.py
          ${BOOST_INCLUDE_ARGS}
  DEPENDS fmt)

# \${ARGS} is escaped so that it is expanded by the build tool when the target
# runs, not by CMake at configure time.
add_custom_target(variadic-test
  COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/variadic-test.py
          \${ARGS} ${BOOST_INCLUDE_ARGS}
  DEPENDS fmt)

enable_testing()
set(BENCHMARK_ENABLE_TESTING OFF CACHE BOOL "Disable benchmark tests.")
add_subdirectory(benchmark)

add_executable(digits10-benchmark src/digits10/digits10.cc
  src/digits10/digits10.h src/digits10/digits10-benchmark.cc)
target_link_libraries(digits10-benchmark benchmark)

add_executable(digits10-test src/digits10/digits10.cc
  src/digits10/digits10-test.cc)
target_link_libraries(digits10-test gtest benchmark)
add_test(digits10-test digits10-test)

add_executable(vararg-benchmark src/vararg-benchmark.cc)
target_link_libraries(vararg-benchmark benchmark fmt)

add_executable(int-benchmark src/int-benchmark.cc)
target_link_libraries(int-benchmark benchmark fmt)
203 | if (TARGET Boost::boost) 204 | target_link_libraries(int-benchmark Boost::boost) 205 | endif () 206 | 207 | target_compile_features(int-benchmark PRIVATE cxx_relaxed_constexpr) 208 | 209 | add_executable(locale-benchmark src/locale-benchmark.cc) 210 | target_link_libraries(locale-benchmark benchmark fmt) 211 | 212 | add_executable(concat-benchmark src/concat-benchmark.cc) 213 | target_link_libraries(concat-benchmark benchmark fmt) 214 | 215 | add_executable(file-benchmark src/file-benchmark.cc) 216 | target_link_libraries(file-benchmark benchmark fmt) 217 | 218 | add_executable(find-pow10-benchmark src/find-pow10-benchmark.cc) 219 | target_link_libraries(find-pow10-benchmark benchmark) 220 | 221 | add_executable( 222 | remove-trailing-zeros-benchmark src/remove-trailing-zeros-benchmark.cc) 223 | target_link_libraries(remove-trailing-zeros-benchmark benchmark fmt) 224 | 225 | add_subdirectory(src/itoa-benchmark) 226 | -------------------------------------------------------------------------------- /src/itoa-benchmark/branchlut.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include "digitslut.h" 3 | #include "test.h" 4 | 5 | // Branching for different cases (forward) 6 | // Use lookup table of two digits 7 | 8 | void u32toa_branchlut(uint32_t value, char* buffer) { 9 | if (value < 10000) { 10 | const uint32_t d1 = (value / 100) << 1; 11 | const uint32_t d2 = (value % 100) << 1; 12 | 13 | if (value >= 1000) 14 | *buffer++ = gDigitsLut[d1]; 15 | if (value >= 100) 16 | *buffer++ = gDigitsLut[d1 + 1]; 17 | if (value >= 10) 18 | *buffer++ = gDigitsLut[d2]; 19 | *buffer++ = gDigitsLut[d2 + 1]; 20 | } 21 | else if (value < 100000000) { 22 | // value = bbbbcccc 23 | const uint32_t b = value / 10000; 24 | const uint32_t c = value % 10000; 25 | 26 | const uint32_t d1 = (b / 100) << 1; 27 | const uint32_t d2 = (b % 100) << 1; 28 | 29 | const uint32_t d3 = (c / 100) << 1; 30 | const uint32_t d4 = (c % 100) << 1; 31 | 
32 | if (value >= 10000000) 33 | *buffer++ = gDigitsLut[d1]; 34 | if (value >= 1000000) 35 | *buffer++ = gDigitsLut[d1 + 1]; 36 | if (value >= 100000) 37 | *buffer++ = gDigitsLut[d2]; 38 | *buffer++ = gDigitsLut[d2 + 1]; 39 | 40 | *buffer++ = gDigitsLut[d3]; 41 | *buffer++ = gDigitsLut[d3 + 1]; 42 | *buffer++ = gDigitsLut[d4]; 43 | *buffer++ = gDigitsLut[d4 + 1]; 44 | } 45 | else { 46 | // value = aabbbbcccc in decimal 47 | 48 | const uint32_t a = value / 100000000; // 1 to 42 49 | value %= 100000000; 50 | 51 | if (a >= 10) { 52 | const unsigned i = a << 1; 53 | *buffer++ = gDigitsLut[i]; 54 | *buffer++ = gDigitsLut[i + 1]; 55 | } 56 | else 57 | *buffer++ = '0' + static_cast(a); 58 | 59 | const uint32_t b = value / 10000; // 0 to 9999 60 | const uint32_t c = value % 10000; // 0 to 9999 61 | 62 | const uint32_t d1 = (b / 100) << 1; 63 | const uint32_t d2 = (b % 100) << 1; 64 | 65 | const uint32_t d3 = (c / 100) << 1; 66 | const uint32_t d4 = (c % 100) << 1; 67 | 68 | *buffer++ = gDigitsLut[d1]; 69 | *buffer++ = gDigitsLut[d1 + 1]; 70 | *buffer++ = gDigitsLut[d2]; 71 | *buffer++ = gDigitsLut[d2 + 1]; 72 | *buffer++ = gDigitsLut[d3]; 73 | *buffer++ = gDigitsLut[d3 + 1]; 74 | *buffer++ = gDigitsLut[d4]; 75 | *buffer++ = gDigitsLut[d4 + 1]; 76 | } 77 | *buffer++ = '\0'; 78 | } 79 | 80 | void i32toa_branchlut(int32_t value, char* buffer) { 81 | uint32_t u = static_cast(value); 82 | if (value < 0) { 83 | *buffer++ = '-'; 84 | u = ~u + 1; 85 | } 86 | 87 | u32toa_branchlut(u, buffer); 88 | } 89 | 90 | void u64toa_branchlut(uint64_t value, char* buffer) { 91 | if (value < 100000000) { 92 | uint32_t v = static_cast(value); 93 | if (v < 10000) { 94 | const uint32_t d1 = (v / 100) << 1; 95 | const uint32_t d2 = (v % 100) << 1; 96 | 97 | if (v >= 1000) 98 | *buffer++ = gDigitsLut[d1]; 99 | if (v >= 100) 100 | *buffer++ = gDigitsLut[d1 + 1]; 101 | if (v >= 10) 102 | *buffer++ = gDigitsLut[d2]; 103 | *buffer++ = gDigitsLut[d2 + 1]; 104 | } 105 | else { 106 | // value = bbbbcccc 
107 | const uint32_t b = v / 10000; 108 | const uint32_t c = v % 10000; 109 | 110 | const uint32_t d1 = (b / 100) << 1; 111 | const uint32_t d2 = (b % 100) << 1; 112 | 113 | const uint32_t d3 = (c / 100) << 1; 114 | const uint32_t d4 = (c % 100) << 1; 115 | 116 | if (value >= 10000000) 117 | *buffer++ = gDigitsLut[d1]; 118 | if (value >= 1000000) 119 | *buffer++ = gDigitsLut[d1 + 1]; 120 | if (value >= 100000) 121 | *buffer++ = gDigitsLut[d2]; 122 | *buffer++ = gDigitsLut[d2 + 1]; 123 | 124 | *buffer++ = gDigitsLut[d3]; 125 | *buffer++ = gDigitsLut[d3 + 1]; 126 | *buffer++ = gDigitsLut[d4]; 127 | *buffer++ = gDigitsLut[d4 + 1]; 128 | } 129 | } 130 | else if (value < 10000000000000000) { 131 | const uint32_t v0 = static_cast(value / 100000000); 132 | const uint32_t v1 = static_cast(value % 100000000); 133 | 134 | const uint32_t b0 = v0 / 10000; 135 | const uint32_t c0 = v0 % 10000; 136 | 137 | const uint32_t d1 = (b0 / 100) << 1; 138 | const uint32_t d2 = (b0 % 100) << 1; 139 | 140 | const uint32_t d3 = (c0 / 100) << 1; 141 | const uint32_t d4 = (c0 % 100) << 1; 142 | 143 | const uint32_t b1 = v1 / 10000; 144 | const uint32_t c1 = v1 % 10000; 145 | 146 | const uint32_t d5 = (b1 / 100) << 1; 147 | const uint32_t d6 = (b1 % 100) << 1; 148 | 149 | const uint32_t d7 = (c1 / 100) << 1; 150 | const uint32_t d8 = (c1 % 100) << 1; 151 | 152 | if (value >= 1000000000000000) 153 | *buffer++ = gDigitsLut[d1]; 154 | if (value >= 100000000000000) 155 | *buffer++ = gDigitsLut[d1 + 1]; 156 | if (value >= 10000000000000) 157 | *buffer++ = gDigitsLut[d2]; 158 | if (value >= 1000000000000) 159 | *buffer++ = gDigitsLut[d2 + 1]; 160 | if (value >= 100000000000) 161 | *buffer++ = gDigitsLut[d3]; 162 | if (value >= 10000000000) 163 | *buffer++ = gDigitsLut[d3 + 1]; 164 | if (value >= 1000000000) 165 | *buffer++ = gDigitsLut[d4]; 166 | if (value >= 100000000) 167 | *buffer++ = gDigitsLut[d4 + 1]; 168 | 169 | *buffer++ = gDigitsLut[d5]; 170 | *buffer++ = gDigitsLut[d5 + 1]; 171 | *buffer++ 
= gDigitsLut[d6]; 172 | *buffer++ = gDigitsLut[d6 + 1]; 173 | *buffer++ = gDigitsLut[d7]; 174 | *buffer++ = gDigitsLut[d7 + 1]; 175 | *buffer++ = gDigitsLut[d8]; 176 | *buffer++ = gDigitsLut[d8 + 1]; 177 | } 178 | else { 179 | const uint32_t a = static_cast(value / 10000000000000000); // 1 to 1844 180 | value %= 10000000000000000; 181 | 182 | if (a < 10) 183 | *buffer++ = '0' + static_cast(a); 184 | else if (a < 100) { 185 | const uint32_t i = a << 1; 186 | *buffer++ = gDigitsLut[i]; 187 | *buffer++ = gDigitsLut[i + 1]; 188 | } 189 | else if (a < 1000) { 190 | *buffer++ = '0' + static_cast(a / 100); 191 | 192 | const uint32_t i = (a % 100) << 1; 193 | *buffer++ = gDigitsLut[i]; 194 | *buffer++ = gDigitsLut[i + 1]; 195 | } 196 | else { 197 | const uint32_t i = (a / 100) << 1; 198 | const uint32_t j = (a % 100) << 1; 199 | *buffer++ = gDigitsLut[i]; 200 | *buffer++ = gDigitsLut[i + 1]; 201 | *buffer++ = gDigitsLut[j]; 202 | *buffer++ = gDigitsLut[j + 1]; 203 | } 204 | 205 | const uint32_t v0 = static_cast(value / 100000000); 206 | const uint32_t v1 = static_cast(value % 100000000); 207 | 208 | const uint32_t b0 = v0 / 10000; 209 | const uint32_t c0 = v0 % 10000; 210 | 211 | const uint32_t d1 = (b0 / 100) << 1; 212 | const uint32_t d2 = (b0 % 100) << 1; 213 | 214 | const uint32_t d3 = (c0 / 100) << 1; 215 | const uint32_t d4 = (c0 % 100) << 1; 216 | 217 | const uint32_t b1 = v1 / 10000; 218 | const uint32_t c1 = v1 % 10000; 219 | 220 | const uint32_t d5 = (b1 / 100) << 1; 221 | const uint32_t d6 = (b1 % 100) << 1; 222 | 223 | const uint32_t d7 = (c1 / 100) << 1; 224 | const uint32_t d8 = (c1 % 100) << 1; 225 | 226 | *buffer++ = gDigitsLut[d1]; 227 | *buffer++ = gDigitsLut[d1 + 1]; 228 | *buffer++ = gDigitsLut[d2]; 229 | *buffer++ = gDigitsLut[d2 + 1]; 230 | *buffer++ = gDigitsLut[d3]; 231 | *buffer++ = gDigitsLut[d3 + 1]; 232 | *buffer++ = gDigitsLut[d4]; 233 | *buffer++ = gDigitsLut[d4 + 1]; 234 | *buffer++ = gDigitsLut[d5]; 235 | *buffer++ = gDigitsLut[d5 + 1]; 236 
| *buffer++ = gDigitsLut[d6]; 237 | *buffer++ = gDigitsLut[d6 + 1]; 238 | *buffer++ = gDigitsLut[d7]; 239 | *buffer++ = gDigitsLut[d7 + 1]; 240 | *buffer++ = gDigitsLut[d8]; 241 | *buffer++ = gDigitsLut[d8 + 1]; 242 | } 243 | 244 | *buffer = '\0'; 245 | } 246 | 247 | void i64toa_branchlut(int64_t value, char* buffer) { 248 | uint64_t u = static_cast(value); 249 | if (value < 0) { 250 | *buffer++ = '-'; 251 | u = ~u + 1; 252 | } 253 | 254 | u64toa_branchlut(u, buffer); 255 | } 256 | 257 | REGISTER_TEST(branchlut); 258 | -------------------------------------------------------------------------------- /src/itoa-benchmark/msinttypes/inttypes.h: -------------------------------------------------------------------------------- 1 | // ISO C9x compliant inttypes.h for Microsoft Visual Studio 2 | // Based on ISO/IEC 9899:TC2 Committee draft (May 6, 2005) WG14/N1124 3 | // 4 | // Copyright (c) 2006-2013 Alexander Chemeris 5 | // 6 | // Redistribution and use in source and binary forms, with or without 7 | // modification, are permitted provided that the following conditions are met: 8 | // 9 | // 1. Redistributions of source code must retain the above copyright notice, 10 | // this list of conditions and the following disclaimer. 11 | // 12 | // 2. Redistributions in binary form must reproduce the above copyright 13 | // notice, this list of conditions and the following disclaimer in the 14 | // documentation and/or other materials provided with the distribution. 15 | // 16 | // 3. Neither the name of the product nor the names of its contributors may 17 | // be used to endorse or promote products derived from this software 18 | // without specific prior written permission. 19 | // 20 | // THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED 21 | // WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF 22 | // MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO 23 | // EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 24 | // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 25 | // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; 26 | // OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, 27 | // WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR 28 | // OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF 29 | // ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 | // 31 | /////////////////////////////////////////////////////////////////////////////// 32 | 33 | #ifndef _MSC_VER // [ 34 | #error "Use this header only with Microsoft Visual C++ compilers!" 35 | #endif // _MSC_VER ] 36 | 37 | #ifndef _MSC_INTTYPES_H_ // [ 38 | #define _MSC_INTTYPES_H_ 39 | 40 | #if _MSC_VER > 1000 41 | #pragma once 42 | #endif 43 | 44 | #include "stdint.h" 45 | 46 | // 7.8 Format conversion of integer types 47 | 48 | typedef struct { 49 | intmax_t quot; 50 | intmax_t rem; 51 | } imaxdiv_t; 52 | 53 | // 7.8.1 Macros for format specifiers 54 | 55 | #if !defined(__cplusplus) || defined(__STDC_FORMAT_MACROS) // [ See footnote 185 at page 198 56 | 57 | // The fprintf macros for signed integers are: 58 | #define PRId8 "d" 59 | #define PRIi8 "i" 60 | #define PRIdLEAST8 "d" 61 | #define PRIiLEAST8 "i" 62 | #define PRIdFAST8 "d" 63 | #define PRIiFAST8 "i" 64 | 65 | #define PRId16 "hd" 66 | #define PRIi16 "hi" 67 | #define PRIdLEAST16 "hd" 68 | #define PRIiLEAST16 "hi" 69 | #define PRIdFAST16 "hd" 70 | #define PRIiFAST16 "hi" 71 | 72 | #define PRId32 "I32d" 73 | #define PRIi32 "I32i" 74 | #define PRIdLEAST32 "I32d" 75 | #define PRIiLEAST32 "I32i" 76 | #define PRIdFAST32 "I32d" 77 | #define PRIiFAST32 "I32i" 78 | 79 | #define PRId64 "I64d" 80 | #define PRIi64 "I64i" 81 | #define PRIdLEAST64 "I64d" 82 | #define PRIiLEAST64 "I64i" 83 | #define PRIdFAST64 "I64d" 84 | #define PRIiFAST64 "I64i" 85 | 86 | #define PRIdMAX 
"I64d" 87 | #define PRIiMAX "I64i" 88 | 89 | #define PRIdPTR "Id" 90 | #define PRIiPTR "Ii" 91 | 92 | // The fprintf macros for unsigned integers are: 93 | #define PRIo8 "o" 94 | #define PRIu8 "u" 95 | #define PRIx8 "x" 96 | #define PRIX8 "X" 97 | #define PRIoLEAST8 "o" 98 | #define PRIuLEAST8 "u" 99 | #define PRIxLEAST8 "x" 100 | #define PRIXLEAST8 "X" 101 | #define PRIoFAST8 "o" 102 | #define PRIuFAST8 "u" 103 | #define PRIxFAST8 "x" 104 | #define PRIXFAST8 "X" 105 | 106 | #define PRIo16 "ho" 107 | #define PRIu16 "hu" 108 | #define PRIx16 "hx" 109 | #define PRIX16 "hX" 110 | #define PRIoLEAST16 "ho" 111 | #define PRIuLEAST16 "hu" 112 | #define PRIxLEAST16 "hx" 113 | #define PRIXLEAST16 "hX" 114 | #define PRIoFAST16 "ho" 115 | #define PRIuFAST16 "hu" 116 | #define PRIxFAST16 "hx" 117 | #define PRIXFAST16 "hX" 118 | 119 | #define PRIo32 "I32o" 120 | #define PRIu32 "I32u" 121 | #define PRIx32 "I32x" 122 | #define PRIX32 "I32X" 123 | #define PRIoLEAST32 "I32o" 124 | #define PRIuLEAST32 "I32u" 125 | #define PRIxLEAST32 "I32x" 126 | #define PRIXLEAST32 "I32X" 127 | #define PRIoFAST32 "I32o" 128 | #define PRIuFAST32 "I32u" 129 | #define PRIxFAST32 "I32x" 130 | #define PRIXFAST32 "I32X" 131 | 132 | #define PRIo64 "I64o" 133 | #define PRIu64 "I64u" 134 | #define PRIx64 "I64x" 135 | #define PRIX64 "I64X" 136 | #define PRIoLEAST64 "I64o" 137 | #define PRIuLEAST64 "I64u" 138 | #define PRIxLEAST64 "I64x" 139 | #define PRIXLEAST64 "I64X" 140 | #define PRIoFAST64 "I64o" 141 | #define PRIuFAST64 "I64u" 142 | #define PRIxFAST64 "I64x" 143 | #define PRIXFAST64 "I64X" 144 | 145 | #define PRIoMAX "I64o" 146 | #define PRIuMAX "I64u" 147 | #define PRIxMAX "I64x" 148 | #define PRIXMAX "I64X" 149 | 150 | #define PRIoPTR "Io" 151 | #define PRIuPTR "Iu" 152 | #define PRIxPTR "Ix" 153 | #define PRIXPTR "IX" 154 | 155 | // The fscanf macros for signed integers are: 156 | #define SCNd8 "d" 157 | #define SCNi8 "i" 158 | #define SCNdLEAST8 "d" 159 | #define SCNiLEAST8 "i" 160 | #define 
SCNdFAST8 "d" 161 | #define SCNiFAST8 "i" 162 | 163 | #define SCNd16 "hd" 164 | #define SCNi16 "hi" 165 | #define SCNdLEAST16 "hd" 166 | #define SCNiLEAST16 "hi" 167 | #define SCNdFAST16 "hd" 168 | #define SCNiFAST16 "hi" 169 | 170 | #define SCNd32 "ld" 171 | #define SCNi32 "li" 172 | #define SCNdLEAST32 "ld" 173 | #define SCNiLEAST32 "li" 174 | #define SCNdFAST32 "ld" 175 | #define SCNiFAST32 "li" 176 | 177 | #define SCNd64 "I64d" 178 | #define SCNi64 "I64i" 179 | #define SCNdLEAST64 "I64d" 180 | #define SCNiLEAST64 "I64i" 181 | #define SCNdFAST64 "I64d" 182 | #define SCNiFAST64 "I64i" 183 | 184 | #define SCNdMAX "I64d" 185 | #define SCNiMAX "I64i" 186 | 187 | #ifdef _WIN64 // [ 188 | # define SCNdPTR "I64d" 189 | # define SCNiPTR "I64i" 190 | #else // _WIN64 ][ 191 | # define SCNdPTR "ld" 192 | # define SCNiPTR "li" 193 | #endif // _WIN64 ] 194 | 195 | // The fscanf macros for unsigned integers are: 196 | #define SCNo8 "o" 197 | #define SCNu8 "u" 198 | #define SCNx8 "x" 199 | #define SCNX8 "X" 200 | #define SCNoLEAST8 "o" 201 | #define SCNuLEAST8 "u" 202 | #define SCNxLEAST8 "x" 203 | #define SCNXLEAST8 "X" 204 | #define SCNoFAST8 "o" 205 | #define SCNuFAST8 "u" 206 | #define SCNxFAST8 "x" 207 | #define SCNXFAST8 "X" 208 | 209 | #define SCNo16 "ho" 210 | #define SCNu16 "hu" 211 | #define SCNx16 "hx" 212 | #define SCNX16 "hX" 213 | #define SCNoLEAST16 "ho" 214 | #define SCNuLEAST16 "hu" 215 | #define SCNxLEAST16 "hx" 216 | #define SCNXLEAST16 "hX" 217 | #define SCNoFAST16 "ho" 218 | #define SCNuFAST16 "hu" 219 | #define SCNxFAST16 "hx" 220 | #define SCNXFAST16 "hX" 221 | 222 | #define SCNo32 "lo" 223 | #define SCNu32 "lu" 224 | #define SCNx32 "lx" 225 | #define SCNX32 "lX" 226 | #define SCNoLEAST32 "lo" 227 | #define SCNuLEAST32 "lu" 228 | #define SCNxLEAST32 "lx" 229 | #define SCNXLEAST32 "lX" 230 | #define SCNoFAST32 "lo" 231 | #define SCNuFAST32 "lu" 232 | #define SCNxFAST32 "lx" 233 | #define SCNXFAST32 "lX" 234 | 235 | #define SCNo64 "I64o" 236 | #define 
SCNu64 "I64u" 237 | #define SCNx64 "I64x" 238 | #define SCNX64 "I64X" 239 | #define SCNoLEAST64 "I64o" 240 | #define SCNuLEAST64 "I64u" 241 | #define SCNxLEAST64 "I64x" 242 | #define SCNXLEAST64 "I64X" 243 | #define SCNoFAST64 "I64o" 244 | #define SCNuFAST64 "I64u" 245 | #define SCNxFAST64 "I64x" 246 | #define SCNXFAST64 "I64X" 247 | 248 | #define SCNoMAX "I64o" 249 | #define SCNuMAX "I64u" 250 | #define SCNxMAX "I64x" 251 | #define SCNXMAX "I64X" 252 | 253 | #ifdef _WIN64 // [ 254 | # define SCNoPTR "I64o" 255 | # define SCNuPTR "I64u" 256 | # define SCNxPTR "I64x" 257 | # define SCNXPTR "I64X" 258 | #else // _WIN64 ][ 259 | # define SCNoPTR "lo" 260 | # define SCNuPTR "lu" 261 | # define SCNxPTR "lx" 262 | # define SCNXPTR "lX" 263 | #endif // _WIN64 ] 264 | 265 | #endif // __STDC_FORMAT_MACROS ] 266 | 267 | // 7.8.2 Functions for greatest-width integer types 268 | 269 | // 7.8.2.1 The imaxabs function 270 | #define imaxabs _abs64 271 | 272 | // 7.8.2.2 The imaxdiv function 273 | 274 | // This is modified version of div() function from Microsoft's div.c found 275 | // in %MSVC.NET%\crt\src\div.c 276 | #ifdef STATIC_IMAXDIV // [ 277 | static 278 | #else // STATIC_IMAXDIV ][ 279 | _inline 280 | #endif // STATIC_IMAXDIV ] 281 | imaxdiv_t __cdecl imaxdiv(intmax_t numer, intmax_t denom) 282 | { 283 | imaxdiv_t result; 284 | 285 | result.quot = numer / denom; 286 | result.rem = numer % denom; 287 | 288 | if (numer < 0 && result.rem > 0) { 289 | // did division wrong; must fix up 290 | ++result.quot; 291 | result.rem -= denom; 292 | } 293 | 294 | return result; 295 | } 296 | 297 | // 7.8.2.3 The strtoimax and strtoumax functions 298 | #define strtoimax _strtoi64 299 | #define strtoumax _strtoui64 300 | 301 | // 7.8.2.4 The wcstoimax and wcstoumax functions 302 | #define wcstoimax _wcstoi64 303 | #define wcstoumax _wcstoui64 304 | 305 | 306 | #endif // _MSC_INTTYPES_H_ ] 307 | -------------------------------------------------------------------------------- 
/bloat-test.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | # Script to test how much bloating a large project will suffer when using 4 | # different formatting methods. 5 | # Based on bloat_test.sh from https://github.com/c42f/tinyformat. 6 | 7 | from __future__ import print_function 8 | import os, re, sys 9 | from contextlib import ExitStack 10 | from glob import glob 11 | from subprocess import check_call, Popen, PIPE, CalledProcessError 12 | from timeit import timeit 13 | 14 | template = r''' 15 | #ifdef USE_BOOST 16 | 17 | #include 18 | #include 19 | 20 | void doFormat_a() { 21 | std::cout << boost::format("%s\n") % "somefile.cpp"; 22 | std::cout << boost::format("%s:%d\n") % "somefile.cpp" % 42; 23 | std::cout << boost::format("%s:%d:%s\n") % "somefile.cpp" % 42 % "asdf"; 24 | std::cout << 25 | boost::format("%s:%d:%d:%s\n") % "somefile.cpp" % 42 % 1 % "asdf"; 26 | std::cout << 27 | boost::format("%s:%d:%d:%d:%s\n") % "somefile.cpp" % 42 % 1 % 2 % "asdf"; 28 | } 29 | 30 | #elif USE_FOLLY 31 | 32 | #include 33 | #include 34 | 35 | void doFormat_a() { 36 | std::cout << folly::format("{}\n", "somefile.cpp"); 37 | std::cout << folly::format("{}:{}\n", "somefile.cpp", 42); 38 | std::cout << folly::format("{}:{}:{}\n", "somefile.cpp", 42, "asdf"); 39 | std::cout << 40 | folly::format("{}:{}:{}:{}\n", "somefile.cpp", 42, 1, "asdf"); 41 | std::cout << 42 | folly::format("{}:{}:{}:{}:{}\n", "somefile.cpp", 42, 1, 2, "asdf"); 43 | } 44 | 45 | #elif defined(USE_FMT) 46 | 47 | #include "fmt/base.h" 48 | 49 | void doFormat_a() { 50 | fmt::print("{}\n", "somefile.cpp"); 51 | fmt::print("{}:{}\n", "somefile.cpp", 42); 52 | fmt::print("{}:{}:{}\n", "somefile.cpp", 42, "asdf"); 53 | fmt::print("{}:{}:{}:{}\n", "somefile.cpp", 42, 1, "asdf"); 54 | fmt::print("{}:{}:{}:{}:{}\n", "somefile.cpp", 42, 1, 2, "asdf"); 55 | } 56 | 57 | #elif defined(USE_IOSTREAMS) 58 | 59 | #include 60 | 61 | void doFormat_a() { 
62 | std::cout << "somefile.cpp" << "\n"; 63 | std::cout << "somefile.cpp:" << 42 << "\n"; 64 | std::cout << "somefile.cpp:" << 42 << ":asdf" << "\n"; 65 | std::cout << "somefile.cpp:" << 42 << ':' << 1 << ":asdf" << "\n"; 66 | std::cout << "somefile.cpp:" << 42 << ':' << 1 << ':' << 2 << ":asdf" << "\n"; 67 | } 68 | 69 | #elif defined(USE_STB_SPRINTF) 70 | 71 | #ifdef FIRST_FILE 72 | # define STB_SPRINTF_IMPLEMENTATION 73 | #endif 74 | // since this test doesn't use floating point numbers shave ~20kb 75 | #define STB_SPRINTF_NOFLOAT 76 | 77 | #include "src/stb_sprintf.h" 78 | #include 79 | 80 | void doFormat_a() { 81 | char buf[100]; 82 | stbsp_sprintf(buf, "%s\n", "somefile.cpp"); 83 | fputs(buf, stdout); 84 | stbsp_sprintf(buf, "%s:%d\n", "somefile.cpp", 42); 85 | fputs(buf, stdout); 86 | stbsp_sprintf(buf, "%s:%d:%s\n", "somefile.cpp", 42, "asdf"); 87 | fputs(buf, stdout); 88 | stbsp_sprintf(buf, "%s:%d:%d:%s\n", "somefile.cpp", 42, 1, "asdf"); 89 | fputs(buf, stdout); 90 | stbsp_sprintf(buf, "%s:%d:%d:%d:%s\n", "somefile.cpp", 42, 1, 2, "asdf"); 91 | fputs(buf, stdout); 92 | } 93 | 94 | #else 95 | # ifdef USE_TINYFORMAT 96 | # include "src/tinyformat.h" 97 | # define PRINTF tfm::printf 98 | # else 99 | # include 100 | # define PRINTF ::printf 101 | # endif 102 | 103 | void doFormat_a() { 104 | PRINTF("%s\n", "somefile.cpp"); 105 | PRINTF("%s:%d\n", "somefile.cpp", 42); 106 | PRINTF("%s:%d:%s\n", "somefile.cpp", 42, "asdf"); 107 | PRINTF("%s:%d:%d:%s\n", "somefile.cpp", 42, 1, "asdf"); 108 | PRINTF("%s:%d:%d:%d:%s\n", "somefile.cpp", 42, 1, 2, "asdf"); 109 | } 110 | #endif 111 | ''' 112 | 113 | prefix = '/tmp/_bloat_test_tmp_' 114 | num_translation_units = 100 115 | 116 | # Remove old files. 117 | filenames = glob(prefix + '??.cc') 118 | for f in [prefix + 'main.cc', prefix + 'all.h']: 119 | if os.path.exists(f): 120 | filenames.append(f) 121 | for f in filenames: 122 | os.remove(f) 123 | 124 | # Generate all the files. 
125 | main_source = prefix + 'main.cc' 126 | main_header = prefix + 'all.h' 127 | sources = [main_source] 128 | with ExitStack() as stack: 129 | main_file = stack.enter_context(open(main_source, 'w')) 130 | header_file = stack.enter_context(open(main_header, 'w')) 131 | main_file.write(re.sub('^ +', '', ''' 132 | #include "{}all.h" 133 | 134 | int main() {{ 135 | '''.format(prefix), 0, re.MULTILINE)) 136 | for i in range(num_translation_units): 137 | n = '{:03}'.format(i) 138 | func_name = 'doFormat_a' + n 139 | source = prefix + n + '.cc' 140 | sources.append(source) 141 | with open(source, 'w') as f: 142 | if i == 0: 143 | f.write('#define FIRST_FILE\n') 144 | f.write(template.replace('doFormat_a', func_name).replace('42', str(i))) 145 | main_file.write(func_name + '();\n') 146 | header_file.write('void ' + func_name + '();\n') 147 | main_file.write('}') 148 | 149 | # Find compiler. 150 | compiler_path = None 151 | for path in os.getenv('PATH').split(os.pathsep): 152 | filename = os.path.join(path, 'g++') 153 | if os.path.exists(filename): 154 | if os.path.islink(filename) and \ 155 | os.path.basename(os.path.realpath(filename)) == 'ccache': 156 | # Don't use ccache. 157 | print('Ignoring ccache link at', filename) 158 | continue 159 | compiler_path = filename 160 | break 161 | print('Using compiler', filename) 162 | 163 | class Result: 164 | pass 165 | 166 | # Measure compile time and executable size. 
167 | expected_output = None 168 | def benchmark(flags): 169 | output_filename = prefix + '.out' 170 | if os.path.exists(output_filename): 171 | os.remove(output_filename) 172 | include_dir = '-I' + os.path.dirname(os.path.realpath(__file__)) 173 | command = 'check_call({})'.format( 174 | [compiler_path, '-std=c++17', '-o', output_filename, include_dir] + sources + flags) 175 | result = Result() 176 | try: 177 | result.time = timeit( 178 | command, setup = 'from subprocess import check_call', number = 1) 179 | except CalledProcessError: 180 | return None 181 | print('Compile time: {:.2f}s'.format(result.time)) 182 | result.size = os.stat(output_filename).st_size 183 | print('Size: {}'.format(result.size)) 184 | check_call(['strip', output_filename]) 185 | result.stripped_size = os.stat(output_filename).st_size 186 | print('Stripped size: {}'.format(result.stripped_size)) 187 | p = Popen([output_filename], stdout=PIPE, 188 | env={'LD_LIBRARY_PATH': 'fmt', 'DYLD_LIBRARY_PATH': 'fmt'}) 189 | output = p.communicate()[0] 190 | global expected_output 191 | if not expected_output: 192 | expected_output = output 193 | elif output != expected_output: 194 | print(output) 195 | raise Exception("output doesn't match") 196 | sys.stdout.flush() 197 | return result 198 | 199 | configs = [ 200 | ('optimized', ['-O3', '-DNDEBUG']), 201 | ('debug', []) 202 | ] 203 | 204 | fmt_library = 'fmt/libfmt.so' 205 | if not os.path.exists(fmt_library): 206 | fmt_library = fmt_library.replace('.so', '.dylib') 207 | 208 | methods = [ 209 | ('printf' , []), 210 | ('IOStreams' , ['-DUSE_IOSTREAMS']), 211 | ('fmt' , ['-DUSE_FMT', '-Ifmt/include', fmt_library]), 212 | ('tinyformat' , ['-DUSE_TINYFORMAT']), 213 | ('Boost Format' , ['-DUSE_BOOST']), 214 | ('Folly Format' , ['-DUSE_FOLLY', '-lfolly']), 215 | ('stb_sprintf' , ['-DUSE_STB_SPRINTF']), 216 | ] 217 | 218 | def format_field(field, format = '', width = ''): 219 | return '{:{}{}}'.format(field, width, format) 220 | 221 | def 
print_rulers(widths): 222 | for w in widths: 223 | print('=' * w, end = ' ') 224 | print() 225 | 226 | # Prints a reStructuredText table. 227 | def print_table(table, *formats): 228 | widths = [len(i) for i in table[0]] 229 | for row in table[1:]: 230 | for i in range(len(row)): 231 | widths[i] = max(widths[i], len(format_field(row[i], formats[i]))) 232 | print_rulers(widths) 233 | row = table[0] 234 | for i in range(len(row)): 235 | print(format_field(row[i], '', widths[i]), end = ' ') 236 | print() 237 | print_rulers(widths) 238 | for row in table[1:]: 239 | for i in range(len(row)): 240 | print(format_field(row[i], formats[i], widths[i]), end = ' ') 241 | print() 242 | print_rulers(widths) 243 | 244 | # Converts n to kibibytes. 245 | def to_kib(n): 246 | return int(round(n / 1024.0)) 247 | 248 | exclude_list = [] 249 | NUM_RUNS = 3 250 | for config, flags in configs: 251 | results = {} 252 | for i in range(NUM_RUNS): 253 | for method, method_flags in methods: 254 | if method in exclude_list: 255 | continue 256 | print('Benchmarking', config, method) 257 | sys.stdout.flush() 258 | new_result = benchmark(flags + method_flags + sys.argv[1:]) 259 | if not new_result: 260 | exclude_list.append(method) 261 | print(method + ' is not available') 262 | continue 263 | if method not in results: 264 | results[method] = new_result 265 | continue 266 | old_result = results[method] 267 | old_result.time = min(old_result.time, new_result.time) 268 | if new_result.size != old_result.size or \ 269 | new_result.stripped_size != old_result.stripped_size: 270 | raise Exception('size mismatch') 271 | print(config, 'Results:') 272 | table = [ 273 | ('Method', 'Compile Time, s', 'Executable size, KiB', 'Stripped size, KiB') 274 | ] 275 | for method, method_flags in methods: 276 | if method not in results: 277 | continue 278 | result = results[method] 279 | table.append( 280 | (method, result.time, to_kib(result.size), to_kib(result.stripped_size))) 281 | print_table(table, '', '.1f', 
'', '') 282 | -------------------------------------------------------------------------------- /src/itoa-benchmark/main.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include "resultfilename.h" 12 | #include "timer.h" 13 | #include "test.h" 14 | 15 | const unsigned kIterationPerDigit = 100000; 16 | const unsigned kIterationForRandom = 100; 17 | const unsigned kTrial = 10; 18 | 19 | template 20 | struct Traits { 21 | }; 22 | 23 | template <> 24 | struct Traits { 25 | enum { kBufferSize = 11 }; 26 | enum { kMaxDigit = 10 }; 27 | static uint32_t Negate(uint32_t x) { return x; }; 28 | }; 29 | 30 | template <> 31 | struct Traits { 32 | enum { kBufferSize = 12 }; 33 | enum { kMaxDigit = 10 }; 34 | static int32_t Negate(int32_t x) { return -x; }; 35 | }; 36 | 37 | template <> 38 | struct Traits { 39 | enum { kBufferSize = 21 }; 40 | enum { kMaxDigit = 20 }; 41 | static uint64_t Negate(uint64_t x) { return x; }; 42 | }; 43 | 44 | template <> 45 | struct Traits { 46 | enum { kBufferSize = 22 }; 47 | enum { kMaxDigit = 20 }; 48 | static int64_t Negate(int64_t x) { return -x; }; 49 | }; 50 | 51 | template 52 | static void VerifyValue(T value, void(*f)(T, char*), void(*g)(T, char*), const char* fname, const char* gname) { 53 | char buffer1[Traits::kBufferSize]; 54 | char buffer2[Traits::kBufferSize]; 55 | 56 | f(value, buffer1); 57 | g(value, buffer2); 58 | 59 | if (strcmp(buffer1, buffer2) != 0) { 60 | printf("\nError: %s -> %s, %s -> %s\n", fname, buffer1, gname, buffer2); 61 | throw std::exception(); 62 | } 63 | //puts(buffer1); 64 | } 65 | 66 | template 67 | static void Verify(void(*f)(T, char*), void(*g)(T, char*), const char* fname, const char* gname) { 68 | printf("Verifying %s = %s ... 
", fname, gname); 69 | 70 | // Boundary cases 71 | VerifyValue(0, f, g, fname, gname); 72 | VerifyValue(std::numeric_limits::min(), f, g, fname, gname); 73 | VerifyValue(std::numeric_limits::max(), f, g, fname, gname); 74 | 75 | // 2^n - 1, 2^n, 10^n - 1, 10^n until overflow 76 | for (uint32_t power = 2; power <= 10; power += 8) { 77 | T i = 1, last; 78 | do { 79 | VerifyValue(i - 1, f, g, fname, gname); 80 | VerifyValue(i, f, g, fname, gname); 81 | if (std::numeric_limits::min() < 0) { 82 | VerifyValue(Traits::Negate(i), f, g, fname, gname); 83 | VerifyValue(Traits::Negate(i + 1), f, g, fname, gname); 84 | } 85 | last = i; 86 | i *= power; 87 | } while (last < i); 88 | } 89 | 90 | printf("OK\n"); 91 | } 92 | 93 | void VerifyAll() { 94 | const TestList& tests = TestManager::Instance().GetTests(); 95 | 96 | // Find naive for verification 97 | const Test* naive = 0; 98 | for (TestList::const_iterator itr = tests.begin(); itr != tests.end(); ++itr) 99 | if (strcmp((*itr)->fname, "naive") == 0) { 100 | naive = *itr; 101 | break; 102 | } 103 | 104 | assert(naive != 0); 105 | 106 | for (TestList::const_iterator itr = tests.begin(); itr != tests.end(); ++itr) { 107 | if (strcmp((*itr)->fname, "null") != 0) { // skip null 108 | try { 109 | Verify(naive->u32toa, (*itr)->u32toa, "naive_u32toa", (*itr)->fname); 110 | Verify(naive->i32toa, (*itr)->i32toa, "naive_i32toa", (*itr)->fname); 111 | Verify(naive->u64toa, (*itr)->u64toa, "naive_u64toa", (*itr)->fname); 112 | Verify(naive->i64toa, (*itr)->i64toa, "naive_i64toa", (*itr)->fname); 113 | } 114 | catch (...) { 115 | } 116 | } 117 | } 118 | } 119 | 120 | template 121 | void BenchSequential(void(*f)(T, char*), const char* type, const char* fname, FILE* fp) { 122 | printf("Benchmarking sequential %-20s ... 
", fname); 123 | 124 | char buffer[Traits::kBufferSize]; 125 | double minDuration = std::numeric_limits::max(); 126 | double maxDuration = 0.0; 127 | 128 | T start = 1; 129 | for (int digit = 1; digit <= Traits::kMaxDigit; digit++) { 130 | T end = (digit == Traits::kMaxDigit) ? std::numeric_limits::max() : start * 10; 131 | 132 | double duration = std::numeric_limits::max(); 133 | for (unsigned trial = 0; trial < kTrial; trial++) { 134 | T v = start; 135 | T sign = 1; 136 | Timer timer; 137 | timer.Start(); 138 | for (unsigned iteration = 0; iteration < kIterationPerDigit; iteration++) { 139 | f(v * sign, buffer); 140 | sign = Traits::Negate(sign); 141 | if (++v == end) 142 | v = start; 143 | } 144 | timer.Stop(); 145 | duration = std::min(duration, timer.GetElapsedMilliseconds()); 146 | } 147 | 148 | duration *= 1e6 / kIterationPerDigit; // convert to nano second per operation 149 | 150 | minDuration = std::min(minDuration, duration); 151 | maxDuration = std::max(maxDuration, duration); 152 | fprintf(fp, "%s_sequential,%s,%d,%f\n", type, fname, digit, duration); 153 | start = end; 154 | } 155 | 156 | printf("[%8.3fns, %8.3fns]\n", minDuration, maxDuration); 157 | } 158 | 159 | template 160 | class RandomData { 161 | public: 162 | static T* GetData() { 163 | static RandomData singleton; 164 | return singleton.mData; 165 | } 166 | 167 | static const size_t kCountPerDigit = 1000; 168 | static const size_t kCount = kCountPerDigit * Traits::kMaxDigit; 169 | 170 | private: 171 | RandomData() : 172 | mData(new T[kCount]) 173 | { 174 | T* p = mData; 175 | T start = 1; 176 | for (int digit = 1; digit <= Traits::kMaxDigit; digit++) { 177 | T end = (digit == Traits::kMaxDigit) ? 
std::numeric_limits::max() : start * 10; 178 | T v = start; 179 | T sign = 1; 180 | for (size_t i = 0; i < kCountPerDigit; i++) { 181 | *p++ = v * sign; 182 | sign = Traits::Negate(sign); 183 | if (++v == end) 184 | v = start; 185 | } 186 | start = end; 187 | } 188 | std::mt19937 gen; 189 | std::shuffle(mData, mData + kCount, gen); 190 | } 191 | 192 | ~RandomData() { 193 | delete[] mData; 194 | } 195 | 196 | T* mData; 197 | }; 198 | 199 | template 200 | void BenchRandom(void(*f)(T, char*), const char* type, const char* fname, FILE* fp) { 201 | printf("Benchmarking random %-20s ... ", fname); 202 | 203 | char buffer[Traits::kBufferSize]; 204 | T* data = RandomData::GetData(); 205 | size_t n = RandomData::kCount; 206 | 207 | double duration = std::numeric_limits::max(); 208 | for (unsigned trial = 0; trial < kTrial; trial++) { 209 | Timer timer; 210 | timer.Start(); 211 | 212 | for (unsigned iteration = 0; iteration < kIterationForRandom; iteration++) 213 | for (size_t i = 0; i < n; i++) 214 | f(data[i], buffer); 215 | 216 | timer.Stop(); 217 | duration = std::min(duration, timer.GetElapsedMilliseconds()); 218 | } 219 | duration *= 1e6 / (kIterationForRandom * n); // convert to nano second per operation 220 | fprintf(fp, "%s_random,%s,0,%f\n", type, fname, duration); 221 | 222 | printf("%8.3fns\n", duration); 223 | } 224 | 225 | template 226 | void Bench(void(*f)(T, char*), const char* type, const char* fname, FILE* fp) { 227 | BenchSequential(f, type, fname, fp); 228 | BenchRandom(f, type, fname, fp); 229 | } 230 | 231 | 232 | void BenchAll() { 233 | // Try to write to /result path, where template.php exists 234 | FILE *fp; 235 | if ((fp = fopen("../../result/template.php", "r")) != NULL) { 236 | fclose(fp); 237 | fp = fopen("../../result/" RESULT_FILENAME, "w"); 238 | } 239 | else if ((fp = fopen("../result/template.php", "r")) != NULL) { 240 | fclose(fp); 241 | fp = fopen("../result/" RESULT_FILENAME, "w"); 242 | } 243 | else 244 | fp = fopen(RESULT_FILENAME, 
"w"); 245 | 246 | fprintf(fp, "Type,Function,Digit,Time(ns)\n"); 247 | 248 | const TestList& tests = TestManager::Instance().GetTests(); 249 | 250 | puts("u32toa"); 251 | for (TestList::const_iterator itr = tests.begin(); itr != tests.end(); ++itr) 252 | Bench((*itr)->u32toa, "u32toa", (*itr)->fname, fp); 253 | 254 | puts(""); 255 | puts("i32toa"); 256 | for (TestList::const_iterator itr = tests.begin(); itr != tests.end(); ++itr) 257 | Bench((*itr)->i32toa, "i32toa", (*itr)->fname, fp); 258 | 259 | puts(""); 260 | puts("u64toa"); 261 | for (TestList::const_iterator itr = tests.begin(); itr != tests.end(); ++itr) 262 | Bench((*itr)->u64toa, "u64toa", (*itr)->fname, fp); 263 | 264 | puts(""); 265 | puts("i64toa"); 266 | for (TestList::const_iterator itr = tests.begin(); itr != tests.end(); ++itr) 267 | Bench((*itr)->i64toa, "i64toa", (*itr)->fname, fp); 268 | 269 | fclose(fp); 270 | } 271 | 272 | int main() { 273 | // sort tests 274 | TestList& tests = TestManager::Instance().GetTests(); 275 | std::sort(tests.begin(), tests.end(), 276 | [](const Test* a, const Test* b) { 277 | return std::string{a->fname} < std::string{b->fname}; 278 | }); 279 | 280 | VerifyAll(); 281 | BenchAll(); 282 | } 283 | -------------------------------------------------------------------------------- /src/itoa-benchmark/msinttypes/stdint.h: -------------------------------------------------------------------------------- 1 | // ISO C9x compliant stdint.h for Microsoft Visual Studio 2 | // Based on ISO/IEC 9899:TC2 Committee draft (May 6, 2005) WG14/N1124 3 | // 4 | // Copyright (c) 2006-2013 Alexander Chemeris 5 | // 6 | // Redistribution and use in source and binary forms, with or without 7 | // modification, are permitted provided that the following conditions are met: 8 | // 9 | // 1. Redistributions of source code must retain the above copyright notice, 10 | // this list of conditions and the following disclaimer. 11 | // 12 | // 2. 
Redistributions in binary form must reproduce the above copyright 13 | // notice, this list of conditions and the following disclaimer in the 14 | // documentation and/or other materials provided with the distribution. 15 | // 16 | // 3. Neither the name of the product nor the names of its contributors may 17 | // be used to endorse or promote products derived from this software 18 | // without specific prior written permission. 19 | // 20 | // THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED 21 | // WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF 22 | // MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO 23 | // EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 24 | // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 25 | // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; 26 | // OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, 27 | // WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR 28 | // OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF 29 | // ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 | // 31 | /////////////////////////////////////////////////////////////////////////////// 32 | 33 | #ifndef _MSC_VER // [ 34 | #error "Use this header only with Microsoft Visual C++ compilers!" 35 | #endif // _MSC_VER ] 36 | 37 | #ifndef _MSC_STDINT_H_ // [ 38 | #define _MSC_STDINT_H_ 39 | 40 | #if _MSC_VER > 1000 41 | #pragma once 42 | #endif 43 | 44 | // miloyip: Originally Visual Studio 2010 uses its own stdint.h. However it generates warning with INT64_C(), so change to use this file for vs2010. 
45 | #if _MSC_VER >= 1600 // [ 46 | #include 47 | 48 | #if !defined(__cplusplus) || defined(__STDC_CONSTANT_MACROS) // [ See footnote 224 at page 260 49 | 50 | #undef INT8_C 51 | #undef INT16_C 52 | #undef INT32_C 53 | #undef INT64_C 54 | #undef UINT8_C 55 | #undef UINT16_C 56 | #undef UINT32_C 57 | #undef UINT64_C 58 | 59 | // 7.18.4.1 Macros for minimum-width integer constants 60 | 61 | #define INT8_C(val) val##i8 62 | #define INT16_C(val) val##i16 63 | #define INT32_C(val) val##i32 64 | #define INT64_C(val) val##i64 65 | 66 | #define UINT8_C(val) val##ui8 67 | #define UINT16_C(val) val##ui16 68 | #define UINT32_C(val) val##ui32 69 | #define UINT64_C(val) val##ui64 70 | 71 | // 7.18.4.2 Macros for greatest-width integer constants 72 | // These #ifndef's are needed to prevent collisions with . 73 | // Check out Issue 9 for the details. 74 | #ifndef INTMAX_C // [ 75 | # define INTMAX_C INT64_C 76 | #endif // INTMAX_C ] 77 | #ifndef UINTMAX_C // [ 78 | # define UINTMAX_C UINT64_C 79 | #endif // UINTMAX_C ] 80 | 81 | #endif // __STDC_CONSTANT_MACROS ] 82 | 83 | #else // ] _MSC_VER >= 1700 [ 84 | 85 | #include 86 | 87 | // For Visual Studio 6 in C++ mode and for many Visual Studio versions when 88 | // compiling for ARM we should wrap include with 'extern "C++" {}' 89 | // or compiler give many errors like this: 90 | // error C2733: second C linkage of overloaded function 'wmemchr' not allowed 91 | #ifdef __cplusplus 92 | extern "C" { 93 | #endif 94 | # include 95 | #ifdef __cplusplus 96 | } 97 | #endif 98 | 99 | // Define _W64 macros to mark types changing their size, like intptr_t. 100 | #ifndef _W64 101 | # if !defined(__midl) && (defined(_X86_) || defined(_M_IX86)) && _MSC_VER >= 1300 102 | # define _W64 __w64 103 | # else 104 | # define _W64 105 | # endif 106 | #endif 107 | 108 | 109 | // 7.18.1 Integer types 110 | 111 | // 7.18.1.1 Exact-width integer types 112 | 113 | // Visual Studio 6 and Embedded Visual C++ 4 doesn't 114 | // realize that, e.g. 
char has the same size as __int8 115 | // so we give up on __intX for them. 116 | #if (_MSC_VER < 1300) 117 | typedef signed char int8_t; 118 | typedef signed short int16_t; 119 | typedef signed int int32_t; 120 | typedef unsigned char uint8_t; 121 | typedef unsigned short uint16_t; 122 | typedef unsigned int uint32_t; 123 | #else 124 | typedef signed __int8 int8_t; 125 | typedef signed __int16 int16_t; 126 | typedef signed __int32 int32_t; 127 | typedef unsigned __int8 uint8_t; 128 | typedef unsigned __int16 uint16_t; 129 | typedef unsigned __int32 uint32_t; 130 | #endif 131 | typedef signed __int64 int64_t; 132 | typedef unsigned __int64 uint64_t; 133 | 134 | 135 | // 7.18.1.2 Minimum-width integer types 136 | typedef int8_t int_least8_t; 137 | typedef int16_t int_least16_t; 138 | typedef int32_t int_least32_t; 139 | typedef int64_t int_least64_t; 140 | typedef uint8_t uint_least8_t; 141 | typedef uint16_t uint_least16_t; 142 | typedef uint32_t uint_least32_t; 143 | typedef uint64_t uint_least64_t; 144 | 145 | // 7.18.1.3 Fastest minimum-width integer types 146 | typedef int8_t int_fast8_t; 147 | typedef int16_t int_fast16_t; 148 | typedef int32_t int_fast32_t; 149 | typedef int64_t int_fast64_t; 150 | typedef uint8_t uint_fast8_t; 151 | typedef uint16_t uint_fast16_t; 152 | typedef uint32_t uint_fast32_t; 153 | typedef uint64_t uint_fast64_t; 154 | 155 | // 7.18.1.4 Integer types capable of holding object pointers 156 | #ifdef _WIN64 // [ 157 | typedef signed __int64 intptr_t; 158 | typedef unsigned __int64 uintptr_t; 159 | #else // _WIN64 ][ 160 | typedef _W64 signed int intptr_t; 161 | typedef _W64 unsigned int uintptr_t; 162 | #endif // _WIN64 ] 163 | 164 | // 7.18.1.5 Greatest-width integer types 165 | typedef int64_t intmax_t; 166 | typedef uint64_t uintmax_t; 167 | 168 | 169 | // 7.18.2 Limits of specified-width integer types 170 | 171 | #if !defined(__cplusplus) || defined(__STDC_LIMIT_MACROS) // [ See footnote 220 at page 257 and footnote 221 at page 259 
172 | 173 | // 7.18.2.1 Limits of exact-width integer types 174 | #define INT8_MIN ((int8_t)_I8_MIN) 175 | #define INT8_MAX _I8_MAX 176 | #define INT16_MIN ((int16_t)_I16_MIN) 177 | #define INT16_MAX _I16_MAX 178 | #define INT32_MIN ((int32_t)_I32_MIN) 179 | #define INT32_MAX _I32_MAX 180 | #define INT64_MIN ((int64_t)_I64_MIN) 181 | #define INT64_MAX _I64_MAX 182 | #define UINT8_MAX _UI8_MAX 183 | #define UINT16_MAX _UI16_MAX 184 | #define UINT32_MAX _UI32_MAX 185 | #define UINT64_MAX _UI64_MAX 186 | 187 | // 7.18.2.2 Limits of minimum-width integer types 188 | #define INT_LEAST8_MIN INT8_MIN 189 | #define INT_LEAST8_MAX INT8_MAX 190 | #define INT_LEAST16_MIN INT16_MIN 191 | #define INT_LEAST16_MAX INT16_MAX 192 | #define INT_LEAST32_MIN INT32_MIN 193 | #define INT_LEAST32_MAX INT32_MAX 194 | #define INT_LEAST64_MIN INT64_MIN 195 | #define INT_LEAST64_MAX INT64_MAX 196 | #define UINT_LEAST8_MAX UINT8_MAX 197 | #define UINT_LEAST16_MAX UINT16_MAX 198 | #define UINT_LEAST32_MAX UINT32_MAX 199 | #define UINT_LEAST64_MAX UINT64_MAX 200 | 201 | // 7.18.2.3 Limits of fastest minimum-width integer types 202 | #define INT_FAST8_MIN INT8_MIN 203 | #define INT_FAST8_MAX INT8_MAX 204 | #define INT_FAST16_MIN INT16_MIN 205 | #define INT_FAST16_MAX INT16_MAX 206 | #define INT_FAST32_MIN INT32_MIN 207 | #define INT_FAST32_MAX INT32_MAX 208 | #define INT_FAST64_MIN INT64_MIN 209 | #define INT_FAST64_MAX INT64_MAX 210 | #define UINT_FAST8_MAX UINT8_MAX 211 | #define UINT_FAST16_MAX UINT16_MAX 212 | #define UINT_FAST32_MAX UINT32_MAX 213 | #define UINT_FAST64_MAX UINT64_MAX 214 | 215 | // 7.18.2.4 Limits of integer types capable of holding object pointers 216 | #ifdef _WIN64 // [ 217 | # define INTPTR_MIN INT64_MIN 218 | # define INTPTR_MAX INT64_MAX 219 | # define UINTPTR_MAX UINT64_MAX 220 | #else // _WIN64 ][ 221 | # define INTPTR_MIN INT32_MIN 222 | # define INTPTR_MAX INT32_MAX 223 | # define UINTPTR_MAX UINT32_MAX 224 | #endif // _WIN64 ] 225 | 226 | // 7.18.2.5 Limits of 
greatest-width integer types 227 | #define INTMAX_MIN INT64_MIN 228 | #define INTMAX_MAX INT64_MAX 229 | #define UINTMAX_MAX UINT64_MAX 230 | 231 | // 7.18.3 Limits of other integer types 232 | 233 | #ifdef _WIN64 // [ 234 | # define PTRDIFF_MIN _I64_MIN 235 | # define PTRDIFF_MAX _I64_MAX 236 | #else // _WIN64 ][ 237 | # define PTRDIFF_MIN _I32_MIN 238 | # define PTRDIFF_MAX _I32_MAX 239 | #endif // _WIN64 ] 240 | 241 | #define SIG_ATOMIC_MIN INT_MIN 242 | #define SIG_ATOMIC_MAX INT_MAX 243 | 244 | #ifndef SIZE_MAX // [ 245 | # ifdef _WIN64 // [ 246 | # define SIZE_MAX _UI64_MAX 247 | # else // _WIN64 ][ 248 | # define SIZE_MAX _UI32_MAX 249 | # endif // _WIN64 ] 250 | #endif // SIZE_MAX ] 251 | 252 | // WCHAR_MIN and WCHAR_MAX are also defined in 253 | #ifndef WCHAR_MIN // [ 254 | # define WCHAR_MIN 0 255 | #endif // WCHAR_MIN ] 256 | #ifndef WCHAR_MAX // [ 257 | # define WCHAR_MAX _UI16_MAX 258 | #endif // WCHAR_MAX ] 259 | 260 | #define WINT_MIN 0 261 | #define WINT_MAX _UI16_MAX 262 | 263 | #endif // __STDC_LIMIT_MACROS ] 264 | 265 | 266 | // 7.18.4 Limits of other integer types 267 | 268 | #if !defined(__cplusplus) || defined(__STDC_CONSTANT_MACROS) // [ See footnote 224 at page 260 269 | 270 | // 7.18.4.1 Macros for minimum-width integer constants 271 | 272 | #define INT8_C(val) val##i8 273 | #define INT16_C(val) val##i16 274 | #define INT32_C(val) val##i32 275 | #define INT64_C(val) val##i64 276 | 277 | #define UINT8_C(val) val##ui8 278 | #define UINT16_C(val) val##ui16 279 | #define UINT32_C(val) val##ui32 280 | #define UINT64_C(val) val##ui64 281 | 282 | // 7.18.4.2 Macros for greatest-width integer constants 283 | // These #ifndef's are needed to prevent collisions with . 284 | // Check out Issue 9 for the details. 
285 | #ifndef INTMAX_C // [ 286 | # define INTMAX_C INT64_C 287 | #endif // INTMAX_C ] 288 | #ifndef UINTMAX_C // [ 289 | # define UINTMAX_C UINT64_C 290 | #endif // UINTMAX_C ] 291 | 292 | #endif // __STDC_CONSTANT_MACROS ] 293 | 294 | #endif // _MSC_VER >= 1600 ] 295 | 296 | #endif // _MSC_STDINT_H_ ] 297 | -------------------------------------------------------------------------------- /src/itoa-benchmark/unrolledlut.cpp: -------------------------------------------------------------------------------- 1 | // unrolledlut.cpp: Fast integer to string conversion by using per-digit-count unrolling and a lookuptable 2 | // 3 | // ===-------- DESCRIPTION --------=== 4 | // 5 | // Very fast implementation of uint32_t to string: 6 | // - Automatically takes advantage of two-byte load/store on 7 | // architectures that support it (memcpy will be optimized). 8 | // - Avoids as many jumps as possible, by unrolling the whole thing for every digit count. 9 | // - Con: Costs some memory for the duplicated instructions of all branches 10 | // 11 | // Further optimization possible: 12 | // - You may reorder the digit-cases, so that the most 13 | // commonly used cases come first. Currently digit-counts 14 | // from 7 to 10 are processed first, as they cover ~99.7% of all uint32_t values. 15 | // By reordering these for your specific needs, you can save one or two extra instructions for these cases. 
16 | // 17 | // ===-------- LICENSE --------=== 18 | // 19 | // The MIT License (MIT) 20 | // 21 | // Copyright (c) 2017 nyronium (nyronium@genthree.io) 22 | // 23 | // Permission is hereby granted, free of charge, to any person obtaining a copy 24 | // of this software and associated documentation files (the "Software"), to deal 25 | // in the Software without restriction, including without limitation the rights 26 | // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 27 | // copies of the Software, and to permit persons to whom the Software is 28 | // furnished to do so, subject to the following conditions: 29 | // 30 | // The above copyright notice and this permission notice shall be included in all 31 | // copies or substantial portions of the Software. 32 | // 33 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 34 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 35 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 36 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 37 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 38 | // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 39 | // SOFTWARE. 
40 | 41 | #include 42 | #include "test.h" 43 | 44 | static const char TWO_DIGITS_TO_STR[201] = 45 | "0001020304050607080910111213141516171819" 46 | "2021222324252627282930313233343536373839" 47 | "4041424344454647484950515253545556575859" 48 | "6061626364656667686970717273747576777879" 49 | "8081828384858687888990919293949596979899"; 50 | 51 | 52 | #define COPY_2_DIGITS(out, value) \ 53 | memcpy(out, &reinterpret_cast(TWO_DIGITS_TO_STR)[value], 2); \ 54 | out += 2; 55 | 56 | #define COPY_1_DIGIT(out, value) \ 57 | *out++ = '0' + value; 58 | 59 | 60 | #define UNROLL_EXACT_DIGITS_8(out, value) { \ 61 | uint32_t digits; \ 62 | digits = value / 1000000; COPY_2_DIGITS(out, digits); \ 63 | value -= digits * 1000000; \ 64 | digits = value / 10000; COPY_2_DIGITS(out, digits); \ 65 | value -= digits * 10000; \ 66 | digits = value / 100; COPY_2_DIGITS(out, digits); \ 67 | value -= digits * 100; \ 68 | COPY_2_DIGITS(out, value); \ 69 | *out = '\0'; \ 70 | } 71 | 72 | #define UNROLL_REMAINING_DIGITS_8(out, value, digits) { \ 73 | value -= digits * 100000000; \ 74 | digits = value / 1000000; COPY_2_DIGITS(out, digits); \ 75 | value -= digits * 1000000; \ 76 | digits = value / 10000; COPY_2_DIGITS(out, digits); \ 77 | value -= digits * 10000; \ 78 | digits = value / 100; COPY_2_DIGITS(out, digits); \ 79 | value -= digits * 100; \ 80 | COPY_2_DIGITS(out, value); \ 81 | *out = '\0'; return out; \ 82 | } 83 | 84 | #define UNROLL_REMAINING_DIGITS_6(out, value, digits) { \ 85 | value -= digits * 1000000; \ 86 | digits = value / 10000; COPY_2_DIGITS(out, digits); \ 87 | value -= digits * 10000; \ 88 | digits = value / 100; COPY_2_DIGITS(out, digits); \ 89 | value -= digits * 100; \ 90 | COPY_2_DIGITS(out, value); \ 91 | *out = '\0'; return out; \ 92 | } 93 | 94 | #define UNROLL_REMAINING_DIGITS_4(out, value, digits) { \ 95 | value -= digits * 10000; \ 96 | digits = value / 100; COPY_2_DIGITS(out, digits); \ 97 | value -= digits * 100; \ 98 | COPY_2_DIGITS(out, value); \ 99 | *out = 
'\0'; return out; \ 100 | } 101 | 102 | #define UNROLL_REMAINING_DIGITS_2(out, value, digits) { \ 103 | value -= digits * 100; \ 104 | COPY_2_DIGITS(out, value); \ 105 | *out = '\0'; return out; \ 106 | } 107 | 108 | #define UNROLL_REMAINING_DIGITS_0(out, value) { \ 109 | *out = '\0'; return out; \ 110 | } 111 | 112 | 113 | #define UNROLL_DIGIT_PAIR_9_10(out, value) { \ 114 | uint32_t digits; \ 115 | if (value >= 1000000000) { \ 116 | digits = value / 100000000; COPY_2_DIGITS(out, digits); \ 117 | UNROLL_REMAINING_DIGITS_8(out, value, digits); \ 118 | } else { \ 119 | digits = value / 100000000; COPY_1_DIGIT(out, digits); \ 120 | UNROLL_REMAINING_DIGITS_8(out, value, digits); \ 121 | } \ 122 | } 123 | 124 | #define UNROLL_DIGIT_PAIR_7_8(out, value) { \ 125 | uint32_t digits; \ 126 | if (value >= 10000000) { \ 127 | digits = value / 1000000; COPY_2_DIGITS(out, digits); \ 128 | UNROLL_REMAINING_DIGITS_6(out, value, digits); \ 129 | } else { \ 130 | digits = value / 1000000; COPY_1_DIGIT(out, digits); \ 131 | UNROLL_REMAINING_DIGITS_6(out, value, digits); \ 132 | } \ 133 | } 134 | 135 | #define UNROLL_DIGIT_PAIR_5_6(out, value) { \ 136 | uint32_t digits; \ 137 | if (value >= 100000) { \ 138 | digits = value / 10000; COPY_2_DIGITS(out, digits); \ 139 | UNROLL_REMAINING_DIGITS_4(out, value, digits); \ 140 | } else { \ 141 | digits = value / 10000; COPY_1_DIGIT(out, digits); \ 142 | UNROLL_REMAINING_DIGITS_4(out, value, digits); \ 143 | } \ 144 | } 145 | 146 | #define UNROLL_DIGIT_PAIR_3_4(out, value) { \ 147 | uint32_t digits; \ 148 | if (value >= 1000) { \ 149 | digits = value / 100; COPY_2_DIGITS(out, digits); \ 150 | UNROLL_REMAINING_DIGITS_2(out, value, digits); \ 151 | } else { \ 152 | digits = value / 100; COPY_1_DIGIT(out, digits); \ 153 | UNROLL_REMAINING_DIGITS_2(out, value, digits); \ 154 | } \ 155 | } 156 | 157 | #define UNROLL_DIGIT_PAIR_1_2(out, value) { \ 158 | if (value >= 10) { \ 159 | COPY_2_DIGITS(out, value); \ 160 | UNROLL_REMAINING_DIGITS_0(out, 
value); \ 161 | } else { \ 162 | COPY_1_DIGIT(out, value); \ 163 | UNROLL_REMAINING_DIGITS_0(out, value); \ 164 | } \ 165 | } 166 | 167 | inline char* unrolledlut(uint32_t value, char* out) { 168 | if (value >= 100000000) { 169 | UNROLL_DIGIT_PAIR_9_10(out, value); 170 | } else if (value >= 1000000) { 171 | UNROLL_DIGIT_PAIR_7_8(out, value); 172 | } else if (value < 100) { 173 | UNROLL_DIGIT_PAIR_1_2(out, value); 174 | } else if (value < 10000) { 175 | UNROLL_DIGIT_PAIR_3_4(out, value); 176 | } else { /* (value < 1000000) */ 177 | UNROLL_DIGIT_PAIR_5_6(out, value); 178 | } 179 | } 180 | 181 | char* unrolledlut64(uint64_t value, char* buffer) { 182 | uint32_t least_significant = static_cast(value); 183 | if (least_significant == value) { 184 | return unrolledlut(least_significant, buffer); 185 | } 186 | 187 | uint64_t high12 = value / 100000000; 188 | 189 | /* optimized unrolled recursion */ 190 | least_significant = static_cast(high12); 191 | if (least_significant == high12) { 192 | buffer = unrolledlut(least_significant, buffer); 193 | } else { 194 | uint64_t high4 = high12 / 100000000; 195 | buffer = unrolledlut(high4, buffer); 196 | 197 | uint32_t digits_15_8 = high12 - (high4 * 100000000); 198 | UNROLL_EXACT_DIGITS_8(buffer, digits_15_8); 199 | } 200 | 201 | uint32_t digits_7_0 = value - (high12 * 100000000); 202 | UNROLL_EXACT_DIGITS_8(buffer, digits_7_0); 203 | return buffer; 204 | } 205 | 206 | #undef UNROLL_DIGIT_PAIR_1_2 207 | #undef UNROLL_DIGIT_PAIR_3_4 208 | #undef UNROLL_DIGIT_PAIR_5_6 209 | #undef UNROLL_DIGIT_PAIR_7_8 210 | #undef UNROLL_DIGIT_PAIR_9_10 211 | 212 | #undef UNROLL_REMAINING_DIGITS_0 213 | #undef UNROLL_REMAINING_DIGITS_2 214 | #undef UNROLL_REMAINING_DIGITS_4 215 | #undef UNROLL_REMAINING_DIGITS_6 216 | #undef UNROLL_REMAINING_DIGITS_8 217 | #undef UNROLL_EXACT_DIGITS_8 218 | 219 | #undef COPY_1_DIGIT 220 | #undef COPY_2_DIGITS 221 | 222 | 223 | void u32toa_unrolledlut(uint32_t value, char* buffer) { 224 | unrolledlut(value, buffer); 
225 | } 226 | 227 | void i32toa_unrolledlut(int32_t value, char* buffer) { 228 | uint32_t uvalue = static_cast(value); 229 | if (value < 0) { 230 | *buffer++ = '-'; 231 | uvalue = -uvalue; 232 | } 233 | 234 | unrolledlut(uvalue, buffer); 235 | } 236 | 237 | void u64toa_unrolledlut(uint64_t value, char* buffer) { 238 | unrolledlut64(value, buffer); 239 | } 240 | 241 | void i64toa_unrolledlut(int64_t value, char* buffer) { 242 | uint64_t uvalue = static_cast(value); 243 | if (value < 0) { 244 | *buffer++ = '-'; 245 | uvalue = -uvalue; 246 | } 247 | 248 | unrolledlut64(uvalue, buffer); 249 | } 250 | 251 | 252 | REGISTER_TEST(unrolledlut); 253 | -------------------------------------------------------------------------------- /src/int-benchmark.cc: -------------------------------------------------------------------------------- 1 | // A decimal integer to string conversion benchmark 2 | // 3 | // Copyright (c) 2019 - present, Victor Zverovich 4 | // All rights reserved. 5 | 6 | #include 7 | #include 8 | 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | 20 | #if __has_include() 21 | # include 22 | # include 23 | # include 24 | # define HAVE_BOOST 25 | #endif 26 | 27 | #include "itostr.cc" 28 | #include "u2985907.h" 29 | 30 | // Integer to string converter by Alf P. Steinbach modified to return a pointer 31 | // past the end of the output to avoid calling strlen. 
namespace cppx {
// Writes the decimal digits of `number` to `buffer`, appends a terminating
// NUL and returns a pointer to that NUL. Digits are produced least
// significant first and then reversed in place.
inline auto unsigned_to_decimal(unsigned long number, char* buffer) {
  if (number == 0) {
    *buffer++ = '0';
  } else {
    char* p_first = buffer;
    while (number != 0) {
      *buffer++ = static_cast<char>('0' + number % 10);
      number /= 10;
    }
    std::reverse(p_first, buffer);
  }
  *buffer = '\0';
  return buffer;
}

// Writes the decimal representation of `number` (with a leading '-' for
// negative values) to `buffer` and returns a pointer to the terminating NUL.
inline auto to_decimal(long number, char* buffer) {
  if (number < 0) {
    buffer[0] = '-';
    // Negate in unsigned arithmetic: `-number` overflows a signed long when
    // number is the most negative value.
    return unsigned_to_decimal(0UL - static_cast<unsigned long>(number),
                               buffer + 1);
  }
  return unsigned_to_decimal(static_cast<unsigned long>(number), buffer);
}

// Thin alias for to_decimal.
inline auto decimal_from(long number, char* buffer) {
  return to_decimal(number, buffer);
}
}  // namespace cppx

// Public domain ltoa by Robert B. Stout dba MicroFirm.
//
// Converts N to a NUL-terminated string in `str` using the given base and
// returns `str`. A base outside [2, 36] falls back to 10. Only base 10 emits
// a minus sign; in any other base a negative N is rendered as its unsigned
// (two's complement) bit pattern.
char* ltoa(long N, char* str, int base) {
  int i = 2;  // number of bytes to copy for the "0" case: one digit + NUL
  unsigned long uarg;
  constexpr auto BUFSIZE = (sizeof(long) * 8 + 1);
  char *tail, *head = str, buf[BUFSIZE];

  if (36 < base || 2 > base) base = 10; /* can only use 0-9, A-Z */
  tail = &buf[BUFSIZE - 1];             /* last character position */
  *tail-- = '\0';

  if (10 == base && N < 0L) {
    *head++ = '-';
    // Unsigned negation is well defined even for the most negative long.
    uarg = 0UL - static_cast<unsigned long>(N);
  } else {
    uarg = static_cast<unsigned long>(N);
  }

  if (uarg) {
    const unsigned long ubase = static_cast<unsigned long>(base);
    // Digits are written right to left; i ends up as digit count + 1 (NUL).
    for (i = 1; uarg; ++i) {
      const unsigned long rem = uarg % ubase;
      *tail-- = (char)(rem + ((9UL < rem) ? ('A' - 10) : '0'));
      uarg /= ubase;
    }
  } else {
    *tail-- = '0';
  }

  memcpy(head, ++tail, i);
  return str;
}

// Computes a digest of data. It is used both to prevent compiler from
// optimizing away the benchmarked code and to verify that the results are
// correct. The overhead is less than 2.5% compared to just DoNotOptimize.
97 | FMT_INLINE unsigned compute_digest(fmt::string_view data) { 98 | unsigned digest = 0; 99 | for (char c : data) digest += c; 100 | return digest; 101 | } 102 | 103 | struct Data { 104 | std::vector values; 105 | unsigned digest; 106 | 107 | auto begin() const { return values.begin(); } 108 | auto end() const { return values.end(); } 109 | 110 | // Prints the number of values by digit count, e.g. 111 | // 1 27263 112 | // 2 247132 113 | // 3 450601 114 | // 4 246986 115 | // 5 25188 116 | // 6 2537 117 | // 7 251 118 | // 8 39 119 | // 9 2 120 | // 10 1 121 | void print_digit_counts() const { 122 | int counts[11] = {}; 123 | for (auto value : values) ++counts[fmt::format_int(value).size()]; 124 | fmt::print("The number of values by digit count:\n"); 125 | for (int i = 1; i < 11; ++i) fmt::print("{:2} {:6}\n", i, counts[i]); 126 | } 127 | 128 | Data() : values(1'000'000) { 129 | // Similar data as in Boost Karma int generator test: 130 | // https://www.boost.org/doc/libs/1_63_0/libs/spirit/workbench/karma/int_generator.cpp 131 | // with rand replaced by uniform_int_distribution for consistent results 132 | // across platforms. 
133 | std::mt19937 gen; 134 | std::uniform_int_distribution dist( 135 | 0, (std::numeric_limits::max)()); 136 | std::generate(values.begin(), values.end(), [&]() { 137 | int scale = dist(gen) / 100 + 1; 138 | return static_cast(dist(gen) * dist(gen)) / scale; 139 | }); 140 | digest = 141 | std::accumulate(begin(), end(), unsigned(), [](unsigned lhs, int rhs) { 142 | char buffer[12]; 143 | unsigned size = std::sprintf(buffer, "%d", rhs); 144 | return lhs + compute_digest({buffer, size}); 145 | }); 146 | print_digit_counts(); 147 | } 148 | } data; 149 | 150 | struct DigestChecker { 151 | benchmark::State& state; 152 | unsigned digest = 0; 153 | 154 | explicit DigestChecker(benchmark::State& s) : state(s) {} 155 | 156 | ~DigestChecker() noexcept(false) { 157 | if (digest != static_cast(state.iterations()) * data.digest) 158 | throw std::logic_error("invalid length"); 159 | state.SetItemsProcessed(state.iterations() * data.values.size()); 160 | benchmark::DoNotOptimize(digest); 161 | } 162 | 163 | FMT_INLINE void add(fmt::string_view s) { digest += compute_digest(s); } 164 | }; 165 | 166 | void sprintf(benchmark::State& state) { 167 | auto dc = DigestChecker(state); 168 | for (auto s : state) { 169 | for (auto value : data) { 170 | char buffer[12]; 171 | unsigned size = std::sprintf(buffer, "%d", value); 172 | dc.add({buffer, size}); 173 | } 174 | } 175 | } 176 | BENCHMARK(sprintf); 177 | 178 | void std_ostringstream(benchmark::State& state) { 179 | auto dc = DigestChecker(state); 180 | std::ostringstream os; 181 | for (auto s : state) { 182 | for (auto value : data) { 183 | os.str(std::string()); 184 | os << value; 185 | std::string s = os.str(); 186 | dc.add(s); 187 | } 188 | } 189 | } 190 | BENCHMARK(std_ostringstream); 191 | 192 | void std_to_string(benchmark::State& state) { 193 | auto dc = DigestChecker(state); 194 | for (auto s : state) { 195 | for (auto value : data) { 196 | std::string s = std::to_string(value); 197 | dc.add(s); 198 | } 199 | } 200 | } 201 | 
BENCHMARK(std_to_string); 202 | 203 | void std_to_chars(benchmark::State& state) { 204 | auto dc = DigestChecker(state); 205 | for (auto s : state) { 206 | for (auto value : data) { 207 | char buffer[12]; 208 | auto res = std::to_chars(buffer, buffer + sizeof(buffer), value); 209 | unsigned size = res.ptr - buffer; 210 | dc.add({buffer, size}); 211 | } 212 | } 213 | } 214 | BENCHMARK(std_to_chars); 215 | 216 | void fmt_to_string(benchmark::State& state) { 217 | auto dc = DigestChecker(state); 218 | for (auto s : state) { 219 | for (auto value : data) { 220 | std::string s = fmt::to_string(value); 221 | dc.add(s); 222 | } 223 | } 224 | } 225 | BENCHMARK(fmt_to_string); 226 | 227 | void fmt_format_runtime(benchmark::State& state) { 228 | auto dc = DigestChecker(state); 229 | for (auto s : state) { 230 | for (auto value : data) { 231 | std::string s = fmt::format("{}", value); 232 | dc.add(s); 233 | } 234 | } 235 | } 236 | BENCHMARK(fmt_format_runtime); 237 | 238 | void fmt_format_compile(benchmark::State& state) { 239 | auto dc = DigestChecker(state); 240 | for (auto s : state) { 241 | for (auto value : data) { 242 | std::string s = fmt::format(FMT_COMPILE("{}"), value); 243 | dc.add(s); 244 | } 245 | } 246 | } 247 | BENCHMARK(fmt_format_compile); 248 | 249 | void fmt_format_to_runtime(benchmark::State& state) { 250 | auto dc = DigestChecker(state); 251 | for (auto s : state) { 252 | for (auto value : data) { 253 | char buffer[12]; 254 | auto end = fmt::format_to(buffer, "{}", value); 255 | unsigned size = end - buffer; 256 | dc.add({buffer, size}); 257 | } 258 | } 259 | } 260 | BENCHMARK(fmt_format_to_runtime); 261 | 262 | void fmt_format_to_compile(benchmark::State& state) { 263 | auto dc = DigestChecker(state); 264 | for (auto s : state) { 265 | for (auto value : data) { 266 | char buffer[12]; 267 | auto end = fmt::format_to(buffer, FMT_COMPILE("{}"), value); 268 | unsigned size = end - buffer; 269 | dc.add({buffer, size}); 270 | } 271 | } 272 | } 273 | 
BENCHMARK(fmt_format_to_compile); 274 | 275 | void fmt_format_int(benchmark::State& state) { 276 | auto dc = DigestChecker(state); 277 | for (auto s : state) { 278 | for (auto value : data) { 279 | auto f = fmt::format_int(value); 280 | dc.add({f.data(), f.size()}); 281 | } 282 | } 283 | } 284 | BENCHMARK(fmt_format_int); 285 | 286 | #ifdef HAVE_BOOST 287 | void boost_lexical_cast(benchmark::State& state) { 288 | auto dc = DigestChecker(state); 289 | for (auto s : state) { 290 | for (auto value : data) { 291 | std::string s = boost::lexical_cast(value); 292 | dc.add(s); 293 | } 294 | } 295 | } 296 | BENCHMARK(boost_lexical_cast); 297 | 298 | void boost_format(benchmark::State& state) { 299 | auto dc = DigestChecker(state); 300 | boost::format fmt("%d"); 301 | for (auto s : state) { 302 | for (auto value : data) { 303 | std::string s = boost::str(fmt % value); 304 | dc.add(s); 305 | } 306 | } 307 | } 308 | BENCHMARK(boost_format); 309 | 310 | void boost_karma_generate(benchmark::State& state) { 311 | auto dc = DigestChecker(state); 312 | for (auto s : state) { 313 | for (auto value : data) { 314 | char buffer[12]; 315 | char* ptr = buffer; 316 | boost::spirit::karma::generate(ptr, boost::spirit::karma::int_, value); 317 | unsigned size = ptr - buffer; 318 | dc.add({buffer, size}); 319 | } 320 | } 321 | } 322 | BENCHMARK(boost_karma_generate); 323 | #endif 324 | 325 | void voigt_itostr(benchmark::State& state) { 326 | auto dc = DigestChecker(state); 327 | for (auto s : state) { 328 | for (auto value : data) { 329 | std::string s = itostr(value); 330 | dc.add(s); 331 | } 332 | } 333 | } 334 | BENCHMARK(voigt_itostr); 335 | 336 | void u2985907(benchmark::State& state) { 337 | auto dc = DigestChecker(state); 338 | for (auto s : state) { 339 | for (auto value : data) { 340 | char buffer[12]; 341 | unsigned size = so_u2985907::ufast_itoa10(value, buffer); 342 | dc.add({buffer, size}); 343 | } 344 | } 345 | } 346 | BENCHMARK(u2985907); 347 | 348 | void 
decimal_from(benchmark::State& state) { 349 | auto dc = DigestChecker(state); 350 | for (auto s : state) { 351 | for (auto value : data) { 352 | char buffer[12]; 353 | auto end = cppx::decimal_from(value, buffer); 354 | unsigned size = end - buffer; 355 | dc.add({buffer, size}); 356 | } 357 | } 358 | } 359 | BENCHMARK(decimal_from); 360 | 361 | void stout_ltoa(benchmark::State& state) { 362 | auto dc = DigestChecker(state); 363 | for (auto s : state) { 364 | for (auto value : data) { 365 | char buffer[12]; 366 | ltoa(value, buffer, 10); 367 | // ltoa doesn't give the size so this invokes strlen. 368 | dc.add(buffer); 369 | } 370 | } 371 | } 372 | BENCHMARK(stout_ltoa); 373 | 374 | BENCHMARK_MAIN(); 375 | -------------------------------------------------------------------------------- /src/itoa-benchmark/sse2.cpp: -------------------------------------------------------------------------------- 1 | // SSE2 implementation according to http://0x80.pl/articles/sse-itoa.html 2 | // Modifications: (1) fix incorrect digits (2) accept all ranges (3) write to user provided buffer. 
3 | 4 | #if defined(i386) || defined(__amd64) || defined(_M_IX86) || defined(_M_X64) 5 | 6 | #include 7 | #include 8 | #include 9 | #include "digitslut.h" 10 | #include "test.h" 11 | 12 | #ifdef _MSC_VER 13 | #include "intrin.h" 14 | #endif 15 | 16 | #ifdef _MSC_VER 17 | #define ALIGN_PRE __declspec(align(16)) 18 | #define ALIGN_SUF 19 | #else 20 | #define ALIGN_PRE 21 | #define ALIGN_SUF __attribute__ ((aligned(16))) 22 | #endif 23 | 24 | static const uint32_t kDiv10000 = 0xd1b71759; 25 | ALIGN_PRE static const uint32_t kDiv10000Vector[4] ALIGN_SUF = { kDiv10000, kDiv10000, kDiv10000, kDiv10000 }; 26 | ALIGN_PRE static const uint32_t k10000Vector[4] ALIGN_SUF = { 10000, 10000, 10000, 10000 }; 27 | ALIGN_PRE static const uint16_t kDivPowersVector[8] ALIGN_SUF = { 8389, 5243, 13108, 32768, 8389, 5243, 13108, 32768 }; // 10^3, 10^2, 10^1, 10^0 28 | ALIGN_PRE static const uint16_t kShiftPowersVector[8] ALIGN_SUF = { 29 | 1 << (16 - (23 + 2 - 16)), 30 | 1 << (16 - (19 + 2 - 16)), 31 | 1 << (16 - 1 - 2), 32 | 1 << (15), 33 | 1 << (16 - (23 + 2 - 16)), 34 | 1 << (16 - (19 + 2 - 16)), 35 | 1 << (16 - 1 - 2), 36 | 1 << (15) 37 | }; 38 | ALIGN_PRE static const uint16_t k10Vector[8] ALIGN_SUF = { 10, 10, 10, 10, 10, 10, 10, 10 }; 39 | ALIGN_PRE static const char kAsciiZero[16] ALIGN_SUF = { '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0' }; 40 | 41 | inline __m128i Convert8DigitsSSE2(uint32_t value) { 42 | assert(value <= 99999999); 43 | 44 | // abcd, efgh = abcdefgh divmod 10000 45 | const __m128i abcdefgh = _mm_cvtsi32_si128(value); 46 | const __m128i abcd = _mm_srli_epi64(_mm_mul_epu32(abcdefgh, reinterpret_cast(kDiv10000Vector)[0]), 45); 47 | const __m128i efgh = _mm_sub_epi32(abcdefgh, _mm_mul_epu32(abcd, reinterpret_cast(k10000Vector)[0])); 48 | 49 | // v1 = [ abcd, efgh, 0, 0, 0, 0, 0, 0 ] 50 | const __m128i v1 = _mm_unpacklo_epi16(abcd, efgh); 51 | 52 | // v1a = v1 * 4 = [ abcd * 4, efgh * 4, 0, 0, 0, 0, 0, 0 ] 53 | const __m128i v1a = 
_mm_slli_epi64(v1, 2); 54 | 55 | // v2 = [ abcd * 4, abcd * 4, abcd * 4, abcd * 4, efgh * 4, efgh * 4, efgh * 4, efgh * 4 ] 56 | const __m128i v2a = _mm_unpacklo_epi16(v1a, v1a); 57 | const __m128i v2 = _mm_unpacklo_epi32(v2a, v2a); 58 | 59 | // v4 = v2 div 10^3, 10^2, 10^1, 10^0 = [ a, ab, abc, abcd, e, ef, efg, efgh ] 60 | const __m128i v3 = _mm_mulhi_epu16(v2, reinterpret_cast(kDivPowersVector)[0]); 61 | const __m128i v4 = _mm_mulhi_epu16(v3, reinterpret_cast(kShiftPowersVector)[0]); 62 | 63 | // v5 = v4 * 10 = [ a0, ab0, abc0, abcd0, e0, ef0, efg0, efgh0 ] 64 | const __m128i v5 = _mm_mullo_epi16(v4, reinterpret_cast(k10Vector)[0]); 65 | 66 | // v6 = v5 << 16 = [ 0, a0, ab0, abc0, 0, e0, ef0, efg0 ] 67 | const __m128i v6 = _mm_slli_epi64(v5, 16); 68 | 69 | // v7 = v4 - v6 = { a, b, c, d, e, f, g, h } 70 | const __m128i v7 = _mm_sub_epi16(v4, v6); 71 | 72 | return v7; 73 | } 74 | 75 | inline __m128i ShiftDigits_SSE2(__m128i a, unsigned digit) { 76 | assert(digit <= 8); 77 | switch (digit) { 78 | case 0: return a; 79 | case 1: return _mm_srli_si128(a, 1); 80 | case 2: return _mm_srli_si128(a, 2); 81 | case 3: return _mm_srli_si128(a, 3); 82 | case 4: return _mm_srli_si128(a, 4); 83 | case 5: return _mm_srli_si128(a, 5); 84 | case 6: return _mm_srli_si128(a, 6); 85 | case 7: return _mm_srli_si128(a, 7); 86 | case 8: return _mm_srli_si128(a, 8); 87 | } 88 | return a; // should not execute here. 
89 | } 90 | 91 | inline void u32toa_sse2(uint32_t value, char* buffer) { 92 | if (value < 10000) { 93 | const uint32_t d1 = (value / 100) << 1; 94 | const uint32_t d2 = (value % 100) << 1; 95 | 96 | if (value >= 1000) 97 | *buffer++ = gDigitsLut[d1]; 98 | if (value >= 100) 99 | *buffer++ = gDigitsLut[d1 + 1]; 100 | if (value >= 10) 101 | *buffer++ = gDigitsLut[d2]; 102 | *buffer++ = gDigitsLut[d2 + 1]; 103 | *buffer++ = '\0'; 104 | } 105 | else if (value < 100000000) { 106 | // Experiment shows that this case SSE2 is slower 107 | #if 0 108 | const __m128i a = Convert8DigitsSSE2(value); 109 | 110 | // Convert to bytes, add '0' 111 | const __m128i va = _mm_add_epi8(_mm_packus_epi16(a, _mm_setzero_si128()), reinterpret_cast(kAsciiZero)[0]); 112 | 113 | // Count number of digit 114 | const unsigned mask = _mm_movemask_epi8(_mm_cmpeq_epi8(va, reinterpret_cast(kAsciiZero)[0])); 115 | unsigned long digit; 116 | #ifdef _MSC_VER 117 | _BitScanForward(&digit, ~mask | 0x8000); 118 | #else 119 | digit = __builtin_ctz(~mask | 0x8000); 120 | #endif 121 | 122 | // Shift digits to the beginning 123 | __m128i result = ShiftDigits_SSE2(va, digit); 124 | //__m128i result = _mm_srl_epi64(va, _mm_cvtsi32_si128(digit * 8)); 125 | _mm_storel_epi64(reinterpret_cast<__m128i*>(buffer), result); 126 | buffer[8 - digit] = '\0'; 127 | #else 128 | // value = bbbbcccc 129 | const uint32_t b = value / 10000; 130 | const uint32_t c = value % 10000; 131 | 132 | const uint32_t d1 = (b / 100) << 1; 133 | const uint32_t d2 = (b % 100) << 1; 134 | 135 | const uint32_t d3 = (c / 100) << 1; 136 | const uint32_t d4 = (c % 100) << 1; 137 | 138 | if (value >= 10000000) 139 | *buffer++ = gDigitsLut[d1]; 140 | if (value >= 1000000) 141 | *buffer++ = gDigitsLut[d1 + 1]; 142 | if (value >= 100000) 143 | *buffer++ = gDigitsLut[d2]; 144 | *buffer++ = gDigitsLut[d2 + 1]; 145 | 146 | *buffer++ = gDigitsLut[d3]; 147 | *buffer++ = gDigitsLut[d3 + 1]; 148 | *buffer++ = gDigitsLut[d4]; 149 | *buffer++ = gDigitsLut[d4 + 
1]; 150 | *buffer++ = '\0'; 151 | #endif 152 | } 153 | else { 154 | // value = aabbbbbbbb in decimal 155 | 156 | const uint32_t a = value / 100000000; // 1 to 42 157 | value %= 100000000; 158 | 159 | if (a >= 10) { 160 | const unsigned i = a << 1; 161 | *buffer++ = gDigitsLut[i]; 162 | *buffer++ = gDigitsLut[i + 1]; 163 | } 164 | else 165 | *buffer++ = '0' + static_cast(a); 166 | 167 | const __m128i b = Convert8DigitsSSE2(value); 168 | const __m128i ba = _mm_add_epi8(_mm_packus_epi16(_mm_setzero_si128(), b), reinterpret_cast(kAsciiZero)[0]); 169 | const __m128i result = _mm_srli_si128(ba, 8); 170 | _mm_storel_epi64(reinterpret_cast<__m128i*>(buffer), result); 171 | buffer[8] = '\0'; 172 | } 173 | } 174 | 175 | void i32toa_sse2(int32_t value, char* buffer) { 176 | uint32_t u = static_cast(value); 177 | if (value < 0) { 178 | *buffer++ = '-'; 179 | u = ~u + 1; 180 | } 181 | u32toa_sse2(u, buffer); 182 | } 183 | 184 | inline void u64toa_sse2(uint64_t value, char* buffer) { 185 | if (value < 100000000) { 186 | uint32_t v = static_cast(value); 187 | if (v < 10000) { 188 | const uint32_t d1 = (v / 100) << 1; 189 | const uint32_t d2 = (v % 100) << 1; 190 | 191 | if (v >= 1000) 192 | *buffer++ = gDigitsLut[d1]; 193 | if (v >= 100) 194 | *buffer++ = gDigitsLut[d1 + 1]; 195 | if (v >= 10) 196 | *buffer++ = gDigitsLut[d2]; 197 | *buffer++ = gDigitsLut[d2 + 1]; 198 | *buffer++ = '\0'; 199 | } 200 | else { 201 | // Experiment shows that this case SSE2 is slower 202 | #if 0 203 | const __m128i a = Convert8DigitsSSE2(v); 204 | 205 | // Convert to bytes, add '0' 206 | const __m128i va = _mm_add_epi8(_mm_packus_epi16(a, _mm_setzero_si128()), reinterpret_cast(kAsciiZero)[0]); 207 | 208 | // Count number of digit 209 | const unsigned mask = _mm_movemask_epi8(_mm_cmpeq_epi8(va, reinterpret_cast(kAsciiZero)[0])); 210 | unsigned long digit; 211 | #ifdef _MSC_VER 212 | _BitScanForward(&digit, ~mask | 0x8000); 213 | #else 214 | digit = __builtin_ctz(~mask | 0x8000); 215 | #endif 216 | 217 
| // Shift digits to the beginning 218 | __m128i result = ShiftDigits_SSE2(va, digit); 219 | _mm_storel_epi64(reinterpret_cast<__m128i*>(buffer), result); 220 | buffer[8 - digit] = '\0'; 221 | #else 222 | // value = bbbbcccc 223 | const uint32_t b = v / 10000; 224 | const uint32_t c = v % 10000; 225 | 226 | const uint32_t d1 = (b / 100) << 1; 227 | const uint32_t d2 = (b % 100) << 1; 228 | 229 | const uint32_t d3 = (c / 100) << 1; 230 | const uint32_t d4 = (c % 100) << 1; 231 | 232 | if (value >= 10000000) 233 | *buffer++ = gDigitsLut[d1]; 234 | if (value >= 1000000) 235 | *buffer++ = gDigitsLut[d1 + 1]; 236 | if (value >= 100000) 237 | *buffer++ = gDigitsLut[d2]; 238 | *buffer++ = gDigitsLut[d2 + 1]; 239 | 240 | *buffer++ = gDigitsLut[d3]; 241 | *buffer++ = gDigitsLut[d3 + 1]; 242 | *buffer++ = gDigitsLut[d4]; 243 | *buffer++ = gDigitsLut[d4 + 1]; 244 | *buffer++ = '\0'; 245 | #endif 246 | } 247 | } 248 | else if (value < 10000000000000000) { 249 | const uint32_t v0 = static_cast(value / 100000000); 250 | const uint32_t v1 = static_cast(value % 100000000); 251 | 252 | const __m128i a0 = Convert8DigitsSSE2(v0); 253 | const __m128i a1 = Convert8DigitsSSE2(v1); 254 | 255 | // Convert to bytes, add '0' 256 | const __m128i va = _mm_add_epi8(_mm_packus_epi16(a0, a1), reinterpret_cast(kAsciiZero)[0]); 257 | 258 | // Count number of digit 259 | const unsigned mask = _mm_movemask_epi8(_mm_cmpeq_epi8(va, reinterpret_cast(kAsciiZero)[0])); 260 | #ifdef _MSC_VER 261 | unsigned long digit; 262 | _BitScanForward(&digit, ~mask | 0x8000); 263 | #else 264 | unsigned digit = __builtin_ctz(~mask | 0x8000); 265 | #endif 266 | 267 | // Shift digits to the beginning 268 | __m128i result = ShiftDigits_SSE2(va, digit); 269 | _mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result); 270 | buffer[16 - digit] = '\0'; 271 | } 272 | else { 273 | const uint32_t a = static_cast(value / 10000000000000000); // 1 to 1844 274 | value %= 10000000000000000; 275 | 276 | if (a < 10) 277 | *buffer++ 
= '0' + static_cast(a); 278 | else if (a < 100) { 279 | const uint32_t i = a << 1; 280 | *buffer++ = gDigitsLut[i]; 281 | *buffer++ = gDigitsLut[i + 1]; 282 | } 283 | else if (a < 1000) { 284 | *buffer++ = '0' + static_cast(a / 100); 285 | 286 | const uint32_t i = (a % 100) << 1; 287 | *buffer++ = gDigitsLut[i]; 288 | *buffer++ = gDigitsLut[i + 1]; 289 | } 290 | else { 291 | const uint32_t i = (a / 100) << 1; 292 | const uint32_t j = (a % 100) << 1; 293 | *buffer++ = gDigitsLut[i]; 294 | *buffer++ = gDigitsLut[i + 1]; 295 | *buffer++ = gDigitsLut[j]; 296 | *buffer++ = gDigitsLut[j + 1]; 297 | } 298 | 299 | const uint32_t v0 = static_cast(value / 100000000); 300 | const uint32_t v1 = static_cast(value % 100000000); 301 | 302 | const __m128i a0 = Convert8DigitsSSE2(v0); 303 | const __m128i a1 = Convert8DigitsSSE2(v1); 304 | 305 | // Convert to bytes, add '0' 306 | const __m128i va = _mm_add_epi8(_mm_packus_epi16(a0, a1), reinterpret_cast(kAsciiZero)[0]); 307 | _mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), va); 308 | buffer[16] = '\0'; 309 | } 310 | } 311 | 312 | void i64toa_sse2(int64_t value, char* buffer) { 313 | uint64_t u = static_cast(value); 314 | if (value < 0) { 315 | *buffer++ = '-'; 316 | u = ~u + 1; 317 | } 318 | u64toa_sse2(u, buffer); 319 | } 320 | 321 | REGISTER_TEST(sse2); 322 | 323 | #endif 324 | -------------------------------------------------------------------------------- /variadic-test.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # Script to test how much bloating a large project will suffer when using 4 | # different formatting methods. 5 | # Based on bloat_test.sh from https://github.com/c42f/tinyformat. 
6 | 7 | from __future__ import print_function, division 8 | 9 | import argparse 10 | import os 11 | import pickle 12 | import re 13 | import sys 14 | from glob import glob 15 | from subprocess import check_call, Popen, PIPE 16 | from timeit import timeit 17 | 18 | 19 | parser = argparse.ArgumentParser() 20 | subparsers = parser.add_subparsers(help='possible commands', dest='command') 21 | 22 | parser_bench = subparsers.add_parser('bench', help='run the benchmark') 23 | parser_bench.add_argument('min', type=int, help='minimum number of arguments') 24 | parser_bench.add_argument('max', type=int, help='maximum number of arguments') 25 | parser_bench.add_argument('num_translation_units', metavar='N', type=int, 26 | help='number of translation units') 27 | 28 | parser_plot = subparsers.add_parser('plot', help='plot the results') 29 | parser_plot.add_argument('--filename', type=str, default='variadic-test.pkl', 30 | help='bench result file path') 31 | 32 | parser_plotdiff = subparsers.add_parser( 33 | 'plotdiff', help='plot the difference between result files' 34 | ) 35 | parser_plotdiff.add_argument('files', type=str, nargs='*', 36 | help='result files to be compared') 37 | parser_plotdiff.add_argument('--method', type=str, default='C++ Format', 38 | help='formatting library') 39 | parser_plotdiff.add_argument('--config', type=str, default='optimized', 40 | help='optimized or debug') 41 | 42 | options, more_compiler_flags = parser.parse_known_args(sys.argv[1:]) 43 | 44 | if 'plot' in options.command: 45 | import numpy as np 46 | import matplotlib.pyplot as plt 47 | import seaborn as sns 48 | 49 | 50 | prefix = '_variadic_test_tmp_' 51 | NUM_RUNS = 3 52 | use_clobber = False 53 | 54 | configs = [ 55 | ('optimized', ['-O3', '-DNDEBUG']), 56 | ('debug', []) 57 | ] 58 | 59 | methods = [ 60 | ('printf' , []), 61 | ('IOStreams' , ['-DUSE_IOSTREAMS']), 62 | ('fmt' , ['-DUSE_FMT', '-Lfmt', '-lfmt']), 63 | ('tinyformat' , ['-DUSE_TINYFORMAT']), 64 | ('Boost Format', 
# Per-library C++ code templates used to generate the benchmark sources.
#   'statement' - the C++ statement emitted for one formatting call;
#   'sep'       - the text joining the argument expressions;
#   'specifier' - the per-argument format specifier ('' when the library
#                 takes no format string, as with iostreams).
method_templates = {
    'boost': {
        'statement': r'std::cout << boost::format("{fmt_str}\n") % {args};',
        'sep': ' % ',
        'specifier': '%{type}',
    },
    'fmt': {
        'statement': r'fmt::print("{fmt_str}\n", {args});',
        'sep': ', ',
        'specifier': '{{:{type}}}',
    },
    'iostream': {
        'statement': r'std::cout << {args} << "\n";',
        'sep': ' << ":" << ',
        'specifier': '',
    },
    'tinyformat': {
        'statement': r'tfm::printf("{fmt_str}\n", {args});',
        'sep': ', ',
        'specifier': '%{type}',
    },
    'printf': {
        'statement': r'::printf("{fmt_str}\n", {args});',
        'sep': ', ',
        'specifier': '%{type}',
    },
}

# Skeleton of one generated translation unit.  The USE_* macro passed on the
# compiler command line selects which library's function body is compiled.
# Every #include names its header: boost::format needs <boost/format.hpp>,
# std::cout needs <iostream> and ::printf needs <cstdio>.
main_template = r'''
#ifdef USE_BOOST

#include <boost/format.hpp>
#include <iostream>

{boost}

#elif defined(USE_FMT)

#include "fmt/format.h"

{fmt}

#elif defined(USE_IOSTREAMS)

#include <iostream>

{iostream}

#elif defined(USE_TINYFORMAT)

#include "tinyformat.h"

{tinyformat}

#else

#include <cstdio>

{printf}

#endif
'''


def make_format_string(method, args):
    """Return the format string for `args`, with specifiers joined by ':'.

    `method` is one entry of `method_templates`; `args` is a sequence of
    (expression, type) pairs.
    """
    specifiers = [method['specifier'].format(type=a[1]) for a in args]
    return ':'.join(specifiers)


def make_statement(method, args):
    """Return one C++ formatting statement for `args` using `method`.

    The format string is only generated for methods that have a non-empty
    'specifier'; the iostream template has no {fmt_str} placeholder.
    """
    d = {'args': method['sep'].join(a[0] for a in args)}
    if method['specifier']:
        d['fmt_str'] = make_format_string(method, args)

    return method['statement'].format(**d)


def generate_args(start_n):
    """Yield an endless stream of (expression, type) argument pairs.

    Each round yields three literals derived from the counter (an int, a
    float and a string) followed by the function parameters i, f and s;
    the counter is then incremented.
    """
    n = start_n
    while True:
        args = [(str(n), 'd'),
                (str(float(n)), '.1f'),
                ('"String{}"'.format(n), 's'),
                ('i', 'd'),
                ('f', '.1f'),
                ('s', 's')]
        for a in args:
            yield a
        n += 1


def make_function(func_def, method, n, num_args):
    """Return the C++ source of one function made of formatting statements.

    The function contains `5 * num_args` statements; each statement formats
    a sliding window of `num_args` consecutive arguments taken from
    `generate_args(n)`.
    """
    from itertools import islice

    mul = 5
    # Twice the number of windows is enough for the last window to be full.
    args = list(islice(generate_args(n), 2 * mul * num_args))

    statements = [make_statement(method, args[shift:shift + num_args])
                  for shift in range(mul * num_args)]

    if use_clobber:
        # A memory clobber between statements acts as a compiler barrier.
        sep = ' asm volatile("" : : : "memory");\n '
    else:
        sep = '\n '

    # Only the body template goes through str.format, so braces in
    # `func_def` can never be misread as replacement fields.
    return func_def + ' {{\n {}\n}}'.format(sep.join(statements))


def make_template(func_def, n, num_args):
    """Return the source of a translation unit covering every method."""
    functions = {k: make_function(func_def, v, n, num_args)
                 for k, v in method_templates.items()}
    return main_template.format(**functions)


class Table:
    """Prints a reStructuredText table"""

    def __init__(self, header, formats):
        """Print the table header.

        `header` is the list of column titles, which also fix the column
        widths; `formats` holds one format spec suffix per column.
        """
        self.widths = [len(i) for i in header]
        self.formats = formats

        self.print_rulers()
        for field, width in zip(header, self.widths):
            print(self.format_field(field, '', width), end=' ')
        print()
        self.print_rulers()

    @staticmethod
    def format_field(field, fmt='', width=''):
        """Format `field` using the format spec `width` followed by `fmt`."""
        return '{:{}{}}'.format(field, width, fmt)

    def print_rulers(self):
        """Print a row of '=' rulers, one per column."""
        for w in self.widths:
            print('=' * w, end=' ')
        print()

    def print_row(self, *row):
        """Print one data row using the per-column formats and widths."""
        for field, fmt, width in zip(row, self.formats, self.widths):
            print(self.format_field(field, fmt, width), end=' ')
        print()


def to_kib(n):
    """Converts n to kibibytes"""
    return int(round(n / 1024.0))


def remove_old_files():
    """Remove the sources generated by a previous run."""
    # generate_files() names translation units with three digits; the
    # two-character pattern is kept to clean up after older layouts.
    filenames = glob(prefix + '??.cc') + glob(prefix + '???.cc')
    for f in [prefix + 'main.cc', prefix + 'all.h']:
        if os.path.exists(f):
            filenames.append(f)
    for f in filenames:
        os.remove(f)


def generate_files(num_args):
    """Generate the benchmark sources and return the list of .cc files.

    Writes a main file that calls every generated function, a header that
    declares them, and one translation unit per function.
    """
    main_source = prefix + 'main.cc'
    main_header = prefix + 'all.h'
    sources = [main_source]
    with open(main_source, 'w') as cppfile, open(main_header, 'w') as hppfile:
        # Leading spaces are stripped from every line, so the literal can
        # be indented along with the surrounding code.
        cppfile.write(re.sub('^ +', '', '''\
            #include "{}all.h"
            #ifdef USE_IOSTREAMS
            # include <iostream>
            #endif

            int main() {{
            #ifdef USE_IOSTREAMS
            std::cout.setf(std::ios::fixed);
            std::cout.precision(1);
            #endif
            '''.format(prefix), flags=re.MULTILINE))
        for i in range(options.num_translation_units):
            n = '{:03}'.format(i)
            func_name = 'doFormat_a' + n
            func_params = '(int i, float f, const char* s)'
            func_def = 'void ' + func_name + func_params
            source = prefix + n + '.cc'
            sources.append(source)

            with open(source, 'w') as f:
                f.write(make_template(func_def, i, num_args))

            cppfile.write(func_name + '(1, 1.0f, "String");\n')
            hppfile.write(func_def + ';\n')
        cppfile.write('}')

    return sources


def find_compiler():
    """Return the path of the first g++ in PATH, or None if there is none.

    Symlinks that resolve to ccache are skipped.
    """
    search_path = os.environ.get('PATH')
    if not search_path:
        return None

    for path in search_path.split(os.pathsep):
        filename = os.path.join(path, 'g++')
        if not os.path.exists(filename):
            continue
        if os.path.islink(filename) and \
                os.path.basename(os.path.realpath(filename)) == 'ccache':
            # Don't use ccache.
            print('Ignoring ccache link at', filename)
            continue
        return filename

    return None


def measure_compile(compiler_path, sources, flags):
    """Measure compile time and executable size

    Returns a dict with the keys 'time' (seconds), 'size' and
    'stripped_size' (bytes) and 'output' (stdout of the built executable).
    """
    output_filename = prefix + '.out'
    if os.path.exists(output_filename):
        os.remove(output_filename)

    include_dir = '-I' + os.path.dirname(os.path.realpath(__file__))
    command = ([compiler_path, '-std=c++11', '-o', output_filename,
                include_dir] + sources + flags + more_compiler_flags)

    # Timing a callable avoids rebuilding the command from its repr.
    result = {'time': timeit(lambda: check_call(command), number=1)}
    result['size'] = os.stat(output_filename).st_size

    check_call(['strip', output_filename])
    result['stripped_size'] = os.stat(output_filename).st_size

    p = Popen(['./' + output_filename], stdout=PIPE,
              env={'LD_LIBRARY_PATH': 'fmt'})
    result['output'] = p.communicate()[0]
    sys.stdout.flush()

    return result


def bench_single(num_args, flags):
    """Benchmark one argument count and return the best measurement.

    The sources are compiled NUM_RUNS times; the smallest compile time is
    kept.

    Raises:
        RuntimeError: if no g++ compiler is found in PATH.
        Exception: if the binary sizes differ between runs.
    """
    remove_old_files()
    sources = generate_files(num_args)
    compiler_path = find_compiler()
    if compiler_path is None:
        raise RuntimeError('g++ not found in PATH')

    result = {}
    for _ in range(NUM_RUNS):
        sys.stdout.flush()

        old_result = result
        result = measure_compile(compiler_path, sources, flags)

        if 'time' not in old_result:
            # First run: nothing to compare against yet.
            continue

        result['time'] = min(old_result['time'], result['time'])
        if any(result[k] != old_result[k] for k in ('size', 'stripped_size')):
            raise Exception('size mismatch')

    return result


def bench(method, config, flags):
    """Benchmark `method` for every argument count and print a table.

    Returns the list of results, one per argument count in
    range(options.min, options.max).
    """
    print('Benchmarking', config, method)
    table = Table(
        ['Args', 'Compile time, s', 'Executable size, KiB', 'Stripped size, KiB'],
        ['', '.1f', '', '']
    )
    results = []
    for num_args in range(options.min, options.max):
        result = bench_single(num_args, flags)
        table.print_row(num_args, result['time'], to_kib(result['size']),
                        to_kib(result['stripped_size']))
        results.append(result)
    table.print_rulers()
    print()
    return results


def check_output(expected_list, actual_list):
    """Raise an Exception if any pair of results has different 'output'.

    Both outputs are printed before raising to ease debugging.
    """
    for expected, actual in zip(expected_list, actual_list):
        if expected['output'] != actual['output']:
            print(expected['output'])
            print(actual['output'])
            raise Exception("output doesn't match")


def bench_command():
    """Run all benchmarks and pickle the results to variadic-test.pkl."""
    data = {'options': options}
    for method, method_flags in methods:
        data[method] = {config: bench(method, config, method_flags + config_flags)
                        for config, config_flags in configs}

        # Once printf has been measured it serves as the reference output.
        if 'printf' in data:
            for config, _ in configs:
                check_output(data['printf'][config], data[method][config])

    # The program output is only needed for the check above.
    for method, _ in methods:
        for config, _ in configs:
            for result in data[method][config]:
                del result['output']

    with open('variadic-test.pkl', 'wb') as file:
        pickle.dump(data, file)


def load_data(filename):
    """Load pickled benchmark results; '.pkl' is appended if missing."""
    if not filename.endswith('.pkl'):
        filename += '.pkl'

    # NOTE: unpickling can execute arbitrary code; only load result files
    # produced by a trusted run of this script.
    with open(filename, 'rb') as file:
        return pickle.load(file)


def set_plot_style():
    """Apply the seaborn style and palette used by all plots."""
    sns.set_style("ticks", rc={'xtick.direction': 'in', 'ytick.direction': 'in'})
    sns.set_palette('Set1')


def plot_title(s):
    """Draw `s` as a boxed title in the upper left corner of the axes."""
    plt.annotate(s, (0.05, 0.9), xycoords='axes fraction',
                 fontsize=14, horizontalalignment='left',
                 bbox=dict(boxstyle="round,pad=0.2", fc='white'))


def plot_subfigure(x, y, prop, **kwargs):
    """Plot one series on the current axes and set up the axes.

    `prop` selects the y label: 'time' or anything else for binary size.
    Extra keyword arguments are passed to plt.plot.
    """
    # Draw the fmt series on top of the others; 'C++ Format' is kept for
    # callers that still pass that label.
    zorder = 3 if kwargs.get('label') in ('fmt', 'C++ Format') else 2
    plt.plot(x, y, marker='o', markersize=6, zorder=zorder, **kwargs)
    plt.axvline(16, ls='--', color='grey')

    xmax = max(x) + 1
    plt.xlim(0, xmax)
    plt.xticks(np.arange(0, xmax, 4))
    plt.grid(color='k', alpha=0.4, ls=':')
    sns.despine()

    plt.xlabel('number of arguments')
    if prop == 'time':
        plt.ylabel('compile time (s)')
    else:
        plt.ylabel('binary size (MiB)')


def plot_all(filename, prop):
    """Plot `prop` for every method and save variadic-test_<prop>.png.

    The left subplot shows the 'optimized' config, the right one 'debug'.
    """
    data = load_data(filename)
    x = np.arange(data['options'].min, data['options'].max)

    def make_y(results):
        y = [result[prop] for result in results]
        if prop == 'size':
            # Bytes to MiB.
            y = np.array(y) / 1024 / 1024
        return y

    set_plot_style()
    plt.figure(figsize=(8, 3.5))

    # subplot() takes a three-digit integer, not a string.
    plt.subplot(121)
    plot_title('release')
    for method, _ in methods:
        plot_subfigure(x, make_y(data[method]['optimized']), prop, label=method)

    plt.legend(loc='upper center', bbox_to_anchor=(1.05, 1.17),
               ncol=5, fontsize=11)

    plt.subplot(122)
    plot_title('debug')
    for method, _ in methods:
        plot_subfigure(x, make_y(data[method]['debug']), prop, label=method)

    plt.suptitle('variadic-test', fontsize=16, y=1.1)
    plt.savefig('variadic-test_{}.png'.format(prop), bbox_inches='tight')


def plot_command():
    """Plot compile time and binary size from options.filename."""
    for prop in 'time', 'size':
        plot_all(options.filename, prop)


def plot_diff(filenames, method, config, prop):
    """Compare `prop` across result files and save variadic-diff_<prop>.png.

    The left subplot shows absolute values; the right one shows values
    relative to the first file, which serves as the baseline.
    """
    dataset = [load_data(f) for f in filenames]
    baseline = dataset[0][method][config]
    x = np.arange(dataset[0]['options'].min, dataset[0]['options'].max)

    set_plot_style()
    plt.figure(figsize=(8, 3.5))

    # subplot() takes a three-digit integer, not a string.
    plt.subplot(121)
    plot_title('absolute')
    for name, data in zip(filenames, dataset):
        y = [result[prop] for result in data[method][config]]
        if prop == 'size':
            # Bytes to MiB.
            y = np.array(y) / 1024 / 1024
        plot_subfigure(x, y, prop, label=name)

    plt.legend(loc='upper center', bbox_to_anchor=(1.05, 1.17),
               ncol=5, fontsize=11)

    plt.subplot(122)
    plot_title('relative')
    for name, data in zip(filenames, dataset):
        results = data[method][config]
        y = [result[prop] / base[prop] for result, base in zip(results, baseline)]
        plot_subfigure(x, y, prop, label=name)

    # Leave some headroom and show the ratios as percentages.
    ylim = plt.ylim()
    plt.ylim(ylim[0] * 0.8, ylim[1] * 1.2)
    ax = plt.gca()
    vals = ax.get_yticks()
    ax.set_yticklabels(['{:3.0%}'.format(v) for v in vals])
    plt.ylabel('')

    plt.suptitle('variadic-diff', fontsize=16, y=1.1)
    plt.savefig('variadic-diff_{}.png'.format(prop), bbox_inches='tight')


def plotdiff_command():
    """Plot compile time and binary size differences between result files."""
    for prop in 'time', 'size':
        plot_diff(options.files, options.method, options.config, prop)


# Maps the command name given on the command line to its implementation.
commands = {
    'bench': bench_command,
    'plot': plot_command,
    'plotdiff': plotdiff_command,
}

if __name__ == '__main__':
    commands[options.command]()