├── .gitignore ├── .gitmodules ├── CMakeLists.txt ├── LICENSE_1_0.txt ├── README.md ├── benchmark ├── CMakeLists.txt ├── benchmark.cpp ├── utf8.h └── utf8 │ ├── checked.h │ ├── core.h │ └── unchecked.h ├── example ├── CMakeLists.txt └── utf8_to_utf16be.cpp ├── include └── tcb │ └── utf_ranges │ ├── convert.hpp │ ├── detail │ └── utf.hpp │ ├── istreambuf_range.hpp │ ├── ostreambuf_iterator.hpp │ ├── view.hpp │ └── view │ ├── bom.hpp │ ├── bytes.hpp │ ├── endian_convert.hpp │ ├── line_end_transform.hpp │ └── utf_convert.hpp └── test ├── CMakeLists.txt ├── bom_test.cpp ├── bytes_test.cpp ├── catch.hpp ├── catch_main.cpp ├── convert_test.cpp ├── endian_test.cpp ├── istreambuf_range_test.cpp ├── line_end_transform_test.cpp ├── ostreambuf_iterator_test.cpp └── utf_convert_view_test.cpp /.gitignore: -------------------------------------------------------------------------------- 1 | 2 | build*/ 3 | 4 | # QtCreator 5 | CMakeLists.txt.user 6 | 7 | # CLion 8 | .idea/ 9 | 10 | 11 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "external/range-v3"] 2 | path = external/range-v3 3 | url = https://github.com/ericniebler/range-v3.git 4 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | 2 | cmake_minimum_required(VERSION 3.1) 3 | 4 | project(utf_ranges CXX) 5 | 6 | set(CMAKE_CXX_STANDARD 14) 7 | 8 | if (UNIX) 9 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra") 10 | set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -Werror") 11 | set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -O2") 12 | endif() # UNIX 13 | 14 | if (WIN32) 15 | if (CMAKE_COMPILER_IS_GNUCXX) 16 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra") 17 | set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -Werror") 18 | 
endif() 19 | add_definitions("-DNOMINMAX") 20 | endif() 21 | 22 | find_package(Boost COMPONENTS system REQUIRED) 23 | 24 | include_directories(include) 25 | set(RANGE_INCLUDE_DIR "${utf_ranges_SOURCE_DIR}/external/range-v3/include") 26 | 27 | add_subdirectory(benchmark) 28 | add_subdirectory(example) 29 | add_subdirectory(test) 30 | 31 | 32 | 33 | 34 | -------------------------------------------------------------------------------- /LICENSE_1_0.txt: -------------------------------------------------------------------------------- 1 | Boost Software License - Version 1.0 - August 17th, 2003 2 | 3 | Permission is hereby granted, free of charge, to any person or organization 4 | obtaining a copy of the software and accompanying documentation covered by 5 | this license (the "Software") to use, reproduce, display, distribute, 6 | execute, and transmit the Software, and to prepare derivative works of the 7 | Software, and to permit third-parties to whom the Software is furnished to 8 | do so, all subject to the following: 9 | 10 | The copyright notices in the Software and this entire statement, including 11 | the above license grant, this restriction and the following disclaimer, 12 | must be included in all copies of the Software, in whole or in part, and 13 | all derivative works of the Software, unless such copies or derivative 14 | works are solely in the form of machine-executable object code generated by 15 | a source language processor. 16 | 17 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19 | FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT 20 | SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE 21 | FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, 22 | ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 23 | DEALINGS IN THE SOFTWARE. 
24 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | # utf_ranges 3 | 4 | A collection of Unicode utilities for C++ using [Range-V3](https://github.com/ericniebler/range-v3) 5 | 6 | This header-only library contains facilities for transforming between UTF-8, UTF-16 and UTF-32 encoded strings (eagerly and lazily), as well as dealing with byte-order marks and transforming line endings. 7 | 8 | ## Example 9 | 10 | A quick overview is best supplied by an example. The following reads a UTF-8 encoded input stream and outputs a UTF-16BE byte stream with byte-order mark: 11 | 12 | ```cpp 13 | namespace rng = ::ranges::v3; 14 | namespace utf = ::tcb::utf_ranges; 15 | 16 | std::ifstream in_file{"input_file.utf8.txt", std::ios::binary}; 17 | std::ofstream out_file{"output_file.utf16be.txt", std::ios::binary}; 18 | 19 | auto view = utf::istreambuf(in_file) // Read range from input stream 20 | | utf::view::consume_bom // Remove UTF-8 "BOM" if present 21 | | utf::view::utf16 // Convert to UTF-16 22 | | utf::view::add_bom // Prepend UTF-16 BOM to start of range 23 | | utf::view::endian_convert // Convert to big-endian 24 | | utf::view::bytes; // Write to disk as bytes 25 | 26 | rng::copy(view, utf::ostreambuf_iterator{out_file}); // Do the copy 27 | ``` 28 | 29 | (see example/utf8_to_utf16be.cpp for the full code). 30 | 31 | ## Conversions 32 | 33 | For "eager" encoding conversions, the library broadly follows the API specified in [Beman Dawes' proposed Unicode conversion library](https://github.com/Beman/unicode/tree/std-proposal), albeit (currently) with simplified error handling (invalid Unicode characters are simply replaced by the Unicode replacement character U+FFFD). The actual conversion uses code taken from Boost.Locale. 
34 | 35 | To convert a range of characters between UTF-8, UTF-16 or UTF-32, use the `tcb::utf_ranges::utf_convert()` function. This takes an `InputRange` with a value type that is an arithmetic type of size 1, 2 or 4 bytes (for UTF-8, UTF-16 and UTF-32 respectively), and an `OutputIterator` with a value type similarly defined. For example: 36 | 37 | ```cpp 38 | std::string in = u8"Hello world"; 39 | std::u16string out; 40 | // Note that the output type cannot be determined automatically, so must be specified 41 | tcb::utf_ranges::utf_convert(in, std::back_inserter(out)); 42 | ``` 43 | 44 | To transform directly to a new string, the `to_utf_string()` function is supplied: 45 | 46 | ```cpp 47 | std::u16string in = u"Hello world"; 48 | std::string out = tcb::utf_ranges::to_utf_string(in); 49 | ``` 50 | 51 | Convenience functions `to_u8string()`, `to_u16string()`, `to_u32string()` and `to_wstring()` are also provided (but please don't use the last one): 52 | 53 | ```cpp 54 | std::u32string in = U"Hello world"; 55 | std::u16string out = tcb::utf_ranges::to_u16string(in); 56 | ``` 57 | 58 | ## Views 59 | 60 | If you're familiar with Range-V3, you'll know that views perform lazy transformations on a given range -- that is, conversion is done one element at a time when the view is iterated over. 61 | 62 | ### Encoding conversions 63 | 64 | This library provides views which lazily perform the same transformations as above. For consistency with Range-V3, these are in the `view::` sub-namespace. 65 | 66 | ```cpp 67 | std::u16string in = u"Hello world"; 68 | 69 | auto view = tcb::utf_ranges::view::utf8(in); 70 | 71 | ranges::v3::copy(view, std::ostream_iterator(std::cout)); 72 | ``` 73 | 74 | There are similar `utf16` and `utf32` views. 75 | 76 | ### Endian transformations 77 | 78 | For UTF-16 and UTF-32, the library provides views which perform byte-swapping between native-, big- and little-endian representations, using code from Boost. 
The output endianness is specified by a template parameter, and the input endianness is passed as an argument to the constructor. Both default to `boost::endian::native`. For example: 79 | 80 | ``` 81 | std::u16string in = u"Hello world"; // native endian 82 | auto view = tcb::utf_ranges::view::endian_convert(in); 83 | std::vector out = view; // Copy byte-swapped values to vector 84 | ``` 85 | 86 | ### Byte order mark handling 87 | 88 | The library provides two views for dealing with "byte order marks", that is, the Unicode zero-width non-breaking space character U+FEFF which is often placed at the start of files to allow the endianness to be detected. 89 | 90 | To detect a byte-order mark, use the `consume_bom` view: 91 | 92 | ``` 93 | std::u16string in = u"\uFEFFHello world"; // native-endian UTF-16 with BOM 94 | auto view = tcb::utf_ranges::view::consume_bom(in); 95 | std::u16string out = view; // copy to new string with BOM removed 96 | ``` 97 | 98 | As suggested by the name, the byte order mark is removed if present. If a BOM is found and has non-native endianness, endian conversion is automatically performed -- that is, the output of the view will always be native-endian. For UTF-8, if a BOM is detected it is simply removed. If no BOM is present, the string is assumed to be native-endian (for UTF-16 and -32), and is passed through unchanged. 99 | 100 | To place a byte-order mark at the start of a string, use the `add_bom` view: 101 | 102 | ``` 103 | std::u16string in = u"Hello world"; 104 | auto view = tcb::utf_ranges::view::add_bom(in); 105 | std::u16string out = view; // copy to new string, with BOM prepended 106 | ``` 107 | 108 | ### Line ending transformation 109 | 110 | Unicode specifies eight possible line endings, and recommends that these are converted to the machine native line ending representation on input. In C++, the native representation is "\n". The `line_end_transform` view performs such a conversion. 
For example: 111 | 112 | ```cpp 113 | std::string in = u8"Hello world\r\n"; // Windows-style 114 | std::string out = tcb::utf_ranges::view::line_end_transform(in); 115 | assert(out == "Hello world\n"); 116 | ``` 117 | 118 | ### Chaining views 119 | 120 | As with Range-V3, `operator|` is overloaded for views, allowing them to be easily concatenated together, as in the example above. 121 | 122 | ## Licence 123 | 124 | This library is provided under the Boost licence. See LICENSE_1_0.txt for details. 125 | -------------------------------------------------------------------------------- /benchmark/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | 2 | 3 | find_package(Boost COMPONENTS locale REQUIRED) 4 | 5 | if (Boost_FOUND) 6 | add_executable(benchmark benchmark.cpp) 7 | 8 | target_include_directories(benchmark PRIVATE 9 | ${RANGE_INCLUDE_DIR} 10 | ${Boost_INCLUDE_DIR} 11 | ) 12 | else() 13 | message("Boost.Locale not found, skipping benchmark target") 14 | endif() -------------------------------------------------------------------------------- /benchmark/benchmark.cpp: -------------------------------------------------------------------------------- 1 | 2 | // Copyright (c) 2016 Tristan Brindle (tcbrindle at gmail dot com) 3 | // Distributed under the Boost Software License, Version 1.0. 
(See accompanying 4 | // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) 5 | 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | 14 | #include "utf8.h" 15 | 16 | #include 17 | 18 | #include 19 | 20 | #include 21 | 22 | using std::string; 23 | using std::u16string; 24 | using std::u32string; 25 | 26 | namespace { 27 | 28 | struct timer { 29 | using clock = std::chrono::high_resolution_clock; 30 | using time_point = typename clock::time_point; 31 | 32 | timer() = default; 33 | 34 | template 35 | T elapsed() const 36 | { 37 | return std::chrono::duration_cast(clock::now() - start_); 38 | } 39 | 40 | private: 41 | time_point start_ = clock::now(); 42 | }; 43 | 44 | template 45 | void time_function_call(Func f, Arg&& arg, int n, string label) 46 | { 47 | timer t; 48 | for (int i = 0; i < n; i++) { 49 | volatile auto res = f(std::forward(arg)); 50 | } 51 | auto e = t.elapsed(); 52 | std::cout << label << " took " << e.count() << "ms\n"; 53 | } 54 | 55 | /* 56 | * All six codecvt conversion functions 57 | */ 58 | 59 | inline 60 | u16string codecvt_u8_to_u16(const string& u8) 61 | { 62 | using codecvt = std::codecvt_utf8_utf16; 63 | return std::wstring_convert{}.from_bytes(u8); 64 | } 65 | 66 | inline 67 | u32string codecvt_u8_to_u32(const string& u8) 68 | { 69 | using codecvt = std::codecvt_utf8; 70 | return std::wstring_convert{}.from_bytes(u8); 71 | } 72 | 73 | inline 74 | string codecvt_u16_to_u8(const u16string& u16) 75 | { 76 | using codecvt = std::codecvt_utf8_utf16; 77 | return std::wstring_convert{}.to_bytes(u16); 78 | } 79 | 80 | inline 81 | u32string codecvt_u16_to_u32(const u16string& u16) 82 | { 83 | // You might expect std::codecvt_utf16 to convert between 84 | // char16_t and char32_t, but it does not; rather, it operates on 85 | // UTF-16 encoded *byte* strings. This is not what we want. 
86 | // We could try to reinterpret_cast<> our way around the problem, but this 87 | // is ugly and error prone. The easiest way is to do the conversion in two 88 | // steps, to UTF-8 and then to UTF-32. While this might be "unfair" on 89 | // codecvt for benchmark purposes, it does rather demonstrate what a 90 | // terrible API it is. 91 | const string u8 = codecvt_u16_to_u8(u16); 92 | return codecvt_u8_to_u32(u8); 93 | } 94 | 95 | inline 96 | string codecvt_u32_to_u8(const u32string& u32) 97 | { 98 | using codecvt = std::codecvt_utf8; 99 | return std::wstring_convert{}.to_bytes(u32); 100 | } 101 | 102 | inline 103 | u16string codecvt_u32_to_u16(const u32string& u32) 104 | { 105 | // As above, to avoid reinterpret_cast<> and trying to pretend that 106 | // a UTF-16 string is really a UTF-16 byte string, we do this in two steps 107 | const string u8 = codecvt_u32_to_u8(u32); 108 | return codecvt_u8_to_u16(u8); 109 | } 110 | 111 | /* 112 | * All six cpputf8 conversion functions 113 | */ 114 | 115 | inline 116 | u16string cpputf8_u8_to_u16(const string& u8) 117 | { 118 | u16string u16; 119 | utf8::utf8to16(std::begin(u8), std::end(u8), std::back_inserter(u16)); 120 | return u16; 121 | } 122 | 123 | inline 124 | u32string cpputf8_u8_to_u32(const string& u8) 125 | { 126 | u32string u32; 127 | utf8::utf8to32(std::begin(u8), std::end(u8), std::back_inserter(u32)); 128 | return u32; 129 | } 130 | 131 | inline 132 | string cpputf8_u16_to_u8(const u16string& u16) 133 | { 134 | string u8; 135 | utf8::utf16to8(std::begin(u16), std::end(u16), std::back_inserter(u8)); 136 | return u8; 137 | } 138 | 139 | inline 140 | u32string cpputf8_u16_to_u32(const u16string& u16) 141 | { 142 | // cpputf8 doesn't support this directly (it is, after all, designed to 143 | // handle UTF-8), so we need to do it in two steps 144 | const string u8 = cpputf8_u16_to_u8(u16); 145 | return cpputf8_u8_to_u32(u8); 146 | } 147 | 148 | inline 149 | string cpputf8_u32_to_u8(const u32string& u32) 150 | { 
151 | string u8; 152 | utf8::utf32to8(std::begin(u32), std::end(u32), std::back_inserter(u8)); 153 | return u8; 154 | } 155 | 156 | inline 157 | u16string cpputf8_u32_to_u16(const u32string& u32) 158 | { 159 | // As above, we need to do this in two steps 160 | const string u8 = cpputf8_u32_to_u8(u32); 161 | return cpputf8_u8_to_u16(u8); 162 | } 163 | 164 | /* 165 | * All six Boost.Locale conversion functions 166 | */ 167 | 168 | inline 169 | u16string boost_u8_to_u16(const string& u8) 170 | { 171 | return boost::locale::conv::utf_to_utf(u8); 172 | } 173 | 174 | inline 175 | u32string boost_u8_to_u32(const string& u8) 176 | { 177 | return boost::locale::conv::utf_to_utf(u8); 178 | } 179 | 180 | inline 181 | string boost_u16_to_u8(const u16string& u16) 182 | { 183 | return boost::locale::conv::utf_to_utf(u16); 184 | } 185 | 186 | inline 187 | u32string boost_u16_to_u32(const u16string& u16) 188 | { 189 | return boost::locale::conv::utf_to_utf(u16); 190 | } 191 | 192 | inline 193 | string boost_u32_to_u8(const u32string& u32) 194 | { 195 | return boost::locale::conv::utf_to_utf(u32); 196 | } 197 | 198 | inline 199 | u16string boost_u32_to_u16(const u32string& u32) 200 | { 201 | return boost::locale::conv::utf_to_utf(u32); 202 | } 203 | 204 | /* 205 | * All six range conversion functions 206 | */ 207 | 208 | inline 209 | u16string range_u8_to_u16(const string& u8) 210 | { 211 | return tcb::utf_ranges::to_u16string(u8); 212 | } 213 | 214 | inline 215 | u32string range_u8_to_u32(const string& u8) 216 | { 217 | return tcb::utf_ranges::to_u32string(u8); 218 | } 219 | 220 | inline 221 | string range_u16_to_u8(const u16string& u16) 222 | { 223 | return tcb::utf_ranges::to_u8string(u16); 224 | } 225 | 226 | inline 227 | u32string range_u16_to_u32(const u16string& u16) 228 | { 229 | return tcb::utf_ranges::to_u32string(u16); 230 | } 231 | 232 | inline 233 | string range_u32_to_u8(const u32string& u32) 234 | { 235 | return tcb::utf_ranges::to_u8string(u32); 236 | } 237 | 238 | 
inline 239 | u16string range_u32_to_u16(const u32string& u32) 240 | { 241 | return tcb::utf_ranges::to_u16string(u32); 242 | } 243 | 244 | /* 245 | * All six range view functions 246 | */ 247 | 248 | inline 249 | u16string range_view_u8_to_u16(const string& u8) 250 | { 251 | return tcb::utf_ranges::view::utf16(u8); 252 | } 253 | 254 | inline 255 | u32string range_view_u8_to_u32(const string& u8) 256 | { 257 | return tcb::utf_ranges::view::utf32(u8); 258 | } 259 | 260 | inline 261 | string range_view_u16_to_u8(const u16string& u16) 262 | { 263 | return tcb::utf_ranges::view::utf8(u16); 264 | } 265 | 266 | inline 267 | u32string range_view_u16_to_u32(const u16string& u16) 268 | { 269 | return tcb::utf_ranges::view::utf32(u16); 270 | } 271 | 272 | inline 273 | string range_view_u32_to_u8(const u32string& u32) 274 | { 275 | return tcb::utf_ranges::view::utf8(u32); 276 | } 277 | 278 | inline 279 | u16string range_view_u32_to_u16(const u32string& u32) 280 | { 281 | return tcb::utf_ranges::view::utf16(u32); 282 | } 283 | 284 | } // end anonymous namespace 285 | 286 | int main(int argc, char** argv) 287 | { 288 | if (argc < 2) { 289 | std::cout << "Usage: benchmark UTF8FILE [ITERATIONS]\n"; 290 | return 1; 291 | } 292 | 293 | const string u8str = [argv] { 294 | std::ifstream f(argv[1]); 295 | return string(std::istreambuf_iterator{f}, 296 | std::istreambuf_iterator{}); 297 | }(); 298 | 299 | const u16string u16str = cpputf8_u8_to_u16(u8str); 300 | const u32string u32str = cpputf8_u8_to_u32(u8str); 301 | 302 | const int num_iterations = argc > 2 ? 
std::atoi(argv[2]) : 1; 303 | 304 | 305 | // UTF-8 to UTF-16 306 | time_function_call(codecvt_u8_to_u16, u8str, num_iterations, "codecvt u8 to u16"); 307 | time_function_call(cpputf8_u8_to_u16, u8str, num_iterations, "cpputf8 u8 to u16"); 308 | time_function_call(boost_u8_to_u16, u8str, num_iterations, "boost u8 to u16"); 309 | time_function_call(range_u8_to_u16, u8str, num_iterations, "range u8 to u16"); 310 | time_function_call(range_view_u8_to_u16, u8str, num_iterations, "range view u8 to u16"); 311 | std::cout << "\n"; 312 | 313 | // UTF-8 to UTF-32 314 | time_function_call(codecvt_u8_to_u32, u8str, num_iterations, "codecvt u8 to u32"); 315 | time_function_call(cpputf8_u8_to_u32, u8str, num_iterations, "cpputf8 u8 to u32"); 316 | time_function_call(boost_u8_to_u32, u8str, num_iterations, "boost u8 to u32"); 317 | time_function_call(range_u8_to_u32, u8str, num_iterations, "range u8 to u32"); 318 | time_function_call(range_view_u8_to_u32, u8str, num_iterations, "range view u8 to u32"); 319 | std::cout << "\n"; 320 | 321 | // UTF-16 to UTF-8 322 | time_function_call(codecvt_u16_to_u8, u16str, num_iterations, "codecvt u16 to u8"); 323 | time_function_call(cpputf8_u16_to_u8, u16str, num_iterations, "cpputf8 u16 to u8"); 324 | time_function_call(boost_u16_to_u8, u16str, num_iterations, "boost u16 to u8"); 325 | time_function_call(range_u16_to_u8, u16str, num_iterations, "range u16 to u8"); 326 | time_function_call(range_view_u16_to_u8, u16str, num_iterations, "range view u16 to u8"); 327 | std::cout << "\n"; 328 | 329 | // UTF-16 to UTF-32 330 | time_function_call(codecvt_u16_to_u32, u16str, num_iterations, "*codecvt u16 to u32"); 331 | time_function_call(cpputf8_u16_to_u32, u16str, num_iterations, "*cpputf8 u16 to u32"); 332 | time_function_call(boost_u16_to_u32, u16str, num_iterations, "boost u16 to u32"); 333 | time_function_call(range_u16_to_u32, u16str, num_iterations, "range u16 to u32"); 334 | time_function_call(range_view_u16_to_u32, u16str, num_iterations, 
"range view u16 to u32"); 335 | std::cout << "\n"; 336 | 337 | // UTF-32 to UTF-8 338 | time_function_call(codecvt_u32_to_u8, u32str, num_iterations, "codecvt u32 to u8"); 339 | time_function_call(cpputf8_u32_to_u8, u32str, num_iterations, "cpputf8 u32 to u8"); 340 | time_function_call(boost_u32_to_u8, u32str, num_iterations, "boost u32 to u8"); 341 | time_function_call(range_u32_to_u8, u32str, num_iterations, "range u32 to u8"); 342 | time_function_call(range_view_u32_to_u8, u32str, num_iterations, "range view u32 to u8"); 343 | std::cout << "\n"; 344 | 345 | // UTF-32 to UTF-16 346 | time_function_call(codecvt_u32_to_u16, u32str, num_iterations, "*codecvt u32 to u16"); 347 | time_function_call(cpputf8_u32_to_u16, u32str, num_iterations, "*cpputf8 u32 to u16"); 348 | time_function_call(boost_u32_to_u16, u32str, num_iterations, "boost u32 to u16"); 349 | time_function_call(range_u32_to_u16, u32str, num_iterations, "range u32 to u16"); 350 | time_function_call(range_view_u32_to_u16, u32str, num_iterations, "range view u32 to u16"); 351 | } -------------------------------------------------------------------------------- /benchmark/utf8.h: -------------------------------------------------------------------------------- 1 | // Copyright 2006 Nemanja Trifunovic 2 | 3 | /* 4 | Permission is hereby granted, free of charge, to any person or organization 5 | obtaining a copy of the software and accompanying documentation covered by 6 | this license (the "Software") to use, reproduce, display, distribute, 7 | execute, and transmit the Software, and to prepare derivative works of the 8 | Software, and to permit third-parties to whom the Software is furnished to 9 | do so, all subject to the following: 10 | 11 | The copyright notices in the Software and this entire statement, including 12 | the above license grant, this restriction and the following disclaimer, 13 | must be included in all copies of the Software, in whole or in part, and 14 | all derivative works of the 
Software, unless such copies or derivative 15 | works are solely in the form of machine-executable object code generated by 16 | a source language processor. 17 | 18 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 19 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 20 | FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT 21 | SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE 22 | FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, 23 | ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 24 | DEALINGS IN THE SOFTWARE. 25 | */ 26 | 27 | 28 | #ifndef UTF8_FOR_CPP_2675DCD0_9480_4c0c_B92A_CC14C027B731 29 | #define UTF8_FOR_CPP_2675DCD0_9480_4c0c_B92A_CC14C027B731 30 | 31 | #include "utf8/checked.h" 32 | #include "utf8/unchecked.h" 33 | 34 | #endif // header guard 35 | -------------------------------------------------------------------------------- /benchmark/utf8/checked.h: -------------------------------------------------------------------------------- 1 | // Copyright 2006 Nemanja Trifunovic 2 | 3 | /* 4 | Permission is hereby granted, free of charge, to any person or organization 5 | obtaining a copy of the software and accompanying documentation covered by 6 | this license (the "Software") to use, reproduce, display, distribute, 7 | execute, and transmit the Software, and to prepare derivative works of the 8 | Software, and to permit third-parties to whom the Software is furnished to 9 | do so, all subject to the following: 10 | 11 | The copyright notices in the Software and this entire statement, including 12 | the above license grant, this restriction and the following disclaimer, 13 | must be included in all copies of the Software, in whole or in part, and 14 | all derivative works of the Software, unless such copies or derivative 15 | works are solely in the form of machine-executable object code generated by 16 | a 
source language processor. 17 | 18 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 19 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 20 | FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT 21 | SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE 22 | FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, 23 | ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 24 | DEALINGS IN THE SOFTWARE. 25 | */ 26 | 27 | 28 | #ifndef UTF8_FOR_CPP_CHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731 29 | #define UTF8_FOR_CPP_CHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731 30 | 31 | #include "core.h" 32 | #include 33 | 34 | namespace utf8 35 | { 36 | // Base for the exceptions that may be thrown from the library 37 | class exception : public ::std::exception { 38 | }; 39 | 40 | // Exceptions that may be thrown from the library functions. 41 | class invalid_code_point : public exception { 42 | uint32_t cp; 43 | public: 44 | invalid_code_point(uint32_t cp) : cp(cp) {} 45 | virtual const char* what() const throw() { return "Invalid code point"; } 46 | uint32_t code_point() const {return cp;} 47 | }; 48 | 49 | class invalid_utf8 : public exception { 50 | uint8_t u8; 51 | public: 52 | invalid_utf8 (uint8_t u) : u8(u) {} 53 | virtual const char* what() const throw() { return "Invalid UTF-8"; } 54 | uint8_t utf8_octet() const {return u8;} 55 | }; 56 | 57 | class invalid_utf16 : public exception { 58 | uint16_t u16; 59 | public: 60 | invalid_utf16 (uint16_t u) : u16(u) {} 61 | virtual const char* what() const throw() { return "Invalid UTF-16"; } 62 | uint16_t utf16_word() const {return u16;} 63 | }; 64 | 65 | class not_enough_room : public exception { 66 | public: 67 | virtual const char* what() const throw() { return "Not enough space"; } 68 | }; 69 | 70 | /// The library API - functions intended to be called by the users 71 | 72 | template 73 | 
octet_iterator append(uint32_t cp, octet_iterator result) 74 | { 75 | if (!utf8::internal::is_code_point_valid(cp)) 76 | throw invalid_code_point(cp); 77 | 78 | if (cp < 0x80) // one octet 79 | *(result++) = static_cast(cp); 80 | else if (cp < 0x800) { // two octets 81 | *(result++) = static_cast((cp >> 6) | 0xc0); 82 | *(result++) = static_cast((cp & 0x3f) | 0x80); 83 | } 84 | else if (cp < 0x10000) { // three octets 85 | *(result++) = static_cast((cp >> 12) | 0xe0); 86 | *(result++) = static_cast(((cp >> 6) & 0x3f) | 0x80); 87 | *(result++) = static_cast((cp & 0x3f) | 0x80); 88 | } 89 | else { // four octets 90 | *(result++) = static_cast((cp >> 18) | 0xf0); 91 | *(result++) = static_cast(((cp >> 12) & 0x3f) | 0x80); 92 | *(result++) = static_cast(((cp >> 6) & 0x3f) | 0x80); 93 | *(result++) = static_cast((cp & 0x3f) | 0x80); 94 | } 95 | return result; 96 | } 97 | 98 | template 99 | output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out, uint32_t replacement) 100 | { 101 | while (start != end) { 102 | octet_iterator sequence_start = start; 103 | internal::utf_error err_code = utf8::internal::validate_next(start, end); 104 | switch (err_code) { 105 | case internal::UTF8_OK : 106 | for (octet_iterator it = sequence_start; it != start; ++it) 107 | *out++ = *it; 108 | break; 109 | case internal::NOT_ENOUGH_ROOM: 110 | throw not_enough_room(); 111 | case internal::INVALID_LEAD: 112 | out = utf8::append (replacement, out); 113 | ++start; 114 | break; 115 | case internal::INCOMPLETE_SEQUENCE: 116 | case internal::OVERLONG_SEQUENCE: 117 | case internal::INVALID_CODE_POINT: 118 | out = utf8::append (replacement, out); 119 | ++start; 120 | // just one replacement mark for the sequence 121 | while (start != end && utf8::internal::is_trail(*start)) 122 | ++start; 123 | break; 124 | } 125 | } 126 | return out; 127 | } 128 | 129 | template 130 | inline output_iterator replace_invalid(octet_iterator start, octet_iterator end, 
output_iterator out) 131 | { 132 | static const uint32_t replacement_marker = utf8::internal::mask16(0xfffd); 133 | return utf8::replace_invalid(start, end, out, replacement_marker); 134 | } 135 | 136 | template 137 | uint32_t next(octet_iterator& it, octet_iterator end) 138 | { 139 | uint32_t cp = 0; 140 | internal::utf_error err_code = utf8::internal::validate_next(it, end, cp); 141 | switch (err_code) { 142 | case internal::UTF8_OK : 143 | break; 144 | case internal::NOT_ENOUGH_ROOM : 145 | throw not_enough_room(); 146 | case internal::INVALID_LEAD : 147 | case internal::INCOMPLETE_SEQUENCE : 148 | case internal::OVERLONG_SEQUENCE : 149 | throw invalid_utf8(*it); 150 | case internal::INVALID_CODE_POINT : 151 | throw invalid_code_point(cp); 152 | } 153 | return cp; 154 | } 155 | 156 | template 157 | uint32_t peek_next(octet_iterator it, octet_iterator end) 158 | { 159 | return utf8::next(it, end); 160 | } 161 | 162 | template 163 | uint32_t prior(octet_iterator& it, octet_iterator start) 164 | { 165 | // can't do much if it == start 166 | if (it == start) 167 | throw not_enough_room(); 168 | 169 | octet_iterator end = it; 170 | // Go back until we hit either a lead octet or start 171 | while (utf8::internal::is_trail(*(--it))) 172 | if (it == start) 173 | throw invalid_utf8(*it); // error - no lead byte in the sequence 174 | return utf8::peek_next(it, end); 175 | } 176 | 177 | /// Deprecated in versions that include "prior" 178 | template 179 | uint32_t previous(octet_iterator& it, octet_iterator pass_start) 180 | { 181 | octet_iterator end = it; 182 | while (utf8::internal::is_trail(*(--it))) 183 | if (it == pass_start) 184 | throw invalid_utf8(*it); // error - no lead byte in the sequence 185 | octet_iterator temp = it; 186 | return utf8::next(temp, end); 187 | } 188 | 189 | template 190 | void advance (octet_iterator& it, distance_type n, octet_iterator end) 191 | { 192 | for (distance_type i = 0; i < n; ++i) 193 | utf8::next(it, end); 194 | } 195 | 196 | 
template 197 | typename std::iterator_traits::difference_type 198 | distance (octet_iterator first, octet_iterator last) 199 | { 200 | typename std::iterator_traits::difference_type dist; 201 | for (dist = 0; first < last; ++dist) 202 | utf8::next(first, last); 203 | return dist; 204 | } 205 | 206 | template 207 | octet_iterator utf16to8 (u16bit_iterator start, u16bit_iterator end, octet_iterator result) 208 | { 209 | while (start != end) { 210 | uint32_t cp = utf8::internal::mask16(*start++); 211 | // Take care of surrogate pairs first 212 | if (utf8::internal::is_lead_surrogate(cp)) { 213 | if (start != end) { 214 | uint32_t trail_surrogate = utf8::internal::mask16(*start++); 215 | if (utf8::internal::is_trail_surrogate(trail_surrogate)) 216 | cp = (cp << 10) + trail_surrogate + internal::SURROGATE_OFFSET; 217 | else 218 | throw invalid_utf16(static_cast(trail_surrogate)); 219 | } 220 | else 221 | throw invalid_utf16(static_cast(cp)); 222 | 223 | } 224 | // Lone trail surrogate 225 | else if (utf8::internal::is_trail_surrogate(cp)) 226 | throw invalid_utf16(static_cast(cp)); 227 | 228 | result = utf8::append(cp, result); 229 | } 230 | return result; 231 | } 232 | 233 | template 234 | u16bit_iterator utf8to16 (octet_iterator start, octet_iterator end, u16bit_iterator result) 235 | { 236 | while (start < end) { 237 | uint32_t cp = utf8::next(start, end); 238 | if (cp > 0xffff) { //make a surrogate pair 239 | *result++ = static_cast((cp >> 10) + internal::LEAD_OFFSET); 240 | *result++ = static_cast((cp & 0x3ff) + internal::TRAIL_SURROGATE_MIN); 241 | } 242 | else 243 | *result++ = static_cast(cp); 244 | } 245 | return result; 246 | } 247 | 248 | template 249 | octet_iterator utf32to8 (u32bit_iterator start, u32bit_iterator end, octet_iterator result) 250 | { 251 | while (start != end) 252 | result = utf8::append(*(start++), result); 253 | 254 | return result; 255 | } 256 | 257 | template 258 | u32bit_iterator utf8to32 (octet_iterator start, octet_iterator end, 
u32bit_iterator result) 259 | { 260 | while (start < end) 261 | (*result++) = utf8::next(start, end); 262 | 263 | return result; 264 | } 265 | 266 | // The iterator class 267 | template 268 | class iterator : public std::iterator { 269 | octet_iterator it; 270 | octet_iterator range_start; 271 | octet_iterator range_end; 272 | public: 273 | iterator () {} 274 | explicit iterator (const octet_iterator& octet_it, 275 | const octet_iterator& range_start, 276 | const octet_iterator& range_end) : 277 | it(octet_it), range_start(range_start), range_end(range_end) 278 | { 279 | if (it < range_start || it > range_end) 280 | throw std::out_of_range("Invalid utf-8 iterator position"); 281 | } 282 | // the default "big three" are OK 283 | octet_iterator base () const { return it; } 284 | uint32_t operator * () const 285 | { 286 | octet_iterator temp = it; 287 | return utf8::next(temp, range_end); 288 | } 289 | bool operator == (const iterator& rhs) const 290 | { 291 | if (range_start != rhs.range_start || range_end != rhs.range_end) 292 | throw std::logic_error("Comparing utf-8 iterators defined with different ranges"); 293 | return (it == rhs.it); 294 | } 295 | bool operator != (const iterator& rhs) const 296 | { 297 | return !(operator == (rhs)); 298 | } 299 | iterator& operator ++ () 300 | { 301 | utf8::next(it, range_end); 302 | return *this; 303 | } 304 | iterator operator ++ (int) 305 | { 306 | iterator temp = *this; 307 | utf8::next(it, range_end); 308 | return temp; 309 | } 310 | iterator& operator -- () 311 | { 312 | utf8::prior(it, range_start); 313 | return *this; 314 | } 315 | iterator operator -- (int) 316 | { 317 | iterator temp = *this; 318 | utf8::prior(it, range_start); 319 | return temp; 320 | } 321 | }; // class iterator 322 | 323 | } // namespace utf8 324 | 325 | #endif //header guard 326 | 327 | 328 | -------------------------------------------------------------------------------- /benchmark/utf8/core.h: 
// --------------------------------------------------------------------------------
// Copyright 2006 Nemanja Trifunovic

/*
Permission is hereby granted, free of charge, to any person or organization
obtaining a copy of the software and accompanying documentation covered by
this license (the "Software") to use, reproduce, display, distribute,
execute, and transmit the Software, and to prepare derivative works of the
Software, and to permit third-parties to whom the Software is furnished to
do so, all subject to the following:

The copyright notices in the Software and this entire statement, including
the above license grant, this restriction and the following disclaimer,
must be included in all copies of the Software, in whole or in part, and
all derivative works of the Software, unless such copies or derivative
works are solely in the form of machine-executable object code generated by
a source language processor.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE.
*/


#ifndef UTF8_FOR_CPP_CORE_H_2675DCD0_9480_4c0c_B92A_CC14C027B731
#define UTF8_FOR_CPP_CORE_H_2675DCD0_9480_4c0c_B92A_CC14C027B731

#include <iterator>

namespace utf8
{
    // The typedefs for 8-bit, 16-bit and 32-bit unsigned integers
    // You may need to change them to match your system.
    // These typedefs have the same names as ones from cstdint, or boost/cstdint
    typedef unsigned char   uint8_t;
    typedef unsigned short  uint16_t;
    typedef unsigned int    uint32_t;

// Helper code - not intended to be directly called by the library users. May be changed at any time
namespace internal
{
    // Unicode constants
    // Leading (high) surrogates: 0xd800 - 0xdbff
    // Trailing (low) surrogates: 0xdc00 - 0xdfff
    const uint16_t LEAD_SURROGATE_MIN  = 0xd800u;
    const uint16_t LEAD_SURROGATE_MAX  = 0xdbffu;
    const uint16_t TRAIL_SURROGATE_MIN = 0xdc00u;
    const uint16_t TRAIL_SURROGATE_MAX = 0xdfffu;
    const uint16_t LEAD_OFFSET         = LEAD_SURROGATE_MIN - (0x10000 >> 10);
    const uint32_t SURROGATE_OFFSET    = 0x10000u - (LEAD_SURROGATE_MIN << 10) - TRAIL_SURROGATE_MIN;

    // Maximum valid value for a Unicode code point
    const uint32_t CODE_POINT_MAX      = 0x0010ffffu;

    /// Returns the low 8 bits of `oc` (guards against sign extension of char).
    template<typename octet_type>
    inline uint8_t mask8(octet_type oc)
    {
        return static_cast<uint8_t>(0xff & oc);
    }
    /// Returns the low 16 bits of `oc`.
    template<typename u16_type>
    inline uint16_t mask16(u16_type oc)
    {
        return static_cast<uint16_t>(0xffff & oc);
    }
    /// True when `oc` has the 10xxxxxx bit pattern of a UTF-8 trail octet.
    template<typename octet_type>
    inline bool is_trail(octet_type oc)
    {
        return ((utf8::internal::mask8(oc) >> 6) == 0x2);
    }

    template <typename u16>
    inline bool is_lead_surrogate(u16 cp)
    {
        return (cp >= LEAD_SURROGATE_MIN && cp <= LEAD_SURROGATE_MAX);
    }

    template <typename u16>
    inline bool is_trail_surrogate(u16 cp)
    {
        return (cp >= TRAIL_SURROGATE_MIN && cp <= TRAIL_SURROGATE_MAX);
    }

    template <typename u16>
    inline bool is_surrogate(u16 cp)
    {
        return (cp >= LEAD_SURROGATE_MIN && cp <= TRAIL_SURROGATE_MAX);
    }

    /// True when `cp` is at most CODE_POINT_MAX and is not a surrogate.
    template <typename u32>
    inline bool is_code_point_valid(u32 cp)
    {
        return (cp <= CODE_POINT_MAX && !utf8::internal::is_surrogate(cp));
    }

    /// Returns the sequence length (1-4) implied by the lead octet at
    /// `lead_it`, or 0 when that octet cannot start a sequence.
    template <typename octet_iterator>
    inline typename std::iterator_traits<octet_iterator>::difference_type
    sequence_length(octet_iterator lead_it)
    {
        uint8_t lead = utf8::internal::mask8(*lead_it);
        if (lead < 0x80)
            return 1;
        else if ((lead >> 5) == 0x6)
            return 2;
        else if ((lead >> 4) == 0xe)
            return 3;
        else if ((lead >> 3) == 0x1e)
            return 4;
        else
            return 0;
    }

    /// True when `cp` was encoded with more octets (`length`) than necessary.
    template <typename octet_difference_type>
    inline bool is_overlong_sequence(uint32_t cp, octet_difference_type length)
    {
        if (cp < 0x80) {
            if (length != 1)
                return true;
        }
        else if (cp < 0x800) {
            if (length != 2)
                return true;
        }
        else if (cp < 0x10000) {
            if (length != 3)
                return true;
        }

        return false;
    }

    enum utf_error {UTF8_OK, NOT_ENOUGH_ROOM, INVALID_LEAD, INCOMPLETE_SEQUENCE, OVERLONG_SEQUENCE, INVALID_CODE_POINT};

    /// Helper for get_sequence_x
    /// Steps to the next octet; fails when the range ends or the octet is not a trail.
    template <typename octet_iterator>
    utf_error increase_safely(octet_iterator& it, octet_iterator end)
    {
        if (++it == end)
            return NOT_ENOUGH_ROOM;

        if (!utf8::internal::is_trail(*it))
            return INCOMPLETE_SEQUENCE;

        return UTF8_OK;
    }

    #define UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(IT, END) {utf_error ret = increase_safely(IT, END); if (ret != UTF8_OK) return ret;}

    /// get_sequence_x functions decode utf-8 sequences of the length x
    /// On success `it` is left on the LAST octet of the sequence.
    template <typename octet_iterator>
    utf_error get_sequence_1(octet_iterator& it, octet_iterator end, uint32_t& code_point)
    {
        if (it == end)
            return NOT_ENOUGH_ROOM;

        code_point = utf8::internal::mask8(*it);

        return UTF8_OK;
    }

    template <typename octet_iterator>
    utf_error get_sequence_2(octet_iterator& it, octet_iterator end, uint32_t& code_point)
    {
        if (it == end)
            return NOT_ENOUGH_ROOM;

        code_point = utf8::internal::mask8(*it);

        UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end)

        code_point = ((code_point << 6) & 0x7ff) + ((*it) & 0x3f);

        return UTF8_OK;
    }

    template <typename octet_iterator>
    utf_error get_sequence_3(octet_iterator& it, octet_iterator end, uint32_t& code_point)
    {
        if (it == end)
            return NOT_ENOUGH_ROOM;

        code_point = utf8::internal::mask8(*it);

        UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end)

        code_point = ((code_point << 12) & 0xffff) + ((utf8::internal::mask8(*it) << 6) & 0xfff);

        UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end)

        code_point += (*it) & 0x3f;

        return UTF8_OK;
    }

    template <typename octet_iterator>
    utf_error get_sequence_4(octet_iterator& it, octet_iterator end, uint32_t& code_point)
    {
        if (it == end)
            return NOT_ENOUGH_ROOM;

        code_point = utf8::internal::mask8(*it);

        UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end)

        code_point = ((code_point << 18) & 0x1fffff) + ((utf8::internal::mask8(*it) << 12) & 0x3ffff);

        UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end)

        code_point += (utf8::internal::mask8(*it) << 6) & 0xfff;

        UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end)

        code_point += (*it) & 0x3f;

        return UTF8_OK;
    }

    #undef UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR

    /// Validates and decodes one sequence starting at `it`.
    /// On success stores the code point, moves `it` past the sequence and
    /// returns UTF8_OK. On failure `it` is restored and `code_point` is untouched.
    template <typename octet_iterator>
    utf_error validate_next(octet_iterator& it, octet_iterator end, uint32_t& code_point)
    {
        // An empty range has no lead octet to inspect; sequence_length() below
        // would otherwise dereference the end iterator.
        if (it == end)
            return NOT_ENOUGH_ROOM;

        // Save the original value of it so we can go back in case of failure
        // Of course, it does not make much sense with e.g. stream iterators
        octet_iterator original_it = it;

        uint32_t cp = 0;
        // Determine the sequence length based on the lead octet
        typedef typename std::iterator_traits<octet_iterator>::difference_type octet_difference_type;
        const octet_difference_type length = utf8::internal::sequence_length(it);

        // Get trail octets and calculate the code point
        utf_error err = UTF8_OK;
        switch (length) {
            case 0:
                return INVALID_LEAD;
            case 1:
                err = utf8::internal::get_sequence_1(it, end, cp);
                break;
            case 2:
                err = utf8::internal::get_sequence_2(it, end, cp);
            break;
            case 3:
                err = utf8::internal::get_sequence_3(it, end, cp);
            break;
            case 4:
                err = utf8::internal::get_sequence_4(it, end, cp);
            break;
        }

        if (err == UTF8_OK) {
            // Decoding succeeded. Now, security checks...
            if (utf8::internal::is_code_point_valid(cp)) {
                if (!utf8::internal::is_overlong_sequence(cp, length)){
                    // Passed! Return here.
                    code_point = cp;
                    ++it;   // step off the last octet of the sequence
                    return UTF8_OK;
                }
                else
                    err = OVERLONG_SEQUENCE;
            }
            else
                err = INVALID_CODE_POINT;
        }

        // Failure branch - restore the original value of the iterator
        it = original_it;
        return err;
    }

    /// Overload that validates one sequence and discards the decoded value.
    template <typename octet_iterator>
    inline utf_error validate_next(octet_iterator& it, octet_iterator end) {
        uint32_t ignored;
        return utf8::internal::validate_next(it, end, ignored);
    }

} // namespace internal

    /// The library API - functions intended to be called by the users

    // Byte order mark
    const uint8_t bom[] = {0xef, 0xbb, 0xbf};

    /// Returns an iterator to the first invalid octet in [start, end), or `end`.
    template <typename octet_iterator>
    octet_iterator find_invalid(octet_iterator start, octet_iterator end)
    {
        octet_iterator result = start;
        while (result != end) {
            utf8::internal::utf_error err_code = utf8::internal::validate_next(result, end);
            if (err_code != internal::UTF8_OK)
                return result;
        }
        return result;
    }

    /// True when [start, end) is entirely valid UTF-8.
    template <typename octet_iterator>
    inline bool is_valid(octet_iterator start, octet_iterator end)
    {
        return (utf8::find_invalid(start, end) == end);
    }

    /// True when [it, end) begins with the three-octet UTF-8 byte order mark.
    template <typename octet_iterator>
    inline bool starts_with_bom (octet_iterator it, octet_iterator end)
    {
        return (
            ((it != end) && (utf8::internal::mask8(*it++)) == bom[0]) &&
            ((it != end) && (utf8::internal::mask8(*it++)) == bom[1]) &&
            ((it != end) && (utf8::internal::mask8(*it))   == bom[2])
           );
    }

    //Deprecated in release 2.3
    /// Unbounded variant of starts_with_bom: reads up to three octets with no end check.
    template <typename octet_iterator>
    inline bool is_bom (octet_iterator it)
    {
        return (
            (utf8::internal::mask8(*it++)) == bom[0] &&
            (utf8::internal::mask8(*it++)) == bom[1] &&
            (utf8::internal::mask8(*it))   == bom[2]
           );
    }
} // namespace utf8

#endif // header guard


// --------------------------------------------------------------------------------
/benchmark/utf8/unchecked.h: -------------------------------------------------------------------------------- 1 | // Copyright 2006 Nemanja Trifunovic 2 | 3 | /* 4 | Permission is hereby granted, free of charge, to any person or organization 5 | obtaining a copy of the software and accompanying documentation covered by 6 | this license (the "Software") to use, reproduce, display, distribute, 7 | execute, and transmit the Software, and to prepare derivative works of the 8 | Software, and to permit third-parties to whom the Software is furnished to 9 | do so, all subject to the following: 10 | 11 | The copyright notices in the Software and this entire statement, including 12 | the above license grant, this restriction and the following disclaimer, 13 | must be included in all copies of the Software, in whole or in part, and 14 | all derivative works of the Software, unless such copies or derivative 15 | works are solely in the form of machine-executable object code generated by 16 | a source language processor. 17 | 18 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 19 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 20 | FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT 21 | SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE 22 | FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, 23 | ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 24 | DEALINGS IN THE SOFTWARE. 
25 | */ 26 | 27 | 28 | #ifndef UTF8_FOR_CPP_UNCHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731 29 | #define UTF8_FOR_CPP_UNCHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731 30 | 31 | #include "core.h" 32 | 33 | namespace utf8 34 | { 35 | namespace unchecked 36 | { 37 | template 38 | octet_iterator append(uint32_t cp, octet_iterator result) 39 | { 40 | if (cp < 0x80) // one octet 41 | *(result++) = static_cast(cp); 42 | else if (cp < 0x800) { // two octets 43 | *(result++) = static_cast((cp >> 6) | 0xc0); 44 | *(result++) = static_cast((cp & 0x3f) | 0x80); 45 | } 46 | else if (cp < 0x10000) { // three octets 47 | *(result++) = static_cast((cp >> 12) | 0xe0); 48 | *(result++) = static_cast(((cp >> 6) & 0x3f) | 0x80); 49 | *(result++) = static_cast((cp & 0x3f) | 0x80); 50 | } 51 | else { // four octets 52 | *(result++) = static_cast((cp >> 18) | 0xf0); 53 | *(result++) = static_cast(((cp >> 12) & 0x3f)| 0x80); 54 | *(result++) = static_cast(((cp >> 6) & 0x3f) | 0x80); 55 | *(result++) = static_cast((cp & 0x3f) | 0x80); 56 | } 57 | return result; 58 | } 59 | 60 | template 61 | uint32_t next(octet_iterator& it) 62 | { 63 | uint32_t cp = utf8::internal::mask8(*it); 64 | typename std::iterator_traits::difference_type length = utf8::internal::sequence_length(it); 65 | switch (length) { 66 | case 1: 67 | break; 68 | case 2: 69 | it++; 70 | cp = ((cp << 6) & 0x7ff) + ((*it) & 0x3f); 71 | break; 72 | case 3: 73 | ++it; 74 | cp = ((cp << 12) & 0xffff) + ((utf8::internal::mask8(*it) << 6) & 0xfff); 75 | ++it; 76 | cp += (*it) & 0x3f; 77 | break; 78 | case 4: 79 | ++it; 80 | cp = ((cp << 18) & 0x1fffff) + ((utf8::internal::mask8(*it) << 12) & 0x3ffff); 81 | ++it; 82 | cp += (utf8::internal::mask8(*it) << 6) & 0xfff; 83 | ++it; 84 | cp += (*it) & 0x3f; 85 | break; 86 | } 87 | ++it; 88 | return cp; 89 | } 90 | 91 | template 92 | uint32_t peek_next(octet_iterator it) 93 | { 94 | return utf8::unchecked::next(it); 95 | } 96 | 97 | template 98 | uint32_t prior(octet_iterator& it) 99 | { 
100 | while (utf8::internal::is_trail(*(--it))) ; 101 | octet_iterator temp = it; 102 | return utf8::unchecked::next(temp); 103 | } 104 | 105 | // Deprecated in versions that include prior, but only for the sake of consistency (see utf8::previous) 106 | template 107 | inline uint32_t previous(octet_iterator& it) 108 | { 109 | return utf8::unchecked::prior(it); 110 | } 111 | 112 | template 113 | void advance (octet_iterator& it, distance_type n) 114 | { 115 | for (distance_type i = 0; i < n; ++i) 116 | utf8::unchecked::next(it); 117 | } 118 | 119 | template 120 | typename std::iterator_traits::difference_type 121 | distance (octet_iterator first, octet_iterator last) 122 | { 123 | typename std::iterator_traits::difference_type dist; 124 | for (dist = 0; first < last; ++dist) 125 | utf8::unchecked::next(first); 126 | return dist; 127 | } 128 | 129 | template 130 | octet_iterator utf16to8 (u16bit_iterator start, u16bit_iterator end, octet_iterator result) 131 | { 132 | while (start != end) { 133 | uint32_t cp = utf8::internal::mask16(*start++); 134 | // Take care of surrogate pairs first 135 | if (utf8::internal::is_lead_surrogate(cp)) { 136 | uint32_t trail_surrogate = utf8::internal::mask16(*start++); 137 | cp = (cp << 10) + trail_surrogate + internal::SURROGATE_OFFSET; 138 | } 139 | result = utf8::unchecked::append(cp, result); 140 | } 141 | return result; 142 | } 143 | 144 | template 145 | u16bit_iterator utf8to16 (octet_iterator start, octet_iterator end, u16bit_iterator result) 146 | { 147 | while (start < end) { 148 | uint32_t cp = utf8::unchecked::next(start); 149 | if (cp > 0xffff) { //make a surrogate pair 150 | *result++ = static_cast((cp >> 10) + internal::LEAD_OFFSET); 151 | *result++ = static_cast((cp & 0x3ff) + internal::TRAIL_SURROGATE_MIN); 152 | } 153 | else 154 | *result++ = static_cast(cp); 155 | } 156 | return result; 157 | } 158 | 159 | template 160 | octet_iterator utf32to8 (u32bit_iterator start, u32bit_iterator end, octet_iterator result) 161 
| { 162 | while (start != end) 163 | result = utf8::unchecked::append(*(start++), result); 164 | 165 | return result; 166 | } 167 | 168 | template 169 | u32bit_iterator utf8to32 (octet_iterator start, octet_iterator end, u32bit_iterator result) 170 | { 171 | while (start < end) 172 | (*result++) = utf8::unchecked::next(start); 173 | 174 | return result; 175 | } 176 | 177 | // The iterator class 178 | template 179 | class iterator : public std::iterator { 180 | octet_iterator it; 181 | public: 182 | iterator () {} 183 | explicit iterator (const octet_iterator& octet_it): it(octet_it) {} 184 | // the default "big three" are OK 185 | octet_iterator base () const { return it; } 186 | uint32_t operator * () const 187 | { 188 | octet_iterator temp = it; 189 | return utf8::unchecked::next(temp); 190 | } 191 | bool operator == (const iterator& rhs) const 192 | { 193 | return (it == rhs.it); 194 | } 195 | bool operator != (const iterator& rhs) const 196 | { 197 | return !(operator == (rhs)); 198 | } 199 | iterator& operator ++ () 200 | { 201 | ::std::advance(it, utf8::internal::sequence_length(it)); 202 | return *this; 203 | } 204 | iterator operator ++ (int) 205 | { 206 | iterator temp = *this; 207 | ::std::advance(it, utf8::internal::sequence_length(it)); 208 | return temp; 209 | } 210 | iterator& operator -- () 211 | { 212 | utf8::unchecked::prior(it); 213 | return *this; 214 | } 215 | iterator operator -- (int) 216 | { 217 | iterator temp = *this; 218 | utf8::unchecked::prior(it); 219 | return temp; 220 | } 221 | }; // class iterator 222 | 223 | } // namespace utf8::unchecked 224 | } // namespace utf8 225 | 226 | 227 | #endif // header guard 228 | 229 | -------------------------------------------------------------------------------- /example/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | 2 | add_executable(utf8_to_utf16be utf8_to_utf16be.cpp) 3 | target_include_directories(utf8_to_utf16be PRIVATE 4 | 
${RANGE_INCLUDE_DIR} 5 | ${Boost_INCLUDE_DIR}) -------------------------------------------------------------------------------- /example/utf8_to_utf16be.cpp: -------------------------------------------------------------------------------- 1 | 2 | // Copyright (c) 2016 Tristan Brindle (tcbrindle at gmail dot com) 3 | // Distributed under the Boost Software License, Version 1.0. (See accompanying 4 | // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) 5 | 6 | #include 7 | #include 8 | 9 | #include 10 | #include 11 | #include 12 | 13 | namespace rng = ::ranges::v3; 14 | namespace utf = ::tcb::utf_ranges; 15 | 16 | int main(int argc, char** argv) 17 | { 18 | if (argc < 3) { 19 | std::cout << "Usage:\n" 20 | << "utf8_to_utf16be INFILE OUTFILE\n" 21 | << "\n" 22 | << "Converts a UTF-8 encoded file to big-endian UTF-16.\n"; 23 | return 1; 24 | } 25 | 26 | std::ifstream in_file{argv[1], std::ios::binary}; 27 | std::ofstream out_file{argv[2], std::ios::binary}; 28 | 29 | auto view = utf::istreambuf(in_file) // Read range from input stream 30 | | utf::view::consume_bom // Remove UTF-8 "BOM" if present 31 | | utf::view::utf16 // Convert to UTF-16 32 | | utf::view::add_bom // Prepend UTF-16 BOM to start of range 33 | | utf::view::endian_convert // Convert to big-endian 34 | | utf::view::bytes; // Write to disk as bytes 35 | 36 | rng::copy(view, utf::ostreambuf_iterator{out_file}); 37 | } -------------------------------------------------------------------------------- /include/tcb/utf_ranges/convert.hpp: -------------------------------------------------------------------------------- 1 | 2 | // Copyright (c) 2016 Tristan Brindle (tcbrindle at gmail dot com) 3 | // Distributed under the Boost Software License, Version 1.0. 
(See accompanying 4 | // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) 5 | 6 | #ifndef TCB_UTF_RANGES_CONVERT_HPP_INCLUDED 7 | #define TCB_UTF_RANGES_CONVERT_HPP_INCLUDED 8 | 9 | #include 10 | 11 | #include 12 | 13 | #include 14 | 15 | namespace tcb { 16 | namespace utf_ranges { 17 | 18 | namespace rng = ::ranges::v3; 19 | 20 | template ::value_type> 24 | OutIter utf_convert(InIter first, Sentinel last, OutIter out) 25 | { 26 | while (first != last) { 27 | const char32_t c = detail::utf_traits::decode(first, last); 28 | detail::utf_traits::encode(c, out); 29 | } 30 | return out; 31 | } 32 | 33 | template , 37 | CONCEPT_REQUIRES_(rng::ForwardRange())> 38 | OutIter utf_convert(InRange&& range, OutIter out) 39 | { 40 | return utf_convert, 41 | rng::range_sentinel_t, OutIter, InCharT>( 42 | rng::begin(range), rng::end(range), std::move(out)); 43 | } 44 | 45 | template > 47 | std::basic_string 48 | to_utf_string(Range&& range) 49 | { 50 | using string_type = std::basic_string; 51 | 52 | string_type output; 53 | 54 | // Try to minimise the number of reallocations 55 | if /*constexpr*/ (::ranges::RandomAccessRange()) { 56 | output.reserve(::ranges::size(range)); 57 | } 58 | 59 | utf_convert(std::forward(range), std::back_inserter(output)); 60 | 61 | return output; 62 | } 63 | 64 | template 65 | std::string to_u8string(Range&& range) 66 | { 67 | return to_utf_string(std::forward(range)); 68 | } 69 | 70 | template 71 | std::u16string to_u16string(Range&& range) 72 | { 73 | return to_utf_string(std::forward(range)); 74 | } 75 | 76 | template 77 | std::u32string to_u32string(Range&& range) 78 | { 79 | return to_utf_string(std::forward(range)); 80 | } 81 | 82 | template 83 | std::wstring to_wstsring(Range&& range) 84 | { 85 | return to_utf_string(std::forward(range)); 86 | } 87 | 88 | } // end namespace utf_ranges 89 | } // end namespace tcb 90 | 91 | #endif // TCB_UTF_RANGES_CONVERT_HPP_INCLUDED 92 | 
-------------------------------------------------------------------------------- /include/tcb/utf_ranges/detail/utf.hpp: -------------------------------------------------------------------------------- 1 | // 2 | // This file is based on utf.hpp from Boost.Locale 3 | // Copyright (c) 2009-2011 Artyom Beilis (Tonkikh) 4 | // 5 | // Modifications (c) 2016 Tristan Brindle 6 | // 7 | // Distributed under the Boost Software License, Version 1.0. (See 8 | // accompanying file LICENSE_1_0.txt or copy at 9 | // http://www.boost.org/LICENSE_1_0.txt) 10 | // 11 | #ifndef TCB_UTF_RANGES_DETAIL_UTF_HPP_INCLUDED 12 | #define TCB_UTF_RANGES_DETAIL_UTF_HPP_INCLUDED 13 | 14 | #include 15 | 16 | namespace tcb { 17 | namespace utf_ranges { 18 | namespace detail { 19 | 20 | /// \cond INTERNAL 21 | #ifdef __GNUC__ 22 | # define BOOST_LOCALE_LIKELY(x) __builtin_expect((x),1) 23 | # define BOOST_LOCALE_UNLIKELY(x) __builtin_expect((x),0) 24 | #else 25 | # define BOOST_LOCALE_LIKELY(x) (x) 26 | # define BOOST_LOCALE_UNLIKELY(x) (x) 27 | #endif 28 | /// \endcond 29 | 30 | /// 31 | /// \brief The integral type that can hold a Unicode code point 32 | /// 33 | typedef char32_t code_point; 34 | 35 | /// 36 | /// \brief Special constant that defines illegal code point 37 | /// 38 | static constexpr code_point illegal = 0xFFFFFFFFu; 39 | 40 | /// 41 | /// \brief Special constant that defines incomplete code point 42 | /// 43 | static constexpr code_point incomplete = 0xFFFFFFFEu; 44 | 45 | /// 46 | /// \brief the function checks if \a v is a valid code point 47 | /// 48 | inline constexpr bool is_valid_codepoint(code_point v) 49 | { 50 | if (v > 0x10FFFF) 51 | return false; 52 | if (0xD800 <= v && v <= 0xDFFF) // surrogates 53 | return false; 54 | return true; 55 | } 56 | 57 | template 58 | struct utf_traits; 59 | 60 | template 61 | struct encoded_chars { 62 | public: 63 | constexpr encoded_chars() = default; 64 | 65 | constexpr encoded_chars(CharType _1) 66 | : chars_{{_1}}, size_{1} {} 67 | 
68 | constexpr encoded_chars(CharType _1, CharType _2) 69 | : chars_{{_1, _2}}, size_{2} {} 70 | 71 | constexpr encoded_chars(CharType _1, CharType _2, CharType _3) 72 | : chars_{{_1, _2, _3}}, size_{3} {} 73 | 74 | constexpr encoded_chars(CharType _1, CharType _2, CharType _3, CharType _4) 75 | : chars_{{_1, _2, _3, _4}}, size_{4} {} 76 | 77 | constexpr int size() const noexcept { return size_; } 78 | 79 | constexpr CharType operator[](int i) const noexcept { return chars_[i]; } 80 | 81 | friend constexpr bool operator==(const encoded_chars& lhs, 82 | const encoded_chars& rhs) 83 | { 84 | return std::equal(std::begin(lhs.chars_), 85 | std::begin(lhs.chars_) + lhs.size_, 86 | std::begin(rhs.chars_), 87 | std::begin(rhs.chars_) + rhs.size_); 88 | } 89 | 90 | private: 91 | std::array chars_{{}}; 92 | int size_ = 0; 93 | }; 94 | 95 | template 96 | struct utf_traits { 97 | 98 | typedef CharType char_type; 99 | 100 | static constexpr int trail_length(char_type ci) 101 | { 102 | unsigned char c = ci; 103 | if (c < 128) 104 | return 0; 105 | if (BOOST_LOCALE_UNLIKELY(c < 194)) 106 | return -1; 107 | if (c < 224) 108 | return 1; 109 | if (c < 240) 110 | return 2; 111 | if (BOOST_LOCALE_LIKELY(c <= 244)) 112 | return 3; 113 | return -1; 114 | } 115 | 116 | static constexpr int max_width = 4; 117 | 118 | static constexpr int width(code_point value) 119 | { 120 | if (value <= 0x7F) { 121 | return 1; 122 | } 123 | else if (value <= 0x7FF) { 124 | return 2; 125 | } 126 | else if (BOOST_LOCALE_LIKELY(value <= 0xFFFF)) { 127 | return 3; 128 | } 129 | else { 130 | return 4; 131 | } 132 | } 133 | 134 | static constexpr bool is_trail(char_type ci) 135 | { 136 | unsigned char c = ci; 137 | return (c & 0xC0) == 0x80; 138 | } 139 | 140 | static constexpr bool is_lead(char_type ci) 141 | { 142 | return !is_trail(ci); 143 | } 144 | 145 | template 146 | static constexpr code_point decode(Iterator& p, Sentinel e) 147 | { 148 | if (BOOST_LOCALE_UNLIKELY(p == e)) 149 | return incomplete; 150 
| 151 | unsigned char lead = *p++; 152 | 153 | // First byte is fully validated here 154 | int trail_size = trail_length(lead); 155 | 156 | if (BOOST_LOCALE_UNLIKELY(trail_size < 0)) 157 | return illegal; 158 | 159 | // 160 | // Ok as only ASCII may be of size = 0 161 | // also optimize for ASCII text 162 | // 163 | if (trail_size == 0) 164 | return lead; 165 | 166 | code_point c = lead & ((1 << (6 - trail_size)) - 1); 167 | 168 | // Read the rest 169 | unsigned char tmp{}; 170 | switch (trail_size) { 171 | case 3: 172 | if (BOOST_LOCALE_UNLIKELY(p == e)) 173 | return incomplete; 174 | tmp = *p++; 175 | if (!is_trail(tmp)) 176 | return illegal; 177 | c = (c << 6) | (tmp & 0x3F); 178 | case 2: 179 | if (BOOST_LOCALE_UNLIKELY(p == e)) 180 | return incomplete; 181 | tmp = *p++; 182 | if (!is_trail(tmp)) 183 | return illegal; 184 | c = (c << 6) | (tmp & 0x3F); 185 | case 1: 186 | if (BOOST_LOCALE_UNLIKELY(p == e)) 187 | return incomplete; 188 | tmp = *p++; 189 | if (!is_trail(tmp)) 190 | return illegal; 191 | c = (c << 6) | (tmp & 0x3F); 192 | } 193 | 194 | // Check code point validity: no surrogates and 195 | // valid range 196 | if (BOOST_LOCALE_UNLIKELY(!is_valid_codepoint(c))) 197 | return illegal; 198 | 199 | // make sure it is the most compact representation 200 | if (BOOST_LOCALE_UNLIKELY(width(c) != trail_size + 1)) 201 | return illegal; 202 | 203 | return c; 204 | 205 | } 206 | 207 | template 208 | static constexpr code_point decode_valid(Iterator& p) 209 | { 210 | unsigned char lead = *p++; 211 | if (lead < 192) 212 | return lead; 213 | 214 | int trail_size = 0; 215 | 216 | if (lead < 224) 217 | trail_size = 1; 218 | else if (BOOST_LOCALE_LIKELY(lead < 240)) // non-BMP rare 219 | trail_size = 2; 220 | else 221 | trail_size = 3; 222 | 223 | code_point c = lead & ((1 << (6 - trail_size)) - 1); 224 | 225 | switch (trail_size) { 226 | case 3: 227 | c = (c << 6) | (static_cast(*p++) & 0x3F); 228 | case 2: 229 | c = (c << 6) | (static_cast(*p++) & 0x3F); 230 | case 
1: 231 | c = (c << 6) | (static_cast(*p++) & 0x3F); 232 | } 233 | 234 | return c; 235 | } 236 | 237 | template 238 | static constexpr Iterator encode(code_point value, Iterator out) 239 | { 240 | if (value <= 0x7F) { 241 | *out++ = static_cast(value); 242 | } 243 | else if (value <= 0x7FF) { 244 | *out++ = static_cast((value >> 6) | 0xC0); 245 | *out++ = static_cast((value & 0x3F) | 0x80); 246 | } 247 | else if (BOOST_LOCALE_LIKELY(value <= 0xFFFF)) { 248 | *out++ = static_cast((value >> 12) | 0xE0); 249 | *out++ = static_cast(((value >> 6) & 0x3F) | 0x80); 250 | *out++ = static_cast((value & 0x3F) | 0x80); 251 | } 252 | else { 253 | *out++ = static_cast((value >> 18) | 0xF0); 254 | *out++ = static_cast(((value >> 12) & 0x3F) | 0x80); 255 | *out++ = static_cast(((value >> 6) & 0x3F) | 0x80); 256 | *out++ = static_cast((value & 0x3F) | 0x80); 257 | } 258 | return out; 259 | } 260 | 261 | static constexpr encoded_chars encode(code_point value) 262 | { 263 | if (value <= 0x7F) { 264 | return {static_cast(value)}; 265 | } 266 | else if (value <= 0x7FF) { 267 | return {static_cast((value >> 6) | 0xC0), 268 | static_cast((value & 0x3F) | 0x80)}; 269 | } 270 | else if (BOOST_LOCALE_LIKELY(value <= 0xFFFF)) { 271 | return {static_cast((value >> 12) | 0xE0), 272 | static_cast(((value >> 6) & 0x3F) | 0x80), 273 | static_cast((value & 0x3F) | 0x80)}; 274 | } 275 | else { 276 | return {static_cast((value >> 18) | 0xF0), 277 | static_cast(((value >> 12) & 0x3F) | 0x80), 278 | static_cast(((value >> 6) & 0x3F) | 0x80), 279 | static_cast((value & 0x3F) | 0x80)}; 280 | } 281 | } 282 | 283 | }; // utf8 284 | 285 | template 286 | struct utf_traits { 287 | typedef CharType char_type; 288 | 289 | // See RFC 2781 290 | static constexpr bool is_first_surrogate(uint16_t x) 291 | { 292 | return 0xD800 <= x && x <= 0xDBFF; 293 | } 294 | 295 | static constexpr bool is_second_surrogate(uint16_t x) 296 | { 297 | return 0xDC00 <= x && x <= 0xDFFF; 298 | } 299 | 300 | static constexpr 
code_point combine_surrogate(uint16_t w1, uint16_t w2) 301 | { 302 | return ((code_point(w1 & 0x3FF) << 10) | (w2 & 0x3FF)) + 0x10000; 303 | } 304 | 305 | static constexpr int trail_length(char_type c) 306 | { 307 | if (is_first_surrogate(c)) 308 | return 1; 309 | if (is_second_surrogate(c)) 310 | return -1; 311 | return 0; 312 | } 313 | 314 | /// 315 | /// Returns true if c is trail code unit, always false for UTF-32 316 | /// 317 | static constexpr bool is_trail(char_type c) 318 | { 319 | return is_second_surrogate(c); 320 | } 321 | 322 | /// 323 | /// Returns true if c is lead code unit, always true of UTF-32 324 | /// 325 | static constexpr bool is_lead(char_type c) 326 | { 327 | return !is_second_surrogate(c); 328 | } 329 | 330 | template 331 | static constexpr code_point decode(It& current, S last) 332 | { 333 | if (BOOST_LOCALE_UNLIKELY(current == last)) 334 | return incomplete; 335 | uint16_t w1 = *current++; 336 | if (BOOST_LOCALE_LIKELY(w1 < 0xD800 || 0xDFFF < w1)) { 337 | return w1; 338 | } 339 | if (w1 > 0xDBFF) 340 | return illegal; 341 | if (current == last) 342 | return incomplete; 343 | uint16_t w2 = *current++; 344 | if (w2 < 0xDC00 || 0xDFFF < w2) 345 | return illegal; 346 | return combine_surrogate(w1, w2); 347 | } 348 | 349 | template 350 | static constexpr code_point decode_valid(It& current) 351 | { 352 | uint16_t w1 = *current++; 353 | if (BOOST_LOCALE_LIKELY(w1 < 0xD800 || 0xDFFF < w1)) { 354 | return w1; 355 | } 356 | uint16_t w2 = *current++; 357 | return combine_surrogate(w1, w2); 358 | } 359 | 360 | static const int max_width = 2; 361 | 362 | static constexpr int width(code_point u) 363 | { 364 | return u >= 0x10000 ? 
2 : 1; 365 | } 366 | 367 | template 368 | static constexpr It encode(code_point u, It out) 369 | { 370 | if (BOOST_LOCALE_LIKELY(u <= 0xFFFF)) { 371 | *out++ = static_cast(u); 372 | } 373 | else { 374 | u -= 0x10000; 375 | *out++ = static_cast(0xD800 | (u >> 10)); 376 | *out++ = static_cast(0xDC00 | (u & 0x3FF)); 377 | } 378 | return out; 379 | } 380 | 381 | static constexpr encoded_chars encode(code_point u) 382 | { 383 | if (BOOST_LOCALE_LIKELY(u <= 0xFFFF)) { 384 | return {static_cast(u)}; 385 | } 386 | else { 387 | u -= 0x10000; 388 | return {static_cast(0xD800 | (u >> 10)), 389 | static_cast(0xDC00 | (u & 0x3FF))}; 390 | } 391 | } 392 | }; // utf16; 393 | 394 | 395 | template 396 | struct utf_traits { 397 | typedef CharType char_type; 398 | 399 | static constexpr int trail_length(char_type c) 400 | { 401 | if (is_valid_codepoint(c)) 402 | return 0; 403 | return -1; 404 | } 405 | 406 | static constexpr bool is_trail(char_type /*c*/) 407 | { 408 | return false; 409 | } 410 | 411 | static constexpr bool is_lead(char_type /*c*/) 412 | { 413 | return true; 414 | } 415 | 416 | template 417 | static constexpr code_point decode_valid(It& current) 418 | { 419 | return *current++; 420 | } 421 | 422 | template 423 | static constexpr code_point decode(It& current, S last) 424 | { 425 | if (BOOST_LOCALE_UNLIKELY(current == last)) 426 | return incomplete; 427 | code_point c = *current++; 428 | if (BOOST_LOCALE_UNLIKELY(!is_valid_codepoint(c))) 429 | return illegal; 430 | return c; 431 | } 432 | 433 | static constexpr int max_width = 1; 434 | 435 | static constexpr int width(code_point /*u*/) 436 | { 437 | return 1; 438 | } 439 | 440 | template 441 | static constexpr It encode(code_point u, It out) 442 | { 443 | *out++ = static_cast(u); 444 | return out; 445 | } 446 | 447 | static constexpr encoded_chars encode(code_point u) 448 | { 449 | return {static_cast(u)}; 450 | } 451 | 452 | }; // utf32 453 | 454 | } // end namespace detail 455 | } // end namespace utf_ranges 456 | } 
// end namespace tcb 457 | 458 | #endif // TCB_UTF_RANGES_DETAIL_UTF_HPP_INCLUDED 459 | -------------------------------------------------------------------------------- /include/tcb/utf_ranges/istreambuf_range.hpp: -------------------------------------------------------------------------------- 1 | 2 | // Based on istreambuf_range.hpp from Range-V3 3 | // Copyright Eric Niebler 2013-2014 4 | // 5 | // Modifications (c) 2016 Tristan Brindle 6 | // 7 | // Use, modification and distribution is subject to the 8 | // Boost Software License, Version 1.0. (See accompanying 9 | // file LICENSE_1_0.txt or copy at 10 | // http://www.boost.org/LICENSE_1_0.txt) 11 | // 12 | 13 | #ifndef TCB_UTF_RANGES_ISTREAMBUF_RANGE_HPP_INCLUDED 14 | #define TCB_UTF_RANGES_ISTREAMBUF_RANGE_HPP_INCLUDED 15 | 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | 22 | namespace tcb { 23 | namespace utf_ranges { 24 | 25 | namespace rng = ::ranges::v3; 26 | using rng::static_const; 27 | 28 | template> 29 | struct istreambuf_range 30 | : rng::view_facade, rng::unknown> 31 | { 32 | private: 33 | friend rng::range_access; 34 | std::basic_streambuf *sbin_ = nullptr; 35 | bool done_ = false; 36 | typename Traits::int_type obj_{}; 37 | 38 | struct cursor 39 | { 40 | private: 41 | istreambuf_range *rng_ = nullptr; 42 | public: 43 | cursor() = default; 44 | explicit cursor(istreambuf_range &rng) 45 | : rng_(&rng) 46 | {} 47 | 48 | void next() 49 | { 50 | rng_->next(); 51 | } 52 | 53 | CharT get() const noexcept 54 | { 55 | return Traits::to_char_type(rng_->obj_); 56 | } 57 | 58 | bool done() const 59 | { 60 | return rng_->done_; 61 | } 62 | 63 | }; 64 | 65 | void next() 66 | { 67 | obj_ = sbin_->sbumpc(); 68 | if (obj_ == Traits::eof()) { 69 | done_ = true; 70 | } 71 | } 72 | 73 | cursor begin_cursor() 74 | { 75 | return cursor{*this}; 76 | } 77 | 78 | public: 79 | istreambuf_range() = default; 80 | 81 | istreambuf_range(std::basic_istream& sin) 82 | : sbin_(sin.rdbuf()) 83 | { 84 | 
next(); // prime the pump 85 | } 86 | }; 87 | 88 | 89 | struct istreambuf_fn 90 | { 91 | template 92 | istreambuf_range 93 | operator()(std::basic_istream& sin) const 94 | { 95 | return {sin}; 96 | } 97 | }; 98 | 99 | RANGES_INLINE_VARIABLE(istreambuf_fn, istreambuf); 100 | 101 | 102 | } // end namespace utf_ranges 103 | } // end namespace tcb 104 | 105 | #endif // TCB_UTF_RANGES_ISTREAMBUF_RANGE_HPP_INCLUDED 106 | -------------------------------------------------------------------------------- /include/tcb/utf_ranges/ostreambuf_iterator.hpp: -------------------------------------------------------------------------------- 1 | 2 | // Based on ranges::ostream_iterator from Range-V3 3 | // Copyright Eric Niebler 2013-2014 4 | // 5 | // Modifications (c) 2016 Tristan Brindle 6 | // 7 | // Use, modification and distribution is subject to the 8 | // Boost Software License, Version 1.0. (See accompanying 9 | // file LICENSE_1_0.txt or copy at 10 | // http://www.boost.org/LICENSE_1_0.txt) 11 | 12 | #ifndef TCB_UTF_RANGES_OSTREAMBUF_ITERATOR_HPP_INCLUDED 13 | #define TCB_UTF_RANGES_OSTREAMBUF_ITERATOR_HPP_INCLUDED 14 | 15 | #include 16 | 17 | namespace tcb { 18 | namespace utf_ranges { 19 | 20 | template > 21 | struct ostreambuf_iterator { 22 | private: 23 | std::basic_streambuf* sout_; 24 | 25 | struct proxy { 26 | std::basic_streambuf* sout_; 27 | 28 | proxy& operator=(Char t) 29 | { 30 | RANGES_ASSERT(sout_); 31 | sout_->sputc(t); 32 | return *this; 33 | } 34 | }; 35 | 36 | public: 37 | using difference_type = std::ptrdiff_t; 38 | using char_type = Char; 39 | using traits_type = Traits; 40 | 41 | ostreambuf_iterator() = default; 42 | 43 | ostreambuf_iterator(std::basic_ostream& sout) noexcept 44 | : sout_(sout.rdbuf()) {} 45 | 46 | proxy operator*() const noexcept 47 | { 48 | return {sout_}; 49 | } 50 | 51 | ostreambuf_iterator& operator++() 52 | { 53 | return *this; 54 | } 55 | 56 | ostreambuf_iterator operator++(int) 57 | { 58 | return *this; 59 | } 60 | }; 61 | 62 | } 
// end namespace utf_ranges 63 | } // end namespace tcb 64 | 65 | #endif // TCB_UTF_RANGES_OSTREAMBUF_ITERATOR_HPP_INCLUDED -------------------------------------------------------------------------------- /include/tcb/utf_ranges/view.hpp: -------------------------------------------------------------------------------- 1 | 2 | // Copyright (c) 2016 Tristan Brindle (tcbrindle at gmail dot com) 3 | // Distributed under the Boost Software License, Version 1.0. (See accompanying 4 | // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) 5 | 6 | #ifndef TCB_UTF_RANGES_VIEW_HPP_INCLUDED 7 | #define TCB_UTF_RANGES_VIEW_HPP_INCLUDED 8 | 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | 15 | #endif // TCB_UTF_RANGES_VIEW_HPP_INCLUDED 16 | -------------------------------------------------------------------------------- /include/tcb/utf_ranges/view/bom.hpp: -------------------------------------------------------------------------------- 1 | 2 | // Copyright (c) 2016 Tristan Brindle (tcbrindle at gmail dot com) 3 | // Distributed under the Boost Software License, Version 1.0. 
(See accompanying 4 | // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) 5 | 6 | #ifndef TCB_UTF_RANGES_VIEW_BOM_HPP_INCLUDED 7 | #define TCB_UTF_RANGES_VIEW_BOM_HPP_INCLUDED 8 | 9 | #include 10 | #include 11 | 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | 20 | namespace tcb { 21 | namespace utf_ranges { 22 | 23 | using rng::static_const; 24 | using rng::operator|; 25 | 26 | namespace view { 27 | 28 | namespace detail { 29 | template constexpr std::size_t bom_size_helper = 0; 30 | template<> constexpr std::size_t bom_size_helper<1> = 3; 31 | template<> constexpr std::size_t bom_size_helper<2> = 1; 32 | template<> constexpr std::size_t bom_size_helper<4> = 1; 33 | 34 | template 35 | struct bom_size { 36 | static constexpr std::size_t value = bom_size_helper; 37 | }; 38 | 39 | template constexpr std::size_t bom_size_v = bom_size::value; 40 | 41 | constexpr boost::endian::order nonnative_order = 42 | boost::endian::order::native == boost::endian::order::big 43 | ? 
boost::endian::order::little 44 | : boost::endian::order::big; 45 | 46 | template 47 | bool has_bom(const Range& range) 48 | { 49 | using v = rng::range_value_t; 50 | 51 | // Better to use if constexpr once we get it, but this should be 52 | // easily optimised too 53 | switch (sizeof(v)) { 54 | case 1: { 55 | auto it = rng::cbegin(range); 56 | return rng::size(range) >= 3 && 57 | static_cast(*it) == 0xEF && 58 | static_cast(*++it) == 0xBB && 59 | static_cast(*++it) == 0xBF; 60 | } 61 | case 2: 62 | return rng::size(range) != 0 && 63 | static_cast(*rng::begin(range)) == u'\uFEFF'; 64 | case 4: 65 | return rng::size(range) != 0 && 66 | static_cast(*rng::begin(range)) == U'\uFEFF'; 67 | default: 68 | return false; 69 | } 70 | } 71 | 72 | template 73 | bool has_swapped_bom(const Range& range) 74 | { 75 | using v = rng::range_value_t; 76 | 77 | switch (sizeof(v)) { 78 | case 2: 79 | return static_cast(*rng::begin(range)) == 80 | boost::endian::endian_reverse(static_cast(u'\uFEFF')); 81 | case 4: 82 | return static_cast(*rng::begin(range)) == 83 | boost::endian::endian_reverse(static_cast(U'\uFEFF')); 84 | default: 85 | return false; 86 | } 87 | } 88 | 89 | template 90 | struct bom_concat_view : rng::view_adaptor, Rng> 91 | { 92 | private: 93 | using string_type = std::basic_string>; 94 | 95 | struct adaptor : rng::adaptor_base { 96 | 97 | adaptor() = default; 98 | 99 | adaptor(bom_concat_view& view) 100 | : bom_first_(view.bom_.begin()), 101 | bom_last_(view.bom_.end()) 102 | {} 103 | 104 | auto get(rng::range_iterator_t it) const 105 | { 106 | if (bom_first_ != bom_last_) { 107 | return *bom_first_; 108 | } else { 109 | return *it; 110 | } 111 | } 112 | 113 | void next(rng::range_iterator_t& it) 114 | { 115 | if (bom_first_ != bom_last_) { 116 | ++bom_first_; 117 | } else { 118 | ++it; 119 | } 120 | } 121 | 122 | rng::range_iterator_t bom_first_{}; 123 | rng::range_sentinel_t bom_last_{}; 124 | }; 125 | 126 | 127 | public: 128 | bom_concat_view() = default; 129 | 130 
| bom_concat_view(Rng rng, string_type bom) 131 | : rng::view_adaptor(std::move(rng)), 132 | bom_(std::move(bom)) 133 | {} 134 | 135 | adaptor begin_adaptor() { return adaptor{*this}; } 136 | 137 | private: 138 | string_type bom_; 139 | }; 140 | 141 | } // end namespace detail 142 | 143 | struct consume_bom_fn { 144 | template ())> 146 | auto operator()(Range&& range) const 147 | { 148 | using value_type = rng::range_value_t; 149 | 150 | rng::range_difference_t bom_size = 0; 151 | boost::endian::order byte_order = boost::endian::order::native; 152 | 153 | if (detail::has_bom(range)) { 154 | bom_size = detail::bom_size_v; 155 | } else if (detail::has_swapped_bom(range)) { 156 | bom_size = detail::bom_size_v; 157 | byte_order = detail::nonnative_order; 158 | } 159 | 160 | return endian_convert<>(rng::view::drop(std::forward(range), bom_size), 161 | byte_order); 162 | } 163 | 164 | template () && 166 | !rng::ForwardRange())> 167 | auto operator()(Range&& range) const 168 | { 169 | using value_type = rng::range_value_t; 170 | constexpr rng::range_difference_t bom_size = detail::bom_size_v; 171 | 172 | boost::endian::order byte_order = boost::endian::order::native; 173 | 174 | // For InputRanges (only), testing for the BOM will "eat up" the first 175 | // character(s) of the range. So save them in a temporary string so that 176 | // we can put them back later if it turns out not to be a BOM. 
177 | std::basic_string buf{}; 178 | rng::copy_n(rng::begin(range), bom_size, rng::back_inserter(buf)); 179 | 180 | if (detail::has_bom(buf)) { 181 | buf.clear(); 182 | } else if (detail::has_swapped_bom(buf)) { 183 | buf.clear(); 184 | byte_order = detail::nonnative_order; 185 | } 186 | 187 | return endian_convert<>( 188 | detail::bom_concat_view>( 189 | rng::view::all(std::forward(range)), 190 | std::move(buf)), 191 | byte_order); 192 | } 193 | 194 | decltype(auto) operator()() const { 195 | return rng::make_pipeable(std::bind(*this)); 196 | } 197 | }; 198 | 199 | RANGES_INLINE_VARIABLE(rng::view::view, consume_bom) 200 | 201 | struct add_bom_fn { 202 | 203 | template 204 | auto operator()(Range&& range) const 205 | { 206 | using char_type = rng::range_value_t; 207 | constexpr char32_t bom = U'\uFEFF'; 208 | 209 | return rng::view::concat(utf_convert(rng::view::single(bom)), 210 | std::forward(range)); 211 | } 212 | 213 | decltype(auto) operator()() const 214 | { 215 | return rng::make_pipeable(std::bind(*this)); 216 | } 217 | }; 218 | 219 | RANGES_INLINE_VARIABLE(rng::view::view, add_bom) 220 | 221 | } // end namespace view 222 | } // end namespace utf_ranges 223 | } // end namespace tcb 224 | 225 | #endif // TCB_UTF_RANGES_VIEW_BOM_HPP_INCLUDED 226 | -------------------------------------------------------------------------------- /include/tcb/utf_ranges/view/bytes.hpp: -------------------------------------------------------------------------------- 1 | 2 | // Copyright (c) 2016 Tristan Brindle (tcbrindle at gmail dot com) 3 | // Distributed under the Boost Software License, Version 1.0. 
(See accompanying 4 | // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) 5 | 6 | #ifndef TCB_UTF_RANGES_VIEW_BYTES_HPP_INCLUDED 7 | #define TCB_UTF_RANGES_VIEW_BYTES_HPP_INCLUDED 8 | 9 | #include 10 | #include 11 | 12 | namespace tcb { 13 | namespace utf_ranges { 14 | 15 | namespace rng = ::ranges::v3; 16 | using rng::static_const; 17 | 18 | template 19 | class bytes_view : public rng::view_adaptor, Rng> 20 | { 21 | private: 22 | using value_type = rng::range_value_t; 23 | using byte = unsigned char; 24 | 25 | friend rng::range_access; 26 | 27 | struct adaptor : rng::adaptor_base 28 | { 29 | adaptor() = default; 30 | 31 | adaptor(const bytes_view& b) 32 | { 33 | if (b.mutable_base().begin() != b.mutable_base().end()) { 34 | fill_buffer(b.mutable_base().begin()); 35 | } 36 | } 37 | 38 | void fill_buffer(rng::range_iterator_t it) 39 | { 40 | const value_type t = *it; 41 | std::copy(reinterpret_cast(&t), 42 | reinterpret_cast(&t) + sizeof(value_type), 43 | buf_.begin()); 44 | idx_ = 0; 45 | } 46 | 47 | 48 | byte get(rng::range_iterator_t) const { 49 | return buf_[idx_++]; 50 | } 51 | 52 | void next(rng::range_iterator_t& it) { 53 | if (idx_ == sizeof(value_type)) { 54 | fill_buffer(++it); 55 | } 56 | } 57 | 58 | std::array buf_{{}}; 59 | mutable int idx_ = 0; 60 | }; 61 | 62 | public: 63 | 64 | bytes_view() = default; 65 | 66 | bytes_view(Rng range) 67 | : rng::view_adaptor(std::move(range)) 68 | {} 69 | 70 | adaptor begin_adaptor() const { return adaptor{*this}; } 71 | }; 72 | 73 | namespace view { 74 | 75 | struct bytes_fn { 76 | 77 | template 78 | bytes_view> operator()(Rng&& range) const 79 | { 80 | return {rng::view::all(std::forward(range))}; 81 | } 82 | 83 | decltype(auto) operator()() const 84 | { 85 | rng::make_pipeable(std::bind(*this)); 86 | } 87 | 88 | }; 89 | 90 | RANGES_INLINE_VARIABLE(rng::view::view, bytes); 91 | 92 | } // end namespace view 93 | } // end namespace utf_ranges 94 | } // end namespace tcb 95 | 96 | #endif // 
TCB_UTF_RANGES_VIEW_BYTES_HPP_INCLUDED 97 | -------------------------------------------------------------------------------- /include/tcb/utf_ranges/view/endian_convert.hpp: -------------------------------------------------------------------------------- 1 | 2 | // Copyright (c) 2016 Tristan Brindle (tcbrindle at gmail dot com) 3 | // Distributed under the Boost Software License, Version 1.0. (See accompanying 4 | // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) 5 | 6 | #ifndef TCB_UTF_RANGES_VIEW_ENDIAN_CONVERT_HPP_INCLUDED 7 | #define TCB_UTF_RANGES_VIEW_ENDIAN_CONVERT_HPP_INCLUDED 8 | 9 | #include 10 | #include 11 | #include 12 | 13 | namespace tcb { 14 | namespace utf_ranges { 15 | 16 | namespace rng = ::ranges::v3; 17 | using rng::static_const; 18 | using rng::operator|; 19 | 20 | namespace detail { 21 | 22 | // This stuff is necessary because Boost.Endian doesn't seem to handle 23 | // byte-swapping character types very well. [unsigned] char should just be 24 | // returned unaltered, but instead it gets promoted to int, byte-swapped and 25 | // converted back to char, which means that it is always zero. The same happens 26 | // to char16_t. To get around this, we wrap values in a swap_wrapper<> struct, 27 | // with "overloads" for the endian_reverse functions for char and char16_t 28 | // (actually function that get found by the Boost library via ADL). All other 29 | // types get forwarded to the regular boost conversion function. 
30 | 31 | template 32 | struct swap_wrapper { 33 | T value; 34 | }; 35 | 36 | template 37 | swap_wrapper make_swap_wrapper(T t) 38 | { 39 | return swap_wrapper{t}; 40 | } 41 | 42 | template 43 | swap_wrapper endian_reverse(swap_wrapper s) noexcept 44 | { 45 | return swap_wrapper{boost::endian::endian_reverse(s.value)}; 46 | } 47 | 48 | inline swap_wrapper endian_reverse(swap_wrapper s) noexcept 49 | { 50 | return s; 51 | } 52 | 53 | inline swap_wrapper endian_reverse(swap_wrapper s) noexcept 54 | { 55 | return make_swap_wrapper( 56 | static_cast( 57 | boost::endian::endian_reverse(static_cast(s.value))) 58 | ); 59 | } 60 | 61 | inline swap_wrapper endian_reverse(swap_wrapper s) noexcept 62 | { 63 | using traits = std::char_traits; 64 | static_assert(sizeof(traits::int_type) == sizeof(wchar_t), ""); 65 | 66 | return swap_wrapper{ 67 | traits::to_char_type( 68 | boost::endian::endian_reverse(traits::to_int_type(s.value))) 69 | }; 70 | } 71 | 72 | } // end namespace detail 73 | 74 | namespace view { 75 | 76 | template 77 | struct endian_convert_fn { 78 | template 79 | auto operator()(Range&& range, 80 | boost::endian::order src_order = boost::endian::order::native) const 81 | { 82 | const auto swapper = [src_order] (auto c){ 83 | return boost::endian::conditional_reverse(detail::make_swap_wrapper(c), 84 | src_order, DestOrder).value; 85 | }; 86 | return rng::view::transform(std::forward(range), 87 | std::move(swapper)); 88 | } 89 | 90 | decltype(auto) operator()(boost::endian::order src_endian = boost::endian::order::native) const 91 | { 92 | return rng::make_pipeable(std::bind(*this, std::placeholders::_1, 93 | rng::protect(src_endian))); 94 | } 95 | }; 96 | 97 | inline namespace 98 | { 99 | template 100 | constexpr auto& endian_convert = static_const>>::value; 101 | } 102 | 103 | } // end namespace view 104 | } // end namespace utf_ranges 105 | } // end namespace tcb 106 | 107 | #endif // TCB_UTF_RANGES_VIEW_ENDIAN_CONVERT_HPP_INCLUDED 108 | 
-------------------------------------------------------------------------------- /include/tcb/utf_ranges/view/line_end_transform.hpp: -------------------------------------------------------------------------------- 1 | 2 | // Copyright (c) 2016 Tristan Brindle (tcbrindle at gmail dot com) 3 | // Distributed under the Boost Software License, Version 1.0. (See accompanying 4 | // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) 5 | 6 | #ifndef TCB_UTF_RANGES_VIEW_LINE_END_TRANSFORM_HPP_INCLUDED 7 | #define TCB_UTF_RANGES_VIEW_LINE_END_TRANSFORM_HPP_INCLUDED 8 | 9 | #include 10 | #include 11 | 12 | namespace tcb { 13 | namespace utf_ranges { 14 | 15 | namespace rng = ::ranges::v3; 16 | 17 | template 18 | class line_end_transform_view 19 | : public rng::view_adaptor, Rng> 20 | { 21 | private: 22 | friend rng::range_access; 23 | 24 | struct adaptor : rng::adaptor_base { 25 | adaptor() = default; 26 | 27 | char32_t get(rng::range_iterator_t it) const 28 | { 29 | char32_t c = *it; 30 | switch (c) { 31 | case U'\u0085': // Next line (NEL) 32 | case U'\u000B': // Vertical tab (VT) 33 | case U'\u000C': // Form feed (FF) 34 | case U'\u2028': // Line separator (LS) 35 | case U'\u2029': // Paragraph separator (PS) 36 | c = U'\n'; 37 | break; 38 | case U'\u000D': // Carriage return (CR) 39 | c = U'\n'; 40 | // If next character is LF, skip it 41 | skip_next_ = (*++it == U'\u000A'); 42 | break; 43 | } 44 | 45 | return c; 46 | } 47 | 48 | void next(rng::range_iterator_t& it) 49 | { 50 | ++it; 51 | if (skip_next_) { 52 | ++it; 53 | skip_next_ = false; 54 | } 55 | } 56 | 57 | bool equal(const adaptor& other) const 58 | { 59 | return skip_next_ == other.skip_next_; 60 | } 61 | 62 | mutable bool skip_next_ = false; 63 | }; 64 | 65 | public: 66 | 67 | adaptor begin_adaptor() const { return adaptor{}; } 68 | 69 | line_end_transform_view() = default; 70 | 71 | line_end_transform_view(Rng rng) 72 | : rng::view_adaptor{std::move(rng)} 73 | {} 74 | }; 75 | 76 | 
namespace view { 77 | 78 | template 79 | auto line_end_transform(Rng&& range) 80 | { 81 | // Convert to UTF-32 and then back again 82 | using CharT = rng::range_value_t; 83 | using R = decltype(rng::view::all(utf32(std::forward(range)))); 84 | return utf_convert( 85 | line_end_transform_view{rng::view::all(utf32(std::forward(range)))} 86 | ); 87 | } 88 | 89 | 90 | } // end namespace view 91 | } // end namespace utf_ranges 92 | } // end namespace tcb 93 | 94 | #endif // TCB_UTF_RANGES_VIEW_LINE_END_TRANSFORM_HPP_INCLUDED 95 | -------------------------------------------------------------------------------- /include/tcb/utf_ranges/view/utf_convert.hpp: -------------------------------------------------------------------------------- 1 | 2 | // Copyright (c) 2016 Tristan Brindle (tcbrindle at gmail dot com) 3 | // Distributed under the Boost Software License, Version 1.0. (See accompanying 4 | // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) 5 | 6 | #ifndef TCB_UTF_RANGES_VIEW_UTF_CONVERT_HPP_INCLUDED 7 | #define TCB_UTF_RANGES_VIEW_UTF_CONVERT_HPP_INCLUDED 8 | 9 | #include 10 | #include 11 | #include 12 | 13 | #include 14 | 15 | namespace tcb { 16 | namespace utf_ranges { 17 | 18 | namespace rng = ::ranges::v3; 19 | using rng::static_const; 20 | 21 | template 22 | class utf_convert_view 23 | : public rng::view_facade, rng::unknown> { 24 | struct cursor { 25 | cursor() = default; 26 | 27 | cursor(utf_convert_view& parent) 28 | : first_(rng::begin(parent.range_)), 29 | last_(rng::end(parent.range_)) 30 | { 31 | if (first_ != last_) { 32 | char32_t c = detail::utf_traits::decode(first_, last_); 33 | next_chars_ = detail::utf_traits::encode(c); 34 | } 35 | } 36 | 37 | cursor(const utf_convert_view& parent) 38 | : first_(rng::begin(parent.range_)), 39 | last_(rng::end(parent.range_)) 40 | { 41 | if (first_ != last_) { 42 | char32_t c = detail::utf_traits::decode(first_, last_); 43 | next_chars_ = detail::utf_traits::encode(c); 44 | } 45 | } 46 | 47 | 
void next() 48 | { 49 | if (++idx_ == next_chars_.size() && first_ != last_) { 50 | char32_t c = detail::utf_traits::decode(first_, last_); 51 | next_chars_ = detail::utf_traits::encode(c); 52 | idx_ = 0; 53 | } 54 | } 55 | 56 | OutCharT get() const 57 | { 58 | return next_chars_[idx_]; 59 | } 60 | 61 | bool done() const 62 | { 63 | return first_ == last_ && idx_ == next_chars_.size(); 64 | } 65 | 66 | bool equal(const cursor& other) const 67 | { 68 | return std::tie(next_chars_) == 69 | std::tie(other.next_chars_); 70 | } 71 | 72 | detail::encoded_chars next_chars_; 73 | char idx_ = 0; 74 | rng::range_iterator_t first_{}; 75 | rng::range_sentinel_t last_{}; 76 | }; 77 | 78 | public: 79 | cursor begin_cursor() { return cursor{*this}; } 80 | 81 | CONCEPT_REQUIRES(rng::Range()) 82 | cursor begin_cursor() const { return cursor{*this}; } 83 | 84 | utf_convert_view() = default; 85 | 86 | utf_convert_view(Range range) 87 | : range_{std::move(range)} {} 88 | 89 | private: 90 | Range range_{}; 91 | friend rng::range_access; 92 | }; 93 | 94 | namespace view { 95 | 96 | template 97 | struct utf_convert_fn { 98 | template > 100 | utf_convert_view, InCharT, OutCharT> 101 | operator()(Range&& range) const 102 | { 103 | return {rng::view::all(std::forward(range))}; 104 | } 105 | 106 | decltype(auto) operator()() const 107 | { 108 | return rng::make_pipeable(std::bind(*this)); 109 | } 110 | }; 111 | 112 | inline namespace 113 | { 114 | template 115 | constexpr auto& utf_convert = static_const>>::value; 116 | } 117 | 118 | struct utf8_fn { 119 | template 120 | utf_convert_view, rng::range_value_t, char> 121 | operator()(Range&& range) const 122 | { 123 | return {rng::view::all(std::forward(range))}; 124 | } 125 | 126 | decltype(auto) operator()() const 127 | { 128 | return rng::make_pipeable(std::bind(*this)); 129 | } 130 | }; 131 | 132 | RANGES_INLINE_VARIABLE(rng::view::view, utf8); 133 | 134 | struct utf16_fn { 135 | template 136 | utf_convert_view, rng::range_value_t, 
char16_t> 137 | operator()(Range&& range) const 138 | { 139 | return {rng::view::all(std::forward(range))}; 140 | } 141 | 142 | decltype(auto) operator()() const 143 | { 144 | return rng::make_pipeable(std::bind(*this)); 145 | } 146 | }; 147 | 148 | RANGES_INLINE_VARIABLE(rng::view::view, utf16); 149 | 150 | struct utf32_fn { 151 | template 152 | utf_convert_view, rng::range_value_t, char32_t> 153 | operator()(Range&& range) const 154 | { 155 | return {rng::view::all(std::forward(range))}; 156 | } 157 | 158 | decltype(auto) operator()() const 159 | { 160 | return rng::make_pipeable(std::bind(*this)); 161 | } 162 | }; 163 | 164 | RANGES_INLINE_VARIABLE(rng::view::view, utf32); 165 | 166 | } // end namespace view 167 | } // end namespace utf_ranges 168 | } // end namespace tcb 169 | 170 | #endif // TCB_UTF_RANGES_VIEW_UTF_CONVERT_HPP_INCLUDED 171 | -------------------------------------------------------------------------------- /test/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | 2 | add_executable(utf_ranges_test 3 | bom_test.cpp 4 | bytes_test.cpp 5 | catch_main.cpp 6 | endian_test.cpp 7 | istreambuf_range_test.cpp 8 | line_end_transform_test.cpp 9 | ostreambuf_iterator_test.cpp 10 | utf_convert_view_test.cpp 11 | ) 12 | 13 | target_include_directories(utf_ranges_test PRIVATE 14 | ${RANGE_INCLUDE_DIR} 15 | ${Boost_INCLUDE_DIR} 16 | ) 17 | -------------------------------------------------------------------------------- /test/bom_test.cpp: -------------------------------------------------------------------------------- 1 | 2 | // Copyright (c) 2016 Tristan Brindle (tcbrindle at gmail dot com) 3 | // Distributed under the Boost Software License, Version 1.0. 
(See accompanying 4 | // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) 5 | 6 | #include "catch.hpp" 7 | 8 | #include 9 | 10 | #include 11 | #include 12 | #include 13 | #include 14 | 15 | #define TEST_STRING "$€0123456789你好abcdefghijklmnopqrstyvwxyz\U0001F60E" 16 | 17 | const auto to_little_endian = [] (const auto& in) { 18 | using char_type = ranges::range_value_t; 19 | std::basic_string out; 20 | for (auto c : in) { 21 | out.push_back(boost::endian::native_to_little( 22 | tcb::utf_ranges::detail::make_swap_wrapper(c)).value); 23 | } 24 | return out; 25 | }; 26 | 27 | const auto to_big_endian = [] (const auto& in) { 28 | using char_type = ranges::range_value_t; 29 | std::basic_string out; 30 | for (auto c : in) { 31 | out.push_back(boost::endian::native_to_big( 32 | tcb::utf_ranges::detail::make_swap_wrapper(c)).value); 33 | } 34 | return out; 35 | }; 36 | 37 | 38 | TEST_CASE("Byte order mark is prepended correctly", "[bom]") 39 | { 40 | SECTION("...for UTF-8") { 41 | const std::string str = u8"" TEST_STRING; 42 | const std::string test = tcb::utf_ranges::view::add_bom(str); 43 | REQUIRE(test == u8"\ufeff" + str); 44 | } 45 | 46 | SECTION("...for UTF-16") { 47 | const std::u16string str = u"" TEST_STRING; 48 | const std::u16string test = tcb::utf_ranges::view::add_bom(str); 49 | REQUIRE(test == u"\ufeff" + str); 50 | } 51 | 52 | SECTION("...for UTF-32") { 53 | const std::u32string str = U"" TEST_STRING; 54 | const std::u32string test = tcb::utf_ranges::view::add_bom(str); 55 | REQUIRE(test == U"\ufeff" + str); 56 | } 57 | 58 | SECTION("...for wchar_t") { 59 | const std::wstring str = L"" TEST_STRING; 60 | const std::wstring test = tcb::utf_ranges::view::add_bom(str); 61 | REQUIRE(test == L"\ufeff" + str); 62 | } 63 | } 64 | 65 | TEST_CASE("Byte order marks are correctly identified", "[bom]") 66 | { 67 | SECTION("...in UTF-8") { 68 | const std::string str = u8"\uFEFF" TEST_STRING; 69 | const std::string test = 
tcb::utf_ranges::view::consume_bom(str); 70 | REQUIRE(test == u8"" TEST_STRING); 71 | } 72 | 73 | SECTION("...in UTF-16") { 74 | const std::u16string str = u"\uFEFF" TEST_STRING; 75 | const std::u16string test = tcb::utf_ranges::view::consume_bom(str); 76 | REQUIRE(test == u"" TEST_STRING); 77 | } 78 | 79 | SECTION("...in UTF-32") { 80 | const std::u32string str = U"\uFEFF" TEST_STRING; 81 | const std::u32string test = tcb::utf_ranges::view::consume_bom(str); 82 | REQUIRE(test == U"" TEST_STRING); 83 | } 84 | } 85 | 86 | TEST_CASE("Strings without byte order marks are unchanged", "[bom]") 87 | { 88 | SECTION("...in UTF-8") { 89 | const std::string str = u8"" TEST_STRING; 90 | const std::string test = tcb::utf_ranges::view::consume_bom(str); 91 | REQUIRE(test == u8"" TEST_STRING); 92 | } 93 | 94 | SECTION("...in UTF-16") { 95 | const std::u16string str = u"" TEST_STRING; 96 | const std::u16string test = tcb::utf_ranges::view::consume_bom(str); 97 | REQUIRE(test == u"" TEST_STRING); 98 | } 99 | 100 | SECTION("...in UTF-32") { 101 | const std::u32string str = U"" TEST_STRING; 102 | const std::u32string test = tcb::utf_ranges::view::consume_bom(str); 103 | REQUIRE(test == U"" TEST_STRING); 104 | } 105 | } 106 | 107 | TEST_CASE("Native endian InputRanges with byte order marks are stripped correctly", "[bom]") 108 | { 109 | SECTION("...in UTF-8") { 110 | std::stringstream ss; 111 | ss << u8"\uFEFF" TEST_STRING; 112 | const std::string test = tcb::utf_ranges::view::consume_bom( 113 | ranges::istream_range(ss)); 114 | REQUIRE(test == u8"" TEST_STRING); 115 | } 116 | } 117 | 118 | TEST_CASE("InputRanges without byte order marks are unchanged", "[bom]") 119 | { 120 | SECTION("...in UTF-8") { 121 | std::stringstream ss; 122 | ss << u8"" TEST_STRING; 123 | const std::string test = tcb::utf_ranges::view::consume_bom( 124 | ranges::istream_range(ss)); 125 | REQUIRE(test == u8"" TEST_STRING); 126 | } 127 | } 128 | 129 | TEST_CASE("Byte order marks are correctly used", "[bom]") 
130 | { 131 | SECTION("...for \"UTF-8BE\"") { 132 | const auto str = to_big_endian(std::string(u8"\uFEFF" TEST_STRING)); 133 | const std::string test = tcb::utf_ranges::view::consume_bom(str); 134 | REQUIRE(test == u8"" TEST_STRING); 135 | } 136 | 137 | SECTION("...for wide UTF-16BE") { 138 | const auto str = to_big_endian(std::u16string(u"\uFEFF" TEST_STRING)); 139 | const std::u16string test = tcb::utf_ranges::view::consume_bom(str); 140 | REQUIRE(test == u"" TEST_STRING); 141 | } 142 | 143 | SECTION("...for wide UTF-32BE") { 144 | const auto str = to_big_endian(std::u32string(U"\uFEFF" TEST_STRING)); 145 | const std::u32string test = tcb::utf_ranges::view::consume_bom(str); 146 | REQUIRE(test == U"" TEST_STRING); 147 | } 148 | // wchar_t input: the same check, using wide literals. 149 | SECTION("...with big-endian wide strings") { 150 | const auto str = to_big_endian(std::wstring(L"\uFEFF" TEST_STRING)); 151 | const std::wstring test = tcb::utf_ranges::view::consume_bom(str); 152 | REQUIRE(test == L"" TEST_STRING); 153 | } 154 | // The remaining sections repeat the checks with inputs produced by to_little_endian. 155 | SECTION("...for \"UTF-8LE\"") { 156 | const auto str = to_little_endian(std::string(u8"\uFEFF" TEST_STRING)); 157 | const std::string test = tcb::utf_ranges::view::consume_bom(str); 158 | REQUIRE(test == u8"" TEST_STRING); 159 | } 160 | 161 | SECTION("...for wide UTF-16LE") { 162 | const auto str = to_little_endian(std::u16string(u"\uFEFF" TEST_STRING)); 163 | const std::u16string test = tcb::utf_ranges::view::consume_bom(str); 164 | REQUIRE(test == u"" TEST_STRING); 165 | } 166 | 167 | SECTION("...for wide UTF-32LE") { 168 | const auto str = to_little_endian(std::u32string(U"\uFEFF" TEST_STRING)); 169 | const std::u32string test = tcb::utf_ranges::view::consume_bom(str); 170 | REQUIRE(test == U"" TEST_STRING); 171 | } 172 | 173 | SECTION("...with little-endian wide strings") { 174 | const auto str = to_little_endian(std::wstring(L"\uFEFF" TEST_STRING)); 175 | const std::wstring test = tcb::utf_ranges::view::consume_bom(str); 176 | REQUIRE(test == L"" TEST_STRING); 177 | } 178 | } 
-------------------------------------------------------------------------------- /test/bytes_test.cpp: -------------------------------------------------------------------------------- 1 | 2 | // Copyright (c) 2016 Tristan Brindle (tcbrindle at gmail dot com) 3 | // Distributed under the Boost Software License, Version 1.0. (See accompanying 4 | // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) 5 | 6 | #include "catch.hpp" 7 | 8 | #include 9 | #include 10 | 11 | #include 12 | 13 | #define TEST_STRING "$€0123456789你好abcdefghijklmnopqrstyvwxyz\U0001F60E" 14 | // For a std::string source, piping through view::bytes must reproduce the string exactly. 15 | TEST_CASE("Bytes view works for UTF-8", "[bytes]") 16 | { 17 | std::string str = u8"" TEST_STRING; 18 | std::string test = str | tcb::utf_ranges::view::bytes; 19 | 20 | REQUIRE(str == test); 21 | } 22 | // For a std::u16string source, the bytes are compared with what std::wstring_convert::to_bytes produces for the same text. 23 | TEST_CASE("Bytes view works for UTF-16", "[bytes]") 24 | { 25 | // std::codecvt_utf16 is defined to work with UTF-16-encoded 26 | // byte strings, which is usually massively inconvenient but actually 27 | // does exactly what we want in this one particular instance 28 | 29 | using codecvt = std::codecvt_utf16; 31 | 32 | std::u16string u16 = u"" TEST_STRING; 33 | std::u32string u32 = U"" TEST_STRING; 34 | 35 | std::string test = u16 | tcb::utf_ranges::view::bytes; 36 | // Reference bytes: the UTF-32 literal serialised through the codecvt alias declared above. 37 | std::string u16bytes = std::wstring_convert{}.to_bytes(u32); 38 | 39 | REQUIRE(test == u16bytes); 40 | 41 | //REQUIRE(ranges::equal(test, u16bytes)); 42 | } -------------------------------------------------------------------------------- /test/catch_main.cpp: -------------------------------------------------------------------------------- 1 | // Defining CATCH_CONFIG_MAIN before including catch.hpp makes Catch supply main() in this file. 2 | #define CATCH_CONFIG_MAIN 3 | #include "catch.hpp" -------------------------------------------------------------------------------- /test/convert_test.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tcbrindle/utf_ranges/ff5a0b1a4e7a9a2f6c4ec661989a52080f9396ab/test/convert_test.cpp 
-------------------------------------------------------------------------------- /test/endian_test.cpp: -------------------------------------------------------------------------------- 1 | 2 | // Copyright (c) 2016 Tristan Brindle (tcbrindle at gmail dot com) 3 | // Distributed under the Boost Software License, Version 1.0. (See accompanying 4 | // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) 5 | 6 | #include "catch.hpp" 7 | 8 | #include 9 | // Builds a std::basic_string by passing every element of `in` through boost::endian::native_to_little. 10 | const auto to_little_endian = [] (const auto& in) { 11 | using char_type = ranges::range_value_t; 12 | std::basic_string out; 13 | for (auto c : in) { 14 | out.push_back(boost::endian::native_to_little( 15 | tcb::utf_ranges::detail::make_swap_wrapper(c)).value); 16 | } 17 | return out; 18 | }; 19 | // Same as to_little_endian, but each element goes through boost::endian::native_to_big. 20 | const auto to_big_endian = [] (const auto& in) { 21 | using char_type = ranges::range_value_t; 22 | std::basic_string out; 23 | for (auto c : in) { 24 | out.push_back(boost::endian::native_to_big( 25 | tcb::utf_ranges::detail::make_swap_wrapper(c)).value); 26 | } 27 | return out; 28 | }; 29 | 30 | // No trailing ';' in the macro body: each use below supplies its own terminator. 31 | #define TEST_STRING "123456" 32 | 33 | // Native endian test strings 34 | const std::string test_string8n = u8"" TEST_STRING; 35 | const std::u16string test_string16n = u"" TEST_STRING; 36 | const std::u32string test_string32n = U"" TEST_STRING; 37 | const std::wstring test_stringwn = L"" TEST_STRING; 38 | 39 | // Little endian test strings 40 | const std::string test_string8l = test_string8n; 41 | const std::u16string test_string16l = to_little_endian(test_string16n); 42 | const std::u32string test_string32l = to_little_endian(test_string32n); 43 | const std::wstring test_stringwl = to_little_endian(test_stringwn); 44 | 45 | // Big endian test strings 46 | const std::string test_string8b = test_string8n; 47 | const std::u16string test_string16b = to_big_endian(test_string16n); 48 | const std::u32string test_string32b = to_big_endian(test_string32n); 49 | const std::wstring test_stringwb = 
to_big_endian(test_stringwn); 50 | 51 | using tcb::utf_ranges::view::endian_convert; 52 | using namespace boost::endian; 53 | 54 | TEST_CASE("Byte swap native-to-native works", "[endian]") 55 | { 56 | SECTION("...for UTF-8") { 57 | const std::string test = endian_convert(test_string8n); 58 | REQUIRE(test == test_string8n); 59 | } 60 | 61 | SECTION("...for UTF-16") { 62 | const std::u16string test = endian_convert(test_string16n); 63 | REQUIRE(test == test_string16n); 64 | } 65 | 66 | SECTION("...for UTF-32") { 67 | const std::u32string test = endian_convert(test_string32n); 68 | REQUIRE(test == test_string32n); 69 | } 70 | 71 | SECTION("... for wide chars") { 72 | const std::wstring test = endian_convert(test_stringwn); 73 | REQUIRE(test == test_stringwn); 74 | } 75 | } 76 | 77 | TEST_CASE("Byte swap native-to-little works", "[endian]") 78 | { 79 | SECTION("...for UTF-8") { 80 | const std::string test = endian_convert(test_string8n); 81 | REQUIRE(test == test_string8n); 82 | } 83 | 84 | SECTION("...for UTF-16") { 85 | const std::u16string test = endian_convert(test_string16n); 86 | REQUIRE(test == test_string16n); 87 | } 88 | 89 | SECTION("...for UTF-32") { 90 | const std::u32string test = endian_convert(test_string32n); 91 | REQUIRE(test == test_string32n); 92 | } 93 | 94 | SECTION("... for wide chars") { 95 | const std::wstring test = endian_convert(test_stringwn); 96 | REQUIRE(test == test_stringwn); 97 | } 98 | } 99 | 100 | TEST_CASE("Byte swap native-to-big works", "[endian]") 101 | { 102 | SECTION("...for UTF-8") { 103 | const std::string test = endian_convert(test_string8n); 104 | REQUIRE(test == test_string8b); 105 | } 106 | 107 | SECTION("...for UTF-16") { 108 | const std::u16string test = endian_convert(test_string16n); 109 | REQUIRE(test == test_string16b); 110 | } 111 | 112 | SECTION("...for UTF-32") { 113 | const std::u32string test = endian_convert(test_string32n); 114 | REQUIRE(test == test_string32b); 115 | } 116 | 117 | SECTION("... 
for wide chars") { 118 | const std::wstring test = endian_convert(test_stringwn); 119 | REQUIRE(test == test_stringwb); 120 | } 121 | } 122 | // The little-endian fixtures are passed with order::little; the result is compared with the native-order fixtures. 123 | TEST_CASE("Byte swap little-to-native works", "[endian]") 124 | { 125 | SECTION("...for UTF-8") { 126 | const std::string test = endian_convert(test_string8l, 127 | order::little); 128 | REQUIRE(test == test_string8n); 129 | } 130 | 131 | SECTION("...for UTF-16") { 132 | const std::u16string test = endian_convert(test_string16l, 133 | order::little); 134 | REQUIRE(test == test_string16n); 135 | } 136 | 137 | SECTION("...for UTF-32") { 138 | const std::u32string test = endian_convert(test_string32l, 139 | order::little); 140 | REQUIRE(test == test_string32n); 141 | } 142 | 143 | SECTION("... for wide chars") { 144 | const std::wstring test = endian_convert(test_stringwl, 145 | order::little); 146 | REQUIRE(test == test_stringwn); 147 | } 148 | } 149 | // Little-endian fixtures in, little-endian fixtures expected: the data must come back unchanged. 150 | TEST_CASE("Byte swap little-to-little works", "[endian]") 151 | { 152 | SECTION("...for UTF-8") { 153 | const std::string test = endian_convert(test_string8l, 154 | order::little); 155 | REQUIRE(test == test_string8l); 156 | } 157 | 158 | SECTION("...for UTF-16") { 159 | const std::u16string test = endian_convert(test_string16l, 160 | order::little); 161 | REQUIRE(test == test_string16l); 162 | } 163 | 164 | SECTION("...for UTF-32") { 165 | const std::u32string test = endian_convert(test_string32l, 166 | order::little); 167 | REQUIRE(test == test_string32l); 168 | } 169 | 170 | SECTION("... 
for wide chars") { 171 | const std::wstring test = endian_convert(test_stringwl, 172 | order::little); 173 | REQUIRE(test == test_stringwl); 174 | } 175 | } 176 | // Little-endian fixtures in (order::little), big-endian fixtures expected. 177 | TEST_CASE("Byte swap little-to-big works", "[endian]") 178 | { 179 | SECTION("...for UTF-8") { 180 | const std::string test = endian_convert(test_string8l, 181 | order::little); 182 | REQUIRE(test == test_string8b); 183 | } 184 | 185 | SECTION("...for UTF-16") { 186 | const std::u16string test = endian_convert(test_string16l, 187 | order::little); 188 | REQUIRE(test == test_string16b); 189 | } 190 | 191 | SECTION("...for UTF-32") { 192 | const std::u32string test = endian_convert(test_string32l, 193 | order::little); 194 | REQUIRE(test == test_string32b); 195 | } 196 | 197 | SECTION("... for wide chars") { 198 | const std::wstring test = endian_convert(test_stringwl, 199 | order::little); 200 | REQUIRE(test == test_stringwb); 201 | } 202 | } 203 | // Big-endian fixtures in (order::big), native-order fixtures expected. 204 | TEST_CASE("Byte swap big-to-native works", "[endian]") 205 | { 206 | SECTION("...for UTF-8") { 207 | const std::string test = endian_convert(test_string8b, 208 | order::big); 209 | REQUIRE(test == test_string8n); 210 | } 211 | 212 | SECTION("...for UTF-16") { 213 | const std::u16string test = endian_convert(test_string16b, 214 | order::big); 215 | REQUIRE(test == test_string16n); 216 | } 217 | 218 | SECTION("...for UTF-32") { 219 | const std::u32string test = endian_convert(test_string32b, 220 | order::big); 221 | REQUIRE(test == test_string32n); 222 | } 223 | 224 | SECTION("... 
for wide chars") { 225 | const std::wstring test = endian_convert(test_stringwb, 226 | order::big); 227 | REQUIRE(test == test_stringwn); 228 | } 229 | } 230 | // Big-endian fixtures in (order::big), little-endian fixtures expected. 231 | TEST_CASE("Byte swap big-to-little works", "[endian]") 232 | { 233 | SECTION("...for UTF-8") { 234 | const std::string test = endian_convert(test_string8b, 235 | order::big); 236 | REQUIRE(test == test_string8l); 237 | } 238 | 239 | SECTION("...for UTF-16") { 240 | const std::u16string test = endian_convert(test_string16b, 241 | order::big); 242 | REQUIRE(test == test_string16l); 243 | } 244 | 245 | SECTION("...for UTF-32") { 246 | const std::u32string test = endian_convert(test_string32b, 247 | order::big); 248 | REQUIRE(test == test_string32l); 249 | } 250 | 251 | SECTION("... for wide chars") { 252 | const std::wstring test = endian_convert(test_stringwb, 253 | order::big); 254 | REQUIRE(test == test_stringwl); 255 | } 256 | } 257 | // Big-endian fixtures in, big-endian fixtures expected: the data must come back unchanged. 258 | TEST_CASE("Byte swap big-to-big works", "[endian]") 259 | { 260 | SECTION("...for UTF-8") { 261 | const std::string test = endian_convert(test_string8b, 262 | order::big); 263 | REQUIRE(test == test_string8b); 264 | } 265 | 266 | SECTION("...for UTF-16") { 267 | const std::u16string test = endian_convert(test_string16b, 268 | order::big); 269 | REQUIRE(test == test_string16b); 270 | } 271 | 272 | SECTION("...for UTF-32") { 273 | const std::u32string test = endian_convert(test_string32b, 274 | order::big); 275 | REQUIRE(test == test_string32b); 276 | } 277 | 278 | SECTION("... for wide chars") { 279 | const std::wstring test = endian_convert(test_stringwb, 280 | order::big); 281 | REQUIRE(test == test_stringwb); 282 | } 283 | } -------------------------------------------------------------------------------- /test/istreambuf_range_test.cpp: -------------------------------------------------------------------------------- 1 | 2 | // Copyright (c) 2016 Tristan Brindle (tcbrindle at gmail dot com) 3 | // Distributed under the Boost Software License, Version 1.0. 
(See accompanying 4 | // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) 5 | 6 | #include "catch.hpp" 7 | 8 | #include 9 | #include 10 | 11 | #include 12 | 13 | #define TEST_STRING "$€0123456789你好abcdefghijklmnopqrstyvwxyz\U0001F60E" 14 | // Each test case needs its own name: Catch rejects two TEST_CASEs registered under the same name. 15 | TEST_CASE("Basic istreambuf_range test for char", "[istreambuf_range]") 16 | { 17 | std::istringstream ss{TEST_STRING}; 18 | 19 | auto rng = tcb::utf_ranges::istreambuf(ss); 20 | 21 | // The stringstream eats the final NUL of the string literal, so we have to 22 | // wrap it in a string to do the same (although string_view would do too) 23 | REQUIRE(ranges::equal(rng, std::string(TEST_STRING))); 24 | } 25 | 26 | // Same check with a stream built from the u"" literal, compared against a std::u16string. 27 | TEST_CASE("Basic istreambuf_range test for char16_t", "[istreambuf_range]") 28 | { 29 | std::basic_istringstream ss{u"" TEST_STRING}; 30 | 31 | auto rng = tcb::utf_ranges::istreambuf(ss); 32 | 33 | REQUIRE(ranges::equal(rng, std::u16string{u"" TEST_STRING})); 34 | } 35 | // Same check with a stream built from the U"" literal, compared against a std::u32string. 36 | TEST_CASE("Basic istreambuf_range test for char32_t", "[istreambuf_range]") 37 | { 38 | std::basic_istringstream ss{U"" TEST_STRING}; 39 | 40 | auto rng = tcb::utf_ranges::istreambuf(ss); 41 | 42 | REQUIRE(ranges::equal(rng, std::u32string{U"" TEST_STRING})); 43 | } 44 | -------------------------------------------------------------------------------- /test/line_end_transform_test.cpp: -------------------------------------------------------------------------------- 1 | 2 | // Copyright (c) 2016 Tristan Brindle (tcbrindle at gmail dot com) 3 | // Distributed under the Boost Software License, Version 1.0. 
(See accompanying 4 | // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) 5 | 6 | #include "catch.hpp" 7 | 8 | #include 9 | 10 | #include 11 | // Eight space-separated line terminators; "\r\n" is a single two-character entry. 12 | #define TEST_STRING "\n \r \r\n \u0085 \u000b \u000c \u2028 \u2029" 13 | // After the transform the view must contain exactly eight '\n' characters, one per entry in TEST_STRING. 14 | TEST_CASE("Line end transformations work as expected", "[line_end]") 15 | { 16 | const std::string str = u8"" TEST_STRING; 17 | 18 | auto v = tcb::utf_ranges::view::line_end_transform(str); 19 | 20 | static_assert(ranges::ForwardRange(), ""); 21 | 22 | REQUIRE(ranges::count_if(v, [](char c) { return c == '\n'; }) == 8); 23 | } -------------------------------------------------------------------------------- /test/ostreambuf_iterator_test.cpp: -------------------------------------------------------------------------------- 1 | 2 | // Copyright (c) 2016 Tristan Brindle (tcbrindle at gmail dot com) 3 | // Distributed under the Boost Software License, Version 1.0. (See accompanying 4 | // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) 5 | 6 | #include "catch.hpp" 7 | 8 | #include 9 | 10 | #include 11 | #include 12 | 13 | const std::string test_str = u8"$€0123456789你好abcdefghijklmnopqrstyvwxyz\U0001F60E"; 14 | // Copies test_str into a std::ostringstream through the iterator and expects the stream contents to equal the source. 15 | TEST_CASE("ostreambuf_iterator works as expected", "[ostreambuf_iterator]") 16 | { 17 | std::ostringstream ss; 18 | 19 | ranges::copy(test_str, tcb::utf_ranges::ostreambuf_iterator(ss)); 20 | 21 | REQUIRE(ss.str() == test_str); 22 | } -------------------------------------------------------------------------------- /test/utf_convert_view_test.cpp: -------------------------------------------------------------------------------- 1 | 2 | // Copyright (c) 2016 Tristan Brindle (tcbrindle at gmail dot com) 3 | // Distributed under the Boost Software License, Version 1.0. 
(See accompanying 4 | // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) 5 | 6 | #include "catch.hpp" 7 | 8 | #include 9 | #include 10 | // Pick the string_view family: the std::experimental names when the __has_include check succeeds, otherwise the Boost ones. 11 | #if __has_include() 12 | #include 13 | using std::experimental::string_view; 14 | using std::experimental::u16string_view; 15 | using std::experimental::u32string_view; 16 | using std::experimental::wstring_view; 17 | #else 18 | #include 19 | using boost::string_view; 20 | using u16string_view = boost::basic_string_view; 21 | using u32string_view = boost::basic_string_view; 22 | using boost::wstring_view; 23 | #endif 24 | 25 | using namespace tcb::utf_ranges; 26 | 27 | #define TEST_STRING "$€0123456789你好abcdefghijklmnopqrstyvwxyz\U0001F60E" 28 | 29 | /* 30 | * Default construction 31 | */ 32 | // A value-initialised utf_convert_view must report empty(). 33 | TEST_CASE("utf_convert_view can be default constructed", "[view]") 34 | { 35 | constexpr char str[] = u8"" TEST_STRING; 36 | const auto v = tcb::utf_ranges::utf_convert_view< 37 | rng::view::all_t, char, char32_t>{}; 38 | static_assert(rng::ForwardRange(), ""); 39 | REQUIRE(v.empty()); 40 | } 41 | 42 | /* 43 | * Handling empty ranges 44 | */ 45 | // Each case wraps a default-constructed string_view of the source character type and expects the converting view to be empty. 46 | TEST_CASE("UTF-8 -> UTF-8 view handles empty ranges", "[view]") 47 | { 48 | constexpr string_view str{}; 49 | const auto v = view::utf8(str); 50 | REQUIRE(v.empty()); 51 | } 52 | 53 | TEST_CASE("UTF-8 -> UTF-16 view handles empty ranges", "[view]") 54 | { 55 | constexpr string_view str{}; 56 | const auto v = view::utf16(str); 57 | REQUIRE(v.empty()); 58 | } 59 | 60 | 61 | TEST_CASE("UTF-8 -> UTF-32 view handles empty ranges", "[view]") 62 | { 63 | constexpr string_view str{}; 64 | const auto v = view::utf32(str); 65 | REQUIRE(v.empty()); 66 | } 67 | 68 | TEST_CASE("UTF-16 -> UTF-8 view handles empty ranges", "[view]") 69 | { 70 | constexpr u16string_view str{}; 71 | const auto v = view::utf8(str); 72 | REQUIRE(v.empty()); 73 | } 74 | 75 | TEST_CASE("UTF-16 -> UTF-16 view handles empty ranges", "[view]") 76 | { 77 | constexpr u16string_view str{}; 78 | const auto 
v = view::utf16(str); 79 | REQUIRE(v.empty()); 80 | } 81 | 82 | 83 | TEST_CASE("UTF-16 -> UTF-32 view handles empty ranges", "[view]") 84 | { 85 | constexpr u16string_view str{}; 86 | const auto v = view::utf32(str); 87 | REQUIRE(v.empty()); 88 | } 89 | 90 | TEST_CASE("UTF-32 -> UTF-8 view handles empty ranges", "[view]") 91 | { 92 | constexpr u32string_view str{}; 93 | const auto v = view::utf8(str); 94 | REQUIRE(v.empty()); 95 | } 96 | 97 | TEST_CASE("UTF-32 -> UTF-16 view handles empty ranges", "[view]") 98 | { 99 | constexpr u32string_view str{}; 100 | const auto v = view::utf16(str); 101 | REQUIRE(v.empty()); 102 | } 103 | 104 | TEST_CASE("UTF-32 -> UTF-32 view handles empty ranges", "[view]") 105 | { 106 | constexpr u32string_view str{}; 107 | const auto v = view::utf32(str); 108 | REQUIRE(v.empty()); 109 | } 110 | // The same empty-input checks with a wstring_view source. 111 | TEST_CASE("wchar -> UTF-8 view handles empty ranges", "[view]") 112 | { 113 | constexpr wstring_view str{}; 114 | const auto v = view::utf8(str); 115 | REQUIRE(v.empty()); 116 | } 117 | 118 | TEST_CASE("wchar -> UTF-16 view handles empty ranges", "[view]") 119 | { 120 | constexpr wstring_view str{}; 121 | const auto v = view::utf16(str); 122 | REQUIRE(v.empty()); 123 | } 124 | 125 | TEST_CASE("wchar -> UTF-32 view handles empty ranges", "[view]") 126 | { 127 | constexpr wstring_view str{}; 128 | const auto v = view::utf32(str); 129 | REQUIRE(v.empty()); 130 | } 131 | 132 | /* 133 | * Valid (compiler-generated) UTF 134 | */ 135 | // Source and expected arrays are literals of the same TEST_STRING with different prefixes, so the compiler provides the reference encoding. 136 | TEST_CASE("UTF-8 -> UTF-8 view handles valid UTF correctly", "[view]") 137 | { 138 | constexpr char str[] = u8"" TEST_STRING; 139 | const auto v = view::utf8(str); 140 | REQUIRE(rng::equal(str, v)); 141 | } 142 | 143 | TEST_CASE("UTF-8 -> UTF-16 view handles valid UTF correctly", "[view]") 144 | { 145 | constexpr char str[] = u8"" TEST_STRING; 146 | constexpr char16_t check[] = u"" TEST_STRING; 147 | const auto v = view::utf16(str); 148 | REQUIRE(rng::equal(check, v)); 149 | } 150 | 151 | TEST_CASE("UTF-8 -> 
UTF-32 view handles valid UTF correctly", "[view]") 152 | { 153 | constexpr char str[] = u8"" TEST_STRING; 154 | constexpr char32_t check[] = U"" TEST_STRING; 155 | const auto v = view::utf32(str); 156 | REQUIRE(rng::equal(check, v)); 157 | } 158 | // UTF-16 (u"" literal) sources, checked against the u8 / u / U literal of the same text. 159 | TEST_CASE("UTF-16 -> UTF-8 view handles valid UTF correctly", "[view]") 160 | { 161 | constexpr char16_t str[] = u"" TEST_STRING; 162 | constexpr char check[] = u8"" TEST_STRING; 163 | const auto v = view::utf8(str); 164 | REQUIRE(rng::equal(check, v)); 165 | } 166 | 167 | TEST_CASE("UTF-16 -> UTF-16 view handles valid UTF correctly", "[view]") 168 | { 169 | constexpr char16_t str[] = u"" TEST_STRING; 170 | constexpr char16_t check[] = u"" TEST_STRING; 171 | const auto v = view::utf16(str); 172 | REQUIRE(rng::equal(check, v)); 173 | } 174 | 175 | TEST_CASE("UTF-16 -> UTF-32 view handles valid UTF correctly", "[view]") 176 | { 177 | constexpr char16_t str[] = u"" TEST_STRING; 178 | constexpr char32_t check[] = U"" TEST_STRING; 179 | const auto v = view::utf32(str); 180 | REQUIRE(rng::equal(check, v)); 181 | } 182 | // UTF-32 (U"" literal) sources. 183 | TEST_CASE("UTF-32 -> UTF-8 view handles valid UTF correctly", "[view]") 184 | { 185 | constexpr char32_t str[] = U"" TEST_STRING; 186 | constexpr char check[] = u8"" TEST_STRING; 187 | const auto v = view::utf8(str); 188 | REQUIRE(rng::equal(check, v)); 189 | } 190 | 191 | TEST_CASE("UTF-32 -> UTF-16 view handles valid UTF correctly", "[view]") 192 | { 193 | constexpr char32_t str[] = U"" TEST_STRING; 194 | constexpr char16_t check[] = u"" TEST_STRING; 195 | const auto v = view::utf16(str); 196 | REQUIRE(rng::equal(check, v)); 197 | } 198 | 199 | TEST_CASE("UTF-32 -> UTF-32 view handles valid UTF correctly", "[view]") 200 | { 201 | constexpr char32_t str[] = U"" TEST_STRING; 202 | constexpr char32_t check[] = U"" TEST_STRING; 203 | const auto v = view::utf32(str); 204 | REQUIRE(rng::equal(check, v)); 205 | } 206 | // wchar_t (L"" literal) sources. 207 | TEST_CASE("wchar -> UTF-8 view handles valid UTF correctly", "[view]") 208 | { 
209 | constexpr wchar_t str[] = L"" TEST_STRING; 210 | constexpr char check[] = u8"" TEST_STRING; 211 | const auto v = view::utf8(str); 212 | REQUIRE(rng::equal(check, v)); 213 | } 214 | 215 | TEST_CASE("wchar -> UTF-16 view handles valid UTF correctly", "[view]") 216 | { 217 | constexpr wchar_t str[] = L"" TEST_STRING; 218 | constexpr char16_t check[] = u"" TEST_STRING; 219 | const auto v = view::utf16(str); 220 | REQUIRE(rng::equal(check, v)); 221 | } 222 | 223 | TEST_CASE("wchar -> UTF-32 view handles valid UTF correctly", "[view]") 224 | { 225 | constexpr wchar_t str[] = L"" TEST_STRING; 226 | constexpr char32_t check[] = U"" TEST_STRING; 227 | const auto v = view::utf32(str); 228 | REQUIRE(rng::equal(check, v)); 229 | } 230 | // Collects view::utf_convert output into a std::vector and compares it with the char16_t literal. NOTE(review): the vector's element type and the conversion's template arguments are not visible in this listing - confirm against the real file. 231 | TEST_CASE("Non-character conversions work as expected", "[view]") 232 | { 233 | constexpr char str[] = u8"" TEST_STRING; 234 | constexpr char16_t check[] = u"" TEST_STRING; 235 | std::vector vec = view::utf_convert(str); 236 | REQUIRE(rng::equal(check, vec)); 237 | } 238 | 239 | --------------------------------------------------------------------------------