├── .gitignore
├── .gitmodules
├── CMakeLists.txt
├── LICENSE_1_0.txt
├── README.md
├── benchmark
    ├── CMakeLists.txt
    ├── benchmark.cpp
    ├── utf8.h
    └── utf8
    │   ├── checked.h
    │   ├── core.h
    │   └── unchecked.h
├── example
    ├── CMakeLists.txt
    └── utf8_to_utf16be.cpp
├── include
    └── tcb
    │   └── utf_ranges
    │       ├── convert.hpp
    │       ├── detail
    │           └── utf.hpp
    │       ├── istreambuf_range.hpp
    │       ├── ostreambuf_iterator.hpp
    │       ├── view.hpp
    │       └── view
    │           ├── bom.hpp
    │           ├── bytes.hpp
    │           ├── endian_convert.hpp
    │           ├── line_end_transform.hpp
    │           └── utf_convert.hpp
└── test
    ├── CMakeLists.txt
    ├── bom_test.cpp
    ├── bytes_test.cpp
    ├── catch.hpp
    ├── catch_main.cpp
    ├── convert_test.cpp
    ├── endian_test.cpp
    ├── istreambuf_range_test.cpp
    ├── line_end_transform_test.cpp
    ├── ostreambuf_iterator_test.cpp
    └── utf_convert_view_test.cpp


/.gitignore:
--------------------------------------------------------------------------------
 1 | 
 2 | build*/
 3 | 
 4 | # QtCreator
 5 | CMakeLists.txt.user
 6 | 
 7 | # CLion
 8 | .idea/
 9 | 
10 | 
11 | 


--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
1 | [submodule "external/range-v3"]
2 | 	path = external/range-v3
3 | 	url = https://github.com/ericniebler/range-v3.git
4 | 


--------------------------------------------------------------------------------
/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | 
 2 | cmake_minimum_required(VERSION 3.1)
 3 | 
 4 | project(utf_ranges CXX)
 5 | 
 6 | # The sources need C++14. Make that a hard requirement so an older compiler
 7 | # fails at configure time instead of silently building with an earlier standard.
 8 | set(CMAKE_CXX_STANDARD 14)
 9 | set(CMAKE_CXX_STANDARD_REQUIRED ON)
10 | 
11 | # -Werror is applied to Debug builds by default (the historical behaviour);
12 | # configure with -DUTF_RANGES_WERROR=OFF to build with a compiler that emits
13 | # warnings this project has not been cleaned up for yet.
14 | option(UTF_RANGES_WERROR "Treat warnings as errors in Debug builds" ON)
15 | 
16 | if (UNIX)
17 |     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra")
18 |     if (UTF_RANGES_WERROR)
19 |         set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -Werror")
20 |     endif()
21 |     set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -O2")
22 | endif() # UNIX
23 | 
24 | if (WIN32)
25 |     # GCC on Windows (e.g. MinGW) takes the same warning flags as above.
26 |     if (CMAKE_COMPILER_IS_GNUCXX)
27 |         set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra")
28 |         if (UTF_RANGES_WERROR)
29 |             set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -Werror")
30 |         endif()
31 |     endif()
32 |     # Stop <windows.h> from defining min()/max() macros that break std::min/max.
33 |     add_definitions("-DNOMINMAX")
34 | endif()
35 | 
36 | find_package(Boost COMPONENTS system REQUIRED)
37 | 
38 | # Headers of this library; directory-scoped, so subdirectories inherit it.
39 | include_directories(include)
40 | 
41 | # range-v3 is vendored as a git submodule (see .gitmodules) and this path is
42 | # consumed by benchmark/CMakeLists.txt. Warn early if it is not checked out.
43 | set(RANGE_INCLUDE_DIR "${utf_ranges_SOURCE_DIR}/external/range-v3/include")
44 | if (NOT EXISTS "${RANGE_INCLUDE_DIR}")
45 |     message(WARNING
46 |         "range-v3 headers not found in ${RANGE_INCLUDE_DIR}; "
47 |         "run 'git submodule update --init' in the source tree.")
48 | endif()
49 | 
50 | add_subdirectory(benchmark)
51 | add_subdirectory(example)
52 | add_subdirectory(test)


--------------------------------------------------------------------------------
/LICENSE_1_0.txt:
--------------------------------------------------------------------------------
 1 | Boost Software License - Version 1.0 - August 17th, 2003
 2 | 
 3 | Permission is hereby granted, free of charge, to any person or organization
 4 | obtaining a copy of the software and accompanying documentation covered by
 5 | this license (the "Software") to use, reproduce, display, distribute,
 6 | execute, and transmit the Software, and to prepare derivative works of the
 7 | Software, and to permit third-parties to whom the Software is furnished to
 8 | do so, all subject to the following:
 9 | 
10 | The copyright notices in the Software and this entire statement, including
11 | the above license grant, this restriction and the following disclaimer,
12 | must be included in all copies of the Software, in whole or in part, and
13 | all derivative works of the Software, unless such copies or derivative
14 | works are solely in the form of machine-executable object code generated by
15 | a source language processor.
16 | 
17 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 | FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
20 | SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
21 | FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
22 | ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
23 | DEALINGS IN THE SOFTWARE.
24 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | 
  2 | # utf_ranges
  3 | 
  4 | A collection of Unicode utilities for C++ using [Range-V3](https://github.com/ericniebler/range-v3)
  5 | 
  6 | This header-only library contains facilities for transforming between UTF-8, UTF-16 and UTF-32 encoded strings (eagerly and lazily), as well as dealing with byte-order marks and transforming line endings.
  7 | 
  8 | ## Example
  9 | 
 10 | A quick overview is best supplied by an example. The following reads a UTF-8 encoded input stream and outputs a UTF-16BE byte stream with byte-order mark:
 11 | 
 12 | ```cpp
 13 |     namespace rng = ::ranges::v3;
 14 |     namespace utf = ::tcb::utf_ranges;
 15 | 
 16 |     std::ifstream in_file{"input_file.utf8.txt", std::ios::binary};
 17 |     std::ofstream out_file{"output_file.utf16be.txt", std::ios::binary};
 18 | 
 19 |     auto view = utf::istreambuf(in_file) // Read range from input stream
 20 |             | utf::view::consume_bom     // Remove UTF-8 "BOM" if present
 21 |             | utf::view::utf16           // Convert to UTF-16
 22 |             | utf::view::add_bom         // Prepend UTF-16 BOM to start of range
 23 |             | utf::view::endian_convert<boost::endian::order::big> // Convert to big-endian
 24 |             | utf::view::bytes;          // Write to disk as bytes
 25 | 
 26 |     rng::copy(view, utf::ostreambuf_iterator<char>{out_file}); // Do the copy
 27 | ```
 28 | 
 29 | (see example/utf8_to_utf16be.cpp for the full code).
 30 | 
 31 | ## Conversions
 32 | 
 33 | For "eager" encoding conversions, the library broadly follows the API specified in [Beman Dawes' proposed Unicode conversion library](https://github.com/Beman/unicode/tree/std-proposal), albeit (currently) with simplified error handling (invalid Unicode characters are simply replaced by the Unicode replacement character U+FFFD). The actual conversion uses code taken from Boost.Locale.
 34 | 
 35 | To convert a range of characters between UTF-8, UTF-16 or UTF-32, use the `tcb::utf_ranges::utf_convert()` function. This takes an `InputRange` with a value type that is an arithmetic type of size 1, 2 or 4 bytes (for UTF-8, UTF-16 and UTF-32 respectively), and an `OutputIterator` with a value type similarly defined. For example:
 36 | 
 37 | ```cpp
 38 | std::string in = u8"Hello world";
 39 | std::u16string out;
 40 | // Note that the output type cannot be determined automatically, so must be specified
 41 | tcb::utf_ranges::utf_convert<char16_t>(in, std::back_inserter(out));
 42 | ```
 43 | 
 44 | To transform directly to a new string, the `to_utf_string()` function is supplied:
 45 | 
 46 | ```cpp
 47 | std::u16string in = u"Hello world";
 48 | std::string out = tcb::utf_ranges::to_utf_string<char>(in);
 49 | ```
 50 | 
 51 | Convenience functions `to_u8string()`, `to_u16string()`, `to_u32string()` and `to_wstring()` are also provided (but please don't use the last one):
 52 | 
 53 | ```cpp
 54 | std::u32string in = U"Hello world";
 55 | std::u16string out = tcb::utf_ranges::to_u16string(in);
 56 | ```
 57 | 
 58 | ## Views
 59 | 
 60 | If you're familiar with Range-V3, you'll know that views perform lazy transformations on a given range -- that is, conversion is done one element at a time when the view is iterated over.
 61 | 
 62 | ### Encoding conversions
 63 | 
 64 | This library provides views which lazily perform the same transformations as above. For consistency with Range-V3, these are in the `view::` sub-namespace.
 65 | 
 66 | ```cpp
 67 | std::u16string in = u"Hello world";
 68 | 
 69 | auto view = tcb::utf_ranges::view::utf8(in);
 70 | 
 71 | ranges::v3::copy(view, std::ostream_iterator<char>(std::cout));
 72 | ```
 73 | 
 74 | There are similar `utf16` and `utf32` views.
 75 | 
 76 | ### Endian transformations
 77 | 
 78 | For UTF-16 and UTF-32, the library provides views which perform byte-swapping between native-, big- and little-endian representations, using code from Boost. The output endianness is specified by a template parameter, and the input endianness is passed as an argument to the constructor. Both default to `boost::endian::native`. For example:
 79 | 
 80 | ```
 81 | std::u16string in = u"Hello world"; // native endian
 82 | auto view = tcb::utf_ranges::view::endian_convert<boost::endian::order::big>(in);
 83 | std::vector<std::int16_t> out = view; // Copy byte-swapped values to vector
 84 | ```
 85 | 
 86 | ### Byte order mark handling
 87 | 
 88 | The library provides two views for dealing with "byte order marks", that is, the Unicode zero-width no-break space character U+FEFF which is often placed at the start of files to allow the endianness to be detected.
 89 | 
 90 | To detect a byte-order mark, use the `consume_bom` view:
 91 | 
 92 | ```
 93 | std::u16string in = u"\uFEFFHello world"; // native-endian UTF-16 with BOM
 94 | auto view = tcb::utf_ranges::view::consume_bom(in);
 95 | std::u16string out = view; // copy to new string with BOM removed
 96 | ```
 97 | 
 98 | As suggested by the name, the byte order mark is removed if present. If a BOM is found and has non-native endianness, endian conversion is automatically performed -- that is, the output of the view will always be native-endian. For UTF-8, if a BOM is detected it is simply removed. If no BOM is present, the string is assumed to be native-endian (for UTF-16 and -32), and is passed through unchanged.
 99 | 
100 | To place a byte-order mark at the start of a string, use the `add_bom` view:
101 | 
102 | ```
103 | std::u16string in = u"Hello world";
104 | auto view = tcb::utf_ranges::view::add_bom(in);
105 | std::u16string out = view; // copy to new string, with BOM prepended
106 | ```
107 | 
108 | ### Line ending transformation
109 | 
110 | Unicode specifies eight possible line endings, and recommends that these are converted to the machine native line ending representation on input. In C++, the native representation is "\n". The `line_end_transform` view performs such a conversion. For example:
111 | 
112 | ```cpp
113 | std::string in = u8"Hello world\r\n"; // Windows-style
114 | std::string out = tcb::utf_ranges::view::line_end_transform(in);
115 | assert(out == "Hello world\n");
116 | ```
117 | 
118 | ### Chaining views
119 | 
120 | As with Range-V3, `operator|` is overloaded for views, allowing them to be easily concatenated together, as in the example above.
121 | 
122 | ## Licence
123 | 
124 | This library is provided under the Boost licence. See LICENSE_1_0.txt for details.
125 | 


--------------------------------------------------------------------------------
/benchmark/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | 
 2 | 
 3 | # Boost.Locale is only needed by the benchmark. The lookup is deliberately not
 4 | # REQUIRED, so a missing component skips this target (the else() branch below)
 5 | # instead of aborting the configure step for the whole project.
 6 | find_package(Boost COMPONENTS locale)
 7 | 
 8 | if (Boost_FOUND)
 9 |     add_executable(benchmark benchmark.cpp)
10 | 
11 |     # Boost_INCLUDE_DIRS is the documented result variable of find_package(Boost).
12 |     target_include_directories(benchmark PRIVATE
13 |         ${RANGE_INCLUDE_DIR}
14 |         ${Boost_INCLUDE_DIRS}
15 |     )
16 | else()
17 |     message("Boost.Locale not found, skipping benchmark target")
18 | endif()


--------------------------------------------------------------------------------
/benchmark/benchmark.cpp:
--------------------------------------------------------------------------------
  1 | 
  2 | // Copyright (c) 2016 Tristan Brindle (tcbrindle at gmail dot com)
  3 | // Distributed under the Boost Software License, Version 1.0. (See accompanying
  4 | // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
  5 | 
  6 | #include <cassert>
  7 | #include <chrono>
  8 | #include <codecvt>
  9 | #include <cstdlib>
 10 | #include <fstream>
 11 | #include <iostream>
 12 | #include <iterator>
 13 | #include <locale>
 14 | #include <string>
 15 | #include <utility>
 16 | 
 17 | #include "utf8.h"
 18 | #include <boost/locale/encoding_utf.hpp>
 19 | #include <tcb/utf_ranges/view/utf_convert.hpp>
 20 | #include <tcb/utf_ranges/convert.hpp>
 21 | 
 22 | using std::string;
 23 | using std::u16string;
 24 | using std::u32string;
 25 | 
 26 | namespace {
 27 | 
 28 | // Stopwatch started at construction. steady_clock is monotonic, so readings
 29 | // are unaffected by system clock adjustments made while a benchmark runs.
 30 | struct timer {
 31 |     using clock = std::chrono::steady_clock;
 32 |     using time_point = clock::time_point;
 33 | 
 34 |     timer() = default;
 35 | 
 36 |     // Time since construction, converted to duration type T (default: ms).
 37 |     template <typename T = std::chrono::milliseconds>
 38 |     T elapsed() const { return std::chrono::duration_cast<T>(clock::now() - start_); }
 39 | 
 40 | private:
 41 |     time_point start_ = clock::now();  // captured when the timer is created
 42 | };
 43 | 
 44 | // Calls f(arg) n times and prints the total elapsed milliseconds under label.
 45 | // arg is always passed as an lvalue so an rvalue is never moved-from twice.
 46 | template <typename Func, typename Arg>
 47 | void time_function_call(Func f, Arg&& arg, int n, string label)
 48 | {
 49 |     timer stopwatch;
 50 |     for (int i = 0; i < n; ++i) { volatile auto res = f(arg); }
 51 |     const auto total = stopwatch.elapsed();
 52 |     std::cout << label << " took " << total.count() << "ms\n";
 53 | }
 54 | 
 55 | /*
 56 |  * std::codecvt / std::wstring_convert: one wrapper per encoding pair
 57 |  */
 58 | 
 59 | // UTF-8 -> UTF-16
 60 | inline u16string codecvt_u8_to_u16(const string& u8)
 61 | {
 62 |     std::wstring_convert<std::codecvt_utf8_utf16<char16_t>, char16_t> converter;
 63 |     return converter.from_bytes(u8);
 64 | }
 65 | 
 66 | // UTF-8 -> UTF-32
 67 | inline u32string codecvt_u8_to_u32(const string& u8)
 68 | {
 69 |     std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> converter;
 70 |     return converter.from_bytes(u8);
 71 | }
 72 | 
 73 | // UTF-16 -> UTF-8
 74 | inline string codecvt_u16_to_u8(const u16string& u16)
 75 | {
 76 |     std::wstring_convert<std::codecvt_utf8_utf16<char16_t>, char16_t> converter;
 77 |     return converter.to_bytes(u16);
 78 | }
 79 | 
 80 | // UTF-16 -> UTF-32, routed through UTF-8
 81 | inline u32string codecvt_u16_to_u32(const u16string& u16)
 82 | {
 83 |     // std::codecvt_utf16<char32_t> sounds like the right facet, yet it
 84 |     // converts UTF-16 encoded *byte* strings, not char16_t sequences,
 85 |     // so it cannot be applied to a u16string directly.
 86 |     // Casting the buffer with reinterpret_cast<> would be ugly and
 87 |     // fragile, so the conversion is done in two hops instead:
 88 |     // UTF-16 -> UTF-8 -> UTF-32. The extra pass may be "unfair" to
 89 |     // codecvt as a benchmark, but it reflects what the API forces
 90 |     // callers to do.
 91 |     const string utf8_bytes = codecvt_u16_to_u8(u16);
 92 |     return codecvt_u8_to_u32(utf8_bytes);
 93 | }
 94 | 
 95 | // UTF-32 -> UTF-8
 96 | inline string codecvt_u32_to_u8(const u32string& u32)
 97 | {
 98 |     std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> converter;
 99 |     return converter.to_bytes(u32);
100 | }
101 | 
102 | // UTF-32 -> UTF-16, routed through UTF-8
103 | inline u16string codecvt_u32_to_u16(const u32string& u32)
104 | {
105 |     // Same limitation as in codecvt_u16_to_u32(): go via UTF-8 rather than
106 |     // pretend that a u16string is a UTF-16 byte string.
107 |     const string utf8_bytes = codecvt_u32_to_u8(u32);
108 |     return codecvt_u8_to_u16(utf8_bytes);
109 | }
110 | 
111 | /*
112 |  * cpputf8 (utf8:: namespace): one wrapper per encoding pair
113 |  */
114 | 
115 | // UTF-8 -> UTF-16
116 | inline u16string cpputf8_u8_to_u16(const string& u8)
117 | {
118 |     u16string converted;
119 |     utf8::utf8to16(u8.begin(), u8.end(), std::back_inserter(converted));
120 |     return converted;
121 | }
122 | 
123 | // UTF-8 -> UTF-32
124 | inline u32string cpputf8_u8_to_u32(const string& u8)
125 | {
126 |     u32string converted;
127 |     utf8::utf8to32(u8.begin(), u8.end(), std::back_inserter(converted));
128 |     return converted;
129 | }
130 | 
131 | // UTF-16 -> UTF-8
132 | inline string cpputf8_u16_to_u8(const u16string& u16)
133 | {
134 |     string converted;
135 |     utf8::utf16to8(u16.begin(), u16.end(), std::back_inserter(converted));
136 |     return converted;
137 | }
138 | 
139 | // UTF-16 -> UTF-32, routed through UTF-8
140 | inline u32string cpputf8_u16_to_u32(const u16string& u16)
141 | {
142 |     // The library is built around UTF-8 and offers no direct UTF-16 to
143 |     // UTF-32 routine, hence the intermediate UTF-8 string.
144 |     const string utf8_bytes = cpputf8_u16_to_u8(u16);
145 |     return cpputf8_u8_to_u32(utf8_bytes);
146 | }
147 | 
148 | // UTF-32 -> UTF-8
149 | inline string cpputf8_u32_to_u8(const u32string& u32)
150 | {
151 |     string converted;
152 |     utf8::utf32to8(u32.begin(), u32.end(), std::back_inserter(converted));
153 |     return converted;
154 | }
155 | 
156 | // UTF-32 -> UTF-16, routed through UTF-8
157 | inline u16string cpputf8_u32_to_u16(const u32string& u32)
158 | {
159 |     // No direct routine here either: two steps via UTF-8
160 |     const string utf8_bytes = cpputf8_u32_to_u8(u32);
161 |     return cpputf8_u8_to_u16(utf8_bytes);
162 | }
163 | 
164 | // Boost.Locale: each pair is a single boost::locale::conv::utf_to_utf<Target>() call
165 | 
166 | namespace boost_conv = boost::locale::conv;
167 | 
168 | // UTF-8 -> UTF-16
169 | inline u16string boost_u8_to_u16(const string& u8)
170 | {
171 |     return boost_conv::utf_to_utf<char16_t>(u8);
172 | }
173 | 
174 | // UTF-8 -> UTF-32
175 | inline u32string boost_u8_to_u32(const string& u8)
176 | {
177 |     return boost_conv::utf_to_utf<char32_t>(u8);
178 | }
179 | 
180 | // UTF-16 -> UTF-8
181 | inline string boost_u16_to_u8(const u16string& u16)
182 | {
183 |     return boost_conv::utf_to_utf<char>(u16);
184 | }
185 | 
186 | // UTF-16 -> UTF-32
187 | inline u32string boost_u16_to_u32(const u16string& u16)
188 | {
189 |     return boost_conv::utf_to_utf<char32_t>(u16);
190 | }
191 | 
192 | // UTF-32 -> UTF-8
193 | inline string boost_u32_to_u8(const u32string& u32)
194 | {
195 |     return boost_conv::utf_to_utf<char>(u32);
196 | }
197 | 
198 | // UTF-32 -> UTF-16
199 | inline u16string boost_u32_to_u16(const u32string& u32)
200 | {
201 |     return boost_conv::utf_to_utf<char16_t>(u32);
202 | }
203 | 
204 | // tcb::utf_ranges eager conversions: each pair is a single to_uNNstring() call
205 | 
206 | namespace eager = tcb::utf_ranges;
207 | 
208 | // UTF-8 -> UTF-16
209 | inline u16string range_u8_to_u16(const string& u8)
210 | {
211 |     return eager::to_u16string(u8);
212 | }
213 | 
214 | // UTF-8 -> UTF-32
215 | inline u32string range_u8_to_u32(const string& u8)
216 | {
217 |     return eager::to_u32string(u8);
218 | }
219 | 
220 | // UTF-16 -> UTF-8
221 | inline string range_u16_to_u8(const u16string& u16)
222 | {
223 |     return eager::to_u8string(u16);
224 | }
225 | 
226 | // UTF-16 -> UTF-32
227 | inline u32string range_u16_to_u32(const u16string& u16)
228 | {
229 |     return eager::to_u32string(u16);
230 | }
231 | 
232 | // UTF-32 -> UTF-8
233 | inline string range_u32_to_u8(const u32string& u32)
234 | {
235 |     return eager::to_u8string(u32);
236 | }
237 | 
238 | // UTF-32 -> UTF-16
239 | inline u16string range_u32_to_u16(const u32string& u32)
240 | {
241 |     return eager::to_u16string(u32);
242 | }
243 | 
244 | // tcb::utf_ranges views: the view is converted to the returned string type
245 | 
246 | namespace lazy = tcb::utf_ranges::view;
247 | 
248 | // UTF-8 -> UTF-16
249 | inline u16string range_view_u8_to_u16(const string& u8)
250 | {
251 |     return lazy::utf16(u8);
252 | }
253 | 
254 | // UTF-8 -> UTF-32
255 | inline u32string range_view_u8_to_u32(const string& u8)
256 | {
257 |     return lazy::utf32(u8);
258 | }
259 | 
260 | // UTF-16 -> UTF-8
261 | inline string range_view_u16_to_u8(const u16string& u16)
262 | {
263 |     return lazy::utf8(u16);
264 | }
265 | 
266 | // UTF-16 -> UTF-32
267 | inline u32string range_view_u16_to_u32(const u16string& u16)
268 | {
269 |     return lazy::utf32(u16);
270 | }
271 | 
272 | // UTF-32 -> UTF-8
273 | inline string range_view_u32_to_u8(const u32string& u32)
274 | {
275 |     return lazy::utf8(u32);
276 | }
277 | 
278 | // UTF-32 -> UTF-16
279 | inline u16string range_view_u32_to_u16(const u32string& u32)
280 | {
281 |     return lazy::utf16(u32);
282 | }
283 | 
284 | } // end anonymous namespace
285 | 
286 | // Benchmark driver: benchmark UTF8FILE [ITERATIONS]
287 | //
288 | // Loads UTF8FILE, derives UTF-16 and UTF-32 copies of its contents, then times
289 | // every conversion implementation ITERATIONS times (default 1). Returns 1 if
290 | // the arguments are unusable, the file cannot be opened, or cpputf8 rejects
291 | // its contents. Labels starting with '*' mark the two-step (via UTF-8) paths.
292 | int main(int argc, char** argv)
293 | {
294 |     if (argc < 2) {
295 |         std::cout << "Usage: benchmark UTF8FILE [ITERATIONS]\n";
296 |         return 1;
297 |     }
298 | 
299 |     // Validate the count up front: std::atoi() yields 0 for non-numeric text,
300 |     // which would otherwise "benchmark" zero iterations without complaint.
301 |     const int num_iterations = argc > 2 ? std::atoi(argv[2]) : 1;
302 |     if (num_iterations < 1) {
303 |         std::cerr << "benchmark: ITERATIONS must be a positive integer\n";
304 |         return 1;
305 |     }
306 | 
307 |     // Binary mode so the bytes are read exactly as stored, with no platform
308 |     // newline translation altering the input.
309 |     std::ifstream input(argv[1], std::ios::binary);
310 |     if (!input) {
311 |         std::cerr << "benchmark: cannot open '" << argv[1] << "'\n";
312 |         return 1;
313 |     }
314 |     const string u8str(std::istreambuf_iterator<char>{input},
315 |                        std::istreambuf_iterator<char>{});
316 | 
317 |     // The cpputf8 routines may throw on malformed input; report that
318 |     // instead of letting the exception terminate the program.
319 |     u16string u16str;
320 |     u32string u32str;
321 |     try {
322 |         u16str = cpputf8_u8_to_u16(u8str);
323 |         u32str = cpputf8_u8_to_u32(u8str);
324 |     }
325 |     catch (const utf8::exception& e) {
326 |         std::cerr << "benchmark: '" << argv[1] << "' is not valid UTF-8: "
327 |                   << e.what() << "\n";
328 |         return 1;
329 |     }
330 | 
331 |     // UTF-8 to UTF-16
332 |     time_function_call(codecvt_u8_to_u16, u8str, num_iterations, "codecvt u8 to u16");
333 |     time_function_call(cpputf8_u8_to_u16, u8str, num_iterations, "cpputf8 u8 to u16");
334 |     time_function_call(boost_u8_to_u16, u8str, num_iterations, "boost u8 to u16");
335 |     time_function_call(range_u8_to_u16, u8str, num_iterations, "range u8 to u16");
336 |     time_function_call(range_view_u8_to_u16, u8str, num_iterations, "range view u8 to u16");
337 |     std::cout << "\n";
338 | 
339 |     // UTF-8 to UTF-32
340 |     time_function_call(codecvt_u8_to_u32, u8str, num_iterations, "codecvt u8 to u32");
341 |     time_function_call(cpputf8_u8_to_u32, u8str, num_iterations, "cpputf8 u8 to u32");
342 |     time_function_call(boost_u8_to_u32, u8str, num_iterations, "boost u8 to u32");
343 |     time_function_call(range_u8_to_u32, u8str, num_iterations, "range u8 to u32");
344 |     time_function_call(range_view_u8_to_u32, u8str, num_iterations, "range view u8 to u32");
345 |     std::cout << "\n";
346 | 
347 |     // UTF-16 to UTF-8
348 |     time_function_call(codecvt_u16_to_u8, u16str, num_iterations, "codecvt u16 to u8");
349 |     time_function_call(cpputf8_u16_to_u8, u16str, num_iterations, "cpputf8 u16 to u8");
350 |     time_function_call(boost_u16_to_u8, u16str, num_iterations, "boost u16 to u8");
351 |     time_function_call(range_u16_to_u8, u16str, num_iterations, "range u16 to u8");
352 |     time_function_call(range_view_u16_to_u8, u16str, num_iterations, "range view u16 to u8");
353 |     std::cout << "\n";
354 | 
355 |     // UTF-16 to UTF-32
356 |     time_function_call(codecvt_u16_to_u32, u16str, num_iterations, "*codecvt u16 to u32");
357 |     time_function_call(cpputf8_u16_to_u32, u16str, num_iterations, "*cpputf8 u16 to u32");
358 |     time_function_call(boost_u16_to_u32, u16str, num_iterations, "boost u16 to u32");
359 |     time_function_call(range_u16_to_u32, u16str, num_iterations, "range u16 to u32");
360 |     time_function_call(range_view_u16_to_u32, u16str, num_iterations, "range view u16 to u32");
361 |     std::cout << "\n";
362 | 
363 |     // UTF-32 to UTF-8
364 |     time_function_call(codecvt_u32_to_u8, u32str, num_iterations, "codecvt u32 to u8");
365 |     time_function_call(cpputf8_u32_to_u8, u32str, num_iterations, "cpputf8 u32 to u8");
366 |     time_function_call(boost_u32_to_u8, u32str, num_iterations, "boost u32 to u8");
367 |     time_function_call(range_u32_to_u8, u32str, num_iterations, "range u32 to u8");
368 |     time_function_call(range_view_u32_to_u8, u32str, num_iterations, "range view u32 to u8");
369 |     std::cout << "\n";
370 | 
371 |     // UTF-32 to UTF-16
372 |     time_function_call(codecvt_u32_to_u16, u32str, num_iterations, "*codecvt u32 to u16");
373 |     time_function_call(cpputf8_u32_to_u16, u32str, num_iterations, "*cpputf8 u32 to u16");
374 |     time_function_call(boost_u32_to_u16, u32str, num_iterations, "boost u32 to u16");
375 |     time_function_call(range_u32_to_u16, u32str, num_iterations, "range u32 to u16");
376 |     time_function_call(range_view_u32_to_u16, u32str, num_iterations, "range view u32 to u16");
377 | }


--------------------------------------------------------------------------------
/benchmark/utf8.h:
--------------------------------------------------------------------------------
 1 | // Copyright 2006 Nemanja Trifunovic
 2 | 
 3 | /*
 4 | Permission is hereby granted, free of charge, to any person or organization
 5 | obtaining a copy of the software and accompanying documentation covered by
 6 | this license (the "Software") to use, reproduce, display, distribute,
 7 | execute, and transmit the Software, and to prepare derivative works of the
 8 | Software, and to permit third-parties to whom the Software is furnished to
 9 | do so, all subject to the following:
10 | 
11 | The copyright notices in the Software and this entire statement, including
12 | the above license grant, this restriction and the following disclaimer,
13 | must be included in all copies of the Software, in whole or in part, and
14 | all derivative works of the Software, unless such copies or derivative
15 | works are solely in the form of machine-executable object code generated by
16 | a source language processor.
17 | 
18 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 | FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
21 | SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
22 | FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
23 | ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
24 | DEALINGS IN THE SOFTWARE.
25 | */
26 | 
27 | 
28 | #ifndef UTF8_FOR_CPP_2675DCD0_9480_4c0c_B92A_CC14C027B731
29 | #define UTF8_FOR_CPP_2675DCD0_9480_4c0c_B92A_CC14C027B731
30 | 
31 | #include "utf8/checked.h"
32 | #include "utf8/unchecked.h"
33 | 
34 | #endif // header guard
35 | 


--------------------------------------------------------------------------------
/benchmark/utf8/checked.h:
--------------------------------------------------------------------------------
  1 | // Copyright 2006 Nemanja Trifunovic
  2 | 
  3 | /*
  4 | Permission is hereby granted, free of charge, to any person or organization
  5 | obtaining a copy of the software and accompanying documentation covered by
  6 | this license (the "Software") to use, reproduce, display, distribute,
  7 | execute, and transmit the Software, and to prepare derivative works of the
  8 | Software, and to permit third-parties to whom the Software is furnished to
  9 | do so, all subject to the following:
 10 | 
 11 | The copyright notices in the Software and this entire statement, including
 12 | the above license grant, this restriction and the following disclaimer,
 13 | must be included in all copies of the Software, in whole or in part, and
 14 | all derivative works of the Software, unless such copies or derivative
 15 | works are solely in the form of machine-executable object code generated by
 16 | a source language processor.
 17 | 
 18 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 19 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 20 | FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
 21 | SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
 22 | FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
 23 | ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 24 | DEALINGS IN THE SOFTWARE.
 25 | */
 26 | 
 27 | 
 28 | #ifndef UTF8_FOR_CPP_CHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731
 29 | #define UTF8_FOR_CPP_CHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731
 30 | 
 31 | #include "core.h"
 32 | #include <stdexcept>
 33 | 
 34 | namespace utf8
 35 | {
 36 |     // Root of the library's exception hierarchy; it derives from std::exception
 37 |     // so a single handler can catch every error type declared below.
 38 |     class exception : public ::std::exception {};
 39 | 
 40 |     // Raised for a value that is not a valid code point; keeps that value.
 41 |     class invalid_code_point : public exception {
 42 |         uint32_t rejected_cp;
 43 |     public:
 44 |         invalid_code_point(uint32_t value) : rejected_cp(value) {}
 45 |         virtual const char* what() const throw() { return "Invalid code point"; }
 46 |         uint32_t code_point() const { return rejected_cp; }
 47 |     };
 48 | 
 49 |     class invalid_utf8 : public exception {      // malformed UTF-8 input
 50 |         uint8_t bad_octet;                       // octet supplied at the throw site
 51 |     public:
 52 |         invalid_utf8 (uint8_t octet) : bad_octet(octet) {}
 53 |         virtual const char* what() const throw() { return "Invalid UTF-8"; }
 54 |         uint8_t utf8_octet() const { return bad_octet; }
 55 |     };
 56 | 
 57 |     class invalid_utf16 : public exception {     // malformed UTF-16 input
 58 |         uint16_t bad_word;                       // code unit supplied at the throw site
 59 |     public:
 60 |         invalid_utf16 (uint16_t word) : bad_word(word) {}
 61 |         virtual const char* what() const throw() { return "Invalid UTF-16"; }
 62 |         uint16_t utf16_word() const { return bad_word; }
 63 |     };
 64 | 
 65 |     class not_enough_room : public exception {   // carries no payload
 66 |     public:
 67 |         virtual const char* what() const throw() { return "Not enough space"; }
 68 |     };
 69 | 
 70 |     /// The library API - functions intended to be called by the users
 71 | 
 72 |     // Writes the UTF-8 encoding of cp (one to four octets) through result and
 73 |     // returns the advanced iterator; throws invalid_code_point for a bad cp.
 74 |     template <typename octet_iterator>
 75 |     octet_iterator append(uint32_t cp, octet_iterator result)
 76 |     {
 77 |         if (!utf8::internal::is_code_point_valid(cp))
 78 |             throw invalid_code_point(cp);
 79 |         const uint32_t trail_payload = 0x3f, trail_tag = 0x80;  // 6 data bits / 10xxxxxx
 80 |         if (cp < 0x80) {                                        // single octet
 81 |             *(result++) = static_cast<uint8_t>(cp);
 82 |         } else if (cp < 0x800) {                                // lead 110xxxxx + 1 trail
 83 |             *(result++) = static_cast<uint8_t>((cp >> 6) | 0xc0);
 84 |             *(result++) = static_cast<uint8_t>((cp & trail_payload) | trail_tag);
 85 |         } else if (cp < 0x10000) {                              // lead 1110xxxx + 2 trail
 86 |             *(result++) = static_cast<uint8_t>((cp >> 12) | 0xe0);
 87 |             *(result++) = static_cast<uint8_t>(((cp >> 6) & trail_payload) | trail_tag);
 88 |             *(result++) = static_cast<uint8_t>((cp & trail_payload) | trail_tag);
 89 |         } else {                                                // lead 11110xxx + 3 trail
 90 |             *(result++) = static_cast<uint8_t>((cp >> 18) | 0xf0);
 91 |             *(result++) = static_cast<uint8_t>(((cp >> 12) & trail_payload) | trail_tag);
 92 |             *(result++) = static_cast<uint8_t>(((cp >> 6) & trail_payload) | trail_tag);
 93 |             *(result++) = static_cast<uint8_t>((cp & trail_payload) | trail_tag);
 94 |         }
 95 |         return result;
 96 |     }
 97 | 
 98 |     // Copies [start, end) to out, writing the UTF-8 encoding of replacement in
 99 |     // place of each invalid sequence; throws not_enough_room on NOT_ENOUGH_ROOM.
100 |     template <typename octet_iterator, typename output_iterator>
101 |     output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out, uint32_t replacement)
102 |     {
103 |         while (start != end) {
104 |             octet_iterator seq_begin = start;
105 |             const internal::utf_error status = utf8::internal::validate_next(start, end);
106 |             if (status == internal::UTF8_OK) {
107 |                 // Well-formed sequence: pass its octets through untouched.
108 |                 for (; seq_begin != start; ++seq_begin)
109 |                     *out++ = *seq_begin;
110 |             }
111 |             else if (status == internal::NOT_ENOUGH_ROOM) {
112 |                 throw not_enough_room();
113 |             }
114 |             else if (status == internal::INVALID_LEAD
115 |                   || status == internal::INCOMPLETE_SEQUENCE
116 |                   || status == internal::OVERLONG_SEQUENCE
117 |                   || status == internal::INVALID_CODE_POINT) {
118 |                 out = utf8::append (replacement, out);
119 |                 ++start;
120 |                 // One mark per sequence: also skip trail octets (not for a bad lead).
121 |                 if (status != internal::INVALID_LEAD)
122 |                     while (start != end && utf8::internal::is_trail(*start))
123 |                         ++start;
124 |             }
125 |         }
126 |         return out;
127 |     }
128 | 
129 |     // Overload that supplies the default replacement marker, mask16(0xfffd).
130 |     template <typename octet_iterator, typename output_iterator>
131 |     inline output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out) {
132 |         const uint32_t default_marker = utf8::internal::mask16(0xfffd);
133 |         return utf8::replace_invalid(start, end, out, default_marker);
134 |     }
135 | 
136 |     template <typename octet_iterator>
137 |     uint32_t next(octet_iterator& it, octet_iterator end)
138 |     {
139 |         uint32_t cp = 0;
140 |         internal::utf_error err_code = utf8::internal::validate_next(it, end, cp);
141 |         switch (err_code) {
142 |             case internal::UTF8_OK :
143 |                 break;
144 |             case internal::NOT_ENOUGH_ROOM :
145 |                 throw not_enough_room();
146 |             case internal::INVALID_LEAD :
147 |             case internal::INCOMPLETE_SEQUENCE :
148 |             case internal::OVERLONG_SEQUENCE :
149 |                 throw invalid_utf8(*it);
150 |             case internal::INVALID_CODE_POINT :
151 |                 throw invalid_code_point(cp);
152 |         }
153 |         return cp;
154 |     }
155 | 
156 |     template <typename octet_iterator>
157 |     uint32_t peek_next(octet_iterator it, octet_iterator end)
158 |     {
159 |         return utf8::next(it, end);
160 |     }
161 | 
162 |     template <typename octet_iterator>
163 |     uint32_t prior(octet_iterator& it, octet_iterator start)
164 |     {
165 |         // can't do much if it == start
166 |         if (it == start)
167 |             throw not_enough_room();
168 | 
169 |         octet_iterator end = it;
170 |         // Go back until we hit either a lead octet or start
171 |         while (utf8::internal::is_trail(*(--it)))
172 |             if (it == start)
173 |                 throw invalid_utf8(*it); // error - no lead byte in the sequence
174 |         return utf8::peek_next(it, end);
175 |     }
176 | 
177 |     /// Deprecated in versions that include "prior"
178 |     template <typename octet_iterator>
179 |     uint32_t previous(octet_iterator& it, octet_iterator pass_start)
180 |     {
181 |         octet_iterator end = it;
182 |         while (utf8::internal::is_trail(*(--it)))
183 |             if (it == pass_start)
184 |                 throw invalid_utf8(*it); // error - no lead byte in the sequence
185 |         octet_iterator temp = it;
186 |         return utf8::next(temp, end);
187 |     }
188 | 
189 |     template <typename octet_iterator, typename distance_type>
190 |     void advance (octet_iterator& it, distance_type n, octet_iterator end)
191 |     {
192 |         for (distance_type i = 0; i < n; ++i)
193 |             utf8::next(it, end);
194 |     }
195 | 
196 |     template <typename octet_iterator>
197 |     typename std::iterator_traits<octet_iterator>::difference_type
198 |     distance (octet_iterator first, octet_iterator last)
199 |     {
200 |         typename std::iterator_traits<octet_iterator>::difference_type dist;
201 |         for (dist = 0; first < last; ++dist)
202 |             utf8::next(first, last);
203 |         return dist;
204 |     }
205 | 
206 |     template <typename u16bit_iterator, typename octet_iterator>
207 |     octet_iterator utf16to8 (u16bit_iterator start, u16bit_iterator end, octet_iterator result)
208 |     {
209 |         while (start != end) {
210 |             uint32_t cp = utf8::internal::mask16(*start++);
211 |             // Take care of surrogate pairs first
212 |             if (utf8::internal::is_lead_surrogate(cp)) {
213 |                 if (start != end) {
214 |                     uint32_t trail_surrogate = utf8::internal::mask16(*start++);
215 |                     if (utf8::internal::is_trail_surrogate(trail_surrogate))
216 |                         cp = (cp << 10) + trail_surrogate + internal::SURROGATE_OFFSET;
217 |                     else
218 |                         throw invalid_utf16(static_cast<uint16_t>(trail_surrogate));
219 |                 }
220 |                 else
221 |                     throw invalid_utf16(static_cast<uint16_t>(cp));
222 | 
223 |             }
224 |             // Lone trail surrogate
225 |             else if (utf8::internal::is_trail_surrogate(cp))
226 |                 throw invalid_utf16(static_cast<uint16_t>(cp));
227 | 
228 |             result = utf8::append(cp, result);
229 |         }
230 |         return result;
231 |     }
232 | 
233 |     template <typename u16bit_iterator, typename octet_iterator>
234 |     u16bit_iterator utf8to16 (octet_iterator start, octet_iterator end, u16bit_iterator result)
235 |     {
236 |         while (start < end) {
237 |             uint32_t cp = utf8::next(start, end);
238 |             if (cp > 0xffff) { //make a surrogate pair
239 |                 *result++ = static_cast<uint16_t>((cp >> 10)   + internal::LEAD_OFFSET);
240 |                 *result++ = static_cast<uint16_t>((cp & 0x3ff) + internal::TRAIL_SURROGATE_MIN);
241 |             }
242 |             else
243 |                 *result++ = static_cast<uint16_t>(cp);
244 |         }
245 |         return result;
246 |     }
247 | 
248 |     template <typename octet_iterator, typename u32bit_iterator>
249 |     octet_iterator utf32to8 (u32bit_iterator start, u32bit_iterator end, octet_iterator result)
250 |     {
251 |         while (start != end)
252 |             result = utf8::append(*(start++), result);
253 | 
254 |         return result;
255 |     }
256 | 
257 |     template <typename octet_iterator, typename u32bit_iterator>
258 |     u32bit_iterator utf8to32 (octet_iterator start, octet_iterator end, u32bit_iterator result)
259 |     {
260 |         while (start < end)
261 |             (*result++) = utf8::next(start, end);
262 | 
263 |         return result;
264 |     }
265 | 
266 |     // The iterator class
267 |     template <typename octet_iterator>
268 |     class iterator : public std::iterator <std::bidirectional_iterator_tag, uint32_t> {
269 |       octet_iterator it;
270 |       octet_iterator range_start;
271 |       octet_iterator range_end;
272 |       public:
273 |       iterator () {}
274 |       explicit iterator (const octet_iterator& octet_it,
275 |                          const octet_iterator& range_start,
276 |                          const octet_iterator& range_end) :
277 |                it(octet_it), range_start(range_start), range_end(range_end)
278 |       {
279 |           if (it < range_start || it > range_end)
280 |               throw std::out_of_range("Invalid utf-8 iterator position");
281 |       }
282 |       // the default "big three" are OK
283 |       octet_iterator base () const { return it; }
284 |       uint32_t operator * () const
285 |       {
286 |           octet_iterator temp = it;
287 |           return utf8::next(temp, range_end);
288 |       }
289 |       bool operator == (const iterator& rhs) const
290 |       {
291 |           if (range_start != rhs.range_start || range_end != rhs.range_end)
292 |               throw std::logic_error("Comparing utf-8 iterators defined with different ranges");
293 |           return (it == rhs.it);
294 |       }
295 |       bool operator != (const iterator& rhs) const
296 |       {
297 |           return !(operator == (rhs));
298 |       }
299 |       iterator& operator ++ ()
300 |       {
301 |           utf8::next(it, range_end);
302 |           return *this;
303 |       }
304 |       iterator operator ++ (int)
305 |       {
306 |           iterator temp = *this;
307 |           utf8::next(it, range_end);
308 |           return temp;
309 |       }
310 |       iterator& operator -- ()
311 |       {
312 |           utf8::prior(it, range_start);
313 |           return *this;
314 |       }
315 |       iterator operator -- (int)
316 |       {
317 |           iterator temp = *this;
318 |           utf8::prior(it, range_start);
319 |           return temp;
320 |       }
321 |     }; // class iterator
322 | 
323 | } // namespace utf8
324 | 
325 | #endif //header guard
326 | 
327 | 
328 | 


--------------------------------------------------------------------------------
/benchmark/utf8/core.h:
--------------------------------------------------------------------------------
  1 | // Copyright 2006 Nemanja Trifunovic
  2 | 
  3 | /*
  4 | Permission is hereby granted, free of charge, to any person or organization
  5 | obtaining a copy of the software and accompanying documentation covered by
  6 | this license (the "Software") to use, reproduce, display, distribute,
  7 | execute, and transmit the Software, and to prepare derivative works of the
  8 | Software, and to permit third-parties to whom the Software is furnished to
  9 | do so, all subject to the following:
 10 | 
 11 | The copyright notices in the Software and this entire statement, including
 12 | the above license grant, this restriction and the following disclaimer,
 13 | must be included in all copies of the Software, in whole or in part, and
 14 | all derivative works of the Software, unless such copies or derivative
 15 | works are solely in the form of machine-executable object code generated by
 16 | a source language processor.
 17 | 
 18 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 19 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 20 | FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
 21 | SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
 22 | FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
 23 | ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 24 | DEALINGS IN THE SOFTWARE.
 25 | */
 26 | 
 27 | 
 28 | #ifndef UTF8_FOR_CPP_CORE_H_2675DCD0_9480_4c0c_B92A_CC14C027B731
 29 | #define UTF8_FOR_CPP_CORE_H_2675DCD0_9480_4c0c_B92A_CC14C027B731
 30 | 
 31 | #include <iterator>
 32 | 
 33 | namespace utf8
 34 | {
 35 |     // The typedefs for 8-bit, 16-bit and 32-bit unsigned integers
 36 |     // You may need to change them to match your system.
 37 |     // These typedefs have the same names as ones from cstdint, or boost/cstdint
 38 |     typedef unsigned char   uint8_t;
 39 |     typedef unsigned short  uint16_t;
 40 |     typedef unsigned int    uint32_t;
 41 | 
 42 | // Helper code - not intended to be directly called by the library users. May be changed at any time
 43 | namespace internal
 44 | {
 45 |     // Unicode constants
 46 |     // Leading (high) surrogates: 0xd800 - 0xdbff
 47 |     // Trailing (low) surrogates: 0xdc00 - 0xdfff
 48 |     const uint16_t LEAD_SURROGATE_MIN  = 0xd800u;
 49 |     const uint16_t LEAD_SURROGATE_MAX  = 0xdbffu;
 50 |     const uint16_t TRAIL_SURROGATE_MIN = 0xdc00u;
 51 |     const uint16_t TRAIL_SURROGATE_MAX = 0xdfffu;
 52 |     const uint16_t LEAD_OFFSET         = LEAD_SURROGATE_MIN - (0x10000 >> 10);
 53 |     const uint32_t SURROGATE_OFFSET    = 0x10000u - (LEAD_SURROGATE_MIN << 10) - TRAIL_SURROGATE_MIN;
 54 | 
 55 |     // Maximum valid value for a Unicode code point
 56 |     const uint32_t CODE_POINT_MAX      = 0x0010ffffu;
 57 | 
 58 |     template<typename octet_type>
 59 |     inline uint8_t mask8(octet_type oc)
 60 |     {
 61 |         return static_cast<uint8_t>(0xff & oc);
 62 |     }
 63 |     template<typename u16_type>
 64 |     inline uint16_t mask16(u16_type oc)
 65 |     {
 66 |         return static_cast<uint16_t>(0xffff & oc);
 67 |     }
 68 |     template<typename octet_type>
 69 |     inline bool is_trail(octet_type oc)
 70 |     {
 71 |         return ((utf8::internal::mask8(oc) >> 6) == 0x2);
 72 |     }
 73 | 
 74 |     template <typename u16>
 75 |     inline bool is_lead_surrogate(u16 cp)
 76 |     {
 77 |         return (cp >= LEAD_SURROGATE_MIN && cp <= LEAD_SURROGATE_MAX);
 78 |     }
 79 | 
 80 |     template <typename u16>
 81 |     inline bool is_trail_surrogate(u16 cp)
 82 |     {
 83 |         return (cp >= TRAIL_SURROGATE_MIN && cp <= TRAIL_SURROGATE_MAX);
 84 |     }
 85 | 
 86 |     template <typename u16>
 87 |     inline bool is_surrogate(u16 cp)
 88 |     {
 89 |         return (cp >= LEAD_SURROGATE_MIN && cp <= TRAIL_SURROGATE_MAX);
 90 |     }
 91 | 
 92 |     template <typename u32>
 93 |     inline bool is_code_point_valid(u32 cp)
 94 |     {
 95 |         return (cp <= CODE_POINT_MAX && !utf8::internal::is_surrogate(cp));
 96 |     }
 97 | 
 98 |     template <typename octet_iterator>
 99 |     inline typename std::iterator_traits<octet_iterator>::difference_type
100 |     sequence_length(octet_iterator lead_it)
101 |     {
102 |         uint8_t lead = utf8::internal::mask8(*lead_it);
103 |         if (lead < 0x80)
104 |             return 1;
105 |         else if ((lead >> 5) == 0x6)
106 |             return 2;
107 |         else if ((lead >> 4) == 0xe)
108 |             return 3;
109 |         else if ((lead >> 3) == 0x1e)
110 |             return 4;
111 |         else
112 |             return 0;
113 |     }
114 | 
115 |     template <typename octet_difference_type>
116 |     inline bool is_overlong_sequence(uint32_t cp, octet_difference_type length)
117 |     {
118 |         if (cp < 0x80) {
119 |             if (length != 1) 
120 |                 return true;
121 |         }
122 |         else if (cp < 0x800) {
123 |             if (length != 2) 
124 |                 return true;
125 |         }
126 |         else if (cp < 0x10000) {
127 |             if (length != 3) 
128 |                 return true;
129 |         }
130 | 
131 |         return false;
132 |     }
133 | 
134 |     enum utf_error {UTF8_OK, NOT_ENOUGH_ROOM, INVALID_LEAD, INCOMPLETE_SEQUENCE, OVERLONG_SEQUENCE, INVALID_CODE_POINT};
135 | 
136 |     /// Helper for get_sequence_x
137 |     template <typename octet_iterator>
138 |     utf_error increase_safely(octet_iterator& it, octet_iterator end)
139 |     {
140 |         if (++it == end)
141 |             return NOT_ENOUGH_ROOM;
142 | 
143 |         if (!utf8::internal::is_trail(*it))
144 |             return INCOMPLETE_SEQUENCE;
145 |         
146 |         return UTF8_OK;
147 |     }
148 | 
149 |     #define UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(IT, END) {utf_error ret = increase_safely(IT, END); if (ret != UTF8_OK) return ret;}    
150 | 
151 |     /// get_sequence_x functions decode utf-8 sequences of the length x
152 |     template <typename octet_iterator>
153 |     utf_error get_sequence_1(octet_iterator& it, octet_iterator end, uint32_t& code_point)
154 |     {
155 |         if (it == end)
156 |             return NOT_ENOUGH_ROOM;
157 | 
158 |         code_point = utf8::internal::mask8(*it);
159 | 
160 |         return UTF8_OK;
161 |     }
162 | 
163 |     template <typename octet_iterator>
164 |     utf_error get_sequence_2(octet_iterator& it, octet_iterator end, uint32_t& code_point)
165 |     {
166 |         if (it == end) 
167 |             return NOT_ENOUGH_ROOM;
168 |         
169 |         code_point = utf8::internal::mask8(*it);
170 | 
171 |         UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end)
172 | 
173 |         code_point = ((code_point << 6) & 0x7ff) + ((*it) & 0x3f);
174 | 
175 |         return UTF8_OK;
176 |     }
177 | 
178 |     template <typename octet_iterator>
179 |     utf_error get_sequence_3(octet_iterator& it, octet_iterator end, uint32_t& code_point)
180 |     {
181 |         if (it == end)
182 |             return NOT_ENOUGH_ROOM;
183 |             
184 |         code_point = utf8::internal::mask8(*it);
185 | 
186 |         UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end)
187 | 
188 |         code_point = ((code_point << 12) & 0xffff) + ((utf8::internal::mask8(*it) << 6) & 0xfff);
189 | 
190 |         UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end)
191 | 
192 |         code_point += (*it) & 0x3f;
193 | 
194 |         return UTF8_OK;
195 |     }
196 | 
197 |     template <typename octet_iterator>
198 |     utf_error get_sequence_4(octet_iterator& it, octet_iterator end, uint32_t& code_point)
199 |     {
200 |         if (it == end)
201 |            return NOT_ENOUGH_ROOM;
202 | 
203 |         code_point = utf8::internal::mask8(*it);
204 | 
205 |         UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end)
206 | 
207 |         code_point = ((code_point << 18) & 0x1fffff) + ((utf8::internal::mask8(*it) << 12) & 0x3ffff);
208 | 
209 |         UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end)
210 | 
211 |         code_point += (utf8::internal::mask8(*it) << 6) & 0xfff;
212 | 
213 |         UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end)
214 | 
215 |         code_point += (*it) & 0x3f;
216 | 
217 |         return UTF8_OK;
218 |     }
219 | 
220 |     #undef UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR
221 | 
222 |     template <typename octet_iterator>
223 |     utf_error validate_next(octet_iterator& it, octet_iterator end, uint32_t& code_point)
224 |     {
225 |         // Save the original value of it so we can go back in case of failure
226 |         // Of course, it does not make much sense with i.e. stream iterators
227 |         octet_iterator original_it = it;
228 | 
229 |         uint32_t cp = 0;
230 |         // Determine the sequence length based on the lead octet
231 |         typedef typename std::iterator_traits<octet_iterator>::difference_type octet_difference_type;
232 |         const octet_difference_type length = utf8::internal::sequence_length(it);
233 | 
234 |         // Get trail octets and calculate the code point
235 |         utf_error err = UTF8_OK;
236 |         switch (length) {
237 |             case 0: 
238 |                 return INVALID_LEAD;
239 |             case 1:
240 |                 err = utf8::internal::get_sequence_1(it, end, cp);
241 |                 break;
242 |             case 2:
243 |                 err = utf8::internal::get_sequence_2(it, end, cp);
244 |             break;
245 |             case 3:
246 |                 err = utf8::internal::get_sequence_3(it, end, cp);
247 |             break;
248 |             case 4:
249 |                 err = utf8::internal::get_sequence_4(it, end, cp);
250 |             break;
251 |         }
252 | 
253 |         if (err == UTF8_OK) {
254 |             // Decoding succeeded. Now, security checks...
255 |             if (utf8::internal::is_code_point_valid(cp)) {
256 |                 if (!utf8::internal::is_overlong_sequence(cp, length)){
257 |                     // Passed! Return here.
258 |                     code_point = cp;
259 |                     ++it;
260 |                     return UTF8_OK;
261 |                 }
262 |                 else
263 |                     err = OVERLONG_SEQUENCE;
264 |             }
265 |             else 
266 |                 err = INVALID_CODE_POINT;
267 |         }
268 | 
269 |         // Failure branch - restore the original value of the iterator
270 |         it = original_it;
271 |         return err;
272 |     }
273 | 
274 |     template <typename octet_iterator>
275 |     inline utf_error validate_next(octet_iterator& it, octet_iterator end) {
276 |         uint32_t ignored;
277 |         return utf8::internal::validate_next(it, end, ignored);
278 |     }
279 | 
280 | } // namespace internal
281 | 
282 |     /// The library API - functions intended to be called by the users
283 | 
284 |     // Byte order mark
285 |     const uint8_t bom[] = {0xef, 0xbb, 0xbf};
286 | 
287 |     template <typename octet_iterator>
288 |     octet_iterator find_invalid(octet_iterator start, octet_iterator end)
289 |     {
290 |         octet_iterator result = start;
291 |         while (result != end) {
292 |             utf8::internal::utf_error err_code = utf8::internal::validate_next(result, end);
293 |             if (err_code != internal::UTF8_OK)
294 |                 return result;
295 |         }
296 |         return result;
297 |     }
298 | 
299 |     template <typename octet_iterator>
300 |     inline bool is_valid(octet_iterator start, octet_iterator end)
301 |     {
302 |         return (utf8::find_invalid(start, end) == end);
303 |     }
304 | 
305 |     template <typename octet_iterator>
306 |     inline bool starts_with_bom (octet_iterator it, octet_iterator end)
307 |     {
308 |         return (
309 |             ((it != end) && (utf8::internal::mask8(*it++)) == bom[0]) &&
310 |             ((it != end) && (utf8::internal::mask8(*it++)) == bom[1]) &&
311 |             ((it != end) && (utf8::internal::mask8(*it))   == bom[2])
312 |            );
313 |     }
314 | 	
315 |     //Deprecated in release 2.3 
316 |     template <typename octet_iterator>
317 |     inline bool is_bom (octet_iterator it)
318 |     {
319 |         return (
320 |             (utf8::internal::mask8(*it++)) == bom[0] &&
321 |             (utf8::internal::mask8(*it++)) == bom[1] &&
322 |             (utf8::internal::mask8(*it))   == bom[2]
323 |            );
324 |     }
325 | } // namespace utf8
326 | 
327 | #endif // header guard
328 | 
329 | 
330 | 


--------------------------------------------------------------------------------
/benchmark/utf8/unchecked.h:
--------------------------------------------------------------------------------
  1 | // Copyright 2006 Nemanja Trifunovic
  2 | 
  3 | /*
  4 | Permission is hereby granted, free of charge, to any person or organization
  5 | obtaining a copy of the software and accompanying documentation covered by
  6 | this license (the "Software") to use, reproduce, display, distribute,
  7 | execute, and transmit the Software, and to prepare derivative works of the
  8 | Software, and to permit third-parties to whom the Software is furnished to
  9 | do so, all subject to the following:
 10 | 
 11 | The copyright notices in the Software and this entire statement, including
 12 | the above license grant, this restriction and the following disclaimer,
 13 | must be included in all copies of the Software, in whole or in part, and
 14 | all derivative works of the Software, unless such copies or derivative
 15 | works are solely in the form of machine-executable object code generated by
 16 | a source language processor.
 17 | 
 18 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 19 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 20 | FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
 21 | SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
 22 | FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
 23 | ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 24 | DEALINGS IN THE SOFTWARE.
 25 | */
 26 | 
 27 | 
 28 | #ifndef UTF8_FOR_CPP_UNCHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731
 29 | #define UTF8_FOR_CPP_UNCHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731
 30 | 
 31 | #include "core.h"
 32 | 
 33 | namespace utf8
 34 | {
 35 |     namespace unchecked 
 36 |     {
 37 |         template <typename octet_iterator>
 38 |         octet_iterator append(uint32_t cp, octet_iterator result)
 39 |         {
 40 |             if (cp < 0x80)                        // one octet
 41 |                 *(result++) = static_cast<uint8_t>(cp);  
 42 |             else if (cp < 0x800) {                // two octets
 43 |                 *(result++) = static_cast<uint8_t>((cp >> 6)          | 0xc0);
 44 |                 *(result++) = static_cast<uint8_t>((cp & 0x3f)        | 0x80);
 45 |             }
 46 |             else if (cp < 0x10000) {              // three octets
 47 |                 *(result++) = static_cast<uint8_t>((cp >> 12)         | 0xe0);
 48 |                 *(result++) = static_cast<uint8_t>(((cp >> 6) & 0x3f) | 0x80);
 49 |                 *(result++) = static_cast<uint8_t>((cp & 0x3f)        | 0x80);
 50 |             }
 51 |             else {                                // four octets
 52 |                 *(result++) = static_cast<uint8_t>((cp >> 18)         | 0xf0);
 53 |                 *(result++) = static_cast<uint8_t>(((cp >> 12) & 0x3f)| 0x80);
 54 |                 *(result++) = static_cast<uint8_t>(((cp >> 6) & 0x3f) | 0x80);
 55 |                 *(result++) = static_cast<uint8_t>((cp & 0x3f)        | 0x80);
 56 |             }
 57 |             return result;
 58 |         }
 59 | 
 60 |         template <typename octet_iterator>
 61 |         uint32_t next(octet_iterator& it)
 62 |         {
 63 |             uint32_t cp = utf8::internal::mask8(*it);
 64 |             typename std::iterator_traits<octet_iterator>::difference_type length = utf8::internal::sequence_length(it);
 65 |             switch (length) {
 66 |                 case 1:
 67 |                     break;
 68 |                 case 2:
 69 |                     it++;
 70 |                     cp = ((cp << 6) & 0x7ff) + ((*it) & 0x3f);
 71 |                     break;
 72 |                 case 3:
 73 |                     ++it; 
 74 |                     cp = ((cp << 12) & 0xffff) + ((utf8::internal::mask8(*it) << 6) & 0xfff);
 75 |                     ++it;
 76 |                     cp += (*it) & 0x3f;
 77 |                     break;
 78 |                 case 4:
 79 |                     ++it;
 80 |                     cp = ((cp << 18) & 0x1fffff) + ((utf8::internal::mask8(*it) << 12) & 0x3ffff);                
 81 |                     ++it;
 82 |                     cp += (utf8::internal::mask8(*it) << 6) & 0xfff;
 83 |                     ++it;
 84 |                     cp += (*it) & 0x3f; 
 85 |                     break;
 86 |             }
 87 |             ++it;
 88 |             return cp;        
 89 |         }
 90 | 
 91 |         template <typename octet_iterator>
 92 |         uint32_t peek_next(octet_iterator it)
 93 |         {
 94 |             return utf8::unchecked::next(it);    
 95 |         }
 96 | 
 97 |         template <typename octet_iterator>
 98 |         uint32_t prior(octet_iterator& it)
 99 |         {
100 |             while (utf8::internal::is_trail(*(--it))) ;
101 |             octet_iterator temp = it;
102 |             return utf8::unchecked::next(temp);
103 |         }
104 | 
105 |         // Deprecated in versions that include prior, but only for the sake of consistency (see utf8::previous)
106 |         template <typename octet_iterator>
107 |         inline uint32_t previous(octet_iterator& it)
108 |         {
109 |             return utf8::unchecked::prior(it);
110 |         }
111 | 
112 |         template <typename octet_iterator, typename distance_type>
113 |         void advance (octet_iterator& it, distance_type n)
114 |         {
115 |             for (distance_type i = 0; i < n; ++i)
116 |                 utf8::unchecked::next(it);
117 |         }
118 | 
119 |         template <typename octet_iterator>
120 |         typename std::iterator_traits<octet_iterator>::difference_type
121 |         distance (octet_iterator first, octet_iterator last)
122 |         {
123 |             typename std::iterator_traits<octet_iterator>::difference_type dist;
124 |             for (dist = 0; first < last; ++dist) 
125 |                 utf8::unchecked::next(first);
126 |             return dist;
127 |         }
128 | 
129 |         template <typename u16bit_iterator, typename octet_iterator>
130 |         octet_iterator utf16to8 (u16bit_iterator start, u16bit_iterator end, octet_iterator result)
131 |         {       
132 |             while (start != end) {
133 |                 uint32_t cp = utf8::internal::mask16(*start++);
134 |             // Take care of surrogate pairs first
135 |                 if (utf8::internal::is_lead_surrogate(cp)) {
136 |                     uint32_t trail_surrogate = utf8::internal::mask16(*start++);
137 |                     cp = (cp << 10) + trail_surrogate + internal::SURROGATE_OFFSET;
138 |                 }
139 |                 result = utf8::unchecked::append(cp, result);
140 |             }
141 |             return result;         
142 |         }
143 | 
144 |         template <typename u16bit_iterator, typename octet_iterator>
145 |         u16bit_iterator utf8to16 (octet_iterator start, octet_iterator end, u16bit_iterator result)
146 |         {
147 |             while (start < end) {
148 |                 uint32_t cp = utf8::unchecked::next(start);
149 |                 if (cp > 0xffff) { //make a surrogate pair
150 |                     *result++ = static_cast<uint16_t>((cp >> 10)   + internal::LEAD_OFFSET);
151 |                     *result++ = static_cast<uint16_t>((cp & 0x3ff) + internal::TRAIL_SURROGATE_MIN);
152 |                 }
153 |                 else
154 |                     *result++ = static_cast<uint16_t>(cp);
155 |             }
156 |             return result;
157 |         }
158 | 
159 |         template <typename octet_iterator, typename u32bit_iterator>
160 |         octet_iterator utf32to8 (u32bit_iterator start, u32bit_iterator end, octet_iterator result)
161 |         {
162 |             while (start != end)
163 |                 result = utf8::unchecked::append(*(start++), result);
164 | 
165 |             return result;
166 |         }
167 | 
168 |         template <typename octet_iterator, typename u32bit_iterator>
169 |         u32bit_iterator utf8to32 (octet_iterator start, octet_iterator end, u32bit_iterator result)
170 |         {
171 |             while (start < end)
172 |                 (*result++) = utf8::unchecked::next(start);
173 | 
174 |             return result;
175 |         }
176 | 
177 |         // The iterator class
178 |         template <typename octet_iterator>
179 |           class iterator : public std::iterator <std::bidirectional_iterator_tag, uint32_t> { 
180 |             octet_iterator it;
181 |             public:
182 |             iterator () {}
183 |             explicit iterator (const octet_iterator& octet_it): it(octet_it) {}
184 |             // the default "big three" are OK
185 |             octet_iterator base () const { return it; }
186 |             uint32_t operator * () const
187 |             {
188 |                 octet_iterator temp = it;
189 |                 return utf8::unchecked::next(temp);
190 |             }
191 |             bool operator == (const iterator& rhs) const 
192 |             { 
193 |                 return (it == rhs.it);
194 |             }
195 |             bool operator != (const iterator& rhs) const
196 |             {
197 |                 return !(operator == (rhs));
198 |             }
199 |             iterator& operator ++ () 
200 |             {
201 |                 ::std::advance(it, utf8::internal::sequence_length(it));
202 |                 return *this;
203 |             }
204 |             iterator operator ++ (int)
205 |             {
206 |                 iterator temp = *this;
207 |                 ::std::advance(it, utf8::internal::sequence_length(it));
208 |                 return temp;
209 |             }  
210 |             iterator& operator -- ()
211 |             {
212 |                 utf8::unchecked::prior(it);
213 |                 return *this;
214 |             }
215 |             iterator operator -- (int)
216 |             {
217 |                 iterator temp = *this;
218 |                 utf8::unchecked::prior(it);
219 |                 return temp;
220 |             }
221 |           }; // class iterator
222 | 
223 |     } // namespace utf8::unchecked
224 | } // namespace utf8 
225 | 
226 | 
227 | #endif // header guard
228 | 
229 | 


--------------------------------------------------------------------------------
/example/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | 
# Example program: converts a UTF-8 file to big-endian UTF-16.
add_executable(utf8_to_utf16be
        utf8_to_utf16be.cpp)

# NOTE(review): RANGE_INCLUDE_DIR and Boost_INCLUDE_DIR are not set in this
# file; presumably they come from an enclosing CMakeLists.txt - confirm.
target_include_directories(utf8_to_utf16be
        PRIVATE
            ${RANGE_INCLUDE_DIR}
            ${Boost_INCLUDE_DIR})


--------------------------------------------------------------------------------
/example/utf8_to_utf16be.cpp:
--------------------------------------------------------------------------------
 1 | 
 2 | // Copyright (c) 2016 Tristan Brindle (tcbrindle at gmail dot com)
 3 | // Distributed under the Boost Software License, Version 1.0. (See accompanying
 4 | // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
 5 | 
 6 | #include <iostream>
 7 | #include <fstream>
 8 | 
 9 | #include <tcb/utf_ranges/view.hpp>
10 | #include <tcb/utf_ranges/istreambuf_range.hpp>
11 | #include <tcb/utf_ranges/ostreambuf_iterator.hpp>
12 | 
13 | namespace rng = ::ranges::v3;
14 | namespace utf = ::tcb::utf_ranges;
15 | 
16 | // Converts the UTF-8 file named by argv[1] to big-endian UTF-16 in argv[2].
17 | // Returns 1 on a usage error or when either file cannot be opened.
18 | int main(int argc, char** argv)
19 | {
20 |     if (argc < 3) {
21 |         std::cout << "Usage:\n"
22 |                   << "utf8_to_utf16be INFILE OUTFILE\n"
23 |                   << "\n"
24 |                   << "Converts a UTF-8 encoded file to big-endian UTF-16.\n";
25 |         return 1;
26 |     }
27 | 
28 |     // Check each stream right after opening it: an unopened input would
29 |     // otherwise just look like an empty file and produce a BOM-only output.
30 |     std::ifstream in_file{argv[1], std::ios::binary};
31 |     if (!in_file) {
32 |         std::cerr << "utf8_to_utf16be: cannot open input file '" << argv[1] << "'\n";
33 |         return 1;
34 |     }
35 | 
36 |     std::ofstream out_file{argv[2], std::ios::binary};
37 |     if (!out_file) {
38 |         std::cerr << "utf8_to_utf16be: cannot open output file '" << argv[2] << "'\n";
39 |         return 1;
40 |     }
41 | 
42 |     auto view = utf::istreambuf(in_file) // Read range from input stream
43 |             | utf::view::consume_bom     // Remove UTF-8 "BOM" if present
44 |             | utf::view::utf16           // Convert to UTF-16
45 |             | utf::view::add_bom         // Prepend UTF-16 BOM to start of range
46 |             | utf::view::endian_convert<boost::endian::order::big> // Convert to big-endian
47 |             | utf::view::bytes;          // Write to disk as bytes
48 | 
49 |     rng::copy(view, utf::ostreambuf_iterator<char>{out_file});
50 | }


--------------------------------------------------------------------------------
/include/tcb/utf_ranges/convert.hpp:
--------------------------------------------------------------------------------
 1 | 
 2 | // Copyright (c) 2016 Tristan Brindle (tcbrindle at gmail dot com)
 3 | // Distributed under the Boost Software License, Version 1.0. (See accompanying
 4 | // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
 5 | 
 6 | #ifndef TCB_UTF_RANGES_CONVERT_HPP_INCLUDED
 7 | #define TCB_UTF_RANGES_CONVERT_HPP_INCLUDED
 8 | 
 9 | #include <tcb/utf_ranges/detail/utf.hpp>
10 | 
11 | #include <range/v3/range_fwd.hpp>
12 | 
13 | #include <string>
14 | 
15 | namespace tcb {
16 | namespace utf_ranges {
17 | 
18 | namespace rng = ::ranges::v3;
19 | /// Decodes [first, last) as InCharT code units and re-encodes each code point as OutCharT through `out`; returns the advanced output iterator.
20 | template <typename OutCharT,
21 |           typename InIter, typename Sentinel,
22 |           typename OutIter,
23 |           typename InCharT = typename std::iterator_traits<InIter>::value_type>
24 | OutIter utf_convert(InIter first, Sentinel last, OutIter out)
25 | {
26 |     while (first != last) {
27 |         const char32_t c = detail::utf_traits<InCharT>::decode(first, last); // advances `first`; the result is not checked against detail::illegal/incomplete
28 |         out = detail::utf_traits<OutCharT>::encode(c, out); // encode() takes `out` by value: keep the iterator it returns
29 |     }
30 |     return out;
31 | }
32 | /// Range overload: forwards begin/end of `range` to the iterator overload and returns the advanced `out`.
33 | template <typename OutCharT,
34 |           typename InRange,
35 |           typename OutIter,
36 |           typename InCharT = rng::range_value_t<InRange>,
37 |           CONCEPT_REQUIRES_(rng::ForwardRange<InRange>())>
38 | OutIter utf_convert(InRange&& range, OutIter out)
39 | {
40 |     using iter_t = rng::range_iterator_t<InRange>;
41 |     using sent_t = rng::range_sentinel_t<InRange>;
42 |     return utf_convert<OutCharT, iter_t, sent_t, OutIter, InCharT>(rng::begin(range), rng::end(range), std::move(out));
43 | }
44 | /// Builds a std::basic_string<OutCharT> by running utf_convert<OutCharT> over `range` into a back_inserter.
45 | template <typename Range, typename OutCharT,
46 |           typename InCharT = rng::range_value_t<Range>>
47 | std::basic_string<OutCharT>
48 | to_utf_string(Range&& range)
49 | {
50 |     using string_type = std::basic_string<OutCharT>;
51 | 
52 |     string_type output;
53 |     // Try to minimise the number of reallocations
54 |     // NOTE(review): this is a plain `if` (constexpr is commented out), so ::ranges::size(range) is instantiated for every Range -- confirm that unsized ranges still compile.
55 |     if /*constexpr*/ (::ranges::RandomAccessRange<Range>()) {
56 |         output.reserve(::ranges::size(range));
57 |     }
58 | 
59 |     utf_convert<OutCharT>(std::forward<Range>(range), std::back_inserter(output));
60 | 
61 |     return output;
62 | }
63 | /// Returns `range` re-encoded into a std::string (1-byte code units).
64 | template <typename Range>
65 | std::string to_u8string(Range&& range)
66 | {
67 |     return to_utf_string<Range, std::string::value_type>(static_cast<Range&&>(range));
68 | }
69 | /// Returns `range` re-encoded into a std::u16string (2-byte code units).
70 | template <typename Range>
71 | std::u16string to_u16string(Range&& range)
72 | {
73 |     return to_utf_string<Range, std::u16string::value_type>(static_cast<Range&&>(range));
74 | }
75 | /// Returns `range` re-encoded into a std::u32string (4-byte code units).
76 | template <typename Range>
77 | std::u32string to_u32string(Range&& range)
78 | {
79 |     return to_utf_string<Range, std::u32string::value_type>(static_cast<Range&&>(range));
80 | }
81 | /// Returns `range` re-encoded into a std::wstring; the encoding follows sizeof(wchar_t).
82 | template <typename Range>
83 | std::wstring to_wstring(Range&& range)
84 | { return to_utf_string<Range, wchar_t>(std::forward<Range>(range)); }
85 | /// Original, misspelled name of to_wstring(); kept so that existing callers still compile.
86 | template <typename Range>
87 | std::wstring to_wstsring(Range&& range) { return to_utf_string<Range, wchar_t>(std::forward<Range>(range)); }
88 | } // end namespace utf_ranges
89 | } // end namespace tcb
90 | 
91 | #endif // TCB_UTF_RANGES_CONVERT_HPP_INCLUDED
92 | 


--------------------------------------------------------------------------------
/include/tcb/utf_ranges/detail/utf.hpp:
--------------------------------------------------------------------------------
  1 | //
  2 | //  This file is based on utf.hpp from Boost.Locale
  3 | //  Copyright (c) 2009-2011 Artyom Beilis (Tonkikh)
  4 | //
  5 | //  Modifications (c) 2016 Tristan Brindle
  6 | //
  7 | //  Distributed under the Boost Software License, Version 1.0. (See
  8 | //  accompanying file LICENSE_1_0.txt or copy at
  9 | //  http://www.boost.org/LICENSE_1_0.txt)
 10 | //
 11 | #ifndef TCB_UTF_RANGES_DETAIL_UTF_HPP_INCLUDED
 12 | #define TCB_UTF_RANGES_DETAIL_UTF_HPP_INCLUDED
 13 | 
 14 | #include <array>
 15 | #include <cstdint>
 16 | namespace tcb {
 17 | namespace utf_ranges {
 18 | namespace detail {
 19 | 
 20 | /// \cond INTERNAL
 21 | #ifdef __GNUC__
 22 | #   define BOOST_LOCALE_LIKELY(x)   __builtin_expect((x),1)
 23 | #   define BOOST_LOCALE_UNLIKELY(x) __builtin_expect((x),0)
 24 | #else
 25 | #   define BOOST_LOCALE_LIKELY(x)   (x)
 26 | #   define BOOST_LOCALE_UNLIKELY(x) (x)
 27 | #endif
 28 | /// \endcond
 29 | 
 30 | ///
 31 | /// \brief The integral type that can hold a Unicode code point
 32 | ///
 33 | typedef char32_t code_point;
 34 | 
 35 | ///
 36 | /// \brief Special constant that defines illegal code point
 37 | ///
 38 | static constexpr code_point illegal = 0xFFFFFFFFu;
 39 | 
 40 | ///
 41 | /// \brief Special constant that defines incomplete code point
 42 | ///
 43 | static constexpr code_point incomplete = 0xFFFFFFFEu;
 44 | 
 45 | ///
 46 | /// \brief the function checks if \a v is a valid code point
 47 | ///
 48 | inline constexpr bool is_valid_codepoint(code_point v)
 49 | {
 50 |     // Accept anything up to 0x10FFFF ...
 51 |     // ... except the surrogate block 0xD800..0xDFFF.
 52 |     const bool in_range = v <= 0x10FFFF;
 53 |     const bool is_surrogate = v >= 0xD800 && v <= 0xDFFF;
 54 |     return in_range && !is_surrogate;
 55 | }
 56 | 
 57 | template <typename CharType, int size = sizeof(CharType)>
 58 | struct utf_traits;
 59 | /// Small value type holding the code units of one encoded code point (capacity 4 / sizeof(CharType)).
 60 | template <typename CharType>
 61 | struct encoded_chars {
 62 | public:
 63 |     constexpr encoded_chars() = default;
 64 |     // Each constructor below stores its arguments in order and records how many were given.
 65 |     constexpr encoded_chars(CharType _1)
 66 |             : chars_{{_1}}, size_{1} {}
 67 | 
 68 |     constexpr encoded_chars(CharType _1, CharType _2)
 69 |             : chars_{{_1, _2}}, size_{2} {}
 70 | 
 71 |     constexpr encoded_chars(CharType _1, CharType _2, CharType _3)
 72 |             : chars_{{_1, _2, _3}}, size_{3} {}
 73 | 
 74 |     constexpr encoded_chars(CharType _1, CharType _2, CharType _3, CharType _4)
 75 |             : chars_{{_1, _2, _3, _4}}, size_{4} {}
 76 |     // Number of code units stored.
 77 |     constexpr int size() const noexcept { return size_; }
 78 |     // Code unit at index i; the index is not range-checked.
 79 |     constexpr CharType operator[](int i) const noexcept { return chars_[i]; }
 80 |     // Equal when both hold the same number of units and the units match. A plain loop is used because std::equal is not constexpr before C++20.
 81 |     friend constexpr bool operator==(const encoded_chars& lhs,
 82 |                                      const encoded_chars& rhs)
 83 |     {
 84 |         if (lhs.size_ != rhs.size_) return false;
 85 |         for (int i = 0; i < lhs.size_; ++i) {
 86 |             if (lhs.chars_[i] != rhs.chars_[i]) return false;
 87 |         }
 88 |         return true;
 89 |     }
 90 | private:
 91 |     std::array<CharType, 4 / sizeof(CharType)> chars_{{}};
 92 |     int size_ = 0;
 93 | };
 94 | /// UTF-8 traits: the specialization chosen when sizeof(CharType) == 1.
 95 | template <typename CharType>
 96 | struct utf_traits<CharType, 1> {
 97 | 
 98 |     typedef CharType char_type;
 99 |     // Number of trail bytes announced by lead byte ci (0..3), or -1 if ci cannot start a sequence.
100 |     static constexpr int trail_length(char_type ci)
101 |     {
102 |         unsigned char c = ci;
103 |         if (c < 128)
104 |             return 0;
105 |         if (BOOST_LOCALE_UNLIKELY(c < 194))
106 |             return -1;
107 |         if (c < 224)
108 |             return 1;
109 |         if (c < 240)
110 |             return 2;
111 |         if (BOOST_LOCALE_LIKELY(c <= 244))
112 |             return 3;
113 |         return -1;
114 |     }
115 |     // Largest number of code units width() can report.
116 |     static constexpr int max_width = 4;
117 |     // Number of code units encode() writes for value.
118 |     static constexpr int width(code_point value)
119 |     {
120 |         if (value <= 0x7F) {
121 |             return 1;
122 |         }
123 |         else if (value <= 0x7FF) {
124 |             return 2;
125 |         }
126 |         else if (BOOST_LOCALE_LIKELY(value <= 0xFFFF)) {
127 |             return 3;
128 |         }
129 |         else {
130 |             return 4;
131 |         }
132 |     }
133 |     // True when ci has the bit pattern 10xxxxxx.
134 |     static constexpr bool is_trail(char_type ci)
135 |     {
136 |         unsigned char c = ci;
137 |         return (c & 0xC0) == 0x80;
138 |     }
139 |     // Simply the negation of is_trail(); the byte is not validated as a lead byte.
140 |     static constexpr bool is_lead(char_type ci)
141 |     {
142 |         return !is_trail(ci);
143 |     }
144 |     // Validating decode: reads one sequence from p (advancing it); returns the code point, `incomplete` if e is reached first, or `illegal` on malformed input.
145 |     template <typename Iterator, typename Sentinel>
146 |     static constexpr code_point decode(Iterator& p, Sentinel e)
147 |     {
148 |         if (BOOST_LOCALE_UNLIKELY(p == e))
149 |             return incomplete;
150 | 
151 |         unsigned char lead = *p++;
152 | 
153 |         // First byte is fully validated here
154 |         int trail_size = trail_length(lead);
155 | 
156 |         if (BOOST_LOCALE_UNLIKELY(trail_size < 0))
157 |             return illegal;
158 | 
159 |         //
160 |         // Ok as only ASCII may be of size = 0
161 |         // also optimize for ASCII text
162 |         //
163 |         if (trail_size == 0)
164 |             return lead;
165 |         // Keep only the payload bits of the lead byte: 5, 4 or 3 bits for 1, 2 or 3 trail bytes.
166 |         code_point c = lead & ((1 << (6 - trail_size)) - 1);
167 | 
168 |         // Read the rest
169 |         unsigned char tmp{};
170 |         switch (trail_size) {
171 |         case 3:
172 |             if (BOOST_LOCALE_UNLIKELY(p == e))
173 |                 return incomplete;
174 |             tmp = *p++;
175 |             if (!is_trail(tmp))
176 |                 return illegal;
177 |             c = (c << 6) | (tmp & 0x3F); // fall through
178 |         case 2:
179 |             if (BOOST_LOCALE_UNLIKELY(p == e))
180 |                 return incomplete;
181 |             tmp = *p++;
182 |             if (!is_trail(tmp))
183 |                 return illegal;
184 |             c = (c << 6) | (tmp & 0x3F); // fall through
185 |         case 1:
186 |             if (BOOST_LOCALE_UNLIKELY(p == e))
187 |                 return incomplete;
188 |             tmp = *p++;
189 |             if (!is_trail(tmp))
190 |                 return illegal;
191 |             c = (c << 6) | (tmp & 0x3F);
192 |         }
193 | 
194 |         // Check code point validity: no surrogates and
195 |         // valid range
196 |         if (BOOST_LOCALE_UNLIKELY(!is_valid_codepoint(c)))
197 |             return illegal;
198 | 
199 |         // make sure it is the most compact representation
200 |         if (BOOST_LOCALE_UNLIKELY(width(c) != trail_size + 1))
201 |             return illegal;
202 | 
203 |         return c;
204 | 
205 |     }
206 |     // Non-validating decode: performs no end-of-input or well-formedness checks.
207 |     template <typename Iterator>
208 |     static constexpr code_point decode_valid(Iterator& p)
209 |     {
210 |         unsigned char lead = *p++;
211 |         if (lead < 192)
212 |             return lead;
213 | 
214 |         int trail_size = 0;
215 | 
216 |         if (lead < 224)
217 |             trail_size = 1;
218 |         else if (BOOST_LOCALE_LIKELY(lead < 240)) // non-BMP rare
219 |             trail_size = 2;
220 |         else
221 |             trail_size = 3;
222 | 
223 |         code_point c = lead & ((1 << (6 - trail_size)) - 1);
224 | 
225 |         switch (trail_size) {
226 |         case 3:
227 |             c = (c << 6) | (static_cast<unsigned char>(*p++) & 0x3F); // fall through
228 |         case 2:
229 |             c = (c << 6) | (static_cast<unsigned char>(*p++) & 0x3F); // fall through
230 |         case 1:
231 |             c = (c << 6) | (static_cast<unsigned char>(*p++) & 0x3F);
232 |         }
233 | 
234 |         return c;
235 |     }
236 |     // Writes value as 1 to 4 code units through out and returns the advanced iterator; value itself is not validated.
237 |     template <typename Iterator>
238 |     static constexpr Iterator encode(code_point value, Iterator out)
239 |     {
240 |         if (value <= 0x7F) {
241 |             *out++ = static_cast<char_type>(value);
242 |         }
243 |         else if (value <= 0x7FF) {
244 |             *out++ = static_cast<char_type>((value >> 6) | 0xC0);
245 |             *out++ = static_cast<char_type>((value & 0x3F) | 0x80);
246 |         }
247 |         else if (BOOST_LOCALE_LIKELY(value <= 0xFFFF)) {
248 |             *out++ = static_cast<char_type>((value >> 12) | 0xE0);
249 |             *out++ = static_cast<char_type>(((value >> 6) & 0x3F) | 0x80);
250 |             *out++ = static_cast<char_type>((value & 0x3F) | 0x80);
251 |         }
252 |         else {
253 |             *out++ = static_cast<char_type>((value >> 18) | 0xF0);
254 |             *out++ = static_cast<char_type>(((value >> 12) & 0x3F) | 0x80);
255 |             *out++ = static_cast<char_type>(((value >> 6) & 0x3F) | 0x80);
256 |             *out++ = static_cast<char_type>((value & 0x3F) | 0x80);
257 |         }
258 |         return out;
259 |     }
260 |     // Same encoding as above, returned by value as an encoded_chars instead of written through an iterator.
261 |     static constexpr encoded_chars<CharType> encode(code_point value)
262 |     {
263 |         if (value <= 0x7F) {
264 |             return {static_cast<char_type>(value)};
265 |         }
266 |         else if (value <= 0x7FF) {
267 |             return {static_cast<char_type>((value >> 6) | 0xC0),
268 |                     static_cast<char_type>((value & 0x3F) | 0x80)};
269 |         }
270 |         else if (BOOST_LOCALE_LIKELY(value <= 0xFFFF)) {
271 |             return {static_cast<char_type>((value >> 12) | 0xE0),
272 |                     static_cast<char_type>(((value >> 6) & 0x3F) | 0x80),
273 |                     static_cast<char_type>((value & 0x3F) | 0x80)};
274 |         }
275 |         else {
276 |             return {static_cast<char_type>((value >> 18) | 0xF0),
277 |                     static_cast<char_type>(((value >> 12) & 0x3F) | 0x80),
278 |                     static_cast<char_type>(((value >> 6) & 0x3F) | 0x80),
279 |                     static_cast<char_type>((value & 0x3F) | 0x80)};
280 |         }
281 |     }
282 | 
283 | }; // utf8
284 | /// UTF-16 traits: the specialization chosen when sizeof(CharType) == 2.
285 | template <typename CharType>
286 | struct utf_traits<CharType, 2> {
287 |     typedef CharType char_type;
288 | 
289 |     // See RFC 2781
290 |     static constexpr bool is_first_surrogate(uint16_t x)
291 |     {
292 |         return 0xD800 <= x && x <= 0xDBFF;
293 |     }
294 | 
295 |     static constexpr bool is_second_surrogate(uint16_t x)
296 |     {
297 |         return 0xDC00 <= x && x <= 0xDFFF;
298 |     }
299 |     // Combines the low 10 bits of each surrogate and adds 0x10000; the inputs are not validated.
300 |     static constexpr code_point combine_surrogate(uint16_t w1, uint16_t w2)
301 |     {
302 |         return ((code_point(w1 & 0x3FF) << 10) | (w2 & 0x3FF)) + 0x10000;
303 |     }
304 |     // 1 if c is a first surrogate (one more unit follows), -1 if c is a second surrogate, 0 otherwise.
305 |     static constexpr int trail_length(char_type c)
306 |     {
307 |         if (is_first_surrogate(c))
308 |             return 1;
309 |         if (is_second_surrogate(c))
310 |             return -1;
311 |         return 0;
312 |     }
313 | 
314 |     ///
315 |     /// Returns true if c is a trail code unit, i.e. a second surrogate (0xDC00..0xDFFF)
316 |     ///
317 |     static constexpr bool is_trail(char_type c)
318 |     {
319 |         return is_second_surrogate(c);
320 |     }
321 | 
322 |     ///
323 |     /// Returns true if c is not a second surrogate, i.e. it can start a code point
324 |     ///
325 |     static constexpr bool is_lead(char_type c)
326 |     {
327 |         return !is_second_surrogate(c);
328 |     }
329 |     // Validating decode: advances current; returns the code point, `incomplete` if last is reached first, or `illegal` for a misplaced surrogate.
330 |     template <typename It, typename S>
331 |     static constexpr code_point decode(It& current, S last)
332 |     {
333 |         if (BOOST_LOCALE_UNLIKELY(current == last))
334 |             return incomplete;
335 |         uint16_t w1 = *current++;
336 |         if (BOOST_LOCALE_LIKELY(w1 < 0xD800 || 0xDFFF < w1)) {
337 |             return w1;
338 |         }
339 |         if (w1 > 0xDBFF)
340 |             return illegal;
341 |         if (current == last)
342 |             return incomplete;
343 |         uint16_t w2 = *current++;
344 |         if (w2 < 0xDC00 || 0xDFFF < w2)
345 |             return illegal;
346 |         return combine_surrogate(w1, w2);
347 |     }
348 |     // Non-validating decode: any unit in 0xD800..0xDFFF is assumed to be followed by its partner.
349 |     template <typename It>
350 |     static constexpr code_point decode_valid(It& current)
351 |     {
352 |         uint16_t w1 = *current++;
353 |         if (BOOST_LOCALE_LIKELY(w1 < 0xD800 || 0xDFFF < w1)) {
354 |             return w1;
355 |         }
356 |         uint16_t w2 = *current++;
357 |         return combine_surrogate(w1, w2);
358 |     }
359 |     // Largest number of code units width() can report (constexpr, matching the other specializations).
360 |     static constexpr int max_width = 2;
361 |     // Number of code units encode() writes for u.
362 |     static constexpr int width(code_point u)
363 |     {
364 |         return u >= 0x10000 ? 2 : 1;
365 |     }
366 |     // Writes u as one unit, or as a surrogate pair above 0xFFFF, and returns the advanced iterator.
367 |     template <typename It>
368 |     static constexpr It encode(code_point u, It out)
369 |     {
370 |         if (BOOST_LOCALE_LIKELY(u <= 0xFFFF)) {
371 |             *out++ = static_cast<char_type>(u);
372 |         }
373 |         else {
374 |             u -= 0x10000;
375 |             *out++ = static_cast<char_type>(0xD800 | (u >> 10));
376 |             *out++ = static_cast<char_type>(0xDC00 | (u & 0x3FF));
377 |         }
378 |         return out;
379 |     }
380 |     // Same encoding as above, returned by value as an encoded_chars.
381 |     static constexpr encoded_chars<CharType> encode(code_point u)
382 |     {
383 |         if (BOOST_LOCALE_LIKELY(u <= 0xFFFF)) {
384 |             return {static_cast<char_type>(u)};
385 |         }
386 |         else {
387 |             u -= 0x10000;
388 |             return {static_cast<char_type>(0xD800 | (u >> 10)),
389 |                     static_cast<char_type>(0xDC00 | (u & 0x3FF))};
390 |         }
391 |     }
392 | }; // utf16;
393 | 
394 | /// UTF-32 traits: the specialization chosen when sizeof(CharType) == 4.
395 | template <typename CharType>
396 | struct utf_traits<CharType, 4> {
397 |     // Every code point occupies exactly one code unit in this encoding.
398 |     using char_type = CharType;
399 | 
400 |     // 0 when c is a valid code point, -1 otherwise.
401 |     static constexpr int trail_length(char_type c)
402 |     {
403 |         return is_valid_codepoint(c) ? 0 : -1;
404 |     }
405 | 
406 |     // No unit is ever a trail unit.
407 |     static constexpr bool is_trail(char_type /*c*/) { return false; }
408 | 
409 |     // Every unit is treated as a lead unit.
410 |     static constexpr bool is_lead(char_type /*c*/) { return true; }
411 | 
412 |     // Reads one unit without any checks and advances current past it.
413 |     template <typename It>
414 |     static constexpr code_point decode_valid(It& current)
415 |     {
416 |         const code_point unit = *current++;
417 |         return unit;
418 |     }
419 | 
420 |     // Reads one unit: `incomplete` at end of input, `illegal` when the unit
421 |     // is not a valid code point, otherwise the unit itself.
422 |     template <typename It, typename S>
423 |     static constexpr code_point decode(It& current, S last)
424 |     {
425 |         if (BOOST_LOCALE_UNLIKELY(current == last)) {
426 |             return incomplete;
427 |         }
428 |         const code_point c = *current++;
429 |         return BOOST_LOCALE_UNLIKELY(!is_valid_codepoint(c)) ? illegal : c;
430 |     }
431 | 
432 |     // Largest value width() can report.
433 |     static constexpr int max_width = 1;
434 | 
435 |     // Always a single code unit.
436 |     static constexpr int width(code_point /*u*/) { return 1; }
437 | 
438 |     // Writes u as one unit through out and returns the advanced iterator.
439 |     template <typename It>
440 |     static constexpr It encode(code_point u, It out)
441 |     {
442 |         *out++ = static_cast<char_type>(u);
443 |         return out;
444 |     }
445 | 
446 |     // Value-returning form: a one-element encoded_chars holding u.
447 |     static constexpr encoded_chars<char_type> encode(code_point u)
448 |     {
449 |         return encoded_chars<char_type>{static_cast<char_type>(u)};
450 |     }
451 | 
452 | }; // utf32
453 | 
454 | } // end namespace detail
455 | } // end namespace utf_ranges
456 | } // end namespace tcb
457 | 
458 | #endif // TCB_UTF_RANGES_DETAIL_UTF_HPP_INCLUDED
459 | 


--------------------------------------------------------------------------------
/include/tcb/utf_ranges/istreambuf_range.hpp:
--------------------------------------------------------------------------------
  1 | 
  2 | //  Based on istreambuf_range.hpp from Range-V3
  3 | //  Copyright Eric Niebler 2013-2014
  4 | //
  5 | //  Modifications (c) 2016 Tristan Brindle
  6 | //
  7 | //  Use, modification and distribution is subject to the
  8 | //  Boost Software License, Version 1.0. (See accompanying
  9 | //  file LICENSE_1_0.txt or copy at
 10 | //  http://www.boost.org/LICENSE_1_0.txt)
 11 | //
 12 | 
 13 | #ifndef TCB_UTF_RANGES_ISTREAMBUF_RANGE_HPP_INCLUDED
 14 | #define TCB_UTF_RANGES_ISTREAMBUF_RANGE_HPP_INCLUDED
 15 | 
 16 | #include <istream>
 17 | #include <range/v3/range_fwd.hpp>
 18 | #include <range/v3/view_facade.hpp>
 19 | #include <range/v3/utility/semiregular.hpp>
 20 | #include <range/v3/utility/static_const.hpp>
 21 | 
 22 | namespace tcb {
 23 | namespace utf_ranges {
 24 | 
 25 | namespace rng = ::ranges::v3;
 26 | using rng::static_const;
 27 | /// View over the characters of a std::basic_streambuf; every advance pulls one character with sbumpc().
 28 | template<typename CharT = char, typename Traits = std::char_traits<CharT>>
 29 | struct istreambuf_range
 30 |   : rng::view_facade<istreambuf_range<CharT, Traits>, rng::unknown>
 31 | {
 32 | private:
 33 |     friend rng::range_access;
 34 |     std::basic_streambuf<CharT, Traits> *sbin_ = nullptr; // buffer being read; null when default-constructed
 35 |     bool done_ = false;                                   // set once sbumpc() has returned Traits::eof()
 36 |     typename Traits::int_type obj_{};                     // most recent value returned by sbumpc()
 37 |     // The cursor holds no position of its own: it forwards everything to the owning range.
 38 |     struct cursor
 39 |     {
 40 |     private:
 41 |         istreambuf_range *rng_ = nullptr;
 42 |     public:
 43 |         cursor() = default;
 44 |         explicit cursor(istreambuf_range &rng)
 45 |           : rng_(&rng)
 46 |         {}
 47 |         // Advances the owning range by one character.
 48 |         void next()
 49 |         {
 50 |             rng_->next();
 51 |         }
 52 |         // Current character, i.e. the range's stored obj_ converted with Traits::to_char_type.
 53 |         CharT get() const noexcept
 54 |         {
 55 |             return Traits::to_char_type(rng_->obj_);
 56 |         }
 57 |         // True once the owning range has seen end-of-file.
 58 |         bool done() const
 59 |         {
 60 |             return rng_->done_;
 61 |         }
 62 | 
 63 |     };
 64 |     // Reads the next character into obj_ and latches done_ on eof; done_ is never reset.
 65 |     void next()
 66 |     {
 67 |         obj_ = sbin_->sbumpc();
 68 |         if (obj_ == Traits::eof()) {
 69 |             done_ = true;
 70 |         }
 71 |     }
 72 |     // Hands out a cursor bound to this range.
 73 |     cursor begin_cursor()
 74 |     {
 75 |         return cursor{*this};
 76 |     }
 77 | 
 78 | public:
 79 |     istreambuf_range() = default;
 80 |     // Binds to sin's stream buffer and immediately reads the first character.
 81 |     istreambuf_range(std::basic_istream<CharT, Traits>& sin)
 82 |       : sbin_(sin.rdbuf())
 83 |     {
 84 |         next(); // prime the pump
 85 |     }
 86 | };
 87 | 
 88 | /// Function object that wraps an input stream in an istreambuf_range with matching CharT/Traits.
 89 | struct istreambuf_fn
 90 | {
 91 |     template <typename CharT, typename Traits>
 92 |     auto operator()(std::basic_istream<CharT, Traits>& sin) const
 93 |         -> istreambuf_range<CharT, Traits>
 94 |     {
 95 |         return istreambuf_range<CharT, Traits>(sin);
 96 |     }
 97 | };
 98 | 
 99 | RANGES_INLINE_VARIABLE(istreambuf_fn, istreambuf);
100 | 
101 | 
102 | } // end namespace utf_ranges
103 | } // end namespace tcb
104 | 
105 | #endif // TCB_UTF_RANGES_ISTREAMBUF_RANGE_HPP_INCLUDED
106 | 


--------------------------------------------------------------------------------
/include/tcb/utf_ranges/ostreambuf_iterator.hpp:
--------------------------------------------------------------------------------
 1 | 
 2 | //  Based on ranges::ostream_iterator from Range-V3
 3 | //  Copyright Eric Niebler 2013-2014
 4 | //
 5 | //  Modifications (c) 2016 Tristan Brindle
 6 | //
 7 | //  Use, modification and distribution is subject to the
 8 | //  Boost Software License, Version 1.0. (See accompanying
 9 | //  file LICENSE_1_0.txt or copy at
10 | //  http://www.boost.org/LICENSE_1_0.txt)
11 | 
12 | #ifndef TCB_UTF_RANGES_OSTREAMBUF_ITERATOR_HPP_INCLUDED
13 | #define TCB_UTF_RANGES_OSTREAMBUF_ITERATOR_HPP_INCLUDED
14 | 
15 | #include <range/v3/range_fwd.hpp>
16 | 
17 | namespace tcb {
18 | namespace utf_ranges {
19 | /// Output iterator that writes every assigned character to a stream buffer with sputc().
20 | template <typename Char = char, typename Traits = std::char_traits <Char>>
21 | struct ostreambuf_iterator {
22 | private:
23 |     std::basic_streambuf<Char, Traits>* sout_ = nullptr; // null until bound, so the assert in proxy sees a defined value
24 | 
25 |     struct proxy {
26 |         std::basic_streambuf<Char, Traits>* sout_;
27 |         // Assignment performs the write; the result of sputc() is not inspected.
28 |         proxy& operator=(Char t)
29 |         {
30 |             RANGES_ASSERT(sout_);
31 |             sout_->sputc(t);
32 |             return *this;
33 |         }
34 |     };
35 | 
36 | public:
37 |     using difference_type = std::ptrdiff_t;
38 |     using char_type = Char;
39 |     using traits_type = Traits;
40 |     // A default-constructed iterator is unbound (sout_ == nullptr) and must not be written through.
41 |     ostreambuf_iterator() = default;
42 |     // Binds to the stream buffer of sout.
43 |     ostreambuf_iterator(std::basic_ostream<Char, Traits>& sout) noexcept
44 |             : sout_(sout.rdbuf()) {}
45 |     // Returns a proxy whose assignment operator does the actual write.
46 |     proxy operator*() const noexcept
47 |     {
48 |         return {sout_};
49 |     }
50 |     // Increment is a no-op: each write advances the underlying buffer itself.
51 |     ostreambuf_iterator& operator++()
52 |     {
53 |         return *this;
54 |     }
55 | 
56 |     ostreambuf_iterator operator++(int)
57 |     {
58 |         return *this;
59 |     }
60 | };
61 | 
62 | } // end namespace utf_ranges
63 | } // end namespace tcb
64 | 
65 | #endif // TCB_UTF_RANGES_OSTREAMBUF_ITERATOR_HPP_INCLUDED


--------------------------------------------------------------------------------
/include/tcb/utf_ranges/view.hpp:
--------------------------------------------------------------------------------
 1 | 
 2 | // Copyright (c) 2016 Tristan Brindle (tcbrindle at gmail dot com)
 3 | // Distributed under the Boost Software License, Version 1.0. (See accompanying
 4 | // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
 5 | 
 6 | #ifndef TCB_UTF_RANGES_VIEW_HPP_INCLUDED
 7 | #define TCB_UTF_RANGES_VIEW_HPP_INCLUDED
 8 | 
 9 | #include <tcb/utf_ranges/view/bom.hpp>
10 | #include <tcb/utf_ranges/view/bytes.hpp>
11 | #include <tcb/utf_ranges/view/endian_convert.hpp>
12 | #include <tcb/utf_ranges/view/line_end_transform.hpp>
13 | #include <tcb/utf_ranges/view/utf_convert.hpp>
14 | 
15 | #endif // TCB_UTF_RANGES_VIEW_HPP_INCLUDED
16 | 


--------------------------------------------------------------------------------
/include/tcb/utf_ranges/view/bom.hpp:
--------------------------------------------------------------------------------
  1 | 
  2 | // Copyright (c) 2016 Tristan Brindle (tcbrindle at gmail dot com)
  3 | // Distributed under the Boost Software License, Version 1.0. (See accompanying
  4 | // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
  5 | 
  6 | #ifndef TCB_UTF_RANGES_VIEW_BOM_HPP_INCLUDED
  7 | #define TCB_UTF_RANGES_VIEW_BOM_HPP_INCLUDED
  8 | 
  9 | #include <tcb/utf_ranges/view/endian_convert.hpp>
 10 | #include <tcb/utf_ranges/view/utf_convert.hpp>
 11 | 
 12 | #include <range/v3/algorithm/copy_n.hpp>
 13 | #include <range/v3/algorithm/equal.hpp>
 14 | #include <range/v3/view_interface.hpp>
 15 | #include <range/v3/view/concat.hpp>
 16 | #include <range/v3/view/drop.hpp>
 17 | #include <range/v3/view/single.hpp>
 18 | #include <range/v3/view/take.hpp>
 19 | 
 20 | namespace tcb {
 21 | namespace utf_ranges {
 22 | 
 23 | using rng::static_const;
 24 | using rng::operator|;
 25 | 
 26 | namespace view {
 27 | 
 28 | namespace detail {
 29 | template <std::size_t> constexpr std::size_t bom_size_helper = 0; // any other code-unit size: no BOM length known
 30 | template<> constexpr std::size_t bom_size_helper<1> = 3; // 1-byte units: the three bytes EF BB BF
 31 | template<> constexpr std::size_t bom_size_helper<2> = 1; // 2-byte units: a single U+FEFF unit
 32 | template<> constexpr std::size_t bom_size_helper<4> = 1; // 4-byte units: a single U+FEFF unit
 33 | // bom_size<T>::value is the BOM length, in code units, for code-unit type T (selected by sizeof(T)).
 34 | template <typename T>
 35 | struct bom_size {
 36 |     static constexpr std::size_t value = bom_size_helper<sizeof(T)>;
 37 | };
 38 | // Variable-template shorthand for bom_size<T>::value.
 39 | template <typename T> constexpr std::size_t bom_size_v = bom_size<T>::value;
 40 | // The byte order that is the opposite of boost::endian::order::native.
 41 | constexpr boost::endian::order nonnative_order =
 42 |         boost::endian::order::native == boost::endian::order::big
 43 |         ? boost::endian::order::little
 44 |         : boost::endian::order::big;
 45 | /// True if `range` starts with a native-order BOM for its code-unit size (EF BB BF for 1-byte units, U+FEFF otherwise).
 46 | template <typename Range>
 47 | bool has_bom(const Range& range)
 48 | {
 49 |     using v = rng::range_value_t<Range>;
 50 |     // Walk the range with iterators instead of rng::size() so that forward
 51 |     // ranges which are not sized can be tested as well.
 52 |     auto it = rng::begin(range);
 53 |     const auto last = rng::end(range);
 54 |     if (it == last) return false;
 55 | 
 56 |     // Better to use if constexpr once we get it, but this should be
 57 |     // easily optimised too
 58 |     switch (sizeof(v)) {
 59 |     case 1: // each step checks for the end first, so short ranges are never over-read
 60 |         return static_cast<uint8_t>(*it) == 0xEF &&
 61 |                 ++it != last && static_cast<uint8_t>(*it) == 0xBB &&
 62 |                 ++it != last && static_cast<uint8_t>(*it) == 0xBF;
 63 |     case 2:
 64 |         return static_cast<char16_t>(*it) == u'\uFEFF';
 65 |     case 4:
 66 |         return static_cast<char32_t>(*it) == U'\uFEFF';
 67 |     default:
 68 |         return false;
 69 |     }
 70 | }
 71 | /// True if `range` starts with a byte-swapped U+FEFF; only 2- and 4-byte code units are examined.
 72 | template <typename Range>
 73 | bool has_swapped_bom(const Range& range)
 74 | {
 75 |     using v = rng::range_value_t<Range>;
 76 |     if (rng::begin(range) == rng::end(range)) return false; // empty range: nothing to dereference
 77 |     switch (sizeof(v)) {
 78 |     case 2:
 79 |         return static_cast<std::uint16_t>(*rng::begin(range)) ==
 80 |                 boost::endian::endian_reverse(static_cast<uint16_t>(u'\uFEFF'));
 81 |     case 4:
 82 |         return static_cast<std::uint32_t>(*rng::begin(range)) ==
 83 |                 boost::endian::endian_reverse(static_cast<uint32_t>(U'\uFEFF'));
 84 |     default:
 85 |         return false;
 86 |     }
 87 | }
 88 | /// View adaptor that yields the characters of a stored string (bom_) first and the wrapped range's elements after them.
 89 | template <typename Rng>
 90 | struct bom_concat_view : rng::view_adaptor<bom_concat_view<Rng>, Rng>
 91 | {
 92 | private:
 93 |     using string_type = std::basic_string<rng::range_value_t<Rng>>;
 94 | 
 95 |     struct adaptor : rng::adaptor_base {
 96 | 
 97 |         adaptor() = default;
 98 |         // Captures iterators into view.bom_, so the adaptor is only usable while that view object stays alive and in place.
 99 |         adaptor(bom_concat_view& view)
100 |                 : bom_first_(view.bom_.begin()),
101 |                   bom_last_(view.bom_.end())
102 |         {}
103 |         // Current element: the pending stored character if any remain, otherwise the wrapped iterator's element.
104 |         auto get(rng::range_iterator_t<Rng> it) const
105 |         {
106 |             if (bom_first_ != bom_last_) {
107 |                 return *bom_first_;
108 |             } else {
109 |                 return *it;
110 |             }
111 |         }
112 |         // Advance: step through the stored characters first, and only then the wrapped iterator.
113 |         void next(rng::range_iterator_t<Rng>& it)
114 |         {
115 |             if (bom_first_ != bom_last_) {
116 |                 ++bom_first_;
117 |             } else {
118 |                 ++it;
119 |             }
120 |         }
121 | 
122 |         rng::range_iterator_t<string_type> bom_first_{};
123 |         rng::range_sentinel_t<string_type> bom_last_{};
124 |     };
125 |     // NOTE(review): no end/empty override is visible here; if the inherited end check looks only at the wrapped iterator,
126 |     // stored characters would be skipped when the wrapped range is already exhausted -- confirm against range-v3.
127 | public:
128 |     bom_concat_view() = default;
129 |     // Takes ownership of both the wrapped range and the characters to emit before it.
130 |     bom_concat_view(Rng rng, string_type bom)
131 |             : rng::view_adaptor<bom_concat_view, Rng>(std::move(rng)),
132 |               bom_(std::move(bom))
133 |     {}
134 | 
135 |     adaptor begin_adaptor() { return adaptor{*this}; }
136 | 
137 | private:
138 |     string_type bom_;
139 | };
140 | 
141 | } // end namespace detail
142 | /// Removes a leading BOM (if present) from a range and passes the rest through endian_convert with the byte order the BOM implied.
143 | struct consume_bom_fn {
144 |     template <typename Range,
145 |               CONCEPT_REQUIRES_(rng::ForwardRange<Range>())>
146 |     auto operator()(Range&& range) const
147 |     {
148 |         using value_type = rng::range_value_t<Range>;
149 |         rng::range_difference_t<Range> bom_size = 0;
150 |         boost::endian::order byte_order = boost::endian::order::native;
151 | 
152 |         if (detail::has_bom(range)) {
153 |             bom_size = detail::bom_size_v<value_type>;
154 |         } else if (detail::has_swapped_bom(range)) {
155 |             bom_size = detail::bom_size_v<value_type>;
156 |             byte_order = detail::nonnative_order;
157 |         }
158 |         return endian_convert<>(rng::view::drop(std::forward<Range>(range), bom_size),
159 |                                 byte_order);
160 |     }
161 | 
162 |     template <typename Range,
163 |               CONCEPT_REQUIRES_(rng::InputRange<Range>() &&
164 |                                 !rng::ForwardRange<Range>())>
165 |     auto operator()(Range&& range) const
166 |     {
167 |         using value_type = rng::range_value_t<Range>;
168 |         constexpr rng::range_difference_t<Range> bom_size = detail::bom_size_v<value_type>;
169 |         boost::endian::order byte_order = boost::endian::order::native;
170 | 
171 |         // For InputRanges (only), testing for the BOM will "eat up" the first
172 |         // character(s) of the range. So save them in a temporary string so that
173 |         // we can put them back later if it turns out not to be a BOM.
174 |         // Stop at the end of the range so inputs shorter than a BOM are not over-read.
175 |         std::basic_string<value_type> buf{};
176 |         auto first = rng::begin(range);
177 |         const auto last = rng::end(range);
178 |         for (auto n = bom_size; n > 0 && first != last; --n, ++first)
179 |             buf.push_back(*first);
180 | 
181 |         if (detail::has_bom(buf)) {
182 |             buf.clear();
183 |         } else if (detail::has_swapped_bom(buf)) {
184 |             buf.clear();
185 |             byte_order = detail::nonnative_order;
186 |         }
187 |         return endian_convert<>(
188 |                 detail::bom_concat_view<rng::view::all_t<Range>>(
189 |                         rng::view::all(std::forward<Range>(range)),
190 |                         std::move(buf)),
191 |                 byte_order);
192 |     }
193 | 
194 |     decltype(auto) operator()() const {
195 |         return rng::make_pipeable(std::bind(*this));
196 |     }
197 | };
198 | 
199 | RANGES_INLINE_VARIABLE(rng::view::view<consume_bom_fn>, consume_bom)
200 | 
201 | struct add_bom_fn {
202 |     // Prepends U+FEFF, encoded in the range's own element type, to `range`.
203 |     template <typename Range>
204 |     auto operator()(Range&& range) const
205 |     {
206 |         using unit_type = rng::range_value_t<Range>;
207 |         constexpr char32_t byte_order_mark = U'\uFEFF';
208 |         auto encoded_mark = utf_convert<unit_type>(rng::view::single(byte_order_mark));
209 |         return rng::view::concat(std::move(encoded_mark),
210 |                                  std::forward<Range>(range));
211 |     }
212 |     // Zero-argument form: returns a pipeable object built from a bound copy of *this.
213 |     decltype(auto) operator()() const
214 |     {
215 |         return rng::make_pipeable(std::bind(*this));
216 |     }
217 | };
218 | 
219 | RANGES_INLINE_VARIABLE(rng::view::view<add_bom_fn>, add_bom)
220 | 
221 | } // end namespace view
222 | } // end namespace utf_ranges
223 | } // end namespace tcb
224 | 
225 | #endif // TCB_UTF_RANGES_VIEW_BOM_HPP_INCLUDED
226 | 


--------------------------------------------------------------------------------
/include/tcb/utf_ranges/view/bytes.hpp:
--------------------------------------------------------------------------------
 1 | 
 2 | // Copyright (c) 2016 Tristan Brindle (tcbrindle at gmail dot com)
 3 | // Distributed under the Boost Software License, Version 1.0. (See accompanying
 4 | // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
 5 | 
 6 | #ifndef TCB_UTF_RANGES_VIEW_BYTES_HPP_INCLUDED
 7 | #define TCB_UTF_RANGES_VIEW_BYTES_HPP_INCLUDED
 8 | 
 9 | #include <range/v3/view_adaptor.hpp>
10 | #include <range/v3/view/view.hpp>
11 | 
12 | namespace tcb {
13 | namespace utf_ranges {
14 | 
15 | namespace rng = ::ranges::v3;
16 | using rng::static_const;
17 | 
18 | template <typename Rng>
19 | class bytes_view : public rng::view_adaptor<bytes_view<Rng>, Rng>
20 | { // Adapts Rng so each element is visited as sizeof(value_type) bytes, in memory order.
21 | private:
22 |     using value_type = rng::range_value_t<Rng>;
23 |     using byte = unsigned char;
24 | 
25 |     friend rng::range_access;
26 | 
27 |     struct adaptor : rng::adaptor_base
28 |     {
29 |         adaptor() = default;
30 |         // Record the end of the base range and buffer the first element, if any.
31 |         adaptor(const bytes_view& b) : last_(b.mutable_base().end())
32 |         {
33 |             if (b.mutable_base().begin() != last_) {
34 |                 fill_buffer(b.mutable_base().begin());
35 |             }
36 |         }
37 |         // Copy the object representation of *it into buf_ and restart at byte 0.
38 |         void fill_buffer(rng::range_iterator_t<Rng> it)
39 |         {
40 |             const value_type t = *it;
41 |             std::copy(reinterpret_cast<const byte*>(&t),
42 |                       reinterpret_cast<const byte*>(&t) + sizeof(value_type),
43 |                       buf_.begin());
44 |             idx_ = 0;
45 |         }
46 | 
47 |         // Pure read: dereferencing twice yields the same byte.
48 |         byte get(rng::range_iterator_t<Rng>) const {
49 |             return buf_[idx_];
50 |         }
51 |         // Step one byte; past an element's last byte move on, refilling unless at the end.
52 |         void next(rng::range_iterator_t<Rng>& it) {
53 |             if (++idx_ == static_cast<int>(sizeof(value_type))) {
54 |                 if (++it != last_) { fill_buffer(it); }
55 |             }
56 |         }
57 |         rng::range_sentinel_t<Rng> last_{}; // end of the base range; never dereferenced
58 |         std::array<byte, sizeof(value_type)> buf_{{}};
59 |         int idx_ = 0; // index into buf_ of the byte get() returns
60 |     };
61 | 
62 | public:
63 | 
64 |     bytes_view() = default;
65 | 
66 |     bytes_view(Rng range)
67 |             : rng::view_adaptor<bytes_view, Rng>(std::move(range))
68 |     {}
69 | 
70 |     adaptor begin_adaptor() const { return adaptor{*this}; }
71 | };
72 | 
73 | namespace view {
74 | 
75 | struct bytes_fn {
76 |     // Wraps `range` (via view::all) in a bytes_view.
77 |     template <typename Rng>
78 |     bytes_view<rng::view::all_t<Rng>> operator()(Rng&& range) const
79 |     {
80 |         return {rng::view::all(std::forward<Rng>(range))};
81 |     }
82 |     // Zero-argument form: must return the pipeable, otherwise decltype(auto) deduces void.
83 |     decltype(auto) operator()() const
84 |     {
85 |         return rng::make_pipeable(std::bind(*this));
86 |     }
87 | 
88 | };
89 | 
90 | RANGES_INLINE_VARIABLE(rng::view::view<bytes_fn>, bytes);
91 | 
92 | } // end namespace view
93 | } // end namespace utf_ranges
94 | } // end namespace tcb
95 | 
96 | #endif // TCB_UTF_RANGES_VIEW_BYTES_HPP_INCLUDED
97 | 


--------------------------------------------------------------------------------
/include/tcb/utf_ranges/view/endian_convert.hpp:
--------------------------------------------------------------------------------
  1 | 
  2 | // Copyright (c) 2016 Tristan Brindle (tcbrindle at gmail dot com)
  3 | // Distributed under the Boost Software License, Version 1.0. (See accompanying
  4 | // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
  5 | 
  6 | #ifndef TCB_UTF_RANGES_VIEW_ENDIAN_CONVERT_HPP_INCLUDED
  7 | #define TCB_UTF_RANGES_VIEW_ENDIAN_CONVERT_HPP_INCLUDED
  8 | 
  9 | #include <boost/endian/conversion.hpp>
 10 | #include <range/v3/view_adaptor.hpp>
 11 | #include <range/v3/view/transform.hpp>
 12 | 
 13 | namespace tcb {
 14 | namespace utf_ranges {
 15 | 
 16 | namespace rng = ::ranges::v3;
 17 | using rng::static_const;
 18 | using rng::operator|;
 19 | 
 20 | namespace detail {
 21 | 
 22 | // Boost.Endian does not byte-swap character types well: [unsigned] char
 23 | // should come back unaltered, but instead it is promoted to int,
 24 | // byte-swapped and converted back to char, which always yields zero. The same
 25 | // happens to char16_t. As a workaround, values are wrapped in a
 26 | // swap_wrapper<> struct, and endian_reverse "overloads" for wrapped char,
 27 | // char16_t and wchar_t are declared here (functions that the Boost library
 28 | // finds via ADL). Every other wrapped type is forwarded to the regular Boost
 29 | // conversion function.
 30 | 
 31 | template <typename T>
 32 | struct swap_wrapper {
 33 |     T value;
 34 | };
 35 | 
 36 | template <typename T>
 37 | swap_wrapper<T> make_swap_wrapper(T t)
 38 | {
 39 |     return {t};
 40 | }
 41 | // Generic case: defer to Boost's own endian_reverse on the wrapped value.
 42 | template <typename T>
 43 | swap_wrapper<T> endian_reverse(swap_wrapper<T> s) noexcept
 44 | {
 45 |     return {boost::endian::endian_reverse(s.value)};
 46 | }
 47 | // A wrapped char is returned as-is.
 48 | inline swap_wrapper<char> endian_reverse(swap_wrapper<char> s) noexcept
 49 | {
 50 |     return s;
 51 | }
 52 | // char16_t is swapped as a std::uint16_t and cast back.
 53 | inline swap_wrapper<char16_t> endian_reverse(swap_wrapper<char16_t> s) noexcept
 54 | {
 55 |     const auto as_uint = static_cast<std::uint16_t>(s.value);
 56 |     const auto swapped = boost::endian::endian_reverse(as_uint);
 57 | 
 58 |     return make_swap_wrapper(static_cast<char16_t>(swapped));
 59 | }
 60 | // wchar_t is swapped through its char_traits int_type, which must have the same size.
 61 | inline swap_wrapper<wchar_t> endian_reverse(swap_wrapper<wchar_t> s) noexcept
 62 | {
 63 |     using traits = std::char_traits<wchar_t>;
 64 |     static_assert(sizeof(traits::int_type) == sizeof(wchar_t), "");
 65 | 
 66 |     const traits::int_type as_int = traits::to_int_type(s.value);
 67 |     const wchar_t swapped = traits::to_char_type(boost::endian::endian_reverse(as_int));
 68 | 
 69 |     return swap_wrapper<wchar_t>{swapped};
 70 | }
 71 | 
 72 | } // end namespace detail
 73 | 
 74 | namespace view {
 75 | 
 76 | template <boost::endian::order DestOrder>
 77 | struct endian_convert_fn { // Lazily converts each element from src_order to DestOrder.
 78 |     template <typename Range>
 79 |     auto operator()(Range&& range,
 80 |                     boost::endian::order src_order = boost::endian::order::native) const
 81 |     {
 82 |         const auto to_dest_order = [src_order] (auto unit) {
 83 |             const auto wrapped = detail::make_swap_wrapper(unit);
 84 |             return boost::endian::conditional_reverse(wrapped, src_order, DestOrder).value;
 85 |         };
 86 |         return rng::view::transform(std::forward<Range>(range),
 87 |                                     std::move(to_dest_order));
 88 |     }
 89 |     // Pipeable form: binds the source order and leaves the range as the open argument.
 90 |     decltype(auto) operator()(boost::endian::order src_endian = boost::endian::order::native) const
 91 |     {
 92 |         return rng::make_pipeable(std::bind(*this, std::placeholders::_1,
 93 |                                             rng::protect(src_endian)));
 94 |     }
 95 | };
 96 | 
 97 | inline namespace
 98 | {
 99 |     template <boost::endian::order DestOrder = boost::endian::order::native> // endian_convert<> targets native order
100 |     constexpr auto& endian_convert = static_const<rng::view::view<endian_convert_fn<DestOrder>>>::value;
101 | }
102 | 
103 | } // end namespace view
104 | } // end namespace utf_ranges
105 | } // end namespace tcb
106 | 
107 | #endif // TCB_UTF_RANGES_VIEW_ENDIAN_CONVERT_HPP_INCLUDED
108 | 


--------------------------------------------------------------------------------
/include/tcb/utf_ranges/view/line_end_transform.hpp:
--------------------------------------------------------------------------------
 1 | 
 2 | // Copyright (c) 2016 Tristan Brindle (tcbrindle at gmail dot com)
 3 | // Distributed under the Boost Software License, Version 1.0. (See accompanying
 4 | // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
 5 | 
 6 | #ifndef TCB_UTF_RANGES_VIEW_LINE_END_TRANSFORM_HPP_INCLUDED
 7 | #define TCB_UTF_RANGES_VIEW_LINE_END_TRANSFORM_HPP_INCLUDED
 8 | 
 9 | #include <range/v3/view_adaptor.hpp>
10 | #include <tcb/utf_ranges/view/utf_convert.hpp>
11 | 
12 | namespace tcb {
13 | namespace utf_ranges {
14 | 
15 | namespace rng = ::ranges::v3;
16 | 
17 | template <typename Rng>
18 | class line_end_transform_view // Maps the line terminators listed in get() to U'\n'; CR LF becomes one U'\n'.
19 |         : public rng::view_adaptor<line_end_transform_view<Rng>, Rng>
20 | {
21 | private:
22 |     friend rng::range_access;
23 | 
24 |     struct adaptor : rng::adaptor_base {
25 |         adaptor() = default;
26 |         explicit adaptor(const line_end_transform_view& v) : last_(rng::end(v.mutable_base())) {}
27 |         char32_t get(rng::range_iterator_t<Rng> it) const
28 |         {
29 |             char32_t c = *it;
30 |             switch (c) {
31 |             case U'\u0085': // Next line (NEL)
32 |             case U'\u000B': // Vertical tab (VT)
33 |             case U'\u000C': // Form feed (FF)
34 |             case U'\u2028': // Line separator (LS)
35 |             case U'\u2029': // Paragraph separator (PS)
36 |                 c = U'\n';
37 |                 break;
38 |             case U'\u000D': // Carriage return (CR)
39 |                 c = U'\n';
40 |                 // If a next character exists and is LF, skip it (a trailing CR has none)
41 |                 skip_next_ = (++it != last_ && *it == U'\u000A');
42 |                 break;
43 |             }
44 | 
45 |             return c;
46 |         }
47 |         // Advance one element, plus the LF that get() flagged after a CR.
48 |         void next(rng::range_iterator_t<Rng>& it)
49 |         {
50 |             ++it;
51 |             if (skip_next_) {
52 |                 ++it;
53 |                 skip_next_ = false;
54 |             }
55 |         }
56 |         // NOTE(review): compares only the skip flag, not any position -- confirm this is intended.
57 |         bool equal(const adaptor& other) const
58 |         {
59 |             return skip_next_ == other.skip_next_;
60 |         }
61 |         rng::range_sentinel_t<Rng> last_{}; // end of the base range, used to bound the CR look-ahead
62 |         mutable bool skip_next_ = false;
63 |     };
64 | 
65 | public:
66 |     // The adaptor needs the end of the base range, so hand it this view.
67 |     adaptor begin_adaptor() const { return adaptor{*this}; }
68 | 
69 |     line_end_transform_view() = default;
70 | 
71 |     line_end_transform_view(Rng rng)
72 |         : rng::view_adaptor<line_end_transform_view, Rng>{std::move(rng)}
73 |     {}
74 | };
75 | 
76 | namespace view {
77 | 
78 | template <typename Rng>
79 | auto line_end_transform(Rng&& range)
80 | {
81 |     // Round trip: view as UTF-32, rewrite line endings, convert back to the input's element type
82 |     using unit_type = rng::range_value_t<Rng>;
83 |     auto code_points = rng::view::all(utf32(std::forward<Rng>(range)));
84 |     using code_point_view = decltype(code_points);
85 |     line_end_transform_view<code_point_view> normalised{std::move(code_points)};
86 |     return utf_convert<unit_type>(std::move(normalised));
87 | }
88 | 
89 | 
90 | } // end namespace view
91 | } // end namespace utf_ranges
92 | } // end namespace tcb
93 | 
94 | #endif // TCB_UTF_RANGES_VIEW_LINE_END_TRANSFORM_HPP_INCLUDED
95 | 


--------------------------------------------------------------------------------
/include/tcb/utf_ranges/view/utf_convert.hpp:
--------------------------------------------------------------------------------
  1 | 
  2 | // Copyright (c) 2016 Tristan Brindle (tcbrindle at gmail dot com)
  3 | // Distributed under the Boost Software License, Version 1.0. (See accompanying
  4 | // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
  5 | 
  6 | #ifndef TCB_UTF_RANGES_VIEW_UTF_CONVERT_HPP_INCLUDED
  7 | #define TCB_UTF_RANGES_VIEW_UTF_CONVERT_HPP_INCLUDED
  8 | 
  9 | #include <range/v3/view_facade.hpp>
 10 | #include <range/v3/view/all.hpp>
 11 | #include <range/v3/view/view.hpp>
 12 | 
 13 | #include <tcb/utf_ranges/detail/utf.hpp>
 14 | 
 15 | namespace tcb {
 16 | namespace utf_ranges {
 17 | 
 18 | namespace rng = ::ranges::v3;
 19 | using rng::static_const;
 20 | 
 21 | template <typename Range, typename InCharT, typename OutCharT>
 22 | class utf_convert_view // Lazily re-encodes Range from InCharT code units to OutCharT code units.
 23 |         : public rng::view_facade<utf_convert_view<Range, InCharT, OutCharT>, rng::unknown> {
 24 |     struct cursor {
 25 |         cursor() = default;
 26 |         // Captures the parent's begin/end and, if non-empty, buffers the first encoded code point.
 27 |         cursor(utf_convert_view& parent)
 28 |                 : first_(rng::begin(parent.range_)),
 29 |                   last_(rng::end(parent.range_))
 30 |         {
 31 |             if (first_ != last_) {
 32 |                 char32_t c = detail::utf_traits<InCharT>::decode(first_, last_); // presumably advances first_ -- confirm in detail/utf.hpp
 33 |                 next_chars_ = detail::utf_traits<OutCharT>::encode(c);
 34 |             }
 35 |         }
 36 |         // Same as above, for a const parent.
 37 |         cursor(const utf_convert_view& parent)
 38 |                 : first_(rng::begin(parent.range_)),
 39 |                   last_(rng::end(parent.range_))
 40 |         {
 41 |             if (first_ != last_) {
 42 |                 char32_t c = detail::utf_traits<InCharT>::decode(first_, last_);
 43 |                 next_chars_ = detail::utf_traits<OutCharT>::encode(c);
 44 |             }
 45 |         }
 46 |         // Steps to the next buffered unit; once the buffer is used up and input remains, refills it.
 47 |         void next()
 48 |         {
 49 |             if (++idx_ == next_chars_.size() && first_ != last_) {
 50 |                 char32_t c = detail::utf_traits<InCharT>::decode(first_, last_);
 51 |                 next_chars_ = detail::utf_traits<OutCharT>::encode(c);
 52 |                 idx_ = 0;
 53 |             }
 54 |         }
 55 |         // Returns the buffered output unit at the current index.
 56 |         OutCharT get() const
 57 |         {
 58 |             return next_chars_[idx_];
 59 |         }
 60 |         // True once the input is exhausted and every buffered unit has been stepped past.
 61 |         bool done() const
 62 |         {
 63 |             return first_ == last_ && idx_ == next_chars_.size();
 64 |         }
 65 |         // NOTE(review): compares only the buffered units; idx_ and first_ are ignored -- confirm intended.
 66 |         bool equal(const cursor& other) const
 67 |         {
 68 |             return std::tie(next_chars_) ==
 69 |                     std::tie(other.next_chars_);
 70 |         }
 71 | 
 72 |         detail::encoded_chars<OutCharT> next_chars_; // output units of the most recently decoded code point
 73 |         char idx_ = 0; // position within next_chars_
 74 |         rng::range_iterator_t<Range> first_{};
 75 |         rng::range_sentinel_t<Range> last_{};
 76 |     };
 77 | 
 78 | public:
 79 |     cursor begin_cursor() { return cursor{*this}; }
 80 | 
 81 |     CONCEPT_REQUIRES(rng::Range<const Range>())
 82 |     cursor begin_cursor() const { return cursor{*this}; }
 83 | 
 84 |     utf_convert_view() = default;
 85 | 
 86 |     utf_convert_view(Range range)
 87 |             : range_{std::move(range)} {}
 88 | 
 89 | private:
 90 |     Range range_{};
 91 |     friend rng::range_access;
 92 | };
 93 | 
 94 | namespace view {
 95 | 
 96 | template <typename OutCharT>
 97 | struct utf_convert_fn { // Callable that wraps a range in a utf_convert_view producing OutCharT units.
 98 |     template <typename Range,
 99 |               typename InCharT = rng::range_value_t<Range>>
100 |     utf_convert_view<rng::view::all_t<Range>, InCharT, OutCharT>
101 |     operator()(Range&& range) const
102 |     {
103 |         return {rng::view::all(std::forward<Range>(range))};
104 |     }
105 |     // Zero-argument form: returns a pipeable object built from a bound copy of *this.
106 |     decltype(auto) operator()() const
107 |     {
108 |         return rng::make_pipeable(std::bind(*this));
109 |     }
110 | };
111 | 
112 | inline namespace
113 | {
114 |     template <typename OutCharT> // the output code unit type is chosen explicitly, e.g. utf_convert<char16_t>
115 |     constexpr auto& utf_convert = static_const<rng::view::view<utf_convert_fn<OutCharT>>>::value;
116 | }
117 | 
118 | struct utf8_fn { // Fixed-output variant: wraps a range in a utf_convert_view producing char units.
119 |     template <typename Range>
120 |     utf_convert_view<rng::view::all_t<Range>, rng::range_value_t<Range>, char>
121 |     operator()(Range&& range) const
122 |     {
123 |         return {rng::view::all(std::forward<Range>(range))};
124 |     }
125 |     // Zero-argument form: returns a pipeable object built from a bound copy of *this.
126 |     decltype(auto) operator()() const
127 |     {
128 |         return rng::make_pipeable(std::bind(*this));
129 |     }
130 | };
131 | 
132 | RANGES_INLINE_VARIABLE(rng::view::view<utf8_fn>, utf8);
133 | 
134 | struct utf16_fn { // Fixed-output variant: wraps a range in a utf_convert_view producing char16_t units.
135 |     template <typename Range>
136 |     utf_convert_view<rng::view::all_t<Range>, rng::range_value_t<Range>, char16_t>
137 |     operator()(Range&& range) const
138 |     {
139 |         return {rng::view::all(std::forward<Range>(range))};
140 |     }
141 |     // Zero-argument form: returns a pipeable object built from a bound copy of *this.
142 |     decltype(auto) operator()() const
143 |     {
144 |         return rng::make_pipeable(std::bind(*this));
145 |     }
146 | };
147 | 
148 | RANGES_INLINE_VARIABLE(rng::view::view<utf16_fn>, utf16);
149 | 
150 | struct utf32_fn { // Fixed-output variant: wraps a range in a utf_convert_view producing char32_t units.
151 |     template <typename Range>
152 |     utf_convert_view<rng::view::all_t<Range>, rng::range_value_t<Range>, char32_t>
153 |     operator()(Range&& range) const
154 |     {
155 |         return {rng::view::all(std::forward<Range>(range))};
156 |     }
157 |     // Zero-argument form: returns a pipeable object built from a bound copy of *this.
158 |     decltype(auto) operator()() const
159 |     {
160 |         return rng::make_pipeable(std::bind(*this));
161 |     }
162 | };
163 | 
164 | RANGES_INLINE_VARIABLE(rng::view::view<utf32_fn>, utf32);
165 | 
166 | } // end namespace view
167 | } // end namespace utf_ranges
168 | } // end namespace tcb
169 | 
170 | #endif // TCB_UTF_RANGES_VIEW_UTF_CONVERT_HPP_INCLUDED
171 | 


--------------------------------------------------------------------------------
/test/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | 
 2 | add_executable(utf_ranges_test  # One test binary; main() comes from catch_main.cpp (CATCH_CONFIG_MAIN).
 3 |     bom_test.cpp
 4 |     bytes_test.cpp
 5 |     catch_main.cpp
 6 |     endian_test.cpp  # NOTE(review): convert_test.cpp exists in test/ but is not listed here -- confirm intentional.
 7 |     istreambuf_range_test.cpp
 8 |     line_end_transform_test.cpp
 9 |     ostreambuf_iterator_test.cpp
10 |     utf_convert_view_test.cpp
11 |     )
12 | # RANGE_INCLUDE_DIR and Boost_INCLUDE_DIR are not set in this file; presumably the parent CMakeLists.txt defines them -- TODO confirm.
13 | target_include_directories(utf_ranges_test PRIVATE
14 |         ${RANGE_INCLUDE_DIR}
15 |         ${Boost_INCLUDE_DIR}
16 |         )
17 | 


--------------------------------------------------------------------------------
/test/bom_test.cpp:
--------------------------------------------------------------------------------
  1 | 
  2 | // Copyright (c) 2016 Tristan Brindle (tcbrindle at gmail dot com)
  3 | // Distributed under the Boost Software License, Version 1.0. (See accompanying
  4 | // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
  5 | 
  6 | #include "catch.hpp"
  7 | 
  8 | #include <tcb/utf_ranges/view/bom.hpp>
  9 | 
 10 | #include <range/v3/algorithm/equal.hpp>
 11 | #include <range/v3/istream_range.hpp>
 12 | #include <iostream>
 13 | #include <sstream>
 14 | 
 15 | #define TEST_STRING "$€0123456789你好abcdefghijklmnopqrstyvwxyz\U0001F60E"
 16 | 
 17 | const auto to_little_endian = [] (const auto& in) { // copy of `in`, each element native -> little endian
 18 |     using unit_type = ranges::range_value_t<decltype(in)>;
 19 |     std::basic_string<unit_type> converted;
 20 |     for (const auto unit : in) {
 21 |         const auto wrapped = tcb::utf_ranges::detail::make_swap_wrapper(unit);
 22 |         converted.push_back(boost::endian::native_to_little(wrapped).value);
 23 |     }
 24 |     return converted;
 25 | };
 26 | 
 27 | const auto to_big_endian = [] (const auto& in) { // copy of `in`, each element native -> big endian
 28 |     using unit_type = ranges::range_value_t<decltype(in)>;
 29 |     std::basic_string<unit_type> converted;
 30 |     for (const auto unit : in) {
 31 |         const auto wrapped = tcb::utf_ranges::detail::make_swap_wrapper(unit);
 32 |         converted.push_back(boost::endian::native_to_big(wrapped).value);
 33 |     }
 34 |     return converted;
 35 | };
 36 | 
 37 | 
 38 | TEST_CASE("Byte order mark is prepended correctly", "[bom]")
 39 | { // add_bom output must equal U+FEFF (in the string's own encoding) followed by the input.
 40 |     SECTION("...for UTF-8") {
 41 |         const std::string str = u8"" TEST_STRING;
 42 |         const std::string test = tcb::utf_ranges::view::add_bom(str);
 43 |         REQUIRE(test == u8"\ufeff" + str);
 44 |     }
 45 | 
 46 |     SECTION("...for UTF-16") {
 47 |         const std::u16string str = u"" TEST_STRING;
 48 |         const std::u16string test = tcb::utf_ranges::view::add_bom(str);
 49 |         REQUIRE(test == u"\ufeff" + str);
 50 |     }
 51 | 
 52 |     SECTION("...for UTF-32") {
 53 |         const std::u32string str = U"" TEST_STRING;
 54 |         const std::u32string test = tcb::utf_ranges::view::add_bom(str);
 55 |         REQUIRE(test == U"\ufeff" + str);
 56 |     }
 57 | 
 58 |     SECTION("...for wchar_t") {
 59 |         const std::wstring str = L"" TEST_STRING;
 60 |         const std::wstring test = tcb::utf_ranges::view::add_bom(str);
 61 |         REQUIRE(test == L"\ufeff" + str);
 62 |     }
 63 | }
 64 | 
 65 | TEST_CASE("Byte order marks are correctly identified", "[bom]")
 66 | { // consume_bom on a string that starts with U+FEFF must return the text without it.
 67 |     SECTION("...in UTF-8") {
 68 |         const std::string str = u8"\uFEFF" TEST_STRING;
 69 |         const std::string test = tcb::utf_ranges::view::consume_bom(str);
 70 |         REQUIRE(test == u8"" TEST_STRING);
 71 |     }
 72 | 
 73 |     SECTION("...in UTF-16") {
 74 |         const std::u16string str = u"\uFEFF" TEST_STRING;
 75 |         const std::u16string test = tcb::utf_ranges::view::consume_bom(str);
 76 |         REQUIRE(test == u"" TEST_STRING);
 77 |     }
 78 | 
 79 |     SECTION("...in UTF-32") {
 80 |         const std::u32string str = U"\uFEFF" TEST_STRING;
 81 |         const std::u32string test = tcb::utf_ranges::view::consume_bom(str);
 82 |         REQUIRE(test == U"" TEST_STRING);
 83 |     }
 84 | }
 85 | 
 86 | TEST_CASE("Strings without byte order marks are unchanged", "[bom]")
 87 | { // Input with no leading BOM must come back from consume_bom unchanged.
 88 |     SECTION("...in UTF-8") {
 89 |         const std::string str = u8"" TEST_STRING;
 90 |         const std::string test = tcb::utf_ranges::view::consume_bom(str);
 91 |         REQUIRE(test == u8"" TEST_STRING);
 92 |     }
 93 | 
 94 |     SECTION("...in UTF-16") {
 95 |         const std::u16string str = u"" TEST_STRING;
 96 |         const std::u16string test = tcb::utf_ranges::view::consume_bom(str);
 97 |         REQUIRE(test == u"" TEST_STRING);
 98 |     }
 99 | 
100 |     SECTION("...in UTF-32") {
101 |         const std::u32string str = U"" TEST_STRING;
102 |         const std::u32string test = tcb::utf_ranges::view::consume_bom(str);
103 |         REQUIRE(test == U"" TEST_STRING);
104 |     }
105 | }
106 | 
107 | TEST_CASE("Native endian InputRanges with byte order marks are stripped correctly", "[bom]")
108 | { // Reads through an istream_range, presumably to exercise consume_bom's InputRange-only overload.
109 |     SECTION("...in UTF-8") {
110 |         std::stringstream ss;
111 |         ss << u8"\uFEFF" TEST_STRING;
112 |         const std::string test = tcb::utf_ranges::view::consume_bom(
113 |                 ranges::istream_range<char>(ss));
114 |         REQUIRE(test == u8"" TEST_STRING);
115 |     }
116 | }
117 | 
118 | TEST_CASE("InputRanges without byte order marks are unchanged", "[bom]")
119 | { // Stream input with no BOM: every element, including the ones read while probing, must be returned.
120 |     SECTION("...in UTF-8") {
121 |         std::stringstream ss;
122 |         ss << u8"" TEST_STRING;
123 |         const std::string test = tcb::utf_ranges::view::consume_bom(
124 |                 ranges::istream_range<char>(ss));
125 |         REQUIRE(test == u8"" TEST_STRING);
126 |     }
127 | }
128 | 
129 | TEST_CASE("Byte order marks are correctly used", "[bom]")
130 | { // Input is BOM + text pushed through to_big_endian / to_little_endian; the result must be the plain native text.
131 |     SECTION("...for \"UTF-8BE\"") {
132 |         const auto str = to_big_endian(std::string(u8"\uFEFF" TEST_STRING));
133 |         const std::string test = tcb::utf_ranges::view::consume_bom(str);
134 |         REQUIRE(test == u8"" TEST_STRING);
135 |     }
136 | 
137 |     SECTION("...for wide UTF-16BE") {
138 |         const auto str = to_big_endian(std::u16string(u"\uFEFF" TEST_STRING));
139 |         const std::u16string test = tcb::utf_ranges::view::consume_bom(str);
140 |         REQUIRE(test == u"" TEST_STRING);
141 |     }
142 | 
143 |     SECTION("...for wide UTF-32BE") {
144 |         const auto str = to_big_endian(std::u32string(U"\uFEFF" TEST_STRING));
145 |         const std::u32string test = tcb::utf_ranges::view::consume_bom(str);
146 |         REQUIRE(test == U"" TEST_STRING);
147 |     }
148 | 
149 |     SECTION("...with big-endian wide strings") {
150 |         const auto str = to_big_endian(std::wstring(L"\uFEFF" TEST_STRING));
151 |         const std::wstring test = tcb::utf_ranges::view::consume_bom(str);
152 |         REQUIRE(test == L"" TEST_STRING);
153 |     }
154 |     // The same four encodings again, this time with little-endian input.
155 |     SECTION("...for \"UTF-8LE\"") {
156 |         const auto str = to_little_endian(std::string(u8"\uFEFF" TEST_STRING));
157 |         const std::string test = tcb::utf_ranges::view::consume_bom(str);
158 |         REQUIRE(test == u8"" TEST_STRING);
159 |     }
160 | 
161 |     SECTION("...for wide UTF-16LE") {
162 |         const auto str = to_little_endian(std::u16string(u"\uFEFF" TEST_STRING));
163 |         const std::u16string test = tcb::utf_ranges::view::consume_bom(str);
164 |         REQUIRE(test == u"" TEST_STRING);
165 |     }
166 | 
167 |     SECTION("...for wide UTF-32LE") {
168 |         const auto str = to_little_endian(std::u32string(U"\uFEFF" TEST_STRING));
169 |         const std::u32string test = tcb::utf_ranges::view::consume_bom(str);
170 |         REQUIRE(test == U"" TEST_STRING);
171 |     }
172 | 
173 |     SECTION("...with little-endian wide strings") {
174 |         const auto str = to_little_endian(std::wstring(L"\uFEFF" TEST_STRING));
175 |         const std::wstring test = tcb::utf_ranges::view::consume_bom(str);
176 |         REQUIRE(test == L"" TEST_STRING);
177 |     }
178 | }


--------------------------------------------------------------------------------
/test/bytes_test.cpp:
--------------------------------------------------------------------------------
 1 | 
 2 | // Copyright (c) 2016 Tristan Brindle (tcbrindle at gmail dot com)
 3 | // Distributed under the Boost Software License, Version 1.0. (See accompanying
 4 | // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
 5 | 
 6 | #include "catch.hpp"
 7 | 
 8 | #include <tcb/utf_ranges/view/bytes.hpp>
 9 | #include <range/v3/algorithm/equal.hpp>
10 | 
11 | #include <codecvt>
12 | 
13 | #define TEST_STRING "$€0123456789你好abcdefghijklmnopqrstyvwxyz\U0001F60E"
14 | 
15 | TEST_CASE("Bytes view works for UTF-8", "[bytes]")
16 | { // With char elements the byte view must reproduce the string unchanged.
17 |     std::string str = u8"" TEST_STRING;
18 |     std::string test = str | tcb::utf_ranges::view::bytes;
19 | 
20 |     REQUIRE(str == test);
21 | }
22 | 
23 | TEST_CASE("Bytes view works for UTF-16", "[bytes]")
24 | {
25 |     // std::codecvt_utf16<char32_t> is defined to work with UTF-16-encoded
26 |     // byte strings, which is usually massively inconvenient but actually
27 |     // does exactly what we want in this one particular instance
28 | 
29 |     using codecvt = std::codecvt_utf16<char32_t, 0x10ffff,
30 |                                        std::codecvt_mode::little_endian>;
31 |     // NOTE(review): the reference bytes are little-endian, so this presumably only passes on little-endian hosts -- confirm.
32 |     std::u16string u16 = u"" TEST_STRING;
33 |     std::u32string u32 = U"" TEST_STRING;
34 | 
35 |     std::string test = u16 | tcb::utf_ranges::view::bytes;
36 |     // Reference: the same text serialised to UTF-16 bytes by the standard library.
37 |     std::string u16bytes = std::wstring_convert<codecvt, char32_t>{}.to_bytes(u32);
38 | 
39 |     REQUIRE(test == u16bytes);
40 | 
41 |     //REQUIRE(ranges::equal(test, u16bytes));
42 | }


--------------------------------------------------------------------------------
/test/catch_main.cpp:
--------------------------------------------------------------------------------
1 | 
2 | #define CATCH_CONFIG_MAIN
3 | #include "catch.hpp"


--------------------------------------------------------------------------------
/test/convert_test.cpp:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tcbrindle/utf_ranges/ff5a0b1a4e7a9a2f6c4ec661989a52080f9396ab/test/convert_test.cpp


--------------------------------------------------------------------------------
/test/endian_test.cpp:
--------------------------------------------------------------------------------
  1 | 
  2 | // Copyright (c) 2016 Tristan Brindle (tcbrindle at gmail dot com)
  3 | // Distributed under the Boost Software License, Version 1.0. (See accompanying
  4 | // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
  5 | 
  6 | #include "catch.hpp"
  7 | 
  8 | #include <tcb/utf_ranges/view/endian_convert.hpp>
  9 | 
 10 | const auto to_little_endian = [] (const auto& in) { // copy of `in`, each element native -> little endian
 11 |     using unit_type = ranges::range_value_t<decltype(in)>;
 12 |     std::basic_string<unit_type> converted;
 13 |     for (const auto unit : in) {
 14 |         const auto wrapped = tcb::utf_ranges::detail::make_swap_wrapper(unit);
 15 |         converted.push_back(boost::endian::native_to_little(wrapped).value);
 16 |     }
 17 |     return converted;
 18 | };
 19 | 
 20 | const auto to_big_endian = [] (const auto& in) { // copy of `in`, each element native -> big endian
 21 |     using unit_type = ranges::range_value_t<decltype(in)>;
 22 |     std::basic_string<unit_type> converted;
 23 |     for (const auto unit : in) {
 24 |         const auto wrapped = tcb::utf_ranges::detail::make_swap_wrapper(unit);
 25 |         converted.push_back(boost::endian::native_to_big(wrapped).value);
 26 |     }
 27 |     return converted;
 28 | };
 29 | 
 30 | 
 31 | #define TEST_STRING "123456"
 32 | // No semicolon in the macro: each use below supplies its own, so none is doubled.
 33 | // Native endian test strings
 34 | const std::string test_string8n = u8"" TEST_STRING;
 35 | const std::u16string test_string16n = u"" TEST_STRING;
 36 | const std::u32string test_string32n = U"" TEST_STRING;
 37 | const std::wstring test_stringwn = L"" TEST_STRING;
 38 | 
 39 | // Little endian test strings
 40 | const std::string test_string8l = test_string8n;
 41 | const std::u16string test_string16l = to_little_endian(test_string16n);
 42 | const std::u32string test_string32l = to_little_endian(test_string32n);
 43 | const std::wstring test_stringwl = to_little_endian(test_stringwn);
 44 | 
 45 | // Big endian test strings
 46 | const std::string test_string8b = test_string8n;
 47 | const std::u16string test_string16b = to_big_endian(test_string16n);
 48 | const std::u32string test_string32b = to_big_endian(test_string32n);
 49 | const std::wstring test_stringwb = to_big_endian(test_stringwn);
 50 | 
 51 | using tcb::utf_ranges::view::endian_convert;
 52 | using namespace boost::endian;
 53 | 
 54 | TEST_CASE("Byte swap native-to-native works", "[endian]")
 55 | { // No source order is passed, so the call relies on the default; output must equal the input.
 56 |     SECTION("...for UTF-8") {
 57 |         const std::string test = endian_convert<order::native>(test_string8n);
 58 |         REQUIRE(test == test_string8n);
 59 |     }
 60 | 
 61 |     SECTION("...for UTF-16") {
 62 |         const std::u16string test = endian_convert<order::native>(test_string16n);
 63 |         REQUIRE(test == test_string16n);
 64 |     }
 65 | 
 66 |     SECTION("...for UTF-32") {
 67 |         const std::u32string test = endian_convert<order::native>(test_string32n);
 68 |         REQUIRE(test == test_string32n);
 69 |     }
 70 | 
 71 |     SECTION("... for wide chars") {
 72 |         const std::wstring test = endian_convert<order::native>(test_stringwn);
 73 |         REQUIRE(test == test_stringwn);
 74 |     }
 75 | }
 76 | 
 77 | TEST_CASE("Byte swap native-to-little works", "[endian]")
 78 | { // Compare against the little-endian fixtures so the check also holds on big-endian hosts.
 79 |     SECTION("...for UTF-8") {
 80 |         const std::string test = endian_convert<order::little>(test_string8n);
 81 |         REQUIRE(test == test_string8l);
 82 |     }
 83 | 
 84 |     SECTION("...for UTF-16") {
 85 |         const std::u16string test = endian_convert<order::little>(test_string16n);
 86 |         REQUIRE(test == test_string16l);
 87 |     }
 88 | 
 89 |     SECTION("...for UTF-32") {
 90 |         const std::u32string test = endian_convert<order::little>(test_string32n);
 91 |         REQUIRE(test == test_string32l);
 92 |     }
 93 | 
 94 |     SECTION("... for wide chars") {
 95 |         const std::wstring test = endian_convert<order::little>(test_stringwn);
 96 |         REQUIRE(test == test_stringwl);
 97 |     }
 98 | }
 99 | // Converting the native strings to big endian should reproduce the big-endian fixtures.
100 | TEST_CASE("Byte swap native-to-big works", "[endian]")
101 | {
102 |     SECTION("...for UTF-8") {
103 |         std::string const converted = endian_convert<order::big>(test_string8n);
104 |         REQUIRE(converted == test_string8b);
105 |     }
106 | 
107 |     SECTION("...for UTF-16") {
108 |         std::u16string const converted = endian_convert<order::big>(test_string16n);
109 |         REQUIRE(converted == test_string16b);
110 |     }
111 | 
112 |     SECTION("...for UTF-32") {
113 |         std::u32string const converted = endian_convert<order::big>(test_string32n);
114 |         REQUIRE(converted == test_string32b);
115 |     }
116 | 
117 |     SECTION("... for wide chars") {
118 |         std::wstring const converted = endian_convert<order::big>(test_stringwn);
119 |         REQUIRE(converted == test_stringwb);
120 |     }
121 | }
122 | // Little-endian fixtures passed with order::little should convert back to the native strings.
123 | TEST_CASE("Byte swap little-to-native works", "[endian]")
124 | {
125 |     SECTION("...for UTF-8") {
126 |         std::string const converted =
127 |             endian_convert<order::native>(test_string8l, order::little);
128 |         REQUIRE(converted == test_string8n);
129 |     }
130 | 
131 |     SECTION("...for UTF-16") {
132 |         std::u16string const converted =
133 |             endian_convert<order::native>(test_string16l, order::little);
134 |         REQUIRE(converted == test_string16n);
135 |     }
136 | 
137 |     SECTION("...for UTF-32") {
138 |         std::u32string const converted =
139 |             endian_convert<order::native>(test_string32l, order::little);
140 |         REQUIRE(converted == test_string32n);
141 |     }
142 | 
143 |     SECTION("... for wide chars") {
144 |         std::wstring const converted =
145 |             endian_convert<order::native>(test_stringwl, order::little);
146 |         REQUIRE(converted == test_stringwn);
147 |     }
148 | }
149 | // Little-endian fixtures converted to little endian should come back unchanged.
150 | TEST_CASE("Byte swap little-to-little works", "[endian]")
151 | {
152 |     SECTION("...for UTF-8") {
153 |         std::string const converted =
154 |             endian_convert<order::little>(test_string8l, order::little);
155 |         REQUIRE(converted == test_string8l);
156 |     }
157 | 
158 |     SECTION("...for UTF-16") {
159 |         std::u16string const converted =
160 |             endian_convert<order::little>(test_string16l, order::little);
161 |         REQUIRE(converted == test_string16l);
162 |     }
163 | 
164 |     SECTION("...for UTF-32") {
165 |         std::u32string const converted =
166 |             endian_convert<order::little>(test_string32l, order::little);
167 |         REQUIRE(converted == test_string32l);
168 |     }
169 | 
170 |     SECTION("... for wide chars") {
171 |         std::wstring const converted =
172 |             endian_convert<order::little>(test_stringwl, order::little);
173 |         REQUIRE(converted == test_stringwl);
174 |     }
175 | }
176 | // Little-endian fixtures converted to big endian should equal the big-endian fixtures.
177 | TEST_CASE("Byte swap little-to-big works", "[endian]")
178 | {
179 |     SECTION("...for UTF-8") {
180 |         std::string const converted =
181 |             endian_convert<order::big>(test_string8l, order::little);
182 |         REQUIRE(converted == test_string8b);
183 |     }
184 | 
185 |     SECTION("...for UTF-16") {
186 |         std::u16string const converted =
187 |             endian_convert<order::big>(test_string16l, order::little);
188 |         REQUIRE(converted == test_string16b);
189 |     }
190 | 
191 |     SECTION("...for UTF-32") {
192 |         std::u32string const converted =
193 |             endian_convert<order::big>(test_string32l, order::little);
194 |         REQUIRE(converted == test_string32b);
195 |     }
196 | 
197 |     SECTION("... for wide chars") {
198 |         std::wstring const converted =
199 |             endian_convert<order::big>(test_stringwl, order::little);
200 |         REQUIRE(converted == test_stringwb);
201 |     }
202 | }
203 | // Big-endian fixtures passed with order::big should convert back to the native strings.
204 | TEST_CASE("Byte swap big-to-native works", "[endian]")
205 | {
206 |     SECTION("...for UTF-8") {
207 |         std::string const converted =
208 |             endian_convert<order::native>(test_string8b, order::big);
209 |         REQUIRE(converted == test_string8n);
210 |     }
211 | 
212 |     SECTION("...for UTF-16") {
213 |         std::u16string const converted =
214 |             endian_convert<order::native>(test_string16b, order::big);
215 |         REQUIRE(converted == test_string16n);
216 |     }
217 | 
218 |     SECTION("...for UTF-32") {
219 |         std::u32string const converted =
220 |             endian_convert<order::native>(test_string32b, order::big);
221 |         REQUIRE(converted == test_string32n);
222 |     }
223 | 
224 |     SECTION("... for wide chars") {
225 |         std::wstring const converted =
226 |             endian_convert<order::native>(test_stringwb, order::big);
227 |         REQUIRE(converted == test_stringwn);
228 |     }
229 | }
230 | // Big-endian fixtures converted to little endian should equal the little-endian fixtures.
231 | TEST_CASE("Byte swap big-to-little works", "[endian]")
232 | {
233 |     SECTION("...for UTF-8") {
234 |         std::string const converted =
235 |             endian_convert<order::little>(test_string8b, order::big);
236 |         REQUIRE(converted == test_string8l);
237 |     }
238 | 
239 |     SECTION("...for UTF-16") {
240 |         std::u16string const converted =
241 |             endian_convert<order::little>(test_string16b, order::big);
242 |         REQUIRE(converted == test_string16l);
243 |     }
244 | 
245 |     SECTION("...for UTF-32") {
246 |         std::u32string const converted =
247 |             endian_convert<order::little>(test_string32b, order::big);
248 |         REQUIRE(converted == test_string32l);
249 |     }
250 | 
251 |     SECTION("... for wide chars") {
252 |         std::wstring const converted =
253 |             endian_convert<order::little>(test_stringwb, order::big);
254 |         REQUIRE(converted == test_stringwl);
255 |     }
256 | }
257 | // Big-endian fixtures converted to big endian should come back unchanged.
258 | TEST_CASE("Byte swap big-to-big works", "[endian]")
259 | {
260 |     SECTION("...for UTF-8") {
261 |         std::string const converted =
262 |             endian_convert<order::big>(test_string8b, order::big);
263 |         REQUIRE(converted == test_string8b);
264 |     }
265 | 
266 |     SECTION("...for UTF-16") {
267 |         std::u16string const converted =
268 |             endian_convert<order::big>(test_string16b, order::big);
269 |         REQUIRE(converted == test_string16b);
270 |     }
271 | 
272 |     SECTION("...for UTF-32") {
273 |         std::u32string const converted =
274 |             endian_convert<order::big>(test_string32b, order::big);
275 |         REQUIRE(converted == test_string32b);
276 |     }
277 | 
278 |     SECTION("... for wide chars") {
279 |         std::wstring const converted =
280 |             endian_convert<order::big>(test_stringwb, order::big);
281 |         REQUIRE(converted == test_stringwb);
282 |     }
283 | }


--------------------------------------------------------------------------------
/test/istreambuf_range_test.cpp:
--------------------------------------------------------------------------------
 1 | 
 2 | // Copyright (c) 2016 Tristan Brindle (tcbrindle at gmail dot com)
 3 | // Distributed under the Boost Software License, Version 1.0. (See accompanying
 4 | // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
 5 | 
 6 | #include "catch.hpp"
 7 | 
 8 | #include <tcb/utf_ranges/istreambuf_range.hpp>
 9 | #include <range/v3/algorithm/equal.hpp>
10 | 
11 | #include <sstream>
12 | 
13 | #define TEST_STRING "$€0123456789你好abcdefghijklmnopqrstyvwxyz\U0001F60E"
14 | // istreambuf() over a narrow stream should yield the characters the stream was built from.
15 | TEST_CASE("Basic istreambuf_range test", "[istreambuf_range]")
16 | {
17 |     std::istringstream input{TEST_STRING};
18 | 
19 |     auto chars = tcb::utf_ranges::istreambuf(input);
20 | 
21 |     // The stream drops the literal's trailing NUL, so compare against a
22 |     // std::string, which drops it too (a string_view would work as well)
23 |     REQUIRE(ranges::equal(chars, std::string(TEST_STRING)));
24 | }
25 | 
26 | // istreambuf() over a char16_t stream should yield the code units the stream was built from.
27 | TEST_CASE("Basic istreambuf_range<char16_t> test", "[istreambuf_range]")
28 | {
29 |     std::basic_istringstream<char16_t> input{u"" TEST_STRING};
30 | 
31 |     auto code_units = tcb::utf_ranges::istreambuf(input);
32 | 
33 |     REQUIRE(ranges::equal(code_units, std::u16string{u"" TEST_STRING}));
34 | }
35 | // istreambuf() over a char32_t stream should yield the code units the stream was built from.
36 | TEST_CASE("Basic istreambuf_range<char32_t> test", "[istreambuf_range]")
37 | {
38 |     std::basic_istringstream<char32_t> input{U"" TEST_STRING};
39 | 
40 |     auto code_units = tcb::utf_ranges::istreambuf(input);
41 | 
42 |     REQUIRE(ranges::equal(code_units, std::u32string{U"" TEST_STRING}));
43 | }
44 | 


--------------------------------------------------------------------------------
/test/line_end_transform_test.cpp:
--------------------------------------------------------------------------------
 1 | 
 2 | // Copyright (c) 2016 Tristan Brindle (tcbrindle at gmail dot com)
 3 | // Distributed under the Boost Software License, Version 1.0. (See accompanying
 4 | // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
 5 | 
 6 | #include "catch.hpp"
 7 | 
 8 | #include <tcb/utf_ranges/view.hpp>
 9 | 
10 | #include <range/v3/algorithm/count_if.hpp>
11 | 
12 | #define TEST_STRING "\n \r \r\n \u0085 \u000b \u000c \u2028 \u2029"
13 | // The transformed view must contain exactly eight '\n', one per line ending in TEST_STRING.
14 | TEST_CASE("Line end transformations work as expected", "[line_end]")
15 | {
16 |     std::string const input = u8"" TEST_STRING;
17 | 
18 |     auto transformed = tcb::utf_ranges::view::line_end_transform(input);
19 | 
20 |     static_assert(ranges::ForwardRange<decltype(transformed)>(), "");
21 |     auto const is_newline = [](char ch) { return ch == '\n'; };
22 |     REQUIRE(ranges::count_if(transformed, is_newline) == 8);
23 | }


--------------------------------------------------------------------------------
/test/ostreambuf_iterator_test.cpp:
--------------------------------------------------------------------------------
 1 | 
 2 | // Copyright (c) 2016 Tristan Brindle (tcbrindle at gmail dot com)
 3 | // Distributed under the Boost Software License, Version 1.0. (See accompanying
 4 | // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
 5 | 
 6 | #include "catch.hpp"
 7 | 
 8 | #include <tcb/utf_ranges/ostreambuf_iterator.hpp>
 9 | 
10 | #include <sstream>
11 | #include <range/v3/algorithm/copy.hpp>
12 | 
13 | std::string const test_str = u8"$€0123456789你好abcdefghijklmnopqrstyvwxyz\U0001F60E";
14 | // Copying test_str through ostreambuf_iterator<char> should leave the same text in the stream.
15 | TEST_CASE("ostreambuf_iterator works as expected", "[ostreambuf_iterator]")
16 | {
17 |     std::ostringstream sink;
18 | 
19 |     ranges::copy(test_str, tcb::utf_ranges::ostreambuf_iterator<char>(sink));
20 | 
21 |     REQUIRE(sink.str() == test_str);
22 | }


--------------------------------------------------------------------------------
/test/utf_convert_view_test.cpp:
--------------------------------------------------------------------------------
  1 | 
  2 | // Copyright (c) 2016 Tristan Brindle (tcbrindle at gmail dot com)
  3 | // Distributed under the Boost Software License, Version 1.0. (See accompanying
  4 | // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
  5 | 
  6 | #include "catch.hpp"
  7 | 
  8 | #include <tcb/utf_ranges/view.hpp>
  9 | #include <range/v3/algorithm/equal.hpp>
 10 | 
 11 | #if __has_include(<experimental/string_view>)
 12 | #include <experimental/string_view>
 13 | using std::experimental::string_view;
 14 | using std::experimental::u16string_view;
 15 | using std::experimental::u32string_view;
 16 | using std::experimental::wstring_view;
 17 | #else
 18 | #include <boost/utility/string_view.hpp>
 19 | using boost::string_view;
 20 | using u16string_view = boost::basic_string_view<char16_t>;
 21 | using u32string_view = boost::basic_string_view<char32_t>;
 22 | using boost::wstring_view;
 23 | #endif
 24 | 
 25 | using namespace tcb::utf_ranges;
 26 | 
 27 | #define TEST_STRING "$€0123456789你好abcdefghijklmnopqrstyvwxyz\U0001F60E"
 28 | 
 29 | /*
 30 |  * Default construction
 31 |  */
 32 | // A value-initialised utf_convert_view must model ForwardRange and report itself empty.
 33 | TEST_CASE("utf_convert_view can be default constructed", "[view]")
 34 | {
 35 |     constexpr char source[] = u8"" TEST_STRING;
 36 |     auto const converted = tcb::utf_ranges::utf_convert_view<
 37 |             rng::view::all_t<decltype(source)>, char, char32_t>{};
 38 |     static_assert(rng::ForwardRange<decltype(converted)>(), "");
 39 |     REQUIRE(converted.empty());
 40 | }
 41 | 
 42 | /*
 43 |  * Handling empty ranges
 44 |  */
 45 | // An empty char string_view viewed as UTF-8 must itself be empty.
 46 | TEST_CASE("UTF-8 -> UTF-8 view handles empty ranges", "[view]")
 47 | {
 48 |     constexpr string_view empty_input{};
 49 |     auto const converted = view::utf8(empty_input);
 50 |     REQUIRE(converted.empty());
 51 | }
 52 | // An empty char string_view viewed as UTF-16 must itself be empty.
 53 | TEST_CASE("UTF-8 -> UTF-16 view handles empty ranges", "[view]")
 54 | {
 55 |     constexpr string_view empty_input{};
 56 |     auto const converted = view::utf16(empty_input);
 57 |     REQUIRE(converted.empty());
 58 | }
 59 | 
 60 | // An empty char string_view viewed as UTF-32 must itself be empty.
 61 | TEST_CASE("UTF-8 -> UTF-32 view handles empty ranges", "[view]")
 62 | {
 63 |     constexpr string_view empty_input{};
 64 |     auto const converted = view::utf32(empty_input);
 65 |     REQUIRE(converted.empty());
 66 | }
 67 | // An empty char16_t string_view viewed as UTF-8 must itself be empty.
 68 | TEST_CASE("UTF-16 -> UTF-8 view handles empty ranges", "[view]")
 69 | {
 70 |     constexpr u16string_view empty_input{};
 71 |     auto const converted = view::utf8(empty_input);
 72 |     REQUIRE(converted.empty());
 73 | }
 74 | // An empty char16_t string_view viewed as UTF-16 must itself be empty.
 75 | TEST_CASE("UTF-16 -> UTF-16 view handles empty ranges", "[view]")
 76 | {
 77 |     constexpr u16string_view empty_input{};
 78 |     auto const converted = view::utf16(empty_input);
 79 |     REQUIRE(converted.empty());
 80 | }
 81 | 
 82 | // An empty char16_t string_view viewed as UTF-32 must itself be empty.
 83 | TEST_CASE("UTF-16 -> UTF-32 view handles empty ranges", "[view]")
 84 | {
 85 |     constexpr u16string_view empty_input{};
 86 |     auto const converted = view::utf32(empty_input);
 87 |     REQUIRE(converted.empty());
 88 | }
 89 | // An empty char32_t string_view viewed as UTF-8 must itself be empty.
 90 | TEST_CASE("UTF-32 -> UTF-8 view handles empty ranges", "[view]")
 91 | {
 92 |     constexpr u32string_view empty_input{};
 93 |     auto const converted = view::utf8(empty_input);
 94 |     REQUIRE(converted.empty());
 95 | }
 96 | // An empty char32_t string_view viewed as UTF-16 must itself be empty.
 97 | TEST_CASE("UTF-32 -> UTF-16 view handles empty ranges", "[view]")
 98 | {
 99 |     constexpr u32string_view empty_input{};
100 |     auto const converted = view::utf16(empty_input);
101 |     REQUIRE(converted.empty());
102 | }
103 | // An empty char32_t string_view viewed as UTF-32 must itself be empty.
104 | TEST_CASE("UTF-32 -> UTF-32 view handles empty ranges", "[view]")
105 | {
106 |     constexpr u32string_view empty_input{};
107 |     auto const converted = view::utf32(empty_input);
108 |     REQUIRE(converted.empty());
109 | }
110 | // An empty wchar_t string_view viewed as UTF-8 must itself be empty.
111 | TEST_CASE("wchar -> UTF-8 view handles empty ranges", "[view]")
112 | {
113 |     constexpr wstring_view empty_input{};
114 |     auto const converted = view::utf8(empty_input);
115 |     REQUIRE(converted.empty());
116 | }
117 | // An empty wchar_t string_view viewed as UTF-16 must itself be empty.
118 | TEST_CASE("wchar -> UTF-16 view handles empty ranges", "[view]")
119 | {
120 |     constexpr wstring_view empty_input{};
121 |     auto const converted = view::utf16(empty_input);
122 |     REQUIRE(converted.empty());
123 | }
124 | // An empty wchar_t string_view viewed as UTF-32 must itself be empty.
125 | TEST_CASE("wchar -> UTF-32 view handles empty ranges", "[view]")
126 | {
127 |     constexpr wstring_view empty_input{};
128 |     auto const converted = view::utf32(empty_input);
129 |     REQUIRE(converted.empty());
130 | }
131 | 
132 | /*
133 |  * Valid (compiler-generated) UTF
134 |  */
135 | // Viewing the u8 literal as UTF-8 should reproduce it element for element.
136 | TEST_CASE("UTF-8 -> UTF-8 view handles valid UTF correctly", "[view]")
137 | {
138 |     constexpr char source[] = u8"" TEST_STRING;
139 |     auto const converted = view::utf8(source);
140 |     REQUIRE(rng::equal(source, converted));
141 | }
142 | // Viewing the u8 literal as UTF-16 should match the compiler's u"" encoding of the same text.
143 | TEST_CASE("UTF-8 -> UTF-16 view handles valid UTF correctly", "[view]")
144 | {
145 |     constexpr char source[] = u8"" TEST_STRING;
146 |     constexpr char16_t expected[] = u"" TEST_STRING;
147 |     auto const converted = view::utf16(source);
148 |     REQUIRE(rng::equal(expected, converted));
149 | }
150 | // Viewing the u8 literal as UTF-32 should match the compiler's U"" encoding of the same text.
151 | TEST_CASE("UTF-8 -> UTF-32 view handles valid UTF correctly", "[view]")
152 | {
153 |     constexpr char source[] = u8"" TEST_STRING;
154 |     constexpr char32_t expected[] = U"" TEST_STRING;
155 |     auto const converted = view::utf32(source);
156 |     REQUIRE(rng::equal(expected, converted));
157 | }
158 | // Viewing the u"" literal as UTF-8 should match the compiler's u8"" encoding of the same text.
159 | TEST_CASE("UTF-16 -> UTF-8 view handles valid UTF correctly", "[view]")
160 | {
161 |     constexpr char16_t source[] = u"" TEST_STRING;
162 |     constexpr char expected[] = u8"" TEST_STRING;
163 |     auto const converted = view::utf8(source);
164 |     REQUIRE(rng::equal(expected, converted));
165 | }
166 | // Viewing the u"" literal as UTF-16 should reproduce the same u"" encoding.
167 | TEST_CASE("UTF-16 -> UTF-16 view handles valid UTF correctly", "[view]")
168 | {
169 |     constexpr char16_t source[] = u"" TEST_STRING;
170 |     constexpr char16_t expected[] = u"" TEST_STRING;
171 |     auto const converted = view::utf16(source);
172 |     REQUIRE(rng::equal(expected, converted));
173 | }
174 | // Viewing the u"" literal as UTF-32 should match the compiler's U"" encoding of the same text.
175 | TEST_CASE("UTF-16 -> UTF-32 view handles valid UTF correctly", "[view]")
176 | {
177 |     constexpr char16_t source[] = u"" TEST_STRING;
178 |     constexpr char32_t expected[] = U"" TEST_STRING;
179 |     auto const converted = view::utf32(source);
180 |     REQUIRE(rng::equal(expected, converted));
181 | }
182 | // Viewing the U"" literal as UTF-8 should match the compiler's u8"" encoding of the same text.
183 | TEST_CASE("UTF-32 -> UTF-8 view handles valid UTF correctly", "[view]")
184 | {
185 |     constexpr char32_t source[] = U"" TEST_STRING;
186 |     constexpr char expected[] = u8"" TEST_STRING;
187 |     auto const converted = view::utf8(source);
188 |     REQUIRE(rng::equal(expected, converted));
189 | }
190 | // Viewing the U"" literal as UTF-16 should match the compiler's u"" encoding of the same text.
191 | TEST_CASE("UTF-32 -> UTF-16 view handles valid UTF correctly", "[view]")
192 | {
193 |     constexpr char32_t source[] = U"" TEST_STRING;
194 |     constexpr char16_t expected[] = u"" TEST_STRING;
195 |     auto const converted = view::utf16(source);
196 |     REQUIRE(rng::equal(expected, converted));
197 | }
198 | // Viewing the U"" literal as UTF-32 should reproduce the same U"" encoding.
199 | TEST_CASE("UTF-32 -> UTF-32 view handles valid UTF correctly", "[view]")
200 | {
201 |     constexpr char32_t source[] = U"" TEST_STRING;
202 |     constexpr char32_t expected[] = U"" TEST_STRING;
203 |     auto const converted = view::utf32(source);
204 |     REQUIRE(rng::equal(expected, converted));
205 | }
206 | // Viewing the L"" literal as UTF-8 should match the compiler's u8"" encoding of the same text.
207 | TEST_CASE("wchar -> UTF-8 view handles valid UTF correctly", "[view]")
208 | {
209 |     constexpr wchar_t source[] = L"" TEST_STRING;
210 |     constexpr char expected[] = u8"" TEST_STRING;
211 |     auto const converted = view::utf8(source);
212 |     REQUIRE(rng::equal(expected, converted));
213 | }
214 | // Viewing the L"" literal as UTF-16 should match the compiler's u"" encoding of the same text.
215 | TEST_CASE("wchar -> UTF-16 view handles valid UTF correctly", "[view]")
216 | {
217 |     constexpr wchar_t source[] = L"" TEST_STRING;
218 |     constexpr char16_t expected[] = u"" TEST_STRING;
219 |     auto const converted = view::utf16(source);
220 |     REQUIRE(rng::equal(expected, converted));
221 | }
222 | // Viewing the L"" literal as UTF-32 should match the compiler's U"" encoding of the same text.
223 | TEST_CASE("wchar -> UTF-32 view handles valid UTF correctly", "[view]")
224 | {
225 |     constexpr wchar_t source[] = L"" TEST_STRING;
226 |     constexpr char32_t expected[] = U"" TEST_STRING;
227 |     auto const converted = view::utf32(source);
228 |     REQUIRE(rng::equal(expected, converted));
229 | }
230 | // utf_convert<uint16_t> collected into a vector of integers should match the u"" literal.
231 | TEST_CASE("Non-character conversions work as expected", "[view]")
232 | {
233 |     constexpr char source[] = u8"" TEST_STRING;
234 |     constexpr char16_t expected[] = u"" TEST_STRING;
235 |     std::vector<uint16_t> code_units = view::utf_convert<uint16_t>(source);
236 |     REQUIRE(rng::equal(expected, code_units));
237 | }
238 | 
239 | 


--------------------------------------------------------------------------------