├── .gitignore
├── .gitlab-ci.yml
├── .travis.yml
├── LICENSE
├── Makefile
├── README.md
├── src
    ├── utf8_iterator.cpp
    ├── utf8_iterator.hpp
    ├── utf8_string.cpp
    └── utf8_string.hpp
└── test
    ├── lipsum.txt
    └── main.cpp


/.gitignore:
--------------------------------------------------------------------------------
 1 | # Compiled Object files
 2 | *.slo
 3 | *.lo
 4 | *.o
 5 | *.obj
 6 | 
 7 | # Precompiled Headers
 8 | *.gch
 9 | *.pch
10 | 
11 | # Compiled Dynamic libraries
12 | *.so
13 | *.dylib
14 | *.dll
15 | 
16 | # Fortran module files
17 | *.mod
18 | 
19 | # Compiled Static libraries
20 | *.lai
21 | *.la
22 | *.a
23 | *.lib
24 | 
25 | # Executables
26 | *.exe
27 | *.out
28 | *.app
29 | 
30 | # Codeblocks config files
31 | *.layout
32 | *.cbp
33 | *.depend
34 | 
35 | # Others
36 | Doxy*
37 | html/
38 | *.cppcheck
39 | 


--------------------------------------------------------------------------------
/.gitlab-ci.yml:
--------------------------------------------------------------------------------
1 | 
2 | buildgcc:
3 | 
4 |    script:
5 |      - make CC="g++"
6 |      - ./utf8test


--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
 1 | language: cpp
 2 | 
 3 | compiler:
 4 |   - clang
 5 |   - g++
 6 | 
 7 | 
 8 | before_install:
 9 |   - if [ "$CC" == "g++" ]; then sudo add-apt-repository -y ppa:ubuntu-toolchain-r/test; fi
10 |   - sudo apt-get update -qq
11 | 
12 | install:
13 |   - if [ "$CC" == "g++" ]; then sudo apt-get install -qq g++-4.9; fi
14 |   - if [ "$CC" == "g++" ]; then sudo update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-4.9 50; fi
15 |   - if [ "$CC" == "g++" ]; then export CC="g++-4.9"; fi
16 | 
17 | script:
18 |   - make
19 |   - ./utf8test
20 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Gumichan01/utf8_string/4e677cd3d7986dc1406f3b50e64ecaec68dd6b88/LICENSE


--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
 1 | # Makefile of UTFstring
 2 | 
 3 | CC=g++
 4 | CFLAGS=-Wall -Wextra -g -Weffc++ -Wsign-conversion -Wconversion -std=c++11
 5 | LFLAGS=
 6 | 
 7 | SRC=./src/
 8 | TEST_SRC=./test/
 9 | TEST_MAIN=$(TEST_SRC)main.cpp
10 | TEST_EXE=utf8test
11 | 
12 | UTF8_HEADER=$(SRC)utf8_string.hpp
13 | UTF8_SRC=$(SRC)utf8_string.cpp
14 | UTF8_ITER_HEADER=$(SRC)utf8_iterator.hpp
15 | UTF8_ITER_SRC=$(SRC)utf8_iterator.cpp
16 | 
17 | UTF8_OBJ=utf8_string.o
18 | UTF8_ITER_OBJ=utf8_iterator.o
19 | TEST_OBJ=main.o
20 | OBJS=$(UTF8_OBJ) $(TEST_OBJ) $(UTF8_ITER_OBJ)
21 | 
22 | all: test
23 | 
24 | test: $(TEST_EXE)
25 | 	@echo $(TEST_EXE)" generated. "
26 | 
27 | 
28 | $(TEST_EXE) : $(OBJS)
29 | 	@echo $@" - Compiling..."
30 | 	$(CC) $(CFLAGS) -o $@ $^ $(LFLAGS)
31 | 	@echo $@" - done."
32 | 
33 | $(UTF8_OBJ) : $(UTF8_SRC) $(UTF8_HEADER)
34 | 	@echo $<" -> "$@
35 | 	$(CC) -c $(CFLAGS) -o $@ $< $(LFLAGS)
36 | 	@echo $<" -> "$@" done."
37 | 
38 | $(UTF8_ITER_OBJ) : $(UTF8_ITER_SRC) $(UTF8_ITER_HEADER)
39 | 	@echo $<" -> "$@
40 | 	$(CC) -c $(CFLAGS) -o $@ $< $(LFLAGS)
41 | 	@echo $<" -> "$@" done."
42 | 
43 | 
44 | $(TEST_OBJ) : $(TEST_MAIN) $(UTF8_HEADER)
45 | 	@echo $<" -> "$@
46 | 	$(CC) -c $(CFLAGS) -o $@ $< $(LFLAGS)
47 | 	@echo $<" -> "$@" done."
48 | 
49 | mrproper:
50 | 	rm -f $(TEST_EXE) $(OBJS)
51 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # UTF-8 string #
 2 | 
 3 | [![Build Status](https://travis-ci.org/Gumichan01/utf8_string.svg?branch=master)](https://travis-ci.org/Gumichan01/utf8_string)
 4 | [![pipeline status](https://gitlab.com/Gumichan01/utf8_string/badges/master/pipeline.svg)](https://gitlab.com/Gumichan01/utf8_string/commits/master)
 5 | 
 6 | This is a simple implementation of UTF-8 strings in C++.
 7 | 
 8 | ## Implementation ##
 9 | 
10 | UTF8string is based on *std::string* provided by the standard C++ library
11 | but has been implemented to support UTF-8 encoded strings.
12 | 
13 | Some functions have been adapted for utf8 strings :
14 |  - utf8_length : get number of characters in a string (number of codepoints).
 15 |  - utf8_size   : get the memory size of the string (in bytes).
16 |  - utf8_find   : find a utf8 substring in the current string.
17 |  - utf8_substr : get a utf8 substring of the current string.
18 |  - utf8_at     : get the codepoint at a specified position.
19 |  - utf8_pop    : remove the last codepoint of the string.
20 | 
21 | ## Usage ##
22 | 
23 | You just need to include all of the ***.hpp*** and ***.cpp*** files from *src/*
24 | in your project. For each file that uses UTF8string, include this piece of code :
25 | 
26 |     #include "utf8_string.hpp"
27 | 
28 | ## Code example ##
29 | 
30 | ```cpp
31 | UTF8string u8("がんばつて Gumichan");
32 | UTF8string sub = u8.utf8_substr(0,5);
33 | size_t pos = u8.utf8_find(UTF8string("chan"));
34 | size_t sz  = u8.utf8_size();
35 | size_t l   = u8.utf8_length();
36 | 
37 | std::cout << "u8 string: " << u8 << "\n";
38 | std::cout << "utf8 substring from 0 to 5: " << sub << "\n";
39 | std::cout << "utf8 codepoint at 2: " << u8.utf8_at(2) << "\n";
40 | std::cout << "utf8 string \"chan\" at " << pos << "\n";
41 | std::cout << "u8 string - memory size: " << sz << "; length: " << l << "\n\n";
42 | 
43 | for (auto s: sub)    // or for (const std::string& s: u8)
44 | {
45 |     std::cout << "-> " << s << "\n";
46 | }
47 | 
48 | ```
49 | 
50 | Output :
51 | 
52 | ```
 53 | u8 string: がんばつて Gumichan
54 | utf8 substring from 0 to 5: がんばつて
55 | utf8 codepoint at 2: ば
56 | utf8 string "chan" at 10
57 | u8 string - memory size: 24; length: 14
58 | 
59 | -> が
60 | -> ん
61 | -> ば
62 | -> つ
63 | -> て
64 | 
65 | ```
66 | 
 67 | ## Projects that use UTF8string ##
68 | 
69 | - [LunatiX](https://github.com/Gumichan01/lunatix)
70 | 
71 | ## License ##
72 | 
73 | This library is under the MIT License.
74 | 


--------------------------------------------------------------------------------
/src/utf8_iterator.cpp:
--------------------------------------------------------------------------------
  1 | /*
  2 | *
  3 | *   Copyright © 2018 Luxon Jean-Pierre
  4 | *   https://gumichan01.github.io/
  5 | *
  6 | *   This library is under the MIT license
  7 | *
  8 | *   Luxon Jean-Pierre (Gumichan01)
  9 | *   luxon.jean.pierre@gmail.com
 10 | *
 11 | */
 12 | 
 13 | #include "utf8_string.hpp"
 14 | 
 15 | #include <stdexcept>
 16 | 
 17 | 
 18 | UTF8iterator::UTF8iterator( const UTF8string& u ) noexcept
 19 |     : _index( 0 ), _data( u ) {}    // starts at position 0 and stores its own copy of u
 20 | 
 21 | UTF8iterator::UTF8iterator( const UTF8iterator& it ) noexcept
 22 |     : _index( it._index ), _data( it._data ) {}    // same position, same string content
 23 | 
 24 | 
 25 | UTF8iterator& UTF8iterator::operator =( const UTF8iterator& it ) noexcept
 26 | {
 27 |     _index = it._index;    // position inside the string
 28 |     _data  = it._data;     // the string the iterator walks on
 29 |     return *this;
 30 | }
 31 | 
 32 | 
 33 | UTF8iterator& UTF8iterator::operator ++() noexcept
 34 | {
 35 |     // Saturate at the end position instead of running past it
 36 |     _index += ( _index < _data.utf8_length() ) ? 1U : 0U;
 37 | 
 38 |     return *this;
 39 | }
 40 | 
 41 | 
 42 | UTF8iterator UTF8iterator::operator ++( int ) noexcept
 43 | {
 44 |     UTF8iterator previous( *this );
 45 | 
 46 |     // Saturate at the end position instead of running past it
 47 |     _index += ( _index < _data.utf8_length() ) ? 1U : 0U;
 48 | 
 49 |     return previous;
 50 | }
 51 | 
 52 | 
 53 | UTF8iterator& UTF8iterator::operator --() noexcept
 54 | {
 55 |     // Saturate at the first position instead of wrapping around
 56 |     _index -= ( _index > 0 ) ? 1U : 0U;
 57 | 
 58 |     return *this;
 59 | }
 60 | 
 61 | 
 62 | UTF8iterator UTF8iterator::operator --( int ) noexcept
 63 | {
 64 |     UTF8iterator previous( *this );
 65 | 
 66 |     // Saturate at the first position instead of wrapping around
 67 |     _index -= ( _index > 0 ) ? 1U : 0U;
 68 | 
 69 |     return previous;
 70 | }
 71 | 
 72 | 
 73 | bool UTF8iterator::operator ==( const UTF8iterator& it ) const noexcept
 74 | {
 75 |     return ( _index == it._index ) && ( _data == it._data );    // same position on an equal string
 76 | }
 77 | 
 78 | 
 79 | bool UTF8iterator::operator !=( const UTF8iterator& it ) const noexcept
 80 | {
 81 |     return !operator ==( it );    // strict negation of the equality test
 82 | }
 83 | 
 84 | 
 85 | bool UTF8iterator::operator <( const UTF8iterator& it ) const noexcept
 86 | {
 87 |     return ( _index < it._index ) && ( _data == it._data );    // false when the strings differ
 88 | }
 89 | 
 90 | bool UTF8iterator::operator >( const UTF8iterator& it ) const noexcept
 91 | {
 92 |     return ( _index > it._index ) && ( _data == it._data );    // false when the strings differ
 93 | }
 94 | 
 95 | bool UTF8iterator::operator <=( const UTF8iterator& it ) const noexcept
 96 | {
 97 |     return ( _index <= it._index ) && ( _data == it._data );    // false when the strings differ
 98 | }
 99 | 
100 | bool UTF8iterator::operator >=( const UTF8iterator& it ) const noexcept
101 | {
102 |     return ( _index >= it._index ) && ( _data == it._data );    // false when the strings differ
103 | }
104 | 
105 | const UTF8string::u8char UTF8iterator::operator *() const
106 | {
107 |     return _data.utf8_at( _index );    // utf8_at() throws std::out_of_range at the end position
108 | }
109 | 
110 | 
111 | UTF8iterator UTF8iterator::operator +( const size_t n ) const noexcept    // copy moved n codepoints forward, clamped to the end
112 | {
113 |     UTF8iterator newit( *this );
114 |     const size_t U8LEN = newit._data.utf8_length();
115 |     // Test n against the remaining distance: "_index + n" may wrap around size_t
116 |     if ( newit._index < U8LEN && n < U8LEN - newit._index )
117 |         newit._index += n;
118 |     else
119 |         newit._index = U8LEN;
120 | 
121 |     return newit;
122 | }
123 | 
124 | 
125 | UTF8iterator UTF8iterator::operator -( const size_t n ) const noexcept
126 | {
127 |     UTF8iterator newit( *this );
128 | 
129 |     // Saturating subtraction: never move before the first codepoint
130 |     newit._index = ( n <= newit._index )
131 |                    ? ( newit._index - n )
132 |                    : 0U;
133 | 
134 |     return newit;
135 | }
136 | 
137 | long UTF8iterator::operator -( const UTF8iterator& it ) const
138 | {
139 |     if ( _data != it._data )    // a distance only makes sense on the same string content
140 |         throw std::invalid_argument( "iterators don't point to the same data" );
141 |     const long LHS = static_cast<long>( _index ), RHS = static_cast<long>( it._index );
142 |     return LHS - RHS;
143 | }
144 | 


--------------------------------------------------------------------------------
/src/utf8_iterator.hpp:
--------------------------------------------------------------------------------
  1 | /*
  2 | *
  3 | *   Copyright © 2018 Luxon Jean-Pierre
  4 | *   https://gumichan01.github.io/
  5 | *
  6 | *   This library is under the MIT license
  7 | *
  8 | *   Luxon Jean-Pierre (Gumichan01)
  9 | *   luxon.jean.pierre@gmail.com
 10 | *
 11 | */
 12 | 
 13 | #ifndef UTF8_ITERATOR_HPP_INCLUDED
 14 | #define UTF8_ITERATOR_HPP_INCLUDED
 15 | 
 16 | /**
 17 | *   @file utf8_iterator.hpp
 18 | *   @brief This is a UTF-8 string library header
 19 | */
 20 | 
 21 | class UTF8string;
 22 | 
 23 | 
 24 | /**
 25 | *   @class UTF8iterator final
 26 | *   @brief Iterator on UTF8 string
 27 | *
 28 | *   This class defines the iterator of UTF-8 string
 29 | */
 30 | class UTF8iterator final
 31 | {
 32 |     size_t _index = 0;    // current position, counted in codepoints
 33 |     UTF8string _data;     // the iterator stores its own copy of the string
 34 | 
 35 |     char& operator ->() = delete;
 36 | 
 37 | public:
 38 | 
 39 |     UTF8iterator() = delete;
 40 | 
 41 |     /**
 42 |     *   @fn explicit UTF8iterator(const UTF8string& u) noexcept
 43 |     *   Build an iterator object, positioned on the first codepoint, using a UTF8string object
 44 |     *   @param u utf-8 string
 45 |     */
 46 |     explicit UTF8iterator( const UTF8string& u ) noexcept;
 47 | 
 48 |     /**
 49 |     *   @fn UTF8iterator(const UTF8iterator& it) noexcept
 50 |     *   @param it The iterator to copy
 51 |     */
 52 |     UTF8iterator( const UTF8iterator& it ) noexcept;
 53 | 
 54 |     /**
 55 |     *   @fn UTF8iterator& operator ++() noexcept
 56 |     *   Prefix incrementation (the iterator does not move past the end position)
 57 |     *   @return The same iterator, but it has moved forward
 58 |     */
 59 |     UTF8iterator& operator ++() noexcept;
 60 | 
 61 |     /**
 62 |     *   @fn UTF8iterator operator ++(int) noexcept
 63 |     *
 64 |     *   Postfix incrementation
 65 |     *
 66 |     *   @return A copy of the iterator as it was before it moved forward
 67 |     */
 68 |     UTF8iterator operator ++( int ) noexcept;
 69 | 
 70 |     /**
 71 |     *   @fn UTF8iterator& operator --() noexcept
 72 |     *   Prefix decrementation (the iterator does not move before position 0)
 73 |     *   @return The same iterator, but it has moved backward
 74 |     */
 75 |     UTF8iterator& operator --() noexcept;
 76 | 
 77 |     /**
 78 |     *   @fn UTF8iterator operator --(int) noexcept
 79 |     *
 80 |     *   Postfix decrementation
 81 |     *
 82 |     *   @return A copy of the iterator as it was before it moved backward
 83 |     */
 84 |     UTF8iterator operator --( int ) noexcept;
 85 | 
 86 |     /**
 87 |     *   @fn UTF8iterator& operator =(const UTF8iterator& it) noexcept
 88 |     *   Assignment
 89 |     *   @param it The iterator that will be assigned
 90 |     *   @return The current iterator, now equal to the argument
 91 |     */
 92 |     UTF8iterator& operator =( const UTF8iterator& it ) noexcept;
 93 | 
 94 |     /**
 95 |     *   @fn bool operator ==(const UTF8iterator& it) const noexcept
 96 |     *
 97 |     *   Check if the current iterator is pointing to the same position as
 98 |     *   the iterator given in argument, on an equal string.
 99 |     *
100 |     *   @param it The iterator to compare with
101 |     *   @return TRUE if they are pointing to the same position, FALSE otherwise
102 |     */
103 |     bool operator ==( const UTF8iterator& it ) const noexcept;
104 |     /**
105 |     *   @fn bool operator !=(const UTF8iterator& it) const noexcept
106 |     *
107 |     *   Check if the current iterator is pointing to a different position
108 |     *   from the iterator given in argument (negation of operator ==).
109 |     *
110 |     *   @param it The iterator to compare with
111 |     *   @return TRUE if they are not pointing to the same position,
112 |     *           FALSE otherwise
113 |     */
114 |     bool operator !=( const UTF8iterator& it ) const noexcept;
115 |     /**
116 |     *   @fn bool operator <(const UTF8iterator& it) const noexcept
117 |     *   @param it The iterator to compare with
118 |     *   @return TRUE if the position of it is greater than *this, FALSE otherwise
119 |     */
120 |     bool operator <( const UTF8iterator& it ) const noexcept;
121 |     /**
122 |     *   @fn bool operator >(const UTF8iterator& it) const noexcept
123 |     *   @param it The iterator to compare with
124 |     *   @return TRUE if the position of it is less than *this, FALSE otherwise
125 |     */
126 |     bool operator >( const UTF8iterator& it ) const noexcept;
127 |     /**
128 |     *   @fn bool operator <=(const UTF8iterator& it) const noexcept
129 |     *   @param it The iterator to compare with
130 |     *   @return TRUE if the position of it is greater than or equal to *this, FALSE otherwise
131 |     */
132 |     bool operator <=( const UTF8iterator& it ) const noexcept;
133 |     /**
134 |     *   @fn bool operator >=(const UTF8iterator& it) const noexcept
135 |     *   @param it The iterator to compare with
136 |     *   @return TRUE if the position of it is less than or equal to *this, FALSE otherwise
137 |     */
138 |     bool operator >=( const UTF8iterator& it ) const noexcept;
139 | 
140 |     /**
141 |     *   @fn UTF8iterator operator +(const size_t n) const noexcept
142 |     *
143 |     *   Returns an iterator which has been moved n positions forward
144 |     *
145 |     *   @param n the number of steps to move forward
146 |     *   @return A copy of the iterator moved forward (at most to the end position)
147 |     */
148 |     UTF8iterator operator +( const size_t n ) const noexcept;
149 |     /**
150 |     *   @fn UTF8iterator operator -(const size_t n) const noexcept
151 |     *
152 |     *   Returns an iterator which has been moved n positions backward
153 |     *
154 |     *   @param n the number of steps to move backward
155 |     *   @return A copy of the iterator moved backward (at most to position 0)
156 |     */
157 |     UTF8iterator operator -( const size_t n ) const noexcept;
158 |     /**
159 |     *   @fn long operator -(const UTF8iterator& it) const
160 |     *
161 |     *   Return the difference value between *this and it
162 |     *
163 |     *   @param it The iterator to subtract from *this
164 |     *   @return A long value *n* such that it + n = *this
165 |     *   @pre *this and it point to the same data
166 |     *   @post *this == it + (*this - it)
167 |     *   @exception std::invalid_argument if the pre-condition is not satisfied
168 |     */
169 |     long operator -( const UTF8iterator& it ) const;
170 | 
171 |     /**
172 |     *   @fn const UTF8string::u8char operator *() const
173 |     *
174 |     *   Dereferences the iterator, returning the codepoint
175 |     *   pointed to by the iterator at its current position
176 |     *
177 |     *   @return The codepoint
178 |     *   @note This function will throw an *std::out_of_range* exception
179 |     *         if the iterator does not point to a codepoint
180 |     */
181 |     const UTF8string::u8char operator *() const;
182 | 
183 |     ~UTF8iterator() = default;
184 | };
185 | 
186 | #endif // UTF8_ITERATOR_HPP_INCLUDED
187 | 


--------------------------------------------------------------------------------
/src/utf8_string.cpp:
--------------------------------------------------------------------------------
  1 | /*
  2 | *
  3 | *   Copyright © 2018 Luxon Jean-Pierre
  4 | *   https://gumichan01.github.io/
  5 | *
  6 | *   This library is under the MIT license
  7 | *
  8 | *   Luxon Jean-Pierre (Gumichan01)
  9 | *   luxon.jean.pierre@gmail.com
 10 | *
 11 | */
 12 | 
 13 | #include "utf8_string.hpp"
 14 | 
 15 | #include <unordered_map>
 16 | #include <utility>
 17 | 
 18 | 
 19 | namespace
 20 | {
 21 | 
 22 | constexpr size_t min( size_t a, size_t b )
 23 | {
 24 |     return ( b < a ) ? b : a;    // the smaller of the two values
 25 | }
 26 | 
 27 | inline std::basic_string<unsigned char> toUstring( const std::string& str )
 28 | {
 29 |     return std::basic_string<unsigned char>( str.cbegin(), str.cend() );    // same bytes, unsigned view
 30 | }
 31 | 
 32 | // Shift table used by utf8_find
 33 | void preprocess( const UTF8string& str,
 34 |                  std::unordered_map<UTF8string::u8char, size_t>& u8map ) noexcept
 35 | {
 36 |     const size_t PATTERN_LEN = str.utf8_length();
 37 | 
 38 |     // A pattern of zero or one codepoint adds nothing to the table
 39 |     if ( PATTERN_LEN < 2 )
 40 |         return;
 41 | 
 42 |     // Walk from the second-to-last codepoint down to the first one.
 43 |     // "remaining" counts the codepoints still to visit, so the current
 44 |     // index is remaining - 1 and the loop stops cleanly at zero.
 45 |     for ( size_t remaining = PATTERN_LEN - 1U; remaining > 0; --remaining )
 46 |     {
 47 |         const size_t CURRENT = remaining - 1U;
 48 | 
 49 |         // emplace() leaves an already recorded codepoint untouched
 50 |         u8map.emplace( str[CURRENT], PATTERN_LEN - 1U - CURRENT );
 51 |     }
 52 | }
 53 | 
 54 | }
 55 | 
 56 | 
 57 | UTF8string::UTF8string( const char * str )
 58 |     : UTF8string( std::string( str ) ) {}    // delegates to the std::string constructor, which validates
 59 | 
 60 | 
 61 | UTF8string::UTF8string( const std::string& str )
 62 |     : _utf8string( str )
 63 | {
 64 |     // Reject anything that is not well-formed UTF-8 before caching the length
 65 |     const bool WELL_FORMED = utf8_is_valid_();
 66 |     if ( !WELL_FORMED ) throw std::invalid_argument( "Invalid UTF-8 string\n" );
 67 |     _utf8length = utf8_length_();
 68 | }
 69 | 
 70 | 
 71 | UTF8string::UTF8string( const UTF8string& u8str ) noexcept
 72 |     : _utf8string( u8str._utf8string ), _utf8length( u8str._utf8length ) {}    // copies bytes and cached length, no re-validation
 73 | 
 74 | UTF8string::UTF8string( const UTF8string& u8str, size_t pos, size_t len ) noexcept
 75 |     : UTF8string( u8str.utf8_substr( pos, len ) ) {}    // pos and len are counted in codepoints (see utf8_substr)
 76 | 
 77 | UTF8string::UTF8string( UTF8string&& u8str ) noexcept    // takes over the content of u8str and leaves it empty
 78 |     : _utf8string( std::move( u8str._utf8string ) ), _utf8length( u8str._utf8length )    // move the buffer, do not copy it
 79 | {
 80 |     u8str.utf8_clear();    // also resets the cached length of the source
 81 |     u8str._utf8string.shrink_to_fit();
 82 | }
 83 | 
 84 | UTF8string& UTF8string::operator =( const char * str )
 85 | {
 86 |     return utf8_assign( str );    // utf8_assign() throws std::invalid_argument on invalid UTF-8
 87 | }
 88 | 
 89 | 
 90 | UTF8string& UTF8string::operator =( const std::string& str )
 91 | {
 92 |     return utf8_assign( str );    // utf8_assign() throws std::invalid_argument on invalid UTF-8
 93 | }
 94 | 
 95 | 
 96 | UTF8string& UTF8string::operator =( const UTF8string& u8str ) noexcept
 97 | {
 98 |     _utf8length = u8str._utf8length;    // cached codepoint count
 99 |     _utf8string = u8str._utf8string;    // raw bytes
100 |     return *this;
101 | }
102 | 
103 | UTF8string& UTF8string::operator =( UTF8string&& u8str ) noexcept
104 | {
105 |     return utf8_assign( std::move( u8str ) );    // utf8_assign() leaves u8str empty
106 | }
107 | 
108 | const UTF8string& UTF8string::operator +=( const std::string& str )
109 | {
110 |     const UTF8string::u8string SAVED = _utf8string;
111 |     _utf8string.append( str );
112 | 
113 |     if ( utf8_is_valid_() )
114 |     {
115 |         _utf8length = utf8_length_();
116 |         return *this;
117 |     }
118 | 
119 |     _utf8string = SAVED;    // invalid input: restore the previous content
120 |     throw std::invalid_argument( "Invalid UTF-8 string\n" );
121 | }
122 | 
123 | 
124 | const UTF8string& UTF8string::operator +=( const UTF8string& u8str )
125 | {
126 |     _utf8string.append( u8str._utf8string );    // no validation step in this overload
127 |     _utf8length = utf8_length_();
128 |     return *this;
129 | }
130 | 
131 | 
132 | const UTF8string& UTF8string::operator +=( const char * str )    // appends str; throws and rolls back if the result is invalid
133 | {
134 |     const UTF8string::u8string BACKUP = _utf8string;    // kept to roll back on failure
135 |     _utf8string += UTF8string::u8string( str );    // a temporary is already an rvalue: std::move was redundant
136 | 
137 |     if ( !utf8_is_valid_() )
138 |     {
139 |         _utf8string = BACKUP;
140 |         throw std::invalid_argument( "Invalid UTF-8 string\n" );
141 |     }
142 | 
143 |     _utf8length = utf8_length_();
144 |     return *this;
145 | }
146 | 
147 | 
148 | bool UTF8string::utf8_is_valid_() const noexcept    // true if _utf8string is well-formed UTF-8
149 | {
150 |     const std::basic_string<unsigned char> U8STRING = toUstring( _utf8string );
151 |     auto it = U8STRING.begin();
152 |     const auto ITEND = U8STRING.cend();
153 | 
154 |     while ( it < ITEND )
155 |     {
156 |         if ( ( 0xF8 & *it ) == 0xF0 && *it <= 0xF4 )
157 |         {
158 |             // The UTF-8 codepoint begin with 0b11110xxx -> 4-byte codepoint
159 |             // If the iterator reach the end of the string before the
160 |             // end of the 4-byte codepoint -> invalid string
161 |             if ( ( it + 1 ) == ITEND || ( it + 2 ) == ITEND || ( it + 3 ) == ITEND )
162 |                 return false;
163 | 
164 |             // Each of the following bytes is a value
165 |             // between 0x80 and 0xBF
166 |             if ( ( ( 0xC0 & *( it + 1 ) ) != 0x80 ) || ( ( 0xC0 & *( it + 2 ) ) != 0x80 )
167 |                     || ( ( 0xC0 & *( it + 3 ) ) != 0x80 ) )
168 |             {
169 |                 return false;
170 |             }
171 | 
172 |             // If the first byte of the sequence is 0xF0
173 |             // then the first continuation byte must be between 0x90 and 0xBF
174 |             // otherwise, if the byte is 0xF4
175 |             // then the first continuation byte must be between 0x80 and 0x8F
176 |             if ( *it == 0xF0 )
177 |             {
178 |                 if ( *( it + 1 ) < 0x90 || *( it + 1 ) > 0xBF )
179 |                     return false;
180 |             }
181 |             else if ( *it == 0xF4 )
182 |             {
183 |                 if ( *( it + 1 ) < 0x80 || *( it + 1 ) > 0x8F )
184 |                     return false;
185 |             }
186 | 
187 |             it += 4;    // Jump to the next codepoint
188 |         }
189 |         else if ( ( 0xF0 & *it ) == 0xE0 )
190 |         {
191 |             // The UTF-8 codepoint begin with 0b1110xxxx -> 3-byte codepoint
192 |             if ( ( it + 1 ) == ITEND || ( it + 2 ) == ITEND )
193 |                 return false;
194 | 
195 |             // Each of the following bytes starts with
196 |             // 0b10xxxxxx in a valid string
197 |             if ( ( ( 0xC0 & *( it + 1 ) ) != 0x80 ) || ( ( 0xC0 & *( it + 2 ) ) != 0x80 ) )
198 |                 return false;
199 | 
200 |             // If the first byte of the sequence is 0xE0
201 |             // then the first continuation byte must be between 0xA0 and 0xBF
202 |             // otherwise, if the byte is 0xED (UTF-16 surrogate range)
203 |             // then the first continuation byte must be between 0x80 and 0x9F
204 |             if ( *it == 0xE0 )
205 |             {
206 |                 if ( *( it + 1 ) < 0xA0 || *( it + 1 ) > 0xBF )
207 |                     return false;
208 |             }
209 |             else if ( *it == 0xED )
210 |             {
211 |                 if ( *( it + 1 ) > 0x9F )
212 |                     return false;
213 |             }
214 | 
215 |             it += 3;
216 |         }
217 |         else if ( ( 0xE0 & *it ) == 0xC0 )
218 |         {
219 |             // 0b110xxxxx -> 2-byte codepoint; 0xC0 and 0xC1 only encode overlong forms
220 |             if ( *it < 0xC2 || ( it + 1 ) == ITEND )
221 |                 return false;
222 | 
223 |             // The following byte starts with 0b10xxxxxx in a valid string
224 |             if ( ( 0xC0 & *( it + 1 ) ) != 0x80 )
225 |                 return false;
226 | 
227 |             it += 2;
228 |         }
229 |         else if ( ( 0x80 & *it ) == 0x00 )
230 |         {
231 |             // The UTF-8 codepoint begin with 0b0xxxxxxx -> 1-byte codepoint
232 |             it += 1;
233 |         }
234 |         else
235 |         {
236 |             // Invalid codepoint
237 |             return false;
238 |         }
239 |     }
240 | 
241 |     return true;
242 | }
243 | 
244 | // Count the codepoints stored in the string
245 | size_t UTF8string::utf8_length_() const noexcept
246 | {
247 |     const auto LAST = _utf8string.end();
248 |     auto cursor = _utf8string.begin();
249 |     size_t count = 0;
250 | 
251 |     while ( cursor != LAST )
252 |     {
253 |         const byte_t LEAD = static_cast<byte_t>( *cursor );
254 | 
255 |         // The high bits of the lead byte give the width of the sequence:
256 |         //   0b11110xxx -> 4 bytes
257 |         //   0b1110xxxx -> 3 bytes
258 |         //   0b110xxxxx -> 2 bytes
259 |         //   anything else is counted as a single byte
260 |         if ( ( LEAD & 0xf8 ) == 0xf0 )
261 |         {
262 |             cursor += 4;
263 |         }
264 |         else if ( ( LEAD & 0xf0 ) == 0xe0 )
265 |         {
266 |             cursor += 3;
267 |         }
268 |         else if ( ( LEAD & 0xe0 ) == 0xc0 )
269 |         {
270 |             cursor += 2;
271 |         }
272 |         else
273 |         {
274 |             cursor += 1;
275 |         }
276 | 
277 |         // One more codepoint seen, whatever its width
278 |         ++count;
279 |     }
280 | 
281 |     return count;
282 | }
283 | 
284 | // Width in bytes of the codepoint whose lead byte is at offset j
285 | size_t UTF8string::utf8_codepoint_len_( const size_t j ) const noexcept
286 | {
287 |     const auto LEAD = _utf8string[j];
288 | 
289 |     // 0b11110xxx announces a 4-byte sequence
290 |     if ( ( LEAD & 0xf8 ) == 0xf0 )
291 |         return 4;
292 | 
293 |     // 0b1110xxxx announces a 3-byte sequence
294 |     if ( ( LEAD & 0xf0 ) == 0xe0 )
295 |         return 3;
296 | 
297 |     // 0b110xxxxx announces a 2-byte sequence
298 |     if ( ( LEAD & 0xe0 ) == 0xc0 )
299 |         return 2;
300 |     return 1;    // any other byte counts as one
301 | }
302 | 
303 | 
304 | void UTF8string::utf8_clear() noexcept
305 | {
306 |     _utf8length = 0;        // no codepoint left
307 |     _utf8string.clear();    // drop the bytes
308 | }
309 | 
310 | 
311 | bool UTF8string::utf8_empty() const noexcept
312 | {
313 |     return 0 == _utf8length;    // relies on the cached codepoint count
314 | }
315 | 
316 | 
317 | UTF8string& UTF8string::utf8_assign( const char * str )
318 | {
319 |     const UTF8string::u8string SAVED = _utf8string;
320 |     _utf8string = std::string( str );
321 | 
322 |     if ( utf8_is_valid_() )
323 |     {
324 |         _utf8length = utf8_length_();
325 |         return *this;
326 |     }
327 | 
328 |     _utf8string = SAVED;    // invalid input: restore the previous content
329 |     throw std::invalid_argument( "Invalid UTF-8 string\n" );
330 | }
331 | 
332 | UTF8string& UTF8string::utf8_assign( const u8string& str )
333 | {
334 |     const UTF8string::u8string SAVED = _utf8string;
335 |     _utf8string = str;
336 | 
337 |     if ( utf8_is_valid_() )
338 |     {
339 |         _utf8length = utf8_length_();
340 |         return *this;
341 |     }
342 | 
343 |     _utf8string = SAVED;    // invalid input: restore the previous content
344 |     throw std::invalid_argument( "Invalid UTF-8 string\n" );
345 | }
346 | 
347 | UTF8string& UTF8string::utf8_assign( const u8string& str, size_t pos, size_t count )
348 | {
349 |     const UTF8string::u8string SAVED = _utf8string;
350 |     _utf8string = str.substr( pos, count );    // pos and count are byte offsets into str
351 | 
352 |     if ( utf8_is_valid_() )
353 |     {
354 |         _utf8length = utf8_length_();
355 |         return *this;
356 |     }
357 | 
358 |     _utf8string = SAVED;    // invalid input: restore the previous content
359 |     throw std::invalid_argument( "Invalid UTF-8 string\n" );
360 | }
361 | 
362 | UTF8string& UTF8string::utf8_assign( UTF8string&& u8str ) noexcept    // takes over the content of u8str and leaves it empty
363 | {
364 |     if ( this == &u8str )    // self-move: keep the content instead of clearing it
365 |         return *this;
366 |     _utf8string = std::move( u8str._utf8string );    // move the buffer, do not copy it
367 |     _utf8length = u8str._utf8length;
368 |     u8str.utf8_clear();
369 |     u8str._utf8string.shrink_to_fit();
370 |     return *this;
371 | }
372 | 
373 | 
374 | /*
375 |     Translate a codepoint index into the byte offset
376 |     where that codepoint starts in the underlying string
377 | */
378 | size_t UTF8string::utf8_bpos_at_( const size_t cpos ) const noexcept
379 | {
380 |     const size_t TOTAL_BYTES = utf8_size();
381 |     size_t offset = 0;
382 | 
383 |     // Hop over one whole codepoint per requested index, never past the end
384 |     for ( size_t left = cpos; left > 0 && offset < TOTAL_BYTES; --left )
385 |         offset += utf8_codepoint_len_( offset );
386 | 
387 |     return offset;
388 | }
389 | 
390 | 
391 | UTF8string::u8string UTF8string::utf8_at_( const size_t index ) const noexcept
392 | {
393 |     const size_t START = utf8_bpos_at_( index );    // byte offset of the codepoint
394 |     return _utf8string.substr( START, utf8_codepoint_len_( START ) );
395 | }
396 | 
397 | 
398 | UTF8string::u8char UTF8string::utf8_at( const size_t index ) const
399 | {
400 |     if ( _utf8length <= index )    // checked access: index is counted in codepoints
401 |         throw std::out_of_range( "index value greater than the size of the string" );
402 | 
403 |     return utf8_at_( index );
404 | }
405 | 
406 | 
407 | UTF8string::u8char UTF8string::operator []( const size_t index ) const noexcept
408 | {
409 |     return utf8_at_( index );    // unchecked counterpart of utf8_at(): no range test on index
410 | }
411 | 
412 | 
413 | void UTF8string::utf8_pop()
414 | {
415 |     if ( utf8_empty() )
416 |         throw std::length_error( "Cannot remove the last element from an empty string" );
417 | 
418 |     const size_t LAST_START = utf8_bpos_at_( _utf8length - 1 );    // first byte of the last codepoint
419 |     _utf8string.erase( LAST_START );
420 |     --_utf8length;
421 | }
422 | 
423 | // Erase up to count codepoints from codepoint index; throws std::out_of_range if index > length
424 | UTF8string& UTF8string::utf8_erase( const size_t index, const size_t count )
425 | {
426 |     // index == length is accepted: there is simply nothing to erase
427 |     if ( index > _utf8length )
428 |         throw std::out_of_range( "utf8_range - index out of range" );
429 | 
430 |     // Never erase past the end: clamp count to what is left after index
431 |     const size_t COUNT = min( count, _utf8length - index );
432 | 
433 |     if ( COUNT == 0 )
434 |         return *this;
435 | 
436 |     // Convert the codepoint range [index, index + COUNT) to a byte range
437 |     const size_t BFIRST = utf8_bpos_at_( index );
438 |     const size_t BLAST  = utf8_bpos_at_( index + COUNT );
439 | 
440 |     // Drop the bytes in place instead of rebuilding the string byte by byte.
441 |     // Whole codepoints are removed, so the new length is known without
442 |     // rescanning the string.
443 |     _utf8string.erase( BFIRST, BLAST - BFIRST );
444 |     _utf8length -= COUNT;
445 | 
446 |     return *this;
447 | }
448 | 
449 | UTF8iterator UTF8string::utf8_erase( const UTF8iterator& position )
450 | {
451 |     const UTF8iterator END = utf8_end();
452 |     // Nothing to erase at the end position
453 |     if ( position == END ) return END;
454 |     // Last codepoint: popping it is enough
455 |     if ( position == END - 1 )
456 |     {
457 |         utf8_pop();
458 |         return utf8_end();
459 |     }
460 |     const size_t OFFSET = static_cast<size_t>( position - utf8_begin() );
461 |     utf8_erase( OFFSET, 1U );
462 |     return utf8_begin() + OFFSET;
463 | }
464 | 
465 | UTF8iterator UTF8string::utf8_erase( const UTF8iterator& first, const UTF8iterator& last )
466 | {
467 |     if ( first == last )    // empty range: nothing is erased
468 |         return utf8_end();
469 | 
470 |     if ( first == utf8_begin() && last == utf8_end() )    // whole string
471 |     {
472 |         utf8_clear();
473 |         return utf8_end();
474 |     }
475 | 
476 |     const UTF8iterator& REAL_FIRST = first < last ? first : last;    // accept the bounds in either order
477 |     const UTF8iterator& REAL_LAST  = first < last ? last : first;
478 | 
479 |     const size_t INDEX = static_cast<size_t>( REAL_FIRST - utf8_begin() );
480 |     const size_t COUNT = static_cast<size_t>( REAL_LAST - REAL_FIRST );    // operator - throws std::invalid_argument when the iterators hold different strings
481 |     utf8_erase( INDEX, COUNT );
482 | 
483 |     return utf8_begin() + INDEX;
484 | }
485 | 
486 | // Returns the substring of at most len codepoints starting at codepoint pos
487 | // (empty when pos is past the end; len == npos means "up to the end").
488 | UTF8string UTF8string::utf8_substr( size_t pos, size_t len ) const
489 | {
490 |     if ( pos > _utf8length )
491 |         return UTF8string();
492 | 
493 |     // Clamp len against the remainder instead of testing pos + len, which
494 |     // wraps around for huge len values. npos, the largest size_t, clamps too.
495 |     const size_t REMAINING = _utf8length - pos;
496 |     const size_t N = ( len > REMAINING ) ? REMAINING : len;
497 | 
498 |     UTF8iterator it = utf8_iterator_() + pos;
499 |     const UTF8iterator _END = ( it + N );
500 |     std::string s;
501 | 
502 |     while ( it != _END )
503 |         s += *( it++ );
504 |     return UTF8string( s );
505 | }
506 | 
507 | // This function implements the Boyer-Moore string search algorithm.
508 | // It returns the codepoint index of the first occurrence of str found at or
509 | // after pos, or npos when there is none (an empty str is never found).
510 | size_t UTF8string::utf8_find( const UTF8string& str, size_t pos ) const
511 | {
512 |     const size_t U8LEN = str._utf8length;
513 | 
514 |     // A pattern longer than *this cannot match. Without this guard the
515 |     // unsigned "_utf8length - U8LEN" below wraps around and the search
516 |     // calls utf8_at() out of range.
517 |     if ( U8LEN == 0 || U8LEN > _utf8length )
518 |         return UTF8string::npos;
519 | 
520 |     // Preprocessing: the table used for the shifts is filled by preprocess()
521 |     std::unordered_map<UTF8string::u8char, size_t> u8map;
522 |     preprocess( str, u8map );
523 | 
524 |     const size_t LAST_START = _utf8length - U8LEN;
525 |     size_t index = pos;
526 | 
527 |     // Look for the substring
528 |     while ( index <= LAST_START )
529 |     {
530 |         size_t j = U8LEN - 1;
531 | 
532 |         // Compare the window against str from right to left: j stops on the
533 |         // first mismatch, or on 0 once everything to its right has matched
534 |         while ( j > 0 && str.utf8_at( j ) == utf8_at( index + j ) )
535 |             --j;
536 | 
537 |         if ( str.utf8_at( j ) == utf8_at( index + j ) )
538 |             return index;
539 | 
540 |         // Mismatch: shift by the table entry of the offending codepoint,
541 |         // or by the whole pattern length when it has no entry
542 |         const auto SHIFT = u8map.find( utf8_at( index + j ) );
543 |         index += ( SHIFT == u8map.end() ) ? U8LEN : SHIFT->second;
544 |     }
545 | 
546 |     return UTF8string::npos;
547 | }
548 | 
549 | // Moves it backwards until it reaches _end, appending each codepoint to res.
550 | // Written as a loop so the call depth no longer grows with the string length.
551 | UTF8string UTF8string::utf8_reverse_aux_( UTF8iterator& it, const UTF8iterator& _end, UTF8string& res )
552 | {
553 |     while ( it != _end )
554 |         res += *( --it );
555 | 
556 |     return res;
557 | }
558 | 
559 | 
560 | // Reverses the order of the codepoints (nothing to do for 0 or 1 codepoint)
561 | UTF8string& UTF8string::utf8_reverse()
562 | {
563 |     if ( _utf8length <= 1 )
564 |         return *this;
565 | 
566 |     UTF8string reversed;
567 |     UTF8iterator cursor = utf8_end();   // walked backwards by the helper
568 |     _utf8string = utf8_reverse_aux_( cursor, utf8_iterator_(), reversed )._utf8string;
569 |     return *this;
570 | }
571 | 
572 | 
573 | size_t UTF8string::utf8_size() const noexcept
574 | {
575 |     return _utf8string.length();    // size in bytes, not in codepoints
576 | }
577 | 
578 | 
579 | size_t UTF8string::utf8_length() const noexcept
580 | {
581 |     return this->_utf8length;       // cached number of codepoints
582 | }
583 | 
584 | const std::string UTF8string::utf8_sstring() const noexcept
585 | {
586 |     return std::string( _utf8string );  // a copy of the stored bytes
587 | }
588 | 
589 | const char * UTF8string::utf8_str() const noexcept
590 | {
591 |     return _utf8string.data();      // null-terminated, like c_str(), since C++11
592 | }
593 | 
594 | size_t UTF8string::hash() const noexcept
595 | {
596 |     // Fowler-Noll-Vo style hash over the stored bytes (32-bit FNV prime
597 |     // and offset basis), finally mixed with the codepoint count
598 |     const size_t FNV_PRIME = 16777619U;
599 |     size_t digest = 2166136261U;
600 | 
601 |     for ( size_t i = 0U; i < _utf8string.size(); ++i )
602 |     {
603 |         digest = ( digest * FNV_PRIME ) ^ static_cast<size_t>( _utf8string[i] );
604 |     }
605 |     return digest ^ ( _utf8length << 1 );
606 | }
607 | 
608 | // Internal helper: returns a UTF8iterator constructed from the current string
609 | UTF8iterator UTF8string::utf8_iterator_() const noexcept
610 | {
611 |     return UTF8iterator( *this );
612 | }
613 | 
614 | 
615 | UTF8iterator UTF8string::utf8_begin() const noexcept
616 | {
617 |     return UTF8iterator( *this );   // same iterator utf8_iterator_() builds
618 | }
619 | 
620 | 
621 | UTF8iterator UTF8string::utf8_end() const noexcept
622 | {
623 |     return utf8_iterator_() + _utf8length;  // start iterator advanced by the codepoint count
624 | }
625 | 
626 | 
627 | UTF8iterator UTF8string::begin() const noexcept
628 | {
629 |     return utf8_iterator_();        // identical to utf8_begin()
630 | }
631 | 
632 | 
633 | UTF8iterator UTF8string::end() const noexcept
634 | {
635 |     return utf8_end();              // i.e. utf8_begin() + _utf8length
636 | }
637 | 
638 | 
639 | // Equality on the stored bytes. The buffers are compared in place: going
640 | // through utf8_sstring() would copy both strings, and a failed allocation
641 | // inside a noexcept function terminates the program.
642 | bool operator ==( const UTF8string& str1, const UTF8string& str2 ) noexcept
643 | {
644 |     const size_t SIZE = str1.utf8_size();
645 | 
646 |     return SIZE == str2.utf8_size()
647 |            && std::char_traits<char>::compare( str1.utf8_str(), str2.utf8_str(), SIZE ) == 0;
648 | }
643 | 
644 | bool operator !=( const UTF8string& str1, const UTF8string& str2 ) noexcept
645 | {
646 |     return ( str1 == str2 ) ? false : true;     // negation of operator ==
647 | }
648 | 
649 | 
650 | bool operator <=( const UTF8string& str1, const UTF8string& str2 ) noexcept
651 | {
652 |     return !( str2 < str1 );        // str1 <= str2 means str2 is not lower than str1
653 | }
654 | 
655 | 
656 | bool operator >=( const UTF8string& str1, const UTF8string& str2 ) noexcept
657 | {
658 |     return !( str1 < str2 );        // str1 >= str2 means str1 is not lower than str2
659 | }
660 | 
661 | 
662 | // Lexicographic "less than" on the stored bytes (same order as std::string).
663 | // The buffers are compared in place: utf8_sstring() would copy both strings,
664 | // and a failed allocation inside a noexcept function terminates the program.
665 | bool operator <( const UTF8string& str1, const UTF8string& str2 ) noexcept
666 | {
667 |     const size_t SIZE1 = str1.utf8_size();
668 |     const size_t SIZE2 = str2.utf8_size();
669 |     const size_t COMMON = ( SIZE1 < SIZE2 ) ? SIZE1 : SIZE2;
670 |     const int ORDER = std::char_traits<char>::compare( str1.utf8_str(), str2.utf8_str(), COMMON );
671 | 
672 |     // On a common prefix the shorter string comes first
673 |     return ORDER < 0 || ( ORDER == 0 && SIZE1 < SIZE2 );
674 | }
666 | 
667 | 
668 | bool operator >( const UTF8string& str1, const UTF8string& str2 ) noexcept
669 | {
670 |     return str2 < str1;             // same test with the operands swapped
671 | }
672 | 
673 | 
674 | UTF8string operator +( const UTF8string& str1, const UTF8string& str2 )
675 | {
676 |     return UTF8string( str1.utf8_sstring() + str2.utf8_sstring() );    // joined bytes, validated again
677 | }
678 | 
679 | 
680 | UTF8string operator +( const UTF8string& str1, const std::string& str2 )
681 | {
682 |     return UTF8string( std::string( str1.utf8_sstring() ).append( str2 ) );    // str1 bytes followed by str2
683 | }
684 | 
685 | UTF8string operator +( const std::string& str1, const UTF8string& str2 )
686 | {
687 |     return UTF8string( std::string( str1 ).append( str2.utf8_sstring() ) );    // str1 followed by str2 bytes
688 | }
689 | 
690 | 
691 | UTF8string operator +( const UTF8string& str1, const char * str2 )
692 | {
693 |     return operator +( str1, std::string( str2 ) );    // forwards to the std::string overload
694 | }
695 | 
696 | 
697 | UTF8string operator +( const char * str1, const UTF8string& str2 )
698 | {
699 |     return operator +( std::string( str1 ), str2 );    // forwards to the std::string overload
700 | }
701 | 
702 | 
703 | std::ostream& operator <<( std::ostream& os, const UTF8string& str )
704 | {
705 |     // Writes the raw UTF-8 bytes; the stream insertion hands os back
706 |     return os << str.utf8_sstring();
707 | }
708 | 
709 | 
710 | std::istream& operator >>( std::istream& is, UTF8string& str )
711 | {
712 |     std::string line;
713 |     std::getline( is, line );       // a whole line, not a single word
714 |     str = line;                     // assignment from std::string
715 |     return is;
716 | }
717 | 


--------------------------------------------------------------------------------
/src/utf8_string.hpp:
--------------------------------------------------------------------------------
  1 | /*
  2 | *
  3 | *   Copyright © 2018 Luxon Jean-Pierre
  4 | *   https://gumichan01.github.io/
  5 | *
  6 | *   This library is under the MIT license
  7 | *
  8 | *   Luxon Jean-Pierre (Gumichan01)
  9 | *   luxon.jean.pierre@gmail.com
 10 | *
 11 | */
 12 | 
 13 | #ifndef UTF8_STRING_HPP_INCLUDED
 14 | #define UTF8_STRING_HPP_INCLUDED
 15 | 
 16 | 
 17 | /**
 18 | *   @file utf8_string.hpp
 19 | *   @brief This is a UTF-8 string library header
 20 | */
 21 | 
 22 | #include <string>
 23 | #include <iostream>
 24 | 
 25 | class UTF8iterator;
 26 | 
 27 | /**
 28 | *   @class UTF8string final
 29 | *   @brief UTF-8 string class
 30 | *
 31 | *   This class defines a UTF-8 string
 32 | */
 33 | class UTF8string final
 34 | {
 35 |     using byte_t = unsigned char;
 36 |     using u8string = std::string;
 37 | 
 38 |     u8string _utf8string = {};
 39 |     size_t _utf8length = 0U;
 40 | 
 41 |     bool utf8_is_valid_() const noexcept;
 42 |     size_t utf8_length_() const noexcept;
 43 |     size_t utf8_codepoint_len_( const size_t j ) const noexcept;
 44 |     size_t utf8_bpos_at_( const size_t cpos ) const noexcept;
 45 |     u8string utf8_at_( const size_t index ) const noexcept;
 46 | 
 47 |     UTF8iterator utf8_iterator_() const noexcept;
 48 |     UTF8string utf8_reverse_aux_( UTF8iterator& it,
 49 |                                   const UTF8iterator& _end, UTF8string& res );
 50 | 
 51 | public:
 52 | 
 53 |     /**
 54 |     *   @typedef u8char
 55 |     *   @brief The UTF-8 character
 56 |     */
 57 |     using u8char = std::string;
 58 | 
 59 |     /**
 60 |     *   @var npos
 61 |     *
 62 |     *   *npos* is a static member constant value with the greatest
 63 |     *   possible value for an element of type *size_t*.
 64 |     *
 65 |     *   This value, when used as the value for a len parameter in some
 66 |     *   UTF-8 string's member functions (utf8_substr()),
 67 |     *   means "until the end of the utf-8 string".
 68 |     *
 69 |     *   As a return value in utf8_find(), it is used to indicate no matches.
 70 |     *   This constant is defined with a value of -1,
 71 |     *   which because *size_t* is an unsigned integral type,
 72 |     *   it is the largest possible representable value for this type.
 73 |     *
 74 |     */
 75 |     constexpr static size_t npos = std::string::npos;
 76 | 
 77 |     /**
 78 |     *   @fn UTF8string() = default
 79 |     */
 80 |     UTF8string() = default;
 81 |     /**
 82 |     *   @fn UTF8string(const char * str)
 83 |     *   @param str
 84 |     *   @pre str is not null
 85 |     *   @exception std::invalid_argument If the string is not valid
 86 |     */
 87 |     UTF8string( const char * str );
 88 |     /**
 89 |     *   @fn UTF8string(const std::string& str)
 90 |     *   @param str
 91 |     *   @exception std::invalid_argument If the string is not valid
 92 |     */
 93 |     UTF8string( const std::string& str );
 94 |     /**
 95 |     *   @fn UTF8string(const UTF8string& u8str) noexcept
 96 |     *   @param u8str
 97 |     */
 98 |     UTF8string( const UTF8string& u8str ) noexcept;
 99 |     /**
100 |     *   @fn UTF8string(const UTF8string& u8str, size_t pos, size_t len = npos) noexcept
101 |     *   @param u8str
102 |     *   @param pos The beginning position of the substring
103 |     *   @param len The length of the substring (in number of codepoints, default value = npos)
104 |     */
105 |     UTF8string( const UTF8string& u8str, size_t pos, size_t len = npos ) noexcept;
106 |     /**
107 |     *   @fn UTF8string(UTF8string&& u8str) noexcept
108 |     *   @param u8str
109 |     */
110 |     UTF8string( UTF8string&& u8str ) noexcept;
111 | 
112 |     /**
113 |     *   @fn UTF8string& operator =(const char * str)
114 |     *   @param str C-string that will be converted
115 |     *   @return A reference to the new utf-8 string
116 |     *   @exception std::invalid_argument If the string is not valid
117 |     *   @note If an exception is thrown, the object is not modified
118 |     */
119 |     UTF8string& operator =( const char * str );
120 |     /**
121 |     *   @fn UTF8string& operator =(const std::string& str)
122 |     *   @param str The string that will be converted and checked
123 |     *   @return A reference to the new utf-8 string
124 |     *   @exception std::invalid_argument If the string is not valid
125 |     *   @note If an exception is thrown, the object is not modified
126 |     */
127 |     UTF8string& operator =( const std::string& str );
128 |     /**
129 |     *   @fn UTF8string& operator =(const UTF8string& u8str)
130 |     *   @param u8str The utf-8 string
131 |     *   @return A reference to the new utf-8 string
132 |     */
133 |     UTF8string& operator =( const UTF8string& u8str ) noexcept;
134 |     /**
135 |     *   @fn UTF8string& operator =(UTF8string&& u8str)
136 |     *   @param u8str The utf-8 string
137 |     *   @return A reference to the new utf-8 string
138 |     */
139 |     UTF8string& operator =( UTF8string&& u8str ) noexcept;
140 | 
141 |     /**
142 |     *   @fn const UTF8string& operator +=(const UTF8string& u8str)
143 |     *
144 |     *   Append a utf-8 string
145 |     *
146 |     *   @param u8str The utf-8 string to append
147 |     *   @return The reference to the concatenated utf-8 string
148 |     */
149 |     const UTF8string& operator +=( const UTF8string& u8str );
150 |     /**
151 |     *   @fn const UTF8string& operator +=(const std::string& str)
152 |     *
153 |     *   Append a string
154 |     *
155 |     *   @param str The string to append
156 |     *   @return The reference to the concatenated utf-8 string
157 |     *   @exception std::invalid_argument If the string is not valid
158 |     *   @note If an exception is thrown, the object is not modified
159 |     */
160 |     const UTF8string& operator +=( const std::string& str );
161 |     /**
162 |     *   @fn const UTF8string& operator +=(const char * str)
163 |     *
164 |     *   Append a C-string
165 |     *
166 |     *   @param str The C-string to append
167 |     *   @return The reference to the concatenated utf-8 string
168 |     *   @exception std::invalid_argument If the string is not valid
169 |     */
170 |     const UTF8string& operator +=( const char * str );
171 | 
172 |     /**
173 |     *   @fn void utf8_clear() noexcept
174 |     *   Clear the content of the object
175 |     */
176 |     void utf8_clear() noexcept;
177 |     /**
178 |     *   @fn bool utf8_empty() const noexcept
179 |     *
180 |     *   Check if the content is empty
181 |     *
182 |     *   @return TRUE If it is empty, FALSE otherwise
183 |     */
184 |     bool utf8_empty() const noexcept;
185 | 
186 | 
187 |     /**
188 |     *   @fn UTF8string& utf8_assign(const char * str)
189 |     *   @return The updated string
190 |     */
191 |     UTF8string& utf8_assign( const char * str );
192 |     /**
193 |     *   @fn UTF8string& utf8_assign(const u8string& str)
194 |     *   @return The updated string
195 |     */
196 |     UTF8string& utf8_assign( const u8string& str );
197 |     /**
198 |     *   @fn UTF8string& utf8_assign(const u8string& str, size_t pos, size_t count = npos)
199 |     *
200 |     *   Replaces the contents with a substring [pos, pos+count) of str.
201 |     *   If the requested substring lasts past the end of the string, or if count == npos, the resulting substring is [pos, str.size()).
202 |     *
203 |     *   @exception std::out_of_range If pos > str.size()
204 |     *   @return The updated string
205 |     */
206 |     UTF8string& utf8_assign( const u8string& str, size_t pos, size_t count = npos );
207 |     /**
208 |     *   @fn UTF8string& utf8_assign(UTF8string&& u8str) noexcept
209 |     *   @return The updated string
210 |     */
211 |     UTF8string& utf8_assign( UTF8string&& u8str ) noexcept;
212 | 
213 |     /**
214 |     *   @fn UTF8string::u8char utf8_at(const size_t index) const
215 |     *
216 |     *   Get the codepoint at a specified position.
217 |     *
218 |     *   @param index The index of the requested codepoint in the string
219 |     *   @return The codepoint
220 |     *   @exception std::out_of_range If the index is out of the string range
221 |     *   @note If an exception is thrown, the object is not modified
222 |     */
223 |     UTF8string::u8char utf8_at( const size_t index ) const;
224 |     /**
225 |     *   @fn UTF8string::u8char operator [](const size_t index) const noexcept
226 |     *
227 |     *   Get the codepoint at a specified position.
228 |     *
229 |     *   @param index The index of the requested codepoint in the string
230 |     *   @return The codepoint
231 |     *
232 |     *   @note If the index is out of the string range, calling this functions
233 |     *         causes undefined behaviour
234 |     */
235 |     UTF8string::u8char operator []( const size_t index ) const noexcept;
236 |     /**
237 |     *   @fn void utf8_pop()
238 |     *
239 |     *   Remove the last codepoint.
240 |     *
241 |     *   @exception std::length_error If the string is empty
242 |     *   @note If an exception is thrown, the object is not modified
243 |     */
244 |     void utf8_pop();
245 |     /**
246 |     *   @fn UTF8string& utf8_erase(const size_t index = 0, const size_t count = npos)
247 |     *
248 |     *   Removes min(count, utf8_length() - index) codepoints starting at index
249 |     *
250 |     *   @return *this
251 |     *   @exception std::out_of_range if ```index > utf8_length()```
252 |     *   @note If an exception is thrown, the object is not modified
253 |     */
254 |     UTF8string& utf8_erase( const size_t index = 0, const size_t count = npos );
255 |     /**
256 |     *   @fn UTF8iterator utf8_erase(const UTF8iterator& position)
257 |     *
258 |     *   Removes the character at position
259 |     *
260 |     *   @return Iterator pointing to the character immediately following the character erased,
261 |     *           or utf8_end() if no such character exists
262 |     *   @note If the iterator does not point to *this, the behaviour is undefined
263 |     */
264 |     UTF8iterator utf8_erase( const UTF8iterator& position );
265 |     /**
266 |     *   @fn UTF8iterator utf8_erase(const UTF8iterator& first, const UTF8iterator& last)
267 |     *
268 |     *   Removes the characters in the range [first, last[
269 |     *
270 |     *   @return Iterator pointing to the character ```last``` pointed to before the erase,
271 |     *           or utf8_end() if no such character exists
272 |     *   @note If one of the iterators does not point to *this, the behaviour is undefined
273 |     */
274 |     UTF8iterator utf8_erase( const UTF8iterator& first, const UTF8iterator& last );
275 | 
276 |     /**
277 |     *   @fn UTF8string utf8_substr(size_t pos = 0, size_t len = npos) const
278 |     *
279 |     *   Generate a substring according to the position and the length requested.
280 |     *
281 |     *   The substring is the portion of the object that starts at
282 |     *   character position *pos* and spans *len* characters
283 |     *   (or until the end of the string, whichever comes first).
284 |     *
285 |     *   @param pos The beginning position of the substring (default value: 0)
286 |     *   @param len The length of the substring (in number of codepoints, default value = npos)
287 |     *   @return The substring
288 |     */
289 |     UTF8string utf8_substr( size_t pos = 0, size_t len = npos ) const;
290 |     /**
291 |     *   @fn size_t utf8_find(const UTF8string& str, size_t pos = 0) const
292 |     *
293 |     *   Search for the first occurrence of utf8 string
294 |     *   specified in argument.
295 |     *
296 |     *   When pos is specified, the search only includes characters
297 |     *   at or after position pos, ignoring any possible occurrences
298 |     *   that include characters before pos.
299 |     *
300 |     *   @param str The string to look for
301 |     *   @param pos The position to start the search
302 |     *   @return The position of the substring if it was found
303 |     *           (in number of codepoints), UTF8string::npos otherwise.
304 |     */
305 |     size_t utf8_find( const UTF8string& str, size_t pos = 0 ) const;
306 |     /**
307 |     *   @fn UTF8string& utf8_reverse()
308 |     *   Reverse the current utf-8 string.
309 |     *   @return The reversed string
310 |     */
311 |     UTF8string& utf8_reverse();
312 | 
313 |     /**
314 |     *   @fn size_t utf8_size() const noexcept
315 |     *   Get the memory size (in bytes) of the utf-8 string
316 |     *   @return The memory size of the utf-8 string
317 |     */
318 |     size_t utf8_size() const noexcept;
319 |     /**
320 |     *   @fn size_t utf8_length() const noexcept
321 |     *   Get the length of the utf-8 string
322 |     *   @return The length of the utf-8 string (in number of codepoints)
323 |     */
324 |     size_t utf8_length() const noexcept;
325 | 
326 |     /**
327 |     *   @fn const std::string utf8_sstring() const noexcept
328 |     *
329 |     *   Returns the string related to the UTF-8 string
330 |     *
331 |     *   @return The string
332 |     */
333 |     const std::string utf8_sstring() const noexcept;
334 |     /**
335 |     *   @fn const char * utf8_str() const noexcept
336 |     *
337 |     *   Returns a pointer to an array that contains a null-terminated sequence
338 |     *   of characters (C-string).
339 |     *
340 |     *   This array include exactly the string plus the null character ('\0')
341 |     *   at the end.
342 |     *
343 |     *   @return A pointer to a C-string
344 |     */
345 |     const char * utf8_str() const noexcept;
346 |     /**
347 |     *   @fn size_t hash() const noexcept
348 |     *   Generate a hash value of the utf8 string
349 |     *   @return The hash value
350 |     */
351 |     size_t hash() const noexcept;
352 | 
353 |     /**
354 |     *   @fn UTF8iterator utf8_begin() const noexcept
355 |     *
356 |     *   Returns an iterator that points to the first codepoint of the string
357 |     *
358 |     *   @return An iterator to the beginning of the string
359 |     */
360 |     UTF8iterator utf8_begin() const noexcept;
361 |     /**
362 |     *   @fn UTF8iterator utf8_end() const noexcept
363 |     *
364 |     *   Returns an iterator that points to the *past-the-end* codepoint of the string
365 |     *
366 |     *   The past-the-end codepoint is a theoretical codepoint that would follow
367 |     *   the last codepoint in the string. It shall not be dereferenced.
368 |     *
369 |     *   @return An iterator to the past-the-end codepoint
370 |     */
371 |     UTF8iterator utf8_end() const noexcept;
372 | 
373 |     /**
374 |     *   @fn UTF8iterator begin() const noexcept
375 |     *
376 |     *   Returns an iterator that points to the first codepoint of the string
377 |     *
378 |     *   @return An iterator to the beginning of the string
379 |     *   @note Same as utf8_begin()
380 |     */
381 |     UTF8iterator begin() const noexcept;
382 |     /**
383 |     *   @fn UTF8iterator end() const noexcept
384 |     *
385 |     *   Returns an iterator that points to the *past-the-end* codepoint of the string
386 |     *
387 |     *   The past-the-end codepoint is a theoretical codepoint that would follow
388 |     *   the last codepoint in the string. It shall not be dereferenced.
389 |     *
390 |     *   @return An iterator to the past-the-end codepoint
391 |     *   @note Same as utf8_end()
392 |     */
393 |     UTF8iterator end() const noexcept;
394 | 
395 |     ~UTF8string() = default;
396 | };
397 | 
398 | 
399 | namespace std
400 | {
401 | // Hash functor specialization for UTF8string:
402 | // the call operator simply forwards to UTF8string::hash()
403 | template<>
404 | struct hash<UTF8string>
405 | {
406 |     size_t operator()( const UTF8string& u8str ) const
407 |     {
408 |         return u8str.hash();
409 |     }
410 | };
411 | 
412 | }
413 | 
414 | 
415 | /**
416 | *   @fn bool operator ==(const UTF8string& str1, const UTF8string& str2) noexcept
417 | *
418 | *   Check if two utf-8 strings are equal.
419 | *
420 | *   Two utf-8 strings are equal if and only if they have the same length
421 | *   and have the same sequence of codepoints.
422 | *
423 | *   @param str1 utf-8 string
424 | *   @param str2 utf-8 string
425 | *   @return TRUE if they are equal, FALSE otherwise
426 | */
427 | bool operator ==( const UTF8string& str1, const UTF8string& str2 ) noexcept;
428 | 
429 | /**
430 | *   @fn bool operator !=(const UTF8string& str1, const UTF8string& str2) noexcept
431 | *
432 | *   Check if two utf-8 strings are different.
433 | *
434 | *   @param str1 utf-8 string
435 | *   @param str2 utf-8 string
436 | *   @return TRUE if they are not equal, FALSE otherwise
437 | */
438 | bool operator !=( const UTF8string& str1, const UTF8string& str2 ) noexcept;
439 | 
440 | /**
441 | *   @fn bool operator <=(const UTF8string& str1, const UTF8string& str2) noexcept
442 | *
443 | *   Check if the first utf-8 string is lower than or equal to the second
444 | *   utf-8 string (lexicographic comparison of the underlying strings)
445 | *
446 | *   @param str1 utf-8 string
447 | *   @param str2 utf-8 string
448 | *   @return TRUE if the first string is lower or equal, FALSE otherwise
449 | */
450 | bool operator <=( const UTF8string& str1, const UTF8string& str2 ) noexcept;
451 | 
452 | /**
453 | *   @fn bool operator >=(const UTF8string& str1, const UTF8string& str2) noexcept
454 | *
455 | *   Check if the first utf-8 string is greater than or equal to the second one (lexicographic order)
456 | *
457 | *   @param str1 utf-8 string
458 | *   @param str2 utf-8 string
459 | *   @return TRUE if the first string is greater or equal, FALSE otherwise
460 | */
461 | bool operator >=( const UTF8string& str1, const UTF8string& str2 ) noexcept;
462 | 
463 | /**
464 | *   @fn bool operator <(const UTF8string& str1, const UTF8string& str2) noexcept
465 | *
466 | *   Check if the first utf-8 string is strictly lower
467 | *   than the second utf-8 string (lexicographic order)
468 | *
469 | *   @param str1 utf-8 string
470 | *   @param str2 utf-8 string
471 | *   @return TRUE if the first string is strictly lower, FALSE otherwise
472 | */
473 | bool operator <( const UTF8string& str1, const UTF8string& str2 ) noexcept;
474 | 
475 | /**
476 | *   @fn bool operator >(const UTF8string& str1, const UTF8string& str2) noexcept
477 | *
478 | *   Check if the first utf-8 string is strictly greater
479 | *   than the second utf-8 string (lexicographic order)
480 | *
481 | *   @param str1 utf-8 string
482 | *   @param str2 utf-8 string
483 | *   @return TRUE if the first string is strictly greater, FALSE otherwise
484 | */
485 | bool operator >( const UTF8string& str1, const UTF8string& str2 ) noexcept;
486 | 
487 | /**
488 | *   @fn UTF8string operator +(const UTF8string& str1, const UTF8string& str2)
489 | *
490 | *   Generate a string as a concatenation of the two utf-8 strings given in arguments
491 | *
492 | *   @param str1 utf-8 string
493 | *   @param str2 utf-8 string
494 | *   @return A new string whose value is the concatenation of str1 and str2
495 | */
496 | UTF8string operator +( const UTF8string& str1, const UTF8string& str2 );
497 | 
498 | /**
499 | *   @fn UTF8string operator +(const UTF8string& str1, const std::string& str2)
500 | *
501 | *   Generate a string as a concatenation of a utf-8 string and a string
502 | *   given in arguments
503 | *
504 | *   @param str1 utf-8 string
505 | *   @param str2 string
506 | *   @return A new string whose value is the concatenation of str1 and str2
507 | */
508 | UTF8string operator +( const UTF8string& str1, const std::string& str2 );
509 | 
510 | /**
511 | *   @fn UTF8string operator +(const std::string& str1, const UTF8string& str2)
512 | *
513 | *   Generate a string as a concatenation of a string and a utf-8 string
514 | *   given in arguments
515 | *
516 | *   @param str1 string
517 | *   @param str2 utf-8 string
518 | *   @return A new string whose value is the concatenation of str1 and str2
519 | */
520 | UTF8string operator +( const std::string& str1, const UTF8string& str2 );
521 | 
522 | /**
523 | *   @fn UTF8string operator +(const UTF8string& str1, const char * str2)
524 | *
525 | *   Generate a string as a concatenation of a utf-8 string and a C-string
526 | *   given in arguments
527 | *
528 | *   @param str1 utf-8 string
529 | *   @param str2 C-string
530 | *   @return A new string whose value is the concatenation of str1 and str2
531 | */
532 | UTF8string operator +( const UTF8string& str1, const char * str2 );
533 | 
534 | /**
535 | *   @fn UTF8string operator +(const char * str1, const UTF8string& str2)
536 | *
537 | *   Generate a string as a concatenation of a C-string and a utf-8 string
538 | *   given in arguments
539 | *
540 | *   @param str1 C-string
541 | *   @param str2 utf8 string
542 | *   @return A new string whose value is the concatenation of str1 and str2
543 | */
544 | UTF8string operator +( const char * str1, const UTF8string& str2 );
545 | 
546 | /**
547 | *   @fn std::ostream& operator <<(std::ostream& os, const UTF8string& str)
548 | *
549 | *   Insert a utf-8 string into a stream.
550 | *
551 | *   This function overloads *operator <<* to behave as described
552 | *   in *ostream::operator <<* for C-strings, but applied to utf-8 string objects.
553 | *
554 | *   @param os The output stream
555 | *   @param str utf8 string to put
556 | *   @return The same as parameter *os*
557 | */
558 | std::ostream& operator <<( std::ostream& os, const UTF8string& str );
559 | 
560 | /**
561 | *   @fn std::istream& operator >>(std::istream& is, UTF8string& str)
562 | *
563 | *   Extract a utf-8 string from a stream, storing the sequence in str,
564 | *   which is overwritten (the previous value of str is replaced).
565 | *
566 | *   This function overloads *operator >>* for utf-8 string objects: it reads
567 | *   a whole line of the stream (as std::getline does), not a single word.
568 | *
569 | *   @param is The input stream
570 | *   @param str utf8 string that receives the line
571 | *   @return The same as parameter *is*
572 | */
573 | std::istream& operator >>( std::istream& is, UTF8string& str );
574 | 
575 | #include "utf8_iterator.hpp"
576 | 
577 | #endif // UTF8_STRING_HPP_INCLUDED
578 | 


--------------------------------------------------------------------------------
/test/main.cpp:
--------------------------------------------------------------------------------
  1 | /*
  2 | *
  3 | *   Copyright © 2018 Luxon Jean-Pierre
  4 | *   https://gumichan01.github.io/
  5 | *
  6 | *   This library is under the MIT license
  7 | *
  8 | *   Luxon Jean-Pierre (Gumichan01)
  9 | *   luxon.jean.pierre@gmail.com
 10 | *
 11 | */
 12 | 
 13 | #include <string>
 14 | #include <cstring>
 15 | #include <stdexcept>
 16 | #include <algorithm>
 17 | #include <iostream>
 18 | #include <fstream>
 19 | 
 20 | #include "../src/utf8_string.hpp"
 21 | 
 22 | using namespace std;
 23 | 
 24 | int main()
 25 | {
 26 |     string jap1 = "ドロテ: すみません、ゆうびんきょくはどこですか。\n";
 27 |     string jap2 = "けいかん: ゆうびんきょくですか。このみちをまっすぐいってください。ひとつめのしんごうをみぎにまがってください。\n";
 28 |     string jap3 = "ドロテ: ひとつめのしんごうをみぎですね。\n";
 29 |     string jap4 = "けいかん: はい、それから、まっすぐいってください。ふたつめのかどにゆうびんきょくがあります。\n";
 30 |     string jap5 = "ドロテ: ふたつめのかどですね。わかりました。どうもありがとうございます。\n";
 31 |     string jap6 = "けいかん: いいえ、どういたしまして。";
 32 | 
 33 |     /// First test suite
 34 |     {
 35 |         UTF8string u8( "がんばつて Gumichan" );
 36 |         string utf8s( "がんばつて Gumichan" );
 37 | 
 38 |         {
 39 |             try
 40 |             {
 41 |                 UTF8string tmp( utf8s );
 42 |             }
 43 |             catch ( ... )
 44 |             {
 45 |                 return 100;
 46 |             }
 47 |         }
 48 | 
 49 |         // Test the test
 50 |         if ( u8 != u8 )
 51 |             return 1;
 52 | 
 53 |         // copy construtor
 54 |         {
 55 |             UTF8string hum( u8 );
 56 | 
 57 |             if ( hum != u8 )
 58 |                 return 101;
 59 |         }
 60 | 
 61 |         // move construtor
 62 |         {
 63 |             UTF8string dump( u8 );
 64 |             UTF8string hum2( std::move( dump ) );
 65 | 
 66 |             if ( hum2 != u8 )
 67 |                 return 102;
 68 | 
 69 |             cout << "dump = " << dump << "\n";
 70 |         }
 71 | 
 72 |         // assignment
 73 |         UTF8string uu8 = u8;
 74 | 
 75 |         if ( u8 != uu8 )
 76 |             return 2;
 77 | 
 78 |         // assignment
 79 |         const std::string& str1 = "がんばつて";
 80 |         UTF8string utf8 = str1;
 81 |         const std::string& str2 = utf8.utf8_str();
 82 | 
 83 |         if ( str1 != str2 )
 84 |             return 3;
 85 | 
 86 |         if ( str1 != utf8.utf8_sstring() )
 87 |             return 4;
 88 | 
 89 |         // move assignment
 90 |         {
 91 |             UTF8string hum3( "hello" );
 92 |             UTF8string dump( u8 );
 93 | 
 94 |             hum3 = std::move( dump );
 95 | 
 96 |             if ( hum3 != u8 )
 97 |                 return 104;
 98 | 
 99 |             cout << "dump = " << dump << "\n";
100 |         }
101 | 
102 |         // assignment (again)
103 |         std::string strg1 = "Gumi";
104 |         std::string strg2 = "chan";
105 |         const size_t sz1 = strg1.size();
106 |         const size_t sz2 = strg2.size();
107 |         const size_t len1 = strg1.length();
108 |         const size_t len2 = strg2.length();
109 |         UTF8string utf8_cat = strg1;
110 | 
111 |         if ( utf8_cat.utf8_size() != sz1 )
112 |         {
113 |             cerr << "ERROR : expected : " << sz1
114 |                  << "; got : " << utf8_cat.utf8_size() << "\n";
115 |             return 4;
116 |         }
117 | 
118 |         if ( utf8_cat.utf8_length() != len1 )
119 |         {
120 |             cerr << "ERROR : expected : " << len1
121 |                  << "; got : " << utf8_cat.utf8_length() << "\n";
122 |             return 5;
123 |         }
124 | 
125 |         utf8_cat += strg2;
126 | 
127 |         if ( utf8_cat.utf8_size() != ( sz1 + sz2 ) )
128 |         {
129 |             cerr << "ERROR : expected : " << ( sz1 + sz2 )
130 |                  << "; got : " << utf8_cat.utf8_size() << "\n";
131 |             return 6;
132 |         }
133 | 
134 |         if ( utf8_cat.utf8_length() != ( len1 + len2 ) )
135 |         {
136 |             cerr << "ERROR : expected : " << ( len1 + len2 )
137 |                  << "; got : " << utf8_cat.utf8_length() << "\n";
138 |             return 7;
139 |         }
140 | 
141 |         // utf8_assign: exercised with a C string, a std::string, a std::string sub-range and an rvalue UTF8string
142 |         {
143 |             const char * hello = "hello";
144 |             const std::string world( "world" );
145 |             const UTF8string chw = "Hello World!";
146 |             UTF8string hw = "Hello World!";
147 | 
148 |             UTF8string u8cstr;
149 |             UTF8string u8str;
150 |             UTF8string u8strp;
151 |             UTF8string u8s;
152 | 
153 |             u8cstr.utf8_assign( hello );
154 |             u8str.utf8_assign( world );
155 |             u8strp.utf8_assign( world, 0U, 2U );
156 |             u8s.utf8_assign( std::move( hw ) );
157 | 
158 |             if ( u8cstr.utf8_sstring() != std::string( hello ) )
159 |                 return 180;
160 | 
161 |             if ( u8str.utf8_sstring() != world )
162 |                 return 181;
163 | 
164 |             if ( u8strp.utf8_sstring() != world.substr( 0U, 2U ) )
165 |                 return 182;
166 | 
167 |             if ( u8s != chw ) // move-assigned value must equal the untouched copy
168 |                 return 183; // distinct code: 181 is already used by the std::string check above
169 | 
170 |             if ( !hw.utf8_empty() ) // the moved-from source is expected to be left empty
171 |                 return 184; // distinct code: 182 is already used by the sub-range check above
172 |         }
173 |     }
174 | 
175 |     /// Second test suite
176 |     {
177 |         // Extract the utf8 string
178 |         UTF8string utf( "がんばつて Gumichan" );
179 |         UTF8string sub1 = utf.utf8_substr( 6, 4 );
180 |         UTF8string sub2 = utf.utf8_substr( 0, 5 );
181 |         UTF8string sub3 = utf.utf8_substr( 64, 1024 );
182 |         UTF8string sub4( utf, 0, 5 );
183 |         UTF8string aexpected( "Gumi" );
184 |         UTF8string u8expected( "がんばつて" );
185 | 
186 |         if ( sub1 != aexpected )
187 |         {
188 |             cerr << "ERROR : expected : " << aexpected.utf8_str()
189 |                  << "; got :" << sub1.utf8_str() << "\n";
190 |             return 8;
191 |         }
192 | 
193 |         if ( sub2 != u8expected )
194 |         {
195 |             cerr << "ERROR : expected : " << u8expected.utf8_str()
196 |                  << "; got :" << sub2.utf8_str() << "\n";
197 |             return 9;
198 |         }
199 | 
200 |         if ( !sub3.utf8_empty() )
201 |         {
202 |             return 90;
203 |         }
204 | 
205 |         if ( sub4 != u8expected ) // sub4 was built with the ( utf, 0, 5 ) constructor
206 |         {
207 |             cerr << "ERROR : expected : " << u8expected.utf8_str()
208 |                  << "; got :" << sub4.utf8_str() << "\n"; // report sub4, the value actually under test
209 |             return 91;
210 |         }
211 | 
212 |         UTF8string sub = utf.utf8_substr(); // no arguments: the result is compared to the whole string below
213 |         if ( sub != utf )
214 |         {
215 |             cerr << "ERROR : expected : " << utf.utf8_str() // the reference value here is utf, not u8expected
216 |                  << "; got :" << sub.utf8_str() << "\n";
217 |             return 10;
218 |         }
219 | 
220 |         utf.utf8_clear();
221 |         if ( !utf.utf8_empty() )
222 |             return 11;
223 | 
224 |         UTF8string aaaa( "aaaa" );
225 |         UTF8string bbbb( "bbbb" );
226 |         UTF8string aaa( "aaaa" );
227 | 
228 |         if ( aaaa > bbbb )
229 |             return 12;
230 | 
231 |         if ( aaa > aaaa )
232 |             return 13;
233 | 
234 |         if ( aaaa > aaaa )
235 |             return 14;
236 | 
237 |         if ( aaaa < aaaa )
238 |             return 15;
239 | 
240 |         auto it_begin = aaaa.utf8_begin();
241 |         auto it_end = aaaa.utf8_end();
242 | 
243 |         if ( it_begin == it_end )
244 |             return 16;
245 | 
246 |         // Find the string
247 |         UTF8string str( "がんばつて! Gumichan" );
248 |         UTF8string ganbatsute( "がんばつて" );
249 |         UTF8string gumichan( "Gumichan" );
250 |         size_t lenstr0 = str.utf8_find( UTF8string( "a" ) );
251 |         size_t lenstr1 = str.utf8_find( ganbatsute );
252 |         size_t lenstr2 = str.utf8_find( gumichan, 2 );
253 |         size_t lenpos = str.utf8_find( ganbatsute, 1024 );
254 | 
255 |         UTF8string u8 = ( jap1 + jap2 + jap3 + jap4 + jap5 + jap6 );
256 |         UTF8string subjp( std::string( "いいえ、どういたしまして。" ) );
257 |         size_t res = u8.utf8_find( subjp );
258 | 
259 |         if ( lenstr0 != 13 )
260 |         {
261 |             cout << lenstr0 << " expected : 13 \n";
262 |             return 17;
263 |         }
264 | 
265 |         if ( lenstr1 != 0 )
266 |         {
267 |             cout << lenstr1 << " expected : 0 \n";
268 |             return 171;
269 |         }
270 | 
271 |         if ( UTF8string::npos != str.utf8_find( UTF8string( "" ) ) )
272 |         {
273 |             return 172;
274 |         }
275 | 
276 |         if ( lenstr2 != 7 )
277 |         {
278 |             cout << lenstr2 << " expected : 7" << "\n";
279 |             return 18;
280 |         }
281 | 
282 |         if ( lenpos != UTF8string::npos )
283 |         {
284 |             return 181;
285 |         }
286 | 
287 |         if ( res != ( u8.utf8_length() - subjp.utf8_length() ) )
288 |         {
289 |             return 182;
290 |         }
291 |     }
292 | 
293 |     // Invalid UTF-8 string test
294 |     {
295 |         /// 1-byte codepoint
296 |         try
297 |         {
298 |             // An invalid codepoint: 0x80 alone must be rejected
299 |             char inv1[] = {'\x80', '\x00'}; // NUL-terminated so the std::string conversion stops inside the array
300 |             string chstr = inv1;
301 |             UTF8string u8 = chstr;
302 | 
303 |             return 19;
304 |         }
305 |         catch ( const std::invalid_argument& ) {}
306 | 
307 |         /// 2-byte codepoint
308 |         // Bad start codepoint
309 |         try
310 |         {
311 |             char inv1[] = {'\xFF', '\x00'};
312 |             string chstr = inv1;
313 |             UTF8string u8 = chstr;
314 | 
315 |             return 20;
316 |         }
317 |         catch ( const std::invalid_argument& ) {}
318 | 
319 |         try
320 |         {
321 |             // 0xC2 is followed by a continuation byte > BF
322 |             char inv21[] = {'\xC2', '\xFE', '\x00'};
323 |             string chstr = inv21;
324 |             UTF8string u8 = chstr;
325 | 
326 |             return 21;
327 |         }
328 |         catch ( const std::invalid_argument& ) {}
329 | 
330 | 
331 |         try
332 |         {
333 |             // 0xC2 is followed by a continuation byte < 0x80
334 |             char inv22[] = {'\xC2', '\x7F', '\x00'};
335 |             string chstr = inv22;
336 |             UTF8string u8 = chstr;
337 | 
338 |             return 22;
339 |         }
340 |         catch ( const std::invalid_argument& ) {}
341 | 
342 | 
343 |         /// 3-byte codepoint
344 |         try
345 |         {
346 |             // 0xE0 has no continuation byte
347 |             char inv23[] = {'\xE0', '\x00'}; // NUL-terminated so the std::string conversion stops inside the array
348 |             string chstr = inv23;
349 |             UTF8string u8 = chstr;
350 | 
351 |             return 23;
352 |         }
353 |         catch ( const std::invalid_argument& ) {}
354 | 
355 |         try
356 |         {
357 |             // Invalid continuation byte (0xC0) after 0xE0
358 |             char inv24[] = {'\xE0', '\xA7', '\xC0', '\x00'};
359 |             string chstr = inv24;
360 |             UTF8string u8 = chstr;
361 | 
362 |             return 24;
363 |         }
364 |         catch ( const std::invalid_argument& ) {}
365 | 
366 |         try
367 |         {
368 |             // Invalid continuation byte (0xFF) after 0xE0
369 |             char inv25[] = {'\xE0', '\xFF', '\xFF', '\xFF', '\x00'}; // NUL terminator added: the array is read as a C string
370 |             string chstr = inv25;
371 |             UTF8string u8 = chstr;
372 | 
373 |             return 25;
374 |         }
375 |         catch ( const std::invalid_argument& ) {}
376 | 
377 |         try
378 |         {
379 |             // Invalid continuation byte (0x71) after 0xED
380 |             char inv26[] = {'\xED', '\x71', '\xA7', '\x00'};
381 |             string chstr = inv26;
382 |             UTF8string u8 = chstr;
383 | 
384 |             return 26;
385 |         }
386 |         catch ( const std::invalid_argument& ) {}
387 | 
388 |         try
389 |         {
390 |             // Invalid continuation byte (0xA0) after 0xED
391 |             char inv27[] = {'\xED', '\xA0', '\xFF', '\xFF', '\x00'}; // NUL terminator added: the array is read as a C string
392 |             string chstr = inv27;
393 |             UTF8string u8 = chstr;
394 | 
395 |             return 27;
396 |         }
397 |         catch ( const std::invalid_argument& ) {}
398 | 
399 |         try
400 |         {
401 |             // valid string
402 |             char inv28[] = {'\xE0', '\xA7', '\xA7', '\x00'};
403 |             string chstr = inv28;
404 |             UTF8string u8 = chstr;
405 | 
406 |         }
407 |         catch ( const std::invalid_argument& )
408 |         {
409 |             return 28;
410 |         }
411 | 
412 |         try
413 |         {
414 |             // 0xED has no continuation byte
415 |             char inv29[] = {'\xED', '\x00'}; // NUL-terminated so the std::string conversion stops inside the array
416 |             string chstr = inv29;
417 |             UTF8string u8 = chstr;
418 | 
419 |             return 29;
420 |         }
421 |         catch ( const std::invalid_argument& ) {}
422 | 
423 |         /// 4-byte codepoint
424 |         try
425 |         {
426 |             // 0xF0 has no continuation byte
427 |             char inv30[] = {'\xF0', '\x00'}; // NUL-terminated so the std::string conversion stops inside the array
428 |             string chstr = inv30;
429 |             UTF8string u8 = chstr;
430 | 
431 |             return 30;
432 |         }
433 |         catch ( const std::invalid_argument& ) {}
434 | 
435 |         try
436 |         {
437 |             // 0xF4 has no continuation byte
438 |             char inv31[] = {'\xF4', '\x00'}; // NUL-terminated so the std::string conversion stops inside the array
439 |             string chstr = inv31;
440 |             UTF8string u8 = chstr;
441 | 
442 |             return 31;
443 |         }
444 |         catch ( const std::invalid_argument& ) {}
445 | 
446 |         try
447 |         {
448 |             // Invalid continuation byte (0x90) after 0xF4
449 |             char inv32[] = {'\xF4', '\x90', '\x90', '\x90', '\x00'};
450 |             string chstr = inv32;
451 |             UTF8string u8 = chstr;
452 | 
453 |             return 32;
454 |         }
455 |         catch ( const std::invalid_argument& ) {}
456 | 
457 |         // With 0xF0 as the first byte of the codepoint
458 |         try
459 |         {
460 |             // Invalid continuation byte (0x8F) after 0xF0
461 |             char inv33[] = {'\xF0', '\x8F', '\x91', '\xB5', '\x00'};
462 |             string chstr = inv33;
463 |             UTF8string u8 = chstr;
464 | 
465 |             return 33;
466 |         }
467 |         catch ( const std::invalid_argument& ) {}
468 | 
469 |         try
470 |         {
471 |             // Invalid continuation byte (0xC7) after 0xF0
472 |             char inv34[] = {'\xF0', '\xC7', '\x91', '\xB5', '\x00'};
473 |             string chstr = inv34;
474 |             UTF8string u8 = chstr;
475 | 
476 |             return 34;
477 |         }
478 |         catch ( const std::invalid_argument& ) {}
479 | 
480 |         // With 0xF4 as the first byte of the codepoint
481 |         try
482 |         {
483 |             // Invalid continuation byte (0x7F) after 0xF4
484 |             char inv35[] = {'\xF4', '\x7F', '\x91', '\xB5', '\x00'};
485 |             string chstr = inv35;
486 |             UTF8string u8 = chstr;
487 | 
488 |             return 35;
489 |         }
490 |         catch ( const std::invalid_argument& ) {}
491 | 
492 |         try
493 |         {
494 |             // Invalid continuation byte (0x92) after 0xF4
495 |             char inv36[] = {'\xF4', '\x92', '\x91', '\xB5', '\x00'};
496 |             string chstr = inv36;
497 |             UTF8string u8 = chstr;
498 | 
499 |             return 36;
500 |         }
501 |         catch ( const std::invalid_argument& ) {}
502 | 
503 |         try
504 |         {
505 |             // Not enough bytes after the first codepoint byte
506 |             char inv37[] = {'\xF4', '\x92', '\x00'}; // NUL-terminated so the std::string conversion stops inside the array
507 |             string chstr = inv37;
508 |             UTF8string u8 = chstr;
509 | 
510 |             return 37;
511 |         }
512 |         catch ( const std::invalid_argument& ) {}
513 | 
514 |         try
515 |         {
516 |             // Not enough bytes after the first codepoint byte
517 |             char inv38[] = {'\xF4', '\x92', '\x91', '\x00'}; // NUL-terminated so the std::string conversion stops inside the array
518 |             string chstr = inv38;
519 |             UTF8string u8 = chstr;
520 | 
521 |             return 38;
522 |         }
523 |         catch ( const std::invalid_argument& ) {}
524 | 
525 | 
526 |         try
527 |         {
528 |             // Valid string: construction must not throw and the substring must be found
529 |             UTF8string u8 = ( jap1 + jap2 + jap3 + jap4 + jap5 + jap6 );
530 |             UTF8string u8sub = string( "わかりました" );
531 | 
532 |             if ( u8.utf8_find( u8sub ) == UTF8string::npos ) // same not-found constant the other utf8_find checks use
533 |             {
534 |                 return 40;
535 |             }
536 | 
537 |         }
538 |         catch ( const std::invalid_argument& )
539 |         {
540 |             return 41;
541 |         }
542 |     }
543 | 
544 |     // Concatenate strings
545 |     {
546 |         UTF8string strex1( "がんばつて Gumichan" );
547 |         UTF8string strex2( "Gumichanがんばつて " );
548 |         UTF8string ganba( "がんばつて " );
549 |         std::string gumi( "Gumichan" );
550 | 
551 |         const char * gumistr = "Gumichan";
552 | 
553 |         // concatenate std::string and UTF8string
554 |         if ( strex1 != ( ganba + gumi ) )
555 |             return 42;
556 | 
557 |         if ( strex2 != ( gumi + ganba ) )
558 |             return 43;
559 | 
560 |         if ( ( gumi.size() + ganba.utf8_size() ) !=
561 |                 ( gumi + ganba ).utf8_size() )
562 |             return 44;
563 | 
564 |         if ( ( ganba.utf8_size() + gumi.size() ) !=
565 |                 ( ganba + gumi ).utf8_size() )
566 |             return 45;
567 | 
568 |         // concatenate UTF8string and const char *
569 |         if ( strex2 != ( gumistr + ganba ) )
570 |             return 46;
571 | 
572 |         if ( strex1 != ( ganba + gumistr ) ) // UTF8string followed by const char *
573 |             return 47; // distinct code: 46 is already used by the const char * + UTF8string check
574 | 
575 |         if ( ( std::strlen( gumistr ) + ganba.utf8_size() ) !=
576 |                 ( gumistr + ganba ).utf8_size() )
577 |             return 48;
578 | 
579 | 
580 |         // concatenate 2 UTF8string objects
581 |         UTF8string gumiex( gumi );
582 |         if ( strex1 != ( ganba + gumiex ) )
583 |             return 49;
584 | 
585 |         if ( ( ganba.utf8_size() + gumiex.utf8_size() ) !=
586 |                 ( ganba + gumiex ).utf8_size() )
587 |             return 50;
588 | 
589 |         if ( ( ganba.utf8_length() + gumiex.utf8_length() ) !=
590 |                 ( ganba + gumiex ).utf8_length() )
591 |             return 51;
592 | 
593 |         if ( strex2 != ( gumiex + ganba ) )
594 |             return 52;
595 | 
596 |         if ( ( gumiex.utf8_size() + ganba.utf8_size() ) !=
597 |                 ( gumiex + ganba ).utf8_size() )
598 |             return 53;
599 | 
600 |         if ( ( gumiex.utf8_length() + ganba.utf8_length() ) !=
601 |                 ( gumiex + ganba ).utf8_length() )
602 |             return 54;
603 |     }
604 | 
605 |     // Append strings
606 |     {
607 |         UTF8string strex1( "Gumichan がんばつて" );
608 |         UTF8string strex2( "Gumichan がんばつて!" );
609 |         UTF8string gumi( "Gumichan" );
610 | 
611 |         gumi += " がんばつて";
612 | 
613 |         if ( gumi != strex1 )
614 |             return 55;
615 | 
616 |         strex1 += "!";
617 | 
618 |         if ( strex1 != strex2 )
619 |             return 56;
620 |     }
621 | 
622 |     // Get the codepoint at a specified position
623 |     {
624 |         UTF8string astr( "Gumichan" );
625 |         UTF8string str( "がんばつて Gumichan" );
626 |         std::string gcpoint = "G";
627 |         std::string ucpoint = "u";
628 |         std::string ncpoint = "n";
629 |         std::string tcpoint = "て";
630 | 
631 |         try
632 |         {
633 |             str.utf8_at( 42 );
634 |             return 57;
635 |         }
636 |         catch ( std::out_of_range& ) {}
637 | 
638 |         std::string c0 = astr[1];
639 |         std::string cc = astr[0];
640 |         std::string ccc = astr.utf8_at( astr.utf8_length() - 1 ); // last codepoint: utf8_at takes a codepoint index, so bound it by length, not byte size
641 |         std::string c1 = str[4];
642 |         std::string c2 = str[7];
643 | 
644 |         if ( ucpoint != c0 )
645 |         {
646 |             std::cout << "expected: " << ucpoint << "; got: " << c0 << "\n";
647 |             return 58;
648 |         }
649 | 
650 |         if ( gcpoint != cc )
651 |         {
652 |             std::cout << "expected: " << gcpoint << "; got: " << cc << "\n";
653 |             return 59;
654 |         }
655 | 
656 |         if ( ncpoint != ccc )
657 |         {
658 |             std::cout << "expected: " << ncpoint << ";got: " << ccc << "\n";
659 |             return 60;
660 |         }
661 | 
662 |         if ( tcpoint != c1 )
663 |         {
664 |             std::cout << "expected: " << tcpoint << ";got: " << c1 << "\n";
665 |             return 61;
666 |         }
667 | 
668 |         if ( ucpoint != c2 )
669 |         {
670 |             std::cout << "expected: " << ucpoint << ";got: " << c2 << "\n";
671 |             return 62;
672 |         }
673 | 
674 |         std::string point = "。";
675 |         UTF8string u8str( "łþø けいかん: いいえ、どういたしまして。" );
676 |         std::string c = u8str.utf8_at( 22 );
677 | 
678 |         if ( point != c )
679 |         {
680 |             std::cout << "expected: " << point << ";got : " << c << "\n";
681 |             return 63;
682 |         }
683 |     }
684 | 
685 |     // Reverse string using iterator
686 |     {
687 |         UTF8string utf8orig( "がんばつて Gumichan" );
688 |         UTF8string utf8str( "がんばつて Gumichan" );
689 | 
690 |         // Bijection
691 |         if ( utf8orig != ( utf8str.utf8_reverse() ).utf8_reverse() )
692 |         {
693 |             std::cout << "expected: " << utf8orig << "; got: "
694 |                       << ( utf8str.utf8_reverse() ).utf8_reverse() << "\n";
695 |             return 64;
696 |         }
697 | 
698 |         if ( utf8orig.utf8_size() != ( utf8str.utf8_reverse() ).utf8_size() )
699 |         {
700 |             std::cout << "expected: " << utf8orig.utf8_size() << "; got: "
701 |                       << ( utf8str.utf8_reverse() ).utf8_size() << "\n";
702 |             return 65;
703 |         }
704 | 
705 |         if ( utf8orig.utf8_length() != ( utf8str.utf8_reverse() ).utf8_length() )
706 |         {
707 |             std::cout << "expected: " << utf8orig.utf8_length() << "; got: "
708 |                       << ( utf8str.utf8_reverse() ).utf8_length() << "\n";
709 |             return 66;
710 |         }
711 |     }
712 | 
713 |     // Remove the last code point
714 |     {
715 |         UTF8string str( "がんばつて Gumichan" );
716 |         UTF8string str2( "がんばつて Gumichan がんばつて" );
717 |         UTF8string strpop( "がんばつて Gumicha" );
718 |         UTF8string strempty;
719 | 
720 |         str.utf8_pop();
721 | 
722 |         if ( str.utf8_length() != strpop.utf8_length() )
723 |         {
724 |             std::cout << "Excpected : " << strpop.utf8_length() << ";got  : "
725 |                       << str.utf8_length() << "\n";
726 |             return 67;
727 |         }
728 | 
729 |         if ( str != strpop )
730 |         {
731 |             std::cout << "Excpected : " << strpop << ";got  : "
732 |                       << str << "\n";
733 |             return 68;
734 |         }
735 | 
736 |         str.utf8_clear();
737 |         str = "がんばつて Gumichan がんばつて";
738 | 
739 |         str2.utf8_pop();
740 |         str2 += "て";
741 | 
742 |         if ( str2.utf8_length() != str.utf8_length() )
743 |         {
744 |             std::cout << "Excpected : " << str.utf8_length() << ";got  : "
745 |                       << str2.utf8_length() << "\n";
746 |             return 69;
747 |         }
748 | 
749 |         if ( str != str2 ) // str is the reference text; str2 is the popped-then-appended value under test
750 |         {
751 |             std::cout << "Excpected : " << str << ";got  : "
752 |                       << str2 << "\n";
753 |             return 70;
754 |         }
755 | 
756 |         try
757 |         {
758 |             strempty.utf8_pop();
759 |             return 71;
760 | 
761 |         }
762 |         catch ( ... ) {}
763 | 
764 |     }
765 | 
766 |     // Erase #1
767 |     {
768 |         UTF8string s = "This is an example";
769 |         cout << s << "\n";
770 | 
771 |         s.utf8_erase( 0, 5 ); // Erase "This "
772 |         cout << s << "\n";
773 | 
774 |         s.utf8_erase( s.utf8_begin() + 2 ); // Erase ' '
775 |         cout << s << "\n";
776 | 
777 |         s.utf8_erase( s.utf8_find( " " ) ); // Trim from ' ' to the end of the string
778 |         cout << s << "\n";
779 |     }
780 | 
781 |     // Erase #2
782 |     {
783 |         UTF8string empty;
784 |         UTF8string str1( "がんばつて Gumichan" );
785 |         UTF8string str2( "がんばつて Gumichan" );
786 |         UTF8string str3( "がんばつて Gumichan" );
787 |         UTF8string str4( "がんばつて Gumichan" );
788 |         UTF8string exp1( "がんばつて" );
789 |         UTF8string exp2( "Gumichan" );
790 |         UTF8string exp3( "がんばつてGumichan" );
791 | 
792 |         try
793 |         {
794 |             empty.utf8_erase();
795 |         }
796 |         catch ( ... )
797 |         {
798 |             return 80;
799 |         }
800 | 
801 |         try
802 |         {
803 |             empty.utf8_erase( 42, 1024 );
804 |             return 81;
805 |         }
806 |         catch ( const std::out_of_range& out )
807 |         {
808 |             cout << out.what() << '\n';
809 |         }
810 | 
811 |         str1.utf8_erase( 5 );
812 |         str2.utf8_erase( 0, 6 );
813 |         str3.utf8_erase( str3.utf8_begin() + 5 );
814 |         str4.utf8_erase( str4.utf8_begin(), str4.utf8_begin() + 6 );
815 | 
816 |         cout << str1 << "\n";
817 |         cout << str2 << "\n";
818 |         cout << str3 << "\n";
819 |         cout << str4 << "\n";
820 | 
821 |         if ( str1 != exp1 )
822 | 
823 |             return 76;
824 |         if ( str2 != exp2 )
825 |             return 77;
826 | 
827 |         if ( str3 != exp3 )
828 |             return 78;
829 | 
830 |         if ( str4 != exp2 )
831 |             return 79;
832 |     }
833 | 
834 |     {
835 |         UTF8string hello( "hello" );
836 |         UTF8string hello2( hello );
837 |         UTF8string hellom( "heLlo" );
838 | 
839 |         if ( hello.hash() != hello2.hash() )
840 |         {
841 |             cerr << "1 - invalid hash function\n";
842 |             return 82;
843 |         }
844 | 
845 |         if ( hello.hash() == hellom.hash() )
846 |         {
847 |             cerr << "2 - invalid hash function\n";
848 |             return 83;
849 |         }
850 | 
851 |         std::cout << "hash hello : " << std::hash<decltype( hello )>()( hello ) << "\n";
852 |         std::cout << "hash heLlo : " << std::hash<decltype( hellom )>()( hellom ) << "\n";
853 |     }
854 | 
855 |     // Last test : search for a substring in a file
856 |     {
857 |         UTF8string text;
858 |         UTF8string strreq;
859 |         UTF8string strgumi( "がんばつて gumichan01。" );
860 |         std::ifstream u8file( "test/lipsum.txt" );
861 | 
862 |         strreq += "速スご薄具そなラひ置更けゃっ文犬2社ぎル由人へいきつ回見ト供崩モ催屋エ国続セワルリ";
863 |         strreq += "謙髪テシ県住ざ新球ごくき名昨ツセ戸読役ホ細16態量番などトぱ。";
864 | 
865 |         cout << "Find those strings: \n" << strgumi << "\n\n AND \n\n"
866 |              << strreq  << "\n\n";
867 | 
868 |         if ( u8file.is_open() )
869 |         {
870 |             try
871 |             {
872 |                 text = std::string( std::istreambuf_iterator<char>( u8file ),
873 |                                     std::istreambuf_iterator<char>() );
874 |                 u8file.close();
875 |             }
876 |             catch ( std::invalid_argument& e )
877 |             {
878 |                 cerr << e.what() << "\n";
879 |                 u8file.close();
880 |                 return 72;
881 |             }
882 | 
883 |             cout << "File \n" << "name: lipsum.txt"
884 |                  << "\n" << "size: " << text.utf8_size()
885 |                  << "\n" << "Number of characters: "
886 |                  << text.utf8_length() << "\n\n";
887 | 
888 |             cout << "1 - Find - " << strgumi << "\n\n";
889 |             size_t pgumi = text.utf8_find( strgumi );
890 | 
891 |             if ( pgumi == UTF8string::npos )
892 |             {
893 |                 cerr << "The position of \"" << strgumi
894 |                      << "\" in the text must not be UTF8string::npos\n";
895 |                 return 73;
896 |             }
897 |             cout << "1 - string from position " << pgumi << ": \n\n";
898 |             const UTF8string& u8found0 = text.utf8_substr( pgumi, 8192U );
899 |             cout << u8found0 << "\n\n";
900 |         }
901 |         else
902 |         {
903 |             cerr << "File not found!\n";
904 |             return 74;
905 |         }
906 | 
907 |         cout << "2 - Find - " << strreq << "\n\n";
908 |         size_t pos = text.utf8_find( strreq );
909 | 
910 |         if ( pos == UTF8string::npos )
911 |         {
912 |             cerr << "utf8_find: " << strreq << " not found!\n";
913 |             return 75;
914 |         }
915 | 
916 |         cout << "string from position " << pos << ": \n\n";
917 |         const UTF8string& u8found = text.utf8_substr( pos, strreq.utf8_length() );
918 |         cout << u8found << "\n";
919 |     }
920 | 
921 |     return 0;
922 | }
923 | 


--------------------------------------------------------------------------------