├── .gitignore ├── .gitlab-ci.yml ├── .travis.yml ├── LICENSE ├── Makefile ├── README.md ├── src ├── utf8_iterator.cpp ├── utf8_iterator.hpp ├── utf8_string.cpp └── utf8_string.hpp └── test ├── lipsum.txt └── main.cpp /.gitignore: -------------------------------------------------------------------------------- 1 | # Compiled Object files 2 | *.slo 3 | *.lo 4 | *.o 5 | *.obj 6 | 7 | # Precompiled Headers 8 | *.gch 9 | *.pch 10 | 11 | # Compiled Dynamic libraries 12 | *.so 13 | *.dylib 14 | *.dll 15 | 16 | # Fortran module files 17 | *.mod 18 | 19 | # Compiled Static libraries 20 | *.lai 21 | *.la 22 | *.a 23 | *.lib 24 | 25 | # Executables 26 | *.exe 27 | *.out 28 | *.app 29 | 30 | # Codeblocks config files 31 | *.layout 32 | *.cbp 33 | *.depend 34 | 35 | # Others 36 | Doxy* 37 | html/ 38 | *.cppcheck 39 | -------------------------------------------------------------------------------- /.gitlab-ci.yml: -------------------------------------------------------------------------------- 1 | 2 | buildgcc: 3 | 4 | script: 5 | - make CC="g++" 6 | - ./utf8test -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: cpp 2 | 3 | compiler: 4 | - clang 5 | - g++ 6 | 7 | 8 | before_install: 9 | - if [ "$CC" == "g++" ]; then sudo add-apt-repository -y ppa:ubuntu-toolchain-r/test; fi 10 | - sudo apt-get update -qq 11 | 12 | install: 13 | - if [ "$CC" == "g++" ]; then sudo apt-get install -qq g++-4.9; fi 14 | - if [ "$CC" == "g++" ]; then sudo update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-4.9 50; fi 15 | - if [ "$CC" == "g++" ]; then export CC="g++-4.9"; fi 16 | 17 | script: 18 | - make 19 | - ./utf8test 20 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Gumichan01/utf8_string/4e677cd3d7986dc1406f3b50e64ecaec68dd6b88/LICENSE -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | # Makefile of UTFstring 2 | 3 | CC=g++ 4 | CFLAGS=-Wall -Wextra -g -Weffc++ -Wsign-conversion -Wconversion -std=c++11 5 | LFLAGS= 6 | 7 | SRC=./src/ 8 | TEST_SRC=./test/ 9 | TEST_MAIN=$(TEST_SRC)main.cpp 10 | TEST_EXE=utf8test 11 | 12 | UTF8_HEADER=$(SRC)utf8_string.hpp 13 | UTF8_SRC=$(SRC)utf8_string.cpp 14 | UTF8_ITER_HEADER=$(SRC)utf8_iterator.hpp 15 | UTF8_ITER_SRC=$(SRC)utf8_iterator.cpp 16 | 17 | UTF8_OBJ=utf8_string.o 18 | UTF8_ITER_OBJ=utf8_iterator.o 19 | TEST_OBJ=main.o 20 | OBJS=$(UTF8_OBJ) $(TEST_OBJ) $(UTF8_ITER_OBJ) 21 | 22 | all: test 23 | 24 | test: $(TEST_EXE) 25 | @echo $(TEST_EXE)" generated. " 26 | 27 | 28 | $(TEST_EXE) : $(OBJS) 29 | @echo $@" - Compiling..." 30 | $(CC) $(CFLAGS) -o $@ $^ $(LFLAGS) 31 | @echo $@" - done." 32 | 33 | $(UTF8_OBJ) : $(UTF8_SRC) $(UTF8_HEADER) 34 | @echo $<" -> "$@ 35 | $(CC) -c $(CFLAGS) -o $@ $< $(LFLAGS) 36 | @echo $<" -> "$@" done." 37 | 38 | $(UTF8_ITER_OBJ) : $(UTF8_ITER_SRC) $(UTF8_ITER_HEADER) 39 | @echo $<" -> "$@ 40 | $(CC) -c $(CFLAGS) -o $@ $< $(LFLAGS) 41 | @echo $<" -> "$@" done." 42 | 43 | 44 | $(TEST_OBJ) : $(TEST_MAIN) $(UTF8_HEADER) 45 | @echo $<" -> "$@ 46 | $(CC) -c $(CFLAGS) -o $@ $< $(LFLAGS) 47 | @echo $<" -> "$@" done." 
48 | 49 | mrproper: 50 | rm -f $(TEST_EXE) $(OBJS) 51 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # UTF-8 string # 2 | 3 | [![Build Status](https://travis-ci.org/Gumichan01/utf8_string.svg?branch=master)](https://travis-ci.org/Gumichan01/utf8_string) 4 | [![pipeline status](https://gitlab.com/Gumichan01/utf8_string/badges/master/pipeline.svg)](https://gitlab.com/Gumichan01/utf8_string/commits/master) 5 | 6 | This is a simple implementation of UTF-8 strings in C++. 7 | 8 | ## Implementation ## 9 | 10 | UTF8string is based on *std::string* provided by the standard C++ library 11 | but has been implemented to support UTF-8 encoded strings. 12 | 13 | Some functions have been adapted for utf8 strings : 14 | - utf8_length : get number of characters in a string (number of codepoints). 15 | - utf8_size : get the memory size of the string (in bytes). 16 | - utf8_find : find a utf8 substring in the current string. 17 | - utf8_substr : get a utf8 substring of the current string. 18 | - utf8_at : get the codepoint at a specified position. 19 | - utf8_pop : remove the last codepoint of the string. 20 | 21 | ## Usage ## 22 | 23 | You just need to include all of the ***.hpp*** and ***.cpp*** files from *src/* 24 | in your project. 
For each file that uses UTF8string, include this piece of code : 25 | 26 | #include "utf8_string.hpp" 27 | 28 | ## Code example ## 29 | 30 | ```cpp 31 | UTF8string u8("がんばつて Gumichan"); 32 | UTF8string sub = u8.utf8_substr(0,5); 33 | size_t pos = u8.utf8_find(UTF8string("chan")); 34 | size_t sz = u8.utf8_size(); 35 | size_t l = u8.utf8_length(); 36 | 37 | std::cout << "u8 string: " << u8 << "\n"; 38 | std::cout << "utf8 substring from 0 to 5: " << sub << "\n"; 39 | std::cout << "utf8 codepoint at 2: " << u8.utf8_at(2) << "\n"; 40 | std::cout << "utf8 string \"chan\" at " << pos << "\n"; 41 | std::cout << "u8 string - memory size: " << sz << "; length: " << l << "\n\n"; 42 | 43 | for (auto s: sub) // or for (const std::string& s: sub) 44 | { 45 | std::cout << "-> " << s << "\n"; 46 | } 47 | 48 | ``` 49 | 50 | Output : 51 | 52 | ``` 53 | u8 string: がんばつて Gumichan 54 | utf8 substring from 0 to 5: がんばつて 55 | utf8 codepoint at 2: ば 56 | utf8 string "chan" at 10 57 | u8 string - memory size: 24; length: 14 58 | 59 | -> が 60 | -> ん 61 | -> ば 62 | -> つ 63 | -> て 64 | 65 | ``` 66 | 67 | ## Projects that use UTF8string ## 68 | 69 | - [LunatiX](https://github.com/Gumichan01/lunatix) 70 | 71 | ## License ## 72 | 73 | This library is under the MIT License. 
74 | -------------------------------------------------------------------------------- /src/utf8_iterator.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * 3 | * Copyright © 2018 Luxon Jean-Pierre 4 | * https://gumichan01.github.io/ 5 | * 6 | * This library is under the MIT license 7 | * 8 | * Luxon Jean-Pierre (Gumichan01) 9 | * luxon.jean.pierre@gmail.com 10 | * 11 | */ 12 | 13 | #include "utf8_string.hpp" 14 | 15 | #include 16 | 17 | 18 | UTF8iterator::UTF8iterator( const UTF8string& u ) noexcept 19 | : _index( 0 ), _data( u ) {} 20 | 21 | UTF8iterator::UTF8iterator( const UTF8iterator& it ) noexcept 22 | : _index( it._index ), _data( it._data ) {} 23 | 24 | 25 | UTF8iterator& UTF8iterator::operator =( const UTF8iterator& it ) noexcept 26 | { 27 | _data = it._data; 28 | _index = it._index; 29 | return *this; 30 | } 31 | 32 | 33 | UTF8iterator& UTF8iterator::operator ++() noexcept 34 | { 35 | if ( _index < _data.utf8_length() ) 36 | _index += 1; 37 | 38 | return *this; 39 | } 40 | 41 | 42 | UTF8iterator UTF8iterator::operator ++( int ) noexcept 43 | { 44 | UTF8iterator oldit( *this ); 45 | 46 | if ( _index < _data.utf8_length() ) 47 | _index += 1; 48 | 49 | return oldit; 50 | } 51 | 52 | 53 | UTF8iterator& UTF8iterator::operator --() noexcept 54 | { 55 | if ( _index > 0 ) 56 | _index -= 1; 57 | 58 | return *this; 59 | } 60 | 61 | 62 | UTF8iterator UTF8iterator::operator --( int ) noexcept 63 | { 64 | UTF8iterator oldit( *this ); 65 | 66 | if ( _index > 0 ) 67 | _index -= 1; 68 | 69 | return oldit; 70 | } 71 | 72 | 73 | bool UTF8iterator::operator ==( const UTF8iterator& it ) const noexcept 74 | { 75 | return ( _data == it._data ) && ( _index == it._index ); 76 | } 77 | 78 | 79 | bool UTF8iterator::operator !=( const UTF8iterator& it ) const noexcept 80 | { 81 | return !( *this == it ); 82 | } 83 | 84 | 85 | bool UTF8iterator::operator <( const UTF8iterator& it ) const noexcept 86 | { 87 | return ( _data == it._data 
) && ( _index < it._index ); 88 | } 89 | 90 | bool UTF8iterator::operator >( const UTF8iterator& it ) const noexcept 91 | { 92 | return ( _data == it._data ) && ( _index > it._index ); 93 | } 94 | 95 | bool UTF8iterator::operator <=( const UTF8iterator& it ) const noexcept 96 | { 97 | return ( _data == it._data ) && ( _index <= it._index ); 98 | } 99 | 100 | bool UTF8iterator::operator >=( const UTF8iterator& it ) const noexcept 101 | { 102 | return ( _data == it._data ) && ( _index >= it._index ); 103 | } 104 | 105 | const UTF8string::u8char UTF8iterator::operator *() const 106 | { 107 | return _data.utf8_at( _index ); 108 | } 109 | 110 | 111 | UTF8iterator UTF8iterator::operator +( const size_t n ) const noexcept 112 | { 113 | UTF8iterator newit( *this ); 114 | const size_t U8LEN = newit._data.utf8_length(); 115 | 116 | if ( newit._index + n < U8LEN ) 117 | newit._index += n; 118 | else 119 | newit._index = U8LEN; 120 | 121 | return newit; 122 | } 123 | 124 | 125 | UTF8iterator UTF8iterator::operator -( const size_t n ) const noexcept 126 | { 127 | UTF8iterator newit( *this ); 128 | 129 | if ( newit._index >= n ) 130 | newit._index -= n; 131 | else 132 | newit._index = 0; 133 | 134 | return newit; 135 | } 136 | 137 | long UTF8iterator::operator -( const UTF8iterator& it ) const 138 | { 139 | if ( _data != it._data ) 140 | throw std::invalid_argument( "iterators don't point to the same data" ); 141 | 142 | return static_cast( _index ) - static_cast( it._index ); 143 | } 144 | -------------------------------------------------------------------------------- /src/utf8_iterator.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | * 3 | * Copyright © 2018 Luxon Jean-Pierre 4 | * https://gumichan01.github.io/ 5 | * 6 | * This library is under the MIT license 7 | * 8 | * Luxon Jean-Pierre (Gumichan01) 9 | * luxon.jean.pierre@gmail.com 10 | * 11 | */ 12 | 13 | #ifndef UTF8_ITERATOR_HPP_INCLUDED 14 | #define 
UTF8_ITERATOR_HPP_INCLUDED 15 | 16 | /** 17 | * @file utf8_iterator.hpp 18 | * @brief This is a UTF-8 string library header 19 | */ 20 | 21 | class UTF8string; 22 | 23 | 24 | /** 25 | * @class UTF8iterator final 26 | * @brief Iterator on UTF8 string 27 | * 28 | * This class defines the iterator of UTF-8 string 29 | */ 30 | class UTF8iterator final 31 | { 32 | size_t _index = 0; 33 | UTF8string _data; 34 | 35 | char& operator ->() = delete; 36 | 37 | public: 38 | 39 | UTF8iterator() = delete; 40 | 41 | /** 42 | * @fn explicit UTF8iterator(const UTF8string& u) noexcept 43 | * Build an iterator object using a UTF8string object 44 | * @param u utf-8 string 45 | */ 46 | explicit UTF8iterator( const UTF8string& u ) noexcept; 47 | 48 | /** 49 | * @fn UTF8iterator(const UTF8iterator& it) noexcept 50 | * @param it The iterator to copy 51 | */ 52 | UTF8iterator( const UTF8iterator& it ) noexcept; 53 | 54 | /** 55 | * @fn UTF8iterator& operator ++() noexcept 56 | * Prefix incrementation 57 | * @return The same iterator, but it has moved forward 58 | */ 59 | UTF8iterator& operator ++() noexcept; 60 | 61 | /** 62 | * @fn UTF8iterator& operator ++(int) noexcept 63 | * 64 | * Postfix incrementation 65 | * 66 | * @return The same iterator before it has moved forward 67 | */ 68 | UTF8iterator operator ++( int ) noexcept; 69 | 70 | /** 71 | * @fn UTF8iterator& operator --() noexcept 72 | * Prefix derementation 73 | * @return The same iterator, but it has moved backward 74 | */ 75 | UTF8iterator& operator --() noexcept; 76 | 77 | /** 78 | * @fn UTF8iterator operator --(int) noexcept 79 | * 80 | * Postfix decrementation 81 | * 82 | * @return The same iterator before it has moved backward 83 | */ 84 | UTF8iterator operator --( int ) noexcept; 85 | 86 | /** 87 | * @fn UTF8iterator& operator =(const UTF8iterator& it) noexcept 88 | * Asignement 89 | * @param it The iterator that wille be assigned 90 | * @return The same iterator as the argument 91 | */ 92 | UTF8iterator& operator =( const 
UTF8iterator& it ) noexcept; 93 | 94 | /** 95 | * @fn bool operator ==(const UTF8iterator& it) const noexcept 96 | * 97 | * Check if the current iterator is pointing to the same position as 98 | * the iterator given in argument equals. 99 | * 100 | * @param it The iterator to compare with 101 | * @return TRUE if they are pointing to the same position, FALSE otherwise 102 | */ 103 | bool operator ==( const UTF8iterator& it ) const noexcept; 104 | /** 105 | * @fn bool operator !=(const UTF8iterator& it) const noexcept 106 | * 107 | * Check if the current iterator is pointing to a different position 108 | * from the iterator given in argument equals. 109 | * 110 | * @param it The iterator to compare with 111 | * @return TRUE if they are not pointing to the same position, 112 | * FALSE otherwise 113 | */ 114 | bool operator !=( const UTF8iterator& it ) const noexcept; 115 | /** 116 | * @fn bool operator <(const UTF8iterator& it) const noexcept 117 | * @param it The iterator to compare with 118 | * @return TRUE if the position of it is greater than *this, FALSE otherwise 119 | */ 120 | bool operator <( const UTF8iterator& it ) const noexcept; 121 | /** 122 | * @fn bool operator >(const UTF8iterator& it) const noexcept 123 | * @param it The iterator to compare with 124 | * @return TRUE if the position of it is less than *this, FALSE otherwise 125 | */ 126 | bool operator >( const UTF8iterator& it ) const noexcept; 127 | /** 128 | * @fn bool operator <=(const UTF8iterator& it) const noexcept 129 | * @param it The iterator to compare with 130 | * @return TRUE if the position of it is greater than or equal *this, FALSE otherwise 131 | */ 132 | bool operator <=( const UTF8iterator& it ) const noexcept; 133 | /** 134 | * @fn bool operator >=(const UTF8iterator& it) const noexcept 135 | * @param it The iterator to compare with 136 | * @return TRUE if the position of it is less than or equal to *this, FALSE otherwise 137 | */ 138 | bool operator >=( const UTF8iterator& it ) 
const noexcept; 139 | 140 | /** 141 | * @fn UTF8iterator operator +(const size_t n) const noexcept 142 | * 143 | * Returns an iterator which has been moved n positions forward 144 | * 145 | * @param n the number of step to move forward 146 | * @return The same iterator that moved forward 147 | */ 148 | UTF8iterator operator +( const size_t n ) const noexcept; 149 | /** 150 | * @fn UTF8iterator operator -(const size_t n) const noexcept 151 | * 152 | * Returns an iterator which has been moved n positions backward 153 | * 154 | * @param n the number of steps to move backward 155 | * @return The same iterator that moved backward 156 | */ 157 | UTF8iterator operator -( const size_t n ) const noexcept; 158 | /** 159 | * @fn long operator -(const UTF8iterator& it) const 160 | * 161 | * Return the difference value between *this and it 162 | * 163 | * @param it 164 | * @return A long value *n* such that it + n = *this 165 | * @pre *this and it points to the same data 166 | * @post *this == it + (*this - it) 167 | * @exception std::invalid_argument if the pre-condition is not satisfied 168 | */ 169 | long operator -( const UTF8iterator& it ) const; 170 | 171 | /** 172 | * @fn const UTF8string::u8char operator *() const 173 | * 174 | * Dereferences the pointer returning the codepoint 175 | * pointed by the iterator at its current potision 176 | * 177 | * @return The codepoint 178 | * @note This function will throw an *std::out_of_range* exception 179 | * if the iterator does not point to a codepoint 180 | */ 181 | const UTF8string::u8char operator *() const; 182 | 183 | ~UTF8iterator() = default; 184 | }; 185 | 186 | #endif // UTF8_ITERATOR_HPP_INCLUDED 187 | -------------------------------------------------------------------------------- /src/utf8_string.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * 3 | * Copyright © 2018 Luxon Jean-Pierre 4 | * https://gumichan01.github.io/ 5 | * 6 | * This library is under the MIT 
license 7 | * 8 | * Luxon Jean-Pierre (Gumichan01) 9 | * luxon.jean.pierre@gmail.com 10 | * 11 | */ 12 | 13 | #include "utf8_string.hpp" 14 | 15 | #include 16 | #include 17 | 18 | 19 | namespace 20 | { 21 | 22 | constexpr size_t min( size_t a, size_t b ) 23 | { 24 | return a < b ? a : b; 25 | } 26 | 27 | inline std::basic_string toUstring( const std::string& str ) 28 | { 29 | return std::basic_string( str.begin(), str.end() ); 30 | } 31 | 32 | // Used in utf8_find 33 | void preprocess( const UTF8string& str, 34 | std::unordered_map& u8map ) noexcept 35 | { 36 | const size_t U8LEN = str.utf8_length(); 37 | 38 | // Preprocessing 39 | if ( U8LEN > 1 ) 40 | { 41 | for ( size_t i = U8LEN - 2U; ; --i ) 42 | { 43 | const UTF8string::u8char& s = str[i]; 44 | 45 | if ( u8map.find( s ) == u8map.end() ) 46 | u8map[s] = U8LEN - 1 - i; 47 | 48 | if ( i == 0 ) 49 | break; 50 | } 51 | } 52 | } 53 | 54 | } 55 | 56 | 57 | UTF8string::UTF8string( const char * str ) 58 | : UTF8string( std::string( str ) ) {} 59 | 60 | 61 | UTF8string::UTF8string( const std::string& str ) 62 | : _utf8string( str ) 63 | { 64 | if ( !utf8_is_valid_() ) 65 | throw std::invalid_argument( "Invalid UTF-8 string\n" ); 66 | 67 | _utf8length = utf8_length_(); 68 | } 69 | 70 | 71 | UTF8string::UTF8string( const UTF8string& u8str ) noexcept 72 | : _utf8string( u8str._utf8string ), _utf8length( u8str._utf8length ) {} 73 | 74 | UTF8string::UTF8string( const UTF8string& u8str, size_t pos, size_t len ) noexcept 75 | : UTF8string( u8str.utf8_substr( pos, len ) ) {} 76 | 77 | UTF8string::UTF8string( UTF8string&& u8str ) noexcept 78 | : _utf8string( u8str._utf8string ), _utf8length( u8str._utf8length ) 79 | { 80 | u8str.utf8_clear(); 81 | u8str._utf8string.shrink_to_fit(); 82 | } 83 | 84 | UTF8string& UTF8string::operator =( const char * str ) 85 | { 86 | return utf8_assign( str ); 87 | } 88 | 89 | 90 | UTF8string& UTF8string::operator =( const std::string& str ) 91 | { 92 | return utf8_assign( str ); 93 | } 94 | 95 | 
96 | UTF8string& UTF8string::operator =( const UTF8string& u8str ) noexcept 97 | { 98 | _utf8string = u8str._utf8string; 99 | _utf8length = u8str._utf8length; 100 | return *this; 101 | } 102 | 103 | UTF8string& UTF8string::operator =( UTF8string&& u8str ) noexcept 104 | { 105 | return utf8_assign( std::move( u8str ) ); 106 | } 107 | 108 | const UTF8string& UTF8string::operator +=( const std::string& str ) 109 | { 110 | const UTF8string::u8string BACKUP = _utf8string; 111 | _utf8string += str; 112 | 113 | if ( !utf8_is_valid_() ) 114 | { 115 | _utf8string = BACKUP; 116 | throw std::invalid_argument( "Invalid UTF-8 string\n" ); 117 | } 118 | 119 | _utf8length = utf8_length_(); 120 | return *this; 121 | } 122 | 123 | 124 | const UTF8string& UTF8string::operator +=( const UTF8string& u8str ) 125 | { 126 | _utf8string += u8str._utf8string; 127 | _utf8length = utf8_length_(); 128 | return *this; 129 | } 130 | 131 | 132 | const UTF8string& UTF8string::operator +=( const char * str ) 133 | { 134 | UTF8string::u8string BACKUP = _utf8string; 135 | _utf8string += std::move( UTF8string::u8string( str ) ); 136 | 137 | if ( !utf8_is_valid_() ) 138 | { 139 | _utf8string = BACKUP; 140 | throw std::invalid_argument( "Invalid UTF-8 string\n" ); 141 | } 142 | 143 | _utf8length = utf8_length_(); 144 | return *this; 145 | } 146 | 147 | 148 | bool UTF8string::utf8_is_valid_() const noexcept 149 | { 150 | const std::basic_string U8STRING = toUstring( _utf8string ); 151 | auto it = U8STRING.begin(); 152 | const auto ITEND = U8STRING.cend(); 153 | 154 | while ( it < ITEND ) 155 | { 156 | if ( ( 0xF8 & *it ) == 0xF0 && *it <= 0xF4 ) 157 | { 158 | // The UTF-8 codepoint begin with 0b11110xxx -> 4-byte codepoint 159 | // If the iterator reach the end of the string before the 160 | // end of the 4-byte codepoint -> invalid string 161 | if ( ( it + 1 ) == ITEND || ( it + 2 ) == ITEND || ( it + 3 ) == ITEND ) 162 | return false; 163 | 164 | // Each of the following bytes is a value 165 | // 
between 0x80 and 0xBF 166 | if ( ( ( 0xC0 & *( it + 1 ) ) != 0x80 ) || ( ( 0xC0 & *( it + 2 ) ) != 0x80 ) 167 | || ( ( 0xC0 & *( it + 3 ) ) != 0x80 ) ) 168 | { 169 | return false; 170 | } 171 | 172 | // If the first byte of the sequence is 0xF0 173 | // then the first continuation byte must be between 0x90 and 0xBF 174 | // otherwise, if the byte is 0xF4 175 | // then the first continuation byte must be between 0x80 and 0x8F 176 | if ( *it == 0xF0 ) 177 | { 178 | if ( *( it + 1 ) < 0x90 || *( it + 1 ) > 0xBF ) 179 | return false; 180 | } 181 | else if ( *it == 0xF4 ) 182 | { 183 | if ( *( it + 1 ) < 0x80 || *( it + 1 ) > 0x8F ) 184 | return false; 185 | } 186 | 187 | it += 4; // Jump to the next codepoint 188 | } 189 | else if ( ( 0xF0 & *it ) == 0xE0 ) 190 | { 191 | // The UTF-8 codepoint begin with 0b1110xxxx -> 3-byte codepoint 192 | if ( ( it + 1 ) == ITEND || ( it + 2 ) == ITEND ) 193 | return false; 194 | 195 | // Each of the following bytes starts with 196 | // 0b10xxxxxx in a valid string 197 | if ( ( ( 0xC0 & *( it + 1 ) ) != 0x80 ) || ( ( 0xC0 & *( it + 2 ) ) != 0x80 ) ) 198 | return false; 199 | 200 | // If the first byte of the sequence is 0xE0 201 | // then the first continuation byte must be between 0xA0 and 0xBF 202 | // otherwise, if the byte is 0xF4 203 | // then the first continuation byte must be between 0x80 and 0x9F 204 | if ( *it == 0xE0 ) 205 | { 206 | if ( *( it + 1 ) < 0xA0 || *( it + 1 ) > 0xBF ) 207 | return false; 208 | } 209 | else if ( *it == 0xED ) 210 | { 211 | if ( *( it + 1 ) > 0x9F ) 212 | return false; 213 | } 214 | 215 | it += 3; 216 | } 217 | else if ( ( 0xE0 & *it ) == 0xC0 ) 218 | { 219 | // The UTF-8 codepoint begin with 0b110xxxxx -> 2-byte codepoint 220 | if ( ( it + 1 ) == ITEND ) 221 | return false; 222 | 223 | // The following byte starts with 0b10xxxxxx in a valid string 224 | if ( ( 0xC0 & *( it + 1 ) ) != 0x80 ) 225 | return false; 226 | 227 | it += 2; 228 | } 229 | else if ( ( 0x80 & *it ) == 0x00 ) 230 | { 231 | // 
The UTF-8 codepoint begin with 0b0xxxxxxx -> 1-byte codepoint 232 | it += 1; 233 | } 234 | else 235 | { 236 | // Invalid codepoint 237 | return false; 238 | } 239 | } 240 | 241 | return true; 242 | } 243 | 244 | // Compute the length of the utf-8 string (in number of codepoints) 245 | size_t UTF8string::utf8_length_() const noexcept 246 | { 247 | auto end_data = _utf8string.end(); 248 | auto it = _utf8string.begin(); 249 | size_t len = 0; 250 | 251 | while ( it != end_data ) 252 | { 253 | byte_t byte = static_cast( *it ); 254 | 255 | if ( 0xf0 == ( 0xf8 & byte ) ) 256 | { 257 | // 4-byte utf8 character 258 | // (0b11110xxx 0bxxxxxxxx 0bxxxxxxxx 0bxxxxxxxx) 259 | it += 4; 260 | } 261 | else if ( 0xe0 == ( 0xf0 & byte ) ) 262 | { 263 | // 3-byte utf8 code point (0b110xxxxx 0bxxxxxxxx 0bxxxxxxxx) 264 | it += 3; 265 | } 266 | else if ( 0xc0 == ( 0xe0 & byte ) ) 267 | { 268 | // 2-byte utf8 code point (0b110xxxxx 0bxxxxxxxx) 269 | it += 2; 270 | } 271 | else 272 | { 273 | // 1-byte utf8 code point (0b0xxxxxxx) 274 | it += 1; 275 | } 276 | 277 | // We want the number of characters (utf-8 code point) 278 | len += 1; 279 | } 280 | 281 | return len; 282 | } 283 | 284 | // Compute the memory size of a codepoint in the string (in byte) 285 | size_t UTF8string::utf8_codepoint_len_( const size_t j ) const noexcept 286 | { 287 | if ( 0xf0 == ( 0xf8 & _utf8string[j] ) ) 288 | { 289 | return 4; 290 | } 291 | else if ( 0xe0 == ( 0xf0 & _utf8string[j] ) ) 292 | { 293 | return 3; 294 | } 295 | else if ( 0xc0 == ( 0xe0 & _utf8string[j] ) ) 296 | { 297 | return 2; 298 | } 299 | else 300 | return 1; 301 | } 302 | 303 | 304 | void UTF8string::utf8_clear() noexcept 305 | { 306 | _utf8string.clear(); 307 | _utf8length = 0; 308 | } 309 | 310 | 311 | bool UTF8string::utf8_empty() const noexcept 312 | { 313 | return _utf8length == 0; 314 | } 315 | 316 | 317 | UTF8string& UTF8string::utf8_assign( const char * str ) 318 | { 319 | const UTF8string::u8string BACKUP = _utf8string; 320 | 
_utf8string = std::string( str ); 321 | 322 | if ( !utf8_is_valid_() ) 323 | { 324 | _utf8string = BACKUP; 325 | throw std::invalid_argument( "Invalid UTF-8 string\n" ); 326 | } 327 | 328 | _utf8length = utf8_length_(); 329 | return *this; 330 | } 331 | 332 | UTF8string& UTF8string::utf8_assign( const u8string& str ) 333 | { 334 | const UTF8string::u8string BACKUP = _utf8string; 335 | _utf8string = str; 336 | 337 | if ( !utf8_is_valid_() ) 338 | { 339 | _utf8string = BACKUP; 340 | throw std::invalid_argument( "Invalid UTF-8 string\n" ); 341 | } 342 | 343 | _utf8length = utf8_length_(); 344 | return *this; 345 | } 346 | 347 | UTF8string& UTF8string::utf8_assign( const u8string& str, size_t pos, size_t count ) 348 | { 349 | const UTF8string::u8string BACKUP = _utf8string; 350 | _utf8string = str.substr( pos, count ); 351 | 352 | if ( !utf8_is_valid_() ) 353 | { 354 | _utf8string = BACKUP; 355 | throw std::invalid_argument( "Invalid UTF-8 string\n" ); 356 | } 357 | 358 | _utf8length = utf8_length_(); 359 | return *this; 360 | } 361 | 362 | UTF8string& UTF8string::utf8_assign( UTF8string&& u8str ) noexcept 363 | { 364 | _utf8string = u8str._utf8string; 365 | _utf8length = u8str._utf8length; 366 | 367 | u8str.utf8_clear(); 368 | u8str._utf8string.shrink_to_fit(); 369 | 370 | return *this; 371 | } 372 | 373 | 374 | /* 375 | Get the memory position of a codepoint according 376 | to its position in the utf-8 string 377 | */ 378 | size_t UTF8string::utf8_bpos_at_( const size_t cpos ) const noexcept 379 | { 380 | size_t bpos = 0; 381 | const size_t U8SIZE = utf8_size(); 382 | 383 | for ( size_t i = 0; bpos < U8SIZE && i < cpos; i++ ) 384 | { 385 | bpos += utf8_codepoint_len_( bpos ); 386 | } 387 | return bpos; 388 | } 389 | 390 | 391 | UTF8string::u8string UTF8string::utf8_at_( const size_t index ) const noexcept 392 | { 393 | size_t bpos = utf8_bpos_at_( index ); 394 | return _utf8string.substr( bpos, utf8_codepoint_len_( bpos ) ); 395 | } 396 | 397 | 398 | 
UTF8string::u8char UTF8string::utf8_at( const size_t index ) const 399 | { 400 | if ( index >= _utf8length ) 401 | throw std::out_of_range( "index value greater than the size of the string" ); 402 | 403 | return utf8_at_( index ); 404 | } 405 | 406 | 407 | UTF8string::u8char UTF8string::operator []( const size_t index ) const noexcept 408 | { 409 | return utf8_at_( index ); 410 | } 411 | 412 | 413 | void UTF8string::utf8_pop() 414 | { 415 | if ( _utf8length == 0 ) 416 | throw std::length_error( "Cannot remove the last element from an empty string" ); 417 | 418 | size_t bpos = utf8_bpos_at_( _utf8length - 1 ); 419 | _utf8string.erase( bpos ); 420 | _utf8length -= 1; 421 | } 422 | 423 | UTF8string& UTF8string::utf8_erase( const size_t index, const size_t count ) 424 | { 425 | if ( index > _utf8length ) 426 | throw std::out_of_range( "utf8_range - index out of range" ); 427 | 428 | const size_t COUNT = min( count, _utf8length - index ); 429 | 430 | if ( _utf8length == 0 || COUNT == 0 ) 431 | return *this; 432 | 433 | const size_t BFIRST = utf8_bpos_at_( index ); 434 | const size_t BLAST = utf8_bpos_at_( index + COUNT ); 435 | const size_t N = _utf8string.size(); 436 | u8string u8s; 437 | 438 | for ( size_t i = 0U; i < N; ++i ) 439 | { 440 | if ( i < BFIRST || i > BLAST - 1 ) 441 | u8s += _utf8string[i]; 442 | } 443 | 444 | _utf8string = u8s; 445 | _utf8length = utf8_length_(); 446 | return *this; 447 | } 448 | 449 | UTF8iterator UTF8string::utf8_erase( const UTF8iterator& position ) 450 | { 451 | if ( position == utf8_end() ) 452 | return utf8_end(); 453 | 454 | if ( position == utf8_end() - 1 ) 455 | { 456 | utf8_pop(); 457 | return utf8_end(); 458 | } 459 | 460 | const size_t d = static_cast( position - utf8_begin() ); 461 | utf8_erase( d, 1U ); 462 | return utf8_begin() + d; 463 | } 464 | 465 | UTF8iterator UTF8string::utf8_erase( const UTF8iterator& first, const UTF8iterator& last ) 466 | { 467 | if ( first == last ) 468 | return utf8_end(); 469 | 470 | if ( first 
== utf8_begin() && last == utf8_end() ) 471 | { 472 | utf8_clear(); 473 | return utf8_end(); 474 | } 475 | 476 | const UTF8iterator& REAL_FIRST = first < last ? first : last; 477 | const UTF8iterator& REAL_LAST = first < last ? last : first; 478 | 479 | const size_t INDEX = static_cast( REAL_FIRST - utf8_begin() ); 480 | const size_t COUNT = static_cast( REAL_LAST - REAL_FIRST ); 481 | utf8_erase( INDEX, COUNT ); 482 | 483 | return utf8_begin() + INDEX; 484 | } 485 | 486 | UTF8string UTF8string::utf8_substr( size_t pos, size_t len ) const 487 | { 488 | if ( pos > _utf8length ) 489 | return UTF8string(); 490 | 491 | // Length of the substring (number of code points) 492 | const size_t N = ( len == UTF8string::npos || ( pos + len ) > _utf8length ) ? 493 | ( _utf8length - pos ) : len; 494 | 495 | UTF8iterator it = utf8_iterator_() + pos; 496 | const UTF8iterator _END = ( it + N ); 497 | std::string s; 498 | 499 | while ( it != _END ) 500 | { 501 | s += *( it++ ); 502 | } 503 | 504 | return UTF8string( s ); 505 | } 506 | 507 | // This function implements the Boyer-Moore string search algorithm 508 | size_t UTF8string::utf8_find( const UTF8string& str, size_t pos ) const 509 | { 510 | if ( str._utf8length == 0 ) 511 | return UTF8string::npos; 512 | 513 | // Preprocessing 514 | std::unordered_map u8map; 515 | preprocess( str, u8map ); 516 | 517 | const size_t U8LEN = str._utf8length; 518 | size_t index = pos; 519 | 520 | // Look for the subtring 521 | while ( index <= _utf8length - U8LEN ) 522 | { 523 | size_t j = U8LEN - 1; 524 | bool found = false; 525 | 526 | while ( ( str.utf8_at( j ) == utf8_at( index + j ) ) ) 527 | { 528 | if ( j == 0 ) 529 | { 530 | found = true; 531 | break; 532 | } 533 | 534 | j--; 535 | } 536 | 537 | if ( !found ) 538 | { 539 | UTF8string::u8char ss = utf8_at( index + j ); 540 | index += ( u8map.find( ss ) == u8map.end() ) ? 
U8LEN : u8map[ss]; 541 | } 542 | else 543 | return index; 544 | } 545 | 546 | return UTF8string::npos; 547 | } 548 | 549 | // Tail-recursive function that reverse the string 550 | UTF8string UTF8string::utf8_reverse_aux_( UTF8iterator& it, const UTF8iterator& _end, UTF8string& res ) 551 | { 552 | if ( it == _end ) 553 | return res; 554 | 555 | res += *( --it ); 556 | return utf8_reverse_aux_( it, _end, res ); 557 | } 558 | 559 | 560 | UTF8string& UTF8string::utf8_reverse() 561 | { 562 | if ( _utf8length > 1 ) 563 | { 564 | UTF8iterator it = utf8_end(); 565 | UTF8string rev; 566 | _utf8string = ( utf8_reverse_aux_( it, utf8_iterator_(), rev ) )._utf8string; 567 | } 568 | 569 | return *this; 570 | } 571 | 572 | 573 | size_t UTF8string::utf8_size() const noexcept 574 | { 575 | return _utf8string.size(); 576 | } 577 | 578 | 579 | size_t UTF8string::utf8_length() const noexcept 580 | { 581 | return _utf8length; 582 | } 583 | 584 | const std::string UTF8string::utf8_sstring() const noexcept 585 | { 586 | return _utf8string; 587 | } 588 | 589 | const char * UTF8string::utf8_str() const noexcept 590 | { 591 | return _utf8string.c_str(); 592 | } 593 | 594 | size_t UTF8string::hash() const noexcept 595 | { 596 | // computes the hash using a variant 597 | // of the Fowler-Noll-Vo hash function 598 | const size_t MAGIC = 16777619U; 599 | size_t result = 2166136261U; 600 | 601 | for ( const char& c : _utf8string ) 602 | { 603 | result = ( result * MAGIC ) ^ static_cast( c ); 604 | } 605 | return result ^ ( _utf8length << 1 ); 606 | } 607 | 608 | // Internal function that creates an iterator of the current string 609 | UTF8iterator UTF8string::utf8_iterator_() const noexcept 610 | { 611 | return UTF8iterator( *this ); 612 | } 613 | 614 | 615 | UTF8iterator UTF8string::utf8_begin() const noexcept 616 | { 617 | return utf8_iterator_(); 618 | } 619 | 620 | 621 | UTF8iterator UTF8string::utf8_end() const noexcept 622 | { 623 | return utf8_begin() + _utf8length; 624 | } 625 | 626 | 
627 | UTF8iterator UTF8string::begin() const noexcept 628 | { 629 | return utf8_begin(); 630 | } 631 | 632 | 633 | UTF8iterator UTF8string::end() const noexcept 634 | { 635 | return utf8_begin() + _utf8length; 636 | } 637 | 638 | 639 | bool operator ==( const UTF8string& str1, const UTF8string& str2 ) noexcept 640 | { 641 | return str1.utf8_sstring() == str2.utf8_sstring(); 642 | } 643 | 644 | bool operator !=( const UTF8string& str1, const UTF8string& str2 ) noexcept 645 | { 646 | return !( str1 == str2 ); 647 | } 648 | 649 | 650 | bool operator <=( const UTF8string& str1, const UTF8string& str2 ) noexcept 651 | { 652 | return str1.utf8_sstring() <= str2.utf8_sstring(); 653 | } 654 | 655 | 656 | bool operator >=( const UTF8string& str1, const UTF8string& str2 ) noexcept 657 | { 658 | return str1.utf8_sstring() >= str2.utf8_sstring(); 659 | } 660 | 661 | 662 | bool operator <( const UTF8string& str1, const UTF8string& str2 ) noexcept 663 | { 664 | return str1.utf8_sstring() < str2.utf8_sstring(); 665 | } 666 | 667 | 668 | bool operator >( const UTF8string& str1, const UTF8string& str2 ) noexcept 669 | { 670 | return str1.utf8_sstring() > str2.utf8_sstring(); 671 | } 672 | 673 | 674 | UTF8string operator +( const UTF8string& str1, const UTF8string& str2 ) 675 | { 676 | return str1 + str2.utf8_sstring(); 677 | } 678 | 679 | 680 | UTF8string operator +( const UTF8string& str1, const std::string& str2 ) 681 | { 682 | return UTF8string( str1.utf8_sstring() + str2 ); 683 | } 684 | 685 | UTF8string operator +( const std::string& str1, const UTF8string& str2 ) 686 | { 687 | return UTF8string( str1 + str2.utf8_sstring() ); 688 | } 689 | 690 | 691 | UTF8string operator +( const UTF8string& str1, const char * str2 ) 692 | { 693 | return str1 + std::string( str2 ); 694 | } 695 | 696 | 697 | UTF8string operator +( const char * str1, const UTF8string& str2 ) 698 | { 699 | return std::string( str1 ) + str2; 700 | } 701 | 702 | 703 | std::ostream& operator <<( std::ostream& os, 
const UTF8string& str ) 704 | { 705 | os << str.utf8_sstring(); 706 | return os; 707 | } 708 | 709 | 710 | std::istream& operator >>( std::istream& is, UTF8string& str ) 711 | { 712 | std::string tmp; 713 | std::getline( is, tmp ); 714 | str = tmp; 715 | return is; 716 | } 717 | -------------------------------------------------------------------------------- /src/utf8_string.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | * 3 | * Copyright © 2018 Luxon Jean-Pierre 4 | * https://gumichan01.github.io/ 5 | * 6 | * This library is under the MIT license 7 | * 8 | * Luxon Jean-Pierre (Gumichan01) 9 | * luxon.jean.pierre@gmail.com 10 | * 11 | */ 12 | 13 | #ifndef UTF8_STRING_HPP_INCLUDED 14 | #define UTF8_STRING_HPP_INCLUDED 15 | 16 | 17 | /** 18 | * @file utf8_string.hpp 19 | * @brief This is a UTF-8 string library header 20 | */ 21 | 22 | #include 23 | #include 24 | 25 | class UTF8iterator; 26 | 27 | /** 28 | * @class UTF8string final 29 | * @brief UTF-8 string class 30 | * 31 | * This class defines a UTF-8 string 32 | */ 33 | class UTF8string final 34 | { 35 | using byte_t = unsigned char; 36 | using u8string = std::string; 37 | 38 | u8string _utf8string = {}; 39 | size_t _utf8length = 0U; 40 | 41 | bool utf8_is_valid_() const noexcept; 42 | size_t utf8_length_() const noexcept; 43 | size_t utf8_codepoint_len_( const size_t j ) const noexcept; 44 | size_t utf8_bpos_at_( const size_t cpos ) const noexcept; 45 | u8string utf8_at_( const size_t index ) const noexcept; 46 | 47 | UTF8iterator utf8_iterator_() const noexcept; 48 | UTF8string utf8_reverse_aux_( UTF8iterator& it, 49 | const UTF8iterator& _end, UTF8string& res ); 50 | 51 | public: 52 | 53 | /** 54 | * @typedef u8char 55 | * @brief The UTF-8 character 56 | */ 57 | using u8char = std::string; 58 | 59 | /** 60 | * @var npos 61 | * 62 | * *npos* is a static member constant value with the greatest 63 | * possible value for an element of type *size_t*. 
64 | * 65 | * This value, when used as the value for a len parameter in some 66 | * UTF-8 string's member functions(utf8_substr()), 67 | * means "until the end of the utf-8 string". 68 | * 69 | * As a return value in utf8_find(), it is used to indicate no matches. 70 | * This constant is defined with a value of -1, 71 | * which because *size_t* is an unsigned integral type, 72 | * it is the largest possible representable value for this type. 73 | * 74 | */ 75 | constexpr static size_t npos = std::string::npos; 76 | 77 | /** 78 | * @fn UTF8string() = default 79 | */ 80 | UTF8string() = default; 81 | /** 82 | * @fn UTF8string(const char * str) 83 | * @param str 84 | * @pre str is not null 85 | * @exception std::invalid_argument If the string is not valid 86 | */ 87 | UTF8string( const char * str ); 88 | /** 89 | * @fn UTF8string(const std::string& str) 90 | * @param str 91 | * @exception std::invalid_argument If the string is not valid 92 | */ 93 | UTF8string( const std::string& str ); 94 | /** 95 | * @fn UTF8string(const UTF8string& u8str) noexcept 96 | * @param u8str 97 | */ 98 | UTF8string( const UTF8string& u8str ) noexcept; 99 | /** 100 | * @fn UTF8string(const UTF8string& u8str, size_t pos, size_t len = npos) noexcept 101 | * @param u8str 102 | * @param pos The beginning position of the substring 103 | * @param len The length of the substring (in number of codepoints, default value = npos) 104 | */ 105 | UTF8string( const UTF8string& u8str, size_t pos, size_t len = npos ) noexcept; 106 | /** 107 | * @fn UTF8string(UTF8string&& u8str) noexcept 108 | * @param u8str 109 | */ 110 | UTF8string( UTF8string&& u8str ) noexcept; 111 | 112 | /** 113 | * @fn UTF8string& operator =(const char * str) 114 | * @param str C-string that will be converted 115 | * @return A reference to the new utf-8 string 116 | * @exception std::invalid_argument If the string is not valid 117 | * @note If an exception is thrown, the object in not modified 118 | */ 119 | UTF8string& operator =( 
const char * str ); 120 | /** 121 | * @fn UTF8string& operator =(const std::string& str) 122 | * @param str The string that will be converted and checked 123 | * @return A reference to the new utf-8 string 124 | * @exception std::invalid_argument If the string is not valid 125 | * @note If an exception is thrown, the object in not modified 126 | */ 127 | UTF8string& operator =( const std::string& str ); 128 | /** 129 | * @fn UTF8string& operator =(const UTF8string& u8str) 130 | * @param u8str The utf-8 string 131 | * @return A reference to the new utf-8 string 132 | */ 133 | UTF8string& operator =( const UTF8string& u8str ) noexcept; 134 | /** 135 | * @fn UTF8string& operator =(UTF8string&& u8str) 136 | * @param u8str The utf-8 string 137 | * @return A reference to the new utf-8 string 138 | */ 139 | UTF8string& operator =( UTF8string&& u8str ) noexcept; 140 | 141 | /** 142 | * @fn const UTF8string& operator +=(const UTF8string& u8str) 143 | * 144 | * Append a utf-8 string 145 | * 146 | * @param u8str The string to convert from 147 | * @return The reference to the concatenated utf-8 string 148 | */ 149 | const UTF8string& operator +=( const UTF8string& u8str ); 150 | /** 151 | * @fn const UTF8string& operator +=(const std::string& str) 152 | * 153 | * Append a string 154 | * 155 | * @param str The string to convert from 156 | * @return The reference to the concatenated utf-8 string 157 | * @exception std::invalid_argument If the string is not valid 158 | * @note If an exception is thrown, the object in not modified 159 | */ 160 | const UTF8string& operator +=( const std::string& str ); 161 | /** 162 | * @fn const UTF8string& operator +=(const char * str) 163 | * 164 | * Append a C-string 165 | * 166 | * @param str The string to convert from 167 | * @return The reference to the concatenated utf-8 string 168 | * @exception std::invalid_argument If the string is not valid 169 | */ 170 | const UTF8string& operator +=( const char * str ); 171 | 172 | /** 173 | * @fn 
void utf8_clear() noexcept 174 | * Clear the content of the object 175 | */ 176 | void utf8_clear() noexcept; 177 | /** 178 | * @fn bool utf8_empty() const noexcept 179 | * 180 | * Check if the content is empty 181 | * 182 | * @return TRUE If it is empty, FALSE otherwise 183 | */ 184 | bool utf8_empty() const noexcept; 185 | 186 | 187 | /** 188 | * @fn UTF8string& utf8_assign(const char * str) 189 | * @return The updated string 190 | */ 191 | UTF8string& utf8_assign( const char * str ); 192 | /** 193 | * @fn UTF8string& utf8_assign(const u8string& str) 194 | * @return The updated string 195 | */ 196 | UTF8string& utf8_assign( const u8string& str ); 197 | /** 198 | * @fn UTF8string& utf8_assign(const u8string& str, size_t pos, size_t count = npos) 199 | * 200 | * Replaces the contents with a substring [pos, pos+count) of str. 201 | * If the requested substring lasts past the end of the string, or if count == npos, the resulting substring is [pos, str.size()). 202 | * 203 | * @exception std::out_of_range If pos > str.size() 204 | * @return The updated string 205 | */ 206 | UTF8string& utf8_assign( const u8string& str, size_t pos, size_t count = npos ); 207 | /** 208 | * @fn UTF8string& utf8_assign(UTF8string&& u8str) noexcept 209 | * @return The updated string 210 | */ 211 | UTF8string& utf8_assign( UTF8string&& u8str ) noexcept; 212 | 213 | /** 214 | * @fn UTF8string::u8char utf8_at(const size_t index) const 215 | * 216 | * Get the codepoint at a specified position. 217 | * 218 | * @param index The index of the requested codepoint in the string 219 | * @return The codepoint 220 | * @exception std::out_of_range If the index is out of the string range 221 | * @note If an exception is thrown, the object in not modified 222 | */ 223 | UTF8string::u8char utf8_at( const size_t index ) const; 224 | /** 225 | * @fn UTF8string::u8char operator [](const size_t index) const noexcept 226 | * 227 | * Get the codepoint at a specified position. 
228 | * 229 | * @param index The index of the requested codepoint in the string 230 | * @return The codepoint 231 | * 232 | * @note If the index is out of the string range, calling this functions 233 | * causes undefined behaviour 234 | */ 235 | UTF8string::u8char operator []( const size_t index ) const noexcept; 236 | /** 237 | * @fn void utf8_pop() 238 | * 239 | * Remove the last codepoint. 240 | * 241 | * @exception std::length_error If the string is empty 242 | * @note If an exception is thrown, the object in not modified 243 | */ 244 | void utf8_pop(); 245 | /** 246 | * @fn UTF8string& utf8_erase(const size_t index = 0, const size_t count = npos) 247 | * 248 | * Removes min(count, utf8_size() - index) characters starting at index 249 | * 250 | * @return *this 251 | * @exception std::out_of_range if ```index > utf8_size()``` 252 | * @note If an exception is thrown, the object in not modified 253 | */ 254 | UTF8string& utf8_erase( const size_t index = 0, const size_t count = npos ); 255 | /** 256 | * @fn UTF8iterator utf8_erase(const UTF8iterator& position) 257 | * 258 | * Removes the character at position 259 | * 260 | * @return Iterator pointing to the character immediately following the character erased, 261 | * or utf8_end() if no such character exists 262 | * @note If the iterator does not point to *this, the behaviour is undefined 263 | */ 264 | UTF8iterator utf8_erase( const UTF8iterator& position ); 265 | /** 266 | * @fn UTF8iterator utf8_erase(const UTF8iterator& first, const UTF8iterator& last) 267 | * 268 | * Removes the character in the range [first, last[ 269 | * 270 | * @return Iterator pointing to the character ```last```before the erase, 271 | * or utf8_end() if no such character exists 272 | * @note If one of the iterators does not point to *this, the behaviour is undefined 273 | */ 274 | UTF8iterator utf8_erase( const UTF8iterator& first, const UTF8iterator& last ); 275 | 276 | /** 277 | * @fn UTF8string utf8_substr(size_t pos = 0, size_t len = 
npos) const 278 | * 279 | * Generate a substring according to the position and the length requested. 280 | * 281 | * The substring is the portion of the object that starts at 282 | * character position *pos* and spans *len* characters 283 | * (or until the end of the string, whichever comes first). 284 | * 285 | * @param pos The beginning position of the substring (default value: 0) 286 | * @param len The length of the substring (in number of codepoints, default value = npos) 287 | * @return The substring 288 | */ 289 | UTF8string utf8_substr( size_t pos = 0, size_t len = npos ) const; 290 | /** 291 | * @fn size_t utf8_find(const UTF8string& str, size_t pos = 0) const 292 | * 293 | * Search for the first occurrence of utf8 string 294 | * specified in argument. 295 | * 296 | * When pos is specified, the search only includes characters 297 | * at or after position pos, ignoring any possible occurrences 298 | * that include characters before pos. 299 | * 300 | * @param str The string to look for 301 | * @param pos The position to start the search 302 | * @return The position of the substring if it was found 303 | * (in number of codepoints), UTF8string::npos otherwise. 304 | */ 305 | size_t utf8_find( const UTF8string& str, size_t pos = 0 ) const; 306 | /** 307 | * @fn UTF8string& utf8_reverse() 308 | * Reverse the current utf-8 string. 
309 | * @return The reversed string 310 | */ 311 | UTF8string& utf8_reverse(); 312 | 313 | /** 314 | * @fn size_t utf8_size() const noexcept 315 | * Get the memory size (in bytes) of the utf-8 string 316 | * @return The memory size of the utf-8 string 317 | */ 318 | size_t utf8_size() const noexcept; 319 | /** 320 | * @fn size_t utf8_length() const noexcept 321 | * Get the length of the utf-8 string 322 | * @return The length of the utf-8 string (in number of codepoints) 323 | */ 324 | size_t utf8_length() const noexcept; 325 | 326 | /** 327 | * @fn const std::string utf8_sstring() const noexcept 328 | * 329 | * Returns the string related to the UTF-8 string 330 | * 331 | * @return The string 332 | */ 333 | const std::string utf8_sstring() const noexcept; 334 | /** 335 | * @fn const char * utf8_str() const noexcept 336 | * 337 | * Returns a pointer to an array that contains a null-terminated sequence 338 | * of characters (C-string). 339 | * 340 | * This array include exactly the string plus the null character ('\0') 341 | * at the end. 342 | * 343 | * @return A pointer to a C-string 344 | */ 345 | const char * utf8_str() const noexcept; 346 | /** 347 | * @fn size_t hash() const noexcept 348 | * Generate a hash value of the utf8 string 349 | * @return The hash value 350 | */ 351 | size_t hash() const noexcept; 352 | 353 | /** 354 | * @fn UTF8iterator utf8_begin() const noexcept 355 | * 356 | * Returns an iterator that points to the first codepoint of the string 357 | * 358 | * @return An iterator to the beginnigng of the string 359 | */ 360 | UTF8iterator utf8_begin() const noexcept; 361 | /** 362 | * @fn UTF8iterator utf8_end() const noexcept 363 | * 364 | * Returns an iterator that points to the *past-the-end* codepoint of the string 365 | * 366 | * The past-the-end codepoint is a theoretical codepoint that would follow 367 | * the last codepoint in the string. It shall not be dereferenced. 
368 | * 369 | * @return An iterator to the past-the-end codepoint 370 | */ 371 | UTF8iterator utf8_end() const noexcept; 372 | 373 | /** 374 | * @fn UTF8iterator begin() const noexcept 375 | * 376 | * Returns an iterator that points to the first codepoint of the string 377 | * 378 | * @return An iterator to the beginning of the string 379 | * @note Same as utf8_begin() 380 | */ 381 | UTF8iterator begin() const noexcept; 382 | /** 383 | * @fn UTF8iterator end() const noexcept 384 | * 385 | * Returns an iterator that points to the *past-the-end* codepoint of the string 386 | * 387 | * The past-the-end codepoint is a theoretical codepoint that would follow 388 | * the last codepoint in the string. It shall not be dereferenced. 389 | * 390 | * @return An iterator to the past-the-end codepoint 391 | * @note Same as utf8_end() 392 | */ 393 | UTF8iterator end() const noexcept; 394 | 395 | ~UTF8string() = default; 396 | }; 397 | 398 | 399 | namespace std 400 | { 401 | 402 | template<> 403 | class hash 404 | { 405 | public: 406 | size_t operator()( const UTF8string& u8str ) const 407 | { 408 | return u8str.hash(); 409 | } 410 | }; 411 | 412 | } 413 | 414 | 415 | /** 416 | * @fn bool operator ==(const UTF8string& str1, const UTF8string& str2) noexcept 417 | * 418 | * Check if two utf-8 strings are equal. 419 | * 420 | * Two utf-8 strings are equal if and only if they have the same length 421 | * and have the same sequence of codepoints. 422 | * 423 | * @param str1 utf-8 string 424 | * @param str2 utf-8 string 425 | * @return TRUE if they are equal, FALSE otherwise 426 | */ 427 | bool operator ==( const UTF8string& str1, const UTF8string& str2 ) noexcept; 428 | 429 | /** 430 | * @fn bool operator !=(const UTF8string& str1, const UTF8string& str2) noexcept 431 | * 432 | * Check if two utf-8 strings are different. 
433 | * 434 | * @param str1 utf-8 string 435 | * @param str2 utf-8 string 436 | * @return TRUE if they are not equal, FALSE otherwise 437 | */ 438 | bool operator !=( const UTF8string& str1, const UTF8string& str2 ) noexcept; 439 | 440 | /** 441 | * @fn bool operator <=(const UTF8string& str1, const UTF8string& str2) noexcept 442 | * 443 | * Check if the first utf-8 string is lexicographically less than or equal 444 | * to the second utf-8 string (comparison of the underlying byte strings) 445 | * 446 | * @param str1 utf-8 string 447 | * @param str2 utf-8 string 448 | * @return TRUE if the first string is less than or equal to the second one, FALSE otherwise 449 | */ 450 | bool operator <=( const UTF8string& str1, const UTF8string& str2 ) noexcept; 451 | 452 | /** 453 | * @fn bool operator >=(const UTF8string& str1, const UTF8string& str2) noexcept 454 | * 455 | * Check if the first utf-8 string is lexicographically greater than or equal to the second utf-8 string 456 | * 457 | * @param str1 utf-8 string 458 | * @param str2 utf-8 string 459 | * @return TRUE if the first string is greater than or equal to the second one, FALSE otherwise 460 | */ 461 | bool operator >=( const UTF8string& str1, const UTF8string& str2 ) noexcept; 462 | 463 | /** 464 | * @fn bool operator <(const UTF8string& str1, const UTF8string& str2) noexcept 465 | * 466 | * Check if the first utf-8 string is lexicographically strictly less 467 | * than the second utf-8 string 468 | * 469 | * @param str1 utf-8 string 470 | * @param str2 utf-8 string 471 | * @return TRUE if the first string is strictly less than the second one, FALSE otherwise 472 | */ 473 | bool operator <( const UTF8string& str1, const UTF8string& str2 ) noexcept; 474 | 475 | /** 476 | * @fn bool operator >(const UTF8string& str1, const UTF8string& str2) noexcept 477 | * 478 | * Check if the first utf-8 string is lexicographically strictly greater 479 | * than the second utf-8 string 480 | * 481 | * @param str1 utf-8 string 482 | * @param str2 utf-8 string 483 | * @return TRUE if the first string is strictly greater than the second one, FALSE otherwise 484 | */ 485 | bool operator >( const UTF8string& str1, const UTF8string& str2 ) noexcept; 
486 | 487 | /** 488 | * @fn UTF8string operator +(const UTF8string& str1, const UTF8string& str2) 489 | * 490 | * Generate a string as a concatenation of the two utf-8 strings given in arguments 491 | * 492 | * @param str1 utf-8 string 493 | * @param str2 utf-8 string 494 | * @return A new string whose value is the concatenation of str1 and str2 495 | */ 496 | UTF8string operator +( const UTF8string& str1, const UTF8string& str2 ); 497 | 498 | /** 499 | * @fn UTF8string operator +(const UTF8string& str1, const std::string& str2) 500 | * 501 | * Generate a string as a concatenation of a utf-8 string and a string 502 | * given in arguments 503 | * 504 | * @param str1 utf-8 string 505 | * @param str2 string 506 | * @return A new string whose value is the concatenation of str1 and str2 507 | */ 508 | UTF8string operator +( const UTF8string& str1, const std::string& str2 ); 509 | 510 | /** 511 | * @fn UTF8string operator +(const std::string& str1, const UTF8string& str2) 512 | * 513 | * Generate a string as a concatenation of a string and a utf-8 string 514 | * given in arguments 515 | * 516 | * @param str1 string 517 | * @param str2 utf-8 string 518 | * @return A new string whose value is the concatenation of str1 and str2 519 | */ 520 | UTF8string operator +( const std::string& str1, const UTF8string& str2 ); 521 | 522 | /** 523 | * @fn UTF8string operator +(const UTF8string& str1, const char * str2) 524 | * 525 | * Generate a string as a concatenation of a utf-8 string and a C-string 526 | * given in arguments 527 | * 528 | * @param str1 utf-8 string 529 | * @param str2 C-string 530 | * @return A new string whose value is the concatenation of str1 and str2 531 | */ 532 | UTF8string operator +( const UTF8string& str1, const char * str2 ); 533 | 534 | /** 535 | * @fn UTF8string operator +(const char * str1, const UTF8string& str2) 536 | * 537 | * Generate a string as a concatenation of a C-string and a utf-8 string 538 | * given in arguments 539 | * 540 | * @param str1 
C-string 541 | * @param str2 utf8 string 542 | * @return A new string whose values is the concatenation of str1 and str2 543 | */ 544 | UTF8string operator +( const char * str1, const UTF8string& str2 ); 545 | 546 | /** 547 | * @fn std::ostream& operator <<(std::ostream& os, const UTF8string& str) 548 | * 549 | * Insert a utf-8 string into a stream. 550 | * 551 | * This function overloads *operator <<* to behave as described 552 | * in *ostream::operator <<* for C-strings, but applied to utf-8 string objects. 553 | * 554 | * @param os The output stream 555 | * @param str utf8 string to put 556 | * @return The same as parameter *os* 557 | */ 558 | std::ostream& operator <<( std::ostream& os, const UTF8string& str ); 559 | 560 | /** 561 | * @fn std::istream& operator >>(std::istream& is, UTF8string& str) 562 | * 563 | * Extract a utf-8 string from a stream, storing the sequence in str, 564 | * which is overwritten (the previous value of str is replaced). 565 | * 566 | * This function overloads *operator >>* to behave as described 567 | * in *istream::operator >>* for c-strings, but applied to string objects. 
568 | * 569 | * @param is The input stream 570 | * @param str utf8 string to put 571 | * @return The same as parameter *is* 572 | */ 573 | std::istream& operator >>( std::istream& is, UTF8string& str ); 574 | 575 | #include "utf8_iterator.hpp" 576 | 577 | #endif // UTF8_STRING_HPP_INCLUDED 578 | -------------------------------------------------------------------------------- /test/main.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * 3 | * Copyright © 2018 Luxon Jean-Pierre 4 | * https://gumichan01.github.io/ 5 | * 6 | * This library is under the MIT license 7 | * 8 | * Luxon Jean-Pierre (Gumichan01) 9 | * luxon.jean.pierre@gmail.com 10 | * 11 | */ 12 | 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | 20 | #include "../src/utf8_string.hpp" 21 | 22 | using namespace std; 23 | 24 | int main() 25 | { 26 | string jap1 = "ドロテ: すみません、ゆうびんきょくはどこですか。\n"; 27 | string jap2 = "けいかん: ゆうびんきょくですか。このみちをまっすぐいってください。ひとつめのしんごうをみぎにまがってください。\n"; 28 | string jap3 = "ドロテ: ひとつめのしんごうをみぎですね。\n"; 29 | string jap4 = "けいかん: はい、それから、まっすぐいってください。ふたつめのかどにゆうびんきょくがあります。\n"; 30 | string jap5 = "ドロテ: ふたつめのかどですね。わかりました。どうもありがとうございます。\n"; 31 | string jap6 = "けいかん: いいえ、どういたしまして。"; 32 | 33 | /// First test suite 34 | { 35 | UTF8string u8( "がんばつて Gumichan" ); 36 | string utf8s( "がんばつて Gumichan" ); 37 | 38 | { 39 | try 40 | { 41 | UTF8string tmp( utf8s ); 42 | } 43 | catch ( ... 
) 44 | { 45 | return 100; 46 | } 47 | } 48 | 49 | // Test the test 50 | if ( u8 != u8 ) 51 | return 1; 52 | 53 | // copy construtor 54 | { 55 | UTF8string hum( u8 ); 56 | 57 | if ( hum != u8 ) 58 | return 101; 59 | } 60 | 61 | // move construtor 62 | { 63 | UTF8string dump( u8 ); 64 | UTF8string hum2( std::move( dump ) ); 65 | 66 | if ( hum2 != u8 ) 67 | return 102; 68 | 69 | cout << "dump = " << dump << "\n"; 70 | } 71 | 72 | // assignment 73 | UTF8string uu8 = u8; 74 | 75 | if ( u8 != uu8 ) 76 | return 2; 77 | 78 | // assignment 79 | const std::string& str1 = "がんばつて"; 80 | UTF8string utf8 = str1; 81 | const std::string& str2 = utf8.utf8_str(); 82 | 83 | if ( str1 != str2 ) 84 | return 3; 85 | 86 | if ( str1 != utf8.utf8_sstring() ) 87 | return 4; 88 | 89 | // move assignment 90 | { 91 | UTF8string hum3( "hello" ); 92 | UTF8string dump( u8 ); 93 | 94 | hum3 = std::move( dump ); 95 | 96 | if ( hum3 != u8 ) 97 | return 104; 98 | 99 | cout << "dump = " << dump << "\n"; 100 | } 101 | 102 | // assignment (again) 103 | std::string strg1 = "Gumi"; 104 | std::string strg2 = "chan"; 105 | const size_t sz1 = strg1.size(); 106 | const size_t sz2 = strg2.size(); 107 | const size_t len1 = strg1.length(); 108 | const size_t len2 = strg2.length(); 109 | UTF8string utf8_cat = strg1; 110 | 111 | if ( utf8_cat.utf8_size() != sz1 ) 112 | { 113 | cerr << "ERROR : expected : " << sz1 114 | << "; got : " << utf8_cat.utf8_size() << "\n"; 115 | return 4; 116 | } 117 | 118 | if ( utf8_cat.utf8_length() != len1 ) 119 | { 120 | cerr << "ERROR : expected : " << len1 121 | << "; got : " << utf8_cat.utf8_length() << "\n"; 122 | return 5; 123 | } 124 | 125 | utf8_cat += strg2; 126 | 127 | if ( utf8_cat.utf8_size() != ( sz1 + sz2 ) ) 128 | { 129 | cerr << "ERROR : expected : " << ( sz1 + sz2 ) 130 | << "; got : " << utf8_cat.utf8_size() << "\n"; 131 | return 6; 132 | } 133 | 134 | if ( utf8_cat.utf8_length() != ( len1 + len2 ) ) 135 | { 136 | cerr << "ERROR : expected : " << ( len1 + len2 ) 137 
| << "; got : " << utf8_cat.utf8_length() << "\n"; 138 | return 7; 139 | } 140 | 141 | // utf8_asign 142 | { 143 | const char * hello = "hello"; 144 | const std::string world( "world" ); 145 | const UTF8string chw = "Hello World!"; 146 | UTF8string hw = "Hello World!"; 147 | 148 | UTF8string u8cstr; 149 | UTF8string u8str; 150 | UTF8string u8strp; 151 | UTF8string u8s; 152 | 153 | u8cstr.utf8_assign( hello ); 154 | u8str.utf8_assign( world ); 155 | u8strp.utf8_assign( world, 0U, 2U ); 156 | u8s.utf8_assign( std::move( hw ) ); 157 | 158 | if ( u8cstr.utf8_sstring() != std::string( hello ) ) 159 | return 180; 160 | 161 | if ( u8str.utf8_sstring() != world ) 162 | return 181; 163 | 164 | if ( u8strp.utf8_sstring() != world.substr( 0U, 2U ) ) 165 | return 182; 166 | 167 | if ( u8s != chw ) 168 | return 181; 169 | 170 | if ( !hw.utf8_empty() ) 171 | return 182; 172 | } 173 | } 174 | 175 | /// Second test suite 176 | { 177 | // Exract the utf8 string 178 | UTF8string utf( "がんばつて Gumichan" ); 179 | UTF8string sub1 = utf.utf8_substr( 6, 4 ); 180 | UTF8string sub2 = utf.utf8_substr( 0, 5 ); 181 | UTF8string sub3 = utf.utf8_substr( 64, 1024 ); 182 | UTF8string sub4( utf, 0, 5 ); 183 | UTF8string aexpected( "Gumi" ); 184 | UTF8string u8expected( "がんばつて" ); 185 | 186 | if ( sub1 != aexpected ) 187 | { 188 | cerr << "ERROR : expected : " << aexpected.utf8_str() 189 | << "; got :" << sub1.utf8_str() << "\n"; 190 | return 8; 191 | } 192 | 193 | if ( sub2 != u8expected ) 194 | { 195 | cerr << "ERROR : expected : " << u8expected.utf8_str() 196 | << "; got :" << sub2.utf8_str() << "\n"; 197 | return 9; 198 | } 199 | 200 | if ( !sub3.utf8_empty() ) 201 | { 202 | return 90; 203 | } 204 | 205 | if ( sub4 != u8expected ) 206 | { 207 | cerr << "ERROR : expected : " << u8expected.utf8_str() 208 | << "; got :" << sub2.utf8_str() << "\n"; 209 | return 91; 210 | } 211 | 212 | UTF8string sub = utf.utf8_substr(); 213 | if ( sub != utf ) 214 | { 215 | cerr << "ERROR : expected : " << 
u8expected.utf8_str() 216 | << "; got :" << sub.utf8_str() << "\n"; 217 | return 10; 218 | } 219 | 220 | utf.utf8_clear(); 221 | if ( !utf.utf8_empty() ) 222 | return 11; 223 | 224 | UTF8string aaaa( "aaaa" ); 225 | UTF8string bbbb( "bbbb" ); 226 | UTF8string aaa( "aaaa" ); 227 | 228 | if ( aaaa > bbbb ) 229 | return 12; 230 | 231 | if ( aaa > aaaa ) 232 | return 13; 233 | 234 | if ( aaaa > aaaa ) 235 | return 14; 236 | 237 | if ( aaaa < aaaa ) 238 | return 15; 239 | 240 | auto it_begin = aaaa.utf8_begin(); 241 | auto it_end = aaaa.utf8_end(); 242 | 243 | if ( it_begin == it_end ) 244 | return 16; 245 | 246 | // Find the string 247 | UTF8string str( "がんばつて! Gumichan" ); 248 | UTF8string ganbatsute( "がんばつて" ); 249 | UTF8string gumichan( "Gumichan" ); 250 | size_t lenstr0 = str.utf8_find( UTF8string( "a" ) ); 251 | size_t lenstr1 = str.utf8_find( ganbatsute ); 252 | size_t lenstr2 = str.utf8_find( gumichan, 2 ); 253 | size_t lenpos = str.utf8_find( ganbatsute, 1024 ); 254 | 255 | UTF8string u8 = ( jap1 + jap2 + jap3 + jap4 + jap5 + jap6 ); 256 | UTF8string subjp( std::string( "いいえ、どういたしまして。" ) ); 257 | size_t res = u8.utf8_find( subjp ); 258 | 259 | if ( lenstr0 != 13 ) 260 | { 261 | cout << lenstr0 << " expected : 13 \n"; 262 | return 17; 263 | } 264 | 265 | if ( lenstr1 != 0 ) 266 | { 267 | cout << lenstr1 << " expected : 0 \n"; 268 | return 171; 269 | } 270 | 271 | if ( UTF8string::npos != str.utf8_find( UTF8string( "" ) ) ) 272 | { 273 | return 172; 274 | } 275 | 276 | if ( lenstr2 != 7 ) 277 | { 278 | cout << lenstr2 << " expected : 7" << "\n"; 279 | return 18; 280 | } 281 | 282 | if ( lenpos != UTF8string::npos ) 283 | { 284 | return 181; 285 | } 286 | 287 | if ( res != ( u8.utf8_length() - subjp.utf8_length() ) ) 288 | { 289 | return 182; 290 | } 291 | } 292 | 293 | // Invalid UTF-8 string test 294 | { 295 | /// 1-byte codepoint 296 | try 297 | { 298 | // An invalid codepoint 299 | char inv1[] = {'\x80'}; 300 | string chstr = inv1; 301 | UTF8string u8 = chstr; 
302 | 303 | return 19; 304 | } 305 | catch ( const std::invalid_argument& ) {} 306 | 307 | /// 2-byte codepoint 308 | // Bad start codepoint 309 | try 310 | { 311 | char inv1[] = {'\xFF', '\x00'}; 312 | string chstr = inv1; 313 | UTF8string u8 = chstr; 314 | 315 | return 20; 316 | } 317 | catch ( const std::invalid_argument& ) {} 318 | 319 | try 320 | { 321 | // 0xC2 is followed by a continuation byte > BF 322 | char inv21[] = {'\xC2', '\xFE', '\x00'}; 323 | string chstr = inv21; 324 | UTF8string u8 = chstr; 325 | 326 | return 21; 327 | } 328 | catch ( const std::invalid_argument& ) {} 329 | 330 | 331 | try 332 | { 333 | // 0xC2 is followed by a continuation byte < 0x80 334 | char inv22[] = {'\xC2', '\x7F', '\x00'}; 335 | string chstr = inv22; 336 | UTF8string u8 = chstr; 337 | 338 | return 22; 339 | } 340 | catch ( const std::invalid_argument& ) {} 341 | 342 | 343 | /// 3-byte codepoint 344 | try 345 | { 346 | // 0xE0 has no continuation byte 347 | char inv23[] = {'\xE0'}; 348 | string chstr = inv23; 349 | UTF8string u8 = chstr; 350 | 351 | return 23; 352 | } 353 | catch ( const std::invalid_argument& ) {} 354 | 355 | try 356 | { 357 | // Invalid continuation byte (0xC0) after 0xE0 358 | char inv24[] = {'\xE0', '\xA7', '\xC0', '\x00'}; 359 | string chstr = inv24; 360 | UTF8string u8 = chstr; 361 | 362 | return 24; 363 | } 364 | catch ( const std::invalid_argument& ) {} 365 | 366 | try 367 | { 368 | // Invalid continuation byte (0x9F) after 0xE0 369 | char inv25[] = {'\xE0', '\xFF', '\xFF', '\xFF'}; 370 | string chstr = inv25; 371 | UTF8string u8 = chstr; 372 | 373 | return 25; 374 | } 375 | catch ( const std::invalid_argument& ) {} 376 | 377 | try 378 | { 379 | // Invalid continuation byte (0x71) after 0xED 380 | char inv26[] = {'\xED', '\x71', '\xA7', '\x00'}; 381 | string chstr = inv26; 382 | UTF8string u8 = chstr; 383 | 384 | return 26; 385 | } 386 | catch ( const std::invalid_argument& ) {} 387 | 388 | try 389 | { 390 | // Invalid continuation byte (0xA0) 
after 0xED 391 | char inv27[] = {'\xED', '\xA0', '\xFF', '\xFF'}; 392 | string chstr = inv27; 393 | UTF8string u8 = chstr; 394 | 395 | return 27; 396 | } 397 | catch ( const std::invalid_argument& ) {} 398 | 399 | try 400 | { 401 | // valid string 402 | char inv28[] = {'\xE0', '\xA7', '\xA7', '\x00'}; 403 | string chstr = inv28; 404 | UTF8string u8 = chstr; 405 | 406 | } 407 | catch ( const std::invalid_argument& ) 408 | { 409 | return 28; 410 | } 411 | 412 | try 413 | { 414 | // 0xE0 has no continuation byte 415 | char inv29[] = {'\xED'}; 416 | string chstr = inv29; 417 | UTF8string u8 = chstr; 418 | 419 | return 29; 420 | } 421 | catch ( const std::invalid_argument& ) {} 422 | 423 | /// 4-byte codepoint 424 | try 425 | { 426 | // 0xF0 has no continuation byte 427 | char inv30[] = {'\xF0'}; 428 | string chstr = inv30; 429 | UTF8string u8 = chstr; 430 | 431 | return 30; 432 | } 433 | catch ( const std::invalid_argument& ) {} 434 | 435 | try 436 | { 437 | // 0xF4 has no continuation byte 438 | char inv31[] = {'\xF4'}; 439 | string chstr = inv31; 440 | UTF8string u8 = chstr; 441 | 442 | return 31; 443 | } 444 | catch ( const std::invalid_argument& ) {} 445 | 446 | try 447 | { 448 | // Invalid continuation byte (0x90) after 0xF4 449 | char inv32[] = {'\xF4', '\x90', '\x90', '\x90', '\x00'}; 450 | string chstr = inv32; 451 | UTF8string u8 = chstr; 452 | 453 | return 32; 454 | } 455 | catch ( const std::invalid_argument& ) {} 456 | 457 | // With 0xF0 as the first byte of the codepoint 458 | try 459 | { 460 | // Invalid continuation byte (0x8F) after 0xF0 461 | char inv33[] = {'\xF0', '\x8F', '\x91', '\xB5', '\x00'}; 462 | string chstr = inv33; 463 | UTF8string u8 = chstr; 464 | 465 | return 33; 466 | } 467 | catch ( const std::invalid_argument& ) {} 468 | 469 | try 470 | { 471 | // Invalid continuation byte (0x8F) after 0xF0 472 | char inv34[] = {'\xF0', '\xC7', '\x91', '\xB5', '\x00'}; 473 | string chstr = inv34; 474 | UTF8string u8 = chstr; 475 | 476 | return 34; 477 
| } 478 | catch ( const std::invalid_argument& ) {} 479 | 480 | // With 0xF4 as the first byte of the codepoint 481 | try 482 | { 483 | // Invalid continuation byte (0x7F) after 0xF4 484 | char inv35[] = {'\xF4', '\x7F', '\x91', '\xB5', '\x00'}; 485 | string chstr = inv35; 486 | UTF8string u8 = chstr; 487 | 488 | return 35; 489 | } 490 | catch ( const std::invalid_argument& ) {} 491 | 492 | try 493 | { 494 | // Invalid continuation byte (0x92) after 0xF4 495 | char inv36[] = {'\xF4', '\x92', '\x91', '\xB5', '\x00'}; 496 | string chstr = inv36; 497 | UTF8string u8 = chstr; 498 | 499 | return 36; 500 | } 501 | catch ( const std::invalid_argument& ) {} 502 | 503 | try 504 | { 505 | // Not enough bytes after the first codepoint byte 506 | char inv37[] = {'\xF4', '\x92'}; 507 | string chstr = inv37; 508 | UTF8string u8 = chstr; 509 | 510 | return 37; 511 | } 512 | catch ( const std::invalid_argument& ) {} 513 | 514 | try 515 | { 516 | // Not enough bytes after the first codepoint byte 517 | char inv38[] = {'\xF4', '\x92', '\x91'}; 518 | string chstr = inv38; 519 | UTF8string u8 = chstr; 520 | 521 | return 38; 522 | } 523 | catch ( const std::invalid_argument& ) {} 524 | 525 | 526 | try 527 | { 528 | // Valid string 529 | UTF8string u8 = ( jap1 + jap2 + jap3 + jap4 + jap5 + jap6 ); 530 | UTF8string u8sub = string( "わかりました" ); 531 | 532 | if ( u8.utf8_find( u8sub ) == std::string::npos ) 533 | { 534 | return 40; 535 | } 536 | 537 | } 538 | catch ( const std::invalid_argument& ) 539 | { 540 | return 41; 541 | } 542 | } 543 | 544 | // Concatenate strings 545 | { 546 | UTF8string strex1( "がんばつて Gumichan" ); 547 | UTF8string strex2( "Gumichanがんばつて " ); 548 | UTF8string ganba( "がんばつて " ); 549 | std::string gumi( "Gumichan" ); 550 | 551 | const char * gumistr = "Gumichan"; 552 | 553 | // concatenate std::string and UTF8string 554 | if ( strex1 != ( ganba + gumi ) ) 555 | return 42; 556 | 557 | if ( strex2 != ( gumi + ganba ) ) 558 | return 43; 559 | 560 | if ( ( gumi.size() + 
ganba.utf8_size() ) != 561 | ( gumi + ganba ).utf8_size() ) 562 | return 44; 563 | 564 | if ( ( ganba.utf8_size() + gumi.size() ) != 565 | ( ganba + gumi ).utf8_size() ) 566 | return 45; 567 | 568 | // concatenate UTF8string and const char * 569 | if ( strex2 != ( gumistr + ganba ) ) 570 | return 46; 571 | 572 | if ( strex1 != ( ganba + gumistr ) ) 573 | return 46; 574 | 575 | if ( ( std::strlen( gumistr ) + ganba.utf8_size() ) != 576 | ( gumistr + ganba ).utf8_size() ) 577 | return 48; 578 | 579 | 580 | // concatenate 2 UTF8string objects 581 | UTF8string gumiex( gumi ); 582 | if ( strex1 != ( ganba + gumiex ) ) 583 | return 49; 584 | 585 | if ( ( ganba.utf8_size() + gumiex.utf8_size() ) != 586 | ( ganba + gumiex ).utf8_size() ) 587 | return 50; 588 | 589 | if ( ( ganba.utf8_length() + gumiex.utf8_length() ) != 590 | ( ganba + gumiex ).utf8_length() ) 591 | return 51; 592 | 593 | if ( strex2 != ( gumiex + ganba ) ) 594 | return 52; 595 | 596 | if ( ( gumiex.utf8_size() + ganba.utf8_size() ) != 597 | ( gumiex + ganba ).utf8_size() ) 598 | return 53; 599 | 600 | if ( ( gumiex.utf8_length() + ganba.utf8_length() ) != 601 | ( gumiex + ganba ).utf8_length() ) 602 | return 54; 603 | } 604 | 605 | // Append strings 606 | { 607 | UTF8string strex1( "Gumichan がんばつて" ); 608 | UTF8string strex2( "Gumichan がんばつて!" 
); 609 | UTF8string gumi( "Gumichan" ); 610 | 611 | gumi += " がんばつて"; 612 | 613 | if ( gumi != strex1 ) 614 | return 55; 615 | 616 | strex1 += "!"; 617 | 618 | if ( strex1 != strex2 ) 619 | return 56; 620 | } 621 | 622 | // Get the codepoint at a specified position 623 | { 624 | UTF8string astr( "Gumichan" ); 625 | UTF8string str( "がんばつて Gumichan" ); 626 | std::string gcpoint = "G"; 627 | std::string ucpoint = "u"; 628 | std::string ncpoint = "n"; 629 | std::string tcpoint = "て"; 630 | 631 | try 632 | { 633 | str.utf8_at( 42 ); 634 | return 57; 635 | } 636 | catch ( std::out_of_range& ) {} 637 | 638 | std::string c0 = astr[1]; 639 | std::string cc = astr[0]; 640 | std::string ccc = astr.utf8_at( astr.utf8_size() - 1 ); 641 | std::string c1 = str[4]; 642 | std::string c2 = str[7]; 643 | 644 | if ( ucpoint != c0 ) 645 | { 646 | std::cout << "expected: " << ucpoint << "; got: " << c0 << "\n"; 647 | return 58; 648 | } 649 | 650 | if ( gcpoint != cc ) 651 | { 652 | std::cout << "expected: " << gcpoint << "; got: " << cc << "\n"; 653 | return 59; 654 | } 655 | 656 | if ( ncpoint != ccc ) 657 | { 658 | std::cout << "expected: " << ncpoint << ";got: " << ccc << "\n"; 659 | return 60; 660 | } 661 | 662 | if ( tcpoint != c1 ) 663 | { 664 | std::cout << "expected: " << tcpoint << ";got: " << c1 << "\n"; 665 | return 61; 666 | } 667 | 668 | if ( ucpoint != c2 ) 669 | { 670 | std::cout << "expected: " << ucpoint << ";got: " << c2 << "\n"; 671 | return 62; 672 | } 673 | 674 | std::string point = "。"; 675 | UTF8string u8str( "łþø けいかん: いいえ、どういたしまして。" ); 676 | std::string c = u8str.utf8_at( 22 ); 677 | 678 | if ( point != c ) 679 | { 680 | std::cout << "expected: " << point << ";got : " << c << "\n"; 681 | return 63; 682 | } 683 | } 684 | 685 | // Reverse string using iterator 686 | { 687 | UTF8string utf8orig( "がんばつて Gumichan" ); 688 | UTF8string utf8str( "がんばつて Gumichan" ); 689 | 690 | // Bijection 691 | if ( utf8orig != ( utf8str.utf8_reverse() ).utf8_reverse() ) 692 | { 693 | 
std::cout << "expected: " << utf8orig << "; got: " 694 | << ( utf8str.utf8_reverse() ).utf8_reverse() << "\n"; 695 | return 64; 696 | } 697 | 698 | if ( utf8orig.utf8_size() != ( utf8str.utf8_reverse() ).utf8_size() ) 699 | { 700 | std::cout << "expected: " << utf8orig.utf8_size() << "; got: " 701 | << ( utf8str.utf8_reverse() ).utf8_size() << "\n"; 702 | return 65; 703 | } 704 | 705 | if ( utf8orig.utf8_length() != ( utf8str.utf8_reverse() ).utf8_length() ) 706 | { 707 | std::cout << "expected: " << utf8orig.utf8_length() << "; got: " 708 | << ( utf8str.utf8_reverse() ).utf8_length() << "\n"; 709 | return 66; 710 | } 711 | } 712 | 713 | // Remove the last code point 714 | { 715 | UTF8string str( "がんばつて Gumichan" ); 716 | UTF8string str2( "がんばつて Gumichan がんばつて" ); 717 | UTF8string strpop( "がんばつて Gumicha" ); 718 | UTF8string strempty; 719 | 720 | str.utf8_pop(); 721 | 722 | if ( str.utf8_length() != strpop.utf8_length() ) 723 | { 724 | std::cout << "Excpected : " << strpop.utf8_length() << ";got : " 725 | << str.utf8_length() << "\n"; 726 | return 67; 727 | } 728 | 729 | if ( str != strpop ) 730 | { 731 | std::cout << "Excpected : " << strpop << ";got : " 732 | << str << "\n"; 733 | return 68; 734 | } 735 | 736 | str.utf8_clear(); 737 | str = "がんばつて Gumichan がんばつて"; 738 | 739 | str2.utf8_pop(); 740 | str2 += "て"; 741 | 742 | if ( str2.utf8_length() != str.utf8_length() ) 743 | { 744 | std::cout << "Excpected : " << str.utf8_length() << ";got : " 745 | << str2.utf8_length() << "\n"; 746 | return 69; 747 | } 748 | 749 | if ( str != str2 ) 750 | { 751 | std::cout << "Excpected : " << str2 << ";got : " 752 | << str << "\n"; 753 | return 70; 754 | } 755 | 756 | try 757 | { 758 | strempty.utf8_pop(); 759 | return 71; 760 | 761 | } 762 | catch ( ... 
) {} 763 | 764 | } 765 | 766 | // Erase #1 767 | { 768 | UTF8string s = "This is an example"; 769 | cout << s << "\n"; 770 | 771 | s.utf8_erase( 0, 5 ); // Erase "This " 772 | cout << s << "\n"; 773 | 774 | s.utf8_erase( s.utf8_begin() + 2 ); // Erase ' ' 775 | cout << s << "\n"; 776 | 777 | s.utf8_erase( s.utf8_find( " " ) ); // Trim from ' ' to the end of the string 778 | cout << s << "\n"; 779 | } 780 | 781 | // Erase #2 782 | { 783 | UTF8string empty; 784 | UTF8string str1( "がんばつて Gumichan" ); 785 | UTF8string str2( "がんばつて Gumichan" ); 786 | UTF8string str3( "がんばつて Gumichan" ); 787 | UTF8string str4( "がんばつて Gumichan" ); 788 | UTF8string exp1( "がんばつて" ); 789 | UTF8string exp2( "Gumichan" ); 790 | UTF8string exp3( "がんばつてGumichan" ); 791 | 792 | try 793 | { 794 | empty.utf8_erase(); 795 | } 796 | catch ( ... ) 797 | { 798 | return 80; 799 | } 800 | 801 | try 802 | { 803 | empty.utf8_erase( 42, 1024 ); 804 | return 81; 805 | } 806 | catch ( const std::out_of_range& out ) 807 | { 808 | cout << out.what() << '\n'; 809 | } 810 | 811 | str1.utf8_erase( 5 ); 812 | str2.utf8_erase( 0, 6 ); 813 | str3.utf8_erase( str3.utf8_begin() + 5 ); 814 | str4.utf8_erase( str4.utf8_begin(), str4.utf8_begin() + 6 ); 815 | 816 | cout << str1 << "\n"; 817 | cout << str2 << "\n"; 818 | cout << str3 << "\n"; 819 | cout << str4 << "\n"; 820 | 821 | if ( str1 != exp1 ) 822 | 823 | return 76; 824 | if ( str2 != exp2 ) 825 | return 77; 826 | 827 | if ( str3 != exp3 ) 828 | return 78; 829 | 830 | if ( str4 != exp2 ) 831 | return 79; 832 | } 833 | 834 | { 835 | UTF8string hello( "hello" ); 836 | UTF8string hello2( hello ); 837 | UTF8string hellom( "heLlo" ); 838 | 839 | if ( hello.hash() != hello2.hash() ) 840 | { 841 | cerr << "1 - invalid hash function\n"; 842 | return 82; 843 | } 844 | 845 | if ( hello.hash() == hellom.hash() ) 846 | { 847 | cerr << "2 - invalid hash function\n"; 848 | return 83; 849 | } 850 | 851 | std::cout << "hash hello : " << std::hash()( hello ) << "\n"; 852 | 
std::cout << "hash heLlo : " << std::hash()( hellom ) << "\n"; 853 | } 854 | 855 | // Last test : search for a substring in a file 856 | { 857 | UTF8string text; 858 | UTF8string strreq; 859 | UTF8string strgumi( "がんばつて gumichan01。" ); 860 | std::ifstream u8file( "test/lipsum.txt" ); 861 | 862 | strreq += "速スご薄具そなラひ置更けゃっ文犬2社ぎル由人へいきつ回見ト供崩モ催屋エ国続セワルリ"; 863 | strreq += "謙髪テシ県住ざ新球ごくき名昨ツセ戸読役ホ細16態量番などトぱ。"; 864 | 865 | cout << "Find those strings: \n" << strgumi << "\n\n AND \n\n" 866 | << strreq << "\n\n"; 867 | 868 | if ( u8file.is_open() ) 869 | { 870 | try 871 | { 872 | text = std::string( std::istreambuf_iterator( u8file ), 873 | std::istreambuf_iterator() ); 874 | u8file.close(); 875 | } 876 | catch ( std::invalid_argument& e ) 877 | { 878 | cerr << e.what() << "\n"; 879 | u8file.close(); 880 | return 72; 881 | } 882 | 883 | cout << "File \n" << "name: lipsum.txt" 884 | << "\n" << "size: " << text.utf8_size() 885 | << "\n" << "Number of characters: " 886 | << text.utf8_length() << "\n\n"; 887 | 888 | cout << "1 - Find - " << strgumi << "\n\n"; 889 | size_t pgumi = text.utf8_find( strgumi ); 890 | 891 | if ( pgumi == UTF8string::npos ) 892 | { 893 | cerr << "The position of \"" << strgumi 894 | << "\" in the text must not be UTF8string::npos\n"; 895 | return 73; 896 | } 897 | cout << "1 - string from position " << pgumi << ": \n\n"; 898 | const UTF8string& u8found0 = text.utf8_substr( pgumi, 8192U ); 899 | cout << u8found0 << "\n\n"; 900 | } 901 | else 902 | { 903 | cerr << "File not found!\n"; 904 | return 74; 905 | } 906 | 907 | cout << "2 - Find - " << strreq << "\n\n"; 908 | size_t pos = text.utf8_find( strreq ); 909 | 910 | if ( pos == UTF8string::npos ) 911 | { 912 | cerr << "utf8_find: " << strreq << " not found!\n"; 913 | return 75; 914 | } 915 | 916 | cout << "string from position " << pos << ": \n\n"; 917 | const UTF8string& u8found = text.utf8_substr( pos, strreq.utf8_length() ); 918 | cout << u8found << "\n"; 919 | } 920 | 921 | return 0; 922 | } 923 | 
--------------------------------------------------------------------------------