├── test ├── .gitignore ├── CMakeLists.txt └── main.cpp ├── img ├── logo.png └── hello_world.png ├── clang-format.bash ├── CONTRIBUTING.md ├── .gitignore ├── README.md ├── LICENSE ├── .clang-format └── include └── unicode └── display_width.hpp /test/.gitignore: -------------------------------------------------------------------------------- 1 | build/ 2 | build_linux/ -------------------------------------------------------------------------------- /img/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/p-ranav/unicode_display_width/HEAD/img/logo.png -------------------------------------------------------------------------------- /img/hello_world.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/p-ranav/unicode_display_width/HEAD/img/hello_world.png -------------------------------------------------------------------------------- /clang-format.bash: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | find ./include -type f \( -iname \*.cpp -o -iname \*.hpp \) | xargs clang-format -style=file -i 3 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing 2 | Contributions are welcomed. Open a pull-request or an issue. 3 | 4 | ## Code of conduct 5 | This project adheres to the [Open Code of Conduct][code-of-conduct]. By participating, you are expected to honor this code. 
6 | 7 | [code-of-conduct]: https://github.com/spotify/code-of-conduct/blob/master/code-of-conduct.md 8 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Prerequisites 2 | *.d 3 | 4 | # Compiled Object files 5 | *.slo 6 | *.lo 7 | *.o 8 | *.obj 9 | 10 | # Precompiled Headers 11 | *.gch 12 | *.pch 13 | 14 | # Compiled Dynamic libraries 15 | *.so 16 | *.dylib 17 | *.dll 18 | 19 | # Fortran module files 20 | *.mod 21 | *.smod 22 | 23 | # Compiled Static libraries 24 | *.lai 25 | *.la 26 | *.a 27 | *.lib 28 | 29 | # Executables 30 | *.exe 31 | *.out 32 | *.app 33 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |

2 | display_width 3 |

4 | 5 | Cross-platform single-header library to calculate the display width of UTF-8 strings. 6 | 7 | ## Quick Start 8 | 9 | ```cpp 10 | #include <unicode/display_width.hpp> 11 | 12 | int main() { 13 | const std::string input = u8"Hello, world!"; 14 | 15 | // Calculate display width 16 | const auto result = unicode::display_width(input); 17 | 18 | // Verify result 19 | std::cout << "Input : " << input << "\n"; 20 | std::cout << "Output : " << std::string(result, '|') << "\n"; 21 | std::cout << "Width : " << result << "\n\n"; 22 | } 23 | ``` 24 | 25 |

26 | hello_world 27 |

28 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Pranav Srinivas Kumar 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
-------------------------------------------------------------------------------- /test/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.6) 2 | project(UNICODE_DISPLAY_WIDTH) 3 | 4 | if(MSVC) 5 | # Force to always compile with W4 6 | if(CMAKE_CXX_FLAGS MATCHES "/W[0-4]") 7 | string(REGEX REPLACE "/W[0-4]" "/W4" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") 8 | else() 9 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /W4") 10 | endif() 11 | elseif(CMAKE_COMPILER_IS_GNUCC OR CMAKE_COMPILER_IS_GNUCXX) 12 | # Update if necessary 13 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wno-long-long -pedantic") 14 | endif() 15 | 16 | if(NOT CMAKE_BUILD_TYPE) 17 | set(CMAKE_BUILD_TYPE Release) 18 | endif() 19 | 20 | # Disable deprecation for windows 21 | if (WIN32) 22 | add_compile_definitions(_CRT_SECURE_NO_WARNINGS) 23 | endif() 24 | 25 | # UNICODE_DISPLAY_WIDTH executable 26 | file(GLOB UNICODE_DISPLAY_WIDTH_TEST_SOURCES 27 | main.cpp 28 | ) 29 | set_source_files_properties(main.cpp 30 | PROPERTIES 31 | COMPILE_DEFINITIONS DOCTEST_CONFIG_IMPLEMENT_WITH_MAIN) 32 | ADD_EXECUTABLE(UNICODE_DISPLAY_WIDTH ${UNICODE_DISPLAY_WIDTH_TEST_SOURCES}) 33 | INCLUDE_DIRECTORIES("../include" ".") 34 | set_target_properties(UNICODE_DISPLAY_WIDTH PROPERTIES OUTPUT_NAME tests) 35 | set_property(TARGET UNICODE_DISPLAY_WIDTH PROPERTY CXX_STANDARD 11) 36 | 37 | # Set ${PROJECT_NAME} as the startup project 38 | set_property(DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} PROPERTY VS_STARTUP_PROJECT UNICODE_DISPLAY_WIDTH) 39 | -------------------------------------------------------------------------------- /.clang-format: -------------------------------------------------------------------------------- 1 | --- 2 | Language: Cpp 3 | # BasedOnStyle: LLVM 4 | AccessModifierOffset: -2 5 | AlignAfterOpenBracket: Align 6 | AlignConsecutiveAssignments: false 7 | AlignConsecutiveDeclarations: false 8 | AlignEscapedNewlines: Right 9 | 
AlignOperands: true 10 | AlignTrailingComments: true 11 | AllowAllParametersOfDeclarationOnNextLine: true 12 | AllowShortBlocksOnASingleLine: false 13 | AllowShortCaseLabelsOnASingleLine: false 14 | AllowShortFunctionsOnASingleLine: All 15 | AllowShortIfStatementsOnASingleLine: false 16 | AllowShortLoopsOnASingleLine: false 17 | AlwaysBreakAfterDefinitionReturnType: None 18 | AlwaysBreakAfterReturnType: None 19 | AlwaysBreakBeforeMultilineStrings: false 20 | AlwaysBreakTemplateDeclarations: false 21 | BinPackArguments: true 22 | BinPackParameters: true 23 | BraceWrapping: 24 | AfterClass: false 25 | AfterControlStatement: false 26 | AfterEnum: false 27 | AfterFunction: false 28 | AfterNamespace: false 29 | AfterObjCDeclaration: false 30 | AfterStruct: false 31 | AfterUnion: false 32 | AfterExternBlock: false 33 | BeforeCatch: false 34 | BeforeElse: false 35 | IndentBraces: false 36 | SplitEmptyFunction: true 37 | SplitEmptyRecord: true 38 | SplitEmptyNamespace: true 39 | BreakBeforeBinaryOperators: None 40 | BreakBeforeBraces: Attach 41 | BreakBeforeInheritanceComma: false 42 | BreakBeforeTernaryOperators: true 43 | BreakConstructorInitializersBeforeComma: false 44 | BreakConstructorInitializers: BeforeColon 45 | BreakAfterJavaFieldAnnotations: false 46 | BreakStringLiterals: true 47 | ColumnLimit: 80 48 | CommentPragmas: '^ IWYU pragma:' 49 | CompactNamespaces: false 50 | ConstructorInitializerAllOnOneLineOrOnePerLine: false 51 | ConstructorInitializerIndentWidth: 4 52 | ContinuationIndentWidth: 4 53 | Cpp11BracedListStyle: true 54 | DerivePointerAlignment: false 55 | DisableFormat: false 56 | ExperimentalAutoDetectBinPacking: false 57 | FixNamespaceComments: true 58 | ForEachMacros: 59 | - foreach 60 | - Q_FOREACH 61 | - BOOST_FOREACH 62 | IncludeBlocks: Preserve 63 | IncludeCategories: 64 | - Regex: '^"(llvm|llvm-c|clang|clang-c)/' 65 | Priority: 2 66 | - Regex: '^(<|"(gtest|gmock|isl|json)/)' 67 | Priority: 3 68 | - Regex: '.*' 69 | Priority: 1 70 | 
IncludeIsMainRegex: '(Test)?$' 71 | IndentCaseLabels: false 72 | IndentPPDirectives: None 73 | IndentWidth: 2 74 | IndentWrappedFunctionNames: false 75 | JavaScriptQuotes: Leave 76 | JavaScriptWrapImports: true 77 | KeepEmptyLinesAtTheStartOfBlocks: true 78 | MacroBlockBegin: '' 79 | MacroBlockEnd: '' 80 | MaxEmptyLinesToKeep: 1 81 | NamespaceIndentation: None 82 | ObjCBlockIndentWidth: 2 83 | ObjCSpaceAfterProperty: false 84 | ObjCSpaceBeforeProtocolList: true 85 | PenaltyBreakAssignment: 2 86 | PenaltyBreakBeforeFirstCallParameter: 19 87 | PenaltyBreakComment: 300 88 | PenaltyBreakFirstLessLess: 120 89 | PenaltyBreakString: 1000 90 | PenaltyExcessCharacter: 1000000 91 | PenaltyReturnTypeOnItsOwnLine: 60 92 | PointerAlignment: Right 93 | RawStringFormats: 94 | - Language: TextProto 95 | Delimiters: 96 | - 'pb' 97 | - 'proto' 98 | BasedOnStyle: google 99 | ReflowComments: true 100 | SortIncludes: true 101 | SortUsingDeclarations: true 102 | SpaceAfterCStyleCast: false 103 | SpaceAfterTemplateKeyword: true 104 | SpaceBeforeAssignmentOperators: true 105 | SpaceBeforeParens: ControlStatements 106 | SpaceInEmptyParentheses: false 107 | SpacesBeforeTrailingComments: 1 108 | SpacesInAngles: false 109 | SpacesInContainerLiterals: true 110 | SpacesInCStyleCastParentheses: false 111 | SpacesInParentheses: false 112 | SpacesInSquareBrackets: false 113 | Standard: Cpp11 114 | TabWidth: 8 115 | UseTab: Never 116 | ... 
117 | 118 | -------------------------------------------------------------------------------- /test/main.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | using doctest::test_suite; 6 | 7 | inline void print_columns(const std::string& input, const int result) { 8 | std::cout << "Input : " << input << "\n"; 9 | std::cout << "Output : " << std::string(result, '|') << "\n"; 10 | std::cout << "Width : " << result << "\n\n"; 11 | } 12 | 13 | TEST_CASE("unicode::display_width correctly returns the display width of unicode strings" * test_suite("unicode::display_width")) { 14 | { 15 | const std::string input = "Hello, World!"; 16 | const auto result = unicode::display_width(input); 17 | REQUIRE(result == 13); 18 | print_columns(input, result); 19 | } 20 | 21 | { 22 | const std::string input = u8"Hello, world!"; 23 | const auto result = unicode::display_width(input); 24 | REQUIRE(result == 23); 25 | print_columns(input, result); 26 | } 27 | 28 | { 29 | // Halfwidth and Fullwidth Forms 30 | const std::string input = u8"A B C D E F G H I J K L M N O P Q R S T U V W X Y Z"; 31 | const auto result = unicode::display_width(input); 32 | REQUIRE(result == 77); 33 | print_columns(input, result); 34 | } 35 | 36 | { 37 | // Mathematical Operators 38 | const std::string input = u8"∀ ∁ ∂ ∃ ∄ ∅ ∆ ∇ ∈ ∉ ∊ ∋ ∌ ∍ ∎ ∏ ∐ ∑ − ∓ ∔ ∕ ∖ ∗ ∘ ∙ √ ∛ ∜"; 39 | const auto result = unicode::display_width(input); 40 | REQUIRE(result == 57); 41 | print_columns(input, result); 42 | } 43 | 44 | { 45 | // Amharic 46 | const std::string input = u8"እው ሰላም ነው. 
እንዴት ነህ?"; 47 | const auto result = unicode::display_width(input); 48 | REQUIRE(result == 19); 49 | print_columns(input, result); 50 | } 51 | 52 | { 53 | // French 54 | const std::string input = u8"Je t’aime"; 55 | const auto result = unicode::display_width(input); 56 | REQUIRE(result == 9); 57 | print_columns(input, result); 58 | } 59 | 60 | { 61 | // Greek 62 | const std::string input = u8"Σ΄αγαπώ (Se agapo)"; 63 | const auto result = unicode::display_width(input); 64 | REQUIRE(result == 18); 65 | print_columns(input, result); 66 | } 67 | 68 | { 69 | // Mandarin Chinese 70 | const std::string input = u8"我爱你"; 71 | const auto result = unicode::display_width(input); 72 | REQUIRE(result == 6); 73 | print_columns(input, result); 74 | } 75 | 76 | { 77 | // Chinese simplified 78 | const std::string input = u8"你好。 你好吗?"; 79 | const auto result = unicode::display_width(input); 80 | REQUIRE(result == 15); 81 | print_columns(input, result); 82 | } 83 | 84 | { 85 | // Japanese 86 | const std::string input = u8"愛してる"; 87 | const auto result = unicode::display_width(input); 88 | REQUIRE(result == 8); 89 | print_columns(input, result); 90 | } 91 | 92 | { 93 | // Korean 94 | const std::string input = u8"사랑해 (Saranghae)"; 95 | const auto result = unicode::display_width(input); 96 | REQUIRE(result == 18); 97 | print_columns(input, result); 98 | } 99 | 100 | { 101 | // Russian 102 | const std::string input = u8"Я тебя люблю (Ya tebya liubliu)"; 103 | const auto result = unicode::display_width(input); 104 | REQUIRE(result == 31); 105 | print_columns(input, result); 106 | } 107 | 108 | { 109 | // Hebrew 110 | const std::string input = u8"אני אוהב אותך (Ani ohev otakh)"; 111 | const auto result = unicode::display_width(input); 112 | REQUIRE(result == 30); 113 | print_columns(input, result); 114 | } 115 | 116 | { 117 | // Armenian 118 | const std::string input = u8"Ինչպես ես?"; 119 | const auto result = unicode::display_width(input); 120 | REQUIRE(result == 10); 121 | 
print_columns(input, result); 122 | } 123 | 124 | { 125 | // Tamil 126 | const std::string input = u8"நீங்கள் எப்படி இருக்கிறீர்கள்?"; 127 | const auto result = unicode::display_width(input); 128 | REQUIRE(result == 22); 129 | print_columns(input, result); 130 | } 131 | 132 | { 133 | // Punjabi 134 | const std::string input = u8"ਤੁਸੀਂ ਕਿਵੇਂ ਹੋ?"; 135 | const auto result = unicode::display_width(input); 136 | REQUIRE(result == 10); 137 | print_columns(input, result); 138 | } 139 | 140 | { 141 | // smiley emoji 142 | const std::string input = u8"😁"; 143 | const auto result = unicode::display_width(input); 144 | REQUIRE(result == 2); 145 | print_columns(input, result); 146 | } 147 | 148 | { 149 | // zero-width-joiner ZWJ 150 | const std::string input = u8"‍"; 151 | const auto result = unicode::display_width(input); 152 | REQUIRE(result == 0); 153 | print_columns(input, result); 154 | } 155 | 156 | { 157 | // Woman scientist emoji - explicit version 158 | // i.e. Woman emoji + zero-width-joiner (‍) + microscope emoji 159 | const std::string input = u8"👩\u200D🔬"; 160 | REQUIRE(input == u8"👩‍🔬"); 161 | const auto result = unicode::display_width(input); 162 | REQUIRE(result == 2); 163 | print_columns(input, result); 164 | } 165 | 166 | { 167 | // Woman scientist emoji - implicit version 168 | const std::string input = u8"👩‍🔬"; 169 | const auto result = unicode::display_width(input); 170 | REQUIRE(result == 2); 171 | print_columns(input, result); 172 | } 173 | 174 | { 175 | // "This is cool" 176 | const std::string input = u8"𝓽𝓱𝓲𝓼 𝓲𝓼 𝓬𝓸𝓸𝓵"; 177 | const auto result = unicode::display_width(input); 178 | REQUIRE(result == 12); 179 | print_columns(input, result); 180 | } 181 | 182 | { 183 | // Glasses of disapproval 184 | const std::string input = u8"(-■_■)"; 185 | const auto result = unicode::display_width(input); 186 | REQUIRE(result == 6); 187 | print_columns(input, result); 188 | } 189 | 190 | { 191 | // Right back at ya! 
192 | const std::string input = u8"(☞゚∀゚)☞"; 193 | const auto result = unicode::display_width(input); 194 | REQUIRE(result == 7); 195 | print_columns(input, result); 196 | } 197 | 198 | { 199 | // Equalizer 200 | const std::string input = u8"█ ▄ █ ▄ ▄ █ ▄ █ ▄ █"; 201 | const auto result = unicode::display_width(input); 202 | REQUIRE(result == 19); 203 | print_columns(input, result); 204 | } 205 | 206 | { 207 | // Animal Face 208 | const std::string input = u8"/人 ◕ ‿‿ ◕ 人\"; 209 | const auto result = unicode::display_width(input); 210 | REQUIRE(result == 16); 211 | print_columns(input, result); 212 | } 213 | 214 | { 215 | // Symbols 216 | const std::string input = u8"▣ ■ □ ▢ ◯ ▲ ▶ ► ▼ ◆ ◢ ◣ ◤ ◥"; 217 | const auto result = unicode::display_width(input); 218 | REQUIRE(result == 27); 219 | print_columns(input, result); 220 | } 221 | 222 | { 223 | // 1234 224 | const std::string input = "\u2081\u2082\u2083\u2084"; 225 | const auto result = unicode::display_width(input); 226 | REQUIRE(result == 4); 227 | print_columns(input, result); 228 | } 229 | 230 | { 231 | // wstring 232 | const std::wstring input = L"Μένω στους Παξούς"; 233 | const auto result = unicode::display_width(input); 234 | REQUIRE(result == 17); 235 | } 236 | 237 | { 238 | // ANSI escape characters wrapping a red and (blinking) green 'XX' 239 | const std::string input = "\x1b[31mX\x1b[23;32mX\x1b[0m"; 240 | const auto result = unicode::display_width(input); 241 | REQUIRE(result == 2); 242 | print_columns(input, result); 243 | } 244 | } -------------------------------------------------------------------------------- /include/unicode/display_width.hpp: -------------------------------------------------------------------------------- 1 | #ifndef UNICODE_DISPLAY_WIDTH_H 2 | #define UNICODE_DISPLAY_WIDTH_H 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | namespace unicode { 11 | 12 | namespace details { 13 | 14 | /* 15 | * This is an implementation of wcwidth() and 
wcswidth() (defined in 16 | * IEEE Std 1003.1-2001) for Unicode. 17 | * 18 | * http://www.opengroup.org/onlinepubs/007904975/functions/wcwidth.html 19 | * http://www.opengroup.org/onlinepubs/007904975/functions/wcswidth.html 20 | * 21 | * In fixed-width output devices, Latin characters all occupy a single 22 | * "cell" position of equal width, whereas ideographic CJK characters 23 | * occupy two such cells. Interoperability between terminal-line 24 | * applications and (teletype-style) character terminals using the 25 | * UTF-8 encoding requires agreement on which character should advance 26 | * the cursor by how many cell positions. No established formal 27 | * standards exist at present on which Unicode character shall occupy 28 | * how many cell positions on character terminals. These routines are 29 | * a first attempt at defining such behavior based on simple rules 30 | * applied to data provided by the Unicode Consortium. 31 | * 32 | * For some graphical characters, the Unicode standard explicitly 33 | * defines a character-cell width via the definition of the East Asian 34 | * FullWidth (F), Wide (W), Half-width (H), and Narrow (Na) classes. 35 | * In all these cases, there is no ambiguity about which width a 36 | * terminal shall use. For characters in the East Asian Ambiguous (A) 37 | * class, the width choice depends purely on a preference of backward 38 | * compatibility with either historic CJK or Western practice. 39 | * Choosing single-width for these characters is easy to justify as 40 | * the appropriate long-term solution, as the CJK practice of 41 | * displaying these characters as double-width comes from historic 42 | * implementation simplicity (8-bit encoded characters were displayed 43 | * single-width and 16-bit ones double-width, even for Greek, 44 | * Cyrillic, etc.) and not any typographic considerations. 45 | * 46 | * Much less clear is the choice of width for the Not East Asian 47 | * (Neutral) class. 
Existing practice does not dictate a width for any 48 | * of these characters. It would nevertheless make sense 49 | * typographically to allocate two character cells to characters such 50 | * as for instance EM SPACE or VOLUME INTEGRAL, which cannot be 51 | * represented adequately with a single-width glyph. The following 52 | * routines at present merely assign a single-cell width to all 53 | * neutral characters, in the interest of simplicity. This is not 54 | * entirely satisfactory and should be reconsidered before 55 | * establishing a formal standard in this area. At the moment, the 56 | * decision which Not East Asian (Neutral) characters should be 57 | * represented by double-width glyphs cannot yet be answered by 58 | * applying a simple rule from the Unicode database content. Setting 59 | * up a proper standard for the behavior of UTF-8 character terminals 60 | * will require a careful analysis not only of each Unicode character, 61 | * but also of each presentation form, something the author of these 62 | * routines has avoided to do so far. 63 | * 64 | * http://www.unicode.org/unicode/reports/tr11/ 65 | * 66 | * Markus Kuhn -- 2007-05-26 (Unicode 5.0) 67 | * 68 | * Permission to use, copy, modify, and distribute this software 69 | * for any purpose and without fee is hereby granted. The author 70 | * disclaims all warranties with regard to this software. 
71 | * 72 | * Latest version: http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c 73 | */ 74 | 75 | struct interval { 76 | int first; 77 | int last; 78 | }; 79 | 80 | /* auxiliary function for binary search in interval table */ 81 | static int bisearch(wchar_t ucs, const struct interval* table, int max) { 82 | int min = 0; 83 | int mid; 84 | 85 | if (ucs < table[0].first || ucs > table[max].last) 86 | return 0; 87 | while (max >= min) { 88 | mid = (min + max) / 2; 89 | if (ucs > table[mid].last) 90 | min = mid + 1; 91 | else if (ucs < table[mid].first) 92 | max = mid - 1; 93 | else 94 | return 1; 95 | } 96 | 97 | return 0; 98 | } 99 | 100 | 101 | /* The following two functions define the column width of an ISO 10646 102 | * character as follows: 103 | * 104 | * - The null character (U+0000) has a column width of 0. 105 | * 106 | * - Other C0/C1 control characters and DEL will lead to a return 107 | * value of -1. 108 | * 109 | * - Non-spacing and enclosing combining characters (general 110 | * category code Mn or Me in the Unicode database) have a 111 | * column width of 0. 112 | * 113 | * - SOFT HYPHEN (U+00AD) has a column width of 1. 114 | * 115 | * - Other format characters (general category code Cf in the Unicode 116 | * database) and ZERO WIDTH SPACE (U+200B) have a column width of 0. 117 | * 118 | * - Hangul Jamo medial vowels and final consonants (U+1160-U+11FF) 119 | * have a column width of 0. 120 | * 121 | * - Spacing characters in the East Asian Wide (W) or East Asian 122 | * Full-width (F) category as defined in Unicode Technical 123 | * Report #11 have a column width of 2. 124 | * 125 | * - All remaining characters (including all printable 126 | * ISO 8859-1 and WGL4 characters, Unicode control characters, 127 | * etc.) have a column width of 1. 128 | * 129 | * This implementation assumes that wchar_t characters are encoded 130 | * in ISO 10646. 
131 | */ 132 | 133 | inline int mk_wcwidth(wchar_t ucs) 134 | { 135 | /* sorted list of non-overlapping intervals of non-spacing characters */ 136 | /* generated by "uniset +cat=Me +cat=Mn +cat=Cf -00AD +1160-11FF +200B c" */ 137 | static const struct interval combining[] = { 138 | { 0x0300, 0x036F }, { 0x0483, 0x0486 }, { 0x0488, 0x0489 }, 139 | { 0x0591, 0x05BD }, { 0x05BF, 0x05BF }, { 0x05C1, 0x05C2 }, 140 | { 0x05C4, 0x05C5 }, { 0x05C7, 0x05C7 }, { 0x0600, 0x0603 }, 141 | { 0x0610, 0x0615 }, { 0x064B, 0x065E }, { 0x0670, 0x0670 }, 142 | { 0x06D6, 0x06E4 }, { 0x06E7, 0x06E8 }, { 0x06EA, 0x06ED }, 143 | { 0x070F, 0x070F }, { 0x0711, 0x0711 }, { 0x0730, 0x074A }, 144 | { 0x07A6, 0x07B0 }, { 0x07EB, 0x07F3 }, { 0x0901, 0x0902 }, 145 | { 0x093C, 0x093C }, { 0x0941, 0x0948 }, { 0x094D, 0x094D }, 146 | { 0x0951, 0x0954 }, { 0x0962, 0x0963 }, { 0x0981, 0x0981 }, 147 | { 0x09BC, 0x09BC }, { 0x09C1, 0x09C4 }, { 0x09CD, 0x09CD }, 148 | { 0x09E2, 0x09E3 }, { 0x0A01, 0x0A02 }, { 0x0A3C, 0x0A3C }, 149 | { 0x0A41, 0x0A42 }, { 0x0A47, 0x0A48 }, { 0x0A4B, 0x0A4D }, 150 | { 0x0A70, 0x0A71 }, { 0x0A81, 0x0A82 }, { 0x0ABC, 0x0ABC }, 151 | { 0x0AC1, 0x0AC5 }, { 0x0AC7, 0x0AC8 }, { 0x0ACD, 0x0ACD }, 152 | { 0x0AE2, 0x0AE3 }, { 0x0B01, 0x0B01 }, { 0x0B3C, 0x0B3C }, 153 | { 0x0B3F, 0x0B3F }, { 0x0B41, 0x0B43 }, { 0x0B4D, 0x0B4D }, 154 | { 0x0B56, 0x0B56 }, { 0x0B82, 0x0B82 }, { 0x0BC0, 0x0BC0 }, 155 | { 0x0BCD, 0x0BCD }, { 0x0C3E, 0x0C40 }, { 0x0C46, 0x0C48 }, 156 | { 0x0C4A, 0x0C4D }, { 0x0C55, 0x0C56 }, { 0x0CBC, 0x0CBC }, 157 | { 0x0CBF, 0x0CBF }, { 0x0CC6, 0x0CC6 }, { 0x0CCC, 0x0CCD }, 158 | { 0x0CE2, 0x0CE3 }, { 0x0D41, 0x0D43 }, { 0x0D4D, 0x0D4D }, 159 | { 0x0DCA, 0x0DCA }, { 0x0DD2, 0x0DD4 }, { 0x0DD6, 0x0DD6 }, 160 | { 0x0E31, 0x0E31 }, { 0x0E34, 0x0E3A }, { 0x0E47, 0x0E4E }, 161 | { 0x0EB1, 0x0EB1 }, { 0x0EB4, 0x0EB9 }, { 0x0EBB, 0x0EBC }, 162 | { 0x0EC8, 0x0ECD }, { 0x0F18, 0x0F19 }, { 0x0F35, 0x0F35 }, 163 | { 0x0F37, 0x0F37 }, { 0x0F39, 0x0F39 }, { 0x0F71, 0x0F7E }, 164 | 
{ 0x0F80, 0x0F84 }, { 0x0F86, 0x0F87 }, { 0x0F90, 0x0F97 }, 165 | { 0x0F99, 0x0FBC }, { 0x0FC6, 0x0FC6 }, { 0x102D, 0x1030 }, 166 | { 0x1032, 0x1032 }, { 0x1036, 0x1037 }, { 0x1039, 0x1039 }, 167 | { 0x1058, 0x1059 }, { 0x1160, 0x11FF }, { 0x135F, 0x135F }, 168 | { 0x1712, 0x1714 }, { 0x1732, 0x1734 }, { 0x1752, 0x1753 }, 169 | { 0x1772, 0x1773 }, { 0x17B4, 0x17B5 }, { 0x17B7, 0x17BD }, 170 | { 0x17C6, 0x17C6 }, { 0x17C9, 0x17D3 }, { 0x17DD, 0x17DD }, 171 | { 0x180B, 0x180D }, { 0x18A9, 0x18A9 }, { 0x1920, 0x1922 }, 172 | { 0x1927, 0x1928 }, { 0x1932, 0x1932 }, { 0x1939, 0x193B }, 173 | { 0x1A17, 0x1A18 }, { 0x1B00, 0x1B03 }, { 0x1B34, 0x1B34 }, 174 | { 0x1B36, 0x1B3A }, { 0x1B3C, 0x1B3C }, { 0x1B42, 0x1B42 }, 175 | { 0x1B6B, 0x1B73 }, { 0x1DC0, 0x1DCA }, { 0x1DFE, 0x1DFF }, 176 | { 0x200B, 0x200F }, { 0x202A, 0x202E }, { 0x2060, 0x2063 }, 177 | { 0x206A, 0x206F }, { 0x20D0, 0x20EF }, { 0x302A, 0x302F }, 178 | { 0x3099, 0x309A }, { 0xA806, 0xA806 }, { 0xA80B, 0xA80B }, 179 | { 0xA825, 0xA826 }, { 0xFB1E, 0xFB1E }, { 0xFE00, 0xFE0F }, 180 | { 0xFE20, 0xFE23 }, { 0xFEFF, 0xFEFF }, { 0xFFF9, 0xFFFB }, 181 | { 0x10A01, 0x10A03 }, { 0x10A05, 0x10A06 }, { 0x10A0C, 0x10A0F }, 182 | { 0x10A38, 0x10A3A }, { 0x10A3F, 0x10A3F }, { 0x1D167, 0x1D169 }, 183 | { 0x1D173, 0x1D182 }, { 0x1D185, 0x1D18B }, { 0x1D1AA, 0x1D1AD }, 184 | { 0x1D242, 0x1D244 }, { 0xE0001, 0xE0001 }, { 0xE0020, 0xE007F }, 185 | { 0xE0100, 0xE01EF } 186 | }; 187 | 188 | /* test for 8-bit control characters */ 189 | if (ucs == 0) 190 | return 0; 191 | 192 | /* detect ansi escape sequence */ 193 | if (ucs == 0x1b) 194 | return -1; 195 | 196 | /* detect zero-width-joiner (ZWJ) e.g. 
used to combine emojis like flags */ 197 | if (ucs == 0x200D) 198 | return -2; 199 | 200 | if (ucs < 32 || (ucs >= 0x7f && ucs < 0xa0)) 201 | return -10; 202 | 203 | /* binary search in table of non-spacing characters */ 204 | if (bisearch(ucs, combining, 205 | sizeof(combining) / sizeof(struct interval) - 1)) 206 | return 0; 207 | 208 | /* if we arrive here, ucs is not a combining or C0/C1 control character */ 209 | 210 | return 1 + 211 | (ucs >= 0x1100 && 212 | (ucs <= 0x115f || /* Hangul Jamo init. consonants */ 213 | ucs == 0x2329 || ucs == 0x232a || 214 | (ucs >= 0x2e80 && ucs <= 0xa4cf && 215 | ucs != 0x303f) || /* CJK ... Yi */ 216 | (ucs >= 0xac00 && ucs <= 0xd7a3) || /* Hangul Syllables */ 217 | (ucs >= 0xf900 && ucs <= 0xfaff) || /* CJK Compatibility Ideographs */ 218 | (ucs >= 0xfe10 && ucs <= 0xfe19) || /* Vertical forms */ 219 | (ucs >= 0xfe30 && ucs <= 0xfe6f) || /* CJK Compatibility Forms */ 220 | (ucs >= 0xff00 && ucs <= 0xff60) || /* Fullwidth Forms */ 221 | (ucs >= 0xffe0 && ucs <= 0xffe6) || 222 | (ucs >= 0x20000 && ucs <= 0x2fffd) || // CJK 223 | (ucs >= 0x30000 && ucs <= 0x3fffd) || 224 | // Miscellaneous Symbols and Pictographs + Emoticons: 225 | (ucs >= 0x1f300 && ucs <= 0x1f64f) || 226 | // Supplemental Symbols and Pictographs: 227 | (ucs >= 0x1f900 && ucs <= 0x1f9ff))); 228 | } 229 | 230 | 231 | inline int mk_wcswidth(const wchar_t* pwcs, size_t n) 232 | { 233 | int w, width = 0; 234 | 235 | int last_code_point_width = 0; 236 | bool in_escape_sequence = false; 237 | for (; *pwcs && n-- > 0; pwcs++) { 238 | if ((w = mk_wcwidth(*pwcs)) < 0) { 239 | switch (w) { 240 | case -1: // detected ansi escape sequence 241 | in_escape_sequence = true; 242 | last_code_point_width = 0; 243 | continue; 244 | case -2: // detected zero-width-joiner 245 | if (last_code_point_width >= 0) { 246 | width -= last_code_point_width; 247 | } else { 248 | last_code_point_width = 0; 249 | } 250 | continue; 251 | default: 252 | return -1; 253 | } 254 | } else { 255 | if 
(in_escape_sequence) { 256 | if (std::iswalpha(*pwcs)) { // N.B. usually an 'm' 257 | in_escape_sequence = false; 258 | } 259 | } else { 260 | width += w; 261 | last_code_point_width = w; 262 | } 263 | } 264 | } 265 | 266 | return width; 267 | } 268 | 269 | 270 | /* 271 | * The following functions are the same as mk_wcwidth() and 272 | * mk_wcswidth(), except that spacing characters in the East Asian 273 | * Ambiguous (A) category as defined in Unicode Technical Report #11 274 | * have a column width of 2. This variant might be useful for users of 275 | * CJK legacy encodings who want to migrate to UCS without changing 276 | * the traditional terminal character-width behaviour. It is not 277 | * otherwise recommended for general use. 278 | */ 279 | inline int mk_wcwidth_cjk(wchar_t ucs) 280 | { 281 | /* sorted list of non-overlapping intervals of East Asian Ambiguous 282 | * characters, generated by "uniset +WIDTH-A -cat=Me -cat=Mn -cat=Cf c" */ 283 | static const struct interval ambiguous[] = { 284 | { 0x00A1, 0x00A1 }, { 0x00A4, 0x00A4 }, { 0x00A7, 0x00A8 }, 285 | { 0x00AA, 0x00AA }, { 0x00AE, 0x00AE }, { 0x00B0, 0x00B4 }, 286 | { 0x00B6, 0x00BA }, { 0x00BC, 0x00BF }, { 0x00C6, 0x00C6 }, 287 | { 0x00D0, 0x00D0 }, { 0x00D7, 0x00D8 }, { 0x00DE, 0x00E1 }, 288 | { 0x00E6, 0x00E6 }, { 0x00E8, 0x00EA }, { 0x00EC, 0x00ED }, 289 | { 0x00F0, 0x00F0 }, { 0x00F2, 0x00F3 }, { 0x00F7, 0x00FA }, 290 | { 0x00FC, 0x00FC }, { 0x00FE, 0x00FE }, { 0x0101, 0x0101 }, 291 | { 0x0111, 0x0111 }, { 0x0113, 0x0113 }, { 0x011B, 0x011B }, 292 | { 0x0126, 0x0127 }, { 0x012B, 0x012B }, { 0x0131, 0x0133 }, 293 | { 0x0138, 0x0138 }, { 0x013F, 0x0142 }, { 0x0144, 0x0144 }, 294 | { 0x0148, 0x014B }, { 0x014D, 0x014D }, { 0x0152, 0x0153 }, 295 | { 0x0166, 0x0167 }, { 0x016B, 0x016B }, { 0x01CE, 0x01CE }, 296 | { 0x01D0, 0x01D0 }, { 0x01D2, 0x01D2 }, { 0x01D4, 0x01D4 }, 297 | { 0x01D6, 0x01D6 }, { 0x01D8, 0x01D8 }, { 0x01DA, 0x01DA }, 298 | { 0x01DC, 0x01DC }, { 0x0251, 0x0251 }, { 0x0261, 0x0261 
}, 299 | { 0x02C4, 0x02C4 }, { 0x02C7, 0x02C7 }, { 0x02C9, 0x02CB }, 300 | { 0x02CD, 0x02CD }, { 0x02D0, 0x02D0 }, { 0x02D8, 0x02DB }, 301 | { 0x02DD, 0x02DD }, { 0x02DF, 0x02DF }, { 0x0391, 0x03A1 }, 302 | { 0x03A3, 0x03A9 }, { 0x03B1, 0x03C1 }, { 0x03C3, 0x03C9 }, 303 | { 0x0401, 0x0401 }, { 0x0410, 0x044F }, { 0x0451, 0x0451 }, 304 | { 0x2010, 0x2010 }, { 0x2013, 0x2016 }, { 0x2018, 0x2019 }, 305 | { 0x201C, 0x201D }, { 0x2020, 0x2022 }, { 0x2024, 0x2027 }, 306 | { 0x2030, 0x2030 }, { 0x2032, 0x2033 }, { 0x2035, 0x2035 }, 307 | { 0x203B, 0x203B }, { 0x203E, 0x203E }, { 0x2074, 0x2074 }, 308 | { 0x207F, 0x207F }, { 0x2081, 0x2084 }, { 0x20AC, 0x20AC }, 309 | { 0x2103, 0x2103 }, { 0x2105, 0x2105 }, { 0x2109, 0x2109 }, 310 | { 0x2113, 0x2113 }, { 0x2116, 0x2116 }, { 0x2121, 0x2122 }, 311 | { 0x2126, 0x2126 }, { 0x212B, 0x212B }, { 0x2153, 0x2154 }, 312 | { 0x215B, 0x215E }, { 0x2160, 0x216B }, { 0x2170, 0x2179 }, 313 | { 0x2190, 0x2199 }, { 0x21B8, 0x21B9 }, { 0x21D2, 0x21D2 }, 314 | { 0x21D4, 0x21D4 }, { 0x21E7, 0x21E7 }, { 0x2200, 0x2200 }, 315 | { 0x2202, 0x2203 }, { 0x2207, 0x2208 }, { 0x220B, 0x220B }, 316 | { 0x220F, 0x220F }, { 0x2211, 0x2211 }, { 0x2215, 0x2215 }, 317 | { 0x221A, 0x221A }, { 0x221D, 0x2220 }, { 0x2223, 0x2223 }, 318 | { 0x2225, 0x2225 }, { 0x2227, 0x222C }, { 0x222E, 0x222E }, 319 | { 0x2234, 0x2237 }, { 0x223C, 0x223D }, { 0x2248, 0x2248 }, 320 | { 0x224C, 0x224C }, { 0x2252, 0x2252 }, { 0x2260, 0x2261 }, 321 | { 0x2264, 0x2267 }, { 0x226A, 0x226B }, { 0x226E, 0x226F }, 322 | { 0x2282, 0x2283 }, { 0x2286, 0x2287 }, { 0x2295, 0x2295 }, 323 | { 0x2299, 0x2299 }, { 0x22A5, 0x22A5 }, { 0x22BF, 0x22BF }, 324 | { 0x2312, 0x2312 }, { 0x2460, 0x24E9 }, { 0x24EB, 0x254B }, 325 | { 0x2550, 0x2573 }, { 0x2580, 0x258F }, { 0x2592, 0x2595 }, 326 | { 0x25A0, 0x25A1 }, { 0x25A3, 0x25A9 }, { 0x25B2, 0x25B3 }, 327 | { 0x25B6, 0x25B7 }, { 0x25BC, 0x25BD }, { 0x25C0, 0x25C1 }, 328 | { 0x25C6, 0x25C8 }, { 0x25CB, 0x25CB }, { 0x25CE, 0x25D1 }, 329 | { 0x25E2, 
0x25E5 }, { 0x25EF, 0x25EF }, { 0x2605, 0x2606 }, 330 | { 0x2609, 0x2609 }, { 0x260E, 0x260F }, { 0x2614, 0x2615 }, 331 | { 0x261C, 0x261C }, { 0x261E, 0x261E }, { 0x2640, 0x2640 }, 332 | { 0x2642, 0x2642 }, { 0x2660, 0x2661 }, { 0x2663, 0x2665 }, 333 | { 0x2667, 0x266A }, { 0x266C, 0x266D }, { 0x266F, 0x266F }, 334 | { 0x273D, 0x273D }, { 0x2776, 0x277F }, { 0xE000, 0xF8FF }, 335 | { 0xFFFD, 0xFFFD }, { 0xF0000, 0xFFFFD }, { 0x100000, 0x10FFFD } 336 | }; 337 | 338 | /* binary search in table of non-spacing characters */ 339 | if (bisearch(ucs, ambiguous, 340 | sizeof(ambiguous) / sizeof(struct interval) - 1)) 341 | return 2; 342 | 343 | return mk_wcwidth(ucs); 344 | } 345 | 346 | 347 | inline int mk_wcswidth_cjk(const wchar_t* pwcs, size_t n) 348 | { 349 | int w, width = 0; 350 | 351 | for (; *pwcs && n-- > 0; pwcs++) 352 | if ((w = mk_wcwidth_cjk(*pwcs)) < 0) 353 | return -1; 354 | else 355 | width += w; 356 | 357 | return width; 358 | } 359 | 360 | // convert UTF-8 string to wstring 361 | inline std::wstring utf8_decode(const std::string& str) { 362 | std::wstring_convert> myconv; 363 | return myconv.from_bytes(str); 364 | } 365 | 366 | // convert wstring to UTF-8 string 367 | inline std::string utf8_encode(const std::wstring& str) { 368 | std::wstring_convert> myconv; 369 | return myconv.to_bytes(str); 370 | } 371 | 372 | } 373 | 374 | inline int display_width(const std::string& input) { 375 | using namespace unicode::details; 376 | return details::mk_wcswidth(utf8_decode(input).c_str(), input.size()); 377 | } 378 | 379 | inline int display_width(const std::wstring& input) { 380 | return details::mk_wcswidth(input.c_str(), input.size()); 381 | } 382 | 383 | } 384 | #endif // UNICODE_DISPLAY_WIDTH_H 385 | --------------------------------------------------------------------------------