├── .clang-format ├── .clang-tidy ├── .editorconfig ├── .github ├── FUNDING.yml ├── labeler.yml └── workflows │ ├── build.yml │ ├── checks.yml │ └── labeler.yml ├── .gitignore ├── .vimspector.json ├── CMakeLists.txt ├── CMakePresets.json ├── Changelog.md ├── LICENSE ├── README.md ├── TODO.md ├── cmake ├── ClangTidy.cmake ├── EnableCcache.cmake ├── PedanticCompiler.cmake ├── ThirdParties.cmake └── presets │ ├── common.json │ ├── os-linux.json │ ├── os-macos.json │ └── os-windows.json ├── pylintrc ├── scripts ├── check-pr-todos.sh ├── install-deps.ps1 └── install-deps.sh ├── src ├── libunicode │ ├── CMakeLists.txt │ ├── benchmark.cpp │ ├── capi.cpp │ ├── capi.h │ ├── capi_test.cpp │ ├── codepoint_properties.cpp │ ├── codepoint_properties.h │ ├── codepoint_properties_loader.cpp │ ├── codepoint_properties_loader.h │ ├── convert.h │ ├── convert_test.cpp │ ├── emoji_presentation_scanner.c │ ├── emoji_presentation_scanner.rl │ ├── emoji_segmenter.cpp │ ├── emoji_segmenter.h │ ├── emoji_segmenter_test.cpp │ ├── grapheme_segmenter.cpp │ ├── grapheme_segmenter.h │ ├── grapheme_segmenter_test.cpp │ ├── intrinsics.h │ ├── libunicode-config.cmake.in │ ├── mktables.py │ ├── multistage_table_generator.h │ ├── multistage_table_view.h │ ├── run_segmenter.h │ ├── run_segmenter_test.cpp │ ├── scan.cpp │ ├── scan.h │ ├── scan256.cpp │ ├── scan512.cpp │ ├── scan_simd_impl.h │ ├── scan_test.cpp │ ├── scoped_timer.h │ ├── script_segmenter.cpp │ ├── script_segmenter.h │ ├── script_segmenter_test.cpp │ ├── simd_detector.cpp │ ├── simd_detector.h │ ├── support.h │ ├── tablegen.cpp │ ├── test_main.cpp │ ├── ucd_private.h │ ├── unicode_test.cpp │ ├── utf8.cpp │ ├── utf8.h │ ├── utf8_grapheme_segmenter.h │ ├── utf8_grapheme_segmenter_test.cpp │ ├── utf8_test.cpp │ ├── width.cpp │ ├── width.h │ ├── width_test.cpp │ ├── word_segmenter.h │ └── word_segmenter_test.cpp └── tools │ ├── CMakeLists.txt │ ├── uc-inspect.cpp │ └── unicode-query.cpp ├── tests └── zalgo.txt └── vcpkg.json /.clang-format: 
-------------------------------------------------------------------------------- 1 | --- 2 | BasedOnStyle: Microsoft 3 | AccessModifierOffset: '-2' 4 | AlignAfterOpenBracket: Align 5 | AlignConsecutiveMacros: 'true' 6 | AlignConsecutiveDeclarations: 'false' 7 | AlignEscapedNewlines: Left 8 | AlignOperands: 'true' 9 | AlignTrailingComments: 'true' 10 | AllowAllArgumentsOnNextLine: 'true' 11 | AllowAllConstructorInitializersOnNextLine: 'true' 12 | AllowAllParametersOfDeclarationOnNextLine: 'true' 13 | AllowShortBlocksOnASingleLine: 'false' 14 | AllowShortCaseLabelsOnASingleLine: 'true' 15 | AllowShortFunctionsOnASingleLine: InlineOnly 16 | AllowShortIfStatementsOnASingleLine: Never 17 | AllowShortLambdasOnASingleLine: Inline 18 | AllowShortLoopsOnASingleLine: 'false' 19 | AlwaysBreakAfterReturnType: None 20 | AlwaysBreakBeforeMultilineStrings: 'false' 21 | AlwaysBreakTemplateDeclarations: 'Yes' 22 | BinPackArguments: 'false' 23 | BinPackParameters: 'false' 24 | BreakBeforeBinaryOperators: NonAssignment 25 | BreakBeforeBraces: Custom 26 | BreakBeforeTernaryOperators: 'true' 27 | BreakConstructorInitializers: AfterColon 28 | BreakInheritanceList: AfterColon 29 | BreakStringLiterals: 'true' 30 | ColumnLimit: '130' 31 | CompactNamespaces: 'false' 32 | ConstructorInitializerAllOnOneLineOrOnePerLine: 'true' 33 | ConstructorInitializerIndentWidth: '4' 34 | ContinuationIndentWidth: '4' 35 | Cpp11BracedListStyle: 'false' 36 | DerivePointerAlignment: 'false' 37 | FixNamespaceComments: 'true' 38 | IncludeBlocks: Regroup 39 | IndentCaseLabels: true 40 | IndentPPDirectives: BeforeHash 41 | IndentWidth: '4' 42 | IndentWrappedFunctionNames: 'false' 43 | Language: Cpp 44 | MaxEmptyLinesToKeep: '1' 45 | NamespaceIndentation: Inner 46 | PenaltyBreakAssignment: '0' 47 | PointerAlignment: Left 48 | ReflowComments: 'true' 49 | SortIncludes: 'true' 50 | SortUsingDeclarations: 'true' 51 | SpaceAfterCStyleCast: 'true' 52 | SpaceAfterLogicalNot: 'false' 53 | SpaceAfterTemplateKeyword: 'true' 
54 | SpaceBeforeAssignmentOperators: 'true' 55 | SpaceBeforeCpp11BracedList: 'true' 56 | SpaceBeforeCtorInitializerColon: 'false' 57 | SpaceBeforeInheritanceColon: 'false' 58 | SpaceBeforeParens: ControlStatements 59 | SpaceBeforeRangeBasedForLoopColon: 'false' 60 | SpaceInEmptyParentheses: 'false' 61 | SpacesInAngles: 'false' 62 | SpacesInCStyleCastParentheses: 'false' 63 | SpacesInContainerLiterals: 'false' 64 | SpacesInParentheses: 'false' 65 | SpacesInSquareBrackets: 'false' 66 | Standard: Cpp11 67 | TabWidth: '4' 68 | UseTab: Never 69 | IncludeCategories: 70 | - Regex: '^<(contour)/' 71 | Priority: 0 72 | - Regex: '^<(terminal)/' 73 | Priority: 1 74 | - Regex: '^<(terminal_renderer)/' 75 | Priority: 2 76 | - Regex: '^<(text_shaper)/' 77 | Priority: 3 78 | - Regex: '^<(crispy)/' 79 | Priority: 4 80 | - Regex: '^<(libunicode)/' 81 | Priority: 5 82 | - Regex: '^<(fmt)/' 83 | Priority: 6 84 | - Regex: '^<(yaml-cpp)/' 85 | Priority: 7 86 | - Regex: '^<(range)/' 87 | Priority: 8 88 | - Regex: '^' 99 | Priority: 21 100 | - Regex: '<[[:alnum:]_]+\.h>' 101 | Priority: 22 102 | - Regex: '.*' 103 | Priority: 23 104 | -------------------------------------------------------------------------------- /.clang-tidy: -------------------------------------------------------------------------------- 1 | --- 2 | Checks: >- 3 | -*, 4 | clang-diagnostic-*, 5 | clang-analyzer-*, 6 | bugprone-*, 7 | -bugprone-suspicious-include, 8 | bugprone-unchecked-optional-access, 9 | performance-*, 10 | -performance-no-int-to-ptr, 11 | readability-non-const-parameter, 12 | readability-redundant-*, 13 | cppcoreguidelines-slicing, 14 | readability-identifier-naming 15 | UseColor: true 16 | WarningsAsErrors: true 17 | HeaderFilterRegex: '' 18 | FormatStyle: none 19 | CheckOptions: 20 | - key: bugprone-easily-swappable-parameters.MinimumLength 21 | value: '3' 22 | - key: cert-dcl16-c.NewSuffixes 23 | value: 'L;LL;LU;LLU' 24 | - key: cert-oop54-cpp.WarnOnlyIfThisHasSuspiciousField 25 | value: '0' 26 | 
- key: cppcoreguidelines-explicit-virtual-functions.IgnoreDestructors 27 | value: '1' 28 | - key: cppcoreguidelines-non-private-member-variables-in-classes.IgnoreClassesWithAllMemberVariablesBeingPublic 29 | value: '1' 30 | - key: google-readability-braces-around-statements.ShortStatementLines 31 | value: '1' 32 | - key: google-readability-function-size.StatementThreshold 33 | value: '800' 34 | - key: google-readability-namespace-comments.ShortNamespaceLines 35 | value: '10' 36 | - key: google-readability-namespace-comments.SpacesBeforeComments 37 | value: '2' 38 | - key: modernize-loop-convert.MaxCopySize 39 | value: '16' 40 | - key: modernize-loop-convert.MinConfidence 41 | value: reasonable 42 | - key: modernize-loop-convert.NamingStyle 43 | value: CamelCase 44 | - key: modernize-pass-by-value.IncludeStyle 45 | value: llvm 46 | - key: modernize-replace-auto-ptr.IncludeStyle 47 | value: llvm 48 | - key: modernize-use-nullptr.NullMacros 49 | value: 'NULL' 50 | - key: readability-identifier-naming.EnumCase 51 | value: Camel_Snake_Case 52 | - key: readability-identifier-naming.ClassIgnoredRegexp 53 | value: '^(RagelIterator|Expectation)$' 54 | - key: readability-identifier-naming.ClassCase 55 | value: lower_case 56 | - key: readability-identifier-naming.ClassMemberCase 57 | value: lower_case 58 | - key: readability-identifier-naming.ClassMethodCase 59 | value: lower_case 60 | - key: readability-identifier-naming.ParameterCase 61 | value: camelBack 62 | - key: readability-identifier-naming.ParameterPrefix 63 | value: '' 64 | - key: readability-identifier-naming.ScopedEnumConstantCase 65 | value: Camel_Snake_Case 66 | -------------------------------------------------------------------------------- /.editorconfig: -------------------------------------------------------------------------------- 1 | root = true 2 | 3 | [*] 4 | indent_style = space 5 | indent_size = 4 6 | insert_final_newline = true 7 | end_of_line = lf 8 | charset = utf-8 9 | trim_trailing_whitespace = 
true 10 | 11 | [*.yml] 12 | indent_style = space 13 | indent_size = 4 14 | -------------------------------------------------------------------------------- /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | # These are supported funding model platforms 2 | 3 | github: ['christianparpart'] 4 | custom: ['https://paypal.me/ChristianParpart'] 5 | -------------------------------------------------------------------------------- /.github/labeler.yml: -------------------------------------------------------------------------------- 1 | CI: 2 | - .github/** 3 | CMake: 4 | - "**CMakeLists.txt" 5 | - cmake/** 6 | documentation: 7 | - "**/*.md" 8 | - docs/** 9 | -------------------------------------------------------------------------------- /.github/workflows/build.yml: -------------------------------------------------------------------------------- 1 | name: Build 2 | 3 | on: 4 | push: 5 | paths-ignore: 6 | - 'docs/**' 7 | - '.github/ISSUE_TEMPLATE/**' 8 | - '.github/*.yml' 9 | - 'LICENSE.txt' 10 | - '*.md' 11 | - '*.sh' 12 | branches: 13 | - master 14 | - edge 15 | pull_request: 16 | branches: 17 | - master 18 | - edge 19 | 20 | concurrency: 21 | group: ${{ github.ref }} 22 | cancel-in-progress: true 23 | 24 | env: 25 | CTEST_OUTPUT_ON_FAILURE: 1 26 | 27 | jobs: 28 | 29 | ubuntu_matrix: 30 | strategy: 31 | fail-fast: false 32 | matrix: 33 | os_version: ['24.04'] 34 | name: "Ubuntu ${{ matrix.os_version }}" 35 | runs-on: ubuntu-${{ matrix.os_version }} 36 | steps: 37 | - uses: actions/checkout@v4 38 | - name: ccache 39 | uses: hendrikmuhs/ccache-action@v1 40 | with: 41 | key: "ccache-ubuntu_${{ matrix.os_version }}" 42 | max-size: 256M 43 | - name: "Update package database" 44 | run: sudo apt -q update 45 | - name: "install dependencies" 46 | run: ./scripts/install-deps.sh 47 | - name: "cmake" 48 | run: | 49 | cmake -S . 
-B build -G Ninja \ 50 | -D CMAKE_BUILD_TYPE="RelWithDebInfo" \ 51 | -D LIBUNICODE_BENCHMARK=ON \ 52 | -D LIBUNICODE_TESTING=ON 53 | - name: "build" 54 | run: cmake --build build/ -- -j3 55 | - name: "test" 56 | run: ./build/src/libunicode/unicode_test 57 | 58 | # {{{ macOS 59 | osx: 60 | name: "macOS" 61 | runs-on: macos-14 62 | steps: 63 | - uses: actions/checkout@v4 64 | - name: ccache 65 | uses: hendrikmuhs/ccache-action@v1.2 66 | with: 67 | key: ccache-osx_qt${{ steps.set_vars.outputs.QTVER }}-r1 68 | max-size: 256M 69 | - name: "Install dependencies" 70 | # Sometimes, brew thinks it needs to install from source rather than binary. 71 | # For Qt this may take ages (many many hours). Let's not waste our CPU credits here, 72 | # and limit the run time. 73 | timeout-minutes: 15 74 | run: | 75 | set -ex 76 | #brew update 77 | ./scripts/install-deps.sh 78 | - name: "Generate build files" 79 | run: cmake --preset macos-release 80 | - name: "Build" 81 | run: cmake --build --preset macos-release 82 | - name: "Test" 83 | run: ctest --preset macos-release 84 | # }}} 85 | 86 | windows: 87 | name: "Windows" 88 | runs-on: windows-latest 89 | steps: 90 | - uses: actions/checkout@v4 91 | - name: "vcpkg: Install dependencies" 92 | uses: lukka/run-vcpkg@v11.1 93 | id: runvcpkg 94 | with: 95 | vcpkgDirectory: ${{ runner.workspace }}/vcpkg/ 96 | vcpkgGitCommitId: 80403036a665cb8fcc1a1b3e17593d20b03b2489 97 | - name: "List cmake presets" 98 | run: cmake --list-presets 99 | - name: "Generate build files" 100 | run: cmake --preset windows-cl-release -DCMAKE_TOOLCHAIN_FILE="${{ runner.workspace }}\vcpkg\scripts\buildsystems\vcpkg.cmake" 101 | - name: "Build" 102 | run: cmake --build --preset windows-cl-release 103 | - name: "test" 104 | run: ctest --preset windows-cl-release 105 | 106 | Fedora: 107 | name: Fedora 108 | runs-on: ubuntu-24.04 109 | container: fedora:latest 110 | 111 | steps: 112 | - uses: actions/checkout@v4 113 | - name: Install build dependencies 114 | run: | 115 | 
dnf install -y curl 116 | PREPARE_ONLY_EMBEDS=OFF SYSDEP_ASSUME_YES=ON ./scripts/install-deps.sh 117 | dnf install -y unicode-ucd 118 | - name: configure 119 | run: cmake --preset linux-gcc-debug -DLIBUNICODE_UCD_DIR=/usr/share/unicode/ucd 120 | - name: build 121 | run: cmake --build --preset linux-gcc-debug -j$(nproc) 122 | - name: test 123 | run: | 124 | ctest --preset linux-gcc-debug 125 | -------------------------------------------------------------------------------- /.github/workflows/checks.yml: -------------------------------------------------------------------------------- 1 | name: Checks 2 | 3 | on: 4 | push: 5 | paths-ignore: 6 | - '.github/ISSUE_TEMPLATE/**' 7 | - '.github/*.yml' 8 | - 'LICENSE.txt' 9 | branches: 10 | - master 11 | pull_request: 12 | branches: 13 | - master 14 | 15 | concurrency: 16 | group: checks-${{ github.ref }} 17 | cancel-in-progress: true 18 | 19 | jobs: 20 | check_PR_TODOs: 21 | name: "Check PR-TODOs" 22 | runs-on: ubuntu-20.04 23 | steps: 24 | - uses: actions/checkout@v3 25 | - name: "Checking for open PR-related TODO items" 26 | run: | 27 | set -ex 28 | ./scripts/check-pr-todos.sh 29 | 30 | check_clang_format: 31 | name: "Check C++ style" 32 | runs-on: ubuntu-20.04 33 | steps: 34 | - uses: actions/checkout@v4 35 | - name: Install clang 36 | run: | 37 | wget https://apt.llvm.org/llvm.sh 38 | chmod +x llvm.sh 39 | sudo ./llvm.sh 18 40 | sudo apt-get install clang-format-18 41 | - name: "Clang-format" 42 | run: find ./src/ -name "*.cpp" -o -name "*.h" | xargs clang-format-18 --Werror --dry-run 43 | -------------------------------------------------------------------------------- /.github/workflows/labeler.yml: -------------------------------------------------------------------------------- 1 | name: "PR Labeler" 2 | 3 | on: 4 | - pull_request_target 5 | 6 | jobs: 7 | triage: 8 | runs-on: ubuntu-latest 9 | steps: 10 | - uses: actions/labeler@v3 11 | with: 12 | repo-token: "${{ secrets.GITHUB_TOKEN }}" 13 | 
-------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /build/ 2 | /out/ 3 | /_deps/ 4 | /_ucd/ 5 | /.cache/ 6 | /.clangd/ 7 | /compile_commands.json 8 | /.vscode/ 9 | /sandbox/ 10 | /target/ 11 | src/libunicode/ucd.cpp 12 | src/libunicode/ucd.h 13 | src/libunicode/ucd_enums.h 14 | src/libunicode/ucd_fmt.h 15 | src/libunicode/ucd_ostream.h 16 | src/libunicode/codepoint_properties_data.cpp 17 | src/libunicode/codepoint_properties_data.h 18 | src/libunicode/codepoint_properties_names.cpp 19 | -------------------------------------------------------------------------------- /.vimspector.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "https://puremourning.github.io/vimspector/schema/vimspector.schema.json#", 3 | "configurations": { 4 | "ModelTest": { 5 | "adapter": "vscode-cpptools", 6 | "configuration": { 7 | "request": "launch", 8 | "program": "${workspaceRoot}/build/linux-clang-debug/src/libunicode/unicode_test", 9 | "args": [ 10 | ], 11 | "cwd": "${workspaceRoot}", 12 | "externalConsole": true, 13 | "stopAtEntry": false, 14 | "MIMode": "gdb" 15 | }, 16 | "breakpoints": { 17 | "exception": { 18 | "caught": "Y", 19 | "uncaught": "Y" 20 | } 21 | } 22 | } 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.14 FATAL_ERROR) 2 | 3 | list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake") 4 | 5 | project(libunicode VERSION "0.6.0" LANGUAGES CXX) 6 | 7 | set(MASTER_PROJECT OFF) 8 | if(${CMAKE_CURRENT_SOURCE_DIR} STREQUAL ${CMAKE_SOURCE_DIR}) 9 | set(MASTER_PROJECT ON) 10 | endif() 11 | 12 | if(MASTER_PROJECT AND NOT WIN32) 13 | set(LIBUNICODE_BUILD_STATIC_DEFAULT OFF) 14 | else() 15 | 
set(LIBUNICODE_BUILD_STATIC_DEFAULT ON) 16 | endif() 17 | 18 | # setting defaults 19 | if (NOT("${CMAKE_CXX_STANDARD}")) 20 | set(CMAKE_CXX_STANDARD 20) 21 | endif() 22 | set(CMAKE_CXX_STANDARD_REQUIRED ON) 23 | set(CMAKE_CXX_EXTENSIONS OFF) 24 | 25 | set(CMAKE_COLOR_DIAGNOSTICS ON) 26 | 27 | if(("${CMAKE_CXX_COMPILER_ID}" MATCHES "GNU") OR ("${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang")) 28 | add_compile_options(-Wall) 29 | add_compile_options(-Wextra) 30 | if(NOT ${CMAKE_BUILD_TYPE} STREQUAL "Release") 31 | add_definitions(-D_GLIBCXX_DEBUG) 32 | endif() 33 | elseif(DEFINED MSVC) 34 | add_definitions(-DNOMINMAX) 35 | add_compile_options(/utf-8) 36 | endif() 37 | 38 | include(EnableCcache) 39 | include(ClangTidy) 40 | include(PedanticCompiler) 41 | 42 | set(CMAKE_EXPORT_COMPILE_COMMANDS ${MASTER_PROJECT}) 43 | option(LIBUNICODE_COVERAGE "libunicode: Builds with codecov [default: OFF]" OFF) 44 | option(LIBUNICODE_EXAMPLES "libunicode: Enables building of example programs. [default: ${MASTER_PROJECT}]" ${MASTER_PROJECT}) 45 | option(LIBUNICODE_TESTING "libunicode: Enables building of unittests for libunicode [default: ${MASTER_PROJECT}" ${MASTER_PROJECT}) 46 | option(LIBUNICODE_BENCHMARK "libunicode: Enables building of benchmark for libunicode [default: OFF]" OFF) 47 | option(LIBUNICODE_TOOLS "libunicode: Builds CLI tools [default: ${MASTER_PROJECT}]" ${MASTER_PROJECT}) 48 | option(LIBUNICODE_BUILD_STATIC "libunicode: provide static library instead of dynamic [default: ${LIBUNICODE_BUILD_STATIC_DEFAULT}]" ${LIBUNICODE_BUILD_STATIC_DEFAULT}) 49 | option(LIBUNICODE_TABLEGEN_FASTBUILD "libunicode: Use fast table generation (takes more memory in final tables) [default: OFF]" OFF) 50 | 51 | string(TOLOWER "${CMAKE_SYSTEM_PROCESSOR}" SYSTEM_PROCESSOR_LOWER) 52 | 53 | if(NOT LIBUNICODE_SIMD_IMPLEMENTATION) 54 | if((SYSTEM_PROCESSOR_LOWER STREQUAL "x86_64") 55 | OR (SYSTEM_PROCESSOR_LOWER STREQUAL "aarch64") 56 | OR (SYSTEM_PROCESSOR_LOWER STREQUAL "amd64") 57 | OR 
(SYSTEM_PROCESSOR_LOWER STREQUAL "arm64")) 58 | set(LIBUNICODE_SIMD_IMPLEMENTATION "intrinsics" CACHE STRING "libunicode: SIMD implementation to use" FORCE) 59 | else() 60 | set(LIBUNICODE_SIMD_IMPLEMENTATION "std" CACHE STRING "libunicode: SIMD implementation to use" FORCE) 61 | endif() 62 | set_property(CACHE LIBUNICODE_SIMD_IMPLEMENTATION PROPERTY STRINGS "std" "intrinsics" "none") 63 | endif() 64 | 65 | set(BENCHMARK_ENABLE_TESTING OFF CACHE BOOL "Enable testing of the benchmark library." FORCE) 66 | include(ThirdParties) 67 | 68 | if(LIBUNICODE_TESTING) 69 | enable_testing() 70 | endif() 71 | 72 | # ---------------------------------------------------------------------------- 73 | set(LIBUNICODE_UCD_VERSION "16.0.0" CACHE STRING "libunicode: Unicode version") 74 | set(LIBUNICODE_UCD_BASE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/_ucd" CACHE PATH "Path to directory for downloaded files & extracted directories.") 75 | 76 | set(LIBUNICODE_UCD_ZIP_DOWNLOAD_URL "https://www.unicode.org/Public/${LIBUNICODE_UCD_VERSION}/ucd/UCD.zip") 77 | set(LIBUNICODE_UCD_MD5 "bdd823cbd37c376633d6737a12281233") 78 | set(LIBUNICODE_UCD_ZIP_FILE "${LIBUNICODE_UCD_BASE_DIR}/ucd-${LIBUNICODE_UCD_VERSION}.zip") 79 | set(LIBUNICODE_UCD_DIR "${LIBUNICODE_UCD_BASE_DIR}/ucd-${LIBUNICODE_UCD_VERSION}" CACHE PATH "Path to UCD directory.") 80 | 81 | # ---------------------------------------------------------------------------- 82 | # code coverage 83 | 84 | if(LIBUNICODE_COVERAGE AND NOT MSVC) 85 | add_compile_options(-g --coverage) 86 | set(CMAKE_EXE_LINKER_FLAGS "--coverage ${CMAKE_EXE_LINKER_FLAGS}") 87 | message("-- [code coverage] Enabled.") 88 | else() 89 | message("-- [code coverage] Disabled.") 90 | endif() 91 | 92 | # ---------------------------------------------------------------------------- 93 | 94 | add_subdirectory(src/libunicode) 95 | add_subdirectory(src/tools) 96 | 97 | if("${CCACHE}" STREQUAL "") 98 | set(USING_CCACHE_STRING "OFF") 99 | else() 100 | set(USING_CCACHE_STRING "${CCACHE}") 
101 | endif() 102 | 103 | if(LIBUNICODE_BUILD_STATIC) 104 | set(LIBUNICODE_BUILD_MODE "static") 105 | else() 106 | set(LIBUNICODE_BUILD_MODE "dynamic") 107 | endif() 108 | 109 | # Export the cmake package to the cmake package registry (~/.cmake/packages/) 110 | export(PACKAGE libunicode) 111 | 112 | message(STATUS "------------------------------------------------------------------------------") 113 | message(STATUS " libunicode (version ${libunicode_VERSION}${libunicode_VERSION_SUFFIX})") 114 | message(STATUS "------------------------------------------------------------------------------") 115 | message(STATUS "Build type: ${CMAKE_BUILD_TYPE}") 116 | message(STATUS "Build mode: ${LIBUNICODE_BUILD_MODE}") 117 | message(STATUS "Build unit tests: ${LIBUNICODE_TESTING}") 118 | message(STATUS "Build benchmark: ${LIBUNICODE_BENCHMARK}") 119 | message(STATUS "Build tools: ${LIBUNICODE_TOOLS}") 120 | message(STATUS "Enable tablegen fast build: ${LIBUNICODE_TABLEGEN_FASTBUILD}") 121 | message(STATUS "Using ccache: ${USING_CCACHE_STRING}") 122 | message(STATUS "SIMD support: ${LIBUNICODE_SIMD_IMPLEMENTATION}") 123 | message(STATUS "Using UCD directory: ${LIBUNICODE_UCD_DIR}") 124 | message(STATUS "Enable clang-tidy: ${ENABLE_TIDY} (${CMAKE_CXX_CLANG_TIDY})") 125 | message(STATUS "------------------------------------------------------------------------------") 126 | 127 | ThirdPartiesSummary2() 128 | -------------------------------------------------------------------------------- /CMakePresets.json: -------------------------------------------------------------------------------- 1 | { 2 | "version": 6, 3 | "cmakeMinimumRequired": { 4 | "major": 3, 5 | "minor": 27, 6 | "patch": 0 7 | }, 8 | "include": [ 9 | "cmake/presets/os-linux.json", 10 | "cmake/presets/os-macos.json", 11 | "cmake/presets/os-windows.json" 12 | ] 13 | } 14 | -------------------------------------------------------------------------------- /Changelog.md: 
-------------------------------------------------------------------------------- 1 | ## 0.5.0 (unreleased) 2 | 3 | - Show emoji presentation in Unicode properties query tool. 4 | 5 | ## 0.4.0 (2023-11-27) 6 | 7 | - Fix UTF-8 decoding of incomplete UTF-8 multibyte sequences to properly report `Invalid`. 8 | - Change signature of `inline from_utf8(string_view const&)` slightly by dropping its cref. 9 | - Move `scan_result.next` to `scan_state.next`. 10 | 11 | ## 0.3.0 (2023-03-01) 12 | 13 | - Fixes build error on GCC 13. 14 | - Fixes properly stopping at control characters in complex sub-state in scan API. 15 | - Fixes successful processing invalid UTF-8 in scan API. 16 | - Fixes installing missing headers for use of this API as non-embedded library. 17 | - Changes project and include directory from `unicode` to `libunicode` to avoid include path conflict with `ICU`. 18 | - Adds compile time option to either build static or dynamic binaries (`LIBUNICODE_BUILD_STATIC`). 19 | - Adds SONAME version to libraries. 20 | 21 | ## 0.2.1 (2023-02-14) 22 | 23 | - Fixes unicode-query's output for "character width". 24 | - Fixes decoding invalid UTF-8 locking up. 25 | - Fixes stage1 multistage-table sizes, reducing memory footprint a bit. 26 | - Adds SIMD implementation for scan API on ARM64 (NEON). 27 | - unicode-query is now linked statically on UNIX platforms. 28 | 29 | ## 0.2.0 (2022-11-13) 30 | 31 | - Slightly improve performance of grapheme cluster segmentation. 32 | - Fixes grapheme cluster segmentation of multiple consecutive regional flags. 33 | - Add access to Age property of a codepoint (giving information about at which Unicode version a codepoint was introduced). 34 | - Add access to the assigned name of a codepoint. 35 | - unicode-query: Now also prints name and age properties. 36 | - CMake install target also installs header and library (not just tools). 37 | - Reduce number of dependencies down to fmtlib and (for unit tests) Catch2. 
38 | - Enables libunicode to be found via CMake's `find_package()`. 39 | - Improved default installation directories on UNIX via GNUInstallDirs helper. 40 | - Enable compiling on ARM64. 41 | 42 | ## 0.1.0 (2022-11-03) 43 | 44 | While version 0.1.0 sounds like a small number, this project has been out there for quite some years already 45 | and is actively used by Contour Terminal. 46 | 47 | The biggest movements lately are major performance improvements in accessing Unicode properties, 48 | fixing bugs as usual, and apart from being a modern C++ Unicode library, we've now also added 49 | a command line tool to query Unicode properties in the hope it'll be useful to you. 50 | 51 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![C++20](https://img.shields.io/badge/standard-C%2B%2B%2020-blue.svg?logo=C%2B%2B)](https://isocpp.org/) 2 | [![CI Build](https://github.com/contour-terminal/libunicode/workflows/Build/badge.svg)](https://github.com/contour-terminal/libunicode/actions?query=workflow%3ABuild) 3 | 4 | # Modern C++20 Unicode Library 5 | 6 | The goal of this library is to bring painless unicode support to C++ with simple and easy to understand APIs. 7 | 8 | The API naming conventions are chosen to look familiar to those using the C++ standard library. 
9 | 10 | ### Feature Overview 11 | 12 | - [x] API for accessing UCD properties 13 | - [x] UTF8 <-> UTF32 conversion 14 | - [x] wcwidth equivalent (`int unicode::width(char32_t)`) 15 | - [x] grapheme segmentation (UTS algorithm) 16 | - [x] symbol/emoji segmentation (UTS algorithm) 17 | - [x] script segmentation [UTS 24](https://unicode.org/reports/tr24/) 18 | - [x] unit tests for most parts (wcwidth / segmentation) 19 | - [x] generic text run segmentation (top level segmentation API suitable for text shaping implementations) 20 | - [ ] word segmentation (UTS algorithm) 21 | - [x] CLI tool: `uc-inspect` for inspecting input files by code point properties, grapheme cluster, word, script, ... 22 | 23 | # Unicode Technical Specifications 24 | 25 | - [UTS 11](https://unicode.org/reports/tr11/) - character width 26 | - [UTS 24](https://unicode.org/reports/tr24/) - script property 27 | - [UTS 29](https://unicode.org/reports/tr29/) - text segmentation (grapheme cluster, word boundary) 28 | - [UTS 51](https://unicode.org/reports/tr51/) - Emoji 29 | 30 | ### Integrate with your CMake project 31 | 32 | ```sh 33 | git submodule add --name libunicode https://github.com/contour-terminal/libunicode 3rdparty/libunicode 34 | ``` 35 | 36 | ```cmake 37 | add_subdirectory(3rdparty/libunicode) 38 | 39 | add_executable(your_tool your_tool.cpp) 40 | target_link_libraries(your_tool PRIVATE unicode::unicode) 41 | ``` 42 | 43 | ### Contributing 44 | 45 | - for filing issues please visit: https://github.com/contour-terminal/libunicode/issues 46 | - fork and create pull requests: https://github.com/contour-terminal/libunicode/pulls 47 | - I am also happy to just receive code reviews 48 | - you can help with documentation, or 49 | - general feedback is also very welcome 50 | 51 | ### Users of this library 52 | 53 | * [Contour Terminal Emulator](https://github.com/contour-terminal/contour/) 54 | 55 | ### Disclaimer 56 | 57 | This library is -in terms of features- by no means competitive with the ICU 
library, but it attempts to 58 | provide a clean and intuitive modern C++ API for those that do not want to fight legacy-style C APIs. 59 | 60 | I hope that over time we can add more and more features to this library to conform to the Unicode 61 | specification eventually, and I welcome everyone to contribute to it by forking the 62 | library, creating pull requests, or even just constructive feedback. 63 | 64 | ### License 65 | 66 | ``` 67 | libunicode - a modern C++20 unicode library 68 | ------------------------------------------- 69 | 70 | Licensed under the Apache License, Version 2.0 (the "License"); 71 | you may not use this file except in compliance with the License. 72 | 73 | Unless required by applicable law or agreed to in writing, software 74 | distributed under the License is distributed on an "AS IS" BASIS, 75 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 76 | See the License for the specific language governing permissions and 77 | limitations under the License. 78 | ``` 79 | -------------------------------------------------------------------------------- /TODO.md: -------------------------------------------------------------------------------- 1 | 2 | # TODO 3 | 4 | - [ ] rewrite test functions from `bool foo(x) ...` to `bool is_foo(x) ...` 5 | - [ ] all about emoji flag sequences 6 | - [ ] `bool is_mirrored(char32_t) noexcept` (such as parentheses, curly braces, brackets, ...) 7 | - also ability to get the mirroring codepoint 8 | - [ ] map codepoint to block (enum) - see Blocks.txt 9 | - [ ] map codepoint to plane (enum) 10 | - [ ] map block to codepoint range 11 | - [ ] map plane to codepoint range 12 | - [ ] provide C API binding for basic functionality 13 | - [ ] `script_segmenter`: add support for commonPreferredScript tracking wrt brackets () [] {}. 
14 | - [ ] `script_segmenter`: test "foo(λ);" -> {Latin, Greek, Latin} 15 | - [ ] `orientation_segmenter` (and integrate it into `run_segmenter` as well as its tests) 16 | - [ ] mktables: `fmtlib` integration into `ucd_fmt.h` (without actually depending on fmtlib itself) 17 | - [ ] mktables: `to_string` builder 18 | - [ ] mktables: `to_type` builder 19 | - [ ] mktables: pylint into CI 20 | - [ ] clang-tidy into CI 21 | - [ ] META: cmake install target (header files and .a file, executable) 22 | - [ ] META: pkg-config file 23 | - [ ] word segmentation (UTS algorithm) 24 | - [ ] generic text segmentation (top level segmentation API suitable for text shaping implementations) 25 | - [ ] CLI tool: unicode-inspect for inspecting input files by code point, grapheme cluster, word, script, ... 26 | - [x] unit tests for most parts (wcwidth / segmentation) 27 | - [x] README: list all TRs that are being implemented 28 | - [x] API for accessing UCD properties 29 | - [x] UTF8 <-> UTF32 conversion 30 | - [x] grapheme segmentation (UTS algorithm) 31 | - [x] symbol/emoji segmentation (UTS algorithm) 32 | - [x] wcwidth equivalent (`unicode::width(char32_t)`) 33 | - [x] script segmentation 34 | - [x] `out` helper to force explicit `ref(val)` for more readability. 
35 | - [x] `operator<<(ostream&, T)` for all UCD properties - in its own header file (`ucd_ostream.h`) 36 | - [x] `emoji_segmenter`: test "x 😀 y" -> {Text, Emoji, Text} 37 | - [x] make `run_segmenter` more templated / customizable 38 | - [x] mktables: `enum class` builder 39 | 40 | ## Integration TODO 41 | 42 | * [x] integrate into contour 43 | * [ ] see if this makes sense: make use of this library in klex lexical scanner, to allow unicode input 44 | 45 | -------------------------------------------------------------------------------- /cmake/ClangTidy.cmake: -------------------------------------------------------------------------------- 1 | 2 | option(ENABLE_TIDY "Enable clang-tidy [default: OFF]" OFF) 3 | if(ENABLE_TIDY) 4 | find_program(CLANG_TIDY_EXE 5 | NAMES clang-tidy-9 clang-tidy-8 clang-tidy-7 clang-tidy 6 | DOC "Path to clang-tidy executable") 7 | if(NOT CLANG_TIDY_EXE) 8 | message(STATUS "[clang-tidy] Not found.") 9 | else() 10 | message(STATUS "[clang-tidy] found: ${CLANG_TIDY_EXE}") 11 | set(CMAKE_CXX_CLANG_TIDY "${CLANG_TIDY_EXE}") 12 | endif() 13 | else() 14 | message(STATUS "[clang-tidy] Disabled.") 15 | endif() 16 | -------------------------------------------------------------------------------- /cmake/EnableCcache.cmake: -------------------------------------------------------------------------------- 1 | # Setup ccache. 2 | # 3 | # The ccache is auto-enabled if the tool is found. 4 | # To disable set -DCCACHE=OFF option. 5 | if(NOT DEFINED CMAKE_CXX_COMPILER_LAUNCHER) 6 | find_program(CCACHE ccache DOC "ccache tool path; set to OFF to disable") 7 | if(CCACHE) 8 | set(CMAKE_CXX_COMPILER_LAUNCHER ${CCACHE}) 9 | if(COMMAND cotire) 10 | # Change ccache config to meet cotire requirements. 
11 | set(ENV{CCACHE_SLOPPINESS} pch_defines,time_macros) 12 | endif() 13 | message(STATUS "[ccache] Enabled: ${CCACHE}") 14 | else() 15 | message(STATUS "[ccache] Disabled.") 16 | endif() 17 | endif() 18 | -------------------------------------------------------------------------------- /cmake/PedanticCompiler.cmake: -------------------------------------------------------------------------------- 1 | include(CheckCXXCompilerFlag) 2 | function(try_add_compile_options FLAG) 3 | # Remove leading - or / from the flag name. 4 | string(REGEX REPLACE "^[-/]" "" name ${FLAG}) 5 | # Deletes any ':' because it's invalid variable names. 6 | string(REGEX REPLACE ":" "" name ${name}) 7 | check_cxx_compiler_flag(${FLAG} ${name}) 8 | if(${name}) 9 | message(STATUS "Adding compiler flag: ${FLAG}.") 10 | add_compile_options(${FLAG}) 11 | else() 12 | message(STATUS "Adding compiler flag: ${FLAG} failed.") 13 | endif() 14 | 15 | # If the optional argument passed, store the result there. 16 | if(ARGV1) 17 | set(${ARGV1} ${name} PARENT_SCOPE) 18 | endif() 19 | endfunction() 20 | 21 | option(PEDANTIC_COMPILER "Compile the project with almost all warnings turned on." ON) 22 | option(PEDANTIC_COMPILER_WERROR "Enables -Werror to force warnings to be treated as errors." OFF) 23 | 24 | # Always show diagnostics in colored output. 
25 | try_add_compile_options(-fdiagnostics-color=always) 26 | 27 | if(${PEDANTIC_COMPILER}) 28 | if(("${CMAKE_CXX_COMPILER_ID}" MATCHES "GNU") OR ("${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang")) 29 | message(STATUS "Enabling pedantic compiler options: yes") 30 | # TODO: check https://github.com/lefticus/cppbestpractices/blob/master/02-Use_the_Tools_Available.md#compilers 31 | try_add_compile_options(-Qunused-arguments) 32 | try_add_compile_options(-Wall) 33 | try_add_compile_options(-Wconversion) 34 | try_add_compile_options(-Wduplicate-enum) 35 | try_add_compile_options(-Wduplicated-cond) 36 | try_add_compile_options(-Wextra) 37 | try_add_compile_options(-Wextra-semi) 38 | try_add_compile_options(-Wfinal-dtor-non-final-class) 39 | try_add_compile_options(-Wimplicit-fallthrough) 40 | try_add_compile_options(-Wlogical-op) 41 | try_add_compile_options(-Wmissing-declarations) 42 | try_add_compile_options(-Wnewline-eof) 43 | try_add_compile_options(-Wno-unknown-attributes) 44 | try_add_compile_options(-Wno-unknown-pragmas) 45 | try_add_compile_options(-Wnull-dereference) 46 | try_add_compile_options(-Wpessimizing-move) 47 | try_add_compile_options(-Wredundant-move) 48 | try_add_compile_options(-Wsign-conversion) 49 | try_add_compile_options(-Wsuggest-destructor-override) 50 | try_add_compile_options(-pedantic) 51 | else() 52 | message(STATUS "Enabling pedantic compiler options: unsupported platform") 53 | endif() 54 | else() 55 | message(STATUS "Enabling pedantic compiler options: no") 56 | endif() 57 | 58 | if(${PEDANTIC_COMPILER_WERROR}) 59 | try_add_compile_options(-Werror) # XXX Not yet, but hopefully soon. 60 | 61 | # Not sure how to work around these. 62 | try_add_compile_options(-Wno-error=class-memaccess) 63 | try_add_compile_options(-Wno-class-memaccess) 64 | 65 | # TODO: Should be addressed. 
66 | try_add_compile_options(-Wno-error=missing-declarations) 67 | try_add_compile_options(-Wno-missing-declarations) 68 | endif() 69 | -------------------------------------------------------------------------------- /cmake/ThirdParties.cmake: -------------------------------------------------------------------------------- 1 | # This directory structure is being created by `scripts/install-deps.sh` 2 | # and is used to inject all the dependencies the operating system's 3 | # package manager did not provide (not found or too old version). 4 | 5 | if(EXISTS ${PROJECT_SOURCE_DIR}/_deps/sources/CMakeLists.txt) 6 | message(STATUS "Embedding 3rdparty libraries ...") 7 | add_subdirectory(${PROJECT_SOURCE_DIR}/_deps/sources) 8 | endif() 9 | 10 | set(LIST ThirdParties) 11 | macro(Thirdparty_Include_If_MIssing _TARGET _PACKAGE_NAME) 12 | if(${_PACKAGE_NAME} STREQUAL "") 13 | set(${_PACKAGE_NAME} ${_TARGET}) 14 | endif() 15 | if (NOT TARGET ${_TARGET}) 16 | find_package(${_PACKAGE_NAME} REQUIRED) 17 | list(APPEND ThirdParties ${_TARGET}_SYSDEP) 18 | set(THIRDPARTY_BUILTIN_${_TARGET} "system package") 19 | else() 20 | list(APPEND ThirdParties ${_TARGET}_EMBED) 21 | set(THIRDPARTY_BUILTIN_${_TARGET} "embedded") 22 | endif() 23 | endmacro() 24 | 25 | # TODO make me working 26 | macro(ThirdPartiesSummary) 27 | message(STATUS "==============================================================================") 28 | message(STATUS " ThirdParties") 29 | message(STATUS "------------------------------------------------------------------------------") 30 | foreach(TP ${ThirdParties}) 31 | message(STATUS "${TP}\t\t${THIRDPARTY_BUILTIN_${TP}}") 32 | endforeach() 33 | endmacro() 34 | 35 | # Now, conditionally find all dependencies that were not included above 36 | # via find_package, usually system installed packages. 
37 | 38 | if(LIBUNICODE_TESTING) 39 | if(TARGET Catch2::Catch2WithMain) 40 | set(THIRDPARTY_BUILTIN_Catch2 "embedded") 41 | else() 42 | find_package(Catch2 REQUIRED) 43 | set(THIRDPARTY_BUILTIN_Catch2 "system package") 44 | endif() 45 | endif() 46 | 47 | if(LIBUNICODE_BENCHMARK) 48 | if(TARGET benchmark::benchmark_main) 49 | set(THIRDPARTY_BUILTIN_benchmark "embedded") 50 | else() 51 | find_package(benchmark REQUIRED) 52 | set(THIRDPARTY_BUILTIN_benchmark "system package") 53 | endif() 54 | endif() 55 | 56 | 57 | macro(ThirdPartiesSummary2) 58 | message(STATUS "==============================================================================") 59 | message(STATUS " ThirdParties") 60 | message(STATUS "------------------------------------------------------------------------------") 61 | if(LIBUNICODE_TESTING) 62 | message(STATUS "Catch2 ${THIRDPARTY_BUILTIN_Catch2}") 63 | endif() 64 | if(LIBUNICODE_BENCHMARK) 65 | message(STATUS "Benchmark ${THIRDPARTY_BUILTIN_benchmark}") 66 | endif() 67 | message(STATUS "------------------------------------------------------------------------------") 68 | endmacro() 69 | -------------------------------------------------------------------------------- /cmake/presets/common.json: -------------------------------------------------------------------------------- 1 | { 2 | "version": 6, 3 | "configurePresets": [ 4 | { "name": "debug", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "Debug", "LIBUNICODE_TABLEGEN_FASTBUILD": "ON" } }, 5 | { "name": "release", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "RelWithDebInfo" } }, 6 | { "name": "arch-native", "hidden": true, "cacheVariables": { "CMAKE_CXX_FLAGS": "-march=native" } }, 7 | { "name": "clang", "hidden": true, "cacheVariables": { "CMAKE_CXX_COMPILER": "clang++" } }, 8 | { "name": "gcc", "hidden": true, "cacheVariables": { "CMAKE_CXX_COMPILER": "g++" } }, 9 | { 10 | "name": "libunicode-common", 11 | "hidden": true, 12 | "binaryDir": "${sourceDir}/build/${presetName}", 
13 | "cacheVariables": { 14 | "LIBUNICODE_BENCHMARK": "ON", 15 | "LIBUNICODE_TESTING": "ON", 16 | "PEDANTIC_COMPILER": "ON", 17 | "PEDANTIC_COMPILER_WERROR": "ON" 18 | } 19 | } 20 | ] 21 | } 22 | -------------------------------------------------------------------------------- /cmake/presets/os-linux.json: -------------------------------------------------------------------------------- 1 | { 2 | "version": 6, 3 | "include": [ "common.json" ], 4 | "configurePresets": [ 5 | { 6 | "name": "linux-common", 7 | "inherits": "libunicode-common", 8 | "generator": "Ninja", 9 | "hidden": true, 10 | "condition": { 11 | "type": "equals", 12 | "lhs": "${hostSystemName}", 13 | "rhs": "Linux" 14 | } 15 | }, 16 | { 17 | "name": "linux-clang-debug", 18 | "displayName": "Linux (Clang) Debug", 19 | "inherits": ["linux-common", "debug", "clang"] 20 | }, 21 | { 22 | "name": "linux-clang-release", 23 | "displayName": "Linux (Clang) Release", 24 | "inherits": ["linux-common", "release", "clang"] 25 | }, 26 | { 27 | "name": "linux-gcc-debug", 28 | "displayName": "Linux (GCC) Debug", 29 | "inherits": ["linux-common", "debug", "gcc"] 30 | }, 31 | { 32 | "name": "linux-gcc-release", 33 | "displayName": "Linux (GCC) Release", 34 | "inherits": ["linux-common", "release", "gcc"] 35 | }, 36 | { 37 | "name": "linux-native-clang-release", 38 | "displayName": "Linux (Clang, Native arch, Release)", 39 | "inherits": ["linux-common", "release", "arch-native", "clang"] 40 | }, 41 | { 42 | "name": "linux-native-gcc-release", 43 | "displayName": "Linux (GCC, Native arch, Release)", 44 | "inherits": ["linux-common", "release", "arch-native", "gcc"] 45 | } 46 | ], 47 | "buildPresets": [ 48 | { "name": "linux-clang-debug", "configurePreset": "linux-clang-debug" }, 49 | { "name": "linux-clang-release", "configurePreset": "linux-clang-release" }, 50 | { "name": "linux-gcc-debug", "configurePreset": "linux-gcc-debug" }, 51 | { "name": "linux-gcc-release", "configurePreset": "linux-gcc-release" }, 52 | { "name": 
"linux-native-clang-release", "configurePreset": "linux-native-clang-release" }, 53 | { "name": "linux-native-gcc-release", "configurePreset": "linux-native-gcc-release" } 54 | ], 55 | "testPresets": [ 56 | { "name": "linux-clang-debug", "configurePreset": "linux-clang-debug", "output": {"outputOnFailure": true}, "execution": { "noTestsAction": "error", "stopOnFailure": true } }, 57 | { "name": "linux-clang-release", "configurePreset": "linux-clang-release", "output": {"outputOnFailure": true}, "execution": { "noTestsAction": "error", "stopOnFailure": true } }, 58 | { "name": "linux-gcc-debug", "configurePreset": "linux-gcc-debug", "output": {"outputOnFailure": true}, "execution": { "noTestsAction": "error", "stopOnFailure": true } }, 59 | { "name": "linux-gcc-release", "configurePreset": "linux-gcc-release", "output": {"outputOnFailure": true}, "execution": { "noTestsAction": "error", "stopOnFailure": true } } 60 | ] 61 | } 62 | -------------------------------------------------------------------------------- /cmake/presets/os-macos.json: -------------------------------------------------------------------------------- 1 | { 2 | "version": 6, 3 | "include": [ "common.json" ], 4 | "configurePresets": [ 5 | { 6 | "name": "macos-common", 7 | "inherits": "libunicode-common", 8 | "generator": "Ninja", 9 | "hidden": true, 10 | "condition": { 11 | "type": "equals", 12 | "lhs": "${hostSystemName}", 13 | "rhs": "Darwin" 14 | } 15 | }, 16 | { "name": "macos-debug", "displayName": "MacOS Debug", "inherits": ["macos-common", "debug"] }, 17 | { "name": "macos-release", "displayName": "MacOS Release", "inherits": ["macos-common", "release"] } 18 | ], 19 | "buildPresets": [ 20 | { "name": "macos-debug", "configurePreset": "macos-debug" }, 21 | { "name": "macos-release", "configurePreset": "macos-release" } 22 | ], 23 | "testPresets": [ 24 | { "name": "macos-debug", "configurePreset": "macos-debug", "output": {"outputOnFailure": true}, "execution": { "noTestsAction": "error", 
"stopOnFailure": true } }, 25 | { "name": "macos-release", "configurePreset": "macos-release", "output": {"outputOnFailure": true}, "execution": { "noTestsAction": "error", "stopOnFailure": true } } 26 | ] 27 | } 28 | -------------------------------------------------------------------------------- /cmake/presets/os-windows.json: -------------------------------------------------------------------------------- 1 | { 2 | "version": 6, 3 | "include": [ "common.json" ], 4 | "configurePresets": [ 5 | { 6 | "name": "windows-common", 7 | "inherits": "libunicode-common", 8 | "displayName": "Windows - common settings", 9 | "hidden": true, 10 | "binaryDir": "${sourceDir}/out/build/${presetName}", 11 | "condition": { 12 | "type": "equals", 13 | "lhs": "${hostSystemName}", 14 | "rhs": "Windows" 15 | }, 16 | "cacheVariables": { 17 | "VCPKG_TARGET_TRIPLET": "x64-windows", 18 | "CMAKE_INSTALL_PREFIX": "${sourceDir}/out/install/${presetName}", 19 | "CMAKE_VERBOSE_MAKEFILE": "ON", 20 | "CMAKE_TOOLCHAIN_FILE": "${sourceDir}/../vcpkg/scripts/buildsystems/vcpkg.cmake" 21 | } 22 | }, 23 | { "name": "windows-cl-debug", "inherits": ["windows-common", "debug"], "displayName": "Windows (MSVC) Debug", "description": "Using MSVC compiler (64-bit)" }, 24 | { "name": "windows-cl-release", "inherits": ["windows-common", "release"], "displayName": "Windows (MSVC) Release", "description": "Using MSVC compiler (64-bit)" }, 25 | { "name": "windows-clang-common", "inherits": ["windows-common"], "hidden": true, "toolset": "ClangCL,host=x64" }, 26 | { "name": "windows-clang-debug", "inherits": ["windows-clang-common", "debug"], "displayName": "Windows (ClangCL) Debug", "description": "Using Clang compiler (64-bit)" }, 27 | { "name": "windows-clang-release", "inherits": ["windows-clang-common", "release"], "displayName": "Windows (ClangCL) Release", "description": "Using Clang compiler (64-bit)" } 28 | ], 29 | "buildPresets": [ 30 | { "name": "windows-cl-debug", "displayName": "x64 (MSVC) Debug", 
"configurePreset": "windows-cl-debug", "configuration": "Debug" }, 31 | { "name": "windows-cl-release", "displayName": "x64 (MSVC) RelWithDebInfo", "configurePreset": "windows-cl-release", "configuration": "RelWithDebInfo" }, 32 | { "name": "windows-clang-debug", "displayName": "x64 (Clang) Debug", "configurePreset": "windows-clang-debug", "configuration": "Debug" }, 33 | { "name": "windows-clang-release", "displayName": "x64 (Clang) RelWithDebInfo", "configurePreset": "windows-clang-release", "configuration": "RelWithDebInfo" } 34 | ], 35 | "testPresets": [ 36 | { "name": "windows-cl-debug", "configurePreset": "windows-cl-debug", "output": {"outputOnFailure": true}, "execution": { "noTestsAction": "error", "stopOnFailure": true } }, 37 | { "name": "windows-cl-release", "configurePreset": "windows-cl-release", "output": {"outputOnFailure": true}, "execution": { "noTestsAction": "error", "stopOnFailure": true } } 38 | ] 39 | } 40 | -------------------------------------------------------------------------------- /scripts/check-pr-todos.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | FOUND=$(git grep "TODO(pr)" | grep -v "scripts/check-pr-todos.sh") 3 | if [[ "${FOUND}" == "" ]]; then 4 | exit 0 5 | fi 6 | 7 | echo "This PR still contains PR-related TODO itmes that must be resolved." 8 | echo 9 | echo "${FOUND}" 10 | exit 1 11 | -------------------------------------------------------------------------------- /scripts/install-deps.ps1: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env pwsh 2 | 3 | # Let's assume for now, that this script is only invoked from within Windows 4 | # But in the future, I'd like it to support all the others, too. 
5 | 6 | class ThirdParty 7 | { 8 | [ValidateNotNullOrEmpty()] [string] $Folder 9 | [ValidateNotNullOrEmpty()] [string] $Archive 10 | [ValidateNotNullOrEmpty()] [string] $URI 11 | } 12 | 13 | # Take care, order matters, at least as much as dependencies are of concern. 14 | $ThirdParties = 15 | @( 16 | [ThirdParty]@{ 17 | Folder="Catch2-3.4.0"; 18 | Archive="Catch2-3.4.0.zip"; 19 | URI="https://github.com/catchorg/Catch2/archive/refs/tags/v3.4.0.zip" 20 | }; 21 | ) 22 | 23 | function Fetch-And-Add 24 | { 25 | param ( 26 | [Parameter(Mandatory)] [string] $Target, 27 | [Parameter(Mandatory)] [string] $Folder, 28 | [Parameter(Mandatory)] [string] $Archive, 29 | [Parameter(Mandatory)] [string] $URI, 30 | [Parameter(Mandatory)] [string] $CMakeListsFile 31 | ) 32 | 33 | $DistfilesDir = "${Target}/distfiles" 34 | if (! [System.IO.Directory]::Exists($DistfilesDir)) 35 | { 36 | New-Item -ItemType Directory -Force -Path $DistfilesDir 37 | } 38 | 39 | $ArchivePath = "${DistfilesDir}/${Archive}" 40 | if (! [System.IO.File]::Exists($ArchivePath)) 41 | { 42 | Write-Host "Downloading $Archive to $ArchivePath" 43 | Invoke-WebRequest -Uri $URI -OutFile $ArchivePath 44 | } 45 | else 46 | { 47 | Write-Host "Already there: $ArchivePath" 48 | } 49 | 50 | if (! [System.IO.Directory]::Exists("$Target/sources/$Folder")) 51 | { 52 | Write-Host "Populating ${Folder}" 53 | Expand-Archive $ArchivePath -DestinationPath "${Target}/sources/" 54 | } 55 | else 56 | { 57 | Write-Host "Already there ${Folder}" 58 | } 59 | 60 | Add-Content $CMakeListsFile "add_subdirectory(${Folder} EXCLUDE_FROM_ALL)" 61 | } 62 | 63 | function Run 64 | { 65 | $ProjectRoot = "${PSScriptRoot}/.." 66 | $ThirsPartiesDir = "${ProjectRoot}/_deps" 67 | $DistfilesDir = "${ThirsPartiesDir}/distfiles" 68 | $SourcesDir = "${ThirsPartiesDir}/sources" 69 | $CMakeListsFile = "${SourcesDir}/CMakeLists.txt" 70 | 71 | if (! 
[System.IO.Directory]::Exists($DistfilesDir)) 72 | { 73 | New-Item -ItemType Directory -Force -Path $DistfilesDir 74 | } 75 | 76 | if (! [System.IO.Directory]::Exists($SourcesDir)) 77 | { 78 | New-Item -ItemType Directory -Force -Path $SourcesDir 79 | } 80 | 81 | if ([System.IO.File]::Exists($CMakeListsFile)) 82 | { 83 | Clear-Content $CMakeListsFile 84 | } 85 | 86 | foreach($TP in $ThirdParties) 87 | { 88 | Fetch-And-Add -Folder $TP.Folder -Archive $TP.Archive -URI $TP.URI -Target $ThirsPartiesDir -CMakeListsFile $CMakeListsFile 89 | } 90 | } 91 | 92 | Run 93 | -------------------------------------------------------------------------------- /scripts/install-deps.sh: -------------------------------------------------------------------------------- 1 | #! /bin/sh 2 | 3 | set -ex 4 | 5 | # Special environment variable to be used when only fetching and extracting 6 | # embedded dependencies should be done, i.e. no system package manager is 7 | # being invoked. 8 | # 9 | # set this as environment variable to ON to activate this mode. 10 | if [ x$PREPARE_ONLY_EMBEDS = x ] 11 | then 12 | PREPARE_ONLY_EMBEDS=OFF 13 | fi 14 | 15 | # if SYSDEP_ASSUME_YES=ON is set, then system package managers are attempted 16 | # to install packages automatically, i.e. without confirmation. 17 | if [ x$SYSDEP_ASSUME_YES = xON ] 18 | then 19 | SYSDEP_ASSUME_YES='-y' 20 | else 21 | unset SYSDEP_ASSUME_YES 22 | fi 23 | 24 | # {{{ sysdeps fetcher and unpacker for deps that aren't available via sys pkg mgnr 25 | SYSDEPS_BASE_DIR="$(dirname $0)/../_deps" 26 | 27 | SYSDEPS_DIST_DIR="$SYSDEPS_BASE_DIR/distfiles" 28 | SYSDEPS_SRC_DIR="$SYSDEPS_BASE_DIR/sources" 29 | SYSDEPS_CMAKE_FILE="$SYSDEPS_SRC_DIR/CMakeLists.txt" 30 | 31 | fetch_and_unpack() 32 | { 33 | NAME=$1 34 | DISTFILE=$2 35 | URL=$3 36 | 37 | FULL_DISTFILE="$SYSDEPS_DIST_DIR/$DISTFILE" 38 | 39 | if ! 
test -f "$FULL_DISTFILE"; then 40 | if command -v curl >/dev/null 2>&1; then # POSIX redirection: '&>' is a bashism that backgrounds the probe under /bin/sh 41 | curl -L -o "$FULL_DISTFILE" "$URL" 42 | elif command -v wget >/dev/null 2>&1; then 43 | wget -O "$FULL_DISTFILE" "$URL" 44 | elif command -v fetch >/dev/null 2>&1; then 45 | # FreeBSD 46 | fetch -o "$FULL_DISTFILE" "$URL" 47 | else 48 | echo "Don't know how to fetch from the internet." 1>&2 49 | exit 1 50 | fi 51 | else 52 | echo "Already fetched $DISTFILE. Skipping." 53 | fi 54 | 55 | if ! test -d "$SYSDEPS_SRC_DIR/$NAME"; then 56 | echo "Extracting $DISTFILE" 57 | tar xzpf "$FULL_DISTFILE" -C "$SYSDEPS_SRC_DIR" 58 | else 59 | echo "Already extracted $DISTFILE. Skipping." 60 | fi 61 | 62 | echo "add_subdirectory($NAME EXCLUDE_FROM_ALL)" >> "$SYSDEPS_CMAKE_FILE" 63 | } 64 | 65 | fetch_and_unpack_Catch2() 66 | { 67 | fetch_and_unpack \ 68 | Catch2-3.4.0 \ 69 | Catch2-3.4.0.tar.gz \ 70 | https://github.com/catchorg/Catch2/archive/refs/tags/v3.4.0.tar.gz 71 | } 72 | 73 | fetch_and_unpack_benchmark() 74 | { 75 | fetch_and_unpack \ 76 | benchmark-1.8.3 \ 77 | benchmark-1.8.3.tar.gz \ 78 | https://github.com/google/benchmark/archive/refs/tags/v1.8.3.tar.gz 79 | } 80 | 81 | 82 | prepare_fetch_and_unpack() 83 | { 84 | mkdir -p "${SYSDEPS_BASE_DIR}" 85 | mkdir -p "${SYSDEPS_DIST_DIR}" 86 | mkdir -p "${SYSDEPS_SRC_DIR}" 87 | 88 | # empty out sysdeps CMakeLists.txt 89 | rm -f "$SYSDEPS_CMAKE_FILE" 90 | } 91 | # }}} 92 | 93 | install_deps_ubuntu() 94 | { 95 | local packages=" 96 | build-essential 97 | cmake 98 | debhelper 99 | dpkg-dev 100 | libc6-dev 101 | make 102 | ninja-build 103 | " 104 | 105 | RELEASE=`grep VERSION_ID /etc/os-release | cut -d= -f2 | tr -d '"'` 106 | 107 | local NAME=`grep ^NAME /etc/os-release | cut -d= -f2 | cut -f1 | tr -d '"'` 108 | 109 | case $RELEASE in 110 | "24.04") 111 | fetch_and_unpack_Catch2 112 | packages="$packages g++-14" 113 | ;; 114 | *) 115 | packages="$packages g++" 116 | packages="$packages catch2" 117 | ;; 118 | esac 119 | 120 | fetch_and_unpack_benchmark 121 | 122 | [ 
x$PREPARE_ONLY_EMBEDS = xON ] && return 123 | 124 | sudo apt install $SYSDEP_ASSUME_YES $packages 125 | # sudo snap install --classic powershell 126 | } 127 | 128 | install_deps_FreeBSD() 129 | { 130 | fetch_and_unpack_benchmark 131 | 132 | [ x$PREPARE_ONLY_EMBEDS = xON ] && return 133 | 134 | su root -c "pkg install $SYSDEP_ASSUME_YES \ 135 | catch \ 136 | cmake \ 137 | ninja \ 138 | pkgconf \ 139 | range-v3 140 | " 141 | } 142 | 143 | install_deps_arch() 144 | { 145 | fetch_and_unpack_benchmark 146 | [ x$PREPARE_ONLY_EMBEDS = xON ] && return 147 | 148 | sudo pacman -S -y --needed \ 149 | catch2 \ 150 | cmake \ 151 | git \ 152 | ninja \ 153 | range-v3 154 | } 155 | 156 | install_deps_fedora() 157 | { 158 | version=`cat /etc/fedora-release | awk '{print $3}'` 159 | 160 | local packages=" 161 | catch-devel 162 | cmake 163 | gcc-c++ 164 | google-benchmark-devel 165 | ninja-build 166 | pkgconf 167 | " 168 | 169 | [ x$PREPARE_ONLY_EMBEDS = xON ] && return 170 | 171 | sudo dnf install $SYSDEP_ASSUME_YES $packages 172 | } 173 | 174 | 175 | install_deps_darwin() 176 | { 177 | fetch_and_unpack_Catch2 178 | fetch_and_unpack_benchmark 179 | 180 | [ x$PREPARE_ONLY_EMBEDS = xON ] && return 181 | 182 | # NB: Also available in brew: mimalloc 183 | # catch2: available in brew, but too new (version 3+) 184 | brew install $SYSDEP_ASSUME_YES \ 185 | ninja \ 186 | pkg-config \ 187 | range-v3 188 | } 189 | 190 | main() 191 | { 192 | if test x$OS_OVERRIDE != x; then 193 | # In CI, we need to be able to fetch embedd-setups for different OSes. 
194 | ID=$OS_OVERRIDE 195 | elif test -f /etc/os-release; then 196 | ID=`grep ^ID= /etc/os-release | cut -d= -f2` 197 | else 198 | ID=`uname -s` 199 | fi 200 | 201 | prepare_fetch_and_unpack 202 | 203 | case "$ID" in 204 | arch) 205 | install_deps_arch 206 | ;; 207 | fedora) 208 | install_deps_fedora 209 | ;; 210 | ubuntu|neon|debian) 211 | install_deps_ubuntu 212 | ;; 213 | Darwin) 214 | install_deps_darwin 215 | ;; 216 | FreeBSD) 217 | install_deps_FreeBSD 218 | ;; 219 | *) 220 | fetch_and_unpack_Catch2 221 | fetch_and_unpack_benchmark 222 | echo "OS $ID not supported." 223 | echo "Dependencies were fetch manually and most likely libunicode will compile." 224 | ;; 225 | esac 226 | } 227 | 228 | main $* 229 | -------------------------------------------------------------------------------- /src/libunicode/benchmark.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #include 6 | 7 | #include 8 | 9 | using std::string_view; 10 | 11 | template 12 | static void benchmarkWithLength(benchmark::State& benchmarkState) 13 | { 14 | auto TestText = std::string(L, 'a') + "\u00A9"; 15 | for (auto _: benchmarkState) 16 | { 17 | benchmark::DoNotOptimize(unicode::detail::scan_for_text_ascii(TestText, L + 10)); 18 | } 19 | } 20 | 21 | template 22 | static void benchmarkWithOffset(benchmark::State& benchmarkState) 23 | { 24 | auto TestText = std::string(L, 'a') + "\U0001F600" + std::string(1000, 'a'); 25 | for (auto _: benchmarkState) 26 | { 27 | benchmark::DoNotOptimize(unicode::detail::scan_for_text_ascii(TestText, L + 10)); 28 | } 29 | } 30 | 31 | BENCHMARK(benchmarkWithLength<1>); 32 | BENCHMARK(benchmarkWithLength<10>); 33 | BENCHMARK(benchmarkWithLength<100>); 34 | BENCHMARK(benchmarkWithLength<1000>); 35 | BENCHMARK(benchmarkWithLength<10000>); 36 | BENCHMARK(benchmarkWithLength<100000>); 37 | BENCHMARK(benchmarkWithLength<1000000>); 38 | 39 | BENCHMARK(benchmarkWithOffset<5>); 40 | 
BENCHMARK(benchmarkWithOffset<10>); 41 | BENCHMARK(benchmarkWithOffset<15>); 42 | BENCHMARK(benchmarkWithOffset<20>); 43 | BENCHMARK(benchmarkWithOffset<25>); 44 | BENCHMARK(benchmarkWithOffset<30>); 45 | BENCHMARK(benchmarkWithOffset<35>); 46 | BENCHMARK(benchmarkWithOffset<40>); 47 | BENCHMARK(benchmarkWithOffset<45>); 48 | BENCHMARK(benchmarkWithOffset<50>); 49 | BENCHMARK(benchmarkWithOffset<55>); 50 | BENCHMARK(benchmarkWithOffset<60>); 51 | BENCHMARK(benchmarkWithOffset<65>); 52 | BENCHMARK(benchmarkWithOffset<70>); 53 | BENCHMARK(benchmarkWithOffset<75>); 54 | BENCHMARK(benchmarkWithOffset<80>); 55 | BENCHMARK(benchmarkWithOffset<85>); 56 | BENCHMARK(benchmarkWithOffset<90>); 57 | BENCHMARK(benchmarkWithOffset<95>); 58 | BENCHMARK(benchmarkWithOffset<100>); 59 | BENCHMARK(benchmarkWithOffset<105>); 60 | BENCHMARK(benchmarkWithOffset<110>); 61 | BENCHMARK(benchmarkWithOffset<115>); 62 | BENCHMARK(benchmarkWithOffset<120>); 63 | BENCHMARK(benchmarkWithOffset<125>); 64 | BENCHMARK(benchmarkWithOffset<130>); 65 | 66 | // Run the benchmark 67 | BENCHMARK_MAIN(); 68 | -------------------------------------------------------------------------------- /src/libunicode/capi.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * This file is part of the "libunicode" project 3 | * Copyright (c) 2021 Christian Parpart 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * 8 | * Unless required by applicable law or agreed to in writing, software 9 | * distributed under the License is distributed on an "AS IS" BASIS, 10 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 | * See the License for the specific language governing permissions and 12 | * limitations under the License. 
13 | */ 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | 20 | #include 21 | 22 | int u32_gc_count(u32_char_t const* codepoints, size_t size) 23 | { 24 | if (!size) 25 | return 0; 26 | 27 | int count = 1; 28 | auto segmenter = unicode::grapheme_segmenter((char32_t const*) codepoints, (char32_t const*) codepoints + size); 29 | while (segmenter.codepointsAvailable()) 30 | { 31 | ++segmenter; 32 | ++count; 33 | } 34 | return count; 35 | } 36 | 37 | int u8_gc_count(u8_char_t const* codepoints, size_t size) 38 | { 39 | auto const u32 = unicode::convert_to(std::string_view(codepoints, size)); 40 | return u32_gc_count((uint32_t const*) u32.data(), u32.size()); 41 | } 42 | 43 | int u32_gc_width(u32_char_t const* codepoints, size_t size, int mode) 44 | { 45 | int totalWidth = 0; 46 | auto segmenter = unicode::grapheme_segmenter((char32_t const*) codepoints, (char32_t const*) codepoints + size); 47 | while (segmenter.codepointsAvailable()) 48 | { 49 | auto const cluster = *segmenter; 50 | int thisWidth = static_cast(unicode::width(cluster.front())); 51 | if (mode != GC_WIDTH_MODE_NON_MODIFIABLE) 52 | { 53 | for (size_t i = 1; i < cluster.size(); ++i) // only the current cluster's trailing codepoints may modify its width 54 | { 55 | auto const codepoint = cluster[i]; 56 | auto const width = [&]() { 57 | switch (codepoint) 58 | { 59 | case 0xFE0E: return 1; 60 | case 0xFE0F: return 2; 61 | default: return static_cast(unicode::width(codepoint)); 62 | } 63 | }(); 64 | if (width && width != thisWidth) 65 | thisWidth = width; 66 | } 67 | } 68 | totalWidth += thisWidth; 69 | ++segmenter; 70 | } 71 | return totalWidth; 72 | } 73 | 74 | int u8_gc_width(u8_char_t const* codepoints, size_t count, int allowMod) 75 | { 76 | (void) codepoints; 77 | (void) count; 78 | (void) allowMod; 79 | 80 | return -1; // TODO 81 | } 82 | 83 | int u32_grapheme_unbreakable(u32_char_t a, u32_char_t b) 84 | { 85 | return unicode::grapheme_segmenter::nonbreakable(a, b); 86 | } 87 | 88 | struct u8u32_stream_state 89 | { 90 | unicode::decoder conv {}; 91 | }; 
92 | 93 | u8u32_stream_state_t u8u32_stream_convert_create() 94 | { 95 | return new u8u32_stream_state(); 96 | } 97 | 98 | int u8u32_stream_convert_run(u8u32_stream_state_t handle, u8_char_t input, u32_char_t* output) 99 | { 100 | if (auto const codepoint = handle->conv(static_cast(input)); codepoint.has_value()) 101 | { 102 | *output = codepoint.value(); 103 | return 1; 104 | } 105 | return 0; 106 | } 107 | 108 | void u8u32_stream_convert_destroy(u8u32_stream_state_t* handle) 109 | { 110 | delete *handle; 111 | *handle = nullptr; 112 | } 113 | 114 | int u32u8_convert(u32_char_t const* source, size_t slen, u8_char_t* dest, size_t dlen) 115 | { 116 | auto conv = unicode::encoder {}; 117 | auto nwritten = 0; 118 | 119 | for (size_t i = 0; i < slen; ++i) 120 | { 121 | u8_char_t buf[4]; 122 | auto const bufEnd = conv(source[i], buf); 123 | auto const bufLength = static_cast(std::distance(buf, bufEnd)); 124 | if (bufLength > dlen) // an exact fit is accepted: no trailing zero byte is written 125 | return -1; 126 | 127 | for (size_t k = 0; k < bufLength; ++k) 128 | dest[k] = buf[k]; 129 | nwritten += static_cast(bufLength); 130 | dest += bufLength; 131 | dlen -= bufLength; 132 | } 133 | 134 | return nwritten; 135 | } 136 | -------------------------------------------------------------------------------- /src/libunicode/capi.h: -------------------------------------------------------------------------------- 1 | /** 2 | * This file is part of the "libunicode" project 3 | * Copyright (c) 2021 Christian Parpart 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * 8 | * Unless required by applicable law or agreed to in writing, software 9 | * distributed under the License is distributed on an "AS IS" BASIS, 10 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 | * See the License for the specific language governing permissions and 12 | * limitations under the License. 
13 | */ 14 | #ifndef LIBUNICODE_CAPI_H 15 | #define LIBUNICODE_CAPI_H 1 16 | 17 | #include 18 | #include 19 | 20 | #if !defined(__cplusplus) 21 | extern "C" 22 | { 23 | #endif 24 | 25 | #define U32_CODEPOINT_MAX 0x10FFFF // 0b1'0000'1111'1111'1111'1111 26 | #define U32_CODEPOINT_MIN 0 27 | #define U32_CODEPOINT_MASK 0x1FFFFF // 0b1'1111'1111'1111'1111'1111 28 | 29 | /// UTF-8 character or 8bit segment of an UTF-8 character. 30 | typedef char u8_char_t; 31 | 32 | /// UTF-32 codepoint between 0 and 0x10FFFF. Any valud outside that 33 | /// range must be properly handled by the functions above to avoid undefined 34 | /// behavior. 35 | typedef uint_least32_t u32_char_t; 36 | 37 | /** 38 | * Verifies that _p codepoint is a valid codepoint, 39 | */ 40 | #define u32_is_valid_codepoint(_codepoint) \ 41 | ((_codepoint) < 0xD800 || ((_codepoint) > 0xDFFF && (_codepoint) <= 0x10FFFF) 42 | 43 | /** 44 | * Extracts the unused higher order bits and moves them bit-wise to the right. 45 | * 46 | * A UTF-32 character is 32 bits wide (on a machine at least 32 bits wide), 47 | * and the largest valid UTF-32 codepoint is 0x10FFFF. 48 | * That is, the 21 least significant bits are used and the 11 most significant 49 | * bits are available other application specific purposes. 50 | */ 51 | #define u32_unused_bit_mask(_codepoint) ((_codepoint) >> 21) 52 | 53 | /** 54 | * Returns the number of available bits that are free 55 | * for application-specific use. 56 | * 57 | * If the machine type for UTF-32 is actually 32 bits wide, this 58 | * function yields 11 bits. It is guaranteed to always return at least 11. 59 | */ 60 | #define u32_unused_bit_capacity() (8 * sizeof(u32_char_t) - 21) 61 | 62 | /** 63 | * Tests if given bit at @p _index of the unused most significant bits is set. 64 | */ 65 | #define u32_unused_bit_get(_codepoint, _index) (((_codepoint) & (1 << ((_index) + 21))) != 0) 66 | 67 | /** 68 | * Sets the bit at @p _index of the unused most significant bits. 
69 | */ 70 | #define u32_unused_bit_on(_codepoint, _index) ((_codepoint) | ((u32_char_t) 1 << ((_index) + 21))) 71 | 72 | /** 73 | * Clears the bit at @p _index of the unused most significant bits. 74 | */ 75 | #define u32_unused_bit_off(_codepoint, _index) ((_codepoint) & ~((u32_char_t) 1 << ((_index) + 21))) 76 | 77 | /** 78 | * Returns @p _codepoint with all unused bits cleared. 79 | */ 80 | #define u32_unused_bit_cleared(_codepoint) ((_codepoint) & U32_CODEPOINT_MASK) 81 | 82 | /** 83 | * Counts the number of grapheme clusters for given sequence of codepoints. 84 | * 85 | * Use this function to determine the number of 86 | * user perceived characters (grapheme clusters). 87 | * 88 | * @param codepoints pointer to the first codepoint. 89 | * @param n number of codepoints to count the grapheme clusters for. 90 | * 91 | * @return number of user perceived characters (grapheme clusters) counted 92 | * in [codepoints, codepoints+n). 93 | */ 94 | int u32_gc_count(u32_char_t const* codepoints, size_t n); 95 | int u8_gc_count(u8_char_t const* codepoints, size_t n); 96 | 97 | /** 98 | * Determines that u32_gc_width()/u8_gc_width() must not respect 99 | * variation selectors, and thus, will not change the width of a 100 | * processed grapheme cluster. 101 | * 102 | * Using this is not recommended unless backwards compatibility with 103 | * broken clients is of concern. 104 | */ 105 | #define GC_WIDTH_MODE_NON_MODIFIABLE 0 106 | 107 | /** 108 | * Mandates that u32_gc_width()/u8_gc_width() must respect 109 | * variation selectors, thus, allow changing the width of 110 | * a processed grapheme cluster. 111 | */ 112 | #define GC_WIDTH_MODE_MODIFIABLE 1 113 | 114 | /** 115 | * Computes the display width for given sequence of codepoints, 116 | * respecting grapheme clusters, and modifiers. 
117 | * 118 | * @param codepoints pointer to first codepoint 119 | * @param n number of codepoints 120 | * @param mode determines how to deal with variation selectors that do 121 | * force changing the width or a grapheme cluster. 122 | * Valid values are: 123 | * GC_WIDTH_MODE_MODIFIABLE (allow, recommended), 124 | * GC_WIDTH_MODE_NON_MODIFIABLE (disallowed). 125 | * 126 | * Use this function to determine how many terminal grid cells a 127 | * string of codepoints should occupy when being rendered. 128 | */ 129 | int u32_gc_width(u32_char_t const* codepoints, size_t n, int mode); 130 | 131 | /** 132 | * UTF-8 version of @c u32_gc_width(). 133 | * 134 | * @see u32_gc_width(u32_char_t const* codepoints, size_t n, int allowMod) 135 | */ 136 | int u8_gc_width(u8_char_t const* codepoints, size_t n, int allowMod); 137 | 138 | /** 139 | * Tests if two consecutive codepoints do belong to the same grapheme cluster, 140 | * i.e. are unbreakable and thus should not be broken up. 141 | * 142 | * @retval 1 both codepoints to belong to the same grapheme cluster. 143 | * @retval 0 both codepoints do not belong to the same grapheme cluster. 144 | * 145 | * @note The grapheme cluster segmentation algorithm walks through an 146 | * ordered sequence of checks that would either yield return value true 147 | * or value. If non of these rules match, true will be returned, meaning 148 | * that the both codepoints @p a and @p b can be broken up. 149 | * This implies that codepoints outside the valid Unicode range will also yield 150 | * return code true. 151 | */ 152 | int u32_grapheme_unbreakable(u32_char_t a, u32_char_t b); 153 | 154 | /** 155 | * Opaque handle for the UTF-8 to UTF-32 stream converter. 156 | */ 157 | struct u8u32_stream_state; 158 | typedef struct u8u32_stream_state* u8u32_stream_state_t; 159 | 160 | /** 161 | * Constructs an UTF-8-to-UTF-32 streamed converter context. 
162 | */ 163 | u8u32_stream_state_t u8u32_stream_convert_create(); 164 | 165 | /** 166 | * Processes a single UTF-8 byte to incrementally convert 167 | * consecutively incoming UTF-8 bytes into a sequence of UTF-32 codepoints. 168 | * 169 | * @param handle The handle to the previously created streaming context. 170 | * @param input A UTF-8 character to be procecced consecutively. 171 | * @param output Will contain the fully parsed UTF-32 codepoint every time 172 | * one is available. 173 | * 174 | * @retval 0 The codepoint is incomplete and needs more data; @p output is not touched. 175 | * @retval 1 The UTF-8 codepoint was fully processed and stored into @p output. 176 | * 177 | * @note Invalid input is silently ignored. 178 | */ 179 | int u8u32_stream_convert_run(u8u32_stream_state_t handle, u8_char_t input, u32_char_t* output); 180 | 181 | /** 182 | * Destroys the UTF-8-to-UTF-32 streaming converter context. 183 | * The parameer @p handle will be set to NULL when this call leaves. 184 | */ 185 | void u8u32_stream_convert_destroy(u8u32_stream_state_t* handle); 186 | 187 | /** 188 | * Convertes a UTF-32 sequence to UTF-8. 189 | * 190 | * @param source Pointer sequence of UTF-32 characters to convert. 191 | * @param slen Number of UTF-32 characters to convert. 192 | * @param dest Destination address where to store the converted UTF-8 sequence to. 193 | * @param dlen Number of bytes to write to @p _dest at most. 194 | * 195 | * @note No trailing zero byte will be written. 196 | * 197 | * @retval >0 Success. TRhe number of bytes written to @p _dest is returned. 198 | * @retval 0 _slen is 0, and nothing was converted. 199 | * @retval -1 Some characters have been converted but target destination 200 | * is not large enough to continue. 
201 | */ 202 | int u32u8_convert(u32_char_t const* source, size_t slen, u8_char_t* dest, size_t dlen); 203 | 204 | #if !defined(__cplusplus) 205 | } 206 | #endif 207 | 208 | #endif 209 | -------------------------------------------------------------------------------- /src/libunicode/capi_test.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * This file is part of the "libunicode" project 3 | * Copyright (c) 2021 Christian Parpart 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * 8 | * Unless required by applicable law or agreed to in writing, software 9 | * distributed under the License is distributed on an "AS IS" BASIS, 10 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 | * See the License for the specific language governing permissions and 12 | * limitations under the License. 13 | */ 14 | #include 15 | 16 | #include 17 | 18 | #include 19 | #include 20 | #include 21 | 22 | using namespace std; 23 | using namespace std::string_view_literals; 24 | 25 | TEST_CASE("capi.gc_count") 26 | { 27 | CHECK(0 == u32_gc_count((u32_char_t const*) U"", 0)); 28 | CHECK(1 == u32_gc_count((u32_char_t const*) U"\U0001F600\uFE0E", 2)); 29 | CHECK(2 == u32_gc_count((u32_char_t const*) U"\U0001F600\uFE0E\U0001F600", 3)); 30 | CHECK(3 == u32_gc_count((u32_char_t const*) U"Yeo", 3)); 31 | CHECK(4 == u32_gc_count((u32_char_t const*) U"Hi \U0001F600\uFE0E", 5)); 32 | CHECK(4 == u32_gc_count((u32_char_t const*) U"1234", 4)); 33 | CHECK(1 == u32_gc_count((u32_char_t const*) U"\U0001F468\U0001F3FE\u200D\U0001F9B3", 4)); 34 | } 35 | 36 | TEST_CASE("capi.u8u32_stream_convert_and_inverse") 37 | { 38 | auto constexpr input = "[\xC3\xB6\xE2\x82\xAC\xF0\x9F\x98\x80"sv; 39 | auto constexpr expected = U"[ö€😀"sv; 40 | 41 | u8u32_stream_state_t conv = u8u32_stream_convert_create(); 42 | u32string output; 43 | for (size_t 
i = 0; i < input.size(); ++i) 44 | { 45 | u32_char_t out {}; 46 | if (u8u32_stream_convert_run(conv, input.at(i), &out)) 47 | output.push_back(out); 48 | } 49 | CHECK(output == expected); 50 | u8u32_stream_convert_destroy(&conv); 51 | 52 | // Verify inverse conversion (UTF-32 to UTF-8) works, too. 53 | array inverse {}; 54 | auto const ilen = u32u8_convert((u32_char_t const*) expected.data(), expected.size(), inverse.data(), inverse.size()); 55 | CHECK(ilen == (int) input.size()); 56 | auto const inverseSV = string_view { inverse.data(), static_cast(ilen) }; 57 | CHECK(inverseSV == input); 58 | } 59 | 60 | // TODO more C-API tests 61 | -------------------------------------------------------------------------------- /src/libunicode/codepoint_properties.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * This file is part of the "libunicode" project 3 | * Copyright (c) 2020 Christian Parpart 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * 8 | * Unless required by applicable law or agreed to in writing, software 9 | * distributed under the License is distributed on an "AS IS" BASIS, 10 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 | * See the License for the specific language governing permissions and 12 | * limitations under the License. 
13 | */ 14 | #include 15 | #include 16 | 17 | namespace unicode 18 | { 19 | /* Initial value of the configured property tables: views into the precompiled stage1/stage2/properties arrays. */ 20 | codepoint_properties::tables_view codepoint_properties::configured_tables { precompiled::stage1.data(), 21 | precompiled::stage2.data(), 22 | precompiled::properties.data() }; 23 | /* Initial value of the configured name tables: views into the three precompiled names_stage arrays. */ 24 | codepoint_properties::names_view codepoint_properties::configured_names { 25 | precompiled::names_stage1.data(), 26 | precompiled::names_stage2.data(), 27 | precompiled::names_stage3.data(), 28 | }; 29 | 30 | } // namespace unicode 31 | -------------------------------------------------------------------------------- /src/libunicode/codepoint_properties.h: -------------------------------------------------------------------------------- 1 | /** 2 | * This file is part of the "libunicode" project 3 | * Copyright (c) 2020 Christian Parpart 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * 8 | * Unless required by applicable law or agreed to in writing, software 9 | * distributed under the License is distributed on an "AS IS" BASIS, 10 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 | * See the License for the specific language governing permissions and 12 | * limitations under the License. 13 | */ 14 | #pragma once 15 | 16 | #include // Only for EmojiSegmentationCategory. 17 | #include 18 | #include // Only for LIBUNICODE_PACKED. 19 | #include // Only for the UCD enums. 
20 | 21 | #include 22 | 23 | namespace unicode 24 | { 25 | /** Property record returned by get() for a single codepoint; packed, and compared byte-wise by operator==. */ 26 | struct LIBUNICODE_PACKED codepoint_properties 27 | { 28 | uint8_t char_width = 0; 29 | uint8_t flags = 0; 30 | Script script = Script::Unknown; 31 | Grapheme_Cluster_Break grapheme_cluster_break = Grapheme_Cluster_Break::Other; 32 | East_Asian_Width east_asian_width = East_Asian_Width::Narrow; 33 | General_Category general_category = General_Category::Unassigned; 34 | EmojiSegmentationCategory emoji_segmentation_category = EmojiSegmentationCategory::Invalid; 35 | Age age = Age::Unassigned; 36 | /* Bit masks for the `flags` member. */ 37 | static uint8_t constexpr FlagEmoji = 0x01; // NOLINT(readability-identifier-naming) 38 | static uint8_t constexpr FlagEmojiPresentation = 0x02; // NOLINT(readability-identifier-naming) 39 | static uint8_t constexpr FlagEmojiComponent = 0x04; // NOLINT(readability-identifier-naming) 40 | static uint8_t constexpr FlagEmojiModifier = 0x08; // NOLINT(readability-identifier-naming) 41 | static uint8_t constexpr FlagEmojiModifierBase = 0x10; // NOLINT(readability-identifier-naming) 42 | static uint8_t constexpr FlagExtendedPictographic = 0x20; // NOLINT(readability-identifier-naming) 43 | static uint8_t constexpr FlagCoreGraphemeExtend = 0x40; // NOLINT(readability-identifier-naming) 44 | /* Each accessor below tests the matching Flag* bit in `flags`. 
*/ 45 | constexpr bool emoji() const noexcept { return flags & FlagEmoji; } 46 | constexpr bool emoji_presentation() const noexcept { return flags & FlagEmojiPresentation; } 47 | constexpr bool emoji_component() const noexcept { return flags & FlagEmojiComponent; } 48 | constexpr bool emoji_modifier() const noexcept { return flags & FlagEmojiModifier; } 49 | constexpr bool emoji_modifier_base() const noexcept { return flags & FlagEmojiModifierBase; } 50 | constexpr bool extended_pictographic() const noexcept { return flags & FlagExtendedPictographic; } 51 | constexpr bool core_grapheme_extend() const noexcept { return flags & FlagCoreGraphemeExtend; } 52 | 53 | using tables_view = support::multistage_table_view; 60 | 61 | using 
names_view = support::multistage_table_view; 68 | 69 | static tables_view configured_tables; 70 | static names_view configured_names; 71 | 72 | /// Retrieves the codepoint properties for the given codepoint. 73 | [[nodiscard]] static codepoint_properties get(char32_t codepoint) noexcept { return configured_tables.get(codepoint); } 74 | /** Retrieves the entry stored for @p codepoint in configured_names. */ 75 | [[nodiscard]] static std::string_view name(char32_t codepoint) { return configured_names.get(codepoint); } 76 | }; 77 | /* Guards the byte-wise comparison in operator== below: it is only meaningful when equal values have equal object representations. */ 78 | static_assert(std::has_unique_object_representations_v); 79 | /** Equality over the raw bytes of both records. */ 80 | constexpr bool operator==(codepoint_properties const& a, codepoint_properties const& b) noexcept 81 | { 82 | return __builtin_memcmp(&a, &b, sizeof(codepoint_properties)) == 0; 83 | } 84 | /** Negation of operator==. */ 85 | constexpr bool operator!=(codepoint_properties const& a, codepoint_properties const& b) noexcept 86 | { 87 | return !(a == b); 88 | } 89 | 90 | } // namespace unicode 91 | -------------------------------------------------------------------------------- /src/libunicode/codepoint_properties_loader.h: -------------------------------------------------------------------------------- 1 | /** 2 | * This file is part of the "libunicode" project 3 | * Copyright (c) 2020 Christian Parpart 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * 8 | * Unless required by applicable law or agreed to in writing, software 9 | * distributed under the License is distributed on an "AS IS" BASIS, 10 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 | * See the License for the specific language governing permissions and 12 | * limitations under the License. 
13 | */ 14 | #pragma once 15 | 16 | #include 17 | #include 18 | 19 | #include 20 | 21 | namespace unicode 22 | { 23 | 24 | using codepoint_properties_table = support::multistage_table; 31 | 32 | using codepoint_names_table = support::multistage_table; 39 | /** NOTE(review): presumably builds both tables declared above from the UCD files found in @p ucdDataDirectory, with @p log as an optional diagnostics stream -- confirm against the definition. */ 40 | std::tuple load_from_directory(std::string const& ucdDataDirectory, 41 | std::ostream* log); 42 | 43 | } // namespace unicode 44 | -------------------------------------------------------------------------------- /src/libunicode/convert_test.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * This file is part of the "libunicode" project 3 | * Copyright (c) 2020 Christian Parpart 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * 8 | * Unless required by applicable law or agreed to in writing, software 9 | * distributed under the License is distributed on an "AS IS" BASIS, 10 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 | * See the License for the specific language governing permissions and 12 | * limitations under the License. 
13 | */ 14 | #include 15 | #include 16 | #include 17 | 18 | #include 19 | 20 | #include 21 | #include 22 | 23 | using namespace unicode; 24 | using namespace std::string_literals; 25 | using namespace std; 26 | 27 | TEST_CASE("convert.same", "[convert]") 28 | { 29 | auto const s8 = "Hello, 😀"sv; 30 | auto t8 = string {}; 31 | unicode::convert_to(s8, back_insert_iterator(t8)); 32 | CHECK(s8 == t8); 33 | 34 | auto const s16 = u"Hello, 😀"sv; 35 | auto t16 = u16string {}; 36 | unicode::convert_to(s16, back_insert_iterator(t16)); 37 | CHECK(s16 == t16); 38 | 39 | auto const s32 = U"Hello, 😀"sv; 40 | auto t32 = u32string {}; 41 | unicode::convert_to(s32, back_insert_iterator(t32)); 42 | CHECK(s32 == t32); 43 | } 44 | 45 | TEST_CASE("convert.8_to_16", "[convert]") 46 | { 47 | auto constexpr input = string_view { 48 | "[" 49 | "\xC3\xB6" // ö - german o-umlaut 50 | "\xE2\x82\xAC" // € - EURO sign U+20AC 51 | "\xF0\x9F\x98\x80" // 😀 - U+1F600 52 | }; 53 | u16string output; 54 | auto bi = back_inserter(output); 55 | unicode::convert_to(input, bi); // back_inserter(output)); 56 | CHECK(output.size() == 5); 57 | CHECK(output == u"[ö€😀"); 58 | } 59 | 60 | TEST_CASE("convert.8_to_32", "[convert]") 61 | { 62 | auto constexpr input = string_view { 63 | "[" 64 | "\xC3\xB6" // ö - german o-umlaut 65 | "\xE2\x82\xAC" // € - EURO sign U+20AC 66 | "\xF0\x9F\x98\x80" // 😀 - U+1F600 67 | }; 68 | u32string output; 69 | auto bi = back_inserter(output); 70 | unicode::convert_to(input, bi); // back_inserter(output)); 71 | CHECK(output.size() == 4); 72 | CHECK(output == U"[ö€😀"); 73 | } 74 | 75 | TEST_CASE("convert.utf8.incremental_decode", "[utf8]") 76 | { 77 | auto constexpr values = string_view { 78 | "[" 79 | "\xC3\xB6" // ö - german o-umlaut 80 | "\xE2\x82\xAC" // € - EURO sign U+20AC 81 | "\xF0\x9F\x98\x80" // 😀 - U+1F600 82 | }; 83 | auto const* p = (char8_type const*) (values.data()); 84 | auto decode = unicode::decoder {}; 85 | 86 | // single-byte 87 | auto result = decode(*p++); 88 
| REQUIRE(result.has_value()); 89 | REQUIRE(result.value() == '['); 90 | 91 | // double-byte 92 | result = decode(*p++); 93 | REQUIRE(!result.has_value()); 94 | result = decode(*p++); 95 | REQUIRE(result.has_value()); 96 | REQUIRE(result.value() == U'\u00F6'); // ö 97 | 98 | // 3 bytes 99 | result = decode(*p++); 100 | REQUIRE(!result.has_value()); 101 | result = decode(*p++); 102 | REQUIRE(!result.has_value()); 103 | result = decode(*p++); 104 | REQUIRE(result.has_value()); 105 | REQUIRE(result.value() == U'\u20AC'); // € 106 | 107 | // 4 bytes 108 | result = decode(*p++); 109 | REQUIRE(!result.has_value()); 110 | result = decode(*p++); 111 | REQUIRE(!result.has_value()); 112 | result = decode(*p++); 113 | REQUIRE(!result.has_value()); 114 | result = decode(*p++); 115 | REQUIRE(result.has_value()); 116 | REQUIRE(result.value() == U'\U0001F600'); // 😀 117 | } 118 | -------------------------------------------------------------------------------- /src/libunicode/emoji_presentation_scanner.c: -------------------------------------------------------------------------------- 1 | 2 | #line 1 "emoji_presentation_scanner.rl" 3 | /* Copyright 2019 Google LLC 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * https://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | // clang-format off 19 | #pragma GCC diagnostic ignored "-Wsign-conversion" 20 | 21 | #line 20 "emoji_presentation_scanner.c" 22 | static const char _emoji_presentation_actions[] = { 23 | 0, 1, 0, 1, 1, 1, 5, 1, 24 | 6, 1, 7, 1, 8, 1, 9, 1, 25 | 10, 1, 11, 2, 2, 3, 2, 2, 26 | 4 27 | }; 28 | 29 | static const char _emoji_presentation_key_offsets[] = { 30 | 0, 5, 7, 14, 18, 20, 21, 24, 31 | 29, 30, 34, 36 32 | }; 33 | 34 | static const unsigned char _emoji_presentation_trans_keys[] = { 35 | 3u, 7u, 13u, 0u, 2u, 14u, 15u, 2u, 36 | 3u, 6u, 7u, 13u, 0u, 1u, 9u, 10u, 37 | 11u, 12u, 10u, 12u, 10u, 4u, 10u, 12u, 38 | 4u, 9u, 10u, 11u, 12u, 6u, 9u, 10u, 39 | 11u, 12u, 8u, 10u, 9u, 10u, 11u, 12u, 40 | 14u, 0 41 | }; 42 | 43 | static const char _emoji_presentation_single_lengths[] = { 44 | 3, 2, 5, 4, 2, 1, 3, 5, 45 | 1, 4, 2, 5 46 | }; 47 | 48 | static const char _emoji_presentation_range_lengths[] = { 49 | 1, 0, 1, 0, 0, 0, 0, 0, 50 | 0, 0, 0, 0 51 | }; 52 | 53 | static const char _emoji_presentation_index_offsets[] = { 54 | 0, 5, 8, 15, 20, 23, 25, 29, 55 | 35, 37, 42, 45 56 | }; 57 | 58 | static const char _emoji_presentation_indicies[] = { 59 | 2, 1, 1, 1, 0, 4, 5, 3, 60 | 7, 8, 10, 11, 12, 6, 9, 5, 61 | 13, 14, 15, 0, 13, 15, 16, 13, 62 | 16, 15, 13, 15, 16, 15, 5, 13, 63 | 14, 15, 16, 5, 17, 5, 13, 14, 64 | 18, 17, 5, 13, 16, 5, 13, 14, 65 | 15, 4, 16, 0 66 | }; 67 | 68 | static const char _emoji_presentation_trans_targs[] = { 69 | 2, 4, 6, 2, 1, 2, 3, 3, 70 | 7, 2, 8, 9, 11, 0, 2, 5, 71 | 2, 2, 10 72 | }; 73 | 74 | static const char _emoji_presentation_trans_actions[] = { 75 | 17, 19, 19, 15, 0, 7, 22, 19, 76 | 19, 9, 0, 22, 19, 0, 5, 19, 77 | 11, 13, 19 78 | }; 79 | 80 | static const char _emoji_presentation_to_state_actions[] = { 81 | 0, 0, 1, 0, 0, 0, 0, 0, 82 | 0, 0, 0, 0 83 | }; 84 | 85 | static const char _emoji_presentation_from_state_actions[] = { 86 | 0, 0, 3, 0, 0, 0, 0, 0, 87 | 0, 0, 0, 0 88 | }; 89 | 90 | static const char 
_emoji_presentation_eof_trans[] = { 91 | 1, 4, 0, 1, 17, 17, 17, 17, 92 | 18, 18, 17, 17 93 | }; 94 | 95 | static const int emoji_presentation_start = 2; 96 | 97 | 98 | #line 20 "emoji_presentation_scanner.rl" 99 | 100 | 101 | 102 | #line 89 "emoji_presentation_scanner.rl" 103 | 104 | 105 | static emoji_text_iter_t 106 | scan_emoji_presentation (emoji_text_iter_t p, 107 | const emoji_text_iter_t pe, 108 | bool* is_emoji) 109 | { 110 | emoji_text_iter_t ts, te; 111 | const emoji_text_iter_t eof = pe; 112 | 113 | unsigned act; 114 | int cs; 115 | 116 | 117 | #line 116 "emoji_presentation_scanner.c" 118 | { 119 | cs = emoji_presentation_start; 120 | ts = 0; 121 | te = 0; 122 | act = 0; 123 | } 124 | 125 | #line 124 "emoji_presentation_scanner.c" 126 | { 127 | int _klen; 128 | unsigned int _trans; 129 | const char *_acts; 130 | unsigned int _nacts; 131 | const unsigned char *_keys; 132 | 133 | if ( p == pe ) 134 | goto _test_eof; 135 | _resume: 136 | _acts = _emoji_presentation_actions + _emoji_presentation_from_state_actions[cs]; 137 | _nacts = (unsigned int) *_acts++; 138 | while ( _nacts-- > 0 ) { 139 | switch ( *_acts++ ) { 140 | case 1: 141 | #line 1 "NONE" 142 | {ts = p;} 143 | break; 144 | #line 143 "emoji_presentation_scanner.c" 145 | } 146 | } 147 | 148 | _keys = _emoji_presentation_trans_keys + _emoji_presentation_key_offsets[cs]; 149 | _trans = _emoji_presentation_index_offsets[cs]; 150 | 151 | _klen = _emoji_presentation_single_lengths[cs]; 152 | if ( _klen > 0 ) { 153 | const unsigned char *_lower = _keys; 154 | const unsigned char *_mid; 155 | const unsigned char *_upper = _keys + _klen - 1; 156 | while (1) { 157 | if ( _upper < _lower ) 158 | break; 159 | 160 | _mid = _lower + ((_upper-_lower) >> 1); 161 | if ( (*p) < *_mid ) 162 | _upper = _mid - 1; 163 | else if ( (*p) > *_mid ) 164 | _lower = _mid + 1; 165 | else { 166 | _trans += (unsigned int)(_mid - _keys); 167 | goto _match; 168 | } 169 | } 170 | _keys += _klen; 171 | _trans += _klen; 172 | } 173 
| 174 | _klen = _emoji_presentation_range_lengths[cs]; 175 | if ( _klen > 0 ) { 176 | const unsigned char *_lower = _keys; 177 | const unsigned char *_mid; 178 | const unsigned char *_upper = _keys + (_klen<<1) - 2; 179 | while (1) { 180 | if ( _upper < _lower ) 181 | break; 182 | 183 | _mid = _lower + (((_upper-_lower) >> 1) & ~1); 184 | if ( (*p) < _mid[0] ) 185 | _upper = _mid - 2; 186 | else if ( (*p) > _mid[1] ) 187 | _lower = _mid + 2; 188 | else { 189 | _trans += (unsigned int)((_mid - _keys)>>1); 190 | goto _match; 191 | } 192 | } 193 | _trans += _klen; 194 | } 195 | 196 | _match: 197 | _trans = _emoji_presentation_indicies[_trans]; 198 | _eof_trans: 199 | cs = _emoji_presentation_trans_targs[_trans]; 200 | 201 | if ( _emoji_presentation_trans_actions[_trans] == 0 ) 202 | goto _again; 203 | 204 | _acts = _emoji_presentation_actions + _emoji_presentation_trans_actions[_trans]; 205 | _nacts = (unsigned int) *_acts++; 206 | while ( _nacts-- > 0 ) 207 | { 208 | switch ( *_acts++ ) 209 | { 210 | case 2: 211 | #line 1 "NONE" 212 | {te = p+1;} 213 | break; 214 | case 3: 215 | #line 85 "emoji_presentation_scanner.rl" 216 | {act = 2;} 217 | break; 218 | case 4: 219 | #line 86 "emoji_presentation_scanner.rl" 220 | {act = 3;} 221 | break; 222 | case 5: 223 | #line 84 "emoji_presentation_scanner.rl" 224 | {te = p+1;{ *is_emoji = false; return te; }} 225 | break; 226 | case 6: 227 | #line 85 "emoji_presentation_scanner.rl" 228 | {te = p+1;{ *is_emoji = true; return te; }} 229 | break; 230 | case 7: 231 | #line 86 "emoji_presentation_scanner.rl" 232 | {te = p+1;{ *is_emoji = false; return te; }} 233 | break; 234 | case 8: 235 | #line 85 "emoji_presentation_scanner.rl" 236 | {te = p;p--;{ *is_emoji = true; return te; }} 237 | break; 238 | case 9: 239 | #line 86 "emoji_presentation_scanner.rl" 240 | {te = p;p--;{ *is_emoji = false; return te; }} 241 | break; 242 | case 10: 243 | #line 85 "emoji_presentation_scanner.rl" 244 | {{p = ((te))-1;}{ *is_emoji = true; return te; 
}} 245 | break; 246 | case 11: 247 | #line 1 "NONE" 248 | { switch( act ) { 249 | case 2: 250 | {{p = ((te))-1;} *is_emoji = true; return te; } 251 | break; 252 | case 3: 253 | {{p = ((te))-1;} *is_emoji = false; return te; } 254 | break; 255 | } 256 | } 257 | break; 258 | #line 257 "emoji_presentation_scanner.c" 259 | } 260 | } 261 | 262 | _again: 263 | _acts = _emoji_presentation_actions + _emoji_presentation_to_state_actions[cs]; 264 | _nacts = (unsigned int) *_acts++; 265 | while ( _nacts-- > 0 ) { 266 | switch ( *_acts++ ) { 267 | case 0: 268 | #line 1 "NONE" 269 | {ts = 0;} 270 | break; 271 | #line 270 "emoji_presentation_scanner.c" 272 | } 273 | } 274 | 275 | if ( ++p != pe ) 276 | goto _resume; 277 | _test_eof: {} 278 | if ( p == eof ) 279 | { 280 | if ( _emoji_presentation_eof_trans[cs] > 0 ) { 281 | _trans = _emoji_presentation_eof_trans[cs] - 1; 282 | goto _eof_trans; 283 | } 284 | } 285 | 286 | } 287 | 288 | #line 105 "emoji_presentation_scanner.rl" 289 | 290 | 291 | /* Should not be reached. */ 292 | *is_emoji = false; 293 | return pe; 294 | } 295 | 296 | // clang-format on 297 | -------------------------------------------------------------------------------- /src/libunicode/emoji_presentation_scanner.rl: -------------------------------------------------------------------------------- 1 | /* Copyright 2019 Google LLC 2 | * 3 | * Licensed under the Apache License, Version 2.0 (the "License"); 4 | * you may not use this file except in compliance with the License. 5 | * You may obtain a copy of the License at 6 | * 7 | * https://www.apache.org/licenses/LICENSE-2.0 8 | * 9 | * Unless required by applicable law or agreed to in writing, software 10 | * distributed under the License is distributed on an "AS IS" BASIS, 11 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | * See the License for the specific language governing permissions and 13 | * limitations under the License. 
14 | */ 15 | 16 | // clang-format off 17 | #pragma GCC diagnostic ignored "-Wsign-conversion" 18 | 19 | %%{ 20 | machine emoji_presentation; 21 | alphtype unsigned char; 22 | write data noerror nofinal noentry; 23 | }%% 24 | 25 | %%{ 26 | 27 | EMOJI = 0; 28 | EMOJI_TEXT_PRESENTATION = 1; 29 | EMOJI_EMOJI_PRESENTATION = 2; 30 | EMOJI_MODIFIER_BASE = 3; 31 | EMOJI_MODIFIER = 4; 32 | EMOJI_VS_BASE = 5; 33 | REGIONAL_INDICATOR = 6; 34 | KEYCAP_BASE = 7; 35 | COMBINING_ENCLOSING_KEYCAP = 8; 36 | COMBINING_ENCLOSING_CIRCLE_BACKSLASH = 9; 37 | ZWJ = 10; 38 | VS15 = 11; 39 | VS16 = 12; 40 | TAG_BASE = 13; 41 | TAG_SEQUENCE = 14; 42 | TAG_TERM = 15; 43 | 44 | any_emoji = EMOJI_TEXT_PRESENTATION | EMOJI_EMOJI_PRESENTATION | KEYCAP_BASE | 45 | EMOJI_MODIFIER_BASE | TAG_BASE | EMOJI; 46 | 47 | emoji_combining_enclosing_circle_backslash_sequence = any_emoji 48 | COMBINING_ENCLOSING_CIRCLE_BACKSLASH; 49 | 50 | # This could be sharper than any_emoji by restricting this only to valid 51 | # variation sequences: 52 | # https://www.unicode.org/Public/emoji/11.0/emoji-variation-sequences.txt 53 | # However, implementing 54 | # https://www.unicode.org/reports/tr51/#def_emoji_presentation_sequence is 55 | # sufficient for our purposes here. 
56 | emoji_presentation_sequence = any_emoji VS16; 57 | 58 | emoji_modifier_sequence = EMOJI_MODIFIER_BASE EMOJI_MODIFIER; 59 | 60 | emoji_flag_sequence = REGIONAL_INDICATOR REGIONAL_INDICATOR; 61 | 62 | # Here we only allow the valid tag sequences 63 | # https://www.unicode.org/reports/tr51/#valid-emoji-tag-sequences, instead of 64 | # all well-formed ones defined in 65 | # https://www.unicode.org/reports/tr51/#def_emoji_tag_sequence 66 | emoji_tag_sequence = TAG_BASE TAG_SEQUENCE+ TAG_TERM; 67 | 68 | emoji_keycap_sequence = KEYCAP_BASE VS16 COMBINING_ENCLOSING_KEYCAP; 69 | 70 | emoji_zwj_element = emoji_presentation_sequence | emoji_modifier_sequence | any_emoji; 71 | 72 | emoji_zwj_sequence = emoji_zwj_element ( ZWJ emoji_zwj_element )+; 73 | 74 | emoji_presentation = EMOJI_EMOJI_PRESENTATION | TAG_BASE | EMOJI_MODIFIER_BASE | 75 | emoji_presentation_sequence | emoji_modifier_sequence | emoji_flag_sequence | 76 | emoji_tag_sequence | emoji_keycap_sequence | emoji_zwj_sequence | 77 | emoji_combining_enclosing_circle_backslash_sequence; 78 | 79 | emoji_run = emoji_presentation; 80 | 81 | text_presentation_emoji = any_emoji VS15; 82 | text_run = any; 83 | 84 | text_and_emoji_run := |* 85 | # In order to give the VS15 sequences higher priority than detecting 86 | # emoji sequences they are listed first as scanner token here. 87 | text_presentation_emoji => { *is_emoji = false; return te; }; 88 | emoji_run => { *is_emoji = true; return te; }; 89 | text_run => { *is_emoji = false; return te; }; 90 | *|; 91 | 92 | }%% 93 | 94 | static emoji_text_iter_t 95 | scan_emoji_presentation (emoji_text_iter_t p, 96 | const emoji_text_iter_t pe, 97 | bool* is_emoji) 98 | { 99 | emoji_text_iter_t ts, te; 100 | const emoji_text_iter_t eof = pe; 101 | 102 | unsigned act; 103 | int cs; 104 | 105 | %%{ 106 | write init; 107 | write exec; 108 | }%% 109 | 110 | /* Should not be reached. 
*/ 111 | *is_emoji = false; 112 | return pe; 113 | } 114 | 115 | // clang-format on 116 | -------------------------------------------------------------------------------- /src/libunicode/emoji_segmenter.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * This file is part of the "libterminal" project 3 | * Copyright (c) 2020 Christian Parpart 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * 8 | * Unless required by applicable law or agreed to in writing, software 9 | * distributed under the License is distributed on an "AS IS" BASIS, 10 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 | * See the License for the specific language governing permissions and 12 | * limitations under the License. 13 | */ 14 | 15 | #include 16 | #include 17 | #include 18 | 19 | #include 20 | #include 21 | 22 | namespace unicode 23 | { 24 | 25 | namespace 26 | { 27 | /** Iterator type handed to the included scanner (see emoji_text_iter_t below): it walks buffer_ by index and dereferences to the emoji segmentation category of the current codepoint. */ 28 | class RagelIterator 29 | { 30 | EmojiSegmentationCategory category_; 31 | char32_t const* buffer_; 32 | size_t size_; 33 | size_t currentCursorEnd_; 34 | 35 | public: 36 | RagelIterator(char32_t const* buffer, size_t size, size_t cursor) noexcept: 37 | category_ { EmojiSegmentationCategory::Invalid }, buffer_ { buffer }, size_ { size }, currentCursorEnd_ { cursor } 38 | { 39 | updateCategory(); 40 | } 41 | 42 | RagelIterator() noexcept: RagelIterator(U"", 0, 0) {} 43 | 44 | constexpr char32_t codepoint() const noexcept { return buffer_[currentCursorEnd_]; } 45 | constexpr EmojiSegmentationCategory category() const noexcept { return category_; } 46 | constexpr size_t cursor() const noexcept { return currentCursorEnd_; } 47 | /* Refreshes category_ for the current cursor; a cursor at or past size_ yields Invalid instead of reading the buffer. */ 48 | void updateCategory() noexcept 49 | { 50 | if (currentCursorEnd_ < size_) 51 | category_ = codepoint_properties::get(codepoint()).emoji_segmentation_category; 52 | else 53 | category_ = 
EmojiSegmentationCategory::Invalid; 54 | } 55 | /* The scanner matches on this integer, i.e. on the category rather than on the codepoint itself. */ 56 | constexpr int operator*() const noexcept { return static_cast(category_); } 57 | 58 | RagelIterator& operator++() noexcept 59 | { 60 | currentCursorEnd_++; 61 | updateCategory(); 62 | return *this; 63 | } 64 | RagelIterator& operator--(int) noexcept /* post-decrement syntax, but it steps back in place and returns *this rather than the previous value */ 65 | { 66 | currentCursorEnd_--; 67 | updateCategory(); 68 | return *this; 69 | } 70 | 71 | RagelIterator operator+(long v) const noexcept 72 | { 73 | // TODO: assert() on integer overflow 74 | return { buffer_, size_, currentCursorEnd_ + (size_t) v }; 75 | } 76 | 77 | RagelIterator operator-(long v) const noexcept 78 | { 79 | if (v >= 0) 80 | { 81 | assert(currentCursorEnd_ >= static_cast(v)); 82 | return { buffer_, size_, currentCursorEnd_ - (size_t) v }; 83 | } 84 | else 85 | { 86 | return *this + (-v); 87 | } 88 | } 89 | /* Assignment from an integer moves the cursor to index v and refreshes the category. */ 90 | RagelIterator& operator=(int v) noexcept 91 | { 92 | assert(v >= 0); 93 | currentCursorEnd_ = static_cast(v); 94 | updateCategory(); 95 | return *this; 96 | } 97 | 98 | constexpr bool operator==(RagelIterator const& rhs) const noexcept 99 | { 100 | return buffer_ == rhs.buffer_ && size_ == rhs.size_ && currentCursorEnd_ == rhs.currentCursorEnd_; 101 | } 102 | 103 | constexpr bool operator!=(RagelIterator const& rhs) const noexcept { return !(*this == rhs); } 104 | }; 105 | 106 | using emoji_text_iter_t = RagelIterator; 107 | 108 | #include "emoji_presentation_scanner.c" 109 | } // namespace 110 | /** Stores the buffer and, when it is non-empty, runs one scan so that isNextEmoji_ is primed; the returned cursor is discarded. 
*/ 111 | emoji_segmenter::emoji_segmenter(char32_t const* buffer, size_t size) noexcept: buffer_ { buffer }, size_ { size } 112 | { 113 | if (size_) 114 | consume_once(); 115 | } 116 | 117 | bool emoji_segmenter::consume(out size, out emoji) noexcept 118 | { 119 | // 01234567890123456 120 | // "A EMOJI" 121 | // [] | 122 | // [] | 123 | // [----] 124 | 125 | // "ABC EMOJI DEFGH" 126 | // [---] | | 127 | // [----] | 128 | // [-----] 129 | 130 | currentCursorBegin_ = currentCursorEnd_; 131 | currentCursorEnd_ = nextCursorBegin_; 132 | isEmoji_ = 
isNextEmoji_; 133 | 134 | if (nextCursorBegin_ >= size_) 135 | return false; 136 | /* Keep scanning tokens while their presentation style matches the current run. */ 137 | do 138 | { 139 | auto const o = consume_once(); 140 | 141 | if (isEmoji_ != isNextEmoji_) 142 | { 143 | nextCursorBegin_ = o; /* NOTE(review): dead store, overwritten unconditionally after the loop */ 144 | break; 145 | } 146 | 147 | currentCursorEnd_ = o; 148 | } while (currentCursorEnd_ < size_); 149 | 150 | size.assign(currentCursorEnd_); 151 | emoji.assign(isEmoji_ ? PresentationStyle::Emoji : PresentationStyle::Text); 152 | nextCursorBegin_ = currentCursorEnd_; 153 | 154 | return true; 155 | } 156 | /** Runs the scanner once from currentCursorEnd_; the scanner writes the token's style into isNextEmoji_, and the cursor of the iterator it returns is handed back. */ 157 | size_t emoji_segmenter::consume_once() 158 | { 159 | auto const i = RagelIterator(buffer_, size_, currentCursorEnd_); 160 | auto const e = RagelIterator(buffer_, size_, size_); 161 | auto const o = scan_emoji_presentation(i, e, &isNextEmoji_); 162 | return o.cursor(); 163 | } 164 | 165 | } // namespace unicode 166 | -------------------------------------------------------------------------------- /src/libunicode/emoji_segmenter.h: -------------------------------------------------------------------------------- 1 | /** 2 | * This file is part of the "libunicode" project 3 | * Copyright (c) 2020 Christian Parpart 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * 8 | * Unless required by applicable law or agreed to in writing, software 9 | * distributed under the License is distributed on an "AS IS" BASIS, 10 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 | * See the License for the specific language governing permissions and 12 | * limitations under the License. 13 | */ 14 | #pragma once 15 | 16 | #include 17 | 18 | #include 19 | #include 20 | 21 | namespace unicode 22 | { 23 | 24 | /// Used to distinguish between standard text and emoji text. 
25 | enum class PresentationStyle 26 | { 27 | Text, 28 | Emoji 29 | }; 30 | /// Category assigned to a codepoint for emoji presentation scanning; Invalid (-1) stands for "no category". 31 | enum class EmojiSegmentationCategory : int8_t 32 | { 33 | Invalid = -1, 34 | Emoji = 0, 35 | EmojiTextPresentation = 1, 36 | EmojiEmojiPresentation = 2, 37 | EmojiModifierBase = 3, 38 | EmojiModifier = 4, 39 | EmojiVSBase = 5, 40 | RegionalIndicator = 6, 41 | KeyCapBase = 7, 42 | CombiningEnclosingKeyCap = 8, 43 | CombiningEnclosingCircleBackslash = 9, 44 | ZWJ = 10, 45 | VS15 = 11, 46 | VS16 = 12, 47 | TagBase = 13, 48 | TagSequence = 14, 49 | TagTerm = 15, 50 | }; 51 | 52 | /** 53 | * emoji_segmenter API for segmenting emojis into text-emoji and emoji-emoji presentations. 54 | * 55 | * This segmenter is segmenting emojis by their presentation property (text or emoji), that is, 56 | * whether an emoji is to be rendered in text mode or in emoji (colored) mode. 57 | * 58 | * It must be segmenting only emojis and not any other codepoints. 59 | */ 60 | class emoji_segmenter 61 | { 62 | private: 63 | char32_t const* buffer_ = U""; 64 | size_t size_ = 0; 65 | 66 | size_t currentCursorBegin_ = 0; 67 | size_t currentCursorEnd_ = 0; 68 | size_t nextCursorBegin_ = 0; 69 | 70 | bool isEmoji_ = false; 71 | bool isNextEmoji_ = false; 72 | 73 | public: 74 | using property_type = PresentationStyle; 75 | 76 | constexpr emoji_segmenter() noexcept = default; 77 | constexpr emoji_segmenter& operator=(emoji_segmenter const&) noexcept = default; 78 | constexpr emoji_segmenter& operator=(emoji_segmenter&&) noexcept = default; 79 | constexpr emoji_segmenter(emoji_segmenter const&) noexcept = default; 80 | constexpr emoji_segmenter(emoji_segmenter&&) noexcept = default; 81 | 82 | emoji_segmenter(char32_t const* buffer, size_t size) noexcept; 83 | 84 | emoji_segmenter(std::u32string_view const& sv) noexcept: emoji_segmenter(sv.data(), sv.size()) {} 85 | 86 | constexpr char32_t const* buffer() const noexcept { return buffer_; } 87 | constexpr size_t size() const noexcept { return size_; } 88 | constexpr size_t
currentCursorBegin() const noexcept { return currentCursorBegin_; } 89 | constexpr size_t currentCursorEnd() const noexcept { return currentCursorEnd_; } 90 | /// Moves on to the next segment. @returns false once the input is exhausted; otherwise @p size receives the end offset of the segment and @p emoji its presentation style. 91 | bool consume(out size, out emoji) noexcept; 92 | 93 | /// @returns true if the current segment is to be rendered in text presentation. 94 | constexpr bool isText() const noexcept { return !isEmoji_; } 95 | 96 | /// @returns true if the current segment is to be rendered in emoji presentation. 97 | constexpr bool isEmoji() const noexcept { return isEmoji_; } 98 | 99 | /// @returns the segment that was processed last, or an empty view while currentCursorEnd_ is still 0. 100 | constexpr std::u32string_view substr() const noexcept 101 | { 102 | // TODO: provide such an accessor in text_run_segmenter 103 | if (currentCursorEnd_ > 0) 104 | return std::u32string_view(buffer_ + currentCursorBegin_, currentCursorEnd_ - currentCursorBegin_); 105 | else 106 | return std::u32string_view {}; 107 | } 108 | 109 | /// @returns the segment that was processed last (same as substr()).
110 | constexpr std::u32string_view operator*() const noexcept { return substr(); } 111 | 112 | private: 113 | size_t consume_once(); 114 | }; 115 | /// Writes the enumerator name of @p ps; a value outside the enumeration writes nothing. 116 | inline std::ostream& operator<<(std::ostream& os, PresentationStyle ps) 117 | { 118 | switch (ps) 119 | { 120 | case PresentationStyle::Text: return os << "Text"; 121 | case PresentationStyle::Emoji: return os << "Emoji"; 122 | } 123 | return os; 124 | } 125 | /// Writes the enumerator name of @p value; a value outside the enumeration writes nothing. 126 | inline std::ostream& operator<<(std::ostream& os, EmojiSegmentationCategory value) 127 | { 128 | switch (value) 129 | { 130 | case unicode::EmojiSegmentationCategory::Invalid: return os << "Invalid"; 131 | case unicode::EmojiSegmentationCategory::Emoji: return os << "Emoji"; 132 | case unicode::EmojiSegmentationCategory::EmojiTextPresentation: return os << "EmojiTextPresentation"; 133 | case unicode::EmojiSegmentationCategory::EmojiEmojiPresentation: return os << "EmojiEmojiPresentation"; 134 | case unicode::EmojiSegmentationCategory::EmojiModifierBase: return os << "EmojiModifierBase"; 135 | case unicode::EmojiSegmentationCategory::EmojiModifier: return os << "EmojiModifier"; 136 | case unicode::EmojiSegmentationCategory::EmojiVSBase: return os << "EmojiVSBase"; 137 | case unicode::EmojiSegmentationCategory::RegionalIndicator: return os << "RegionalIndicator"; 138 | case unicode::EmojiSegmentationCategory::KeyCapBase: return os << "KeyCapBase"; 139 | case unicode::EmojiSegmentationCategory::CombiningEnclosingKeyCap: return os << "CombiningEnclosingKeyCap"; 140 | case unicode::EmojiSegmentationCategory::CombiningEnclosingCircleBackslash: 141 | return os << "CombiningEnclosingCircleBackslash"; 142 | case unicode::EmojiSegmentationCategory::ZWJ: return os << "ZWJ"; 143 | case unicode::EmojiSegmentationCategory::VS15: return os << "VS15"; 144 | case unicode::EmojiSegmentationCategory::VS16: return os << "VS16"; 145 | case unicode::EmojiSegmentationCategory::TagBase: return os << "TagBase"; 146 | case unicode::EmojiSegmentationCategory::TagSequence:
return os << "TagSequence"; 147 | case unicode::EmojiSegmentationCategory::TagTerm: return os << "TagTerm"; 148 | } 149 | return os; 150 | } 151 | 152 | } // namespace unicode 153 | // std::format support: formats the enumerator name; a value outside the enumeration formats as an empty string. 154 | template <> 155 | struct std::formatter: std::formatter 156 | { 157 | auto format(unicode::PresentationStyle value, auto& ctx) const 158 | { 159 | string_view name; 160 | switch (value) 161 | { 162 | case unicode::PresentationStyle::Text: name = "Text"; break; 163 | case unicode::PresentationStyle::Emoji: name = "Emoji"; break; 164 | } 165 | return formatter::format(name, ctx); 166 | } 167 | }; 168 | // std::format support: formats the enumerator name; a value outside the enumeration formats as an empty string. 169 | template <> 170 | struct std::formatter: std::formatter 171 | { 172 | auto format(unicode::EmojiSegmentationCategory value, auto& ctx) const 173 | { 174 | using unicode::EmojiSegmentationCategory; 175 | string_view name; 176 | switch (value) 177 | { 178 | case EmojiSegmentationCategory::Invalid: name = "Invalid"; break; 179 | case EmojiSegmentationCategory::Emoji: name = "Emoji"; break; 180 | case EmojiSegmentationCategory::EmojiTextPresentation: name = "EmojiTextPresentation"; break; 181 | case EmojiSegmentationCategory::EmojiEmojiPresentation: name = "EmojiEmojiPresentation"; break; 182 | case EmojiSegmentationCategory::EmojiModifierBase: name = "EmojiModifierBase"; break; 183 | case EmojiSegmentationCategory::EmojiModifier: name = "EmojiModifier"; break; 184 | case EmojiSegmentationCategory::EmojiVSBase: name = "EmojiVSBase"; break; 185 | case EmojiSegmentationCategory::RegionalIndicator: name = "RegionalIndicator"; break; 186 | case EmojiSegmentationCategory::KeyCapBase: name = "KeyCapBase"; break; 187 | case EmojiSegmentationCategory::CombiningEnclosingKeyCap: name = "CombiningEnclosingKeyCap"; break; 188 | case EmojiSegmentationCategory::CombiningEnclosingCircleBackslash: name = "CombiningEnclosingCircleBackslash"; break; 189 | case EmojiSegmentationCategory::ZWJ: name = "ZWJ"; break; 190 | case EmojiSegmentationCategory::VS15: name = "VS15"; break; 191 | case
EmojiSegmentationCategory::VS16: name = "VS16"; break; 192 | case EmojiSegmentationCategory::TagBase: name = "TagBase"; break; 193 | case EmojiSegmentationCategory::TagSequence: name = "TagSequence"; break; 194 | case EmojiSegmentationCategory::TagTerm: name = "TagTerm"; break; 195 | } 196 | return formatter::format(name, ctx); 197 | } 198 | }; 199 | -------------------------------------------------------------------------------- /src/libunicode/emoji_segmenter_test.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * This file is part of the "libunicode" project 3 | * Copyright (c) 2020 Christian Parpart 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * 8 | * Unless required by applicable law or agreed to in writing, software 9 | * distributed under the License is distributed on an "AS IS" BASIS, 10 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 | * See the License for the specific language governing permissions and 12 | * limitations under the License. 
13 | */ 14 | #include 15 | #include 16 | #include 17 | #include 18 | 19 | #include 20 | 21 | #include 22 | 23 | using namespace unicode; 24 | using namespace std::string_literals; 25 | using namespace std; 26 | 27 | namespace 28 | { 29 | struct Expectation 30 | { 31 | u32string_view text; 32 | size_t start; 33 | size_t end; 34 | PresentationStyle presentationStyle; 35 | }; 36 | // Concatenates the expected segments into one string, then checks that the segmenter yields exactly those segments (text, end offset, style) and reports no further input afterwards. 37 | void test_segments(int lineNo, std::vector> const& expectations) 38 | { 39 | vector expects; 40 | u32string fullText; 41 | size_t i = 0; 42 | for (auto&& [text, isEmoji]: expectations) 43 | { 44 | expects.push_back(Expectation { text, i, i + text.size(), isEmoji }); 45 | fullText += text; 46 | i += text.size(); 47 | } 48 | 49 | INFO(std::format("Testing emoji segmentation from line {}: {}", lineNo, to_utf8(fullText))); 50 | 51 | size_t size {}; 52 | auto presentationStyle = PresentationStyle {}; 53 | auto segmenter = unicode::emoji_segmenter { fullText }; 54 | for (size_t i = 0; i < expectations.size(); ++i) 55 | { 56 | INFO(std::format("run segmentation for part {}: \"{}\" to be {}", 57 | i, 58 | to_utf8(expectations[i].first), 59 | (unsigned) expectations[i].second)); 60 | bool const consumeSuccess = segmenter.consume(out(size), out(presentationStyle)); 61 | REQUIRE(consumeSuccess); 62 | CHECK(expectations[i].first == *segmenter); 63 | CHECK(size == expects[i].end); 64 | CHECK(presentationStyle == expects[i].presentationStyle); 65 | } 66 | bool const consumeFail = segmenter.consume(out(size), out(presentationStyle)); 67 | REQUIRE_FALSE(consumeFail); 68 | } 69 | } // namespace 70 | 71 | TEST_CASE("emoji_segmenter.Emoji", "[emoji_segmenter]") 72 | { 73 | test_segments(__LINE__, { { U"\U0001F600", PresentationStyle::Emoji } }); 74 | } 75 | 76 | TEST_CASE("emoji_segmenter.Emoji_VS15", "[emoji_segmenter]") 77 | { 78 | test_segments(__LINE__, { { U"\U0001F600\uFE0E", PresentationStyle::Text } }); 79 | } 80 | 81 | TEST_CASE("emoji_segmenter.LatinEmoji", "[emoji_segmenter]") 82 | { 83 |
test_segments(__LINE__, { { U"AB", PresentationStyle::Text }, { U"😀", PresentationStyle::Emoji } }); 84 | } 85 | 86 | TEST_CASE("emoji_segmenter.EmojiLatin", "[emoji_segmenter]") 87 | { 88 | test_segments(__LINE__, 89 | { 90 | { U"😀", PresentationStyle::Emoji }, 91 | { U"A", PresentationStyle::Text }, 92 | }); 93 | } 94 | 95 | TEST_CASE("emoji_segmenter.TwoEmojis", "[emoji_segmenter]") 96 | { 97 | test_segments(__LINE__, 98 | { 99 | { U"😀😀", PresentationStyle::Emoji }, 100 | }); 101 | } 102 | 103 | TEST_CASE("emoji_segmenter.LatinCommonEmoji", "[emoji_segmenter]") 104 | { 105 | test_segments(__LINE__, 106 | { 107 | { U"AB ", PresentationStyle::Text }, 108 | { U"😀", PresentationStyle::Emoji }, 109 | }); 110 | } 111 | 112 | TEST_CASE("emoji_segmenter.EmojiTextPresentation", "[emoji_segmenter]") 113 | { 114 | test_segments(__LINE__, 115 | { 116 | { U"\u270c\ufe0e", PresentationStyle::Text }, 117 | }); 118 | } 119 | 120 | TEST_CASE("emoji_segmenter.emoji.text.emoji", "[emoji_segmenter]") 121 | { 122 | test_segments(__LINE__, 123 | { 124 | { U"\u270c", PresentationStyle::Emoji }, 125 | { U"\u270c\ufe0e", PresentationStyle::Text }, 126 | { U"\u270c", PresentationStyle::Emoji }, 127 | }); 128 | } 129 | 130 | TEST_CASE("emoji_segmenter.mixed_complex", "[emoji_segmenter]") 131 | { 132 | test_segments( 133 | __LINE__, 134 | { 135 | { U"Hello(", PresentationStyle::Text }, // Latin text 136 | { U"\u270c\U0001F926\U0001F3FC\u200D\u2642\uFE0F", PresentationStyle::Emoji }, // ✌ Victory hand + 🤦🏼‍♂️ Man Facepalming 137 | { U"\u270c\ufe0e :-)", PresentationStyle::Text }, // ✌ Victory hand (text presentation) 138 | { U"\u270c", PresentationStyle::Emoji }, // ✌ Victory hand 139 | { U")合!", PresentationStyle::Text }, // Kanji text 140 | }); 141 | } 142 | -------------------------------------------------------------------------------- /src/libunicode/grapheme_segmenter.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * This file is part of the "libunicode"
project 3 | * Copyright (c) 2020 Christian Parpart 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * 8 | * Unless required by applicable law or agreed to in writing, software 9 | * distributed under the License is distributed on an "AS IS" BASIS, 10 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 | * See the License for the specific language governing permissions and 12 | * limitations under the License. 13 | */ 14 | #include 15 | 16 | namespace unicode 17 | { 18 | /// Starts a new grapheme cluster at @p nextCodepoint: records it and its properties as the "previous" codepoint and sets ri_counter to 1 when it is a Regional_Indicator, 0 otherwise. 19 | void grapheme_process_init(char32_t nextCodepoint, grapheme_segmenter_state& state) noexcept 20 | { 21 | auto const Pb = codepoint_properties::get(nextCodepoint); 22 | auto const B = Pb.grapheme_cluster_break; 23 | 24 | state.previousCodepoint = nextCodepoint; 25 | state.previousProperties = Pb; 26 | state.ri_counter = (B == Grapheme_Cluster_Break::Regional_Indicator) ? 1 : 0; 27 | } 28 | /// Decides whether a grapheme cluster boundary lies between state.previousCodepoint and @p nextCodepoint, and records @p nextCodepoint as the new previous codepoint. @returns true if the two codepoints are separated by a break. 29 | bool grapheme_process_breakable(char32_t nextCodepoint, grapheme_segmenter_state& state) noexcept 30 | { 31 | auto const a = state.previousCodepoint; 32 | auto const Pa = state.previousProperties; 33 | auto const A = Pa.grapheme_cluster_break; 34 | 35 | auto const b = nextCodepoint; 36 | auto const Pb = codepoint_properties::get(b); 37 | auto const B = Pb.grapheme_cluster_break; 38 | 39 | state.previousCodepoint = b; 40 | state.previousProperties = Pb; 41 | 42 | static constexpr char32_t CR = 0x000D; // NOLINT 43 | static constexpr char32_t LF = 0x000A; // NOLINT 44 | 45 | { 46 | // Set state.ri_counter to zero if the next codepoint is not of category Regional_Indicator. 47 | // 48 | // We move the state.ri_counter out to help GCC optimize 49 | // this code to be branchless. 50 | // Sadly only GCC succeeds in doing this and Clang fails. 
51 | auto const ri_counter = state.ri_counter; 52 | state.ri_counter = (B == Grapheme_Cluster_Break::Regional_Indicator) ? ri_counter : 0; 53 | } 54 | 55 | // GB3: Do not break between a CR and LF. Otherwise, break before and after controls. 56 | if (a == CR && b == LF) 57 | return false; 58 | 59 | // GB4 (a) + GB5 (b) part 1 (C0 characters) + US-ASCII shortcut 60 | // The US-ASCII part is a pure optimization improving performance 61 | // in standard Latin text. 62 | if (a < 128 && b < 128) 63 | return true; 64 | 65 | // GB4: (part 2) 66 | if (A == Grapheme_Cluster_Break::Control) 67 | return true; 68 | 69 | // GB5: (part 2) 70 | if (B == Grapheme_Cluster_Break::Control) 71 | return true; 72 | 73 | // Do not break Hangul syllable sequences. 74 | // GB6: L x (L | V | LV | LVT) 75 | if (A == Grapheme_Cluster_Break::L 76 | && (B == Grapheme_Cluster_Break::L || B == Grapheme_Cluster_Break::V || B == Grapheme_Cluster_Break::LV 77 | || B == Grapheme_Cluster_Break::LVT)) 78 | return false; 79 | 80 | // GB7: (LV | V) x (V | T) 81 | if ((A == Grapheme_Cluster_Break::LV || A == Grapheme_Cluster_Break::V) 82 | && (B == Grapheme_Cluster_Break::V || B == Grapheme_Cluster_Break::T)) 83 | return false; 84 | 85 | // GB8: (LVT | T) x T -- LV x T is already covered by GB7 above. 86 | if ((A == Grapheme_Cluster_Break::LVT || A == Grapheme_Cluster_Break::T) && B == Grapheme_Cluster_Break::T) 87 | return false; 88 | 89 | // GB9: Do not break before extending characters. 90 | if (B == Grapheme_Cluster_Break::Extend || B == Grapheme_Cluster_Break::ZWJ) 91 | return false; 92 | 93 | // GB9a: Do not break before SpacingMarks 94 | if (B == Grapheme_Cluster_Break::SpacingMark) 95 | return false; 96 | 97 | // GB9b: or after Prepend characters. 98 | if (A == Grapheme_Cluster_Break::Prepend) 99 | return false; 100 | 101 | // GB11: Do not break within emoji modifier sequences or emoji zwj sequences. 102 | if (A == Grapheme_Cluster_Break::ZWJ && Pb.extended_pictographic()) 103 | return false; 104 | // NOTE(review): ri_counter is only ever set to 1 by grapheme_process_init(); when a cluster reaches its first Regional_Indicator later (e.g. after a Prepend) the counter is still 0 here and the RI pair is split -- confirm against GB12/GB13. 105 | // GB12/GB13: Do not break within emoji flag sequences. 
106 | // That is, do not break between regional indicator (RI) symbols 107 | // if there is an odd number of RI characters before the break point. 108 | if (A == Grapheme_Cluster_Break::Regional_Indicator && A == B && state.ri_counter == 1) 109 | { 110 | state.ri_counter = static_cast((state.ri_counter + 1) % 2); 111 | return false; 112 | } 113 | 114 | // GB999: Otherwise, break everywhere. 115 | return true; // GB10 116 | } 117 | 118 | } // namespace unicode 119 | -------------------------------------------------------------------------------- /src/libunicode/grapheme_segmenter.h: -------------------------------------------------------------------------------- 1 | /** 2 | * This file is part of the "libunicode" project 3 | * Copyright (c) 2020 Christian Parpart 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * 8 | * Unless required by applicable law or agreed to in writing, software 9 | * distributed under the License is distributed on an "AS IS" BASIS, 10 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 | * See the License for the specific language governing permissions and 12 | * limitations under the License. 13 | */ 14 | #pragma once 15 | 16 | #include 17 | #include 18 | 19 | #include 20 | 21 | namespace unicode 22 | { 23 | 24 | /// Grapheme segmentation state struct, used to keep state 25 | /// while processing each Unicode codepoint, 26 | /// allow proper processing of regional flags 27 | /// as well as reducing the number of invocations 28 | /// to codepoint_properties::get(). 
29 | struct grapheme_segmenter_state 30 | { 31 | char32_t previousCodepoint = {}; 32 | codepoint_properties previousProperties = codepoint_properties::get(0); 33 | 34 | uint8_t ri_counter = 0; // modulo 2 35 | }; 36 | /// Begins a new grapheme cluster with @p nextCodepoint, resetting @p state accordingly. 37 | void grapheme_process_init(char32_t nextCodepoint, grapheme_segmenter_state& state) noexcept; 38 | 39 | /// Tests if codepoint @p a and @p b are breakable, and thus, two different grapheme clusters. 40 | /// 41 | /// @retval true both codepoints do not belong to the same grapheme cluster 42 | /// @retval false both codepoints belong to the same grapheme cluster 43 | bool grapheme_process_breakable(char32_t nextCodepoint, grapheme_segmenter_state& state) noexcept; 44 | 45 | /// Implements http://www.unicode.org/reports/tr29/tr29-27.html#Grapheme_Cluster_Boundary_Rules 46 | class grapheme_segmenter 47 | { 48 | public: 49 | grapheme_segmenter(char32_t const* begin, char32_t const* end) noexcept: 50 | left_ { begin }, right_ { begin }, end_ { end }, state_ {} 51 | { 52 | ++*this; 53 | } 54 | 55 | grapheme_segmenter(std::u32string_view sv) noexcept: grapheme_segmenter(sv.data(), sv.data() + sv.size()) {} 56 | 57 | grapheme_segmenter() noexcept: grapheme_segmenter({}, {}) {} 58 | /// Advances to the next grapheme cluster; once the end of the input has been reached the current cluster becomes empty. 59 | grapheme_segmenter& operator++() noexcept 60 | { 61 | left_ = right_; 62 | if (right_ == end_) 63 | return *this; 64 | 65 | grapheme_process_init(*right_++, state_); 66 | 67 | while (right_ != end_ && !grapheme_process_breakable(*right_, state_)) 68 | ++right_; 69 | 70 | return *this; 71 | } 72 | /// @returns the current grapheme cluster as a view into the input. 73 | constexpr std::u32string_view operator*() const noexcept 74 | { 75 | return std::u32string_view(left_, static_cast(right_ - left_)); 76 | } 77 | /// @returns true if input remains after the current cluster, i.e. the current cluster is not the last one. 78 | constexpr bool codepointsAvailable() const noexcept { return right_ != end_; } 79 | 80 | constexpr operator bool() const noexcept { return codepointsAvailable(); } 81 | /// Two segmenters compare equal if both have no codepoints left, or if they currently cover the same [left_, right_) range. 82 | constexpr bool operator==(grapheme_segmenter const& rhs) const noexcept 83 | { 84 | return (!codepointsAvailable() && !rhs.codepointsAvailable()) || (left_
== rhs.left_ && right_ == rhs.right_); 85 | } 86 | 87 | /// Tests if codepoint @p a and @p b are breakable, and thus, two different grapheme clusters. 88 | /// 89 | /// @retval true both codepoints do not belong to the same grapheme cluster 90 | /// @retval false both codepoints belong to the same grapheme cluster 91 | static bool breakable(char32_t a, char32_t b) noexcept 92 | { 93 | auto state = grapheme_segmenter_state {}; 94 | state.previousCodepoint = a; 95 | state.previousProperties = codepoint_properties::get(a); 96 | state.ri_counter = 97 | (state.previousProperties.grapheme_cluster_break == Grapheme_Cluster_Break::Regional_Indicator) ? 1 : 0; 98 | return grapheme_process_breakable(b, state); 99 | } 100 | 101 | static bool nonbreakable(char32_t a, char32_t b) noexcept { return !breakable(a, b); } 102 | 103 | private: 104 | char32_t const* left_; 105 | char32_t const* right_; 106 | char32_t const* end_; 107 | grapheme_segmenter_state state_; 108 | }; 109 | 110 | } // namespace unicode 111 | -------------------------------------------------------------------------------- /src/libunicode/grapheme_segmenter_test.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * This file is part of the "libunicode" project 3 | * Copyright (c) 2020 Christian Parpart 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * 8 | * Unless required by applicable law or agreed to in writing, software 9 | * distributed under the License is distributed on an "AS IS" BASIS, 10 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 | * See the License for the specific language governing permissions and 12 | * limitations under the License. 
13 | */ 14 | #include 15 | #include 16 | 17 | #include 18 | 19 | using namespace unicode; 20 | using namespace std::string_literals; 21 | using namespace std; 22 | 23 | // TODO 24 | // Implement examples from table 1a) at: 25 | // http://www.unicode.org/reports/tr29/tr29-27.html#Grapheme_Cluster_Boundary_Rules 26 | 27 | TEST_CASE("latin_common", "[grapheme_segmenter]") 28 | { 29 | // auto constexpr text = u32string_view{U"\u0067G\u0308"}; 30 | 31 | CHECK(grapheme_segmenter::breakable('a', 'b')); 32 | CHECK(grapheme_segmenter::breakable('b', '!')); 33 | CHECK(grapheme_segmenter::breakable('!', '.')); 34 | } 35 | 36 | TEST_CASE("combining character sequences", "[grapheme_segmenter]") 37 | { 38 | // auto constexpr text = u32string_view{U"\u0067G\u0308"}; 39 | 40 | CHECK(grapheme_segmenter::nonbreakable('g', U'\u0308')); 41 | } 42 | 43 | // TEST_CASE("Extended grapheme clusters", "[grapheme_segmenter]") 44 | // { 45 | // // TODO: Hangul Syllables support, can't enable this test yet 46 | // CHECK(grapheme_segmenter::nonbreakable(U'\u0BA8', U'\u0BBF')); // Tamil ni 47 | // CHECK(grapheme_segmenter::nonbreakable(U'\u0E40', 'e')); // Thai e 48 | // CHECK(grapheme_segmenter::nonbreakable(U'\u0E01', U'\u0E33')); // Thai kam 49 | // CHECK(grapheme_segmenter::nonbreakable(U'\u0937', U'\u093F')); // Devanagari ssi 50 | // } 51 | 52 | TEST_CASE("emoji.speaking-eye", "[grapheme_segmenter]") 53 | { 54 | /* 55 | 👁 U+1F441 Eye 56 | ️ U+FE0F VS16 57 | U+200D ZWJ 58 | 🗨 U+1F5E8 Left Speech Bubble 59 | ️ U+FE0F VS16 60 | */ 61 | auto const zwj = u32string_view { U"\U0001F441\uFE0F\u200D\U0001F5E8\uFE0F" }; 62 | CHECK(grapheme_segmenter::nonbreakable(zwj[0], zwj[1])); 63 | CHECK(grapheme_segmenter::nonbreakable(zwj[1], zwj[2])); 64 | CHECK(grapheme_segmenter::nonbreakable(zwj[2], zwj[3])); 65 | CHECK(grapheme_segmenter::nonbreakable(zwj[3], zwj[4])); 66 | } 67 | 68 | TEST_CASE("emoji", "[grapheme_segmenter]") 69 | { 70 | // 👨‍🦰 71 | auto const zwj = u32string_view {
U"\U0001F468\u200D\U0001F9B0" }; 72 | CHECK(grapheme_segmenter::nonbreakable(zwj[0], zwj[1])); 73 | CHECK(grapheme_segmenter::nonbreakable(zwj[1], zwj[2])); 74 | 75 | // 👨‍👩‍👧 76 | auto const zwj3 = u32string_view { U"\U0001F468\u200D\U0001F469\u200D\U0001F467" }; 77 | CHECK(grapheme_segmenter::nonbreakable(zwj3[0], zwj3[1])); 78 | CHECK(grapheme_segmenter::nonbreakable(zwj3[1], zwj3[2])); 79 | CHECK(grapheme_segmenter::nonbreakable(zwj3[2], zwj3[3])); 80 | CHECK(grapheme_segmenter::nonbreakable(zwj3[3], zwj3[4])); 81 | } 82 | 83 | TEST_CASE("emoji: Man Facepalming: Medium-Light Skin Tone", "[grapheme_segmenter]") 84 | { 85 | 86 | auto const zwj = u32string_view { U"\U0001F926\U0001F3FC\u200D\u2642\uFE0F" }; 87 | CHECK(grapheme_segmenter::nonbreakable(zwj[0], zwj[1])); 88 | CHECK(grapheme_segmenter::nonbreakable(zwj[1], zwj[2])); 89 | CHECK(grapheme_segmenter::nonbreakable(zwj[2], zwj[3])); 90 | CHECK(grapheme_segmenter::nonbreakable(zwj[3], zwj[4])); 91 | } 92 | 93 | TEST_CASE("grapheme_segmenter.iterator_1", "[grapheme_segmenter]") 94 | { 95 | auto const codepoints = u32string_view { U"\U0001F926\U0001F3FC\u200D\u2642\uFE0F" }; 96 | auto gs = grapheme_segmenter { codepoints }; 97 | 98 | // the first cluster has already been processed on construction 99 | CHECK(*gs == codepoints); 100 | CHECK_FALSE(gs.codepointsAvailable()); 101 | } 102 | 103 | TEST_CASE("grapheme_segmenter.iterator_2", "[grapheme_segmenter]") 104 | { 105 | auto const grapheme_cluster2 = u32string_view { U"\U0001F926\U0001F3FC\u200D\u2642\uFE0F" }; 106 | auto const codepoints = u32string_view { U"X\U0001F926\U0001F3FC\u200D\u2642\uFE0F5" }; 107 | auto gs = grapheme_segmenter { codepoints }; 108 | 109 | // first grapheme cluster 110 | CHECK(*gs == U"X"); 111 | CHECK(gs.codepointsAvailable()); 112 | 113 | // second grapheme cluster 114 | ++gs; 115 | CHECK(*gs == grapheme_cluster2); 116 | CHECK(gs.codepointsAvailable()); 117 | 118 | // 3rd grapheme cluster 119 | ++gs; 120 | CHECK(*gs == U"5"); 121 |
CHECK_FALSE(gs.codepointsAvailable()); 122 | 123 | // incrementing beyond end of stream 124 | ++gs; 125 | CHECK(*gs == U""); 126 | CHECK_FALSE(gs.codepointsAvailable()); 127 | } 128 | // Regional indicators are grouped in pairs: three consecutive pairs are expected to yield three clusters. 129 | TEST_CASE("grapheme_segmenter.iterator_3: regional flags", "[grapheme_segmenter]") 130 | { 131 | auto const ri_DE = u32string { U"\U0001F1E9\U0001F1E9" }; 132 | auto const ri_JP = u32string { U"\U0001F1EF\U0001F1F5" }; 133 | auto const codepoints = ri_DE + ri_DE + ri_JP; 134 | auto gs = grapheme_segmenter { codepoints }; 135 | 136 | // first grapheme cluster 137 | REQUIRE(*gs == ri_DE); 138 | REQUIRE(gs.codepointsAvailable()); 139 | 140 | // second grapheme cluster 141 | ++gs; 142 | REQUIRE(*gs == ri_DE); 143 | REQUIRE(gs.codepointsAvailable()); 144 | 145 | // 3rd grapheme cluster 146 | ++gs; 147 | REQUIRE(*gs == ri_JP); 148 | REQUIRE_FALSE(gs.codepointsAvailable()); 149 | 150 | // incrementing beyond end of stream 151 | ++gs; 152 | REQUIRE(*gs == U""); 153 | REQUIRE_FALSE(gs.codepointsAvailable()); 154 | } 155 | // A lone regional indicator followed by a non-RI codepoint is expected to form a cluster of its own. 156 | TEST_CASE("grapheme_segmenter.iterator_3: regional flags invalid 1", "[grapheme_segmenter]") 157 | { 158 | auto const ri_DE = u32string { U"\U0001F1E9\U0001F1E9" }; 159 | auto const ri_J = u32string { U"\U0001F1EF" }; 160 | auto const codepoints = ri_DE + ri_DE + ri_J + U"P"; 161 | auto gs = grapheme_segmenter { codepoints }; 162 | 163 | // first grapheme cluster 164 | REQUIRE(*gs == ri_DE); 165 | REQUIRE(gs.codepointsAvailable()); 166 | 167 | // second grapheme cluster 168 | ++gs; 169 | REQUIRE(*gs == ri_DE); 170 | REQUIRE(gs.codepointsAvailable()); 171 | 172 | // 3rd grapheme cluster 173 | ++gs; 174 | REQUIRE(*gs == ri_J); 175 | REQUIRE(gs.codepointsAvailable()); 176 | 177 | // 4th grapheme cluster 178 | ++gs; 179 | REQUIRE(*gs == U"P"); 180 | REQUIRE_FALSE(gs.codepointsAvailable()); 181 | 182 | // incrementing beyond end of stream 183 | ++gs; 184 | REQUIRE(*gs == U""); 185 | REQUIRE_FALSE(gs.codepointsAvailable()); 186 | } 187 | 188 |
TEST_CASE("grapheme_segmenter.iterator_3: regional flags invalid 2", "[grapheme_segmenter]") 189 | { 190 | auto const ri_DE = u32string { U"\U0001F1E9\U0001F1E9" }; 191 | auto const ri_J = u32string { U"\U0001F1EF" }; 192 | auto const codepoints = ri_DE + ri_DE + U"Q" + ri_J + U"P"; 193 | auto gs = grapheme_segmenter { codepoints }; 194 | 195 | // 1 196 | REQUIRE(*gs == ri_DE); 197 | REQUIRE(gs.codepointsAvailable()); 198 | 199 | // 2 200 | ++gs; 201 | REQUIRE(*gs == ri_DE); 202 | REQUIRE(gs.codepointsAvailable()); 203 | 204 | // 3 205 | ++gs; 206 | REQUIRE(*gs == U"Q"); 207 | REQUIRE(gs.codepointsAvailable()); 208 | 209 | // 4 210 | ++gs; 211 | REQUIRE(*gs == ri_J); 212 | REQUIRE(gs.codepointsAvailable()); 213 | 214 | // 5 215 | ++gs; 216 | REQUIRE(*gs == U"P"); 217 | REQUIRE_FALSE(gs.codepointsAvailable()); 218 | 219 | // incrementing beyond end of stream 220 | ++gs; 221 | REQUIRE(*gs == U""); 222 | REQUIRE_FALSE(gs.codepointsAvailable()); 223 | } 224 | -------------------------------------------------------------------------------- /src/libunicode/libunicode-config.cmake.in: -------------------------------------------------------------------------------- 1 | @PACKAGE_INIT@ 2 | 3 | # prevent repeatedly including the targets 4 | if(NOT TARGET unicode::core) 5 | include(${CMAKE_CURRENT_LIST_DIR}/libunicode-targets.cmake) 6 | endif() 7 | 8 | message(STATUS "Found @PROJECT_NAME@, version: ${@PROJECT_NAME@_VERSION}") 9 | -------------------------------------------------------------------------------- /src/libunicode/multistage_table_generator.h: -------------------------------------------------------------------------------- 1 | /** 2 | * This file is part of the "libunicode" project 3 | * Copyright (c) 2022 Christian Parpart 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 
7 | * 8 | * Unless required by applicable law or agreed to in writing, software 9 | * distributed under the License is distributed on an "AS IS" BASIS, 10 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 | * See the License for the specific language governing permissions and 12 | * limitations under the License. 13 | */ 14 | #pragma once 15 | 16 | #include 17 | #include 18 | 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include 24 | #include 25 | #include 26 | #include 27 | #include 28 | #include 29 | 30 | namespace support 31 | { 32 | /// Owning three-stage lookup table: stage1 maps a block number to a stage2 block, stage2 maps a position inside that block to a stage3 index, and stage3 stores the distinct values. 33 | template ::max()> 39 | struct multistage_table 40 | { 41 | using view_type = multistage_table_view; 42 | 43 | std::vector stage1; // div 44 | std::vector stage2; // mod 45 | std::vector stage3; // values 46 | 47 | auto to_view() const noexcept { return view_type { stage1.data(), stage2.data(), stage3.data() }; } 48 | 49 | T const& get(SourceType index) const noexcept { return to_view().get(index); } 50 | }; 51 | 52 | template ::max()> 59 | class multistage_table_generator 60 | { 61 | public: 62 | T const* _input; 63 | size_t _inputSize; 64 | multistage_table& _output; 65 | Stage3Finder _stage3Finder; 66 | /// Fills the output table from the input, one block at a time; the input size must be a multiple of BlockSize (an input shorter than one block produces an empty table). 67 | void generate() 68 | { 69 | assert(_inputSize % BlockSize == 0); 70 | _output.stage1.resize(_inputSize / BlockSize); 71 | for (SourceType blockStart = 0; blockStart + BlockSize <= _inputSize; blockStart += BlockSize) 72 | _output.stage1[blockStart / BlockSize] = get_or_create_index_to_stage2_block(blockStart); 73 | } 74 | /// Checks every input element against the generated table, visiting each block exactly once; throws on the first mismatch. 75 | void verify() const 76 | { 77 | for (SourceType blockStart = 0; blockStart + BlockSize <= _inputSize; blockStart += BlockSize) 78 | verify_block(blockStart / BlockSize); 79 | } 80 | 81 | private: 82 | void verify_block(SourceType blockNumber) const 83 | { 84 | for (SourceType codepoint = blockNumber * BlockSize; codepoint < (blockNumber + 1) * BlockSize; ++codepoint) 85 | { 86 | auto const& a = _input[codepoint]; 87 | auto const& b = _output.get(codepoint); 88 | if
(a != b) 89 | { 90 | throw runtime_error((std::ostringstream() 91 | << "U+" << std::hex << unsigned(codepoint) << " mismatch in properties.\n" 92 | << "Expected : " << a << "\nActual : " << b) 93 | .str()); 94 | } 95 | } 96 | } 97 | /// @returns the stage1 entry for the block starting at @p blockStart, reusing the entry of an earlier identical block or appending a new stage2 block. 98 | Stage1ElementType get_or_create_index_to_stage2_block(SourceType blockStart) 99 | { 100 | if (auto other_block = find_same_block(static_cast(blockStart))) 101 | return _output.stage1[other_block.value()]; 102 | 103 | // Block has not been seen yet. Create a new block. 104 | auto const stage2Index = _output.stage2.size() / BlockSize; 105 | assert(stage2Index < std::numeric_limits::max()); 106 | 107 | for (SourceType index = blockStart; index < blockStart + BlockSize; ++index) 108 | _output.stage2.emplace_back(get_or_create_stage3_index(index)); 109 | 110 | assert(_output.stage2.size() % BlockSize == 0); 111 | 112 | return static_cast(stage2Index); 113 | } 114 | /// @returns the block number of the first earlier block whose contents equal the block starting at @p blockStart, if any. 115 | std::optional find_same_block(size_t blockStart) const noexcept 116 | { 117 | assert(blockStart % BlockSize == 0); 118 | assert(blockStart + BlockSize <= _inputSize); 119 | 120 | for (size_t otherBlockStart = 0; otherBlockStart < blockStart; otherBlockStart += BlockSize) 121 | if (is_same_block(otherBlockStart, blockStart)) 122 | return { otherBlockStart / BlockSize }; 123 | 124 | return std::nullopt; 125 | } 126 | 127 | /// Tests if two given blocks are equivalent. 128 | /// @p a and @p b are both absolute offsets to the start of each block. 
129 | bool is_same_block(size_t a, size_t b) const noexcept 130 | { 131 | assert(a % BlockSize == 0); 132 | assert(b % BlockSize == 0); 133 | assert(a + BlockSize <= _inputSize); 134 | assert(b + BlockSize <= _inputSize); 135 | 136 | for (size_t i = 0; i < BlockSize; ++i) 137 | if (_input[a + i] != _input[b + i]) 138 | return false; 139 | 140 | return true; 141 | } 142 | /// @returns the stage3 index of the value at input position @p stage1Index, appending that value to stage3 when the finder does not locate it. 143 | Stage2ElementType get_or_create_stage3_index(SourceType stage1Index) 144 | { 145 | auto& properties = _output.stage3; 146 | auto const propertyIterator = _stage3Finder(properties.begin(), properties.end(), _input[stage1Index]); 147 | if (propertyIterator != properties.end()) 148 | return static_cast(distance(properties.begin(), propertyIterator)); 149 | 150 | auto const stage3Index = properties.size(); 151 | properties.emplace_back(_input[stage1Index]); 152 | assert(stage3Index < std::numeric_limits::max()); 153 | return static_cast(stage3Index); 154 | } 155 | }; 156 | /// Convenience wrapper: builds a multistage_table_generator over the given arguments and runs generate() on it. 157 | template ::max()> 164 | void generate(T const* input, 165 | size_t inputSize, 166 | multistage_table& output, 167 | Stage3Finder&& stage3Finder) 168 | { 169 | auto builder = 170 | multistage_table_generator { 171 | input, inputSize, output, std::forward(stage3Finder) 172 | }; 173 | builder.generate(); 174 | } 175 | 176 | } // namespace support 177 | -------------------------------------------------------------------------------- /src/libunicode/multistage_table_view.h: -------------------------------------------------------------------------------- 1 | /** 2 | * This file is part of the "libunicode" project 3 | * Copyright (c) 2022 Christian Parpart 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * 8 | * Unless required by applicable law or agreed to in writing, software 9 | * distributed under the License is distributed on an "AS IS" BASIS, 10 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
11 | * See the License for the specific language governing permissions and 12 | * limitations under the License. 13 | */ 14 | #pragma once 15 | 16 | #include 17 | #include 18 | 19 | namespace support 20 | { 21 | /// Non-owning view onto a three-stage lookup table (stage1 -> stage2 block -> stage3 value). 22 | template ::max()> 28 | struct multistage_table_view 29 | { 30 | using source_type = SourceType; 31 | using stage1_element_type = Stage1ElementType; 32 | using stage2_element_type = Stage2ElementType; 33 | using value_type = T; 34 | 35 | stage1_element_type const* stage1; // div: block number, addressed by (index / BlockSize) 36 | stage2_element_type const* stage2; // mod: stage3 index, addressed by (block start + index % BlockSize) 37 | value_type const* stage3; // values 38 | 39 | static std::size_t constexpr block_size = BlockSize; 40 | 41 | // size_t size() const noexcept { return stage1.size(); } 42 | /// Checked lookup: an @p index above MaxValue is replaced by @p fallback before the lookup is performed. 43 | value_type const& get(source_type index, source_type fallback = source_type {}) const noexcept 44 | { 45 | return unsafe_get(index <= MaxValue ? index : fallback); 46 | } 47 | /// Unchecked lookup: @p index is used as-is to address stage1, so the caller must keep it within the table. 48 | value_type const& unsafe_get(source_type index) const noexcept 49 | { 50 | auto const block_number = stage1[index / BlockSize]; 51 | auto const block_start = block_number * BlockSize; 52 | auto const element_offset = index % BlockSize; 53 | auto const property_index = stage2[block_start + element_offset]; 54 | return stage3[property_index]; 55 | } 56 | }; 57 | 58 | } // namespace support 59 | -------------------------------------------------------------------------------- /src/libunicode/run_segmenter.h: -------------------------------------------------------------------------------- 1 | /** 2 | * This file is part of the "libunicode" project 3 | * Copyright (c) 2020 Christian Parpart 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * 8 | * Unless required by applicable law or agreed to in writing, software 9 | * distributed under the License is distributed on an "AS IS" BASIS, 10 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
11 | * See the License for the specific language governing permissions and 12 | * limitations under the License. 13 | */ 14 | #pragma once 15 | 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | 22 | #include 23 | #include 24 | #include 25 | #include 26 | 27 | namespace unicode 28 | { 29 | /// The property type a segmenter reports, taken from its nested property_type. 30 | template 31 | using segmenter_property_t = typename T::property_type; 32 | 33 | template 34 | using segmenter_property_tuple = std::tuple...>; 35 | 36 | namespace detail 37 | { 38 | template 39 | inline void _continuePrintList(std::ostream& os, Prepend const& prep, std::tuple const& p) 40 | { 41 | ((os << prep << std::get(p)), ...); 42 | } 43 | } // namespace detail 44 | 45 | /// API for segmenting incoming text into small runs. 46 | /// 47 | /// A ``run`` is a unit suitable for text shaping, but may as well be used 48 | /// for other purposes, too. 49 | /// 50 | /// @see script_segmenter 51 | /// @see emoji_segmenter 52 | /// @see grapheme_segmenter 53 | template 54 | class basic_run_segmenter 55 | { 56 | public: 57 | using property_tuple = std::tuple...>; 58 | 59 | /// Contains the extracted information of run_segmenter's single run. 60 | struct range 61 | { 62 | /// start-offset of the current segment that has been extracted 63 | size_t start = 0; 64 | 65 | /// end-offset (excluding) of the current segment that has been extracted 66 | size_t end = 0; 67 | 68 | /// the properties reported by the segmenters for this segment, e.g. the script (writing system) 69 | /// and the presentation style of the underlying segment 70 | property_tuple properties; 71 | /// Two ranges are equal if both offsets and all properties match. 72 | constexpr bool operator==(range other) const noexcept 73 | { 74 | return start == other.start && end == other.end && properties == other.properties; 75 | } 76 | 77 | constexpr bool operator!=(range other) const noexcept { return !(*this == other); } 78 | /// Prints the range as "(start..end" followed by ", property" for each property and a closing ")". 79 | friend inline std::ostream& operator<<(std::ostream& os, range r) 80 | { 81 | os << '(' << r.start << ".." 
<< r.end; 82 | detail::_continuePrintList(os, ", ", r.properties); 83 | os << ')'; 84 | return os; 85 | } 86 | }; 87 | /// Constructs a segmenter over the given UTF-32 text. 88 | explicit basic_run_segmenter(std::u32string_view sv): basic_run_segmenter(sv.data(), sv.size()) {} 89 | 90 | basic_run_segmenter(char32_t const* text, size_t size): segmenter_ {}, size_ { size } 91 | { 92 | initialize<0, Segmenter...>(text, size); 93 | } 94 | /// True once the last split position has reached the end of the input. 95 | constexpr bool finished() const noexcept { return lastSplit_ >= size_; } 96 | 97 | /// Splits input text into segments, such as pure text by script, emoji-emoji, or emoji-text. 98 | /// 99 | /// @retval true more data can be processed 100 | /// @retval false end of input data has been reached. 101 | bool consume(out result) 102 | { 103 | if (finished()) 104 | return false; 105 | 106 | consumeAllUntilSplitPosition<0, Segmenter...>(); 107 | // The next run ends at the smallest split position held for any segmenter. 108 | auto const minPosition = std::min_element(begin(positions_), end(positions_)); 109 | 110 | lastSplit_ = *minPosition; 111 | // The new run starts where the previous one ended. 112 | candidate_.start = candidate_.end; 113 | candidate_.end = lastSplit_; 114 | candidate_.properties = properties_; 115 | 116 | *result = candidate_; 117 | return true; 118 | } 119 | 120 | private: 121 | template 122 | void initialize(char32_t const*, size_t) 123 | { 124 | } 125 | /// Assigns a freshly constructed segmenter to its tuple slot, then continues with the remaining ones; the empty overload above ends the chain. 126 | template 127 | void initialize(char32_t const* text, size_t size) 128 | { 129 | std::get(segmenter_) = Current { text, size }; 130 | initialize(text, size); 131 | } 132 | 133 | template 134 | void consumeAllUntilSplitPosition() 135 | { 136 | } 137 | /// Advances one segmenter, then continues with the remaining ones; the empty overload above ends the chain. 138 | template 139 | void consumeAllUntilSplitPosition() 140 | { 141 | consumeUntilSplitPosition(std::get(segmenter_), out(positions_[I]), out(std::get(properties_))); 142 | consumeAllUntilSplitPosition(); 143 | } 144 | /// Advances @p segmenter until its split position lies beyond lastSplit_ or it reports that nothing more can be consumed. 145 | template 146 | void consumeUntilSplitPosition(TheSegmenter& segmenter, out position, out property) 147 | { 148 | if (*position > lastSplit_) 149 | return; 150 | 151 | if (*position >= size_) 152 | return; 153 | 154 | for (;;) 155 | { 156 | if 
(!segmenter.consume(position, property)) 157 | break; 158 | 159 | if (*position > lastSplit_) 160 | break; 161 | } 162 | } 163 | 164 | // private data 165 | 166 | using position_list = std::array; 167 | using segmenter_tuple = std::tuple; 168 | 169 | size_t lastSplit_ = 0; // end offset of the most recently returned run 170 | range candidate_ = {}; 171 | position_list positions_ {}; // current split position of each segmenter 172 | property_tuple properties_ {}; // property last reported by each segmenter 173 | segmenter_tuple segmenter_; 174 | size_t const size_; 175 | }; 176 | 177 | using run_segmenter = basic_run_segmenter; 178 | 179 | } // namespace unicode 180 | -------------------------------------------------------------------------------- /src/libunicode/scan.h: -------------------------------------------------------------------------------- 1 | /** 2 | * This file is part of the "libunicode" project 3 | * Copyright (c) 2020 Christian Parpart 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * 8 | * Unless required by applicable law or agreed to in writing, software 9 | * distributed under the License is distributed on an "AS IS" BASIS, 10 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 | * See the License for the specific language governing permissions and 12 | * limitations under the License. 13 | */ 14 | #pragma once 15 | 16 | #include 17 | 18 | #include 19 | 20 | namespace unicode 21 | { 22 | 23 | /// Holds the result of a call to scan_text(). 24 | struct scan_result 25 | { 26 | /// Number of columns scanned. 27 | /// One column equals a single narrow-width codepoint. 28 | /// Codepoints with property East Asian Width Wide are treated as two columns. 29 | size_t count; 30 | 31 | /// Pointer to UTF-8 grapheme cluster start. 32 | char const* start; 33 | 34 | /// Pointer to UTF-8 grapheme cluster end, i.e. one byte behind 35 | /// the last successfully processed complete UTF-8 byte. 
36 | char const* end; 37 | }; 38 | 39 | /// Holds the state to keep through a consecutive sequence of calls to scan_text(). 40 | /// 41 | /// This state holds the UTF-8 decoding state, if processing had to be stopped 42 | /// at an incomplete UTF-8 byte sequence, 43 | /// and the last decoded Unicode codepoint necessary for grapheme cluster segmentation. 44 | struct scan_state 45 | { 46 | utf8_decoder_state utf8 {}; 47 | char32_t lastCodepointHint {}; 48 | 49 | /// Pointer to one byte after the last scanned codepoint. 50 | char const* next {}; 51 | }; 52 | 53 | /// Callback-interface that allows precisely understanding the structure of a UTF-8 sequence. 54 | class grapheme_cluster_receiver 55 | { 56 | public: 57 | virtual ~grapheme_cluster_receiver() = default; 58 | // NOTE(review): when each callback fires is decided by the scanner implementation, which is not part of this header. 59 | virtual void receiveAsciiSequence(std::string_view codepoints) noexcept = 0; 60 | virtual void receiveGraphemeCluster(std::string_view codepoints, size_t columnCount) noexcept = 0; 61 | virtual void receiveInvalidGraphemeCluster() noexcept = 0; 62 | }; 63 | 64 | /// Quite obviously, this grapheme_cluster_receiver will do nothing. 
65 | class null_receiver final: public grapheme_cluster_receiver 66 | { 67 | public: 68 | void receiveAsciiSequence(std::string_view) noexcept override {} 69 | void receiveGraphemeCluster(std::string_view, size_t) noexcept override {} 70 | void receiveInvalidGraphemeCluster() noexcept override {} 71 | /// Returns the shared instance, held in a function-local static. 72 | static null_receiver& get() noexcept 73 | { 74 | static null_receiver instance {}; 75 | return instance; 76 | } 77 | }; 78 | 79 | namespace detail 80 | { 81 | size_t scan_for_text_ascii(std::string_view text, size_t maxColumnCount) noexcept; 82 | 83 | template 84 | size_t scan_for_text_ascii_simd(std::string_view text, size_t maxColumnCount) noexcept; 85 | size_t scan_for_text_ascii_256(std::string_view text, size_t maxColumnCount) noexcept; 86 | size_t scan_for_text_ascii_512(std::string_view text, size_t maxColumnCount) noexcept; 87 | scan_result scan_for_text_nonascii(scan_state& state, 88 | std::string_view text, 89 | size_t maxColumnCount, 90 | grapheme_cluster_receiver& receiver) noexcept; 91 | } // namespace detail 92 | 93 | /// Scans a sequence of UTF-8 encoded bytes. 94 | /// 95 | /// This call will return as soon as one of the following conditions is met: 96 | /// 97 | /// - given the input sequence, the rightmost invalid or complete UTF-8 sequence is processed, 98 | /// - maxColumnCount is reached and the next grapheme cluster would exceed the given limit, 99 | /// - a control character is about to be processed. 100 | /// 101 | /// When this function returns, it is guaranteed to not contain an incomplete UTF-8 sequence 102 | /// at the end of the output sequence. 103 | /// 104 | /// Calling this function again with more bytes will resume decoding that UTF-8 sequence 105 | /// with the help of the passed UTF-8 decoder state. 106 | /// 107 | /// @return scanned textual result. That is, a sequence of 108 | /// either valid or invalid UTF-8 codepoints, 109 | /// but not incomplete codepoints at the end. 
110 | scan_result scan_text(scan_state& state, std::string_view text, size_t maxColumnCount) noexcept; 111 | /// Overload that additionally takes a grapheme_cluster_receiver (presumably notified while scanning; confirm in scan.cpp). 112 | scan_result scan_text(scan_state& state, 113 | std::string_view text, 114 | size_t maxColumnCount, 115 | grapheme_cluster_receiver& receiver) noexcept; 116 | 117 | } // namespace unicode 118 | -------------------------------------------------------------------------------- /src/libunicode/scan256.cpp: -------------------------------------------------------------------------------- 1 | // SPDX-License-Identifier: Apache-2.0 2 | #include 3 | #include 4 | // Forwards to the generic SIMD ASCII scanner with a bit width of 256. 5 | namespace unicode::detail 6 | { 7 | size_t scan_for_text_ascii_256(std::string_view text, size_t maxColumnCount) noexcept 8 | { 9 | return scan_for_text_ascii_simd<256>(text, maxColumnCount); 10 | } 11 | } // namespace unicode::detail 12 | -------------------------------------------------------------------------------- /src/libunicode/scan512.cpp: -------------------------------------------------------------------------------- 1 | // SPDX-License-Identifier: Apache-2.0 2 | #include 3 | #include 4 | // Forwards to the generic SIMD ASCII scanner with a bit width of 512. 5 | namespace unicode::detail 6 | { 7 | size_t scan_for_text_ascii_512(std::string_view text, size_t maxColumnCount) noexcept 8 | { 9 | return scan_for_text_ascii_simd<512>(text, maxColumnCount); 10 | } 11 | } // namespace unicode::detail 12 | -------------------------------------------------------------------------------- /src/libunicode/scan_simd_impl.h: -------------------------------------------------------------------------------- 1 | // SPDX-License-Identifier: Apache-2.0 2 | #pragma once 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | // clang-format off 9 | #if __has_include() && defined(LIBUNICODE_USE_STD_SIMD) && !defined(__APPLE__) && !defined(__FreeBSD__) 10 | #define USE_STD_SIMD 11 | #include 12 | namespace stdx = std::experimental; 13 | #elif __has_include() && defined(LIBUNICODE_USE_STD_SIMD) 14 | #define USE_STD_SIMD 15 | #include 16 | namespace stdx = std; 17 | #elif 
defined(LIBUNICODE_USE_INTRINSICS) 18 | #include "intrinsics.h" 19 | #endif 20 | // clang-format on 21 | namespace unicode::detail 22 | { 23 | template 24 | size_t scan_for_text_ascii_simd(std::string_view text, size_t maxColumnCount) noexcept 25 | { // Returns the number of leading bytes of text (capped at maxColumnCount) that lie in [0x20, 0x7F], i.e. are neither control nor non-ASCII. 26 | [[maybe_unused]] constexpr int simd_size = SimdBitWidth / 8; 27 | auto input = text.data(); 28 | auto const end = text.data() + std::min(text.size(), maxColumnCount); 29 | // Vectorized fast path: skip whole vectors of plain ASCII and stop at the first control (< 0x20) or non-ASCII (>= 0x80) byte. 30 | #if defined(USE_STD_SIMD) 31 | auto simd_text = stdx::fixed_size_simd {}; 32 | while (end - input > simd_size) // pointer difference: never forms a pointer before the start of the buffer 33 | { 34 | simd_text.copy_from(input, stdx::element_aligned); 35 | auto const is_control_mask = simd_text < 0x20; 36 | auto const is_complex_mask = (simd_text & 0x80) == 0x80; 37 | auto const ctrl_or_complex_mask = is_control_mask || is_complex_mask; 38 | if (stdx::any_of(ctrl_or_complex_mask)) 39 | { 40 | input += stdx::find_first_set(ctrl_or_complex_mask); 41 | break; 42 | } 43 | input += simd_size; 44 | } 45 | #elif defined(LIBUNICODE_USE_INTRINSICS) 46 | constexpr auto trailing_zero_count = [](T value) noexcept { 47 | // clang-format off 48 | if constexpr (std::same_as, uint32_t>) 49 | { 50 | #if defined(_WIN32) 51 | // return _tzcnt_u32(value); 52 | // Don't do _tzcnt_u32, because that's only available on x86-64, but not on ARM64. 
53 | unsigned long r = 0; 54 | _BitScanForward(&r, value); 55 | return r; 56 | #else 57 | return __builtin_ctz(value); 58 | #endif 59 | } 60 | else 61 | { 62 | #if defined(_WIN32) 63 | unsigned long r = 0; 64 | _BitScanForward64(&r, value); 65 | return r; 66 | #else 67 | return __builtin_ctzll(value); // operates on unsigned long long, so a 64-bit mask is never truncated where long is 32-bit 68 | #endif 69 | } 70 | // clang-format on 71 | }; 72 | using intrinsics = intrinsics; 73 | auto const vec_control = intrinsics::set1_epi8(0x20); // 0..0x1F 74 | auto const vec_complex = intrinsics::set1_epi8(-128); // equals to 0x80 (0b1000'0000) 75 | // The loop condition uses a pointer difference so that no pointer before the start of the buffer is ever formed. 76 | while (end - input > simd_size) 77 | { 78 | auto const batch = intrinsics::load(input); 79 | auto const is_control_mask = intrinsics::less(batch, vec_control); 80 | auto const is_complex_mask = intrinsics::equal(intrinsics::and_vec(batch, vec_complex), vec_complex); 81 | auto const ctrl_or_complex_mask = intrinsics::or_mask(is_control_mask, is_complex_mask); 82 | if (ctrl_or_complex_mask) 83 | { 84 | int const advance = trailing_zero_count(intrinsics::to_unsigned(ctrl_or_complex_mask)); 85 | input += advance; 86 | break; 87 | } 88 | input += simd_size; // advance by one full vector in bytes, as the std::simd path does 89 | } 90 | #endif 91 | // Scalar tail: handles the remaining bytes, or all of them when no SIMD path is compiled in. 92 | constexpr auto is_ascii = [](char ch) noexcept { 93 | auto const is_control = static_cast(ch) < 0x20; 94 | auto const is_complex = static_cast(ch) & 0x80; 95 | return !is_control && !is_complex; 96 | }; 97 | while (input != end && is_ascii(*input)) 98 | ++input; 99 | 100 | return static_cast(std::distance(text.data(), input)); 101 | } 102 | } // namespace unicode::detail 103 | -------------------------------------------------------------------------------- /src/libunicode/scoped_timer.h: -------------------------------------------------------------------------------- 1 | /** 2 | * This file is part of the "libunicode" project 3 | * Copyright (c) 2022 Christian Parpart 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 
7 | * 8 | * Unless required by applicable law or agreed to in writing, software 9 | * distributed under the License is distributed on an "AS IS" BASIS, 10 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 | * See the License for the specific language governing permissions and 12 | * limitations under the License. 13 | */ 14 | #pragma once 15 | 16 | #include 17 | #include 18 | #include 19 | 20 | namespace support 21 | { 22 | /// Times its own lifetime: prints the message on construction and the elapsed time on destruction. 23 | class scoped_timer 24 | { 25 | public: 26 | scoped_timer(std::ostream* output, std::string message): 27 | _start { std::chrono::steady_clock::now() }, _output { output }, _message { std::move(message) } 28 | { 29 | if (_output) 30 | { 31 | *_output << _message << " ... "; 32 | _output->flush(); 33 | } 34 | } 35 | /// Prints the time elapsed since construction followed by " ms"; does nothing when no output stream was given. 36 | ~scoped_timer() 37 | { 38 | if (!_output) 39 | return; 40 | 41 | auto const finish = std::chrono::steady_clock::now(); 42 | auto const diff = finish - _start; 43 | *_output << std::chrono::duration_cast(diff).count() << " ms\n"; 44 | } 45 | 46 | private: 47 | std::chrono::time_point _start; 48 | std::ostream* _output; // may be null, which disables all output 49 | std::string _message; 50 | }; 51 | 52 | } // namespace support 53 | -------------------------------------------------------------------------------- /src/libunicode/script_segmenter.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * This file is part of the "libunicode" project 3 | * Copyright (c) 2020 Christian Parpart 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * 8 | * Unless required by applicable law or agreed to in writing, software 9 | * distributed under the License is distributed on an "AS IS" BASIS, 10 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 | * See the License for the specific language governing permissions and 12 | * limitations under the License. 
13 | */ 14 | #include 15 | #include 16 | 17 | #include 18 | 19 | using namespace std; 20 | 21 | namespace unicode 22 | { 23 | // isPreferred(): true for every script except Invalid, Common and Inherited. 24 | namespace 25 | { 26 | bool constexpr isPreferred(Script script) noexcept 27 | { 28 | switch (script) 29 | { 30 | case Script::Invalid: 31 | case Script::Common: 32 | case Script::Inherited: return false; 33 | default: return true; 34 | } 35 | } 36 | } // namespace 37 | /// Scans forward to the next script boundary and returns the resolved script together with the offset reached; nullopt once the input is exhausted. 38 | optional script_segmenter::consume() 39 | { 40 | if (offset_ >= size_) 41 | return nullopt; 42 | 43 | while (offset_ < size_) 44 | { 45 | ScriptSet const nextScriptSet = getScriptsFor(currentChar()); 46 | 47 | if (!mergeSets(nextScriptSet, currentScriptSet_)) 48 | { 49 | // If merging failed, then we have found a script segment boundary. 50 | auto const res = result { resolveScript(), offset_ }; 51 | currentScriptSet_ = nextScriptSet; 52 | return res; 53 | } 54 | 55 | offset_++; 56 | } 57 | 58 | auto const res = result { resolveScript(), offset_ }; 59 | currentScriptSet_.clear(); 60 | return res; 61 | } 62 | /// Merges @p nextSet into @p currentSet; returns false if the two sets cannot be merged. 63 | bool script_segmenter::mergeSets(ScriptSet const& nextSet, ScriptSet& currentSet) noexcept 64 | { 65 | if (nextSet.empty() || currentSet.empty()) 66 | return false; 67 | 68 | auto currentSetIter = currentSet.begin(); 69 | auto const currentSetEnd = currentSet.end(); 70 | 71 | Script priorityScript = *currentSetIter++; 72 | // A next set led by a non-preferred script (Invalid/Common/Inherited) always merges and leaves currentSet unchanged. 73 | if (!isPreferred(nextSet.at(0))) 74 | { 75 | if (nextSet.size() == 2 && !isPreferred(priorityScript) && commonPreferredScript_ == Script::Common) 76 | commonPreferredScript_ = nextSet.at(1); 77 | return true; 78 | } 79 | 80 | // If priorityScript is not a preferred script (e.g. Common or Inherited) then take nextScriptSet 81 | if (!isPreferred(priorityScript)) 82 | { 83 | currentSet = nextSet; 84 | return true; 85 | } 86 | 87 | auto nextSetIter = nextSet.begin(); 88 | auto const nextSetEnd = nextSet.end(); 89 | // currentSet holds exactly one script: mergeable only if nextSet contains it. 90 | if (currentSetIter == currentSetEnd) 91 | return std::find(nextSetIter, nextSetEnd, priorityScript) != nextSetEnd; 92 | 93 | // See if we have 
a priority script, and if not, get it from the nextScriptSet 94 | bool hasPriorityScript = find(nextSetIter, nextSetEnd, priorityScript) != nextSetEnd; 95 | if (!hasPriorityScript) 96 | { 97 | priorityScript = *nextSetIter++; 98 | hasPriorityScript = find(currentSetIter, currentSetEnd, priorityScript) != currentSetEnd; 99 | } 100 | // The surviving scripts are written back in place, starting at the front of currentSet. 101 | auto currentWriteIter = currentSet.begin(); 102 | if (hasPriorityScript) 103 | *currentWriteIter++ = priorityScript; 104 | 105 | // Intersect the remaining nextScriptSet into the currentSetIter. 106 | if (nextSetIter != nextSetEnd) 107 | { 108 | while (currentSetIter != currentSetEnd) 109 | { 110 | auto const sc = *currentSetIter++; 111 | if (find(nextSetIter, nextSetEnd, sc) != nextSetEnd) 112 | *currentWriteIter++ = sc; 113 | } 114 | } 115 | 116 | // NB: currentWriteIter never precedes currentSet.begin(), so the distance is non-negative and it is safe to cast to unsigned. 117 | auto const writeCount = static_cast(distance(currentSet.begin(), currentWriteIter)); 118 | if (writeCount == 0) 119 | return false; 120 | 121 | currentSet.resize(writeCount); 122 | return true; 123 | } 124 | /// Builds the script set for @p codepoint from its script extensions and its script. 125 | script_segmenter::ScriptSet script_segmenter::getScriptsFor(char32_t codepoint) noexcept 126 | { 127 | ScriptSet scriptSet; 128 | 129 | // Collect all script(/-extensions) for @p codepoint into scriptSet. 130 | size_t const sceCount = script_extensions(codepoint, scriptSet.data(), scriptSet.capacity()); 131 | scriptSet.resize(sceCount); 132 | 133 | // Get the script for @p codepoint. 134 | Script const sc = script(codepoint); 135 | 136 | // If the script of @p codepoint is also in scriptSet, 137 | // then swap it to the front of the set, 138 | // otherwise append it to the back of scriptSet. 
139 | if (auto i = find(scriptSet.begin(), scriptSet.end(), sc); i != scriptSet.end()) 140 | swap(*i, *scriptSet.begin()); 141 | else 142 | scriptSet.push_back(sc); 143 | 144 | return scriptSet; 145 | } 146 | 147 | } // namespace unicode 148 | -------------------------------------------------------------------------------- /src/libunicode/script_segmenter.h: -------------------------------------------------------------------------------- 1 | /** 2 | * This file is part of the "libunicode" project 3 | * Copyright (c) 2020 Christian Parpart 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * 8 | * Unless required by applicable law or agreed to in writing, software 9 | * distributed under the License is distributed on an "AS IS" BASIS, 10 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 | * See the License for the specific language governing permissions and 12 | * limitations under the License. 
13 | */ 14 | #pragma once 15 | 16 | #include 17 | #include 18 | 19 | #include 20 | #include 21 | 22 | namespace unicode 23 | { 24 | 25 | class script_segmenter 26 | { 27 | public: 28 | constexpr script_segmenter() noexcept = default; 29 | constexpr script_segmenter& operator=(script_segmenter const&) noexcept = default; 30 | constexpr script_segmenter& operator=(script_segmenter&&) noexcept = default; 31 | constexpr script_segmenter(script_segmenter const&) noexcept = default; 32 | constexpr script_segmenter(script_segmenter&&) noexcept = default; 33 | 34 | constexpr explicit script_segmenter(char32_t const* data) noexcept: script_segmenter { data, getStringLength(data) } {} 35 | 36 | constexpr script_segmenter(char32_t const* data, size_t size) noexcept: data_ { data }, offset_ { 0 }, size_ { size } 37 | { 38 | currentScriptSet_.push_back(Script::Common); 39 | } 40 | 41 | constexpr script_segmenter(std::u32string_view data) noexcept: data_ { data.data() }, offset_ { 0 }, size_ { data.size() } 42 | { 43 | currentScriptSet_.push_back(Script::Common); 44 | } 45 | 46 | struct result 47 | { 48 | Script script; 49 | size_t size; 50 | }; 51 | 52 | std::optional consume(); 53 | 54 | using property_type = Script; 55 | 56 | bool consume(out size, out