├── .clang-format
├── .clang-tidy
├── .editorconfig
├── .github
    ├── FUNDING.yml
    ├── labeler.yml
    └── workflows
    │   ├── build.yml
    │   ├── checks.yml
    │   └── labeler.yml
├── .gitignore
├── .vimspector.json
├── CMakeLists.txt
├── CMakePresets.json
├── Changelog.md
├── LICENSE
├── README.md
├── TODO.md
├── cmake
    ├── ClangTidy.cmake
    ├── EnableCcache.cmake
    ├── PedanticCompiler.cmake
    ├── ThirdParties.cmake
    └── presets
    │   ├── common.json
    │   ├── os-linux.json
    │   ├── os-macos.json
    │   └── os-windows.json
├── pylintrc
├── scripts
    ├── check-pr-todos.sh
    ├── install-deps.ps1
    └── install-deps.sh
├── src
    ├── libunicode
    │   ├── CMakeLists.txt
    │   ├── benchmark.cpp
    │   ├── capi.cpp
    │   ├── capi.h
    │   ├── capi_test.cpp
    │   ├── codepoint_properties.cpp
    │   ├── codepoint_properties.h
    │   ├── codepoint_properties_loader.cpp
    │   ├── codepoint_properties_loader.h
    │   ├── convert.h
    │   ├── convert_test.cpp
    │   ├── emoji_presentation_scanner.c
    │   ├── emoji_presentation_scanner.rl
    │   ├── emoji_segmenter.cpp
    │   ├── emoji_segmenter.h
    │   ├── emoji_segmenter_test.cpp
    │   ├── grapheme_segmenter.cpp
    │   ├── grapheme_segmenter.h
    │   ├── grapheme_segmenter_test.cpp
    │   ├── intrinsics.h
    │   ├── libunicode-config.cmake.in
    │   ├── mktables.py
    │   ├── multistage_table_generator.h
    │   ├── multistage_table_view.h
    │   ├── run_segmenter.h
    │   ├── run_segmenter_test.cpp
    │   ├── scan.cpp
    │   ├── scan.h
    │   ├── scan256.cpp
    │   ├── scan512.cpp
    │   ├── scan_simd_impl.h
    │   ├── scan_test.cpp
    │   ├── scoped_timer.h
    │   ├── script_segmenter.cpp
    │   ├── script_segmenter.h
    │   ├── script_segmenter_test.cpp
    │   ├── simd_detector.cpp
    │   ├── simd_detector.h
    │   ├── support.h
    │   ├── tablegen.cpp
    │   ├── test_main.cpp
    │   ├── ucd_private.h
    │   ├── unicode_test.cpp
    │   ├── utf8.cpp
    │   ├── utf8.h
    │   ├── utf8_grapheme_segmenter.h
    │   ├── utf8_grapheme_segmenter_test.cpp
    │   ├── utf8_test.cpp
    │   ├── width.cpp
    │   ├── width.h
    │   ├── width_test.cpp
    │   ├── word_segmenter.h
    │   └── word_segmenter_test.cpp
    └── tools
    │   ├── CMakeLists.txt
    │   ├── uc-inspect.cpp
    │   └── unicode-query.cpp
├── tests
    └── zalgo.txt
└── vcpkg.json


/.clang-format:
--------------------------------------------------------------------------------
  1 | ---
  2 | BasedOnStyle: Microsoft
  3 | AccessModifierOffset: '-2'
  4 | AlignAfterOpenBracket: Align
  5 | AlignConsecutiveMacros: 'true'
  6 | AlignConsecutiveDeclarations: 'false'
  7 | AlignEscapedNewlines: Left
  8 | AlignOperands: 'true'
  9 | AlignTrailingComments: 'true'
 10 | AllowAllArgumentsOnNextLine: 'true'
 11 | AllowAllConstructorInitializersOnNextLine: 'true'
 12 | AllowAllParametersOfDeclarationOnNextLine: 'true'
 13 | AllowShortBlocksOnASingleLine: 'false'
 14 | AllowShortCaseLabelsOnASingleLine: 'true'
 15 | AllowShortFunctionsOnASingleLine: InlineOnly
 16 | AllowShortIfStatementsOnASingleLine: Never
 17 | AllowShortLambdasOnASingleLine: Inline
 18 | AllowShortLoopsOnASingleLine: 'false'
 19 | AlwaysBreakAfterReturnType: None
 20 | AlwaysBreakBeforeMultilineStrings: 'false'
 21 | AlwaysBreakTemplateDeclarations: 'Yes'
 22 | BinPackArguments: 'false'
 23 | BinPackParameters: 'false'
 24 | BreakBeforeBinaryOperators: NonAssignment
 25 | BreakBeforeBraces: Custom
 26 | BreakBeforeTernaryOperators: 'true'
 27 | BreakConstructorInitializers: AfterColon
 28 | BreakInheritanceList: AfterColon
 29 | BreakStringLiterals: 'true'
 30 | ColumnLimit: '130'
 31 | CompactNamespaces: 'false'
 32 | ConstructorInitializerAllOnOneLineOrOnePerLine: 'true'
 33 | ConstructorInitializerIndentWidth: '4'
 34 | ContinuationIndentWidth: '4'
 35 | Cpp11BracedListStyle: 'false'
 36 | DerivePointerAlignment: 'false'
 37 | FixNamespaceComments: 'true'
 38 | IncludeBlocks: Regroup
 39 | IndentCaseLabels: true
 40 | IndentPPDirectives: BeforeHash
 41 | IndentWidth: '4'
 42 | IndentWrappedFunctionNames: 'false'
 43 | Language: Cpp
 44 | MaxEmptyLinesToKeep: '1'
 45 | NamespaceIndentation: Inner
 46 | PenaltyBreakAssignment: '0'
 47 | PointerAlignment: Left
 48 | ReflowComments: 'true'
 49 | SortIncludes: 'true'
 50 | SortUsingDeclarations: 'true'
 51 | SpaceAfterCStyleCast: 'true'
 52 | SpaceAfterLogicalNot: 'false'
 53 | SpaceAfterTemplateKeyword: 'true'
 54 | SpaceBeforeAssignmentOperators: 'true'
 55 | SpaceBeforeCpp11BracedList: 'true'
 56 | SpaceBeforeCtorInitializerColon: 'false'
 57 | SpaceBeforeInheritanceColon: 'false'
 58 | SpaceBeforeParens: ControlStatements
 59 | SpaceBeforeRangeBasedForLoopColon: 'false'
 60 | SpaceInEmptyParentheses: 'false'
 61 | SpacesInAngles: 'false'
 62 | SpacesInCStyleCastParentheses: 'false'
 63 | SpacesInContainerLiterals: 'false'
 64 | SpacesInParentheses: 'false'
 65 | SpacesInSquareBrackets: 'false'
 66 | Standard: Cpp11
 67 | TabWidth: '4'
 68 | UseTab: Never
 69 | IncludeCategories:
 70 |   - Regex:     '^<(contour)/'
 71 |     Priority:  0
 72 |   - Regex:     '^<(terminal)/'
 73 |     Priority:  1
 74 |   - Regex:     '^<(terminal_renderer)/'
 75 |     Priority:  2
 76 |   - Regex:     '^<(text_shaper)/'
 77 |     Priority:  3
 78 |   - Regex:     '^<(crispy)/'
 79 |     Priority:  4
 80 |   - Regex:     '^<(libunicode)/'
 81 |     Priority:  5
 82 |   - Regex:     '^<(fmt)/'
 83 |     Priority:  6
 84 |   - Regex:     '^<(yaml-cpp)/'
 85 |     Priority:  7
 86 |   - Regex:     '^<(range)/'
 87 |     Priority:  8
 88 |   - Regex:     '^<gsl/'
 89 |     Priority:  9
 90 |   - Regex:     '^<fontconfig/'
 91 |     Priority:  10
 92 |   - Regex:     '^<harfbuzz/'
 93 |     Priority:  11
 94 |   - Regex:     '^<(QtCore|QtGui|QtWidgets|QtQml|QtQuick|QtNetwork|QtMultimedia)/'
 95 |     Priority:  12
 96 |   - Regex:     '^<catch2/'
 97 |     Priority:  20
 98 |   - Regex:     '^<[[:alnum:]_]+>'
 99 |     Priority:  21
100 |   - Regex:     '<[[:alnum:]_]+\.h>'
101 |     Priority:  22
102 |   - Regex:     '.*'
103 |     Priority:  23
104 | 


--------------------------------------------------------------------------------
/.clang-tidy:
--------------------------------------------------------------------------------
 1 | ---
 2 | Checks: >-
 3 |   -*,
 4 |   clang-diagnostic-*,
 5 |   clang-analyzer-*,
 6 |   bugprone-*,
 7 |   -bugprone-suspicious-include,
 8 |   bugprone-unchecked-optional-access,
 9 |   performance-*,
10 |   -performance-no-int-to-ptr,
11 |   readability-non-const-parameter,
12 |   readability-redundant-*,
13 |   cppcoreguidelines-slicing,
14 |   readability-identifier-naming
15 | UseColor: true
16 | WarningsAsErrors: true
17 | HeaderFilterRegex: ''
18 | FormatStyle:     none
19 | CheckOptions:
20 |   - key:             bugprone-easily-swappable-parameters.MinimumLength
21 |     value:           '3'
22 |   - key:             cert-dcl16-c.NewSuffixes
23 |     value:           'L;LL;LU;LLU'
24 |   - key:             cert-oop54-cpp.WarnOnlyIfThisHasSuspiciousField
25 |     value:           '0'
26 |   - key:             cppcoreguidelines-explicit-virtual-functions.IgnoreDestructors
27 |     value:           '1'
28 |   - key:             cppcoreguidelines-non-private-member-variables-in-classes.IgnoreClassesWithAllMemberVariablesBeingPublic
29 |     value:           '1'
30 |   - key:             google-readability-braces-around-statements.ShortStatementLines
31 |     value:           '1'
32 |   - key:             google-readability-function-size.StatementThreshold
33 |     value:           '800'
34 |   - key:             google-readability-namespace-comments.ShortNamespaceLines
35 |     value:           '10'
36 |   - key:             google-readability-namespace-comments.SpacesBeforeComments
37 |     value:           '2'
38 |   - key:             modernize-loop-convert.MaxCopySize
39 |     value:           '16'
40 |   - key:             modernize-loop-convert.MinConfidence
41 |     value:           reasonable
42 |   - key:             modernize-loop-convert.NamingStyle
43 |     value:           CamelCase
44 |   - key:             modernize-pass-by-value.IncludeStyle
45 |     value:           llvm
46 |   - key:             modernize-replace-auto-ptr.IncludeStyle
47 |     value:           llvm
48 |   - key:             modernize-use-nullptr.NullMacros
49 |     value:           'NULL'
50 |   - key:   readability-identifier-naming.EnumCase
51 |     value: Camel_Snake_Case
52 |   - key:   readability-identifier-naming.ClassIgnoredRegexp
53 |     value: '^(RagelIterator|Expectation)$'
54 |   - key:   readability-identifier-naming.ClassCase
55 |     value: lower_case
56 |   - key:   readability-identifier-naming.ClassMemberCase
57 |     value: lower_case
58 |   - key:   readability-identifier-naming.ClassMethodCase
59 |     value: lower_case
60 |   - key:   readability-identifier-naming.ParameterCase
61 |     value: camelBack
62 |   - key:   readability-identifier-naming.ParameterPrefix
63 |     value: ''
64 |   - key:   readability-identifier-naming.ScopedEnumConstantCase
65 |     value: Camel_Snake_Case
66 | 


--------------------------------------------------------------------------------
/.editorconfig:
--------------------------------------------------------------------------------
 1 | root = true
 2 | 
 3 | [*]
 4 | indent_style = space
 5 | indent_size = 4
 6 | insert_final_newline = true
 7 | end_of_line = lf
 8 | charset = utf-8
 9 | trim_trailing_whitespace = true
10 | 
11 | [*.yml]
12 | indent_style = space
13 | indent_size = 4
14 | 


--------------------------------------------------------------------------------
/.github/FUNDING.yml:
--------------------------------------------------------------------------------
1 | # These are supported funding model platforms
2 | 
3 | github: ['christianparpart']
4 | custom: ['https://paypal.me/ChristianParpart']
5 | 


--------------------------------------------------------------------------------
/.github/labeler.yml:
--------------------------------------------------------------------------------
1 | CI:
2 |     - .github/**
3 | CMake:
4 |     - "**CMakeLists.txt"
5 |     - cmake/**
6 | documentation:
7 |     - "**/*.md"
8 |     - docs/**
9 | 


--------------------------------------------------------------------------------
/.github/workflows/build.yml:
--------------------------------------------------------------------------------
  1 | name: Build
  2 | 
  3 | on:
  4 |   push:
  5 |     paths-ignore:
  6 |       - 'docs/**'
  7 |       - '.github/ISSUE_TEMPLATE/**'
  8 |       - '.github/*.yml'
  9 |       - 'LICENSE'  # the repository's license file has no .txt extension
 10 |       - '*.md'
 11 |       - '*.sh'
 12 |     branches:
 13 |       - master
 14 |       - edge
 15 |   pull_request:
 16 |     branches:
 17 |       - master
 18 |       - edge
 19 | 
 20 | concurrency:
 21 |   group: ${{ github.ref }}
 22 |   cancel-in-progress: true
 23 | 
 24 | env:
 25 |   CTEST_OUTPUT_ON_FAILURE: 1
 26 | 
 27 | jobs:
 28 | 
 29 |   ubuntu_matrix:
 30 |     strategy:
 31 |       fail-fast: false
 32 |       matrix:
 33 |         os_version: ['24.04']
 34 |     name: "Ubuntu ${{ matrix.os_version }}"
 35 |     runs-on: ubuntu-${{ matrix.os_version }}
 36 |     steps:
 37 |       - uses: actions/checkout@v4
 38 |       - name: ccache
 39 |         uses: hendrikmuhs/ccache-action@v1
 40 |         with:
 41 |           key: "ccache-ubuntu_${{ matrix.os_version }}"
 42 |           max-size: 256M
 43 |       - name: "Update package database"
 44 |         run: sudo apt -q update
 45 |       - name: "install dependencies"
 46 |         run: ./scripts/install-deps.sh
 47 |       - name: "cmake"
 48 |         run: |
 49 |           cmake -S . -B build -G Ninja \
 50 |             -D CMAKE_BUILD_TYPE="RelWithDebInfo" \
 51 |             -D LIBUNICODE_BENCHMARK=ON \
 52 |             -D LIBUNICODE_TESTING=ON
 53 |       - name: "build"
 54 |         run: cmake --build build/ -- -j3
 55 |       - name: "test"
 56 |         run: ./build/src/libunicode/unicode_test
 57 | 
 58 |   # {{{ macOS
 59 |   osx:
 60 |     name: "macOS"
 61 |     runs-on: macos-14
 62 |     steps:
 63 |       - uses: actions/checkout@v4
 64 |       - name: ccache
 65 |         uses: hendrikmuhs/ccache-action@v1.2
 66 |         with:
 67 |           key: ccache-osx_qt-r1  # literal form of the old key; this job has no `set_vars` step, so that expression was always empty
 68 |           max-size: 256M
 69 |       - name: "Install dependencies"
 70 |         # Sometimes, brew thinks it needs to install from source rather than binary.
 71 |         # For Qt this may take ages (many many hours). Let's not waste our CPU credits here,
 72 |         # and limit the run time.
 73 |         timeout-minutes: 15
 74 |         run: |
 75 |           set -ex
 76 |           #brew update
 77 |           ./scripts/install-deps.sh
 78 |       - name: "Generate build files"
 79 |         run: cmake --preset macos-release
 80 |       - name: "Build"
 81 |         run: cmake --build --preset macos-release
 82 |       - name: "Test"
 83 |         run: ctest --preset macos-release
 84 |   # }}}
 85 | 
 86 |   windows:
 87 |     name: "Windows"
 88 |     runs-on: windows-latest
 89 |     steps:
 90 |       - uses: actions/checkout@v4
 91 |       - name: "vcpkg: Install dependencies"
 92 |         uses: lukka/run-vcpkg@v11.1
 93 |         id: runvcpkg
 94 |         with:
 95 |           vcpkgDirectory: ${{ runner.workspace }}/vcpkg/
 96 |           vcpkgGitCommitId: 80403036a665cb8fcc1a1b3e17593d20b03b2489
 97 |       - name: "List cmake presets"
 98 |         run: cmake --list-presets
 99 |       - name: "Generate build files"
100 |         run: cmake --preset windows-cl-release -DCMAKE_TOOLCHAIN_FILE="${{ runner.workspace }}\vcpkg\scripts\buildsystems\vcpkg.cmake"
101 |       - name: "Build"
102 |         run: cmake --build --preset windows-cl-release
103 |       - name: "test"
104 |         run: ctest --preset windows-cl-release
105 | 
106 |   Fedora:
107 |     name: Fedora
108 |     runs-on: ubuntu-24.04
109 |     container: fedora:latest
110 | 
111 |     steps:
112 |       - uses: actions/checkout@v4
113 |       - name: Install build dependencies
114 |         run: |
115 |           dnf install -y curl
116 |           PREPARE_ONLY_EMBEDS=OFF SYSDEP_ASSUME_YES=ON ./scripts/install-deps.sh
117 |           dnf install -y unicode-ucd
118 |       - name: configure
119 |         run: cmake --preset linux-gcc-debug -DLIBUNICODE_UCD_DIR=/usr/share/unicode/ucd
120 |       - name: build
121 |         run: cmake --build --preset linux-gcc-debug -j$(nproc)
122 |       - name: test
123 |         run: |
124 |           ctest --preset linux-gcc-debug
125 | 


--------------------------------------------------------------------------------
/.github/workflows/checks.yml:
--------------------------------------------------------------------------------
 1 | name: Checks
 2 | 
 3 | on:
 4 |   push:
 5 |     paths-ignore:
 6 |     - '.github/ISSUE_TEMPLATE/**'
 7 |     - '.github/*.yml'
 8 |     - 'LICENSE'  # the repository's license file has no .txt extension
 9 |     branches:
10 |     - master
11 |   pull_request:
12 |     branches:
13 |     - master
14 | 
15 | concurrency:
16 |   group: checks-${{ github.ref }}
17 |   cancel-in-progress: true
18 | 
19 | jobs:
20 |   check_PR_TODOs:
21 |     name: "Check PR-TODOs"
22 |     runs-on: ubuntu-24.04  # ubuntu-20.04 hosted runners are retired; matches build.yml
23 |     steps:
24 |     - uses: actions/checkout@v4
25 |     - name: "Checking for open PR-related TODO items"
26 |       run: |
27 |         set -ex
28 |         ./scripts/check-pr-todos.sh
29 | 
30 |   check_clang_format:
31 |     name: "Check C++ style"
32 |     runs-on: ubuntu-24.04  # ubuntu-20.04 hosted runners are retired; matches build.yml
33 |     steps:
34 |     - uses: actions/checkout@v4
35 |     - name: Install clang
36 |       run: |
37 |         wget https://apt.llvm.org/llvm.sh
38 |         chmod +x llvm.sh
39 |         sudo ./llvm.sh 18
40 |         sudo apt-get install clang-format-18
41 |     - name: "Clang-format"
42 |       run: find ./src/ -name "*.cpp" -o -name "*.h" | xargs clang-format-18 --Werror --dry-run
43 | 


--------------------------------------------------------------------------------
/.github/workflows/labeler.yml:
--------------------------------------------------------------------------------
 1 | name: "PR Labeler"
 2 | 
 3 | on:
 4 | - pull_request_target
 5 | 
 6 | jobs:
 7 |   triage:
 8 |     runs-on: ubuntu-latest
 9 |     steps:
10 |     - uses: actions/labeler@v3
11 |       with:
12 |         repo-token: "${{ secrets.GITHUB_TOKEN }}"
13 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | /build/
 2 | /out/
 3 | /_deps/
 4 | /_ucd/
 5 | /.cache/
 6 | /.clangd/
 7 | /compile_commands.json
 8 | /.vscode/
 9 | /sandbox/
10 | /target/
11 | src/libunicode/ucd.cpp
12 | src/libunicode/ucd.h
13 | src/libunicode/ucd_enums.h
14 | src/libunicode/ucd_fmt.h
15 | src/libunicode/ucd_ostream.h
16 | src/libunicode/codepoint_properties_data.cpp
17 | src/libunicode/codepoint_properties_data.h
18 | src/libunicode/codepoint_properties_names.cpp
19 | 


--------------------------------------------------------------------------------
/.vimspector.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "$schema": "https://puremourning.github.io/vimspector/schema/vimspector.schema.json#",
 3 |     "configurations": {
 4 |         "ModelTest": {
 5 |             "adapter": "vscode-cpptools",
 6 |             "configuration": {
 7 |                 "request": "launch",
 8 |                 "program": "${workspaceRoot}/build/linux-clang-debug/src/libunicode/unicode_test",
 9 |                 "args": [
10 |                 ],
11 |                 "cwd": "${workspaceRoot}",
12 |                 "externalConsole": true,
13 |                 "stopAtEntry": false,
14 |                 "MIMode": "gdb"
15 |             },
16 |             "breakpoints": {
17 |                 "exception": {
18 |                     "caught": "Y",
19 |                     "uncaught": "Y"
20 |                 }
21 |             }
22 |         }
23 |     }
24 | }
25 | 


--------------------------------------------------------------------------------
/CMakeLists.txt:
--------------------------------------------------------------------------------
  1 | cmake_minimum_required(VERSION 3.14 FATAL_ERROR)
  2 | 
  3 | list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake")
  4 | 
  5 | project(libunicode VERSION "0.6.0" LANGUAGES CXX)
  6 | 
  7 | set(MASTER_PROJECT OFF)
  8 | if(${CMAKE_CURRENT_SOURCE_DIR} STREQUAL ${CMAKE_SOURCE_DIR})
  9 |     set(MASTER_PROJECT ON)
 10 | endif()
 11 | 
 12 | if(MASTER_PROJECT AND NOT WIN32)
 13 |     set(LIBUNICODE_BUILD_STATIC_DEFAULT OFF)
 14 | else()
 15 |     set(LIBUNICODE_BUILD_STATIC_DEFAULT ON)
 16 | endif()
 17 | 
 18 | # setting defaults
 19 | if (NOT("${CMAKE_CXX_STANDARD}"))
 20 |     set(CMAKE_CXX_STANDARD 20)
 21 | endif()
 22 | set(CMAKE_CXX_STANDARD_REQUIRED ON)
 23 | set(CMAKE_CXX_EXTENSIONS OFF)
 24 | 
 25 | set(CMAKE_COLOR_DIAGNOSTICS ON)
 26 | 
 27 | if(("${CMAKE_CXX_COMPILER_ID}" MATCHES "GNU") OR ("${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang"))
 28 |     add_compile_options(-Wall)
 29 |     add_compile_options(-Wextra)
 30 |     if(NOT "${CMAKE_BUILD_TYPE}" STREQUAL "Release")  # quoted: an empty CMAKE_BUILD_TYPE must not collapse the condition
 31 |         add_definitions(-D_GLIBCXX_DEBUG)
 32 |     endif()
 33 | elseif(DEFINED MSVC)
 34 |     add_definitions(-DNOMINMAX)
 35 |     add_compile_options(/utf-8)
 36 | endif()
 37 | 
 38 | include(EnableCcache)
 39 | include(ClangTidy)
 40 | include(PedanticCompiler)
 41 | 
 42 | set(CMAKE_EXPORT_COMPILE_COMMANDS ${MASTER_PROJECT})
 43 | option(LIBUNICODE_COVERAGE "libunicode: Builds with codecov [default: OFF]" OFF)
 44 | option(LIBUNICODE_EXAMPLES "libunicode: Enables building of example programs. [default: ${MASTER_PROJECT}]" ${MASTER_PROJECT})
 45 | option(LIBUNICODE_TESTING "libunicode: Enables building of unittests for libunicode [default: ${MASTER_PROJECT}]" ${MASTER_PROJECT})
 46 | option(LIBUNICODE_BENCHMARK "libunicode: Enables building of benchmark for libunicode [default: OFF]" OFF)
 47 | option(LIBUNICODE_TOOLS "libunicode: Builds CLI tools [default: ${MASTER_PROJECT}]" ${MASTER_PROJECT})
 48 | option(LIBUNICODE_BUILD_STATIC "libunicode: provide static library instead of dynamic [default: ${LIBUNICODE_BUILD_STATIC_DEFAULT}]" ${LIBUNICODE_BUILD_STATIC_DEFAULT})
 49 | option(LIBUNICODE_TABLEGEN_FASTBUILD "libunicode: Use fast table generation (takes more memory in final tables) [default: OFF]" OFF)
 50 | 
 51 | string(TOLOWER "${CMAKE_SYSTEM_PROCESSOR}" SYSTEM_PROCESSOR_LOWER)
 52 | 
 53 | if(NOT LIBUNICODE_SIMD_IMPLEMENTATION)
 54 |     if((SYSTEM_PROCESSOR_LOWER STREQUAL "x86_64")
 55 |     OR (SYSTEM_PROCESSOR_LOWER STREQUAL "aarch64")
 56 |     OR (SYSTEM_PROCESSOR_LOWER STREQUAL "amd64")
 57 |     OR (SYSTEM_PROCESSOR_LOWER STREQUAL "arm64"))
 58 |         set(LIBUNICODE_SIMD_IMPLEMENTATION "intrinsics" CACHE STRING "libunicode: SIMD implementation to use" FORCE)
 59 |     else()
 60 |         set(LIBUNICODE_SIMD_IMPLEMENTATION "std" CACHE STRING "libunicode: SIMD implementation to use" FORCE)
 61 |     endif()
 62 |     set_property(CACHE LIBUNICODE_SIMD_IMPLEMENTATION PROPERTY STRINGS "std" "intrinsics" "none")
 63 | endif()
 64 | 
 65 | set(BENCHMARK_ENABLE_TESTING OFF CACHE BOOL "Enable testing of the benchmark library." FORCE)
 66 | include(ThirdParties)
 67 | 
 68 | if(LIBUNICODE_TESTING)
 69 |     enable_testing()
 70 | endif()
 71 | 
 72 | # ----------------------------------------------------------------------------
 73 | set(LIBUNICODE_UCD_VERSION "16.0.0" CACHE STRING "libunicode: Unicode version")
 74 | set(LIBUNICODE_UCD_BASE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/_ucd" CACHE PATH "Path to directory for downloaded files & extracted directories.")
 75 | 
 76 | set(LIBUNICODE_UCD_ZIP_DOWNLOAD_URL "https://www.unicode.org/Public/${LIBUNICODE_UCD_VERSION}/ucd/UCD.zip")
 77 | set(LIBUNICODE_UCD_MD5 "bdd823cbd37c376633d6737a12281233")
 78 | set(LIBUNICODE_UCD_ZIP_FILE "${LIBUNICODE_UCD_BASE_DIR}/ucd-${LIBUNICODE_UCD_VERSION}.zip")
 79 | set(LIBUNICODE_UCD_DIR "${LIBUNICODE_UCD_BASE_DIR}/ucd-${LIBUNICODE_UCD_VERSION}" CACHE PATH "Path to UCD directory.")
 80 | 
 81 | # ----------------------------------------------------------------------------
 82 | # code coverage
 83 | 
 84 | if(LIBUNICODE_COVERAGE AND NOT MSVC)
 85 |     add_compile_options(-g --coverage)
 86 |     set(CMAKE_EXE_LINKER_FLAGS "--coverage ${CMAKE_EXE_LINKER_FLAGS}")
 87 |     message("-- [code coverage] Enabled.")
 88 | else()
 89 |     message("-- [code coverage] Disabled.")
 90 | endif()
 91 | 
 92 | # ----------------------------------------------------------------------------
 93 | 
 94 | add_subdirectory(src/libunicode)
 95 | add_subdirectory(src/tools)
 96 | 
 97 | if("${CCACHE}" STREQUAL "")
 98 |     set(USING_CCACHE_STRING "OFF")
 99 | else()
100 |     set(USING_CCACHE_STRING "${CCACHE}")
101 | endif()
102 | 
103 | if(LIBUNICODE_BUILD_STATIC)
104 |     set(LIBUNICODE_BUILD_MODE "static")
105 | else()
106 |     set(LIBUNICODE_BUILD_MODE "dynamic")
107 | endif()
108 | 
109 | # Export the cmake package to the cmake package registry (~/.cmake/packages/)
110 | export(PACKAGE libunicode)
111 | 
112 | message(STATUS "------------------------------------------------------------------------------")
113 | message(STATUS "    libunicode (version ${libunicode_VERSION}${libunicode_VERSION_SUFFIX})")
114 | message(STATUS "------------------------------------------------------------------------------")
115 | message(STATUS "Build type:                  ${CMAKE_BUILD_TYPE}")
116 | message(STATUS "Build mode:                  ${LIBUNICODE_BUILD_MODE}")
117 | message(STATUS "Build unit tests:            ${LIBUNICODE_TESTING}")
118 | message(STATUS "Build benchmark:             ${LIBUNICODE_BENCHMARK}")
119 | message(STATUS "Build tools:                 ${LIBUNICODE_TOOLS}")
120 | message(STATUS "Enable tablegen fast build:  ${LIBUNICODE_TABLEGEN_FASTBUILD}")
121 | message(STATUS "Using ccache:                ${USING_CCACHE_STRING}")
122 | message(STATUS "SIMD support:                ${LIBUNICODE_SIMD_IMPLEMENTATION}")
123 | message(STATUS "Using UCD directory:         ${LIBUNICODE_UCD_DIR}")
124 | message(STATUS "Enable clang-tidy:           ${ENABLE_TIDY} (${CMAKE_CXX_CLANG_TIDY})")
125 | message(STATUS "------------------------------------------------------------------------------")
126 | 
127 | ThirdPartiesSummary2()
128 | 


--------------------------------------------------------------------------------
/CMakePresets.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "version": 6,
 3 |     "cmakeMinimumRequired": {
 4 |         "major": 3,
 5 |         "minor": 27,
 6 |         "patch": 0
 7 |     },
 8 |     "include": [
 9 |         "cmake/presets/os-linux.json",
10 |         "cmake/presets/os-macos.json",
11 |         "cmake/presets/os-windows.json"
12 |     ]
13 | }
14 | 


--------------------------------------------------------------------------------
/Changelog.md:
--------------------------------------------------------------------------------
 1 | ## 0.5.0 (unreleased)
 2 | 
 3 | - Show emoji presentation in Unicode properties query tool.
 4 | 
 5 | ## 0.4.0 (2023-11-27)
 6 | 
 7 | - Fix UTF-8 decoding of incomplete UTF-8 multibyte sequences to properly report `Invalid`.
 8 | - Change signature of `inline from_utf8(string_view const&)` slightly by dropping its cref.
 9 | - Move `scan_result.next` to `scan_state.next`.
10 | 
11 | ## 0.3.0 (2023-03-01)
12 | 
13 | - Fixes build error on GCC 13.
14 | - Fixes properly stopping at control characters in complex sub-state in scan API.
15 | - Fixes successful processing invalid UTF-8 in scan API.
16 | - Fixes installing missing headers for use of this API as non-embedded library.
17 | - Changes project and include directory from `unicode` to `libunicode` to avoid include path conflict with `ICU`.
18 | - Adds compile time option to either build static or dynamic binaries (`LIBUNICODE_BUILD_STATIC`).
19 | - Adds SONAME version to libraries.
20 | 
21 | ## 0.2.1 (2023-02-14)
22 | 
23 | - Fixes unicode-query's output for "character width".
24 | - Fixes decoding invalid UTF-8 locking up.
25 | - Fixes stage1 multistage-table sizes, reducing memory footprint a bit.
26 | - Adds SIMD implementation for scan API on ARM64 (NEON).
27 | - unicode-query is now linked statically on UNIX platforms.
28 | 
29 | ## 0.2.0 (2022-11-13)
30 | 
31 | - Slightly improve performance of grapheme cluster segmentation.
32 | - Fixes grapheme cluster segmentation of multiple consecutive regional flags.
33 | - Add access to Age property of a codepoint (giving information about at which Unicode version a codepoint was introduced).
34 | - Add access to the assigned name of a codepoint.
35 | - unicode-query: Now also prints name and age properties.
36 | - CMake install target also installs header and library (not just tools).
37 | - Reduce number of dependencies down to fmtlib and (for unit tests) Catch2.
38 | - Enables libunicode to be found via CMake's `find_package()`.
39 | - Improved default installation directories on UNIX via GNUInstallDirs helper.
40 | - Enable compiling on ARM64.
41 | 
42 | ## 0.1.0 (2022-11-03)
43 | 
44 | While version 0.1.0 sounds like a small number, this project has been out there for quite some years already
45 | and actively used by Contour Terminal.
46 | 
47 | The biggest movements lately are major performance improvements in accessing Unicode properties,
48 | fixing bugs as usual, and apart from being a modern C++ Unicode library, we've now also added
49 | a command line tool to query Unicode properties in the hope it'll be useful to you.
50 | 
51 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | [![C++20](https://img.shields.io/badge/standard-C%2B%2B%2020-blue.svg?logo=C%2B%2B)](https://isocpp.org/)
 2 | [![CI Build](https://github.com/contour-terminal/libunicode/workflows/Build/badge.svg)](https://github.com/contour-terminal/libunicode/actions?query=workflow%3ABuild)
 3 | 
 4 | # Modern C++20 Unicode Library
 5 | 
 6 | The goal of this library is to bring painless unicode support to C++ with simple and easy to understand APIs.
 7 | 
 8 | The API naming conventions are chosen to look familiar to those using the C++ standard library.
 9 | 
10 | ### Feature Overview
11 | 
12 | - [x] API for accessing UCD properties
13 | - [x] UTF8 <-> UTF32 conversion
14 | - [x] wcwidth equivalent (`int unicode::width(char32_t)`)
15 | - [x] grapheme segmentation (UTS algorithm)
16 | - [x] symbol/emoji segmentation (UTS algorithm)
17 | - [x] script segmentation [UTS 24](https://unicode.org/reports/tr24/)
18 | - [x] unit tests for most parts (wcwidth / segmentation)
19 | - [x] generic text run segmentation (top level segmentation API suitable for text shaping implementations)
20 | - [ ] word segmentation (UTS algorithm)
21 | - [x] CLI tool: `uc-inspect` for inspecting input files by code point properties, grapheme cluster, word, script, ...
22 | 
23 | # Unicode Technical Specifications
24 | 
25 | - [UTS 11](https://unicode.org/reports/tr11/) - character width
26 | - [UTS 24](https://unicode.org/reports/tr24/) - script property
27 | - [UTS 29](https://unicode.org/reports/tr29/) - text segmentation (grapheme cluster, word boundary)
28 | - [UTS 51](https://unicode.org/reports/tr51/) - Emoji
29 | 
30 | ### Integrate with your CMake project
31 | 
32 | ```sh
33 | git submodule add --name libunicode https://github.com/contour-terminal/libunicode 3rdparty/libunicode
34 | ```
35 | 
36 | ```cmake
37 | add_subdirectory(3rdparty/libunicode)
38 | 
39 | add_executable(your_tool your_tool.cpp)
40 | target_link_libraries(your_tool PRIVATE unicode::unicode)
41 | ```
42 | 
43 | ### Contributing
44 | 
45 | - for filing issues please visit: https://github.com/contour-terminal/libunicode/issues
46 | - fork and create pull requests:  https://github.com/contour-terminal/libunicode/pulls
47 | - I am also happy to just receive code reviews
48 | - you can help with documentation, or
49 | - general feedback is also very welcome
50 | 
51 | ### Users of this library
52 | 
53 | * [Contour Terminal Emulator](https://github.com/contour-terminal/contour/)
54 | 
55 | ### Disclaimer
56 | 
57 | This library is -in terms of features- by no means competitive with the ICU library, but it attempts to
58 | provide a clean and intuitive modern C++ API for those that do not want to fight legacy-style C APIs.
59 | 
60 | I hope that over time we can add more and more features to this library to conform to the Unicode
61 | specification eventually at some point and I welcome everyone to contribute to it by forking the
62 | library, creating pull requests, or even just constructive feedback.
63 | 
64 | ### License
65 | 
66 | ```
67 | libunicode - a modern C++20 unicode library
68 | -------------------------------------------
69 | 
70 | Licensed under the Apache License, Version 2.0 (the "License");
71 | you may not use this file except in compliance with the License.
72 | 
73 | Unless required by applicable law or agreed to in writing, software
74 | distributed under the License is distributed on an "AS IS" BASIS,
75 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
76 | See the License for the specific language governing permissions and
77 | limitations under the License.
78 | ```
79 | 


--------------------------------------------------------------------------------
/TODO.md:
--------------------------------------------------------------------------------
 1 | 
 2 | # TODO
 3 | 
 4 | - [ ] rewrite test functions from `bool foo(x) ...` to `bool is_foo(x) ...`
 5 | - [ ] all about emoji flag sequences
 6 | - [ ] `bool is_mirrored(char32_t) noexcept` (such as parenthesis, curly braces, brackets, ...)
 7 |   - also ability to get the mirroring codepoint
 8 | - [ ] map codepoint to block (enum) - see Blocks.txt
 9 | - [ ] map codepoint to plane (enum)
10 | - [ ] map block to codepoint range
11 | - [ ] map plane to codepoint range
12 | - [ ] provide C API binding for basic functionality
13 | - [ ] `script_segmenter`: add support for commonPreferredScript tracking wrt brackets () [] {}.
14 | - [ ] `script_segmenter`: test "foo(λ);" -> {Latin, Greek, Latin}
15 | - [ ] `orientation_segmenter` (and integrate it into `run_segmenter` as well as its tests)
16 | - [ ] mktables: `fmtlib` integration into `ucd_fmt.h` (without actually depending on fmtlib itself)
17 | - [ ] mktables: `to_string` builder
18 | - [ ] mktables: `to_type` builder
19 | - [ ] mktables: pylint into CI
20 | - [ ] clang-tidy into CI
21 | - [ ] META: cmake install target (header files and .a file, executable)
22 | - [ ] META: pkg-config file
23 | - [ ] word segmentation (UTS algorithm)
24 | - [ ] generic text segmentation (top level segmentation API suitable for text shaping implementations)
25 | - [ ] CLI tool: unicode-inspect for inspecting input files by code point, grapheme cluster, word, script, ...
26 | - [x] unit tests for most parts (wcwidth / segmentation)
27 | - [x] README: list all TRs that are being implemented
28 | - [x] API for accessing UCD properties
29 | - [x] UTF8 <-> UTF32 conversion
30 | - [x] grapheme segmentation (UTS algorithm)
31 | - [x] symbol/emoji segmentation (UTS algorithm)
32 | - [x] wcwidth equivalent (`unicode::width(char32_t)`)
33 | - [x] script segmentation
34 | - [x] `out<T>` helper to force explicit `ref(val)` for more readability.
35 | - [x] `operator<<(ostream&, T)` for all UCD properties - in its own header file (`ucd_ostream.h`)
36 | - [x] `emoji_segmenter`: test "x 😀 y" -> {Text, Emoji, Text}
37 | - [x] make `run_segmenter` more templated / customizable
38 | - [x] mktables: `enum class` builder
39 | 
40 | ## Integration TODO
41 | 
42 | * [x] integrate into contour
43 | * [ ] see if this makes sense: make use of this library in klex lexical scanner, to allow unicode input
44 | 
45 | 


--------------------------------------------------------------------------------
/cmake/ClangTidy.cmake:
--------------------------------------------------------------------------------
 1 | 
 2 | option(ENABLE_TIDY "Enable clang-tidy [default: OFF]" OFF)
 3 | if(ENABLE_TIDY)
 4 |     find_program(CLANG_TIDY_EXE
 5 |         NAMES clang-tidy-9 clang-tidy-8 clang-tidy-7 clang-tidy
 6 |         DOC "Path to clang-tidy executable")
 7 |     if(NOT CLANG_TIDY_EXE)
 8 |         message(STATUS "[clang-tidy] Not found.")
 9 |     else()
10 |         message(STATUS "[clang-tidy] found: ${CLANG_TIDY_EXE}")
11 |         set(CMAKE_CXX_CLANG_TIDY "${CLANG_TIDY_EXE}")
12 |     endif()
13 | else()
14 |     message(STATUS "[clang-tidy] Disabled.")
15 | endif()
16 | 


--------------------------------------------------------------------------------
/cmake/EnableCcache.cmake:
--------------------------------------------------------------------------------
 1 | # Setup ccache.
 2 | #
 3 | # The ccache is auto-enabled if the tool is found.
 4 | # To disable set -DCCACHE=OFF option.
 5 | if(NOT DEFINED CMAKE_CXX_COMPILER_LAUNCHER)
 6 |     find_program(CCACHE ccache DOC "ccache tool path; set to OFF to disable")
 7 |     if(CCACHE)
 8 |         set(CMAKE_CXX_COMPILER_LAUNCHER ${CCACHE})
 9 |         if(COMMAND cotire)
10 |             # Change ccache config to meet cotire requirements.
11 |             set(ENV{CCACHE_SLOPPINESS} pch_defines,time_macros)
12 |         endif()
13 |         message(STATUS "[ccache] Enabled: ${CCACHE}")
14 |     else()
15 |         message(STATUS "[ccache] Disabled.")
16 |     endif()
17 | endif()
18 | 


--------------------------------------------------------------------------------
/cmake/PedanticCompiler.cmake:
--------------------------------------------------------------------------------
 1 | include(CheckCXXCompilerFlag)
 2 | function(try_add_compile_options FLAG)
 3 |     # Derive the name of the check's result variable: remove the leading - or / from the flag.
 4 |     string(REGEX REPLACE "^[-/]" "" name ${FLAG})
 5 |     # Delete any ':' because it is invalid in variable names.
 6 |     string(REGEX REPLACE ":" "" name ${name})
 7 |     check_cxx_compiler_flag(${FLAG} ${name})
 8 |     if(${name})
 9 |         message(STATUS "Adding compiler flag: ${FLAG}.")
10 |         add_compile_options(${FLAG})
11 |     else()
12 |         message(STATUS "Adding compiler flag: ${FLAG} failed.")
13 |     endif()
14 | 
15 |     # Optional 2nd argument names a caller variable that receives the check outcome; test ARGC because ARGV1 is unreliable when absent.
16 |     if(ARGC GREATER 1)
17 |         set(${ARGV1} "${${name}}" PARENT_SCOPE)
18 |     endif()
19 | endfunction()
20 | 
21 | option(PEDANTIC_COMPILER "Compile the project with almost all warnings turned on." ON)
22 | option(PEDANTIC_COMPILER_WERROR "Enables -Werror to force warnings to be treated as errors." OFF)
23 | 
24 | # Always show diagnostics in colored output.
25 | try_add_compile_options(-fdiagnostics-color=always)
26 | 
27 | if(${PEDANTIC_COMPILER})
28 |     if(("${CMAKE_CXX_COMPILER_ID}" MATCHES "GNU") OR ("${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang"))
29 |         message(STATUS "Enabling pedantic compiler options: yes")
30 |         # TODO: check https://github.com/lefticus/cppbestpractices/blob/master/02-Use_the_Tools_Available.md#compilers
31 |         try_add_compile_options(-Qunused-arguments)
32 |         try_add_compile_options(-Wall)
33 |         try_add_compile_options(-Wconversion)
34 |         try_add_compile_options(-Wduplicate-enum)
35 |         try_add_compile_options(-Wduplicated-cond)
36 |         try_add_compile_options(-Wextra)
37 |         try_add_compile_options(-Wextra-semi)
38 |         try_add_compile_options(-Wfinal-dtor-non-final-class)
39 |         try_add_compile_options(-Wimplicit-fallthrough)
40 |         try_add_compile_options(-Wlogical-op)
41 |         try_add_compile_options(-Wmissing-declarations)
42 |         try_add_compile_options(-Wnewline-eof)
43 |         try_add_compile_options(-Wno-unknown-attributes)
44 |         try_add_compile_options(-Wno-unknown-pragmas)
45 |         try_add_compile_options(-Wnull-dereference)
46 |         try_add_compile_options(-Wpessimizing-move)
47 |         try_add_compile_options(-Wredundant-move)
48 |         try_add_compile_options(-Wsign-conversion)
49 |         try_add_compile_options(-Wsuggest-destructor-override)
50 |         try_add_compile_options(-pedantic)
51 |     else()
52 |         message(STATUS "Enabling pedantic compiler options: unsupported platform")
53 |     endif()
54 | else()
55 |     message(STATUS "Enabling pedantic compiler options: no")
56 | endif()
57 | 
58 | if(${PEDANTIC_COMPILER_WERROR})
59 |     try_add_compile_options(-Werror) # XXX Not yet, but hopefully soon.
60 | 
61 |     # Not sure how to work around these.
62 |     try_add_compile_options(-Wno-error=class-memaccess)
63 |     try_add_compile_options(-Wno-class-memaccess)
64 | 
65 |     # TODO: Should be addressed.
66 |     try_add_compile_options(-Wno-error=missing-declarations)
67 |     try_add_compile_options(-Wno-missing-declarations)
68 | endif()
69 | 


--------------------------------------------------------------------------------
/cmake/ThirdParties.cmake:
--------------------------------------------------------------------------------
 1 | # This directory structure is being created by `scripts/install-deps.sh`
 2 | # and is used to inject all the dependencies the operating system's
 3 | # package manager did not provide (not found or too old version).
 4 | 
 5 | if(EXISTS ${PROJECT_SOURCE_DIR}/_deps/sources/CMakeLists.txt)
 6 |     message(STATUS "Embedding 3rdparty libraries ...")
 7 |     add_subdirectory(${PROJECT_SOURCE_DIR}/_deps/sources)
 8 | endif()
 9 | 
10 | set(LIST ThirdParties)
11 | macro(Thirdparty_Include_If_MIssing _TARGET _PACKAGE_NAME)
12 |     if(${_PACKAGE_NAME} STREQUAL "")
13 |         set(${_PACKAGE_NAME} ${_TARGET})
14 |     endif()
15 |     if (NOT TARGET ${_TARGET})
16 |         find_package(${_PACKAGE_NAME} REQUIRED)
17 |         list(APPEND ThirdParties ${_TARGET}_SYSDEP)
18 |         set(THIRDPARTY_BUILTIN_${_TARGET} "system package")
19 |     else()
20 |         list(APPEND ThirdParties ${_TARGET}_EMBED)
21 |         set(THIRDPARTY_BUILTIN_${_TARGET} "embedded")
22 |     endif()
23 | endmacro()
24 | 
25 | # TODO make me working
26 | macro(ThirdPartiesSummary)
27 |     message(STATUS "==============================================================================")
28 |     message(STATUS "    ThirdParties")
29 |     message(STATUS "------------------------------------------------------------------------------")
30 |     foreach(TP ${ThirdParties})
31 |         message(STATUS "${TP}\t\t${THIRDPARTY_BUILTIN_${TP}}")
32 |     endforeach()
33 | endmacro()
34 | 
35 | # Now, conditionally find all dependencies that were not included above
36 | # via find_package, usually system installed packages.
37 | 
38 | if(LIBUNICODE_TESTING)
39 |     if(TARGET Catch2::Catch2WithMain)
40 |         set(THIRDPARTY_BUILTIN_Catch2 "embedded")
41 |     else()
42 |         find_package(Catch2 REQUIRED)
43 |         set(THIRDPARTY_BUILTIN_Catch2 "system package")
44 |     endif()
45 | endif()
46 | 
47 | if(LIBUNICODE_BENCHMARK)
48 |     if(TARGET benchmark::benchmark_main)
49 |         set(THIRDPARTY_BUILTIN_benchmark "embedded")
50 |     else()
51 |         find_package(benchmark REQUIRED)
52 |         set(THIRDPARTY_BUILTIN_benchmark "system package")
53 |     endif()
54 | endif()
55 | 
56 | 
57 | macro(ThirdPartiesSummary2)
58 |     message(STATUS "==============================================================================")
59 |     message(STATUS "    ThirdParties")
60 |     message(STATUS "------------------------------------------------------------------------------")
61 | if(LIBUNICODE_TESTING)
62 |     message(STATUS "Catch2              ${THIRDPARTY_BUILTIN_Catch2}")
63 | endif()
64 | if(LIBUNICODE_BENCHMARK)
65 |     message(STATUS "Benchmark           ${THIRDPARTY_BUILTIN_benchmark}")
66 | endif()
67 |     message(STATUS "------------------------------------------------------------------------------")
68 | endmacro()
69 | 


--------------------------------------------------------------------------------
/cmake/presets/common.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "version": 6,
 3 |     "configurePresets": [
 4 |         { "name": "debug", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "Debug", "LIBUNICODE_TABLEGEN_FASTBUILD": "ON" } },
 5 |         { "name": "release", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "RelWithDebInfo" } },
 6 |         { "name": "arch-native", "hidden": true, "cacheVariables": { "CMAKE_CXX_FLAGS": "-march=native" } },
 7 |         { "name": "clang", "hidden": true, "cacheVariables": { "CMAKE_CXX_COMPILER": "clang++" } },
 8 |         { "name": "gcc", "hidden": true, "cacheVariables": { "CMAKE_CXX_COMPILER": "g++" } },
 9 |         {
10 |             "name": "libunicode-common",
11 |             "hidden": true,
12 |             "binaryDir": "${sourceDir}/build/${presetName}",
13 |             "cacheVariables": {
14 |                 "LIBUNICODE_BENCHMARK": "ON",
15 |                 "LIBUNICODE_TESTING": "ON",
16 |                 "PEDANTIC_COMPILER": "ON",
17 |                 "PEDANTIC_COMPILER_WERROR": "ON"
18 |             }
19 |         }
20 |     ]
21 | }
22 | 


--------------------------------------------------------------------------------
/cmake/presets/os-linux.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "version": 6,
 3 |     "include": [ "common.json" ],
 4 |     "configurePresets": [
 5 |         {
 6 |             "name": "linux-common",
 7 |             "inherits": "libunicode-common",
 8 |             "generator": "Ninja",
 9 |             "hidden": true,
10 |             "condition": {
11 |                 "type": "equals",
12 |                 "lhs": "${hostSystemName}",
13 |                 "rhs": "Linux"
14 |             }
15 |         },
16 |         {
17 |             "name": "linux-clang-debug",
18 |             "displayName": "Linux (Clang) Debug",
19 |             "inherits": ["linux-common", "debug", "clang"]
20 |         },
21 |         {
22 |             "name": "linux-clang-release",
23 |             "displayName": "Linux (Clang) Release",
24 |             "inherits": ["linux-common", "release", "clang"]
25 |         },
26 |         {
27 |             "name": "linux-gcc-debug",
28 |             "displayName": "Linux (GCC) Debug",
29 |             "inherits": ["linux-common", "debug", "gcc"]
30 |         },
31 |         {
32 |             "name": "linux-gcc-release",
33 |             "displayName": "Linux (GCC) Release",
34 |             "inherits": ["linux-common", "release", "gcc"]
35 |         },
36 |         {
37 |             "name": "linux-native-clang-release",
38 |             "displayName": "Linux (Clang, Native arch, Release)",
39 |             "inherits": ["linux-common", "release", "arch-native", "clang"]
40 |         },
41 |         {
42 |             "name": "linux-native-gcc-release",
43 |             "displayName": "Linux (GCC, Native arch, Release)",
44 |             "inherits": ["linux-common", "release", "arch-native", "gcc"]
45 |         }
46 |     ],
47 |     "buildPresets": [
48 |         { "name": "linux-clang-debug", "configurePreset": "linux-clang-debug" },
49 |         { "name": "linux-clang-release", "configurePreset": "linux-clang-release" },
50 |         { "name": "linux-gcc-debug", "configurePreset": "linux-gcc-debug" },
51 |         { "name": "linux-gcc-release", "configurePreset": "linux-gcc-release" },
52 |         { "name": "linux-native-clang-release", "configurePreset": "linux-native-clang-release" },
53 |         { "name": "linux-native-gcc-release", "configurePreset": "linux-native-gcc-release" }
54 |     ],
55 |     "testPresets": [
56 |         { "name": "linux-clang-debug", "configurePreset": "linux-clang-debug", "output": {"outputOnFailure": true}, "execution": { "noTestsAction": "error", "stopOnFailure": true } },
57 |         { "name": "linux-clang-release", "configurePreset": "linux-clang-release", "output": {"outputOnFailure": true}, "execution": { "noTestsAction": "error", "stopOnFailure": true } },
58 |         { "name": "linux-gcc-debug", "configurePreset": "linux-gcc-debug", "output": {"outputOnFailure": true}, "execution": { "noTestsAction": "error", "stopOnFailure": true } },
59 |         { "name": "linux-gcc-release", "configurePreset": "linux-gcc-release", "output": {"outputOnFailure": true}, "execution": { "noTestsAction": "error", "stopOnFailure": true } }
60 |     ]
61 | }
62 | 


--------------------------------------------------------------------------------
/cmake/presets/os-macos.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "version": 6,
 3 |     "include": [ "common.json" ],
 4 |     "configurePresets": [
 5 |         {
 6 |             "name": "macos-common",
 7 |             "inherits": "libunicode-common",
 8 |             "generator": "Ninja",
 9 |             "hidden": true,
10 |             "condition": {
11 |                 "type": "equals",
12 |                 "lhs": "${hostSystemName}",
13 |                 "rhs": "Darwin"
14 |             }
15 |         },
16 |         { "name": "macos-debug", "displayName": "MacOS Debug", "inherits": ["macos-common", "debug"] },
17 |         { "name": "macos-release", "displayName": "MacOS Release", "inherits": ["macos-common", "release"] }
18 |     ],
19 |     "buildPresets": [
20 |         { "name": "macos-debug", "configurePreset": "macos-debug" },
21 |         { "name": "macos-release", "configurePreset": "macos-release" }
22 |     ],
23 |     "testPresets": [
24 |         { "name": "macos-debug", "configurePreset": "macos-debug", "output": {"outputOnFailure": true}, "execution": { "noTestsAction": "error", "stopOnFailure": true } },
25 |         { "name": "macos-release", "configurePreset": "macos-release", "output": {"outputOnFailure": true}, "execution": { "noTestsAction": "error", "stopOnFailure": true } }
26 |     ]
27 | }
28 | 


--------------------------------------------------------------------------------
/cmake/presets/os-windows.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "version": 6,
 3 |     "include": [ "common.json" ],
 4 |     "configurePresets": [
 5 |         {
 6 |             "name": "windows-common",
 7 |             "inherits": "libunicode-common",
 8 |             "displayName": "Windows - common settings",
 9 |             "hidden": true,
10 |             "binaryDir": "${sourceDir}/out/build/${presetName}",
11 |             "condition": {
12 |                 "type": "equals",
13 |                 "lhs": "${hostSystemName}",
14 |                 "rhs": "Windows"
15 |             },
16 |             "cacheVariables": {
17 |                 "VCPKG_TARGET_TRIPLET": "x64-windows",
18 |                 "CMAKE_INSTALL_PREFIX": "${sourceDir}/out/install/${presetName}",
19 |                 "CMAKE_VERBOSE_MAKEFILE": "ON",
20 |                 "CMAKE_TOOLCHAIN_FILE": "${sourceDir}/../vcpkg/scripts/buildsystems/vcpkg.cmake"
21 |             }
22 |         },
23 |         { "name": "windows-cl-debug", "inherits": ["windows-common", "debug"], "displayName": "Windows (MSVC) Debug", "description": "Using MSVC compiler (64-bit)" },
24 |         { "name": "windows-cl-release", "inherits": ["windows-common", "release"], "displayName": "Windows (MSVC) Release", "description": "Using MSVC compiler (64-bit)" },
25 |         { "name": "windows-clang-common", "inherits": ["windows-common"], "hidden": true, "toolset":  "ClangCL,host=x64" },
26 |         { "name": "windows-clang-debug", "inherits": ["windows-clang-common", "debug"], "displayName": "Windows (ClangCL) Debug", "description": "Using Clang compiler (64-bit)" },
27 |         { "name": "windows-clang-release", "inherits": ["windows-clang-common", "release"], "displayName": "Windows (ClangCL) Release", "description": "Using Clang compiler (64-bit)" }
28 |     ],
29 |     "buildPresets": [
30 |         { "name": "windows-cl-debug", "displayName": "x64 (MSVC) Debug", "configurePreset": "windows-cl-debug", "configuration": "Debug" },
31 |         { "name": "windows-cl-release", "displayName": "x64 (MSVC) RelWithDebInfo", "configurePreset": "windows-cl-release", "configuration": "RelWithDebInfo" },
32 |         { "name": "windows-clang-debug", "displayName": "x64 (Clang) Debug", "configurePreset": "windows-clang-debug", "configuration": "Debug" },
33 |         { "name": "windows-clang-release", "displayName": "x64 (Clang) RelWithDebInfo", "configurePreset": "windows-clang-release", "configuration": "RelWithDebInfo" }
34 |     ],
35 |     "testPresets": [
36 |         { "name": "windows-cl-debug", "configurePreset": "windows-cl-debug", "output": {"outputOnFailure": true}, "execution": { "noTestsAction": "error", "stopOnFailure": true } },
37 |         { "name": "windows-cl-release", "configurePreset": "windows-cl-release", "output": {"outputOnFailure": true}, "execution": { "noTestsAction": "error", "stopOnFailure": true } }
38 |     ]
39 | }
40 | 


--------------------------------------------------------------------------------
/scripts/check-pr-todos.sh:
--------------------------------------------------------------------------------
 1 | #! /bin/bash
 2 | FOUND=$(git grep "TODO(pr)" | grep -v "scripts/check-pr-todos.sh")  # drop hits in this script, which names the marker itself
 3 | if [[ "${FOUND}" == "" ]]; then
 4 |     exit 0
 5 | fi
 6 | # Markers remain in tracked files: list them and fail.
 7 | echo "This PR still contains PR-related TODO items that must be resolved."
 8 | echo
 9 | echo "${FOUND}"
10 | exit 1
11 | 


--------------------------------------------------------------------------------
/scripts/install-deps.ps1:
--------------------------------------------------------------------------------
 1 | #! /usr/bin/env pwsh
 2 | 
 3 | # Let's assume for now, that this script is only invoked from within Windows
 4 | # But in the future, I'd like it to support all the others, too.
 5 | 
 6 | class ThirdParty
 7 | {
 8 |     [ValidateNotNullOrEmpty()] [string] $Folder
 9 |     [ValidateNotNullOrEmpty()] [string] $Archive
10 |     [ValidateNotNullOrEmpty()] [string] $URI
11 | }
12 | 
13 | # Take care, order matters, at least as much as dependencies are of concern.
14 | $ThirdParties =
15 | @(
16 |     [ThirdParty]@{
17 |         Folder="Catch2-3.4.0";
18 |         Archive="Catch2-3.4.0.zip";
19 |         URI="https://github.com/catchorg/Catch2/archive/refs/tags/v3.4.0.zip"
20 |     };
21 | )
22 | 
23 | function Fetch-And-Add
24 | {
25 |     param (
26 |         [Parameter(Mandatory)] [string] $Target,
27 |         [Parameter(Mandatory)] [string] $Folder,
28 |         [Parameter(Mandatory)] [string] $Archive,
29 |         [Parameter(Mandatory)] [string] $URI,
30 |         [Parameter(Mandatory)] [string] $CMakeListsFile
31 |     )
32 | 
33 |     $DistfilesDir = "${Target}/distfiles"
34 |     if (! [System.IO.Directory]::Exists($DistfilesDir))
35 |     {
36 |         New-Item -ItemType Directory -Force -Path $DistfilesDir
37 |     }
38 | 
39 |     $ArchivePath = "${DistfilesDir}/${Archive}"
40 |     if (! [System.IO.File]::Exists($ArchivePath))
41 |     {
42 |         Write-Host "Downloading $Archive to $ArchivePath"
43 |         Invoke-WebRequest -Uri $URI -OutFile $ArchivePath
44 |     }
45 |     else
46 |     {
47 |         Write-Host "Already there: $ArchivePath"
48 |     }
49 | 
50 |     if (! [System.IO.Directory]::Exists("$Target/sources/$Folder"))
51 |     {
52 |         Write-Host "Populating ${Folder}"
53 |         Expand-Archive $ArchivePath -DestinationPath "${Target}/sources/"
54 |     }
55 |     else
56 |     {
57 |         Write-Host "Already there ${Folder}"
58 |     }
59 | 
60 |     Add-Content $CMakeListsFile "add_subdirectory(${Folder} EXCLUDE_FROM_ALL)"
61 | }
62 | 
63 | function Run
64 | {
65 |     # Every path is derived from the directory that contains this script.
66 |     $RootDir = "${PSScriptRoot}/.."
67 |     $DepsDir = "${RootDir}/_deps"
68 |     $SourcesDir = "${DepsDir}/sources"
69 |     $CMakeListsFile = "${SourcesDir}/CMakeLists.txt"
70 | 
71 |     # Create the distfiles directory first, then the sources directory, when missing.
72 |     foreach ($Dir in @("${DepsDir}/distfiles", $SourcesDir))
73 |     {
74 |         if (-not [System.IO.Directory]::Exists($Dir))
75 |         {
76 |             New-Item -ItemType Directory -Force -Path $Dir
77 |         }
78 |     }
79 | 
80 |     # Empty an existing CMakeLists.txt; Fetch-And-Add appends one line per entry.
81 |     if ([System.IO.File]::Exists($CMakeListsFile))
82 |     {
83 |         Clear-Content $CMakeListsFile
84 |     }
85 | 
86 |     foreach ($Dep in $ThirdParties)
87 |     {
88 |         Fetch-And-Add -Folder $Dep.Folder -Archive $Dep.Archive -URI $Dep.URI -Target $DepsDir -CMakeListsFile $CMakeListsFile
89 |     }
90 | }
91 | 
92 | Run
93 | 


--------------------------------------------------------------------------------
/scripts/install-deps.sh:
--------------------------------------------------------------------------------
  1 | #! /bin/sh
  2 | 
  3 | set -ex
  4 | 
  5 | # Special environment variable to be used when only fetching and extracting
  6 | # embedded dependencies should be done, i.e. no system package manager is
  7 | # being invoked.
  8 | #
  9 | # set this as environment variable to ON to activate this mode.
 10 | if [ x$PREPARE_ONLY_EMBEDS = x ]
 11 | then
 12 |     PREPARE_ONLY_EMBEDS=OFF
 13 | fi
 14 | 
 15 | # if SYSDEP_ASSUME_YES=ON is set, then system package managers are attempted
 16 | # to install packages automatically, i.e. without confirmation.
 17 | if [ x$SYSDEP_ASSUME_YES = xON ]
 18 | then
 19 |     SYSDEP_ASSUME_YES='-y'
 20 | else
 21 |     unset SYSDEP_ASSUME_YES
 22 | fi
 23 | 
 24 | # {{{ sysdeps fetcher and unpacker for deps that aren't available via sys pkg mgnr
 25 | SYSDEPS_BASE_DIR="$(dirname $0)/../_deps"
 26 | 
 27 | SYSDEPS_DIST_DIR="$SYSDEPS_BASE_DIR/distfiles"
 28 | SYSDEPS_SRC_DIR="$SYSDEPS_BASE_DIR/sources"
 29 | SYSDEPS_CMAKE_FILE="$SYSDEPS_SRC_DIR/CMakeLists.txt"
 30 | 
 31 | fetch_and_unpack()
 32 | {
 33 |     NAME=$1      # directory name the archive unpacks to below $SYSDEPS_SRC_DIR
 34 |     DISTFILE=$2  # archive file name, cached in $SYSDEPS_DIST_DIR
 35 |     URL=$3       # location the archive is downloaded from
 36 |     # Download the archive unless it is cached, unpack it unless already extracted, then register it for CMake.
 37 |     FULL_DISTFILE="$SYSDEPS_DIST_DIR/$DISTFILE"
 38 |     # Probe tools with `command -v`: `cmd &>/dev/null` is a bashism that plain sh runs as a background job, so the test always passed.
 39 |     if ! test -f "$FULL_DISTFILE"; then
 40 |         if command -v curl >/dev/null 2>&1; then
 41 |             curl -L -o "$FULL_DISTFILE" "$URL"
 42 |         elif command -v wget >/dev/null 2>&1; then
 43 |             wget -O "$FULL_DISTFILE" "$URL"
 44 |         elif command -v fetch >/dev/null 2>&1; then
 45 |             # FreeBSD
 46 |             fetch -o "$FULL_DISTFILE" "$URL"
 47 |         else
 48 |             echo "Don't know how to fetch from the internet." 1>&2
 49 |             exit 1
 50 |         fi
 51 |     else
 52 |         echo "Already fetched $DISTFILE. Skipping."
 53 |     fi
 54 | 
 55 |     if ! test -d "$SYSDEPS_SRC_DIR/$NAME"; then
 56 |         echo "Extracting $DISTFILE"
 57 |         tar xzpf "$FULL_DISTFILE" -C "$SYSDEPS_SRC_DIR"
 58 |     else
 59 |         echo "Already extracted $DISTFILE. Skipping."
 60 |     fi
 61 |     # Appended on every call, whether the sources were just extracted or already present.
 62 |     echo "add_subdirectory($NAME EXCLUDE_FROM_ALL)" >> "$SYSDEPS_CMAKE_FILE"
 63 | }
 64 | 
 65 | fetch_and_unpack_Catch2()
 66 | {
 67 |     fetch_and_unpack \
 68 |         Catch2-3.4.0 \
 69 |         Catch2-3.4.0.tar.gz \
 70 |         https://github.com/catchorg/Catch2/archive/refs/tags/v3.4.0.tar.gz
 71 | }
 72 | 
 73 | fetch_and_unpack_benchmark()
 74 | {
 75 |     fetch_and_unpack \
 76 |         benchmark-1.8.3 \
 77 |         benchmark-1.8.3.tar.gz \
 78 |         https://github.com/google/benchmark/archive/refs/tags/v1.8.3.tar.gz
 79 | }
 80 | 
 81 | 
 82 | prepare_fetch_and_unpack()
 83 | {
 84 |     mkdir -p "${SYSDEPS_BASE_DIR}"
 85 |     mkdir -p "${SYSDEPS_DIST_DIR}"
 86 |     mkdir -p "${SYSDEPS_SRC_DIR}"
 87 | 
 88 |     # empty out sysdeps CMakeLists.txt
 89 |     rm -f $SYSDEPS_CMAKE_FILE
 90 | }
 91 | # }}}
 92 | 
 93 | install_deps_ubuntu()
 94 | {
 95 |     local packages="
 96 |         build-essential
 97 |         cmake
 98 |         debhelper
 99 |         dpkg-dev
100 |         libc6-dev
101 |         make
102 |         ninja-build
103 |     "
104 | 
105 |     RELEASE=`grep VERSION_ID /etc/os-release | cut -d= -f2 | tr -d '"'`
106 | 
107 |     local NAME=`grep ^NAME /etc/os-release | cut -d= -f2 | cut -f1 | tr -d '"'`
108 | 
109 |     case $RELEASE in
110 |         "24.04")
111 |             fetch_and_unpack_Catch2
112 |             packages="$packages g++-14"
113 |             ;;
114 |         *)
115 |             packages="$packages g++"
116 |             packages="$packages catch2"
117 |             ;;
118 |     esac
119 | 
120 |     fetch_and_unpack_benchmark
121 | 
122 |     [ x$PREPARE_ONLY_EMBEDS = xON ] && return
123 | 
124 |     sudo apt install $SYSDEP_ASSUME_YES $packages
125 |     # sudo snap install --classic powershell
126 | }
127 | 
128 | install_deps_FreeBSD()
129 | {
130 |     fetch_and_unpack_benchmark
131 | 
132 |     [ x$PREPARE_ONLY_EMBEDS = xON ] && return
133 | 
134 |     su root -c "pkg install $SYSDEP_ASSUME_YES \
135 |         catch \
136 |         cmake \
137 |         ninja \
138 |         pkgconf \
139 |         range-v3
140 |     "
141 | }
142 | 
143 | install_deps_arch()
144 | {
145 |     fetch_and_unpack_benchmark
146 |     [ x$PREPARE_ONLY_EMBEDS = xON ] && return
147 | 
148 |     sudo pacman -S -y --needed \
149 |         catch2 \
150 |         cmake \
151 |         git \
152 |         ninja \
153 |         range-v3
154 | }
155 | 
156 | install_deps_fedora()
157 | {
158 |     version=`cat /etc/fedora-release | awk '{print $3}'`
159 | 
160 |     local packages="
161 |         catch-devel
162 |         cmake
163 |         gcc-c++
164 |         google-benchmark-devel
165 |         ninja-build
166 |         pkgconf
167 |     "
168 | 
169 |     [ x$PREPARE_ONLY_EMBEDS = xON ] && return
170 | 
171 |     sudo dnf install $SYSDEP_ASSUME_YES $packages
172 | }
173 | 
174 | 
175 | install_deps_darwin()
176 | {
177 |     fetch_and_unpack_Catch2
178 |     fetch_and_unpack_benchmark
179 | 
180 |     [ x$PREPARE_ONLY_EMBEDS = xON ] && return
181 | 
182 |     # NB: Also available in brew: mimalloc
183 |     # catch2: available in brew, but too new (version 3+)
184 |     brew install $SYSDEP_ASSUME_YES \
185 |         ninja \
186 |         pkg-config \
187 |         range-v3
188 | }
189 | 
190 | main()
191 | {
192 |     if test x$OS_OVERRIDE != x; then
193 |         # In CI, we need to be able to fetch embedd-setups for different OSes.
194 |         ID=$OS_OVERRIDE
195 |     elif test -f /etc/os-release; then
196 |         ID=`grep ^ID= /etc/os-release | cut -d= -f2`
197 |     else
198 |         ID=`uname -s`
199 |     fi
200 | 
201 |     prepare_fetch_and_unpack
202 | 
203 |     case "$ID" in
204 |         arch)
205 |             install_deps_arch
206 |             ;;
207 |         fedora)
208 |             install_deps_fedora
209 |             ;;
210 |         ubuntu|neon|debian)
211 |             install_deps_ubuntu
212 |             ;;
213 |         Darwin)
214 |             install_deps_darwin
215 |             ;;
216 |         FreeBSD)
217 |             install_deps_FreeBSD
218 |             ;;
219 |         *)
220 |             fetch_and_unpack_Catch2
221 |             fetch_and_unpack_benchmark
222 |             echo "OS $ID not supported."
223 |             echo "Dependencies were fetch manually and most likely libunicode will compile."
224 |             ;;
225 |     esac
226 | }
227 | 
228 | main $*
229 | 


--------------------------------------------------------------------------------
/src/libunicode/benchmark.cpp:
--------------------------------------------------------------------------------
 1 | #include <libunicode/convert.h>
 2 | #include <libunicode/scan.h>
 3 | #include <libunicode/utf8.h>
 4 | 
 5 | #include <string_view>
 6 | 
 7 | #include <benchmark/benchmark.h>
 8 | 
 9 | using std::string_view;
10 | 
11 | template <size_t L>
12 | static void benchmarkWithLength(benchmark::State& benchmarkState)
13 | {
14 |     auto TestText = std::string(L, 'a') + "\u00A9";
15 |     for (auto _: benchmarkState)
16 |     {
17 |         benchmark::DoNotOptimize(unicode::detail::scan_for_text_ascii(TestText, L + 10));
18 |     }
19 | }
20 | 
21 | template <size_t L>
22 | static void benchmarkWithOffset(benchmark::State& benchmarkState)
23 | {
24 |     auto TestText = std::string(L, 'a') + "\U0001F600" + std::string(1000, 'a');
25 |     for (auto _: benchmarkState)
26 |     {
27 |         benchmark::DoNotOptimize(unicode::detail::scan_for_text_ascii(TestText, L + 10));
28 |     }
29 | }
30 | 
31 | BENCHMARK(benchmarkWithLength<1>); // ASCII prefix lengths: powers of ten from 1 up to 1,000,000
32 | BENCHMARK(benchmarkWithLength<10>);
33 | BENCHMARK(benchmarkWithLength<100>);
34 | BENCHMARK(benchmarkWithLength<1000>);
35 | BENCHMARK(benchmarkWithLength<10000>);
36 | BENCHMARK(benchmarkWithLength<100000>);
37 | BENCHMARK(benchmarkWithLength<1000000>);
38 | 
39 | BENCHMARK(benchmarkWithOffset<5>); // ASCII prefix lengths before the emoji: 5 to 130 in steps of 5
40 | BENCHMARK(benchmarkWithOffset<10>);
41 | BENCHMARK(benchmarkWithOffset<15>);
42 | BENCHMARK(benchmarkWithOffset<20>);
43 | BENCHMARK(benchmarkWithOffset<25>);
44 | BENCHMARK(benchmarkWithOffset<30>);
45 | BENCHMARK(benchmarkWithOffset<35>);
46 | BENCHMARK(benchmarkWithOffset<40>);
47 | BENCHMARK(benchmarkWithOffset<45>);
48 | BENCHMARK(benchmarkWithOffset<50>);
49 | BENCHMARK(benchmarkWithOffset<55>);
50 | BENCHMARK(benchmarkWithOffset<60>);
51 | BENCHMARK(benchmarkWithOffset<65>);
52 | BENCHMARK(benchmarkWithOffset<70>);
53 | BENCHMARK(benchmarkWithOffset<75>);
54 | BENCHMARK(benchmarkWithOffset<80>);
55 | BENCHMARK(benchmarkWithOffset<85>);
56 | BENCHMARK(benchmarkWithOffset<90>);
57 | BENCHMARK(benchmarkWithOffset<95>);
58 | BENCHMARK(benchmarkWithOffset<100>);
59 | BENCHMARK(benchmarkWithOffset<105>);
60 | BENCHMARK(benchmarkWithOffset<110>);
61 | BENCHMARK(benchmarkWithOffset<115>);
62 | BENCHMARK(benchmarkWithOffset<120>);
63 | BENCHMARK(benchmarkWithOffset<125>);
64 | BENCHMARK(benchmarkWithOffset<130>);
65 | 
66 | // Program entry point provided by the benchmark library; it runs the benchmarks registered in this file.
67 | BENCHMARK_MAIN();
68 | 


--------------------------------------------------------------------------------
/src/libunicode/capi.cpp:
--------------------------------------------------------------------------------
  1 | /**
  2 |  * This file is part of the "libunicode" project
  3 |  *   Copyright (c) 2021 Christian Parpart <christian@parpart.family>
  4 |  *
  5 |  * Licensed under the Apache License, Version 2.0 (the "License");
  6 |  * you may not use this file except in compliance with the License.
  7 |  *
  8 |  * Unless required by applicable law or agreed to in writing, software
  9 |  * distributed under the License is distributed on an "AS IS" BASIS,
 10 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 11 |  * See the License for the specific language governing permissions and
 12 |  * limitations under the License.
 13 |  */
 14 | #include <libunicode/capi.h>
 15 | #include <libunicode/convert.h>
 16 | #include <libunicode/grapheme_segmenter.h>
 17 | #include <libunicode/ucd.h>
 18 | #include <libunicode/width.h>
 19 | 
 20 | #include <iterator>
 21 | 
 22 | int u32_gc_count(u32_char_t const* codepoints, size_t size)
 23 | {
 24 |     // Nothing to segment: an empty range is reported as zero clusters.
 25 |     if (size == 0)
 26 |         return 0;
 27 | 
 28 |     auto const* const first = (char32_t const*) codepoints;
 29 |     auto clusters = unicode::grapheme_segmenter(first, first + size);
 30 |     // Begin at one and add one for each advance made while codepoints remain available.
 31 |     int total = 1;
 32 |     for (; clusters.codepointsAvailable(); ++total)
 33 |         ++clusters;
 34 |     return total;
 35 | }
 36 | 
 37 | int u8_gc_count(u8_char_t const* codepoints, size_t size)
 38 | {
 39 |     auto const decoded = unicode::convert_to<char32_t>(std::string_view { codepoints, size }); // UTF-8 in, char32_t out
 40 |     return u32_gc_count(reinterpret_cast<uint32_t const*>(decoded.data()), decoded.size());
 41 | }
 42 | 
 43 | int u32_gc_width(u32_char_t const* codepoints, size_t size, int mode)
 44 | {
 45 |     int totalWidth = 0; // result: sum of the widths of all grapheme clusters visited below
 46 |     auto segmenter = unicode::grapheme_segmenter((char32_t const*) codepoints, (char32_t const*) codepoints + size);
 47 |     while (segmenter.codepointsAvailable())
 48 |     {
 49 |         auto const cluster = *segmenter;
 50 |         int thisWidth = static_cast<int>(unicode::width(cluster.front())); // base width: the cluster's first codepoint
 51 |         if (mode != GC_WIDTH_MODE_NON_MODIFIABLE)
 52 |         {
 53 |             for (size_t i = 1; i < cluster.size(); ++i) // only this cluster's trailing codepoints, not the whole input
 54 |             {
 55 |                 auto const codepoint = cluster[i];
 56 |                 auto const width = [&]() {
 57 |                     switch (codepoint)
 58 |                     {
 59 |                         case 0xFE0E: return 1; // U+FE0E VARIATION SELECTOR-15 forces width 1
 60 |                         case 0xFE0F: return 2; // U+FE0F VARIATION SELECTOR-16 forces width 2
 61 |                         default: return static_cast<int>(unicode::width(codepoint));
 62 |                     }
 63 |                 }();
 64 |                 if (width && width != thisWidth) // zero-width codepoints never change the cluster width
 65 |                     thisWidth = width;
 66 |             }
 67 |         }
 68 |         totalWidth += thisWidth;
 69 |         ++segmenter;
 70 |     }
 71 |     return totalWidth;
 72 | }
 73 | 
 74 | int u8_gc_width(u8_char_t const* codepoints, size_t count, int allowMod)
 75 | {
 76 |     (void) codepoints; // the casts to void only mark the parameters as intentionally unused
 77 |     (void) count;
 78 |     (void) allowMod;
 79 | 
 80 |     return -1; // TODO: not implemented; every call currently yields -1 regardless of the arguments
 81 | }
 82 | 
 83 | int u32_grapheme_unbreakable(u32_char_t a, u32_char_t b)
 84 | {
 85 |     return static_cast<int>(unicode::grapheme_segmenter::nonbreakable(a, b)); // C wrapper around the C++ check
 86 | }
 87 | 
 88 | struct u8u32_stream_state // definition behind the u8u32_stream_state_t handle declared in capi.h
 89 | {
 90 |     unicode::decoder<char> conv {}; // decoder that u8u32_stream_convert_run() feeds one byte at a time
 91 | };
 92 | 
 93 | u8u32_stream_state_t u8u32_stream_convert_create()
 94 | {
 95 |     return new u8u32_stream_state {}; // heap-allocated; u8u32_stream_convert_destroy() deletes it
 96 | }
 97 | 
 98 | int u8u32_stream_convert_run(u8u32_stream_state_t handle, u8_char_t input, u32_char_t* output)
 99 | {
100 |     // Feed one byte to the decoder; it only holds a value once a codepoint is available.
101 |     auto const decoded = handle->conv(static_cast<uint8_t>(input));
102 |     if (!decoded.has_value())
103 |         return 0;
104 |     *output = decoded.value();
105 |     return 1;
106 | }
107 | 
108 | void u8u32_stream_convert_destroy(u8u32_stream_state_t* handle)
109 | {
110 |     delete *handle;    // handle itself is dereferenced unconditionally, so it must not be null
111 |     *handle = nullptr; // reset the caller's variable so the stale pointer cannot be reused
112 | }
113 | 
114 | int u32u8_convert(u32_char_t const* source, size_t slen, u8_char_t* dest, size_t dlen)
115 | {
116 |     auto conv = unicode::encoder<u8_char_t> {};
117 |     auto nwritten = 0; // return value: running total of bytes stored into dest
118 | 
119 |     for (size_t i = 0; i < slen; ++i)
120 |     {
121 |         u8_char_t buf[4]; // staging area for the encoding of one codepoint
122 |         auto const bufEnd = conv(source[i], buf);
123 |         auto const bufLength = static_cast<size_t>(std::distance(buf, bufEnd));
124 |         if (bufLength > dlen) // -1 only on real overflow; exactly filling the remaining space is fine
125 |             return -1;
126 | 
127 |         for (size_t k = 0; k < bufLength; ++k)
128 |             dest[k] = buf[k];
129 |         nwritten += static_cast<int>(bufLength);
130 |         dest += bufLength; // advance the write position and shrink the remaining capacity
131 |         dlen -= bufLength;
132 |     }
133 | 
134 |     return nwritten;
135 | }
136 | 


--------------------------------------------------------------------------------
/src/libunicode/capi.h:
--------------------------------------------------------------------------------
  1 | /**
  2 |  * This file is part of the "libunicode" project
  3 |  *   Copyright (c) 2021 Christian Parpart <christian@parpart.family>
  4 |  *
  5 |  * Licensed under the Apache License, Version 2.0 (the "License");
  6 |  * you may not use this file except in compliance with the License.
  7 |  *
  8 |  * Unless required by applicable law or agreed to in writing, software
  9 |  * distributed under the License is distributed on an "AS IS" BASIS,
 10 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 11 |  * See the License for the specific language governing permissions and
 12 |  * limitations under the License.
 13 |  */
 14 | #ifndef LIBUNICODE_CAPI_H
 15 | #define LIBUNICODE_CAPI_H 1
 16 | 
 17 | #include <stddef.h>
 18 | #include <stdint.h>
 19 | 
 20 | #if defined(__cplusplus) /* C linkage must only be requested when compiled as C++ */
 21 | extern "C"
 22 | {
 23 | #endif
 24 | 
 25 | #define U32_CODEPOINT_MAX  0x10FFFF // 0b1'0000'1111'1111'1111'1111
 26 | #define U32_CODEPOINT_MIN  0
 27 | #define U32_CODEPOINT_MASK 0x1FFFFF // 0b1'1111'1111'1111'1111'1111
 28 | 
 29 |     /// UTF-8 character or 8bit segment of an UTF-8 character.
 30 |     typedef char u8_char_t;
 31 | 
 32 |     /// UTF-32 codepoint between 0 and 0x10FFFF. Any value outside that
 33 |     /// range must be properly handled by the functions below to avoid undefined
 34 |     /// behavior.
 35 |     typedef uint_least32_t u32_char_t;
 36 | 
 37 | /**
 38 |  * Tests whether @p _codepoint is a valid codepoint (at most 0x10FFFF and not a surrogate).
 39 |  */
 40 | #define u32_is_valid_codepoint(_codepoint) \
 41 |     (((_codepoint) < 0xD800) || ((_codepoint) > 0xDFFF && (_codepoint) <= 0x10FFFF))
 42 | 
 43 | /**
 44 |  * Extracts the unused higher order bits and moves them bit-wise to the right.
 45 |  *
 46 |  * A UTF-32 character is 32 bits wide (on a machine at least 32 bits wide),
 47 |  * and the largest valid UTF-32 codepoint is 0x10FFFF.
 48 |  * That is, the 21 least significant bits are used and the 11 most significant
 49 |  * bits are available for other application specific purposes.
 50 |  */
 51 | #define u32_unused_bit_mask(_codepoint) ((_codepoint) >> 21)
 52 | 
 53 | /**
 54 |  * Returns the number of available bits that are free
 55 |  * for application-specific use.
 56 |  *
 57 |  * If the machine type for UTF-32 is actually 32 bits wide, this
 58 |  * function yields 11 bits. It is guaranteed to always return at least 11.
 59 |  */
 60 | #define u32_unused_bit_capacity() (8 * sizeof(u32_char_t) - 21)
 61 | 
 62 | /**
 63 |  * Tests if given bit at @p _index of the unused most significant bits is set.
 64 |  */
 65 | #define u32_unused_bit_get(_codepoint, _index) (((_codepoint) & ((u32_char_t) 1 << ((_index) + 21))) != 0)
 66 | 
 67 | /**
 68 |  * Sets the bit at @p _index of the unused most significant bits.
 69 |  */
 70 | #define u32_unused_bit_on(_codepoint, _index) ((_codepoint) | ((u32_char_t) 1 << ((_index) + 21)))
 71 | 
 72 | /**
 73 |  * Clears the bit at @p _index of the unused most significant bits.
 74 |  */
 75 | #define u32_unused_bit_off(_codepoint, _index) ((_codepoint) & ~((u32_char_t) 1 << ((_index) + 21)))
 76 | 
 77 | /**
 78 |  * Returns @p _codepoint with all unused bits cleared.
 79 |  */
 80 | #define u32_unused_bit_cleared(_codepoint) ((_codepoint) & U32_CODEPOINT_MASK)
 81 | 
 82 |     /**
 83 |      * Counts the number of grapheme clusters for given sequence of codepoints.
 84 |      *
 85 |      * Use this function to determine the number of
 86 |      * user perceived characters (grapheme clusters).
 87 |      *
 88 |      * @param codepoints   pointer to the first codepoint.
 89 |      * @param n            number of codepoints to count the grapheme clusters for.
 90 |      *
 91 |      * @return number of user perceived characters (grapheme clusters) counted
 92 |      *         in [codepoints, codepoints+n).
 93 |      */
 94 |     int u32_gc_count(u32_char_t const* codepoints, size_t n);
 95 |     int u8_gc_count(u8_char_t const* codepoints, size_t n);
 96 | 
 97 | /**
 98 |  * Determines that u32_gc_width()/u8_gc_width() must not respect
 99 |  * variation selectors, and thus, will not change the width of a
100 |  * processed grapheme cluster.
101 |  *
102 |  * Using this is not recommended unless backwards compatibility with
103 |  * broken clients is of concern.
104 |  */
105 | #define GC_WIDTH_MODE_NON_MODIFIABLE 0
106 | 
107 | /**
108 |  * Mandates that u32_gc_width()/u8_gc_width() must respect
109 |  * variation selectors, thus, allow changing the width of
110 |  * a processed grapheme cluster.
111 |  */
112 | #define GC_WIDTH_MODE_MODIFIABLE 1
113 | 
114 |     /**
115 |      * Computes the display width for given sequence of codepoints,
116 |      * respecting grapheme clusters, and modifiers.
117 |      *
118 |      * @param codepoints  pointer to first codepoint
119 |      * @param n           number of codepoints
120 |      * @param mode        determines how to deal with variation selectors that do
121 |      *                    force changing the width of a grapheme cluster.
122 |      *                    Valid values are:
123 |      *                    GC_WIDTH_MODE_MODIFIABLE (allow, recommended),
124 |      *                    GC_WIDTH_MODE_NON_MODIFIABLE (disallowed).
125 |      *
126 |      * Use this function to determine how many terminal grid cells a
127 |      * string of codepoints should occupy when being rendered.
128 |      */
129 |     int u32_gc_width(u32_char_t const* codepoints, size_t n, int mode);
130 | 
131 |     /**
132 |      * UTF-8 version of @c u32_gc_width().
133 |      *
134 |      * @see u32_gc_width(u32_char_t const* codepoints, size_t n, int mode)
135 |      */
136 |     int u8_gc_width(u8_char_t const* codepoints, size_t n, int allowMod);
137 | 
138 |     /**
139 |      * Tests if two consecutive codepoints do belong to the same grapheme cluster,
140 |      * i.e. are unbreakable and thus should not be broken up.
141 |      *
142 |      * @retval 1   both codepoints do belong to the same grapheme cluster.
143 |      * @retval 0   both codepoints do not belong to the same grapheme cluster.
144 |      *
145 |      * @note The grapheme cluster segmentation algorithm walks through an
146 |      * ordered sequence of rules. If none of these rules match, the codepoints
147 |      * @p a and @p b can be broken up; this presumably also applies to
148 |      * codepoints outside the valid Unicode range.
149 |      * NOTE(review): the previous wording said "true" is returned in that case,
150 |      * which conflicts with @retval 0 above - confirm against the implementation.
151 |      */
152 |     int u32_grapheme_unbreakable(u32_char_t a, u32_char_t b);
153 | 
154 |     /**
155 |      * Opaque handle for the UTF-8 to UTF-32 stream converter.
156 |      */
157 |     struct u8u32_stream_state;
158 |     typedef struct u8u32_stream_state* u8u32_stream_state_t;
159 | 
160 |     /**
161 |      * Constructs an UTF-8-to-UTF-32 streamed converter context.
162 |      */
163 |     u8u32_stream_state_t u8u32_stream_convert_create(void);
164 | 
165 |     /**
166 |      * Processes a single UTF-8 byte to incrementally convert
167 |      * consecutively incoming UTF-8 bytes into a sequence of UTF-32 codepoints.
168 |      *
169 |      * @param handle  The handle to the previously created streaming context.
170 |      * @param input   A UTF-8 character to be processed consecutively.
171 |      * @param output  Will contain the fully parsed UTF-32 codepoint every time
172 |      *                one is available.
173 |      *
174 |      * @retval 0 The codepoint is incomplete and needs more data; @p output is not touched.
175 |      * @retval 1 The UTF-8 codepoint was fully processed and stored into @p output.
176 |      *
177 |      * @note Invalid input is silently ignored.
178 |      */
179 |     int u8u32_stream_convert_run(u8u32_stream_state_t handle, u8_char_t input, u32_char_t* output);
180 | 
181 |     /**
182 |      * Destroys the UTF-8-to-UTF-32 streaming converter context.
183 |      * The parameter @p handle will be set to NULL when this call leaves.
184 |      */
185 |     void u8u32_stream_convert_destroy(u8u32_stream_state_t* handle);
186 | 
187 |     /**
188 |      * Converts a UTF-32 sequence to UTF-8.
189 |      *
190 |      * @param source Pointer sequence of UTF-32 characters to convert.
191 |      * @param slen   Number of UTF-32 characters to convert.
192 |      * @param dest   Destination address where to store the converted UTF-8 sequence to.
193 |      * @param dlen   Number of bytes to write to @p dest at most.
194 |      *
195 |      * @note No trailing zero byte will be written.
196 |      *
197 |      * @retval >0     Success. The number of bytes written to @p dest is returned.
198 |      * @retval  0     @p slen is 0, and nothing was converted.
199 |      * @retval -1     Some characters have been converted but target destination
200 |      *                is not large enough to continue.
201 |      */
202 |     int u32u8_convert(u32_char_t const* source, size_t slen, u8_char_t* dest, size_t dlen);
203 | 
204 | #if defined(__cplusplus)
205 | }
206 | #endif
207 | 
208 | #endif
209 | 


--------------------------------------------------------------------------------
/src/libunicode/capi_test.cpp:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * This file is part of the "libunicode" project
 3 |  *   Copyright (c) 2021 Christian Parpart <christian@parpart.family>
 4 |  *
 5 |  * Licensed under the Apache License, Version 2.0 (the "License");
 6 |  * you may not use this file except in compliance with the License.
 7 |  *
 8 |  * Unless required by applicable law or agreed to in writing, software
 9 |  * distributed under the License is distributed on an "AS IS" BASIS,
10 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11 |  * See the License for the specific language governing permissions and
12 |  * limitations under the License.
13 |  */
14 | #include <libunicode/capi.h>
15 | 
16 | #include <catch2/catch_test_macros.hpp>
17 | 
18 | #include <array>
19 | #include <format>
20 | #include <utility>
21 | 
22 | using namespace std;
23 | using namespace std::string_view_literals;
24 | 
25 | TEST_CASE("capi.gc_count")
26 | {
27 |     CHECK(0 == u32_gc_count((u32_char_t const*) U"", 0)); // empty input
28 |     CHECK(1 == u32_gc_count((u32_char_t const*) U"\U0001F600\uFE0E", 2)); // U+1F600 followed by U+FE0E
29 |     CHECK(2 == u32_gc_count((u32_char_t const*) U"\U0001F600\uFE0E\U0001F600", 3));
30 |     CHECK(3 == u32_gc_count((u32_char_t const*) U"Yeo", 3)); // plain ASCII: one cluster per letter
31 |     CHECK(4 == u32_gc_count((u32_char_t const*) U"Hi \U0001F600\uFE0E", 5));
32 |     CHECK(4 == u32_gc_count((u32_char_t const*) U"1234", 4));
33 |     CHECK(1 == u32_gc_count((u32_char_t const*) U"\U0001F468\U0001F3FE\u200D\U0001F9B3", 4)); // sequence joined by U+200D
34 | }
35 | 
36 | TEST_CASE("capi.u8u32_stream_convert_and_inverse")
37 | {
38 |     auto constexpr input = "[\xC3\xB6\xE2\x82\xAC\xF0\x9F\x98\x80"sv; // byte sequences of length 1, 2, 3 and 4
39 |     auto constexpr expected = U"[ö€😀"sv;
40 | 
41 |     u8u32_stream_state_t conv = u8u32_stream_convert_create();
42 |     u32string output;
43 |     for (size_t i = 0; i < input.size(); ++i)
44 |     {
45 |         u32_char_t out {};
46 |         if (u8u32_stream_convert_run(conv, input.at(i), &out)) // non-zero return: out holds a codepoint
47 |             output.push_back(out);
48 |     }
49 |     CHECK(output == expected);
50 |     u8u32_stream_convert_destroy(&conv);
51 | 
52 |     // Verify inverse conversion (UTF-32 to UTF-8) works, too.
53 |     array<u8_char_t, 32> inverse {}; // more room than the 10 input bytes require
54 |     auto const ilen = u32u8_convert((u32_char_t const*) expected.data(), expected.size(), inverse.data(), inverse.size());
55 |     CHECK(ilen == (int) input.size());
56 |     auto const inverseSV = string_view { inverse.data(), static_cast<size_t>(ilen) };
57 |     CHECK(inverseSV == input);
58 | }
59 | 
60 | // TODO more C-API tests
61 | 


--------------------------------------------------------------------------------
/src/libunicode/codepoint_properties.cpp:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * This file is part of the "libunicode" project
 3 |  *   Copyright (c) 2020 Christian Parpart <christian@parpart.family>
 4 |  *
 5 |  * Licensed under the Apache License, Version 2.0 (the "License");
 6 |  * you may not use this file except in compliance with the License.
 7 |  *
 8 |  * Unless required by applicable law or agreed to in writing, software
 9 |  * distributed under the License is distributed on an "AS IS" BASIS,
10 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11 |  * See the License for the specific language governing permissions and
12 |  * limitations under the License.
13 |  */
14 | #include <libunicode/codepoint_properties.h>
15 | #include <libunicode/codepoint_properties_data.h>
16 | 
17 | namespace unicode
18 | {
19 | // Initial value of the static property lookup view: the precompiled stage1/stage2/properties arrays.
20 | codepoint_properties::tables_view codepoint_properties::configured_tables { precompiled::stage1.data(),
21 |                                                                             precompiled::stage2.data(),
22 |                                                                             precompiled::properties.data() };
23 | // Initial value of the static name lookup view: the precompiled names_stage1/2/3 arrays.
24 | codepoint_properties::names_view codepoint_properties::configured_names {
25 |     precompiled::names_stage1.data(),
26 |     precompiled::names_stage2.data(),
27 |     precompiled::names_stage3.data(),
28 | };
29 | 
30 | } // namespace unicode
31 | 


--------------------------------------------------------------------------------
/src/libunicode/codepoint_properties.h:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * This file is part of the "libunicode" project
 3 |  *   Copyright (c) 2020 Christian Parpart <christian@parpart.family>
 4 |  *
 5 |  * Licensed under the Apache License, Version 2.0 (the "License");
 6 |  * you may not use this file except in compliance with the License.
 7 |  *
 8 |  * Unless required by applicable law or agreed to in writing, software
 9 |  * distributed under the License is distributed on an "AS IS" BASIS,
10 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11 |  * See the License for the specific language governing permissions and
12 |  * limitations under the License.
13 |  */
14 | #pragma once
15 | 
16 | #include <libunicode/emoji_segmenter.h> // Only for EmojiSegmentationCategory.
17 | #include <libunicode/multistage_table_view.h>
18 | #include <libunicode/support.h>   // Only for LIBUNICODE_PACKED.
19 | #include <libunicode/ucd_enums.h> // Only for the UCD enums.
20 | 
21 | #include <type_traits>
22 | 
23 | namespace unicode
24 | {
25 | // Per-codepoint property record. Declared LIBUNICODE_PACKED; the equality operator below compares raw bytes.
26 | struct LIBUNICODE_PACKED codepoint_properties
27 | {
28 |     uint8_t char_width = 0;
29 |     uint8_t flags = 0; // bit set built from the Flag* masks below
30 |     Script script = Script::Unknown;
31 |     Grapheme_Cluster_Break grapheme_cluster_break = Grapheme_Cluster_Break::Other;
32 |     East_Asian_Width east_asian_width = East_Asian_Width::Narrow;
33 |     General_Category general_category = General_Category::Unassigned;
34 |     EmojiSegmentationCategory emoji_segmentation_category = EmojiSegmentationCategory::Invalid;
35 |     Age age = Age::Unassigned;
36 |     // Bit masks for the individual bits stored in `flags`.
37 |     static uint8_t constexpr FlagEmoji = 0x01;                // NOLINT(readability-identifier-naming)
38 |     static uint8_t constexpr FlagEmojiPresentation = 0x02;    // NOLINT(readability-identifier-naming)
39 |     static uint8_t constexpr FlagEmojiComponent = 0x04;       // NOLINT(readability-identifier-naming)
40 |     static uint8_t constexpr FlagEmojiModifier = 0x08;        // NOLINT(readability-identifier-naming)
41 |     static uint8_t constexpr FlagEmojiModifierBase = 0x10;    // NOLINT(readability-identifier-naming)
42 |     static uint8_t constexpr FlagExtendedPictographic = 0x20; // NOLINT(readability-identifier-naming)
43 |     static uint8_t constexpr FlagCoreGraphemeExtend = 0x40;   // NOLINT(readability-identifier-naming)
44 |     // Each accessor reports whether the matching bit is set in `flags`.
45 |     constexpr bool emoji() const noexcept { return flags & FlagEmoji; }
46 |     constexpr bool emoji_presentation() const noexcept { return flags & FlagEmojiPresentation; }
47 |     constexpr bool emoji_component() const noexcept { return flags & FlagEmojiComponent; }
48 |     constexpr bool emoji_modifier() const noexcept { return flags & FlagEmojiModifier; }
49 |     constexpr bool emoji_modifier_base() const noexcept { return flags & FlagEmojiModifierBase; }
50 |     constexpr bool extended_pictographic() const noexcept { return flags & FlagExtendedPictographic; }
51 |     constexpr bool core_grapheme_extend() const noexcept { return flags & FlagCoreGraphemeExtend; }
52 |     // Lookup view types; both use the same key type, stage types, block size and maximum value (0x10FFFF).
53 |     using tables_view = support::multistage_table_view<codepoint_properties,
54 |                                                        uint32_t,     // source type
55 |                                                        uint8_t,      // stage 1
56 |                                                        uint16_t,     // stage 2
57 |                                                        256,          // block size
58 |                                                        0x110'000 - 1 // max value
59 |                                                        >;
60 | 
61 |     using names_view = support::multistage_table_view<std::string_view,
62 |                                                       uint32_t,     // source type
63 |                                                       uint8_t,      // stage 1
64 |                                                       uint16_t,     // stage 2
65 |                                                       256,          // block size
66 |                                                       0x110'000 - 1 // max value
67 |                                                       >;
68 |     // Views consulted by get() and name(); mutable static members defined in codepoint_properties.cpp.
69 |     static tables_view configured_tables;
70 |     static names_view configured_names;
71 | 
72 |     /// Retrieves the codepoint properties for the given codepoint.
73 |     [[nodiscard]] static codepoint_properties get(char32_t codepoint) noexcept { return configured_tables.get(codepoint); }
74 |     /// Retrieves the entry stored in configured_names for the given codepoint.
75 |     [[nodiscard]] static std::string_view name(char32_t codepoint) { return configured_names.get(codepoint); }
76 | };
77 | // Compile-time check backing the bytewise comparison in operator== below.
78 | static_assert(std::has_unique_object_representations_v<codepoint_properties>);
79 | // Equality is a bytewise comparison of the two records.
80 | constexpr bool operator==(codepoint_properties const& a, codepoint_properties const& b) noexcept
81 | {
82 |     return __builtin_memcmp(&a, &b, sizeof(codepoint_properties)) == 0;
83 | }
84 | // Inequality is the negation of operator==.
85 | constexpr bool operator!=(codepoint_properties const& a, codepoint_properties const& b) noexcept
86 | {
87 |     return !(a == b);
88 | }
89 | 
90 | } // namespace unicode
91 | 


--------------------------------------------------------------------------------
/src/libunicode/codepoint_properties_loader.h:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * This file is part of the "libunicode" project
 3 |  *   Copyright (c) 2020 Christian Parpart <christian@parpart.family>
 4 |  *
 5 |  * Licensed under the Apache License, Version 2.0 (the "License");
 6 |  * you may not use this file except in compliance with the License.
 7 |  *
 8 |  * Unless required by applicable law or agreed to in writing, software
 9 |  * distributed under the License is distributed on an "AS IS" BASIS,
10 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11 |  * See the License for the specific language governing permissions and
12 |  * limitations under the License.
13 |  */
14 | #pragma once
15 | 
16 | #include <libunicode/codepoint_properties.h>
17 | #include <libunicode/multistage_table_generator.h>
18 | 
19 | #include <vector>
20 | 
21 | namespace unicode
22 | {
23 | // Table type for codepoint_properties; the arguments match codepoint_properties::tables_view.
24 | using codepoint_properties_table = support::multistage_table<codepoint_properties,
25 |                                                              uint32_t,     // source type
26 |                                                              uint8_t,      // stage 1
27 |                                                              uint16_t,     // stage 2
28 |                                                              256,          // block size
29 |                                                              0x110'000 - 1 // max value
30 |                                                              >;
31 | // Table type for names; holds std::string where codepoint_properties::names_view uses std::string_view.
32 | using codepoint_names_table = support::multistage_table<std::string,
33 |                                                         uint32_t,     // source type
34 |                                                         uint8_t,      // stage 1
35 |                                                         uint16_t,     // stage 2
36 |                                                         256,          // block size
37 |                                                         0x110'000 - 1 // max value
38 |                                                         >;
39 | // Returns both tables for a UCD data directory. NOTE(review): whether `log` may be null is not visible here - confirm.
40 | std::tuple<codepoint_properties_table, codepoint_names_table> load_from_directory(std::string const& ucdDataDirectory,
41 |                                                                                   std::ostream* log);
42 | 
43 | } // namespace unicode
44 | 


--------------------------------------------------------------------------------
/src/libunicode/convert_test.cpp:
--------------------------------------------------------------------------------
  1 | /**
  2 |  * This file is part of the "libunicode" project
  3 |  *   Copyright (c) 2020 Christian Parpart <christian@parpart.family>
  4 |  *
  5 |  * Licensed under the Apache License, Version 2.0 (the "License");
  6 |  * you may not use this file except in compliance with the License.
  7 |  *
  8 |  * Unless required by applicable law or agreed to in writing, software
  9 |  * distributed under the License is distributed on an "AS IS" BASIS,
 10 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 11 |  * See the License for the specific language governing permissions and
 12 |  * limitations under the License.
 13 |  */
 14 | #include <libunicode/convert.h>
 15 | #include <libunicode/support.h>
 16 | #include <libunicode/utf8.h>
 17 | 
 18 | #include <catch2/catch_test_macros.hpp>
 19 | 
 20 | #include <format>
 21 | #include <iterator>
 22 | 
 23 | using namespace unicode;
 24 | using namespace std::string_literals;
 25 | using namespace std;
 26 | 
 27 | TEST_CASE("convert.same", "[convert]")
 28 | {
 29 |     auto const s8 = "Hello, 😀"sv; // char -> char: output must equal the input
 30 |     auto t8 = string {};
 31 |     unicode::convert_to<char>(s8, back_insert_iterator(t8));
 32 |     CHECK(s8 == t8);
 33 | 
 34 |     auto const s16 = u"Hello, 😀"sv; // char16_t -> char16_t
 35 |     auto t16 = u16string {};
 36 |     unicode::convert_to<char16_t>(s16, back_insert_iterator(t16));
 37 |     CHECK(s16 == t16);
 38 | 
 39 |     auto const s32 = U"Hello, 😀"sv; // char32_t -> char32_t
 40 |     auto t32 = u32string {};
 41 |     unicode::convert_to<char32_t>(s32, back_insert_iterator(t32));
 42 |     CHECK(s32 == t32);
 43 | }
 44 | 
 45 | TEST_CASE("convert.8_to_16", "[convert]")
 46 | {
 47 |     auto constexpr input = string_view {
 48 |         "["
 49 |         "\xC3\xB6"         // ö  - german o-umlaut
 50 |         "\xE2\x82\xAC"     // €  - EURO sign U+20AC
 51 |         "\xF0\x9F\x98\x80" // 😀 - U+1F600
 52 |     };
 53 |     u16string output;
 54 |     auto bi = back_inserter(output);
 55 |     unicode::convert_to<char16_t>(input, bi); // passes the named iterator rather than a temporary
 56 |     CHECK(output.size() == 5); // three single code units plus one surrogate pair for U+1F600
 57 |     CHECK(output == u"[ö€😀");
 58 | }
 59 | 
 60 | TEST_CASE("convert.8_to_32", "[convert]")
 61 | {
 62 |     auto constexpr input = string_view {
 63 |         "["
 64 |         "\xC3\xB6"         // ö  - german o-umlaut
 65 |         "\xE2\x82\xAC"     // €  - EURO sign U+20AC
 66 |         "\xF0\x9F\x98\x80" // 😀 - U+1F600
 67 |     };
 68 |     u32string output;
 69 |     auto bi = back_inserter(output);
 70 |     unicode::convert_to<char32_t>(input, bi); // passes the named iterator rather than a temporary
 71 |     CHECK(output.size() == 4); // one char32_t per codepoint
 72 |     CHECK(output == U"[ö€😀");
 73 | }
 74 | 
 75 | TEST_CASE("convert.utf8.incremental_decode", "[utf8]")
 76 | {
 77 |     auto constexpr values = string_view {
 78 |         "["
 79 |         "\xC3\xB6"         // ö  - german o-umlaut
 80 |         "\xE2\x82\xAC"     // €  - EURO sign U+20AC
 81 |         "\xF0\x9F\x98\x80" // 😀 - U+1F600
 82 |     };
 83 |     auto const* p = (char8_type const*) (values.data()); // cursor over the raw input bytes
 84 |     auto decode = unicode::decoder<char> {};
 85 | 
 86 |     // 1-byte sequence: a value is available immediately
 87 |     auto result = decode(*p++);
 88 |     REQUIRE(result.has_value());
 89 |     REQUIRE(result.value() == '[');
 90 | 
 91 |     // 2-byte sequence: no value after the first byte, the codepoint after the second
 92 |     result = decode(*p++);
 93 |     REQUIRE(!result.has_value());
 94 |     result = decode(*p++);
 95 |     REQUIRE(result.has_value());
 96 |     REQUIRE(result.value() == U'\u00F6'); // ö
 97 | 
 98 |     // 3-byte sequence: the codepoint only appears with the third byte
 99 |     result = decode(*p++);
100 |     REQUIRE(!result.has_value());
101 |     result = decode(*p++);
102 |     REQUIRE(!result.has_value());
103 |     result = decode(*p++);
104 |     REQUIRE(result.has_value());
105 |     REQUIRE(result.value() == U'\u20AC'); // €
106 | 
107 |     // 4-byte sequence: the codepoint only appears with the fourth byte
108 |     result = decode(*p++);
109 |     REQUIRE(!result.has_value());
110 |     result = decode(*p++);
111 |     REQUIRE(!result.has_value());
112 |     result = decode(*p++);
113 |     REQUIRE(!result.has_value());
114 |     result = decode(*p++);
115 |     REQUIRE(result.has_value());
116 |     REQUIRE(result.value() == U'\U0001F600'); // 😀
117 | }
118 | 


--------------------------------------------------------------------------------
/src/libunicode/emoji_presentation_scanner.c:
--------------------------------------------------------------------------------
  1 | 
  2 | #line 1 "emoji_presentation_scanner.rl"
  3 | /* Copyright 2019 Google LLC
  4 |  *
  5 |  * Licensed under the Apache License, Version 2.0 (the "License");
  6 |  * you may not use this file except in compliance with the License.
  7 |  * You may obtain a copy of the License at
  8 |  *
  9 |  *     https://www.apache.org/licenses/LICENSE-2.0
 10 |  *
 11 |  * Unless required by applicable law or agreed to in writing, software
 12 |  * distributed under the License is distributed on an "AS IS" BASIS,
 13 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 14 |  * See the License for the specific language governing permissions and
 15 |  * limitations under the License.
 16 |  */
 17 | 
 18 | // clang-format off
 19 | #pragma GCC diagnostic ignored "-Wsign-conversion"
 20 | 
 21 | #line 20 "emoji_presentation_scanner.c"
 22 | static const char _emoji_presentation_actions[] = { // action lists: a count followed by that many action ids
 23 | 	0, 1, 0, 1, 1, 1, 5, 1,
 24 | 	6, 1, 7, 1, 8, 1, 9, 1,
 25 | 	10, 1, 11, 2, 2, 3, 2, 2,
 26 | 	4
 27 | };
 28 | // Per state: offset of the state's first key in _emoji_presentation_trans_keys.
 29 | static const char _emoji_presentation_key_offsets[] = {
 30 | 	0, 5, 7, 14, 18, 20, 21, 24,
 31 | 	29, 30, 34, 36
 32 | };
 33 | // Keys of every state: the single keys first, followed by (low, high) range pairs.
 34 | static const unsigned char _emoji_presentation_trans_keys[] = {
 35 | 	3u, 7u, 13u, 0u, 2u, 14u, 15u, 2u,
 36 | 	3u, 6u, 7u, 13u, 0u, 1u, 9u, 10u,
 37 | 	11u, 12u, 10u, 12u, 10u, 4u, 10u, 12u,
 38 | 	4u, 9u, 10u, 11u, 12u, 6u, 9u, 10u,
 39 | 	11u, 12u, 8u, 10u, 9u, 10u, 11u, 12u,
 40 | 	14u, 0
 41 | };
 42 | // Per state: number of single keys.
 43 | static const char _emoji_presentation_single_lengths[] = {
 44 | 	3, 2, 5, 4, 2, 1, 3, 5,
 45 | 	1, 4, 2, 5
 46 | };
 47 | // Per state: number of key ranges (each range occupies two keys).
 48 | static const char _emoji_presentation_range_lengths[] = {
 49 | 	1, 0, 1, 0, 0, 0, 0, 0,
 50 | 	0, 0, 0, 0
 51 | };
 52 | // Per state: offset of the state's first slot in _emoji_presentation_indicies.
 53 | static const char _emoji_presentation_index_offsets[] = {
 54 | 	0, 5, 8, 15, 20, 23, 25, 29,
 55 | 	35, 37, 42, 45
 56 | };
 57 | // Maps a slot (state offset + matched key position) to a transition id.
 58 | static const char _emoji_presentation_indicies[] = {
 59 | 	2, 1, 1, 1, 0, 4, 5, 3,
 60 | 	7, 8, 10, 11, 12, 6, 9, 5,
 61 | 	13, 14, 15, 0, 13, 15, 16, 13,
 62 | 	16, 15, 13, 15, 16, 15, 5, 13,
 63 | 	14, 15, 16, 5, 17, 5, 13, 14,
 64 | 	18, 17, 5, 13, 16, 5, 13, 14,
 65 | 	15, 4, 16, 0
 66 | };
 67 | // Per transition: the target state.
 68 | static const char _emoji_presentation_trans_targs[] = {
 69 | 	2, 4, 6, 2, 1, 2, 3, 3,
 70 | 	7, 2, 8, 9, 11, 0, 2, 5,
 71 | 	2, 2, 10
 72 | };
 73 | // Per transition: offset into _emoji_presentation_actions (0 means no action).
 74 | static const char _emoji_presentation_trans_actions[] = {
 75 | 	17, 19, 19, 15, 0, 7, 22, 19,
 76 | 	19, 9, 0, 22, 19, 0, 5, 19,
 77 | 	11, 13, 19
 78 | };
 79 | // Per state: action list offset executed after a transition has entered the state.
 80 | static const char _emoji_presentation_to_state_actions[] = {
 81 | 	0, 0, 1, 0, 0, 0, 0, 0,
 82 | 	0, 0, 0, 0
 83 | };
 84 | // Per state: action list offset executed before a symbol is processed in the state.
 85 | static const char _emoji_presentation_from_state_actions[] = {
 86 | 	0, 0, 3, 0, 0, 0, 0, 0,
 87 | 	0, 0, 0, 0
 88 | };
 89 | // Per state: transition id + 1 to take at end of input (0 means none).
 90 | static const char _emoji_presentation_eof_trans[] = {
 91 | 	1, 4, 0, 1, 17, 17, 17, 17,
 92 | 	18, 18, 17, 17
 93 | };
 94 | // State the scanner starts in.
 95 | static const int emoji_presentation_start = 2;
 96 | 
 97 | 
 98 | #line 20 "emoji_presentation_scanner.rl"
 99 | 
100 | 
101 | 
102 | #line 89 "emoji_presentation_scanner.rl"
103 | 
104 | 
105 | static emoji_text_iter_t // returns the end of the matched token (te); pe only via the fallback at the bottom
106 | scan_emoji_presentation (emoji_text_iter_t p, // p: position at which scanning starts
107 |     const emoji_text_iter_t pe, // pe: end of the input, also used as eof
108 |     bool* is_emoji) // out: true for an emoji_run token, false for the text tokens
109 | {
110 |   emoji_text_iter_t ts, te; // start / end of the token currently being matched
111 |   const emoji_text_iter_t eof = pe;
112 | 
113 |   unsigned act; // set by actions 3/4, dispatched on by action 11
114 |   int cs; // current state
115 | 
116 | // Generated init section: start state, token bounds reset, no recorded pattern.
117 | #line 116 "emoji_presentation_scanner.c"
118 | 	{
119 | 	cs = emoji_presentation_start;
120 | 	ts = 0;
121 | 	te = 0;
122 | 	act = 0;
123 | 	}
124 | // Generated exec section: table-driven state machine loop.
125 | #line 124 "emoji_presentation_scanner.c"
126 | 	{
127 | 	int _klen;
128 | 	unsigned int _trans;
129 | 	const char *_acts;
130 | 	unsigned int _nacts;
131 | 	const unsigned char *_keys;
132 | 
133 | 	if ( p == pe )
134 | 		goto _test_eof;
135 | _resume:
136 | 	_acts = _emoji_presentation_actions + _emoji_presentation_from_state_actions[cs];
137 | 	_nacts = (unsigned int) *_acts++;
138 | 	while ( _nacts-- > 0 ) {
139 | 		switch ( *_acts++ ) {
140 | 	case 1:
141 | #line 1 "NONE"
142 | 	{ts = p;}
143 | 	break;
144 | #line 143 "emoji_presentation_scanner.c"
145 | 		}
146 | 	}
147 | 	// Locate the current state's keys and the base slot of its transitions.
148 | 	_keys = _emoji_presentation_trans_keys + _emoji_presentation_key_offsets[cs];
149 | 	_trans = _emoji_presentation_index_offsets[cs];
150 | 
151 | 	_klen = _emoji_presentation_single_lengths[cs];
152 | 	if ( _klen > 0 ) { // binary search over the state's single keys
153 | 		const unsigned char *_lower = _keys;
154 | 		const unsigned char *_mid;
155 | 		const unsigned char *_upper = _keys + _klen - 1;
156 | 		while (1) {
157 | 			if ( _upper < _lower )
158 | 				break;
159 | 
160 | 			_mid = _lower + ((_upper-_lower) >> 1);
161 | 			if ( (*p) < *_mid )
162 | 				_upper = _mid - 1;
163 | 			else if ( (*p) > *_mid )
164 | 				_lower = _mid + 1;
165 | 			else {
166 | 				_trans += (unsigned int)(_mid - _keys);
167 | 				goto _match;
168 | 			}
169 | 		}
170 | 		_keys += _klen;
171 | 		_trans += _klen;
172 | 	}
173 | 	// No single key matched: binary search over the (low, high) range pairs.
174 | 	_klen = _emoji_presentation_range_lengths[cs];
175 | 	if ( _klen > 0 ) {
176 | 		const unsigned char *_lower = _keys;
177 | 		const unsigned char *_mid;
178 | 		const unsigned char *_upper = _keys + (_klen<<1) - 2;
179 | 		while (1) {
180 | 			if ( _upper < _lower )
181 | 				break;
182 | 
183 | 			_mid = _lower + (((_upper-_lower) >> 1) & ~1);
184 | 			if ( (*p) < _mid[0] )
185 | 				_upper = _mid - 2;
186 | 			else if ( (*p) > _mid[1] )
187 | 				_lower = _mid + 2;
188 | 			else {
189 | 				_trans += (unsigned int)((_mid - _keys)>>1);
190 | 				goto _match;
191 | 			}
192 | 		}
193 | 		_trans += _klen;
194 | 	}
195 | 	// When nothing matched, _trans points at the slot after all single and range entries.
196 | _match:
197 | 	_trans = _emoji_presentation_indicies[_trans];
198 | _eof_trans:
199 | 	cs = _emoji_presentation_trans_targs[_trans];
200 | 	// Transitions without an action list go straight to the to-state actions.
201 | 	if ( _emoji_presentation_trans_actions[_trans] == 0 )
202 | 		goto _again;
203 | 	// Run the transition's action list: a count followed by that many action ids.
204 | 	_acts = _emoji_presentation_actions + _emoji_presentation_trans_actions[_trans];
205 | 	_nacts = (unsigned int) *_acts++;
206 | 	while ( _nacts-- > 0 )
207 | 	{
208 | 		switch ( *_acts++ )
209 | 		{
210 | 	case 2:
211 | #line 1 "NONE"
212 | 	{te = p+1;}
213 | 	break;
214 | 	case 3:
215 | #line 85 "emoji_presentation_scanner.rl"
216 | 	{act = 2;}
217 | 	break;
218 | 	case 4:
219 | #line 86 "emoji_presentation_scanner.rl"
220 | 	{act = 3;}
221 | 	break;
222 | 	case 5:
223 | #line 84 "emoji_presentation_scanner.rl"
224 | 	{te = p+1;{ *is_emoji = false; return te; }}
225 | 	break;
226 | 	case 6:
227 | #line 85 "emoji_presentation_scanner.rl"
228 | 	{te = p+1;{ *is_emoji = true; return te; }}
229 | 	break;
230 | 	case 7:
231 | #line 86 "emoji_presentation_scanner.rl"
232 | 	{te = p+1;{ *is_emoji = false; return te; }}
233 | 	break;
234 | 	case 8:
235 | #line 85 "emoji_presentation_scanner.rl"
236 | 	{te = p;p--;{ *is_emoji = true; return te; }}
237 | 	break;
238 | 	case 9:
239 | #line 86 "emoji_presentation_scanner.rl"
240 | 	{te = p;p--;{ *is_emoji = false; return te; }}
241 | 	break;
242 | 	case 10:
243 | #line 85 "emoji_presentation_scanner.rl"
244 | 	{{p = ((te))-1;}{ *is_emoji = true; return te; }}
245 | 	break;
246 | 	case 11:
247 | #line 1 "NONE"
248 | 	{	switch( act ) {
249 | 	case 2:
250 | 	{{p = ((te))-1;} *is_emoji = true; return te; }
251 | 	break;
252 | 	case 3:
253 | 	{{p = ((te))-1;} *is_emoji = false; return te; }
254 | 	break;
255 | 	}
256 | 	}
257 | 	break;
258 | #line 257 "emoji_presentation_scanner.c"
259 | 		}
260 | 	}
261 | 
262 | _again:
263 | 	_acts = _emoji_presentation_actions + _emoji_presentation_to_state_actions[cs];
264 | 	_nacts = (unsigned int) *_acts++;
265 | 	while ( _nacts-- > 0 ) {
266 | 		switch ( *_acts++ ) {
267 | 	case 0:
268 | #line 1 "NONE"
269 | 	{ts = 0;}
270 | 	break;
271 | #line 270 "emoji_presentation_scanner.c"
272 | 		}
273 | 	}
274 | 	// Advance; once the input is exhausted take the state's EOF transition, if it has one.
275 | 	if ( ++p != pe )
276 | 		goto _resume;
277 | 	_test_eof: {}
278 | 	if ( p == eof )
279 | 	{
280 | 	if ( _emoji_presentation_eof_trans[cs] > 0 ) {
281 | 		_trans = _emoji_presentation_eof_trans[cs] - 1;
282 | 		goto _eof_trans;
283 | 	}
284 | 	}
285 | 
286 | 	}
287 | 
288 | #line 105 "emoji_presentation_scanner.rl"
289 | 
290 | 
291 |   /* Should not be reached. */
292 |   *is_emoji = false;
293 |   return pe;
294 | }
295 | 
296 | // clang-format on
297 | 


--------------------------------------------------------------------------------
/src/libunicode/emoji_presentation_scanner.rl:
--------------------------------------------------------------------------------
  1 | /* Copyright 2019 Google LLC
  2 |  *
  3 |  * Licensed under the Apache License, Version 2.0 (the "License");
  4 |  * you may not use this file except in compliance with the License.
  5 |  * You may obtain a copy of the License at
  6 |  *
  7 |  *     https://www.apache.org/licenses/LICENSE-2.0
  8 |  *
  9 |  * Unless required by applicable law or agreed to in writing, software
 10 |  * distributed under the License is distributed on an "AS IS" BASIS,
 11 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12 |  * See the License for the specific language governing permissions and
 13 |  * limitations under the License.
 14 |  */
 15 | 
 16 | // clang-format off
 17 | #pragma GCC diagnostic ignored "-Wsign-conversion"
 18 | 
 19 | %%{
 20 |   machine emoji_presentation;
 21 |   alphtype unsigned char;
 22 |   write data noerror nofinal noentry;
 23 | }%%
 24 | 
 25 | %%{
 26 | # Input symbols. The values match unicode::EmojiSegmentationCategory in emoji_segmenter.h.
 27 | EMOJI = 0;
 28 | EMOJI_TEXT_PRESENTATION = 1;
 29 | EMOJI_EMOJI_PRESENTATION = 2;
 30 | EMOJI_MODIFIER_BASE = 3;
 31 | EMOJI_MODIFIER = 4;
 32 | EMOJI_VS_BASE = 5;
 33 | REGIONAL_INDICATOR = 6;
 34 | KEYCAP_BASE = 7;
 35 | COMBINING_ENCLOSING_KEYCAP = 8;
 36 | COMBINING_ENCLOSING_CIRCLE_BACKSLASH = 9;
 37 | ZWJ = 10;
 38 | VS15 = 11;
 39 | VS16 = 12;
 40 | TAG_BASE = 13;
 41 | TAG_SEQUENCE = 14;
 42 | TAG_TERM = 15;
 43 | # Symbols accepted wherever the rules below allow an arbitrary emoji.
 44 | any_emoji =  EMOJI_TEXT_PRESENTATION | EMOJI_EMOJI_PRESENTATION |  KEYCAP_BASE |
 45 |   EMOJI_MODIFIER_BASE | TAG_BASE | EMOJI;
 46 | 
 47 | emoji_combining_enclosing_circle_backslash_sequence = any_emoji
 48 |   COMBINING_ENCLOSING_CIRCLE_BACKSLASH;
 49 | 
 50 | # This could be sharper than any_emoji by restricting this only to valid
 51 | # variation sequences:
 52 | # https://www.unicode.org/Public/emoji/11.0/emoji-variation-sequences.txt
 53 | # However, implementing
 54 | # https://www.unicode.org/reports/tr51/#def_emoji_presentation_sequence is
 55 | # sufficient for our purposes here.
 56 | emoji_presentation_sequence = any_emoji VS16;
 57 | 
 58 | emoji_modifier_sequence = EMOJI_MODIFIER_BASE EMOJI_MODIFIER;
 59 | 
 60 | emoji_flag_sequence = REGIONAL_INDICATOR REGIONAL_INDICATOR;
 61 | 
 62 | # Here we only allow the valid tag sequences
 63 | # https://www.unicode.org/reports/tr51/#valid-emoji-tag-sequences, instead of
 64 | # all well-formed ones defined in
 65 | # https://www.unicode.org/reports/tr51/#def_emoji_tag_sequence
 66 | emoji_tag_sequence = TAG_BASE TAG_SEQUENCE+ TAG_TERM;
 67 | 
 68 | emoji_keycap_sequence = KEYCAP_BASE VS16 COMBINING_ENCLOSING_KEYCAP;
 69 | 
 70 | emoji_zwj_element =  emoji_presentation_sequence | emoji_modifier_sequence | any_emoji;
 71 | 
 72 | emoji_zwj_sequence = emoji_zwj_element ( ZWJ emoji_zwj_element )+;
 73 | # Everything that the scanner reports as an emoji run.
 74 | emoji_presentation = EMOJI_EMOJI_PRESENTATION | TAG_BASE | EMOJI_MODIFIER_BASE |
 75 |  emoji_presentation_sequence | emoji_modifier_sequence | emoji_flag_sequence |
 76 |  emoji_tag_sequence | emoji_keycap_sequence | emoji_zwj_sequence |
 77 |  emoji_combining_enclosing_circle_backslash_sequence;
 78 | 
 79 | emoji_run = emoji_presentation;
 80 | 
 81 | text_presentation_emoji = any_emoji VS15;
 82 | text_run = any;
 83 | 
 84 | text_and_emoji_run := |*
 85 | # To give the VS15 sequences higher priority than the emoji sequences,
 86 | # they are listed first among the scanner tokens here.
 87 | text_presentation_emoji => { *is_emoji = false; return te; };
 88 | emoji_run => { *is_emoji = true; return te; };
 89 | text_run => { *is_emoji = false; return te; };
 90 | *|;
 91 | 
 92 | }%%
 93 | 
 94 | static emoji_text_iter_t // returns te from a token action; pe only via the fallback at the bottom
 95 | scan_emoji_presentation (emoji_text_iter_t p, // p: position at which scanning starts
 96 |     const emoji_text_iter_t pe, // pe: end of the input, also used as eof
 97 |     bool* is_emoji) // out: set by the token actions of text_and_emoji_run
 98 | {
 99 |   emoji_text_iter_t ts, te;
100 |   const emoji_text_iter_t eof = pe;
101 | 
102 |   unsigned act;
103 |   int cs;
104 |   // Ragel expands the section below into the init and exec code seen in emoji_presentation_scanner.c.
105 |   %%{
106 |     write init;
107 |     write exec;
108 |   }%%
109 | 
110 |   /* Should not be reached. */
111 |   *is_emoji = false;
112 |   return pe;
113 | }
114 | 
115 | // clang-format on
116 | 


--------------------------------------------------------------------------------
/src/libunicode/emoji_segmenter.cpp:
--------------------------------------------------------------------------------
  1 | /**
  2 |  * This file is part of the "libterminal" project
  3 |  *   Copyright (c) 2020 Christian Parpart <christian@parpart.family>
  4 |  *
  5 |  * Licensed under the Apache License, Version 2.0 (the "License");
  6 |  * you may not use this file except in compliance with the License.
  7 |  *
  8 |  * Unless required by applicable law or agreed to in writing, software
  9 |  * distributed under the License is distributed on an "AS IS" BASIS,
 10 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 11 |  * See the License for the specific language governing permissions and
 12 |  * limitations under the License.
 13 |  */
 14 | 
 15 | #include <libunicode/codepoint_properties_data.h>
 16 | #include <libunicode/emoji_segmenter.h>
 17 | #include <libunicode/ucd.h>
 18 | 
 19 | #include <cassert>
 20 | #include <iostream>
 21 | 
 22 | namespace unicode
 23 | {
 24 | 
 25 | namespace
 26 | {
 27 | 
 28 |     class RagelIterator // cursor over a char32_t buffer that dereferences to the emoji segmentation category
 29 |     {
 30 |         EmojiSegmentationCategory category_; // cached category at the cursor; Invalid when past the end
 31 |         char32_t const* buffer_;
 32 |         size_t size_; // number of code points in buffer_
 33 |         size_t currentCursorEnd_; // current index into buffer_
 34 | 
 35 |       public:
 36 |         RagelIterator(char32_t const* buffer, size_t size, size_t cursor) noexcept:
 37 |             category_ { EmojiSegmentationCategory::Invalid }, buffer_ { buffer }, size_ { size }, currentCursorEnd_ { cursor }
 38 |         {
 39 |             updateCategory(); // cache the category of the initial position
 40 |         }
 41 |         // Default: an iterator over an empty buffer, so its category is Invalid.
 42 |         RagelIterator() noexcept: RagelIterator(U"", 0, 0) {}
 43 |         // Accessors; codepoint() performs no bounds check of its own.
 44 |         constexpr char32_t codepoint() const noexcept { return buffer_[currentCursorEnd_]; }
 45 |         constexpr EmojiSegmentationCategory category() const noexcept { return category_; }
 46 |         constexpr size_t cursor() const noexcept { return currentCursorEnd_; }
 47 |         // Re-reads the category at the cursor; at or past size_ it becomes Invalid.
 48 |         void updateCategory() noexcept
 49 |         {
 50 |             if (currentCursorEnd_ < size_)
 51 |                 category_ = codepoint_properties::get(codepoint()).emoji_segmentation_category;
 52 |             else
 53 |                 category_ = EmojiSegmentationCategory::Invalid;
 54 |         }
 55 |         // Dereferencing yields the cached category as int; this is the symbol the scanner compares against its keys.
 56 |         constexpr int operator*() const noexcept { return static_cast<int>(category_); }
 57 |         // Stepping moves the cursor by one position and refreshes the cached category.
 58 |         RagelIterator& operator++() noexcept
 59 |         {
 60 |             currentCursorEnd_++;
 61 |             updateCategory();
 62 |             return *this;
 63 |         }
 64 |         RagelIterator& operator--(int) noexcept // postfix form, but it returns *this after the decrement
 65 |         {
 66 |             currentCursorEnd_--;
 67 |             updateCategory();
 68 |             return *this;
 69 |         }
 70 |         // Returns a new iterator offset by v positions; v is cast to size_t without any range check.
 71 |         RagelIterator operator+(long v) const noexcept
 72 |         {
 73 |             // TODO: assert() on integer overflow
 74 |             return { buffer_, size_, currentCursorEnd_ + (size_t) v };
 75 |         }
 76 |         // Returns a new iterator v positions back; a negative v is forwarded to operator+.
 77 |         RagelIterator operator-(long v) const noexcept
 78 |         {
 79 |             if (v >= 0)
 80 |             {
 81 |                 assert(currentCursorEnd_ >= static_cast<size_t>(v));
 82 |                 return { buffer_, size_, currentCursorEnd_ - (size_t) v };
 83 |             }
 84 |             else
 85 |             {
 86 |                 return *this + (-v);
 87 |             }
 88 |         }
 89 |         // Assigning an int sets the cursor to that absolute index (the generated scanner does `ts = 0`).
 90 |         RagelIterator& operator=(int v) noexcept
 91 |         {
 92 |             assert(v >= 0);
 93 |             currentCursorEnd_ = static_cast<size_t>(v);
 94 |             updateCategory();
 95 |             return *this;
 96 |         }
 97 |         // Equality compares buffer, size and cursor; the cached category is not part of the comparison.
 98 |         constexpr bool operator==(RagelIterator const& rhs) const noexcept
 99 |         {
100 |             return buffer_ == rhs.buffer_ && size_ == rhs.size_ && currentCursorEnd_ == rhs.currentCursorEnd_;
101 |         }
102 | 
103 |         constexpr bool operator!=(RagelIterator const& rhs) const noexcept { return !(*this == rhs); }
104 |     };
105 | 
106 |     using emoji_text_iter_t = RagelIterator;
107 |     // The generated scanner is written against emoji_text_iter_t, so it is included textually after the alias.
108 | #include "emoji_presentation_scanner.c"
109 | } // namespace
110 | 
111 | emoji_segmenter::emoji_segmenter(char32_t const* buffer, size_t size) noexcept: buffer_ { buffer }, size_ { size }
112 | {
113 |     if (size_) // non-empty input: scan the first token up front so isNextEmoji_ is primed for consume()
114 |         consume_once(); // the returned offset is discarded; only isNextEmoji_ is kept
115 | }
116 | 
117 | bool emoji_segmenter::consume(out<size_t> size, out<PresentationStyle> emoji) noexcept
118 | { // Advances to the next run of equal presentation style; returns false once no input is left.
119 |     // 01234567890123456
120 |     // "A EMOJI"
121 |     //  []     |
122 |     //   []    |
123 |     //    [----]
124 | 
125 |     // "ABC EMOJI DEFGH"
126 |     //  [---]    |     |
127 |     //      [----]     |
128 |     //           [-----]
129 |     // The new segment starts where the previous one ended and takes the style found by the last scan.
130 |     currentCursorBegin_ = currentCursorEnd_;
131 |     currentCursorEnd_ = nextCursorBegin_;
132 |     isEmoji_ = isNextEmoji_;
133 |     // Nothing left to segment.
134 |     if (nextCursorBegin_ >= size_)
135 |         return false;
136 |     // Extend the segment token by token until the scanner reports a different style or the input ends.
137 |     do
138 |     {
139 |         auto const o = consume_once();
140 | 
141 |         if (isEmoji_ != isNextEmoji_)
142 |         {
143 |             nextCursorBegin_ = o; // NOTE: overwritten again by the assignment after the loop
144 |             break;
145 |         }
146 | 
147 |         currentCursorEnd_ = o;
148 |     } while (currentCursorEnd_ < size_);
149 |     // Report the segment's end index (absolute position in the buffer) and its style.
150 |     size.assign(currentCursorEnd_);
151 |     emoji.assign(isEmoji_ ? PresentationStyle::Emoji : PresentationStyle::Text);
152 |     nextCursorBegin_ = currentCursorEnd_;
153 | 
154 |     return true;
155 | }
156 | 
157 | size_t emoji_segmenter::consume_once() // scans exactly one token starting at currentCursorEnd_
158 | {
159 |     auto const i = RagelIterator(buffer_, size_, currentCursorEnd_); // scan start
160 |     auto const e = RagelIterator(buffer_, size_, size_);             // end of input
161 |     auto const o = scan_emoji_presentation(i, e, &isNextEmoji_);     // also stores the token's style in isNextEmoji_
162 |     return o.cursor(); // cursor of the iterator returned by the scanner
163 | }
164 | 
165 | } // namespace unicode
166 | 


--------------------------------------------------------------------------------
/src/libunicode/emoji_segmenter.h:
--------------------------------------------------------------------------------
  1 | /**
  2 |  * This file is part of the "libunicode" project
  3 |  *   Copyright (c) 2020 Christian Parpart <christian@parpart.family>
  4 |  *
  5 |  * Licensed under the Apache License, Version 2.0 (the "License");
  6 |  * you may not use this file except in compliance with the License.
  7 |  *
  8 |  * Unless required by applicable law or agreed to in writing, software
  9 |  * distributed under the License is distributed on an "AS IS" BASIS,
 10 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 11 |  * See the License for the specific language governing permissions and
 12 |  * limitations under the License.
 13 |  */
 14 | #pragma once
 15 | 
 16 | #include <libunicode/support.h>
 17 | 
 18 | #include <format>
 19 | #include <ostream>
 20 | 
 21 | namespace unicode
 22 | {
 23 | 
 24 | /// Distinguishes segments rendered as standard text from segments rendered as emoji.
 25 | enum class PresentationStyle
 26 | {
 27 |     Text, // value-initialized default (first enumerator)
 28 |     Emoji
 29 | };
 30 | 
 31 | enum class EmojiSegmentationCategory : int8_t // values 0..15 match the symbols in emoji_presentation_scanner.rl
 32 | {
 33 |     Invalid = -1, // not a scanner symbol
 34 |     Emoji = 0,
 35 |     EmojiTextPresentation = 1,
 36 |     EmojiEmojiPresentation = 2,
 37 |     EmojiModifierBase = 3,
 38 |     EmojiModifier = 4,
 39 |     EmojiVSBase = 5,
 40 |     RegionalIndicator = 6,
 41 |     KeyCapBase = 7,
 42 |     CombiningEnclosingKeyCap = 8,
 43 |     CombiningEnclosingCircleBackslash = 9,
 44 |     ZWJ = 10,
 45 |     VS15 = 11,
 46 |     VS16 = 12,
 47 |     TagBase = 13,
 48 |     TagSequence = 14,
 49 |     TagTerm = 15,
 50 | };
 51 | 
 52 | /**
 53 |  * emoji_segmenter API for segmenting emojis into text-emoji and emoji-emoji presentations.
 54 |  *
 55 |  * This segmenter is segmenting emojis by their presentation property (text or emoji), that is,
 56 |  * whether an emoji is to be rendered in text mode or in emoji (colored) mode.
 57 |  *
 58 |  * It must be segmenting only emojis and not any other codepoints.
 59 |  */
 60 | class emoji_segmenter
 61 | {
 62 |   private:
 63 |     char32_t const* buffer_ = U""; // start of the input; only the pointer is stored
 64 |     size_t size_ = 0;              // number of code points in buffer_
 65 | 
 66 |     size_t currentCursorBegin_ = 0; // start index of the current segment
 67 |     size_t currentCursorEnd_ = 0;   // end index (exclusive) of the current segment
 68 |     size_t nextCursorBegin_ = 0;    // position the next consume() call continues from
 69 | 
 70 |     bool isEmoji_ = false;     // presentation of the current segment
 71 |     bool isNextEmoji_ = false; // presentation reported by the most recent consume_once() scan
 72 | 
 73 |   public:
 74 |     using property_type = PresentationStyle;
 75 | 
 76 |     constexpr emoji_segmenter() noexcept = default;
 77 |     constexpr emoji_segmenter& operator=(emoji_segmenter const&) noexcept = default;
 78 |     constexpr emoji_segmenter& operator=(emoji_segmenter&&) noexcept = default;
 79 |     constexpr emoji_segmenter(emoji_segmenter const&) noexcept = default;
 80 |     constexpr emoji_segmenter(emoji_segmenter&&) noexcept = default;
 81 | 
 82 |     emoji_segmenter(char32_t const* buffer, size_t size) noexcept;
 83 | 
 84 |     emoji_segmenter(std::u32string_view const& sv) noexcept: emoji_segmenter(sv.data(), sv.size()) {}
 85 |     // Read-only accessors for the input and the bounds of the current segment.
 86 |     constexpr char32_t const* buffer() const noexcept { return buffer_; }
 87 |     constexpr size_t size() const noexcept { return size_; }
 88 |     constexpr size_t currentCursorBegin() const noexcept { return currentCursorBegin_; }
 89 |     constexpr size_t currentCursorEnd() const noexcept { return currentCursorEnd_; }
 90 |     /// Advances to the next segment; @p size and @p emoji are the output parameters.
 91 |     bool consume(out<size_t> size, out<PresentationStyle> emoji) noexcept;
 92 | 
 93 |     /// @returns whether the current segment is to be rendered in text presentation.
 94 |     constexpr bool isText() const noexcept { return !isEmoji_; }
 95 | 
 96 |     /// @returns whether the current segment is to be rendered in emoji presentation.
 97 |     constexpr bool isEmoji() const noexcept { return isEmoji_; }
 98 | 
 99 |     /// @returns a view of the current segment; empty while currentCursorEnd() is still 0.
100 |     constexpr std::u32string_view substr() const noexcept
101 |     {
102 |         // TODO: provide such an accessor in text_run_segmenter
103 |         if (currentCursorEnd_ > 0)
104 |             return std::u32string_view(buffer_ + currentCursorBegin_, currentCursorEnd_ - currentCursorBegin_);
105 |         else
106 |             return std::u32string_view {};
107 |     }
108 | 
109 |     /// @returns the same view as substr().
110 |     constexpr std::u32string_view operator*() const noexcept { return substr(); }
111 | 
112 |   private:
113 |     size_t consume_once(); // internal scan step; defined in emoji_segmenter.cpp
114 | };
115 | 
116 | inline std::ostream& operator<<(std::ostream& os, PresentationStyle ps) // writes the enumerator name
117 | {
118 |     switch (ps)
119 |     {
120 |         case PresentationStyle::Text: return os << "Text";
121 |         case PresentationStyle::Emoji: return os << "Emoji";
122 |     }
123 |     return os; // value outside the enum: nothing is written
124 | }
125 | 
126 | inline std::ostream& operator<<(std::ostream& os, EmojiSegmentationCategory value) // writes the enumerator name
127 | {
128 |     switch (value)
129 |     {
130 |         case unicode::EmojiSegmentationCategory::Invalid: return os << "Invalid";
131 |         case unicode::EmojiSegmentationCategory::Emoji: return os << "Emoji";
132 |         case unicode::EmojiSegmentationCategory::EmojiTextPresentation: return os << "EmojiTextPresentation";
133 |         case unicode::EmojiSegmentationCategory::EmojiEmojiPresentation: return os << "EmojiEmojiPresentation";
134 |         case unicode::EmojiSegmentationCategory::EmojiModifierBase: return os << "EmojiModifierBase";
135 |         case unicode::EmojiSegmentationCategory::EmojiModifier: return os << "EmojiModifier";
136 |         case unicode::EmojiSegmentationCategory::EmojiVSBase: return os << "EmojiVSBase";
137 |         case unicode::EmojiSegmentationCategory::RegionalIndicator: return os << "RegionalIndicator";
138 |         case unicode::EmojiSegmentationCategory::KeyCapBase: return os << "KeyCapBase";
139 |         case unicode::EmojiSegmentationCategory::CombiningEnclosingKeyCap: return os << "CombiningEnclosingKeyCap";
140 |         case unicode::EmojiSegmentationCategory::CombiningEnclosingCircleBackslash:
141 |             return os << "CombiningEnclosingCircleBackslash";
142 |         case unicode::EmojiSegmentationCategory::ZWJ: return os << "ZWJ";
143 |         case unicode::EmojiSegmentationCategory::VS15: return os << "VS15";
144 |         case unicode::EmojiSegmentationCategory::VS16: return os << "VS16";
145 |         case unicode::EmojiSegmentationCategory::TagBase: return os << "TagBase";
146 |         case unicode::EmojiSegmentationCategory::TagSequence: return os << "TagSequence";
147 |         case unicode::EmojiSegmentationCategory::TagTerm: return os << "TagTerm";
148 |     }
149 |     return os; // value outside the enum: nothing is written
150 | }
151 | 
152 | } // namespace unicode
153 | 
154 | template <> // std::format support: prints the enumerator name, reusing the string_view formatter's format spec
155 | struct std::formatter<unicode::PresentationStyle>: std::formatter<std::string_view>
156 | {
157 |     auto format(unicode::PresentationStyle value, auto& ctx) const
158 |     {
159 |         string_view name; // stays empty for a value outside the enum
160 |         switch (value)
161 |         {
162 |             case unicode::PresentationStyle::Text: name = "Text"; break;
163 |             case unicode::PresentationStyle::Emoji: name = "Emoji"; break;
164 |         }
165 |         return formatter<string_view>::format(name, ctx); // delegate so width/alignment specs keep working
166 |     }
167 | };
168 | 
169 | template <> // std::format support: prints the enumerator name, reusing the string_view formatter's format spec
170 | struct std::formatter<unicode::EmojiSegmentationCategory>: std::formatter<std::string_view>
171 | {
172 |     auto format(unicode::EmojiSegmentationCategory value, auto& ctx) const
173 |     {
174 |         using unicode::EmojiSegmentationCategory;
175 |         string_view name; // stays empty for a value outside the enum
176 |         switch (value)
177 |         {
178 |             case EmojiSegmentationCategory::Invalid: name = "Invalid"; break;
179 |             case EmojiSegmentationCategory::Emoji: name = "Emoji"; break;
180 |             case EmojiSegmentationCategory::EmojiTextPresentation: name = "EmojiTextPresentation"; break;
181 |             case EmojiSegmentationCategory::EmojiEmojiPresentation: name = "EmojiEmojiPresentation"; break;
182 |             case EmojiSegmentationCategory::EmojiModifierBase: name = "EmojiModifierBase"; break;
183 |             case EmojiSegmentationCategory::EmojiModifier: name = "EmojiModifier"; break;
184 |             case EmojiSegmentationCategory::EmojiVSBase: name = "EmojiVSBase"; break;
185 |             case EmojiSegmentationCategory::RegionalIndicator: name = "RegionalIndicator"; break;
186 |             case EmojiSegmentationCategory::KeyCapBase: name = "KeyCapBase"; break;
187 |             case EmojiSegmentationCategory::CombiningEnclosingKeyCap: name = "CombiningEnclosingKeyCap"; break;
188 |             case EmojiSegmentationCategory::CombiningEnclosingCircleBackslash: name = "CombiningEnclosingCircleBackslash"; break;
189 |             case EmojiSegmentationCategory::ZWJ: name = "ZWJ"; break;
190 |             case EmojiSegmentationCategory::VS15: name = "VS15"; break;
191 |             case EmojiSegmentationCategory::VS16: name = "VS16"; break;
192 |             case EmojiSegmentationCategory::TagBase: name = "TagBase"; break;
193 |             case EmojiSegmentationCategory::TagSequence: name = "TagSequence"; break;
194 |             case EmojiSegmentationCategory::TagTerm: name = "TagTerm"; break;
195 |         }
196 |         return formatter<string_view>::format(name, ctx); // delegate so width/alignment specs keep working
197 |     }
198 | };
199 | 


--------------------------------------------------------------------------------
/src/libunicode/emoji_segmenter_test.cpp:
--------------------------------------------------------------------------------
  1 | /**
  2 |  * This file is part of the "libunicode" project
  3 |  *   Copyright (c) 2020 Christian Parpart <christian@parpart.family>
  4 |  *
  5 |  * Licensed under the Apache License, Version 2.0 (the "License");
  6 |  * you may not use this file except in compliance with the License.
  7 |  *
  8 |  * Unless required by applicable law or agreed to in writing, software
  9 |  * distributed under the License is distributed on an "AS IS" BASIS,
 10 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 11 |  * See the License for the specific language governing permissions and
 12 |  * limitations under the License.
 13 |  */
 14 | #include <libunicode/emoji_segmenter.h>
 15 | #include <libunicode/grapheme_segmenter.h>
 16 | #include <libunicode/run_segmenter.h>
 17 | #include <libunicode/utf8.h>
 18 | 
 19 | #include <catch2/catch_test_macros.hpp>
 20 | 
 21 | #include <format>
 22 | 
 23 | using namespace unicode;
 24 | using namespace std::string_literals;
 25 | using namespace std;
 26 | 
 27 | namespace
 28 | {
 29 | struct Expectation // one expected segment, positioned within the concatenated test input
 30 | {
 31 |     u32string_view text;                 // the segment's code points
 32 |     size_t start;                        // index of its first code point in the full input
 33 |     size_t end;                          // start + text.size(), i.e. one past its last code point
 34 |     PresentationStyle presentationStyle; // style the segmenter is expected to report
 35 | };
 36 | 
 37 | void test_segments(int lineNo, std::vector<std::pair<std::u32string_view, PresentationStyle>> const& expectations)
 38 | { // Concatenates the expected segments, runs emoji_segmenter over the result and checks every reported run.
 39 |     vector<Expectation> expects;
 40 |     u32string fullText;
 41 |     size_t offset = 0; // start index of the next expected segment within fullText
 42 |     for (auto&& [text, style]: expectations)
 43 |     {
 44 |         expects.push_back(Expectation { text, offset, offset + text.size(), style });
 45 |         fullText += text;
 46 |         offset += text.size();
 47 |     }
 48 | 
 49 |     INFO(std::format("Testing emoji segmentation from line {}: {}", lineNo, to_utf8(fullText)));
 50 | 
 51 |     size_t size {};
 52 |     auto presentationStyle = PresentationStyle {};
 53 |     auto segmenter = unicode::emoji_segmenter { fullText };
 54 |     for (size_t i = 0; i < expectations.size(); ++i)
 55 |     {
 56 |         INFO(std::format("run segmentation for part {}: \"{}\" to be {}",
 57 |                          i,
 58 |                          to_utf8(expectations[i].first),
 59 |                          expectations[i].second)); // printed by name via std::formatter<PresentationStyle>
 60 |         bool const consumeSuccess = segmenter.consume(out(size), out(presentationStyle));
 61 |         REQUIRE(consumeSuccess);
 62 |         CHECK(expectations[i].first == *segmenter);
 63 |         CHECK(size == expects[i].end); // consume() reports the absolute end index of the segment
 64 |         CHECK(presentationStyle == expects[i].presentationStyle);
 65 |     }
 66 |     bool const consumeFail = segmenter.consume(out(size), out(presentationStyle)); // input exhausted: must fail
 67 |     REQUIRE_FALSE(consumeFail);
 68 | }
 69 | } // namespace
 70 | 
 71 | TEST_CASE("emoji_segmenter.Emoji", "[emoji_segmenter]")
 72 | {
 73 |     test_segments(__LINE__, { { U"\U0001F600", PresentationStyle::Emoji } }); // a lone U+1F600 is one Emoji run
 74 | }
 75 | 
 76 | TEST_CASE("emoji_segmenter.Emoji_VS15", "[emoji_segmenter]")
 77 | {
 78 |     test_segments(__LINE__, { { U"\U0001F600\uFE0E", PresentationStyle::Text } }); // emoji + U+FE0E is one Text run
 79 | }
 80 | 
 81 | TEST_CASE("emoji_segmenter.LatinEmoji", "[emoji_segmenter]")
 82 | { // Latin letters followed by an emoji must be reported as two separate runs.
 83 |     test_segments(__LINE__, { { U"AB", PresentationStyle::Text }, { U"😀", PresentationStyle::Emoji } });
 84 | }
 85 | 
 86 | TEST_CASE("emoji_segmenter.EmojiLatin", "[emoji_segmenter]")
 87 | { // An emoji followed by a Latin letter must be reported as two separate runs.
 88 |     test_segments(__LINE__,
 89 |                   {
 90 |                       { U"😀", PresentationStyle::Emoji },
 91 |                       { U"A", PresentationStyle::Text },
 92 |                   });
 93 | }
 94 | 
 95 | TEST_CASE("emoji_segmenter.TwoEmojis", "[emoji_segmenter]")
 96 | { // Two adjacent emoji are merged into a single Emoji run.
 97 |     test_segments(__LINE__,
 98 |                   {
 99 |                       { U"😀😀", PresentationStyle::Emoji },
100 |                   });
101 | }
102 | 
103 | TEST_CASE("emoji_segmenter.LatinCommonEmoji", "[emoji_segmenter]")
104 | { // The space before the emoji stays part of the preceding Text run.
105 |     test_segments(__LINE__,
106 |                   {
107 |                       { U"AB ", PresentationStyle::Text },
108 |                       { U"😀", PresentationStyle::Emoji },
109 |                   });
110 | }
111 | 
112 | TEST_CASE("emoji_segmenter.EmojiTextPresentation", "[emoji_segmenter]")
113 | { // U+270C followed by U+FE0E is expected to be a single Text run.
114 |     test_segments(__LINE__,
115 |                   {
116 |                       { U"\u270c\ufe0e", PresentationStyle::Text },
117 |                   });
118 | }
119 | 
120 | TEST_CASE("emoji_segmenter.emoji.text.emoji", "[emoji_segmenter]")
121 | { // The same code point alternates between Emoji and Text runs depending on a trailing U+FE0E.
122 |     test_segments(__LINE__,
123 |                   {
124 |                       { U"\u270c", PresentationStyle::Emoji },
125 |                       { U"\u270c\ufe0e", PresentationStyle::Text },
126 |                       { U"\u270c", PresentationStyle::Emoji },
127 |                   });
128 | }
129 | 
130 | TEST_CASE("emoji_segmenter.mixed_complex", "[emoji_segmenter]")
131 | { // Text and emoji runs interleaved; U+270C is Victory Hand, U+FE0E after it selects Text.
132 |     test_segments(
133 |         __LINE__,
134 |         {
135 |             { U"Hello(", PresentationStyle::Text },                                        // Latin text
136 |             { U"\u270c\U0001F926\U0001F3FC\u200D\u2642\uFE0F", PresentationStyle::Emoji }, // ✌ Victory Hand + Face Palm ZWJ sequence
137 |             { U"\u270c\ufe0e :-)", PresentationStyle::Text },                              // ✌ Victory Hand + U+FE0E, then ASCII
138 |             { U"\u270c", PresentationStyle::Emoji },                                       // ✌ Victory Hand
139 |             { U")合!", PresentationStyle::Text },                                          // ASCII punctuation around a CJK ideograph
140 |         });
141 | }
142 | 


--------------------------------------------------------------------------------
/src/libunicode/grapheme_segmenter.cpp:
--------------------------------------------------------------------------------
  1 | /**
  2 |  * This file is part of the "libunicode" project
  3 |  *   Copyright (c) 2020 Christian Parpart <christian@parpart.family>
  4 |  *
  5 |  * Licensed under the Apache License, Version 2.0 (the "License");
  6 |  * you may not use this file except in compliance with the License.
  7 |  *
  8 |  * Unless required by applicable law or agreed to in writing, software
  9 |  * distributed under the License is distributed on an "AS IS" BASIS,
 10 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 11 |  * See the License for the specific language governing permissions and
 12 |  * limitations under the License.
 13 |  */
 14 | #include <libunicode/utf8_grapheme_segmenter.h>
 15 | 
 16 | namespace unicode
 17 | {
 18 | 
 19 | void grapheme_process_init(char32_t nextCodepoint, grapheme_segmenter_state& state) noexcept
 20 | {
 21 |     // Starts a new grapheme cluster at nextCodepoint: stores the codepoint and its properties in
 22 |     // `state`. The properties are looked up once and reused rather than fetched a second time.
 23 |     auto const Pb = codepoint_properties::get(nextCodepoint);
 24 |     state.previousCodepoint = nextCodepoint;
 25 |     state.previousProperties = Pb;
 26 |     state.ri_counter = (Pb.grapheme_cluster_break == Grapheme_Cluster_Break::Regional_Indicator) ? 1 : 0;
 27 | }
 28 | 
 29 | bool grapheme_process_breakable(char32_t nextCodepoint, grapheme_segmenter_state& state) noexcept
 30 | {
 31 |     // Decides whether a grapheme cluster boundary lies between state.previousCodepoint (a)
 32 |     // and nextCodepoint (b). Returns true for "break" and false for "same cluster"; in both
 33 |     // cases b and its properties are stored in `state` as the new previous codepoint.
 34 |     auto const a = state.previousCodepoint;
 35 |     auto const A = state.previousProperties.grapheme_cluster_break;
 36 |     auto const b = nextCodepoint;
 37 |     auto const Pb = codepoint_properties::get(b);
 38 |     auto const B = Pb.grapheme_cluster_break;
 39 |     state.previousCodepoint = b;
 40 |     state.previousProperties = Pb;
 41 | 
 42 |     static constexpr char32_t CR = 0x000D; // NOLINT
 43 |     static constexpr char32_t LF = 0x000A; // NOLINT
 44 | 
 45 |     {
 46 |         // Set state.ri_counter to zero if the next codepoint is not of category Regional_Indicator.
 47 |         //
 48 |         // We move the state.ri_counter out to help GCC optimize
 49 |         // this code to be branchless.
 50 |         // Sadly only GCC succeeds in doing this and Clang fails.
 51 |         auto const ri_counter = state.ri_counter;
 52 |         state.ri_counter = (B == Grapheme_Cluster_Break::Regional_Indicator) ? ri_counter : 0;
 53 |     }
 54 | 
 55 |     // GB3: Do not break between a CR and LF. Otherwise, break before and after controls.
 56 |     if (a == CR && b == LF)
 57 |         return false;
 58 | 
 59 |     // GB4 (a) + GB5 (b) part 1 (C0 characters) + US-ASCII shortcut
 60 |     // The US-ASCII part is a pure optimization improving performance
 61 |     // in standard Latin text.
 62 |     if (a < 128 && b < 128)
 63 |         return true;
 64 | 
 65 |     // GB4: (part 2) CR and LF are matched by value, so the break after them also holds when b is not ASCII.
 66 |     if (A == Grapheme_Cluster_Break::Control || a == CR || a == LF)
 67 |         return true;
 68 | 
 69 |     // GB5: (part 2) likewise break before a CR or LF that follows a non-ASCII codepoint.
 70 |     if (B == Grapheme_Cluster_Break::Control || b == CR || b == LF)
 71 |         return true;
 72 | 
 73 |     // Do not break Hangul syllable sequences.
 74 |     // GB6: L x (L | V | LV | LVT)
 75 |     if (A == Grapheme_Cluster_Break::L
 76 |         && (B == Grapheme_Cluster_Break::L || B == Grapheme_Cluster_Break::V || B == Grapheme_Cluster_Break::LV
 77 |             || B == Grapheme_Cluster_Break::LVT))
 78 |         return false;
 79 | 
 80 |     // GB7: (LV | V) x (V | T)
 81 |     if ((A == Grapheme_Cluster_Break::LV || A == Grapheme_Cluster_Break::V)
 82 |         && (B == Grapheme_Cluster_Break::V || B == Grapheme_Cluster_Break::T))
 83 |         return false;
 84 | 
 85 |     // GB8: (LVT | T) x T. The LV x T case is already covered by GB7 above.
 86 |     if ((A == Grapheme_Cluster_Break::LVT || A == Grapheme_Cluster_Break::T) && B == Grapheme_Cluster_Break::T)
 87 |         return false;
 88 | 
 89 |     // GB9: Do not break before extending characters.
 90 |     if (B == Grapheme_Cluster_Break::Extend || B == Grapheme_Cluster_Break::ZWJ)
 91 |         return false;
 92 | 
 93 |     // GB9a: Do not break before SpacingMarks
 94 |     if (B == Grapheme_Cluster_Break::SpacingMark)
 95 |         return false;
 96 | 
 97 |     // GB9b: or after Prepend characters.
 98 |     if (A == Grapheme_Cluster_Break::Prepend)
 99 |         return false;
100 | 
101 |     // GB11: Do not break within emoji modifier sequences or emoji zwj sequences.
102 |     if (A == Grapheme_Cluster_Break::ZWJ && Pb.extended_pictographic())
103 |         return false;
104 | 
105 |     // GB12/GB13: Do not break within emoji flag sequences.
106 |     // That is, do not break between regional indicator (RI) symbols
107 |     // if there is an odd number of RI characters before the break point.
108 |     if (A == Grapheme_Cluster_Break::Regional_Indicator && A == B && state.ri_counter == 1)
109 |     {
110 |         state.ri_counter = static_cast<uint8_t>((state.ri_counter + 1) % 2);
111 |         return false;
112 |     }
113 | 
114 |     // GB999: Otherwise, break everywhere.
115 |     return true;
116 | }
117 | 
118 | } // namespace unicode
119 | 


--------------------------------------------------------------------------------
/src/libunicode/grapheme_segmenter.h:
--------------------------------------------------------------------------------
  1 | /**
  2 |  * This file is part of the "libunicode" project
  3 |  *   Copyright (c) 2020 Christian Parpart <christian@parpart.family>
  4 |  *
  5 |  * Licensed under the Apache License, Version 2.0 (the "License");
  6 |  * you may not use this file except in compliance with the License.
  7 |  *
  8 |  * Unless required by applicable law or agreed to in writing, software
  9 |  * distributed under the License is distributed on an "AS IS" BASIS,
 10 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 11 |  * See the License for the specific language governing permissions and
 12 |  * limitations under the License.
 13 |  */
 14 | #pragma once
 15 | 
 16 | #include <libunicode/codepoint_properties.h>
 17 | #include <libunicode/ucd.h>
 18 | 
 19 | #include <string_view>
 20 | 
 21 | namespace unicode
 22 | {
 23 | 
 24 | /// State carried from one codepoint to the next during grapheme segmentation.
 25 | ///
 26 | /// It caches the previous codepoint together with its properties, so that
 27 | /// codepoint_properties::get() is only needed for the incoming codepoint,
 28 | /// and it tracks regional indicators so that flag sequences are paired up.
 29 | struct grapheme_segmenter_state
 30 | {
 31 |     char32_t previousCodepoint = {};                                        // the codepoint processed last
 32 |     codepoint_properties previousProperties = codepoint_properties::get(0); // properties of previousCodepoint
 33 |     // 1 while a Regional_Indicator is waiting for its partner, otherwise 0.
 34 |     uint8_t ri_counter = 0; // modulo 2
 35 | };
 36 | 
 37 | void grapheme_process_init(char32_t nextCodepoint, grapheme_segmenter_state& state) noexcept; // seeds @p state
 38 | 
 39 | /// Tests if a grapheme cluster boundary lies between the codepoint held in @p state and @p nextCodepoint.
 40 | /// On return @p state holds @p nextCodepoint as the previous codepoint for the following call.
 41 | /// @retval true both codepoints do not belong to the same grapheme cluster
 42 | /// @retval false both codepoints belong to the same grapheme cluster
 43 | bool grapheme_process_breakable(char32_t nextCodepoint, grapheme_segmenter_state& state) noexcept;
 44 | 
 45 | /// Walks a UTF-32 range cluster by cluster (http://www.unicode.org/reports/tr29/tr29-27.html#Grapheme_Cluster_Boundary_Rules)
 46 | class grapheme_segmenter
 47 | {
 48 |   public:
 49 |     /// Binds to [begin, end) and immediately steps onto the first cluster.
 50 |     grapheme_segmenter(char32_t const* begin, char32_t const* end) noexcept:
 51 |         left_ { begin }, right_ { begin }, end_ { end }, state_ {}
 52 |     {
 53 |         ++*this;
 54 |     }
 55 | 
 56 |     grapheme_segmenter(std::u32string_view sv) noexcept: grapheme_segmenter(sv.data(), sv.data() + sv.size()) {}
 57 | 
 58 |     grapheme_segmenter() noexcept: grapheme_segmenter({}, {}) {}
 59 | 
 60 |     /// Advances to the next cluster; once the input is used up the current cluster is empty.
 61 |     grapheme_segmenter& operator++() noexcept
 62 |     {
 63 |         left_ = right_;
 64 |         if (right_ != end_)
 65 |         {
 66 |             grapheme_process_init(*right_, state_);
 67 |             for (++right_; right_ != end_; ++right_)
 68 |                 if (grapheme_process_breakable(*right_, state_))
 69 |                     break;
 70 |         }
 71 |         return *this;
 72 |     }
 73 | 
 74 |     /// The codepoints that make up the current cluster.
 75 |     constexpr std::u32string_view operator*() const noexcept
 76 |     {
 77 |         return { left_, static_cast<size_t>(right_ - left_) };
 78 |     }
 79 | 
 80 |     constexpr bool codepointsAvailable() const noexcept { return !(right_ == end_); }
 81 | 
 82 |     constexpr operator bool() const noexcept { return codepointsAvailable(); }
 83 | 
 84 |     constexpr bool operator==(grapheme_segmenter const& rhs) const noexcept
 85 |     {
 86 |         if (left_ == rhs.left_ && right_ == rhs.right_)
 87 |             return true;
 88 |         return !(codepointsAvailable() || rhs.codepointsAvailable());
 89 |     }
 90 | 
 91 |     /// Decides whether a grapheme cluster boundary lies between codepoint @p a and codepoint @p b.
 92 |     /// @retval true both codepoints do not belong to the same grapheme cluster
 93 |     /// @retval false both codepoints belong to the same grapheme cluster
 94 |     static bool breakable(char32_t a, char32_t b) noexcept
 95 |     {
 96 |         grapheme_segmenter_state pairState {};
 97 |         grapheme_process_init(a, pairState);
 98 |         return grapheme_process_breakable(b, pairState);
 99 |     }
100 | 
101 |     static bool nonbreakable(char32_t a, char32_t b) noexcept { return !breakable(a, b); }
102 | 
103 |   private:
104 |     char32_t const* left_;  // first codepoint of the current cluster
105 |     char32_t const* right_; // one past the last codepoint of the current cluster
106 |     char32_t const* end_;   // one past the last codepoint of the input
107 |     grapheme_segmenter_state state_;
108 | };
109 | 
110 | } // namespace unicode
111 | 


--------------------------------------------------------------------------------
/src/libunicode/grapheme_segmenter_test.cpp:
--------------------------------------------------------------------------------
  1 | /**
  2 |  * This file is part of the "libunicode" project
  3 |  *   Copyright (c) 2020 Christian Parpart <christian@parpart.family>
  4 |  *
  5 |  * Licensed under the Apache License, Version 2.0 (the "License");
  6 |  * you may not use this file except in compliance with the License.
  7 |  *
  8 |  * Unless required by applicable law or agreed to in writing, software
  9 |  * distributed under the License is distributed on an "AS IS" BASIS,
 10 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 11 |  * See the License for the specific language governing permissions and
 12 |  * limitations under the License.
 13 |  */
 14 | #include <libunicode/convert.h>
 15 | #include <libunicode/grapheme_segmenter.h>
 16 | 
 17 | #include <catch2/catch_test_macros.hpp>
 18 | 
 19 | using namespace unicode;
 20 | using namespace std::string_literals;
 21 | using namespace std;
 22 | 
 23 | // TODO
 24 | // Implement examples from table 1a) at:
 25 | // http://www.unicode.org/reports/tr29/tr29-27.html#Grapheme_Cluster_Boundary_Rules
 26 | 
 27 | TEST_CASE("latin_common", "[grapheme_segmenter]")
 28 | {
 29 |     // Every pair of plain ASCII characters is expected to be breakable.
 30 | 
 31 |     CHECK(grapheme_segmenter::breakable('a', 'b'));
 32 |     CHECK(grapheme_segmenter::breakable('b', '!'));
 33 |     CHECK(grapheme_segmenter::breakable('!', '.'));
 34 | }
 35 | 
 36 | TEST_CASE("combining character sequences", "[grapheme_segmenter]")
 37 | {
 38 |     // 'g' followed by U+0308 (combining diaeresis) must stay in one cluster.
 39 | 
 40 |     CHECK(grapheme_segmenter::nonbreakable('g', U'\u0308'));
 41 | }
 42 | 
 43 | // TEST_CASE("Extended grapheme clusters", "[grapheme_segmenter]")
 44 | // {
 45 | //     // TODO: Hangul Syllables support, can't enable this test yet
 46 | //     CHECK(grapheme_segmenter::nonbreakable(U'\u0BA8', U'\u0BBF'));   // Tamil ni
 47 | //     CHECK(grapheme_segmenter::nonbreakable(U'\u0E40', 'e'));         // Thai e
 48 | //     CHECK(grapheme_segmenter::nonbreakable(U'\u0E01', U'\u0E33'));   // Thai kam
 49 | //     CHECK(grapheme_segmenter::nonbreakable(U'\u0937', U'\u093F'));   // Devanagari ssi
 50 | // }
 51 | 
 52 | TEST_CASE("emoji.speaking-eye", "[grapheme_segmenter]")
 53 | {
 54 |     /* No adjacent pair of the following sequence may be breakable:
 55 |        U+1F441     Eye
 56 |        U+FE0F      VS16
 57 |        U+200D      ZWJ
 58 |        U+1F5E8     Left Speech Bubble
 59 |        U+FE0F      VS16
 60 |      */
 61 |     auto const zwj = u32string_view { U"\U0001F441\uFE0F\u200D\U0001F5E8\uFE0F" };
 62 |     CHECK(grapheme_segmenter::nonbreakable(zwj[0], zwj[1]));
 63 |     CHECK(grapheme_segmenter::nonbreakable(zwj[1], zwj[2]));
 64 |     CHECK(grapheme_segmenter::nonbreakable(zwj[2], zwj[3]));
 65 |     CHECK(grapheme_segmenter::nonbreakable(zwj[3], zwj[4]));
 66 | }
 67 | 
 68 | TEST_CASE("emoji", "[grapheme_segmenter]")
 69 | {
 70 |     // U+1F468 (man), U+200D (ZWJ), U+1F9B0 (red hair component): no pair may be breakable.
 71 |     auto const zwj = u32string_view { U"\U0001F468\u200D\U0001F9B0" };
 72 |     CHECK(grapheme_segmenter::nonbreakable(zwj[0], zwj[1]));
 73 |     CHECK(grapheme_segmenter::nonbreakable(zwj[1], zwj[2]));
 74 | 
 75 |     // U+1F468 (man), ZWJ, U+1F469 (woman), ZWJ, U+1F467 (girl): five codepoints, four pairs.
 76 |     auto const zwj3 = u32string_view { U"\U0001F468\u200D\U0001F469\u200D\U0001F467" };
 77 |     CHECK(grapheme_segmenter::nonbreakable(zwj3[0], zwj3[1]));
 78 |     CHECK(grapheme_segmenter::nonbreakable(zwj3[1], zwj3[2]));
 79 |     CHECK(grapheme_segmenter::nonbreakable(zwj3[2], zwj3[3]));
 80 |     CHECK(grapheme_segmenter::nonbreakable(zwj3[3], zwj3[4]));
 81 | }
 82 | 
 83 | TEST_CASE("emoji: Man Facepalming: Medium-Light Skin Tone", "[grapheme_segmenter]")
 84 | {
 85 |     // U+1F926 face palm, U+1F3FC skin tone modifier, U+200D ZWJ, U+2642 male sign, U+FE0F VS16.
 86 |     auto const zwj = u32string_view { U"\U0001F926\U0001F3FC\u200D\u2642\uFE0F" };
 87 |     CHECK(grapheme_segmenter::nonbreakable(zwj[0], zwj[1]));
 88 |     CHECK(grapheme_segmenter::nonbreakable(zwj[1], zwj[2]));
 89 |     CHECK(grapheme_segmenter::nonbreakable(zwj[2], zwj[3]));
 90 |     CHECK(grapheme_segmenter::nonbreakable(zwj[3], zwj[4]));
 91 | }
 92 | 
 93 | TEST_CASE("grapheme_segmenter.iterator_1", "[grapheme_segmenter]")
 94 | {
 95 |     auto const codepoints = u32string_view { U"\U0001F926\U0001F3FC\u200D\u2642\uFE0F" };
 96 |     auto gs = grapheme_segmenter { codepoints };
 97 | 
 98 |     // The constructor has already processed the first cluster, which here spans the whole input.
 99 |     CHECK(*gs == codepoints);
100 |     CHECK_FALSE(gs.codepointsAvailable());
101 | }
102 | 
103 | TEST_CASE("grapheme_segmenter.iterator_2", "[grapheme_segmenter]")
104 | {
105 |     auto const grapheme_cluster2 = u32string_view { U"\U0001F926\U0001F3FC\u200D\u2642\uFE0F" };
106 |     auto const codepoints = u32string_view { U"X\U0001F926\U0001F3FC\u200D\u2642\uFE0F5" };
107 |     auto gs = grapheme_segmenter { codepoints };
108 | 
109 |     // 1st grapheme cluster: the plain 'X' (available right after construction)
110 |     CHECK(*gs == U"X");
111 |     CHECK(gs.codepointsAvailable());
112 | 
113 |     // 2nd grapheme cluster: the five-codepoint emoji sequence
114 |     ++gs;
115 |     CHECK(*gs == grapheme_cluster2);
116 |     CHECK(gs.codepointsAvailable());
117 | 
118 |     // 3rd grapheme cluster: the trailing '5'; nothing is left after it
119 |     ++gs;
120 |     CHECK(*gs == U"5");
121 |     CHECK_FALSE(gs.codepointsAvailable());
122 | 
123 |     // incrementing beyond end of stream yields an empty cluster
124 |     ++gs;
125 |     CHECK(*gs == U"");
126 |     CHECK_FALSE(gs.codepointsAvailable());
127 | }
128 | 
129 | TEST_CASE("grapheme_segmenter.iterator_3: regional flags", "[grapheme_segmenter]")
130 | { // Six regional indicators in a row must be paired up into three flag clusters.
131 |     auto const ri_DE = u32string { U"\U0001F1E9\U0001F1EA" }; // regional indicators D + E
132 |     auto const ri_JP = u32string { U"\U0001F1EF\U0001F1F5" }; // regional indicators J + P
133 |     auto const codepoints = ri_DE + ri_DE + ri_JP;
134 |     auto gs = grapheme_segmenter { codepoints };
135 | 
136 |     // first grapheme cluster
137 |     REQUIRE(*gs == ri_DE);
138 |     REQUIRE(gs.codepointsAvailable());
139 | 
140 |     // second grapheme cluster
141 |     ++gs;
142 |     REQUIRE(*gs == ri_DE);
143 |     REQUIRE(gs.codepointsAvailable());
144 | 
145 |     // 3rd grapheme cluster
146 |     ++gs;
147 |     REQUIRE(*gs == ri_JP);
148 |     REQUIRE_FALSE(gs.codepointsAvailable());
149 | 
150 |     // incrementing beyond end of stream
151 |     ++gs;
152 |     REQUIRE(*gs == U"");
153 |     REQUIRE_FALSE(gs.codepointsAvailable());
154 | }
155 | 
156 | TEST_CASE("grapheme_segmenter.iterator_3: regional flags invalid 1", "[grapheme_segmenter]")
157 | { // An unpaired regional indicator followed by a Latin letter must form a cluster of its own.
158 |     auto const ri_DE = u32string { U"\U0001F1E9\U0001F1EA" }; // regional indicators D + E
159 |     auto const ri_J = u32string { U"\U0001F1EF" };            // regional indicator J without a partner
160 |     auto const codepoints = ri_DE + ri_DE + ri_J + U"P";
161 |     auto gs = grapheme_segmenter { codepoints };
162 | 
163 |     // first grapheme cluster
164 |     REQUIRE(*gs == ri_DE);
165 |     REQUIRE(gs.codepointsAvailable());
166 | 
167 |     // second grapheme cluster
168 |     ++gs;
169 |     REQUIRE(*gs == ri_DE);
170 |     REQUIRE(gs.codepointsAvailable());
171 | 
172 |     // 3rd grapheme cluster
173 |     ++gs;
174 |     REQUIRE(*gs == ri_J);
175 |     REQUIRE(gs.codepointsAvailable());
176 | 
177 |     // 4th grapheme cluster
178 |     ++gs;
179 |     REQUIRE(*gs == U"P");
180 |     REQUIRE_FALSE(gs.codepointsAvailable());
181 | 
182 |     // incrementing beyond end of stream
183 |     ++gs;
184 |     REQUIRE(*gs == U"");
185 |     REQUIRE_FALSE(gs.codepointsAvailable());
186 | }
187 | 
188 | TEST_CASE("grapheme_segmenter.iterator_3: regional flags invalid 2", "[grapheme_segmenter]")
189 | { // A Latin letter between flags, then an unpaired regional indicator before another letter.
190 |     auto const ri_DE = u32string { U"\U0001F1E9\U0001F1EA" }; // regional indicators D + E
191 |     auto const ri_J = u32string { U"\U0001F1EF" };            // regional indicator J without a partner
192 |     auto const codepoints = ri_DE + ri_DE + U"Q" + ri_J + U"P";
193 |     auto gs = grapheme_segmenter { codepoints };
194 | 
195 |     // 1
196 |     REQUIRE(*gs == ri_DE);
197 |     REQUIRE(gs.codepointsAvailable());
198 | 
199 |     // 2
200 |     ++gs;
201 |     REQUIRE(*gs == ri_DE);
202 |     REQUIRE(gs.codepointsAvailable());
203 | 
204 |     // 3
205 |     ++gs;
206 |     REQUIRE(*gs == U"Q");
207 |     REQUIRE(gs.codepointsAvailable());
208 | 
209 |     // 4
210 |     ++gs;
211 |     REQUIRE(*gs == ri_J);
212 |     REQUIRE(gs.codepointsAvailable());
213 | 
214 |     // 5
215 |     ++gs;
216 |     REQUIRE(*gs == U"P");
217 |     REQUIRE_FALSE(gs.codepointsAvailable());
218 | 
219 |     // incrementing beyond end of stream
220 |     ++gs;
221 |     REQUIRE(*gs == U"");
222 |     REQUIRE_FALSE(gs.codepointsAvailable());
223 | }
224 | 


--------------------------------------------------------------------------------
/src/libunicode/libunicode-config.cmake.in:
--------------------------------------------------------------------------------
1 | @PACKAGE_INIT@
2 | 
3 | # Load the exported targets only once: skip the include when unicode::core already exists.
4 | if(NOT TARGET unicode::core)
5 |     include(${CMAKE_CURRENT_LIST_DIR}/libunicode-targets.cmake)
6 | endif()
7 | # Report name and version; the @...@ placeholders are presumably filled in when this template is configured.
8 | message(STATUS "Found @PROJECT_NAME@, version: ${@PROJECT_NAME@_VERSION}")
9 | 


--------------------------------------------------------------------------------
/src/libunicode/multistage_table_generator.h:
--------------------------------------------------------------------------------
  1 | /**
  2 |  * This file is part of the "libunicode" project
  3 |  *   Copyright (c) 2022 Christian Parpart <christian@parpart.family>
  4 |  *
  5 |  * Licensed under the Apache License, Version 2.0 (the "License");
  6 |  * you may not use this file except in compliance with the License.
  7 |  *
  8 |  * Unless required by applicable law or agreed to in writing, software
  9 |  * distributed under the License is distributed on an "AS IS" BASIS,
 10 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 11 |  * See the License for the specific language governing permissions and
 12 |  * limitations under the License.
 13 |  */
 14 | #pragma once
 15 | 
 16 | #include <libunicode/multistage_table_view.h>
 17 | #include <libunicode/scoped_timer.h>
 18 | 
 19 | #include <algorithm>
 20 | #include <cassert>
 21 | #include <cstdint>
 22 | #include <iomanip>
 23 | #include <iterator>
 24 | #include <limits>
 25 | #include <optional>
 26 | #include <sstream>
 27 | #include <stdexcept>
 28 | #include <vector>
 29 | 
 30 | namespace support
 31 | {
 32 | 
 33 | template <typename T,
 34 |           typename SourceType,
 35 |           typename Stage1ElementType,
 36 |           typename Stage2ElementType,
 37 |           SourceType BlockSize,
 38 |           SourceType MaxValue = std::numeric_limits<SourceType>::max()>
 39 | struct multistage_table
 40 | { // Owning storage for the three lookup stages; to_view() exposes them as raw pointers.
 41 |     using view_type = multistage_table_view<T, SourceType, Stage1ElementType, Stage2ElementType, BlockSize, MaxValue>;
 42 | 
 43 |     std::vector<Stage1ElementType> stage1; // div: per block of BlockSize indices, the number of its stage2 block
 44 |     std::vector<Stage2ElementType> stage2; // mod: per element of a stage2 block, its index into stage3
 45 |     std::vector<T> stage3;                 // values: the property values themselves
 46 |     // The view holds pointers into the vectors above. NOTE(review): presumably stale once they reallocate.
 47 |     auto to_view() const noexcept { return view_type { stage1.data(), stage2.data(), stage3.data() }; }
 48 |     // Looks up the value for `index` through the view's get().
 49 |     T const& get(SourceType index) const noexcept { return to_view().get(index); }
 50 | };
 51 | 
 52 | template <typename T,
 53 |           typename SourceType,
 54 |           typename Stage1ElementType,
 55 |           typename Stage2ElementType,
 56 |           typename Stage3Finder,
 57 |           SourceType BlockSize,
 58 |           SourceType MaxValue = std::numeric_limits<SourceType>::max()>
 59 | class multistage_table_generator
 60 | { // Builds the three stages of a multistage_table from a flat input array of values.
 61 |   public:
 62 |     T const* _input;   // flat array holding one value per source index
 63 |     size_t _inputSize; // number of elements in _input; generate() asserts it is a multiple of BlockSize
 64 |     multistage_table<T, SourceType, Stage1ElementType, Stage2ElementType, BlockSize, MaxValue>& _output;
 65 |     Stage3Finder _stage3Finder; // called as (begin, end, value); returns an iterator to a reusable entry, or end
 66 |     /// Fills _output from the input, one block at a time. An input shorter than one block creates no blocks.
 67 |     void generate()
 68 |     {
 69 |         assert(_inputSize % BlockSize == 0);
 70 |         _output.stage1.resize(_inputSize / BlockSize);
 71 |         for (size_t blockStart = 0; blockStart + BlockSize <= _inputSize; blockStart += BlockSize)
 72 |             _output.stage1[blockStart / BlockSize] = get_or_create_index_to_stage2_block(static_cast<SourceType>(blockStart));
 73 |     }
 74 |     /// Compares every input element with the generated table, visiting each block exactly once.
 75 |     void verify() const
 76 |     {
 77 |         for (size_t blockStart = 0; blockStart + BlockSize <= _inputSize; blockStart += BlockSize)
 78 |             verify_block(static_cast<SourceType>(blockStart / BlockSize));
 79 |     }
 80 | 
 81 |   private:
 82 |     void verify_block(SourceType blockNumber) const
 83 |     {
 84 |         for (SourceType codepoint = blockNumber * BlockSize; codepoint < (blockNumber + 1) * BlockSize; ++codepoint)
 85 |         {
 86 |             auto const& a = _input[codepoint];
 87 |             auto const& b = _output.get(codepoint);
 88 |             if (a != b)
 89 |             {
 90 |                 std::ostringstream message; // named stream: .str() is not available on the std::ostream& a chain may yield
 91 |                 message << "U+" << std::hex << unsigned(codepoint) << " mismatch in properties.\n"
 92 |                         << "Expected : " << a << "\nActual   : " << b;
 93 |                 throw std::runtime_error(message.str());
 94 |             }
 95 |         }
 96 |     }
 97 |     /// Returns the stage2 block number for the input block at blockStart, appending a new stage2 block if needed.
 98 |     Stage1ElementType get_or_create_index_to_stage2_block(SourceType blockStart)
 99 |     {
100 |         if (auto other_block = find_same_block(static_cast<size_t>(blockStart)))
101 |             return _output.stage1[other_block.value()];
102 | 
103 |         // Block has not been seen yet. Create a new block.
104 |         auto const stage2Index = _output.stage2.size() / BlockSize;
105 |         assert(stage2Index < std::numeric_limits<Stage1ElementType>::max()); // the block number is stored in stage1
106 | 
107 |         for (SourceType index = blockStart; index < blockStart + BlockSize; ++index)
108 |             _output.stage2.emplace_back(get_or_create_stage3_index(index));
109 | 
110 |         assert(_output.stage2.size() % BlockSize == 0);
111 | 
112 |         return static_cast<Stage1ElementType>(stage2Index);
113 |     }
114 |     /// Searches the blocks before blockStart for one with identical content; yields its block number if found.
115 |     std::optional<size_t> find_same_block(size_t blockStart) const noexcept
116 |     {
117 |         assert(blockStart % BlockSize == 0);
118 |         assert(blockStart + BlockSize <= _inputSize);
119 | 
120 |         for (size_t otherBlockStart = 0; otherBlockStart < blockStart; otherBlockStart += BlockSize)
121 |             if (is_same_block(otherBlockStart, blockStart))
122 |                 return { otherBlockStart / BlockSize };
123 | 
124 |         return std::nullopt;
125 |     }
126 | 
127 |     /// Tests if two given blocks are equivalent.
128 |     /// @p a and @p b are both absolute offsets to the start of each block.
129 |     bool is_same_block(size_t a, size_t b) const noexcept
130 |     {
131 |         assert(a % BlockSize == 0);
132 |         assert(b % BlockSize == 0);
133 |         assert(a + BlockSize <= _inputSize);
134 |         assert(b + BlockSize <= _inputSize);
135 | 
136 |         for (size_t i = 0; i < BlockSize; ++i)
137 |             if (_input[a + i] != _input[b + i])
138 |                 return false;
139 | 
140 |         return true;
141 |     }
142 |     /// Returns the stage3 index for the input element at stage1Index, appending the value when the finder has no match.
143 |     Stage2ElementType get_or_create_stage3_index(SourceType stage1Index)
144 |     {
145 |         auto& properties = _output.stage3;
146 |         auto const propertyIterator = _stage3Finder(properties.begin(), properties.end(), _input[stage1Index]);
147 |         if (propertyIterator != properties.end())
148 |             return static_cast<Stage2ElementType>(std::distance(properties.begin(), propertyIterator));
149 | 
150 |         auto const stage3Index = properties.size();
151 |         properties.emplace_back(_input[stage1Index]);
152 |         assert(stage3Index < std::numeric_limits<Stage2ElementType>::max());
153 |         return static_cast<Stage2ElementType>(stage3Index);
154 |     }
155 | };
156 | 
157 | template <typename T,
158 |           typename SourceType,
159 |           typename Stage1ElementType,
160 |           typename Stage2ElementType,
161 |           typename Stage3Finder,
162 |           SourceType BlockSize,
163 |           SourceType MaxValue = std::numeric_limits<SourceType>::max()>
164 | void generate(T const* input,
165 |               size_t inputSize,
166 |               multistage_table<T, SourceType, Stage1ElementType, Stage2ElementType, BlockSize, MaxValue>& output,
167 |               Stage3Finder&& stage3Finder)
168 | {
169 |     // Convenience entry point: wires the arguments into a generator and runs generate() on it once.
170 |     using generator_type =
171 |         multistage_table_generator<T, SourceType, Stage1ElementType, Stage2ElementType, Stage3Finder, BlockSize, MaxValue>;
172 |     generator_type generator { input, inputSize, output, std::forward<Stage3Finder>(stage3Finder) };
173 |     generator.generate();
174 | }
175 | 
176 | } // namespace support
177 | 


--------------------------------------------------------------------------------
/src/libunicode/multistage_table_view.h:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * This file is part of the "libunicode" project
 3 |  *   Copyright (c) 2022 Christian Parpart <christian@parpart.family>
 4 |  *
 5 |  * Licensed under the Apache License, Version 2.0 (the "License");
 6 |  * you may not use this file except in compliance with the License.
 7 |  *
 8 |  * Unless required by applicable law or agreed to in writing, software
 9 |  * distributed under the License is distributed on an "AS IS" BASIS,
10 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11 |  * See the License for the specific language governing permissions and
12 |  * limitations under the License.
13 |  */
14 | #pragma once
15 | 
16 | #include <cstdint>
17 | #include <limits>
18 | 
19 | namespace support
20 | {
21 | 
22 | template <typename T,
23 |           typename SourceType,
24 |           typename Stage1ElementType,
25 |           typename Stage2ElementType,
26 |           SourceType BlockSize,
27 |           SourceType MaxValue = std::numeric_limits<SourceType>::max()>
28 | struct multistage_table_view
29 | {
30 |     using source_type = SourceType;
31 |     using stage1_element_type = Stage1ElementType;
32 |     using stage2_element_type = Stage2ElementType;
33 |     using value_type = T;
34 | 
35 |     stage1_element_type const* stage1; // per block of BlockSize indices: number of the stage2 block to use
36 |     stage2_element_type const* stage2; // per element of a stage2 block: index into stage3
37 |     value_type const* stage3;          // the values being looked up
38 | 
39 |     static constexpr std::size_t block_size = BlockSize;
40 | 
41 |     /// Range-aware lookup: an index above MaxValue is replaced by @p fallback before the lookup.
42 |     value_type const& get(source_type index, source_type fallback = source_type {}) const noexcept
43 |     {
44 |         if (index <= MaxValue)
45 |             return unsafe_get(index);
46 |         return unsafe_get(fallback);
47 |     }
48 | 
49 |     /// Lookup without any range check on @p index.
50 |     value_type const& unsafe_get(source_type index) const noexcept
51 |     {
52 |         auto const stage2_block = stage1[index / BlockSize];
53 |         auto const slot = stage2_block * BlockSize + index % BlockSize;
54 |         return stage3[stage2[slot]];
55 |     }
56 | };
57 | 
58 | } // namespace support
59 | 


--------------------------------------------------------------------------------
/src/libunicode/run_segmenter.h:
--------------------------------------------------------------------------------
  1 | /**
  2 |  * This file is part of the "libunicode" project
  3 |  *   Copyright (c) 2020 Christian Parpart <christian@parpart.family>
  4 |  *
  5 |  * Licensed under the Apache License, Version 2.0 (the "License");
  6 |  * you may not use this file except in compliance with the License.
  7 |  *
  8 |  * Unless required by applicable law or agreed to in writing, software
  9 |  * distributed under the License is distributed on an "AS IS" BASIS,
 10 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 11 |  * See the License for the specific language governing permissions and
 12 |  * limitations under the License.
 13 |  */
 14 | #pragma once
 15 | 
 16 | #include <libunicode/emoji_segmenter.h>
 17 | #include <libunicode/script_segmenter.h>
 18 | #include <libunicode/support.h>
 19 | #include <libunicode/ucd.h>
 20 | #include <libunicode/ucd_ostream.h>
 21 | 
 22 | #include <array>
 23 | #include <iterator>
 24 | #include <ostream>
 25 | #include <tuple>
 26 | 
 27 | namespace unicode
 28 | {
 29 | 
 30 | template <typename T>
 31 | using segmenter_property_t = typename T::property_type; // the property_type member of segmenter type T
 32 | 
 33 | template <typename... Ts>
 34 | using segmenter_property_tuple = std::tuple<segmenter_property_t<Ts>...>; // one property type per segmenter in Ts
 35 | 
 36 | namespace detail
 37 | { // Streams every element of tuple p to os, in order, each one preceded by prep.
 38 |     template <typename Prepend, typename... Ts>
 39 |     inline void _continuePrintList(std::ostream& os, Prepend const& prep, std::tuple<Ts...> const& p)
 40 |     { // std::apply visits the elements by position, so a tuple that repeats a type works too.
 41 |         std::apply([&](auto const&... element) { ((os << prep << element), ...); }, p);
 42 |     }
 43 | } // namespace detail
 44 | 
 45 | /// API for segmenting incoming text into small runs.
 46 | ///
 47 | /// A ``run`` is a unit suitable for text shaping, but may as well be used
 48 | /// for other purposes, too.
 49 | ///
 50 | /// @see script_segmenter
 51 | /// @see emoji_segmenter
 52 | /// @see grapheme_segmenter
 53 | template <typename... Segmenter>
 54 | class basic_run_segmenter
 55 | {
 56 |   public:
 57 |     using property_tuple = std::tuple<segmenter_property_t<Segmenter>...>;
 58 | 
 59 |     /// Contains the extracted information of run_segmenter's single run.
 60 |     struct range
 61 |     {
 62 |         /// start-offset of the current segment that has been extracted
 63 |         size_t start = 0;
 64 | 
 65 |         /// end-offset (excluding) of the current segment that has been extracted
 66 |         size_t end = 0;
 67 | 
 68 |         /// the script (writing system) this segment has been identified with
 69 |         /// presentation style of the underlying segment
 70 |         property_tuple properties;
 71 | 
 72 |         constexpr bool operator==(range other) const noexcept
 73 |         {
 74 |             return start == other.start && end == other.end && properties == other.properties;
 75 |         }
 76 | 
 77 |         constexpr bool operator!=(range other) const noexcept { return !(*this == other); }
 78 | 
 79 |         friend inline std::ostream& operator<<(std::ostream& os, range r)
 80 |         {
 81 |             os << '(' << r.start << ".." << r.end;
 82 |             detail::_continuePrintList(os, ", ", r.properties);
 83 |             os << ')';
 84 |             return os;
 85 |         }
 86 |     };
 87 | 
 88 |     explicit basic_run_segmenter(std::u32string_view sv): basic_run_segmenter(sv.data(), sv.size()) {}
 89 | 
 90 |     basic_run_segmenter(char32_t const* text, size_t size): segmenter_ {}, size_ { size }
 91 |     {
 92 |         initialize<0, Segmenter...>(text, size);
 93 |     }
 94 | 
 95 |     constexpr bool finished() const noexcept { return lastSplit_ >= size_; }
 96 | 
 97 |     /// Splits input text into segments, such as pure text by script, emoji-emoji, or emoji-text.
 98 |     ///
 99 |     /// @retval true more data can be processed
100 |     /// @retval false end of input data has been reached.
101 |     bool consume(out<range> result)
102 |     {
103 |         if (finished())
104 |             return false;
105 | 
106 |         consumeAllUntilSplitPosition<0, Segmenter...>();
107 | 
108 |         auto const minPosition = std::min_element(begin(positions_), end(positions_));
109 | 
110 |         lastSplit_ = *minPosition;
111 | 
112 |         candidate_.start = candidate_.end;
113 |         candidate_.end = lastSplit_;
114 |         candidate_.properties = properties_;
115 | 
116 |         *result = candidate_;
117 |         return true;
118 |     }
119 | 
120 |   private:
121 |     template <size_t I>
122 |     void initialize(char32_t const*, size_t)
123 |     {
124 |     }
125 | 
126 |     template <size_t I, typename Current, typename... Remaining>
127 |     void initialize(char32_t const* text, size_t size)
128 |     {
129 |         std::get<I>(segmenter_) = Current { text, size };
130 |         initialize<I + 1, Remaining...>(text, size);
131 |     }
132 | 
133 |     template <size_t I>
134 |     void consumeAllUntilSplitPosition()
135 |     {
136 |     }
137 | 
138 |     template <size_t I, typename Current, typename... Remaining>
139 |     void consumeAllUntilSplitPosition()
140 |     {
141 |         consumeUntilSplitPosition(std::get<Current>(segmenter_), out(positions_[I]), out(std::get<I>(properties_)));
142 |         consumeAllUntilSplitPosition<I + 1, Remaining...>();
143 |     }
144 | 
145 |     template <typename TheSegmenter, typename Property>
146 |     void consumeUntilSplitPosition(TheSegmenter& segmenter, out<size_t> position, out<Property> property)
147 |     {
148 |         if (*position > lastSplit_)
149 |             return;
150 | 
151 |         if (*position >= size_)
152 |             return;
153 | 
154 |         for (;;)
155 |         {
156 |             if (!segmenter.consume(position, property))
157 |                 break;
158 | 
159 |             if (*position > lastSplit_)
160 |                 break;
161 |         }
162 |     }
163 | 
164 |     // private data
165 | 
166 |     using position_list = std::array<size_t, sizeof...(Segmenter)>;
167 |     using segmenter_tuple = std::tuple<Segmenter...>;
168 | 
169 |     size_t lastSplit_ = 0;
170 |     range candidate_ = {};
171 |     position_list positions_ {};
172 |     property_tuple properties_ {};
173 |     segmenter_tuple segmenter_;
174 |     size_t const size_;
175 | };
176 | 
/// Default run segmenter, combining script_segmenter and emoji_segmenter (in that order).
using run_segmenter = basic_run_segmenter<script_segmenter, emoji_segmenter>;
178 | 
179 | } // namespace unicode
180 | 


--------------------------------------------------------------------------------
/src/libunicode/scan.h:
--------------------------------------------------------------------------------
  1 | /**
  2 |  * This file is part of the "libunicode" project
  3 |  *   Copyright (c) 2020 Christian Parpart <christian@parpart.family>
  4 |  *
  5 |  * Licensed under the Apache License, Version 2.0 (the "License");
  6 |  * you may not use this file except in compliance with the License.
  7 |  *
  8 |  * Unless required by applicable law or agreed to in writing, software
  9 |  * distributed under the License is distributed on an "AS IS" BASIS,
 10 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 11 |  * See the License for the specific language governing permissions and
 12 |  * limitations under the License.
 13 |  */
 14 | #pragma once
 15 | 
 16 | #include <libunicode/utf8.h>
 17 | 
 18 | #include <string_view>
 19 | 
 20 | namespace unicode
 21 | {
 22 | 
/// Holds the result of a call to scan_text().
struct scan_result
{
    /// Number of columns scanned.
    /// One column equals a single narrow-width codepoint.
    /// Codepoints with property East Asian Width Wide are treated as two columns.
    size_t count;

    /// Pointer to UTF-8 grapheme cluster start.
    char const* start;

    /// Pointer to UTF-8 grapheme cluster end, i.e. one byte behind
    /// the last successfully processed complete UTF-8 byte.
    char const* end;
};
 38 | 
/// Holds the state to keep through a consecutive sequence of calls to scan_text().
///
/// This state holds the UTF-8 decoding state, if processing had to be stopped
/// at an incomplete UTF-8 byte sequence,
/// and the last decoded Unicode codepoint necessary for grapheme cluster segmentation.
struct scan_state
{
    /// UTF-8 decoder state carried over between calls.
    utf8_decoder_state utf8 {};

    /// The last decoded codepoint, kept for grapheme cluster segmentation.
    char32_t lastCodepointHint {};

    /// Pointer to one byte after the last scanned codepoint.
    char const* next {};
};
 52 | 
/// Callback-interface that allows precisely understanding the structure of a UTF-8 sequence.
///
/// An implementation is passed by reference to the scan_text() overload that takes a receiver.
/// NOTE(review): the exact conditions under which each callback fires are decided by the
/// scanner implementation, which is not visible in this header — confirm there.
class grapheme_cluster_receiver
{
  public:
    virtual ~grapheme_cluster_receiver() = default;

    /// Receives a sequence of bytes; presumably a run of plain ASCII text — TODO confirm.
    virtual void receiveAsciiSequence(std::string_view codepoints) noexcept = 0;

    /// Receives the bytes of one grapheme cluster along with its column count.
    virtual void receiveGraphemeCluster(std::string_view codepoints, size_t columnCount) noexcept = 0;

    /// Signals an invalid grapheme cluster; no further data is passed along.
    virtual void receiveInvalidGraphemeCluster() noexcept = 0;
};
 63 | 
/// A grapheme_cluster_receiver that ignores every notification.
class null_receiver final: public grapheme_cluster_receiver
{
  public:
    void receiveAsciiSequence(std::string_view) noexcept override {}
    void receiveGraphemeCluster(std::string_view, size_t) noexcept override {}
    void receiveInvalidGraphemeCluster() noexcept override {}

    /// Returns a reference to a shared, function-local static instance.
    static null_receiver& get() noexcept
    {
        static null_receiver instance {};
        return instance;
    }
};
 78 | 
/// Implementation details of scan_text(); not part of the public API.
namespace detail
{
    /// Scans leading ASCII text of @p text, limited by @p maxColumnCount.
    /// NOTE(review): presumably returns the number of bytes scanned, like the SIMD variants — confirm.
    size_t scan_for_text_ascii(std::string_view text, size_t maxColumnCount) noexcept;

    /// ASCII scanner parameterized on the SIMD register width in bits
    /// (defined in scan_simd_impl.h).
    template <size_t SimdBitWidth>
    size_t scan_for_text_ascii_simd(std::string_view text, size_t maxColumnCount) noexcept;
    /// Forwards to scan_for_text_ascii_simd<256> (see scan256.cpp).
    size_t scan_for_text_ascii_256(std::string_view text, size_t maxColumnCount) noexcept;
    /// Forwards to scan_for_text_ascii_simd<512> (see scan512.cpp).
    size_t scan_for_text_ascii_512(std::string_view text, size_t maxColumnCount) noexcept;
    /// Scans text that is not plain ASCII, using and updating @p state and notifying @p receiver.
    /// NOTE(review): exact semantics are defined in the implementation file — confirm there.
    scan_result scan_for_text_nonascii(scan_state& state,
                                       std::string_view text,
                                       size_t maxColumnCount,
                                       grapheme_cluster_receiver& receiver) noexcept;
} // namespace detail
 92 | 
/// Scans a sequence of UTF-8 encoded bytes.
///
/// This call will return as soon as one of the following conditions is met:
///
/// - given the input sequence, the rightmost invalid or complete UTF-8 sequence is processed,
/// - maxColumnCount is reached and the next grapheme cluster would exceed the given limit,
/// - a control character is about to be processed.
///
/// When this function returns, the output sequence is guaranteed to not end in
/// an incomplete UTF-8 sequence.
///
/// Calling this function again with more bytes will resume decoding that UTF-8 sequence
/// with the help of the passed UTF-8 decoder state.
///
/// @return scanned textual result. That is, a sequence of
///         either valid or invalid UTF-8 codepoints,
///         but not incomplete codepoints at the end.
scan_result scan_text(scan_state& state, std::string_view text, size_t maxColumnCount) noexcept;

/// Overload of scan_text() that additionally takes a grapheme_cluster_receiver.
/// NOTE(review): presumably the receiver is notified about the scanned sequences — confirm in the implementation.
scan_result scan_text(scan_state& state,
                      std::string_view text,
                      size_t maxColumnCount,
                      grapheme_cluster_receiver& receiver) noexcept;
116 | 
117 | } // namespace unicode
118 | 


--------------------------------------------------------------------------------
/src/libunicode/scan256.cpp:
--------------------------------------------------------------------------------
 1 | // SPDX-License-Identifier: Apache-2.0
 2 | #include <libunicode/scan.h>
 3 | #include <libunicode/scan_simd_impl.h>
 4 | 
 5 | namespace unicode::detail
 6 | {
/// Non-template entry point that instantiates the ASCII scanner with a
/// SIMD width of 256 bits and forwards both arguments unchanged.
size_t scan_for_text_ascii_256(std::string_view text, size_t maxColumnCount) noexcept
{
    return scan_for_text_ascii_simd<256>(text, maxColumnCount);
}
11 | } // namespace unicode::detail
12 | 


--------------------------------------------------------------------------------
/src/libunicode/scan512.cpp:
--------------------------------------------------------------------------------
 1 | // SPDX-License-Identifier: Apache-2.0
 2 | #include <libunicode/scan.h>
 3 | #include <libunicode/scan_simd_impl.h>
 4 | 
 5 | namespace unicode::detail
 6 | {
/// Non-template entry point that instantiates the ASCII scanner with a
/// SIMD width of 512 bits and forwards both arguments unchanged.
size_t scan_for_text_ascii_512(std::string_view text, size_t maxColumnCount) noexcept
{
    return scan_for_text_ascii_simd<512>(text, maxColumnCount);
}
11 | } // namespace unicode::detail
12 | 


--------------------------------------------------------------------------------
/src/libunicode/scan_simd_impl.h:
--------------------------------------------------------------------------------
  1 | // SPDX-License-Identifier: Apache-2.0
  2 | #pragma once
  3 | #include <algorithm>
  4 | #include <cstdint>
  5 | #include <iterator>
  6 | #include <string_view>
  7 | 
  8 | // clang-format off
  9 | #if __has_include(<experimental/simd>) && defined(LIBUNICODE_USE_STD_SIMD) && !defined(__APPLE__) && !defined(__FreeBSD__)
 10 |     #define USE_STD_SIMD
 11 |     #include <experimental/simd>
 12 |     namespace stdx = std::experimental;
 13 | #elif __has_include(<simd>) && defined(LIBUNICODE_USE_STD_SIMD)
 14 |     #define USE_STD_SIMD
 15 |     #include <simd>
 16 |     namespace stdx = std;
 17 | #elif defined(LIBUNICODE_USE_INTRINSICS)
 18 |     #include "intrinsics.h"
 19 | #endif
 20 | // clang-format on
 21 | namespace unicode::detail
 22 | {
 23 | template <size_t SimdBitWidth>
 24 | size_t scan_for_text_ascii_simd(std::string_view text, size_t maxColumnCount) noexcept
 25 | {
 26 |     [[maybe_unused]] constexpr int simd_size = SimdBitWidth / 8;
 27 |     auto input = text.data();
 28 |     auto const end = text.data() + std::min(text.size(), maxColumnCount);
 29 | 
 30 | #if defined(USE_STD_SIMD)
 31 |     auto simd_text = stdx::fixed_size_simd<char, simd_size> {};
 32 |     while (input < end - simd_size)
 33 |     {
 34 |         simd_text.copy_from(input, stdx::element_aligned);
 35 |         auto const is_control_mask = simd_text < 0x20;
 36 |         auto const is_complex_mask = (simd_text & 0x80) == 0x80;
 37 |         auto const ctrl_or_complex_mask = is_control_mask || is_complex_mask;
 38 |         if (stdx::any_of(ctrl_or_complex_mask))
 39 |         {
 40 |             input += stdx::find_first_set(ctrl_or_complex_mask);
 41 |             break;
 42 |         }
 43 |         input += simd_size;
 44 |     }
 45 | #elif defined(LIBUNICODE_USE_INTRINSICS)
 46 |     constexpr auto trailing_zero_count = []<typename T>(T value) noexcept {
 47 |         // clang-format off
 48 |         if constexpr (std::same_as<std::remove_cvref_t<T>, uint32_t>)
 49 |         {
 50 |             #if defined(_WIN32)
 51 |                 // return _tzcnt_u32(value);
 52 |                 // Don't do _tzcnt_u32, because that's only available on x86-64, but not on ARM64.
 53 |                 unsigned long r = 0;
 54 |                 _BitScanForward(&r, value);
 55 |                 return r;
 56 |             #else
 57 |                 return __builtin_ctz(value);
 58 |             #endif
 59 |         }
 60 |         else
 61 |         {
 62 |             #if defined(_WIN32)
 63 |                 unsigned long r = 0;
 64 |                 _BitScanForward64(&r, value);
 65 |                 return r;
 66 |             #else
 67 |                 return __builtin_ctzl(value);
 68 |             #endif
 69 |         }
 70 |         // clang-format on
 71 |     };
 72 |     using intrinsics = intrinsics<SimdBitWidth>;
 73 |     auto const vec_control = intrinsics::set1_epi8(0x20); // 0..0x1F
 74 |     auto const vec_complex = intrinsics::set1_epi8(-128); // equals to 0x80 (0b1000'0000)
 75 | 
 76 |     while (input < end - simd_size)
 77 |     {
 78 |         auto const batch = intrinsics::load(input);
 79 |         auto const is_control_mask = intrinsics::less(batch, vec_control);
 80 |         auto const is_complex_mask = intrinsics::equal(intrinsics::and_vec(batch, vec_complex), vec_complex);
 81 |         auto const ctrl_or_complex_mask = intrinsics::or_mask(is_control_mask, is_complex_mask);
 82 |         if (ctrl_or_complex_mask)
 83 |         {
 84 |             int const advance = trailing_zero_count(intrinsics::to_unsigned(ctrl_or_complex_mask));
 85 |             input += advance;
 86 |             break;
 87 |         }
 88 |         input += sizeof(simd_size);
 89 |     }
 90 | #endif
 91 | 
 92 |     constexpr auto is_ascii = [](char ch) noexcept {
 93 |         auto const is_control = static_cast<uint8_t>(ch) < 0x20;
 94 |         auto const is_complex = static_cast<uint8_t>(ch) & 0x80;
 95 |         return !is_control && !is_complex;
 96 |     };
 97 |     while (input != end && is_ascii(*input))
 98 |         ++input;
 99 | 
100 |     return static_cast<size_t>(std::distance(text.data(), input));
101 | }
102 | } // namespace unicode::detail
103 | 


--------------------------------------------------------------------------------
/src/libunicode/scoped_timer.h:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * This file is part of the "libunicode" project
 3 |  *   Copyright (c) 2022 Christian Parpart <christian@parpart.family>
 4 |  *
 5 |  * Licensed under the Apache License, Version 2.0 (the "License");
 6 |  * you may not use this file except in compliance with the License.
 7 |  *
 8 |  * Unless required by applicable law or agreed to in writing, software
 9 |  * distributed under the License is distributed on an "AS IS" BASIS,
10 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11 |  * See the License for the specific language governing permissions and
12 |  * limitations under the License.
13 |  */
14 | #pragma once
15 | 
16 | #include <chrono>
17 | #include <ostream>
18 | #include <string>
19 | 
20 | namespace support
21 | {
22 | 
/// Reports how long a scope took to an optional output stream.
///
/// On construction, the message followed by " ... " is written and flushed.
/// On destruction, the elapsed time (measured with std::chrono::steady_clock)
/// is appended in whole milliseconds, followed by " ms" and a newline.
/// Passing a null stream disables all output.
class scoped_timer
{
  public:
    scoped_timer(std::ostream* output, std::string message):
        _start { std::chrono::steady_clock::now() }, _output { output }, _message { std::move(message) }
    {
        if (_output == nullptr)
            return;

        (*_output << _message << " ... ").flush();
    }

    ~scoped_timer()
    {
        if (_output != nullptr)
        {
            using std::chrono::duration_cast;
            using std::chrono::milliseconds;

            auto const elapsed = duration_cast<milliseconds>(std::chrono::steady_clock::now() - _start);
            *_output << elapsed.count() << " ms\n";
        }
    }

  private:
    std::chrono::steady_clock::time_point _start;
    std::ostream* _output;
    std::string _message;
};
51 | 
52 | } // namespace support
53 | 


--------------------------------------------------------------------------------
/src/libunicode/script_segmenter.cpp:
--------------------------------------------------------------------------------
  1 | /**
  2 |  * This file is part of the "libunicode" project
  3 |  *   Copyright (c) 2020 Christian Parpart <christian@parpart.family>
  4 |  *
  5 |  * Licensed under the Apache License, Version 2.0 (the "License");
  6 |  * you may not use this file except in compliance with the License.
  7 |  *
  8 |  * Unless required by applicable law or agreed to in writing, software
  9 |  * distributed under the License is distributed on an "AS IS" BASIS,
 10 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 11 |  * See the License for the specific language governing permissions and
 12 |  * limitations under the License.
 13 |  */
 14 | #include <libunicode/script_segmenter.h>
 15 | #include <libunicode/ucd.h>
 16 | 
 17 | #include <algorithm>
 18 | 
 19 | using namespace std;
 20 | 
 21 | namespace unicode
 22 | {
 23 | 
 24 | namespace
 25 | {
 26 |     bool constexpr isPreferred(Script script) noexcept
 27 |     {
 28 |         switch (script)
 29 |         {
 30 |             case Script::Invalid:
 31 |             case Script::Common:
 32 |             case Script::Inherited: return false;
 33 |             default: return true;
 34 |         }
 35 |     }
 36 | } // namespace
 37 | 
 38 | optional<script_segmenter::result> script_segmenter::consume()
 39 | {
 40 |     if (offset_ >= size_)
 41 |         return nullopt;
 42 | 
 43 |     while (offset_ < size_)
 44 |     {
 45 |         ScriptSet const nextScriptSet = getScriptsFor(currentChar());
 46 | 
 47 |         if (!mergeSets(nextScriptSet, currentScriptSet_))
 48 |         {
 49 |             // If merging failed, then we have found a script segmeent boundary.
 50 |             auto const res = result { resolveScript(), offset_ };
 51 |             currentScriptSet_ = nextScriptSet;
 52 |             return res;
 53 |         }
 54 | 
 55 |         offset_++;
 56 |     }
 57 | 
 58 |     auto const res = result { resolveScript(), offset_ };
 59 |     currentScriptSet_.clear();
 60 |     return res;
 61 | }
 62 | 
/// Intersects @p nextSet into @p currentSet, keeping a priority script in front.
///
/// May also update commonPreferredScript_ when a non-preferred two-element
/// @p nextSet is seen while the current priority script is not preferred either.
///
/// @retval true  the sets could be merged, i.e. no script boundary was found.
/// @retval false either set is empty or the intersection is empty, i.e. a boundary was found.
bool script_segmenter::mergeSets(ScriptSet const& nextSet, ScriptSet& currentSet) noexcept
{
    if (nextSet.empty() || currentSet.empty())
        return false;

    auto currentSetIter = currentSet.begin();
    auto const currentSetEnd = currentSet.end();

    // The first element of the current set is its priority script.
    Script priorityScript = *currentSetIter++;

    // A next set led by Common/Inherited/Invalid never breaks the current run.
    if (!isPreferred(nextSet.at(0)))
    {
        // Remember the second script of the next set as the preferred script for Common,
        // but only the first time and only while the current run has no preferred script.
        if (nextSet.size() == 2 && !isPreferred(priorityScript) && commonPreferredScript_ == Script::Common)
            commonPreferredScript_ = nextSet.at(1);
        return true;
    }

    // If priorityScript is either Common or Inherited then take nextSet.
    if (!isPreferred(priorityScript))
    {
        currentSet = nextSet;
        return true;
    }

    auto nextSetIter = nextSet.begin();
    auto const nextSetEnd = nextSet.end();

    // The current set holds only the priority script: merging succeeds iff nextSet contains it.
    if (currentSetIter == currentSetEnd)
        return std::find(nextSetIter, nextSetEnd, priorityScript) != nextSetEnd;

    // See if we have a priority script, and if not, get it from nextSet.
    bool hasPriorityScript = find(nextSetIter, nextSetEnd, priorityScript) != nextSetEnd;
    if (!hasPriorityScript)
    {
        priorityScript = *nextSetIter++;
        hasPriorityScript = find(currentSetIter, currentSetEnd, priorityScript) != currentSetEnd;
    }

    // The intersection is written in place, starting at the front of currentSet.
    auto currentWriteIter = currentSet.begin();
    if (hasPriorityScript)
        *currentWriteIter++ = priorityScript;

    // Intersect the remaining nextSet elements with the remaining currentSet elements.
    if (nextSetIter != nextSetEnd)
    {
        while (currentSetIter != currentSetEnd)
        {
            auto const sc = *currentSetIter++;
            if (find(nextSetIter, nextSetEnd, sc) != nextSetEnd)
                *currentWriteIter++ = sc;
        }
    }

    // NB: begin() never lies behind the write iterator, so it is safe to cast to unsigned.
    auto const writeCount = static_cast<size_t>(distance(currentSet.begin(), currentWriteIter));
    if (writeCount == 0)
        return false;

    currentSet.resize(writeCount);
    return true;
}
124 | 
125 | script_segmenter::ScriptSet script_segmenter::getScriptsFor(char32_t codepoint) noexcept
126 | {
127 |     ScriptSet scriptSet;
128 | 
129 |     // Collect all script(/-extensions) for @p _codepoint into scriptSet.
130 |     size_t const sceCount = script_extensions(codepoint, scriptSet.data(), scriptSet.capacity());
131 |     scriptSet.resize(sceCount);
132 | 
133 |     // Get the script for @p _codepoint.
134 |     Script const sc = script(codepoint);
135 | 
136 |     // If the script of @p _codepoint is also in scriptSet,
137 |     // then move it to the front of the set,
138 |     // otherwise append it to the back of scriptSet.
139 |     if (auto i = find(scriptSet.begin(), scriptSet.end(), sc); i != scriptSet.end())
140 |         swap(*i, *scriptSet.begin());
141 |     else
142 |         scriptSet.push_back(sc);
143 | 
144 |     return scriptSet;
145 | }
146 | 
147 | } // namespace unicode
148 | 


--------------------------------------------------------------------------------
/src/libunicode/script_segmenter.h:
--------------------------------------------------------------------------------
  1 | /**
  2 |  * This file is part of the "libunicode" project
  3 |  *   Copyright (c) 2020 Christian Parpart <christian@parpart.family>
  4 |  *
  5 |  * Licensed under the Apache License, Version 2.0 (the "License");
  6 |  * you may not use this file except in compliance with the License.
  7 |  *
  8 |  * Unless required by applicable law or agreed to in writing, software
  9 |  * distributed under the License is distributed on an "AS IS" BASIS,
 10 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 11 |  * See the License for the specific language governing permissions and
 12 |  * limitations under the License.
 13 |  */
 14 | #pragma once
 15 | 
 16 | #include <libunicode/support.h>
 17 | #include <libunicode/ucd.h>
 18 | 
 19 | #include <optional>
 20 | #include <string_view>
 21 | 
 22 | namespace unicode
 23 | {
 24 | 
 25 | class script_segmenter
 26 | {
 27 |   public:
 28 |     constexpr script_segmenter() noexcept = default;
 29 |     constexpr script_segmenter& operator=(script_segmenter const&) noexcept = default;
 30 |     constexpr script_segmenter& operator=(script_segmenter&&) noexcept = default;
 31 |     constexpr script_segmenter(script_segmenter const&) noexcept = default;
 32 |     constexpr script_segmenter(script_segmenter&&) noexcept = default;
 33 | 
 34 |     constexpr explicit script_segmenter(char32_t const* data) noexcept: script_segmenter { data, getStringLength(data) } {}
 35 | 
 36 |     constexpr script_segmenter(char32_t const* data, size_t size) noexcept: data_ { data }, offset_ { 0 }, size_ { size }
 37 |     {
 38 |         currentScriptSet_.push_back(Script::Common);
 39 |     }
 40 | 
 41 |     constexpr script_segmenter(std::u32string_view data) noexcept: data_ { data.data() }, offset_ { 0 }, size_ { data.size() }
 42 |     {
 43 |         currentScriptSet_.push_back(Script::Common);
 44 |     }
 45 | 
 46 |     struct result
 47 |     {
 48 |         Script script;
 49 |         size_t size;
 50 |     };
 51 | 
 52 |     std::optional<result> consume();
 53 | 
 54 |     using property_type = Script;
 55 | 
 56 |     bool consume(out<size_t> size, out<Script> script)
 57 |     {
 58 |         if (auto const p = consume(); p.has_value())
 59 |         {
 60 |             *size = p.value().size;
 61 |             *script = p.value().script;
 62 |             return true;
 63 |         }
 64 |         return false;
 65 |     }
 66 | 
 67 |   private:
 68 |     using ScriptSet = fs_array<Script, 32>;
 69 | 
 70 |     /// constexpr-version of strlen for UTF-32 strings
 71 |     constexpr size_t getStringLength(char32_t const* data) noexcept
 72 |     {
 73 |         size_t n = 0;
 74 |         while (data && *data)
 75 |         {
 76 |             ++data;
 77 |             ++n;
 78 |         }
 79 |         return n;
 80 |     }
 81 | 
 82 |     /// Returnes all scripts that this @p _codepoint is associated with.
 83 |     ScriptSet getScriptsFor(char32_t codepoint) noexcept;
 84 | 
 85 |     /// Intersects @p _nextSet into @p _currentSet.
 86 |     ///
 87 |     /// @retval true Intersection succeed, meaning that no boundary was found.
 88 |     /// @retval false The resulting intersection is empty, meaning, a script boundary was found.
 89 |     bool mergeSets(ScriptSet const& nextSet, ScriptSet& currentSet) noexcept;
 90 | 
 91 |     /// Returns the resolved script.
 92 |     ///
 93 |     /// That is, if currentScriptSet is {Common}, then the preferred script for Common, otherwise
 94 |     /// whatever currentScriptSet's one and only element contains.
 95 |     constexpr Script resolveScript() const noexcept
 96 |     {
 97 |         Script const result = currentScriptSet_.at(0);
 98 |         return result == Script::Common ? commonPreferredScript_ : result;
 99 |     }
100 | 
101 |     constexpr char32_t currentChar() const noexcept { return data_[offset_]; }
102 | 
103 |     // private data
104 | 
105 |     char32_t const* data_ = U"";
106 |     size_t offset_ = 0;
107 |     size_t size_ = 0;
108 | 
109 |     ScriptSet currentScriptSet_ {};
110 |     Script commonPreferredScript_ = Script::Common;
111 | };
112 | 
113 | } // namespace unicode
114 | 


--------------------------------------------------------------------------------
/src/libunicode/script_segmenter_test.cpp:
--------------------------------------------------------------------------------
  1 | /**
  2 |  * This file is part of the "libterminal" project
  3 |  *   Copyright (c) 2020 Christian Parpart <christian@parpart.family>
  4 |  *
  5 |  * Licensed under the Apache License, Version 2.0 (the "License");
  6 |  * you may not use this file except in compliance with the License.
  7 |  *
  8 |  * Unless required by applicable law or agreed to in writing, software
  9 |  * distributed under the License is distributed on an "AS IS" BASIS,
 10 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 11 |  * See the License for the specific language governing permissions and
 12 |  * limitations under the License.
 13 |  */
 14 | #include <libunicode/script_segmenter.h>
 15 | 
 16 | #include <catch2/catch_test_macros.hpp>
 17 | 
 18 | #include <string_view>
 19 | 
 20 | using namespace std::string_view_literals;
 21 | 
 22 | TEST_CASE("script_segmenter.private_use_area", "[script_segmenter]")
 23 | {
 24 |     auto constexpr str = U"\uE0B0"sv; // some PUA codepoint
 25 |     auto seg = unicode::script_segmenter { str.data(), str.size() };
 26 | 
 27 |     auto const r1 = seg.consume();
 28 |     REQUIRE(r1.has_value());
 29 |     auto const res1 = r1.value();
 30 |     CHECK(res1.size == 1);
 31 |     CHECK(res1.script == unicode::Script::Unknown);
 32 | }
 33 | 
 34 | TEST_CASE("script_segmenter.common_to_specific", "[script_segmenter]")
 35 | {
 36 |     // '1' is script property Common, 'a' is script property Latin, so the whole string is Latin.
 37 | 
 38 |     auto constexpr str = U"1a"sv;
 39 |     auto seg = unicode::script_segmenter { str.data(), str.size() };
 40 | 
 41 |     std::optional<unicode::script_segmenter::result> const r1 = seg.consume();
 42 |     REQUIRE(r1.has_value());
 43 |     auto const res1 = r1.value();
 44 |     CHECK(res1.size == str.size());
 45 |     CHECK(res1.script == unicode::Script::Latin);
 46 | 
 47 |     auto const r2 = seg.consume();
 48 |     REQUIRE_FALSE(r2.has_value());
 49 | }
 50 | 
 51 | TEST_CASE("script_segmenter.greek_kanji_greek", "[script_segmenter]")
 52 | {
 53 |     char32_t const* str = U"λ 合気道 λ;";
 54 |     auto seg = unicode::script_segmenter { str };
 55 | 
 56 |     // greek text
 57 |     std::optional<unicode::script_segmenter::result> const r1 = seg.consume();
 58 |     REQUIRE(r1.has_value());
 59 |     unicode::script_segmenter::result const res1 = r1.value();
 60 |     CHECK(res1.size == 2);
 61 |     CHECK(res1.script == unicode::Script::Greek);
 62 | 
 63 |     // japanese (Kanji-only)
 64 |     auto const r2 = seg.consume();
 65 |     REQUIRE(r2.has_value());
 66 |     auto const res2 = r2.value();
 67 |     CHECK(res2.size == 6);
 68 |     CHECK(res2.script == unicode::Script::Han);
 69 | 
 70 |     // greek
 71 |     auto const r3 = seg.consume();
 72 |     REQUIRE(r3.has_value());
 73 |     auto const res3 = r3.value();
 74 |     CHECK(res3.size == 8);
 75 |     CHECK(res3.script == unicode::Script::Greek);
 76 | 
 77 |     // end of stream
 78 |     auto const r4 = seg.consume();
 79 |     REQUIRE_FALSE(r4.has_value());
 80 | }
 81 | 
 82 | TEST_CASE("script_segmenter.latin_and_greek", "[script_segmenter]")
 83 | {
 84 |     auto constexpr str = U"AB λ;"sv;
 85 |     auto seg = unicode::script_segmenter { str.data(), str.size() };
 86 | 
 87 |     // latin text
 88 |     std::optional<unicode::script_segmenter::result> const r1 = seg.consume();
 89 |     REQUIRE(r1.has_value());
 90 |     auto const res1 = r1.value();
 91 |     CHECK(res1.size == 3);
 92 |     CHECK(res1.script == unicode::Script::Latin);
 93 | 
 94 |     // greek
 95 |     auto const r2 = seg.consume();
 96 |     REQUIRE(r2.has_value());
 97 |     auto const res2 = r2.value();
 98 |     CHECK(res2.size == 5);
 99 |     CHECK(res2.script == unicode::Script::Greek);
100 | 
101 |     // end of stream
102 |     auto const r3 = seg.consume();
103 |     REQUIRE_FALSE(r3.has_value());
104 | }
105 | 


--------------------------------------------------------------------------------
/src/libunicode/simd_detector.cpp:
--------------------------------------------------------------------------------
  1 | // SPDX-License-Identifier: Apache-2.0
  2 | #include "simd_detector.h"
  3 | 
  4 | #include <cstdint>
  5 | 
  6 | // AVX512 required:
  7 | // AVX512_BITALG : popcnt
  8 | // AVX512_BW : compare greater (less is needed)
  9 | // AVX512_F : and
 10 | //
 11 | // auto max_simd_size() -> size_t;
 12 | 
// Forward declarations of the platform-specific helpers defined further below.

// Runs CPUID with the given EAX/ECX inputs and stores four 32-bit results in out[0..3].
void cpuid(int32_t out[4], int32_t eax, int32_t ecx);
// Reads an extended control register; the return type differs per toolchain.
#if _WIN32
__int64 xgetbv(unsigned int x);
#elif defined(__GNUC__) || defined(__clang__)
uint64_t xgetbv(unsigned int index);
#else
#endif

// NOTE(review): presumably these report whether the OS has enabled AVX / AVX-512 state — confirm against the definitions.
auto detect_os_avx() -> bool;
auto detect_os_avx512() -> bool;
 23 | 
 24 | #if defined(__x86_64__) || defined(_M_X64) || defined(__i386) || defined(_M_IX86)
 25 |     #if _WIN32
 26 |     // clang-format off
 27 |         #include <Windows.h>
 28 |         #include <intrin.h>
 29 |         void cpuid(int32_t out[4], int32_t eax, int32_t ecx)
 30 |         {
 31 |             __cpuidex(out, eax, ecx);
 32 |         }
 33 |         __int64 xgetbv(unsigned int x)
 34 |         {
 35 |             return _xgetbv(x);
 36 |         }
 37 |     // clang-format on
 38 |     #elif defined(__GNUC__) || defined(__clang__)
 39 |     // clang-format off
 40 |         #include <cpuid.h>
 41 |         void cpuid(int32_t out[4], int32_t eax, int32_t ecx)
 42 |         {
 43 |             __cpuid_count(eax, ecx, out[0], out[1], out[2], out[3]);
 44 |         }
 45 |         uint64_t xgetbv(unsigned int index)
 46 |         {
 47 |             uint32_t eax, edx;
 48 |             __asm__ __volatile__("xgetbv" : "=a"(eax), "=d"(edx) : "c"(index));
 49 |             return ((uint64_t) edx << 32) | eax;
 50 |         }
 51 |         #define _XCR_XFEATURE_ENABLED_MASK 0
 52 |     // clang-format on
 53 |     #else
 54 |         #error "No cpuid intrinsic defined for compiler."
 55 |     #endif
 56 | 
 57 | auto detect_os_avx() -> bool
 58 | {
 59 |     //  Copied from: http://stackoverflow.com/a/22521619/922184
 60 |     bool avxSupported = false;
 61 |     int32_t cpuInfo[4];
 62 |     cpuid(cpuInfo, 1, 0);
 63 | 
 64 |     bool const osUsesXSAVE_XRSTORE = (cpuInfo[2] & (1 << 27));
 65 |     bool const cpuAVXSuport = (cpuInfo[2] & (1 << 28)) != 0;
 66 | 
 67 |     if (osUsesXSAVE_XRSTORE && cpuAVXSuport)
 68 |     {
 69 |         auto const xcrFeatureMask = xgetbv(_XCR_XFEATURE_ENABLED_MASK);
 70 |         avxSupported = (xcrFeatureMask & 0x6) == 0x6;
 71 |     }
 72 | 
 73 |     return avxSupported;
 74 | }
 75 | 
 76 | auto detect_os_avx512() -> bool
 77 | {
 78 |     if (!detect_os_avx())
 79 |         return false;
 80 |     uint64_t const xcrFeatureMask = xgetbv(_XCR_XFEATURE_ENABLED_MASK);
 81 |     return (xcrFeatureMask & 0xe6) == 0xe6;
 82 | }
 83 | 
 84 | auto unicode::detail::max_simd_size() -> size_t
 85 | {
 86 |     if (!detect_os_avx())
 87 |         return 128;
 88 | 
 89 |     int32_t info[4];
 90 |     cpuid(info, 0, 0);
 91 |     int const nIds = info[0];
 92 | 
 93 |     // cpuid(info, 0x80000000, 0);
 94 |     // uint32_t nExIds = info[0];
 95 | 
 96 |     //  Detect Features
 97 |     // if (nIds >= 0x00000001)
 98 |     // {
 99 |     //     cpuid(info, 0x00000001, 0);
100 |     //     bool HW_MMX = (info[3] & ((int) 1 << 23)) != 0;
101 |     //     bool HW_SSE = (info[3] & ((int) 1 << 25)) != 0;
102 |     //     bool HW_SSE2 = (info[3] & ((int) 1 << 26)) != 0;
103 |     //     bool HW_SSE3 = (info[2] & ((int) 1 << 0)) != 0;
104 |     //
105 |     //     bool HW_SSSE3 = (info[2] & ((int) 1 << 9)) != 0;
106 |     //     bool HW_SSE41 = (info[2] & ((int) 1 << 19)) != 0;
107 |     //     bool HW_SSE42 = (info[2] & ((int) 1 << 20)) != 0;
108 |     //     bool HW_AES = (info[2] & ((int) 1 << 25)) != 0;
109 |     //
110 |     //     bool HW_AVX = (info[2] & ((int) 1 << 28)) != 0;
111 |     //     bool HW_FMA3 = (info[2] & ((int) 1 << 12)) != 0;
112 |     //
113 |     //     bool HW_RDRAND = (info[2] & ((int) 1 << 30)) != 0;
114 |     // }
115 |     if (nIds >= 0x00000007)
116 |     {
117 |         cpuid(info, 0x00000007, 0);
118 |         bool const HW_AVX2 = (info[1] & ((int) 1 << 5));
119 |         if (!HW_AVX2)
120 |             return 128;
121 | 
122 |         // bool HW_BMI1 = (info[1] & ((int) 1 << 3)) != 0;
123 |         // bool HW_BMI2 = (info[1] & ((int) 1 << 8)) != 0;
124 |         // bool HW_ADX = (info[1] & ((int) 1 << 19)) != 0;
125 |         // bool HW_MPX = (info[1] & ((int) 1 << 14)) != 0;
126 |         // bool HW_SHA = (info[1] & ((int) 1 << 29)) != 0;
127 |         // bool HW_RDSEED = (info[1] & ((int) 1 << 18)) != 0;
128 |         // bool HW_PREFETCHWT1 = (info[2] & ((int) 1 << 0)) != 0;
129 |         // bool HW_RDPID = (info[2] & ((int) 1 << 22)) != 0;
130 | 
131 |         bool const HW_AVX512_F = (info[1] & ((int) 1 << 16));
132 |         // bool HW_AVX512_CD = (info[1] & ((int) 1 << 28)) != 0;
133 |         // bool HW_AVX512_PF = (info[1] & ((int) 1 << 26)) != 0;
134 |         // bool HW_AVX512_ER = (info[1] & ((int) 1 << 27)) != 0;
135 | 
136 |         // bool HW_AVX512_VL = (info[1] & ((int) 1 << 31)) != 0;
137 |         bool const HW_AVX512_BW = (info[1] & ((int) 1 << 30));
138 |         // bool HW_AVX512_DQ = (info[1] & ((int) 1 << 17)) != 0;
139 | 
140 |         // bool HW_AVX512_IFMA = (info[1] & ((int) 1 << 21)) != 0;
141 |         // bool HW_AVX512_VBMI = (info[2] & ((int) 1 << 1)) != 0;
142 | 
143 |         // bool HW_AVX512_VPOPCNTDQ = (info[2] & ((int) 1 << 14)) != 0;
144 |         // bool HW_AVX512_4VNNIW = (info[3] & ((int) 1 << 2)) != 0;
145 |         // bool HW_AVX512_4FMAPS = (info[3] & ((int) 1 << 3)) != 0;
146 | 
147 |         // bool HW_AVX512_VNNI = (info[2] & ((int) 1 << 11)) != 0;
148 | 
149 |         // bool HW_AVX512_VBMI2 = (info[2] & ((int) 1 << 6)) != 0;
150 |         // bool HW_GFNI = (info[2] & ((int) 1 << 8)) != 0;
151 |         // bool HW_VAES = (info[2] & ((int) 1 << 9)) != 0;
152 |         // bool HW_AVX512_VPCLMUL = (info[2] & ((int) 1 << 10)) != 0;
153 |         bool const HW_AVX512_BITALG = (info[2] & ((int) 1 << 12));
154 | 
155 |         bool const use512 = detect_os_avx512() && HW_AVX512_F && HW_AVX512_BW && HW_AVX512_BITALG;
156 |         if (!use512)
157 |             return 256;
158 |         else
159 |             return 512;
160 | 
161 |         // cpuid(info, 0x00000007, 1);
162 |         // bool HW_AVX512_BF16 = (info[0] & ((int) 1 << 5)) != 0;
163 |     }
164 |     return 128;
165 |     // if (nExIds >= 0x80000001)
166 |     // {
167 |     //     cpuid(info, 0x80000001, 0);
168 |     //     bool HW_x64 = (info[3] & ((int) 1 << 29)) != 0;
169 |     //     bool HW_ABM = (info[2] & ((int) 1 << 5)) != 0;
170 |     //     bool HW_SSE4a = (info[2] & ((int) 1 << 6)) != 0;
171 |     //     bool HW_FMA4 = (info[2] & ((int) 1 << 16)) != 0;
172 |     //     bool HW_XOP = (info[2] & ((int) 1 << 11)) != 0;
173 |     //     bool HW_PREFETCHW = (info[2] & ((int) 1 << 8)) != 0;
174 |     // }
175 | }
176 | 
177 | #else
178 | auto unicode::detail::max_simd_size() -> size_t
179 | {
180 |     return 128;
181 | }
182 | #endif
183 | 


--------------------------------------------------------------------------------
/src/libunicode/simd_detector.h:
--------------------------------------------------------------------------------
1 | // SPDX-License-Identifier: Apache-2.0
2 | #pragma once
3 | 
4 | #include <cstddef>
5 | namespace unicode::detail
6 | {
7 | auto max_simd_size() -> size_t;
8 | }
9 | 


--------------------------------------------------------------------------------
/src/libunicode/support.h:
--------------------------------------------------------------------------------
  1 | /**
  2 |  * This file is part of the "libunicode" project
  3 |  *   Copyright (c) 2020 Christian Parpart <christian@parpart.family>
  4 |  *
  5 |  * Licensed under the Apache License, Version 2.0 (the "License");
  6 |  * you may not use this file except in compliance with the License.
  7 |  *
  8 |  * Unless required by applicable law or agreed to in writing, software
  9 |  * distributed under the License is distributed on an "AS IS" BASIS,
 10 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 11 |  * See the License for the specific language governing permissions and
 12 |  * limitations under the License.
 13 |  */
 14 | #pragma once
 15 | 
 16 | #include <array>
 17 | #include <cstdint>
 18 | #include <functional>
 19 | 
 20 | namespace unicode
 21 | {
 22 | 
 23 | #if defined(__GNUC__) || defined(__clang__)
 24 |     #define LIBUNICODE_PACKED __attribute__((packed))
 25 | #else
 26 |     #define LIBUNICODE_PACKED /*!*/
 27 | #endif
 28 | 
 29 | #if defined(__cpp_char8_t)
 30 | using char8_type = char8_t;
 31 | #else
 32 | using char8_type = uint8_t;
 33 | #endif
 34 | 
 35 | // Helper API solely for use of function parameters to visually denote output parameters.
 36 | template <typename T>
 37 | class out
 38 | {
 39 |   public:
 40 |     constexpr out(std::reference_wrapper<T> ref) noexcept: _ref { &ref.value() } {}
 41 |     constexpr explicit out(T& ref) noexcept: _ref { &ref } {}
 42 |     constexpr out(out const&) noexcept = default;
 43 |     constexpr out(out&&) noexcept = default;
 44 |     constexpr out& operator=(out const&) noexcept = default;
 45 |     constexpr out& operator=(out&&) noexcept = default;
 46 | 
 47 |     constexpr T& get() noexcept { return *_ref; }
 48 |     constexpr T const& get() const noexcept { return *_ref; }
 49 | 
 50 |     constexpr T& operator*() noexcept { return *_ref; }
 51 |     constexpr T const& operator*() const noexcept { return *_ref; }
 52 | 
 53 |     constexpr T* operator->() noexcept { return _ref; }
 54 |     constexpr T const* operator->() const noexcept { return _ref; }
 55 | 
 56 |     constexpr void assign(T value) { *_ref = std::move(value); }
 57 | 
 58 |   private:
 59 |     T* _ref;
 60 | };
 61 | 
 62 | // dynamic array with a fixed capacity.
 63 | template <typename T, std::size_t N>
 64 | class fs_array
 65 | {
 66 |   public:
 67 |     using value_type = T;
 68 |     using array_type = std::array<value_type, N>;
 69 |     using iterator = typename array_type::iterator;
 70 |     using const_iterator = typename array_type::const_iterator;
 71 | 
 72 |     constexpr fs_array() noexcept: values_ { {} }, size_ { 0 } {}
 73 | 
 74 |     constexpr void clear() noexcept
 75 |     {
 76 |         for (size_t i = 0; i < size_; ++i)
 77 |             values_[i].~T();
 78 |         size_ = 0;
 79 |     }
 80 | 
 81 |     constexpr size_t capacity() const noexcept { return N; }
 82 |     constexpr size_t size() const noexcept { return size_; }
 83 |     constexpr bool empty() const noexcept { return size_ == 0; }
 84 | 
 85 |     constexpr bool push_back(T value) noexcept
 86 |     {
 87 |         if (size_ == N)
 88 |             return false;
 89 |         values_[size_++] = std::move(value);
 90 |         return true;
 91 |     }
 92 | 
 93 |     constexpr void resize(size_t n) noexcept
 94 |     {
 95 |         if (n < N)
 96 |             size_ = n;
 97 |     }
 98 | 
 99 |     constexpr T const& operator[](size_t i) const noexcept { return values_[i]; }
100 |     constexpr T const& at(size_t i) const noexcept { return values_.at(i); }
101 | 
102 |     constexpr iterator begin() noexcept { return values_.begin(); }
103 |     constexpr iterator end() noexcept
104 |     {
105 |         using SizeT = typename std::iterator_traits<decltype(values_.begin())>::difference_type;
106 |         return std::next(values_.begin(), static_cast<SizeT>(size_));
107 |     }
108 | 
109 |     constexpr const_iterator begin() const noexcept { return values_.begin(); }
110 |     constexpr const_iterator end() const noexcept
111 |     {
112 |         using SizeT = typename std::iterator_traits<decltype(values_.begin())>::difference_type;
113 |         return std::next(values_.begin(), static_cast<SizeT>(size_));
114 |     }
115 | 
116 |     constexpr T* data() noexcept { return values_.data(); }
117 |     constexpr T const* data() const noexcept { return values_.data(); }
118 | 
119 |     constexpr T& front() noexcept { return at(0); }
120 |     constexpr T const& front() const noexcept { return at(0); }
121 | 
122 |     constexpr T& back() noexcept { return at(size_ - 1); }
123 |     constexpr T const& back() const noexcept { return at(size_ - 1); }
124 | 
125 |   private:
126 |     array_type values_;
127 |     size_t size_;
128 | };
129 | 
130 | } // namespace unicode
131 | 


--------------------------------------------------------------------------------
/src/libunicode/test_main.cpp:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * This file is part of the "libunicode" project
 3 |  *   Copyright (c) 2020 Christian Parpart <christian@parpart.family>
 4 |  *
 5 |  * Licensed under the Apache License, Version 2.0 (the "License");
 6 |  * you may not use this file except in compliance with the License.
 7 |  *
 8 |  * Unless required by applicable law or agreed to in writing, software
 9 |  * distributed under the License is distributed on an "AS IS" BASIS,
10 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11 |  * See the License for the specific language governing permissions and
12 |  * limitations under the License.
13 |  */
14 | #define CATCH_CONFIG_RUNNER
15 | #include <catch2/catch_session.hpp>
16 | 
17 | int main(int argc, char const* argv[])
18 | {
19 |     int const result = Catch::Session().run(argc, argv);
20 | 
21 |     // avoid closing extern console to close on VScode/windows
22 |     // system("pause");
23 | 
24 |     return result;
25 | }
26 | 


--------------------------------------------------------------------------------
/src/libunicode/ucd_private.h:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * This file is part of the "libunicode" project
 3 |  *   Copyright (c) 2020 Christian Parpart <christian@parpart.family>
 4 |  *
 5 |  * Licensed under the Apache License, Version 2.0 (the "License");
 6 |  * you may not use this file except in compliance with the License.
 7 |  *
 8 |  * Unless required by applicable law or agreed to in writing, software
 9 |  * distributed under the License is distributed on an "AS IS" BASIS,
10 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11 |  * See the License for the specific language governing permissions and
12 |  * limitations under the License.
13 |  */
14 | #pragma once
15 | 
16 | #include <array>
17 | #include <optional>
18 | 
namespace unicode
{

// Inclusive codepoint range [from, to].
struct Interval // NOLINT(readability-identifier-naming)
{
    char32_t from;
    char32_t to;
};

// Tests whether `codepoint` lies inside any interval of `ranges`.
//
// `ranges` must be sorted ascending and non-overlapping (binary search).
// An empty table is valid and never contains anything.
template <size_t N>
constexpr bool contains(std::array<Interval, N> const& ranges, char32_t codepoint) noexcept
{
    // Half-open search window [lo, hi): unlike an inclusive upper bound of size() - 1,
    // it cannot underflow for N == 0 and needs no special-casing at index 0.
    auto lo = size_t { 0 };
    auto hi = static_cast<size_t>(ranges.size());
    while (lo < hi)
    {
        auto const mid = lo + (hi - lo) / 2;
        auto const& candidate = ranges[mid];
        if (candidate.to < codepoint)
            lo = mid + 1;
        else if (candidate.from > codepoint)
            hi = mid;
        else
            return true;
    }
    return false;
}

// Associates an inclusive codepoint interval with a property value.
template <typename T>
struct Prop // NOLINT(readability-identifier-naming)
{
    Interval interval;
    T property;
};

// Looks up the property of the interval that contains `codepoint`.
//
// `ranges` must be sorted ascending and non-overlapping (binary search).
// Returns std::nullopt when no interval matches, including for an empty table.
template <typename T, size_t N>
constexpr std::optional<T> search(std::array<Prop<T>, N> const& ranges, char32_t codepoint)
{
    // Same half-open window as in contains(); safe for N == 0.
    auto lo = size_t { 0 };
    auto hi = static_cast<size_t>(ranges.size());
    while (lo < hi)
    {
        auto const mid = lo + (hi - lo) / 2;
        auto const& candidate = ranges[mid];
        if (candidate.interval.to < codepoint)
            lo = mid + 1;
        else if (candidate.interval.from > codepoint)
            hi = mid;
        else
            return candidate.property;
    }
    return std::nullopt;
}

} // namespace unicode
91 | 


--------------------------------------------------------------------------------
/src/libunicode/unicode_test.cpp:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/contour-terminal/libunicode/3b6290b6b5bd6597145f39a06599fc7d8c683984/src/libunicode/unicode_test.cpp


--------------------------------------------------------------------------------
/src/libunicode/utf8.cpp:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * This file is part of the "libunicode" project
 3 |  *   Copyright (c) 2020 Christian Parpart <christian@parpart.family>
 4 |  *
 5 |  * Licensed under the Apache License, Version 2.0 (the "License");
 6 |  * you may not use this file except in compliance with the License.
 7 |  *
 8 |  * Unless required by applicable law or agreed to in writing, software
 9 |  * distributed under the License is distributed on an "AS IS" BASIS,
10 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11 |  * See the License for the specific language governing permissions and
12 |  * limitations under the License.
13 |  */
14 | #include <libunicode/utf8.h>
15 | 
16 | namespace unicode
17 | {
18 | 
19 | ConvertResult from_utf8(utf8_decoder_state& state, uint8_t value) noexcept
20 | {
21 |     if (!state.expectedLength)
22 |     {
23 |         if ((value & 0b1000'0000) == 0)
24 |         {
25 |             state.currentLength = 1;
26 |             return Success { value };
27 |         }
28 |         else if ((value & 0b1110'0000) == 0b1100'0000)
29 |         {
30 |             state.currentLength = 1;
31 |             state.expectedLength = 2;
32 |             state.character = value & 0b0001'1111;
33 |         }
34 |         else if ((value & 0b1111'0000) == 0b1110'0000)
35 |         {
36 |             state.currentLength = 1;
37 |             state.expectedLength = 3;
38 |             state.character = value & 0b0000'1111;
39 |         }
40 |         else if ((value & 0b1111'1000) == 0b1111'0000)
41 |         {
42 |             state.currentLength = 1;
43 |             state.expectedLength = 4;
44 |             state.character = value & 0b0000'0111;
45 |         }
46 |         else
47 |         {
48 |             state.currentLength = 1;
49 |             state.expectedLength = 0;
50 |             return Invalid {};
51 |         }
52 |     }
53 |     // clang-format off
54 |     else if ((value & 0b1110'0000) == 0b1100'0000
55 |           || (value & 0b1111'0000) == 0b1110'0000
56 |           || (value & 0b1111'1000) == 0b1111'0000)
57 |     // clang-format on
58 |     {
59 |         // We have a new codepoint, but the previous one was incomplete.
60 |         state.expectedLength = 0;
61 |         // Return Invalid for the current incomplete codepoint,
62 |         // but have already started the next codepoint.
63 |         from_utf8(state, value);
64 |         return { Invalid {} };
65 |     }
66 |     else
67 |     {
68 |         state.character <<= 6;
69 |         state.character |= value & 0b0011'1111;
70 |         state.currentLength++;
71 |     }
72 | 
73 |     if (state.currentLength < state.expectedLength)
74 |         return { Incomplete {} };
75 | 
76 |     state.expectedLength = 0; // reset state
77 |     return { Success { state.character } };
78 | }
79 | 
80 | } // namespace unicode
81 | 


--------------------------------------------------------------------------------
/src/libunicode/utf8.h:
--------------------------------------------------------------------------------
  1 | /**
  2 |  * This file is part of the "libunicode" project
  3 |  *   Copyright (c) 2020 Christian Parpart <christian@parpart.family>
  4 |  *
  5 |  * Licensed under the Apache License, Version 2.0 (the "License");
  6 |  * you may not use this file except in compliance with the License.
  7 |  *
  8 |  * Unless required by applicable law or agreed to in writing, software
  9 |  * distributed under the License is distributed on an "AS IS" BASIS,
 10 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 11 |  * See the License for the specific language governing permissions and
 12 |  * limitations under the License.
 13 |  */
 14 | #pragma once
 15 | 
 16 | #include <cstddef>
 17 | #include <cstdint>
 18 | #include <string>
 19 | #include <string_view>
 20 | #include <variant>
 21 | 
 22 | namespace unicode
 23 | {
 24 | 
 25 | /// Converts an UTF-32 codepoint into a UTF-8 sequence.
 26 | ///
 27 | /// @param character UTF-32 character to encode to UTF-8
 28 | /// @param result target memory location to start writing to (up to 4 chars)
 29 | ///
 30 | /// @return number of bytes written to.
 31 | constexpr inline unsigned to_utf8(char32_t character, uint8_t* result)
 32 | {
 33 |     if (character <= 0x7F)
 34 |     {
 35 |         result[0] = static_cast<uint8_t>(character & 0b0111'1111);
 36 |         return 1;
 37 |     }
 38 |     else if (character <= 0x07FF)
 39 |     {
 40 |         result[0] = static_cast<uint8_t>(((character >> 6) & 0b0001'1111) | 0b1100'0000);
 41 |         result[1] = static_cast<uint8_t>(((character >> 0) & 0b0011'1111) | 0b1000'0000);
 42 |         return 2;
 43 |     }
 44 |     if (character <= 0xFFFF)
 45 |     {
 46 |         result[0] = static_cast<uint8_t>(((character >> 12) & 0b0000'1111) | 0b1110'0000);
 47 |         result[1] = static_cast<uint8_t>(((character >> 6) & 0b0011'1111) | 0b1000'0000);
 48 |         result[2] = static_cast<uint8_t>(((character >> 0) & 0b0011'1111) | 0b1000'0000);
 49 |         return 3;
 50 |     }
 51 |     else
 52 |     {
 53 |         result[0] = static_cast<uint8_t>(((character >> 18) & 0b0000'0111) | 0b1111'0000);
 54 |         result[1] = static_cast<uint8_t>(((character >> 12) & 0b0011'1111) | 0b1000'0000);
 55 |         result[2] = static_cast<uint8_t>(((character >> 6) & 0b0011'1111) | 0b1000'0000);
 56 |         result[3] = static_cast<uint8_t>(((character >> 0) & 0b0011'1111) | 0b1000'0000);
 57 |         return 4;
 58 |     }
 59 | }
 60 | 
 61 | /// Converts a UTF-32 string into an UTF-8 sring.
 62 | inline std::string to_utf8(char32_t const* characters, size_t n)
 63 | {
 64 |     std::string s;
 65 |     s.reserve(n);
 66 |     for (size_t i = 0; i < n; ++i)
 67 |     {
 68 |         uint8_t bytes[4];
 69 |         unsigned const len = to_utf8(characters[i], bytes);
 70 |         s.append((char const*) bytes, len);
 71 |     }
 72 | 
 73 |     return s;
 74 | }
 75 | 
 76 | inline std::string to_utf8(char32_t character)
 77 | {
 78 |     return to_utf8(&character, 1);
 79 | }
 80 | 
 81 | inline std::string to_utf8(std::u32string const& characters)
 82 | {
 83 |     return to_utf8(characters.data(), characters.size());
 84 | }
 85 | 
 86 | inline std::string to_utf8(std::u32string_view const& characters)
 87 | {
 88 |     return to_utf8(characters.data(), characters.size());
 89 | }
 90 | 
// State carried between successive calls of the byte-wise from_utf8() decoder.
struct utf8_decoder_state
{
    // Codepoint bits accumulated so far for the sequence in progress.
    char32_t character = 0;
    // Total number of bytes announced by the lead byte; 0 means no sequence is in progress.
    unsigned expectedLength = 0;
    // Number of bytes consumed so far for the current sequence.
    unsigned currentLength = 0;
};

// Alternatives of a decoding step:
//   Invalid    - the byte cannot start a sequence, or it interrupted an unfinished one.
//   Incomplete - more bytes are needed to finish the current sequence.
//   Success    - a complete codepoint was decoded; `value` holds it.
// clang-format off
// NOLINTBEGIN(readability-identifier-naming)
struct Invalid { };
struct Incomplete { };
struct Success { char32_t value; };
// NOLINTEND(readability-identifier-naming)
// clang-format on

// Result of a decoding step. A default-constructed ConvertResult holds Invalid
// (the first alternative).
using ConvertResult = std::variant<Invalid, Incomplete, Success>;

/// Progressively decodes a UTF-8 codepoint.
///
/// Feed the input one byte at a time, reusing the same @p state between calls.
ConvertResult from_utf8(utf8_decoder_state& state, uint8_t value) noexcept;
110 | 
111 | inline unsigned from_utf8i(utf8_decoder_state& state, uint8_t value)
112 | {
113 |     auto const result = from_utf8(state, value);
114 | 
115 |     if (std::holds_alternative<Incomplete>(result))
116 |         return static_cast<unsigned>(-1);
117 | 
118 |     if (std::holds_alternative<Invalid>(result))
119 |         return static_cast<unsigned>(-2);
120 | 
121 |     return std::get<Success>(result).value;
122 | }
123 | 
124 | inline ConvertResult from_utf8(uint8_t const* bytes, size_t* size)
125 | {
126 |     auto state = utf8_decoder_state {};
127 |     auto result = ConvertResult {};
128 | 
129 |     do
130 |         result = from_utf8(state, *bytes++);
131 |     while (std::holds_alternative<Incomplete>(result));
132 | 
133 |     if (size)
134 |         *size = state.currentLength;
135 | 
136 |     return result;
137 | }
138 | 
139 | #if 0 // TODO(do that later) __cplusplus > 201703L // C++20 (char8_t)
140 | inline ConvertResult from_utf8(char8_t const* bytes, size_t* size)
141 | {
142 |     return from_utf8((uint8_t const*)(bytes), size);
143 | }
144 | #endif
145 | 
146 | inline ConvertResult from_utf8(char const* bytes, size_t* size)
147 | {
148 |     return from_utf8((uint8_t const*) (bytes), size);
149 | }
150 | 
151 | template <typename T = char32_t>
152 | inline std::basic_string<T> from_utf8(std::string_view bytes)
153 | {
154 |     static_assert(sizeof(T) == 4);
155 |     std::basic_string<T> s;
156 |     size_t offset = 0;
157 |     while (offset < bytes.size())
158 |     {
159 |         size_t i {};
160 |         ConvertResult const result = from_utf8(bytes.data() + offset, &i);
161 |         if (std::holds_alternative<Success>(result))
162 |             s += T(std::get<Success>(result).value);
163 |         offset += i;
164 |     }
165 |     return s;
166 | }
167 | 
168 | } // namespace unicode
169 | 


--------------------------------------------------------------------------------
/src/libunicode/utf8_grapheme_segmenter.h:
--------------------------------------------------------------------------------
  1 | /**
  2 |  * This file is part of the "libunicode" project
  3 |  *   Copyright (c) 2020 Christian Parpart <christian@parpart.family>
  4 |  *
  5 |  * Licensed under the Apache License, Version 2.0 (the "License");
  6 |  * you may not use this file except in compliance with the License.
  7 |  *
  8 |  * Unless required by applicable law or agreed to in writing, software
  9 |  * distributed under the License is distributed on an "AS IS" BASIS,
 10 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 11 |  * See the License for the specific language governing permissions and
 12 |  * limitations under the License.
 13 |  */
#pragma once

#include <libunicode/convert.h>
#include <libunicode/grapheme_segmenter.h>
#include <libunicode/utf8.h>

#include <ostream>
#include <string>
#include <string_view>
#include <variant>
 21 | namespace unicode
 22 | {
 23 | 
// Range-like view that splits UTF-8 text into grapheme clusters.
//
// Only a std::string_view is stored, so the text passed to the constructor must outlive
// the segmenter and every iterator obtained from it.
struct utf8_grapheme_segmenter
{
    class iterator;

    explicit utf8_grapheme_segmenter(std::string_view text) noexcept;
    utf8_grapheme_segmenter(utf8_grapheme_segmenter const&) noexcept = default;
    utf8_grapheme_segmenter(utf8_grapheme_segmenter&&) noexcept = default;
    utf8_grapheme_segmenter& operator=(utf8_grapheme_segmenter const&) noexcept = default;
    utf8_grapheme_segmenter& operator=(utf8_grapheme_segmenter&&) noexcept = default;

    // Iterator positioned at the first grapheme cluster of the text.
    iterator begin() const noexcept;
    // Past-the-end iterator (positioned at the end of the text).
    iterator end() const noexcept;

  private:
    std::string_view _text;
};
 40 | 
// Forward-moving iterator over the grapheme clusters of a UTF-8 byte range.
//
// Dereferencing yields the current cluster as a UTF-32 string. The iterator decodes one
// codepoint ahead of the cluster it exposes, because deciding whether a cluster ends
// requires looking at the codepoint that follows it.
class utf8_grapheme_segmenter::iterator
{
  public:
    using value_type = std::u32string;

    // Positions the iterator at `data`; unless data == end, the first cluster is decoded
    // immediately.
    iterator(char const* data, char const* end) noexcept;
    iterator(iterator const&) = default;
    iterator(iterator&&) noexcept = default;
    iterator& operator=(iterator const&) = default;
    iterator& operator=(iterator&&) noexcept = default;

    // The codepoints of the current grapheme cluster.
    value_type const& value() const noexcept;
    value_type const& operator*() const noexcept;

    // Advances to the next grapheme cluster.
    iterator& operator++() noexcept;
    iterator operator++(int) noexcept;

    // Two iterators compare equal when their current clusters start at the same byte.
    bool operator==(iterator const& other) const noexcept;
    bool operator!=(iterator const& other) const noexcept;

    // private:
    // Decodes the next codepoint into the look-ahead slot and returns the previous look-ahead.
    char32_t consumeCodepoint() noexcept;
    // Collects codepoints into _cluster until a grapheme break (or the end of input) is reached.
    void consumeGraphemeCluster() noexcept;

    char const* _start;              // first byte of the input, as passed to the constructor
    char const* _clusterStart;       // first byte of the current cluster
    char const* _nextCodepointStart; // first byte of the look-ahead codepoint
    char const* _nextUtf8;           // next byte to feed into the decoder
    char const* _end;                // one past the last input byte
    utf8_decoder_state _utf8_decoder_state {};
    ConvertResult _result = Incomplete {}; // result of the most recent decoder step
    char32_t _nextCodepoint {};            // look-ahead codepoint (decoded but not yet emitted)
    value_type _cluster {};                // codepoints of the current cluster
};
 75 | 
 76 | // {{{ utf8_grapheme_segmenter implementation
 77 | inline utf8_grapheme_segmenter::utf8_grapheme_segmenter(std::string_view text) noexcept: _text { text }
 78 | {
 79 | }
 80 | 
 81 | inline utf8_grapheme_segmenter::iterator utf8_grapheme_segmenter::begin() const noexcept
 82 | {
 83 |     return iterator { _text.data(), _text.data() + _text.size() };
 84 | }
 85 | 
 86 | inline utf8_grapheme_segmenter::iterator utf8_grapheme_segmenter::end() const noexcept
 87 | {
 88 |     return iterator { _text.data() + _text.size(), _text.data() + _text.size() };
 89 | }
 90 | // }}}
 91 | 
 92 | // {{{ iterator implementation
 93 | inline utf8_grapheme_segmenter::iterator::iterator(char const* data, char const* end) noexcept:
 94 |     _start { data }, _clusterStart { data }, _nextCodepointStart { data }, _nextUtf8 { data }, _end { end }
 95 | {
 96 |     if (data != end)
 97 |     {
 98 |         consumeCodepoint();
 99 |         consumeGraphemeCluster();
100 |     }
101 | }
102 | 
103 | inline utf8_grapheme_segmenter::iterator::value_type const& utf8_grapheme_segmenter::iterator::value() const noexcept
104 | {
105 |     return _cluster;
106 | }
107 | 
108 | inline utf8_grapheme_segmenter::iterator::value_type const& utf8_grapheme_segmenter::iterator::operator*() const noexcept
109 | {
110 |     return _cluster;
111 | }
112 | 
113 | inline char32_t utf8_grapheme_segmenter::iterator::consumeCodepoint() noexcept
114 | {
115 |     auto constexpr ReplacementChar = char32_t { 0xFFFD };
116 |     _nextCodepointStart = _nextUtf8;
117 |     while (_nextUtf8 != _end)
118 |     {
119 |         _result = from_utf8(_utf8_decoder_state, uint8_t(*_nextUtf8++));
120 |         if (std::holds_alternative<Success>(_result))
121 |         {
122 |             auto const result = _nextCodepoint;
123 |             _nextCodepoint = std::get<Success>(_result).value;
124 |             return result;
125 |         }
126 |         if (std::holds_alternative<Invalid>(_result))
127 |         {
128 |             auto const result = _nextCodepoint;
129 |             _nextCodepoint = ReplacementChar;
130 |             return result;
131 |         }
132 |     }
133 |     auto const result = _nextCodepoint;
134 |     _nextCodepoint = 0;
135 |     return result;
136 | }
137 | 
138 | inline void utf8_grapheme_segmenter::iterator::consumeGraphemeCluster() noexcept
139 | {
140 |     _clusterStart = _nextCodepointStart;
141 |     _cluster.clear();
142 | 
143 |     bool nonbreakable = true;
144 |     while (_nextCodepointStart != _end && nonbreakable)
145 |     {
146 |         _cluster.push_back(consumeCodepoint());
147 |         nonbreakable = unicode::grapheme_segmenter::nonbreakable(_cluster.back(), _nextCodepoint);
148 |     }
149 | }
150 | 
151 | inline utf8_grapheme_segmenter::iterator& utf8_grapheme_segmenter::iterator::operator++() noexcept
152 | {
153 |     consumeGraphemeCluster();
154 |     return *this;
155 | }
156 | 
157 | inline utf8_grapheme_segmenter::iterator utf8_grapheme_segmenter::iterator::operator++(int) noexcept
158 | {
159 |     auto tmp(*this);
160 |     ++*this;
161 |     return tmp;
162 | }
163 | 
164 | inline bool utf8_grapheme_segmenter::iterator::operator==(iterator const& other) const noexcept
165 | {
166 |     return _clusterStart == other._clusterStart;
167 | }
168 | 
169 | inline bool utf8_grapheme_segmenter::iterator::operator!=(iterator const& other) const noexcept
170 | {
171 |     return !(*this == other);
172 | }
173 | // }}}
174 | 
175 | } // namespace unicode
176 | 
177 | namespace std
178 | {
179 | inline ostream& operator<<(ostream& os, unicode::utf8_grapheme_segmenter::iterator const& i)
180 | {
181 |     os << '"' << unicode::convert_to<char>(std::u32string_view(i.value())) << '"';
182 |     return os;
183 | }
184 | } // namespace std
185 | 


--------------------------------------------------------------------------------
/src/libunicode/utf8_grapheme_segmenter_test.cpp:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * This file is part of the "libunicode" project
 3 |  *   Copyright (c) 2020 Christian Parpart <christian@parpart.family>
 4 |  *
 5 |  * Licensed under the Apache License, Version 2.0 (the "License");
 6 |  * you may not use this file except in compliance with the License.
 7 |  *
 8 |  * Unless required by applicable law or agreed to in writing, software
 9 |  * distributed under the License is distributed on an "AS IS" BASIS,
10 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11 |  * See the License for the specific language governing permissions and
12 |  * limitations under the License.
13 |  */
14 | #include <libunicode/utf8_grapheme_segmenter.h>
15 | 
16 | #include <catch2/catch_test_macros.hpp>
17 | 
18 | #include <format>
19 | #include <string_view>
20 | 
21 | using namespace std::string_literals;
22 | using namespace std::string_view_literals;
23 | 
24 | namespace
25 | {
26 | 
/// Returns a copy of @p s in which every byte that std::isprint() rejects is
/// replaced by a backslash-x escape with two upper-case hex digits; all other
/// bytes are copied unchanged.
std::string escape(std::string const& s)
{
    std::string t;
    t.reserve(s.size());
    for (char const ch: s)
    {
        // std::isprint() requires a value representable as unsigned char (or EOF).
        // Where plain char is signed, bytes >= 0x80 are negative, so convert first.
        auto const byte = static_cast<unsigned char>(ch);
        if (std::isprint(byte))
            t += ch;
        else
            t += std::format("\\x{:02X}", static_cast<unsigned>(byte));
    }
    return t;
}
37 | 
/// Test driver: concatenates all @p expects (converted to UTF-8) into one
/// input string, runs utf8_grapheme_segmenter over it, and requires that the
/// produced clusters match @p expects one by one, in order.
///
/// Afterwards it requires that the iterator equals end(), dereferences to an
/// empty view, and that incrementing it once more leaves both properties intact.
///
/// @param expects one or more UTF-32 strings, each being one expected cluster.
template <typename... Ts>
void test_utf8_grapheme_cluster_segmentation(Ts... expects)
{
    static_assert(sizeof...(expects) != 0);

    // The segmenter input is the UTF-8 concatenation of all expected clusters.
    auto const text = (unicode::convert_to<char>(expects) + ...);

    auto const grapheme_segmenter = unicode::utf8_grapheme_segmenter(std::string_view(text));
    auto i = grapheme_segmenter.begin();
    auto const e = grapheme_segmenter.end();
    // Renders a UTF-32 sequence as escaped UTF-8 for comparison and diagnostics.
    auto const s8 = [](auto const s) -> std::string {
        return escape(unicode::convert_to<char>(std::u32string_view(s)));
    };

    // Checks the current cluster against `expected`, then advances the iterator.
    auto const checkOne = [&](std::u32string_view expected) -> void {
        INFO(std::format("expects: {}, actual {}", s8(expected), s8(*i)));
        REQUIRE(s8(*i) == s8(expected));
        REQUIRE(i != e);
        ++i;
    };

    // Fold over the pack: one check per expected cluster, left to right.
    (checkOne(expects), ...);

    REQUIRE(i == e);
    REQUIRE(*i == U""sv);

    // Incrementing at the end must keep the iterator at the end.
    ++i;
    REQUIRE(i == e);
    REQUIRE(*i == U""sv);
}
68 | 
69 | } // namespace
70 | 
// Empty input: begin() must already equal end(), dereference to an empty
// view, and stay that way after being incremented.
TEST_CASE("utf8_grapheme_segmenter.empty")
{
    auto const grapheme_segmenter = unicode::utf8_grapheme_segmenter(""sv);
    auto i = grapheme_segmenter.begin();
    auto const e = grapheme_segmenter.end();
    REQUIRE(i == e);
    REQUIRE(*i == U""sv);
    // Incrementing past the end is expected to be harmless.
    ++i;
    REQUIRE(i == e);
    REQUIRE(*i == U""sv);
}
82 | 
// Each argument below is one expected grapheme cluster; the helper feeds the
// concatenation of all arguments to the segmenter.
TEST_CASE("utf8_grapheme_segmenter.mixed")
{
    // Plain ASCII: one cluster per character.
    test_utf8_grapheme_cluster_segmentation(U"Y"sv, U"e"sv, U"s"sv);
    // Multi-byte (box drawing) characters: still one cluster each.
    test_utf8_grapheme_cluster_segmentation(U"├"sv, U"─"sv);
    // Multi-byte and ASCII characters mixed.
    test_utf8_grapheme_cluster_segmentation(U"├"sv, U"─"sv, U" "sv, U"Y"sv, U"e"sv, U"s"sv);
    // A five-codepoint emoji sequence (U+1F926 U+1F3FC U+200D U+2642 U+FE0F)
    // must come out as a single cluster between two ASCII characters.
    test_utf8_grapheme_cluster_segmentation(U"X"sv, U"\U0001F926\U0001F3FC\u200D\u2642\uFE0F"sv, U"5"sv);
}
90 | 


--------------------------------------------------------------------------------
/src/libunicode/width.cpp:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * This file is part of the "libunicode" project
 3 |  *   Copyright (c) 2020 Christian Parpart <christian@parpart.family>
 4 |  *
 5 |  * Licensed under the Apache License, Version 2.0 (the "License");
 6 |  * you may not use this file except in compliance with the License.
 7 |  *
 8 |  * Unless required by applicable law or agreed to in writing, software
 9 |  * distributed under the License is distributed on an "AS IS" BASIS,
10 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11 |  * See the License for the specific language governing permissions and
12 |  * limitations under the License.
13 |  */
14 | #include <libunicode/codepoint_properties.h>
15 | #include <libunicode/ucd.h>
16 | #include <libunicode/width.h>
17 | 
18 | namespace unicode
19 | {
20 | 
21 | unsigned width(char32_t codepoint) noexcept
22 | {
23 |     return codepoint_properties::get(codepoint).char_width;
24 | }
25 | 
26 | } // namespace unicode
27 | 


--------------------------------------------------------------------------------
/src/libunicode/width.h:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * This file is part of the "libunicode" project
 3 |  *   Copyright (c) 2020 Christian Parpart <christian@parpart.family>
 4 |  *
 5 |  * Licensed under the Apache License, Version 2.0 (the "License");
 6 |  * you may not use this file except in compliance with the License.
 7 |  *
 8 |  * Unless required by applicable law or agreed to in writing, software
 9 |  * distributed under the License is distributed on an "AS IS" BASIS,
10 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11 |  * See the License for the specific language governing permissions and
12 |  * limitations under the License.
13 |  */
14 | #pragma once
15 | 
16 | namespace unicode
17 | {
18 | 
/// Returns the number of text columns the given codepoint would need to be displayed.
///
/// @param codepoint the Unicode codepoint to query.
/// @return the number of columns for that codepoint.
unsigned width(char32_t codepoint) noexcept;
21 | 
22 | } // namespace unicode
23 | 


--------------------------------------------------------------------------------
/src/libunicode/width_test.cpp:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * This file is part of the "libterminal" project
 3 |  *   Copyright (c) 2020 Christian Parpart <christian@parpart.family>
 4 |  *
 5 |  * Licensed under the Apache License, Version 2.0 (the "License");
 6 |  * you may not use this file except in compliance with the License.
 7 |  *
 8 |  * Unless required by applicable law or agreed to in writing, software
 9 |  * distributed under the License is distributed on an "AS IS" BASIS,
10 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11 |  * See the License for the specific language governing permissions and
12 |  * limitations under the License.
13 |  */
14 | #include <libunicode/width.h>
15 | 
16 | #include <catch2/catch_test_macros.hpp>
17 | 
// Spot-checks unicode::width() on a handful of representative codepoints.
TEST_CASE("random test", "[width]")
{
    // C0 (U+0007)
    CHECK(unicode::width(0x07) == 0);

    // US-ASCII (U+0020 through U+007E)
    for (char32_t i = 0x20; i <= 0x7E; ++i)
        CHECK(unicode::width(i) == 1);

    CHECK(unicode::width(U'\u00A9') == 1); // Copyright symbol

    CHECK(unicode::width(U'\u200D') == 0); // ZWJ
    CHECK(unicode::width(U'\uFE0E') == 0); // variation selector-15 (text presentation)
    CHECK(unicode::width(U'\uFE0F') == 0); // variation selector-16 (emoji presentation)

    // emoji
    CHECK(unicode::width(U'\U0001F60A') == 2); // 😊 :blush:
    CHECK(unicode::width(U'\U0001F480') == 2); // 💀 :skull:
}
37 | 


--------------------------------------------------------------------------------
/src/libunicode/word_segmenter.h:
--------------------------------------------------------------------------------
  1 | /**
  2 |  * This file is part of the "libunicode" project
  3 |  *   Copyright (c) 2020 Christian Parpart <christian@parpart.family>
  4 |  *
  5 |  * Licensed under the Apache License, Version 2.0 (the "License");
  6 |  * you may not use this file except in compliance with the License.
  7 |  *
  8 |  * Unless required by applicable law or agreed to in writing, software
  9 |  * distributed under the License is distributed on an "AS IS" BASIS,
 10 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 11 |  * See the License for the specific language governing permissions and
 12 |  * limitations under the License.
 13 |  */
 14 | #pragma once
 15 | 
 16 | #include <string_view>
 17 | 
 18 | namespace unicode
 19 | {
 20 | 
 21 | class word_segmenter
 22 | {
 23 |   public:
 24 |     using char_type = char32_t;
 25 |     using iterator = char_type const*;
 26 |     using view_type = std::basic_string_view<char_type>;
 27 | 
 28 |     constexpr word_segmenter(std::basic_string_view<char_type> const& str): word_segmenter(str.data(), str.data() + str.size()) {}
 29 | 
 30 |     constexpr word_segmenter(): word_segmenter({}, {}) {}
 31 | 
 32 |     constexpr bool empty() const noexcept { return size() == 0; }
 33 |     constexpr std::size_t size() const noexcept { return static_cast<size_t>(_right - _left); }
 34 |     constexpr view_type operator*() const noexcept { return view_type(_left, size()); }
 35 | 
 36 |     constexpr word_segmenter& operator++() noexcept
 37 |     {
 38 |         _left = _right;
 39 |         while (_right != _end)
 40 |         {
 41 |             switch (_state)
 42 |             {
 43 |                 case State::NoWord:
 44 |                     if (!isDelimiter(*_right))
 45 |                     {
 46 |                         _state = State::Word;
 47 |                         return *this;
 48 |                     }
 49 |                     break;
 50 |                 case State::Word:
 51 |                     if (isDelimiter(*_right))
 52 |                     {
 53 |                         _state = State::NoWord;
 54 |                         return *this;
 55 |                     }
 56 |                     break;
 57 |             }
 58 |             ++_right;
 59 |         }
 60 |         return *this;
 61 |     }
 62 | 
 63 |     constexpr bool operator==(word_segmenter const& rhs) const noexcept { return _left == rhs._left && _right == rhs._right; }
 64 | 
 65 |     constexpr bool operator!=(word_segmenter const& rhs) const noexcept { return !(*this == rhs); }
 66 | 
 67 |   private:
 68 |     constexpr word_segmenter(iterator begin, iterator end):
 69 |         _left { begin },
 70 |         _right { begin },
 71 |         _state { begin != end ? (isDelimiter(*_right) ? State::NoWord : State::Word) : State::NoWord },
 72 |         _end { end }
 73 |     {
 74 |         ++*this;
 75 |     }
 76 | 
 77 |     constexpr bool isDelimiter(char_type character) const noexcept
 78 |     {
 79 |         switch (character)
 80 |         {
 81 |             case ' ':
 82 |             case '\r':
 83 |             case '\n':
 84 |             case '\t': return true;
 85 |             default: return false;
 86 |         }
 87 |     }
 88 | 
 89 |     // private fields
 90 |     //
 91 |     enum class State
 92 |     {
 93 |         Word,
 94 |         NoWord
 95 |     };
 96 | 
 97 |     iterator _left;
 98 |     iterator _right;
 99 |     State _state;
100 |     iterator _end;
101 | };
102 | 
103 | } // namespace unicode
104 | 


--------------------------------------------------------------------------------
/src/libunicode/word_segmenter_test.cpp:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * This file is part of the "libterminal" project
 3 |  *   Copyright (c) 2020 Christian Parpart <christian@parpart.family>
 4 |  *
 5 |  * Licensed under the Apache License, Version 2.0 (the "License");
 6 |  * you may not use this file except in compliance with the License.
 7 |  *
 8 |  * Unless required by applicable law or agreed to in writing, software
 9 |  * distributed under the License is distributed on an "AS IS" BASIS,
10 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11 |  * See the License for the specific language governing permissions and
12 |  * limitations under the License.
13 |  */
14 | #include <libunicode/word_segmenter.h>
15 | 
16 | #include <catch2/catch_test_macros.hpp>
17 | 
18 | using namespace unicode;
19 | using namespace std::string_literals;
20 | using namespace std;
21 | 
// Walks the segmenter over a sentence containing a mixed run of delimiters
// and checks every run it yields, including the empty run at the end.
TEST_CASE("word_segmenter.HelloWorld", "[word_segmenter]")
{
    auto constexpr s = U"Hello, \t World!"sv;

    // The first run is available right after construction.
    auto ws = word_segmenter(s);
    CHECK(*ws == U"Hello,");
    CHECK(ws.size() == 6);

    // Consecutive delimiters (space, tab, space) form a single run.
    ++ws;
    CHECK(*ws == U" \t ");
    CHECK(ws.size() == 3);

    ++ws;
    CHECK(*ws == U"World!");
    CHECK(ws.size() == 6);

    // Past the last run the segmenter yields an empty view.
    ++ws;
    CHECK(*ws == U"");
    CHECK(ws.size() == 0);
}
42 | 


--------------------------------------------------------------------------------
/src/tools/CMakeLists.txt:
--------------------------------------------------------------------------------
# Build and install the unicode-query executable; everything below is skipped
# unless LIBUNICODE_TOOLS is enabled.
if(LIBUNICODE_TOOLS)
    add_executable(unicode-query unicode-query.cpp)
    target_link_libraries(unicode-query unicode)
    # When LIBUNICODE_BUILD_STATIC is set, additionally pass -static on the link line.
    if(LIBUNICODE_BUILD_STATIC)
        target_link_libraries(unicode-query "-static")
    endif()
    # Installed into <prefix>/bin.
    install(TARGETS unicode-query DESTINATION bin)
endif()
9 | 


--------------------------------------------------------------------------------
/tests/zalgo.txt:
--------------------------------------------------------------------------------
1 | [11;72H[39;49mPack my box with five dozen liquor jugs.[12;72H˙sɓnɾ ɹonbıl uǝzop ǝʌıɟ ɥʇıʍ xoq ʎɯ ʞɔɐԀ[13;72H.ꙅǫuꞁ ɿoupi| ᴎɘƹob ɘviᎸ ʜƚiw xod ʏm ʞɔɒꟼ[14;71HP̸̯̼͙̻̲͚̜͚͈̩̎͠a̶̯̳̱̟͚͇̩̯̬͂̒̒̌̅͊̽̿͗̈́͘͝ͅć̸̮̦̩̭͓̫̟̹̆͂͒̓͆̈̅̀͐̿̚ͅk̶̡̻̜̼̙͍̥̗̯̠̜͓̪̽ ̷̮͚̺͎̗̂̈́̿̑͝m̴̩͍̺̟͓̼͓͇̟̙͂̏̈́͆̎́̐̐̽̕̚͘͜ͅy̵̧̻̗̦̯̱̬̤͈̦̺͗̓͛ ̵̣̤͕͙̟͛͌͑̉͘͝b̵̨̡̨̯̞̘͕̰̙̬̳͇̮͖̹͗o̶̻̫͑̎͑̽̈́̚̚̕ͅx̵̢̦̗̳̝̻̗̟̘̻̼͚̰̓̇̓͛̀ ̸͈̜͙́̍͂̅̃̿͘w̵̧̪̻̤̮̑͊͌̈́͋́̔͂̑̌͑͋̇͂ḯ̴̺͛̒̔̏́̅̓̾̔̊̆͗͠͝t̷̢̛̟͉͕̗̙̭̖͈̼̂̎́͌͜͝h̸̢̨̜̤̗͎̳̖͙̺̹̭̘̞̀̀̓̊̐̀͐̈́̀̿͆̔̄͝ͅ ̴̹̙̜̥͕͖͑́͛̈́̄̈́̿̕f̶̡̢̳̗͉̩͖̹͚̗̩̰͖̀̂͗͌̑i̴̧̝̰͎͕̣͓͓̋͒̇̀̾͐̃̚̕v̵̛̠̩̪̻̟͕̭͕̗̲̼̽͗͐̄̈́̾͂̍̔̎͌̌͘͝e̵̢̤͇̪̻͚̜͉̻͉̝͙͗̀̃̐͋̌̋̌̈́̅̈̌̉͘͜ ̶̡̢̭͙̤͑̑͂̐͛̑̍͑͌̀͛d̴̨̧͍̫͔͔̫̻̗̙͖̞̱͆̒͂̈̐͑̕̚͠͝ó̸̰̠̦̦̞̼̘͔̥͎͕̦̯̑̀̇̈́̎̄͘z̴̡̧̡̦̦̙̞̪̣̤͕̫̳̈̉͌̃͌͛̀͌̎̃̌͒͜͝ȩ̷̻͖̥̬̹̖̫͛͐̍̂̾̀͑͊̎̀̊̏̕ͅn̵̢̢̧͚̜͉̯̲͕̒͊̒͌͋͗̓́͂͝ ̵̡̧̨̛̛͓͚̘̺̲̺̻̻̫̾̄̒̑̄̄̏̇̍̽͜l̴̗̩͍̰̇i̴̡̭̳͉̘̩͚̽̏̿̈̔́̂̈́̊͝q̷̫͚̌̅̈́̓̐̎ů̶̢͈̪͔̅̓̀̓̓̈͆̍͋͋̉͝͠ǫ̵̻͠r̵̰̯̠̟̬͖̳͔̚ͅ ̶̡̣̭̥̻̭͙͎̰̜̥̜̊j̷̧̡̟̝̼̞̭͙͈̘̇̾̽͊̄̈̍͗͒͑͜u̷̡̪̤̖̣̰͈̽̀̚͜g̸̨̢̧̳̙̝̠̩̜̻͙̘̪̞̈́͐̈́̇̆̎̈ͅs̶̫͑͂̂͛̋̇̅̒͝.̴͈̖̮̪̮͓̹̈̐̇̓̇͝ ̵̭̤͐͗̇̽̎́͆͋͌͜[15;72HⓅⓐⓒⓚ ⓜⓨ ⓑⓞⓧ ⓦⓘⓣⓗ ⓕⓘⓥⓔ ⓓⓞⓩⓔⓝ ⓛⓘⓠⓤⓞⓡ ⓙⓤⓖⓢ [16;72Hφąçҟ ʍվ ҍօ× աìէհ ƒìѵҽ ժօՀҽղ Ӏìզմօɾ ʝմցʂ.[17;75HP⃞a⃞c⃞k⃞m⃞y⃞b⃞o⃞x⃞w⃞i⃞t⃞h⃞f⃞i⃞v⃞e⃞d⃞o⃞z⃞e⃞n⃞l⃞i⃞q⃞u⃞o⃞r⃞j⃞u⃞g⃞s⃞.⃞[18;75HP⃣a⃣c⃣k⃣m⃣y⃣b⃣o⃣x⃣w⃣i⃣t⃣h⃣f⃣i⃣v⃣e⃣d⃣o⃣z⃣e⃣n⃣l⃣i⃣q⃣u⃣o⃣r⃣j⃣u⃣g⃣s⃣.⃣[19;72Hᴘᴀᴄᴋ ᴍʏ ʙᴏx ᴡɪᴛʜ ꜰɪᴠᴇ ᴅᴏᴢᴇɴ ʟɪQᴜᴏʀ ᴊᴜɢꜱ.[20;74H🅝🅔🅖🅐🅣🅘🅥🅔 🅒🅘🅡🅒🅛🅔🅢 🅐🅡🅔 🅐🅛🅢🅞 🅐🅥🅐🅘🅛🅐🅑🅛🅔 [21;67H🄴 🅂 🄲 🄷 🄴 🅆  🄲 🄸 🅁 🄲 🄻 🄴 🅂  🄶 🄴 🅃  🅂 🅀 🅄 🄰 🅁 🄴 🅂 [22;74H🅰 🅱🅴🅰🆄🆃🅸🅵🆄🅻 🆄🆂🅴 🅾🅵 🅽🅴🅶🅰🆃🅸🆅🅴 🆂🆀🆄🅰🆁🅴🆂 
[25;74Hį̵̢̡̢̨̨̛͈̤͇̭͈̣̗̮̫̫̰̤̳̘̞̥̟͙̞̥͉̳̜̫̣̯̗̠͖̲̪̘̟͉͖̜͔̪͈̜̼̩̥̗͔̭͔̠̺̫̺̦͇̐͛̉̃͒̿̒͆̅̐̆͆͌͋̎̈́̓̔̓͌̌̀̽̾̈́̄̈́̇̒̀͆͆͌̄́̎̋̑̀͊̐̆̚̚̚̚͜͜͜͝͠͠͝ͅͅn̵̡̢̧̧̧̢̢̛̛̙̩͈̦̠̳̜͖̤͍̻̹̼̯̦̦̰͕͈͈̖̬̣̦͓̳̣̮͙͓̣͖̲͉̭̞̤̞͙̝̝͔͓̗͓̣͙͕̲̩̠̬̑̊̈̐͛͒̎̐͒̓̆̌͊̅̿̂͛̈́̈́́̉̽̔̓̔̎̈̿͒̃̊̾̏̇̈́̋́͆̐̈́͛̔̓̄̀͋̊̃͛͑̋͂̔̈̐͐͐̆̇͌̓́́̿́̍̈́̈́̈́͋̄͆̈͛͂̇͗̅̓̀̈́̂͂̆̃̈́̈́̓̓̐̔̌̍̋͐̚̕̚͘͘͘͘̚̚͘͘͜͝͝͝͝͠͝e̶̡̨̨̧̨̨̨̧̦͇̖͔̫̹̺̺̯͈̺̫̗͈̲͕̮͈͓̭̱͔͈͚̬̰̜͇̼̙̰̣͎͕̠̥̯̖̣̬̗͎̓͑͌̊̇͆͐͑̂̎͋̑͐̀̅̂͌̽̉̊̀͌̓̈́̀͛̐́̋̈͑͆̐̑̉̐̍́̈́̅̿͒̕̕̚͜͜͝͝͠ļ̷̢̨̢̢̡̡̣͕͈̩͔̻̬̻͙͙̩̲͇͈̥͎̖̻͖̮̪̞̠̹͍̙͖̳̱͓̘͖̱̱͕̯̰͙̝̣͕̭̮̿̑͘͜͜͜ự̴̡̡̨̨̢̨̨̤͈̝̟̣̠͖̗͙̗̖̦̯͍̘̟͎̝͔͙̗͎̟̺̣͖̝̩̜̜̖̳̹͍̦̩͔̜̘̲͈̮̠͚̭̭̜͉̘̫̰̺̫͖̗̦̮͓̗̻̖͍̥̤͎͖̣̻̝͚͙̤̥͎̖̯͙̯̫̘̜̘̤̠̯͖͉͍̎̈͐̄̊̈̊̌͑̎͂͒͌́̃͐̐̉͗̽̀͂̃̒͋̂̀̊̉́̃̄̄̍̆́̑͑̇̇̾́̊̂͊̐͑̾̀͊̒̓́̂̐̄͋̀̈́̅̏̒̎̊̒͂͐̑͐̉́̄̂̀̕̕͘̕̕͘̕̕̕͜͜͝͝͝͝͝͝͠͝ć̵̡̨̨̢̢̛̛͔̩̦͓̥̠̜̳̯̮̩̜̖͖̘̜̖͖̖̳̹̲̻̯̬̪̰̭͇̮͉̳͉̬̬͇͉̟̲̞̱̯̖̗̞͚͎͓͔͍͈̲͇̳͈̦̠͚̲͎̦͙̰̲̬̝̞̟̪͈̘̖̦̩̝̩͕͈̦̯͙̞̖̥̺̝̗̔͊̂̈́́̿̾̾͛̈͐̔̉̈͐̈́̈̉͑̃̂͋́̆̓̒̈̈̉͛̏̅͌͌̆̓́͊̂̾̈̚͘͘͜͜͜͝͠͝͠͠͠͝͠ͅͅͅt̶̡̡̧̛̛̮̘̹̼̺̹͓̠͖̳̺̹̘̥͙͇̪̩͇̖̦̾͐̍̇͊̅̅̇̀̂̏̀̆̑͗̑͆͊̾̎͗̋̊̇̒͛̀̇͂̒͆̀̊̽̑̀̐̅̔̅̀̋͆͐̋́͆̒̓̐͐́͂̐̋̇̿̑͆̅͑͐́̄́̈́̌̈́͋̄̎͌̈́̅̍̎̓̑̕̚͘̚̚͜͠͠͝͝͠͝ͅą̵̧̛̛͎͕̯̪͚̫̟̺͚̱͎͚̹̟̖̦̼͍̦͙̹̹̰̫̤̯͚̠̼͉̭̳̓͗̈́̎̀̇͌͌͌̐̌̀̾̀̈̍̾̓̀͌͑̔̆̋̇̇̈́̔̌̈́̐͒͌̒̐̋̅̀̉͆̒̓̀͌̒̒͋̐̀̒̇̉̅̏̾̕͘̕͝͝͠͝ͅͅb̷̧̧̡̢̨̧̡̡̛̛̛͈̩̰͖̰̹̯̪͍̯͉͙̺̬̝̟̥̜͙̙͕͇̗̪̬̣̙͕̙̙͉͔̦̦̼̜̳̤̺͕͇̠̲̘̼̲̦̺̝̪̲̣̠̪̮͉̮͙̥̤̫̫͉͍̤͍͙̺̩̫̜͚̞̫̺͍̺̣͍̥̠͓̯̳̳̖̟͉̄̏͛̆͊͆͆̆̐̀̏̎̊͆̾͋̇͑̓̒͑̂͛́͊͗̇̔̾͆͂̒̃͋͊̊̆̀̈̊̌͗̈̿̄̀́̌̌̑͐͂̈́̇͛̒̏̒̋̅̆̈̽̽̅̐͊͆̿̐̌̈̂̓͌̀̋͛͛̎͒͒͋̏̐̀̒̇̀̐́̕͘̕͜͜͜͜͜͝͝͝͠͠͝͠͝ͅl̴̨̨̢̨̨̢̧̨̧̨͍͕̟̲͙̺̯̹̺̟̼̜͍̙̣̼̬̺͕͚̤̙͍͓̳̘͍̠̙͍͎̝̮͍̤̰̩̯̦̠̦̻̣̜͎͉͓͎̦̝̖̲͚̦͓̭̩͉͖̘̟̪͔͎̪̰͓͔̟̬͂̌̈̉͋͘ͅͅͅê̸̡̨̧̧̧̡̧̧̨̧̛̛̝͕̪̲̟̣͍̹̻̠̥̟̯͔̻̗̙̳̺͍̗̬̥̼͕͖͈͔̝̩͙͔̜̩̫͕̠͇͔͍̥͚͕͉̬̩̫̟͉̝̭̖̳̳̰̘͙͓̙͍̙̞̣̖̝̦̰̰̺̰̺̖̦̝͈̲͈̗̬̦̺̹͎̯̭̟̺̯̠̬̮̝̰͙̗͎͈͔̮̲̯̩̪͈͙̲̪̱̦̩̘̲̈̋̒̉̑̈́̍̀̄̇̓́̓̾̌̒̏͌͒̂͆̆̈́́͐̾̈́͌̍͌̉̐̐̑̇̽̑̋̆͗̑̆̌͂͛̑́̒̅̌̌̀͗͂̅̎̋̈́̅͑̌̄̃̾̎͌͒͑͊̍̂̾͋̉̒̋̃̒̑̈͋̊̒͆̏̎̊̃̕̚̚̕͘̕͘͘̚͜͜͠͠͝͝͝͝͝͝͠͠͝ͅͅͅͅͅ 
̷̧̡̢̡̧̢̧̧̡̨̖̤̺̤̮̗̭̬̰̰̟̳͚̱̩͍̱̰̩̠̮͕͓̣̺̳͈͔̟̳̣͈̱̰̬̞̖̦͔̼̹̥͖͈͚̩̼͖̬̞̩͎͇̞̬̯̟͕̞̗͖̘̙͖̞͓̜̭͖̹̱̙̖͔̭̲̻̘͚͇͍̱̹͇̱̬̘̲͕̝̗̳̰͖̮̞͉̗̝͖̥̹̣̣̬̠̘̦͚̫̜̻̱̌̀͂̃̑̓̄̈́͐̌͒̐́͑̂̇͋͗́́͗̍͊̐͑̾͂̍̈́̏̈́́̑̌̓̎̈́̐̑͗̊̇̈́͂̚͘̚͜͜͝͝͝͝͝͝ͅͅͅm̷̢̡̛̛͚̹̻̮̘̤̩̬͙͎̈́̿̂̍͂̽̀͒͌́̃͑̓̿̉̀͑̈̓͒͘̕͝͝͝͠ͅǫ̶̢̢̢̨̼͇̗͕͖͚̤̤̰̰̯̜͕͍̠̳̰͕̣͍̱̞̮̠̳̤̭̰̫͖̘͉̠̝̹̩̳̳̱͔͍̲̪̱͔̫̠͑́͋́́̓̀͋̈́̍̉̅̾́͘͘͜͜͠ḑ̸̢̨̡̛̛͍̞͉̝̖͓̝̳̝̲͈̗̼͖̤̯͈̮̮͇̹̲̰̟͎̼̏͂́̃̈̀̾̅̊̍̒͛̓̒̆̀͐͗̍̆͛͊̂͗̃́̓͑̃̇͒͌͛̆̋̈́̑̕͝ͅą̸̛̛̗̠̙̳̣̯͇̙̯̰̊͒̆̔͛͌̄̓̒͐̑̏̃̊̏͌͊͑̒̂̂͂̈́̐̎̂̀͒̓̅̈́͊́́̒̾̑̒̒̉͛́̃̔̈̀̍͋͑̓͆̂̿̽̏́͑͂̀̍̀̓̒͛̊̋̈̑̎̅̀̉̑̈́͑̓̐͑̾̃̒͊̉̒̓̌̀͑̍͑̾͊̿͌̔̏͗͗̈́͘͘͘̚̕͘̕͘̕͘͠͠͝ͅĺ̴̨̨̡̧̡̧̨̧̡̡̢̡͖͔̗̮͓̼̗͈͔̳̬̞̺͙̩̣̩̥͎̙̭̯̖̪̫͎͇̪̯̬̠͓͖̣͇͍̮̮͈̟̫͚͉̻͈̖̗̠̙̝͙̠̰͚̱̲̤̺̪̭̫̱̫̠̞͓̹̮̖̘̣̓̿́̌͂̀̇͐̈́̾̀̂̍͆͛͗͊̈́̀̈́͑̎͋̒͒̕̚̕̚̕͜͠͠͠ͅͅi̴̧̨̢̝͎͇̠̭̟͇̭͙͚̳̭̘͈̟͓͈̥̭͚͈͍̲̲͔̤͉͈̎̈͂͆̀͛͋͊̏͊͗͋̓̐̉̆̒̈́͊̏̈͋́̓̿̾͛̒̓̅͛̚̕͜͜͝͠͝ͅͅṯ̶̡̛̪͚̮̩̤̗͇̗͍̩̬̲͔͙̼̯͉͓̃̐̽̐̌͂̑͊͊͝y̷̨̡̧̢̧̧̧̧̡̡̨̧͈̫͉̟̠̘̜̟̜̻̭̜̦̻̺̮͖̫̺͙̰̟̥̱̝̤̘͙̳̜͔̼̹̹͉̳̰̻̳̩̻̺̫͈̼̘͉͓͎̙͔̣̥̪̺͎̙̹͕͇̖̤̗͙͈͉̭̦̺͈̞̼̗̜̯̞̠̬̱̼͈͇̖̬͕͕̬̮͖̤̜͔͙̻̤͈͓̣̖͓̘̘̬̳̻̜̲̳̏̄̂̈́̒̃̎̇̏͆͊̓̎́̓͛̄͆̃͆̈́́͊̋̈́̾̈͒̍͊̔̆̉̏͐̿͆̂̊̀̌̀̽̆̋̓́͆̓̈́̚͘͜͜͜͝ͅͅͅͅͅ ̴̢̧̧̨̢̨̡̡̨̢̨̛̛̛̥͓̬͍̭̫̬̬̱̩͇̱̗͚̲̘͉̘͇͓̮͇̹̱̠͖̦͈̟̮̬̦̝͚͙̝̠͖̞̖̱̥̜̯̼͓̫̹͖̘͓̣͎̱̘̰̦̺̪͙̜͓̼͚̺̖̞̻͈͚̲͙̗͈̱̯̘͈̟̞̞͈̬̤̦̹̹̿̐̅́͛̒͐͋́̓̊̊̋̊͐̈́̇̍͒̑̎̿̀͑̉͒̾̀̈̈́̂̐͋͑͗̓͒̑̔̌̏͒̑̓͒̏͌̀͌̋̂̇̓̈͗̓͛̉͑͗͒̾̐͊̏̈́̅́̈̾͋̐͂̽̈̏̐̀̆̃͗̓̚̕̕̕̚̚͝͝͝͝ͅǫ̵̢̡̨̡̨̡̨̤̤̖͕̠̥̰̠̩̙̰̮̜̘̪̭̭̯̺̭͚͙̞̜̪̰͚̘̭̻͓͈̺͉̯͔͖̯̠̭̰̫̲͈̦̖͖̪̣̰͖͎̙͚̙̹̰̬̜̲̱̘̘̪̭̣̻̫͓̼̦̦̘̩̥̱̣̺̌̇͒̊̃̐̾̍̈́̅̉̊͐͒̀͆̉̀̇͊̕̕͜͜͝ͅͅͅͅͅͅͅf̷̨̨̧̨̢̢̨̧̛̮̖̝͎̻̟̦̗͕͍̻̫̤̰̣̜̮͙̱̝̮͈̫̯̣̻̪͚̘͍͈̝̱̞͚͔̣̗̱̳̰̣̦̘̦̮̞̬̗͈̘̤̦̞̞̱̠̹̖̣̰̦̦͍̙̫̲̮̲͇̼̗͓̹̹͓̬̩͍̦͍̆͊̀̆͂͑̾̔̎̎̿͂̀̐̈́̈̇͒̔̋̇̒͆̏̋͋͒͊̏̉̏̎͛̋̔͐̈̈́̿͋̽̀̈́͗͒̑̽͋̏̅̌̇̓͑̅̿͐̂͆͆̃̈̈́̌̈́̆̌̋̎̀̾͆͂̿̃̃̐͒͌̃́̑̐̽́̐̔̔̅̄͘̚̚͘̚͘̚͘͜͜͝͝͝͠͠͝͝͠͠ 
̷̡̡̨̨̧̛̛̱̼̜͙̜͔̠̹̺̞̰̞̟̘͚̣̮̼̞̙̘̱͔̖̬͈͖͇̳̪͚̩̰̠̦͖̫͖͈̗͍̩̤̂̆̂͂̓̾̑̉́̏͊́̉͗͌͆̒̑̅̓̅̑͛͐̌̅͑̍̋͛͒̆͆̐̇̒̌́͐̔̐͂̽̋̋̐̾̆̆̀̋̈́̆̔̀̎͐̄̌̀̈͑̂͛̈́͐̈́͋̽͊͑̔̀͛͑͑̐̎͋̾̓͗̍͋̏̈́͛̂͒̉̅̊̂̆͂́̅͑̾͆̈́̇̆̑͋̐̋̚̚̚̚͘͘̚̚̕͘͠͝͝͝͝͝͝͝͝͝ͅt̸̨̛̛̛̛̙͕̪̬̣͚̩͕̺̯̹͂̿̃̊̔̇̄̌̉̒̆͐̈́̌͐̓̿͊͊͆̈́̄̌͂̓̅̐̓̐͒̍̽̓͊̄̏͌́̃̓̉́̀̈̿̓͌̆̅̎̇͒͐̀͗̉̀͂̏̓̀̈́͑͐̾̿̌̉̔̓͌͛̈͌́̊̕̕̚̕̕̕͘̚͠͝͝͠͝͝͝͝͝͠h̴̡̨̡̨̧̛̛̛̳͙̳̦͚̫̟̰͔̘̼̣̳̲̯͍̮͚̝͔̠͔͓̥̫̜̩̟̝͖̼͉͎̹̲̤̝̤̠̤̮͖͙͉̫̦̖͇̤̝̼̝̖̲̠̥̦͍͈͇̼͇̙̟̱͕̦̘͍͕̗̩͇͚̩͙̻̣͙̥͕͎̬̙̙̰͇͕͎͎̹̼̭̘̠̟̲͇̟͖͂̋̏̅̔̏͋̇̀͐̅͂̿̍̒͛́̾͗̑͂͊͂̀̋̓̀̓̽̔̋̐̈̓̀͑̑̒̒̍͑͌͌̎́͛̄͛͛̅́͗̿̚̚̕͘̚͜͝͝͝͝ͅͅę̶̢̡̨̡̧̨̢̢̛̛̛̛̛͉̬̜̘̪͕̪̖͚͚̝̜̜̩̹̗̰̫̼͕̼̰͔̞̤̬̞͓̖͙͙̣̰̭̹̫̬̘̯̭̟͖͉̖̰̗̮̯̭̿͒͒̌͐͛͒̿̋̐͂͌̾̾̀̉͆̾̆̇͋̒̿͂͐̍̄̽͌͆̏̉̐̈́̿̎̒̃̉̀͌̉̈́͌͊̇̒̔͌̈́̆̆̌̅̈͛͆͆̋̓̀͋̓̓̔͛́̃̾͛̂͊͂͑̐̑͑̉̐̂̊̒̕͘̕͘̕͘̚͘͜͜͝͝͝͝͠͠ͅ ̶̡̧̨̧̧̧̨̧̡̨̧̺̯͍͖̺̟̖͇̜͚̟̪̟̤̠̭͓͇̩͉̯̮͉͚͓̯̼̝̘͕͕͔͈͉̱̠̱͇̞̗̠̱̪̟̯͔͖͓̺̫̪̠͉̺͕̟̮̲̰͔̻̮͓͈̮̭̥̱͇̰͈̟̮̥̣̭̯̹̑̈́̿̅̾̃̄͆̆̉̔̂͗͐̒̄̈̈̋̈͊̾̀̓̾̒͘͜͠͝͝v̴̨̡̡̧̨̨̧̛̛̛̛̻͕͔̠͚͎̱̪̮͓̘̟͓͚̜̯͚̼͉͉̯̖͓͖̖̪̹͖̱̝̫̜̖̠̙̺̳͇̭͈̯̹̺̮̝̲͎̮̮̦͓͈͍̳̫̞͓͉̰̺͇̻̩̗̩̞̺̻̬̬̮͈̗͇͉̝͔̺̖̲͉̭͎̞̣̈́̎͊͗̾̌̅͋́̀̍̓͆̈̀͑̎̏̉̽͗̀̄͋̋̍͊̅̽̈́͑̉̓̐̀̾̀̀̎͑͆̿̈́͆͆̉̎͆̈́̃̓͐͑̃̐̊̏̒͛̀͌͑̐̆̅̋̿͐͒͂̅̃̍̄͆̈̎̔̆̓̇̈̾̾̇̿̈͌͆̔̄̀͘̕̚̕͘͘͜͜͜͜͝͠͝͠͝͠͝͝ͅͅͅͅi̵̢̧̧̢̡̡̨̡̛͙͙̲̫͙̪̠̹͕̲̺̻̜̜̭̠̱̟̙͖͖̘͍̣̙̬͈̜̣͍͙̘̟͚͙͕͉̲̺͍̜͎͚̘̫͇̲̗̲̞̞̩̫̗̫͍̲̥̺̮̻̹̝̹͉̪̪̭͓̰̭̠̤̝̰͕̮͍͖͕̙̖̫͉̭͓̰̱̳̫̠͍̬͉̣̱̮̳̲̭̮̪̭̗̳̳̲̞̻̯͙̎̅̈́̍͒̄̀̀̾̒͐̾̃̂̑́̅́̈́̑̈́̏͐͑̑̄̇̍̌̚̕̕̕̕͜͠͠͝͝͝͝ͅͅş̴̛̛͎̜̖͓̏̒̽̽́̿́̀̎́̐͒͋́͊̓̎͌̋̉̒̾͒͐̌̕͘ͅĩ̴̡̡̢̨̛̛̛͙̹̗̭͔̱͈̠̟̹̬̯̯̱͔̞͓̯͓̩̯̻̹̯̤̭̭͔̬͓͍͈͔̝̯͉̻̦̝̪̰̜̣̝̲̱͈̥̦̼͔̹̩͔̫̳̖̭̜̖̯̙̘̗̳͈͇̤̥͔̯̜͕̜̟̬̻̲̎͌͒̈́͗̂̇̓́̎̈́̄̾̿͑̂͒̀͌̆́͛͑̀̐̎͗͐̈́̈͊̐́́̈́͛̊͌̓͂̔̊́̉̋́̂̿̎̋̄͂͂͊͛̍̽͋̑̾͋̎̇̐̊̔͐̓̈́̀̇̾̆̃͑̌̂̀̊̀̒͐̒̍̍̒̒̓͗̎̽̌̄̒̄͌̓͊̿̂̊͋̀̈́̃͋̈̍́͆̑̑͘͘͘͘̕͘͘͜͜͝͝͠͠͠ͅͅͅb̵̧̧͓̻͚̰̰̻͕̮͕̹̱̲̞͙̝̹̆̊̔͑̓̐́͛̿̔̓̿̅̂̉̎͂͗̀̿͒͐̈̑͂̏͒̑̾̽̇̄̾̑̄̊̌̀͒̾͂̉̍͋̋̇̏̆̕̚̕͘͝͝͠͝l̷̢̧̨̢̡̨̢̡̡̛̗̤̹̞̱̬͎̗̣͍͈̦̝͇̱̖̮̩͙͉̪̞̮͙͍͍͔̘̭̻̼̘̺͙̪̳̣̱̲̲͈͔͍͎͙̙̲̟̥̪̺̩̺͓̜̗̬͙̬̳̜͍̻̩̙̭͈̘̗̝̦͓͇̟̭̦͚͖̥̤̦̻͈̘̩̭͇̫̩̺̟̪̝̩̟̰̥̞̥̝̮̖̺̟̺̗̗͇̦͙͙̞̽̑͑̊̐̓̀́̓͗̇̓͗͋̑̆͋̅͋͛̈̋̈́͛̚͘͜͜͜ͅͅͅͅȩ̴̨̨̢̡̨̼̻̻̝͔̱̘͕̥͚̟̦̻͖̺̼͇̼̬̙̦̫̝̗̥̞̥̻̦̙̬̩̠̟̺͖̪͚̲̙̪̲̻̺̺̥͈̝̲͉̟̥͇̰̻̫̟̣̰̫̹̲̦̬̼̦͓͍͎̙̼͎̤̥̻̪̩͖̼̱͇̗̋͋̐͂͑̊͊̉̿̉͗̎͋͌͌̀͒͋̐̃͊͗̆̑̀͒̈́̂̀͒͌̕̚̕͘͜ͅ


--------------------------------------------------------------------------------
/vcpkg.json:
--------------------------------------------------------------------------------
1 | {
2 |     "$schema": "https://raw.githubusercontent.com/microsoft/vcpkg-tool/main/docs/vcpkg.schema.json",
3 |     "builtin-baseline": "80403036a665cb8fcc1a1b3e17593d20b03b2489",
4 |     "dependencies": [
5 |         { "name": "benchmark", "version>=": "1.8.3" },
6 |         { "name": "catch2", "version>=": "3.4.0" }
7 |     ]
8 | }
9 | 


--------------------------------------------------------------------------------