├── .clang-format ├── .github ├── FUNDING.yml └── workflows │ └── ci.yml ├── .gitignore ├── .mdl_style.rb ├── .mdlrc ├── CMakeLists.txt ├── FindCRoaring.cmake ├── LICENSE ├── Makefile.am ├── NEWS ├── NOTICE ├── README.md ├── _config.yml ├── build-aux ├── autogen.sh └── git-version-gen ├── c11-all.h ├── cmakeconfig.h.in ├── config.h.in ├── configure.ac ├── confus.h ├── confus_croar.h ├── doc ├── D2528R1.html ├── D2528R1.md ├── D2528R1.pdf ├── P2528R0.html ├── P2528R0.md ├── P2528R0.pdf ├── c11.md ├── n2916.html ├── n2916.md ├── n2916.pdf ├── n2932.html ├── n2932.md ├── n2932.pdf ├── nXXXX.html ├── nXXXX.md ├── nXXXX.patch ├── nXXXX.pdf └── tr31-bugs.md ├── example.c ├── gconfus.h ├── gconfus.h.in ├── hangul.h ├── htable.c ├── htable.h ├── include └── u8ident.h ├── libu8ident.pc.in ├── m4 └── ax_gcc_builtin.m4 ├── makefile.gnu ├── mark.h ├── medial.h ├── mingw-cross.cmake ├── mkc26.c ├── mkconfus.pl ├── mkgc.pl ├── mkmark.pl ├── mkmedial.pl ├── mkroar.c ├── mkscripts.pl ├── mktest-norm.pl ├── perf.c ├── scripts.h ├── scripts16.h ├── test-all-fast.sh ├── test-all.sh ├── test-texts.c ├── test-texts.test.in ├── test.c ├── texts ├── amharic-1.txt ├── arabic-1.c ├── arabic-1.txt ├── armenian-1.txt ├── bengali-1.txt ├── bidi-sec-1-allowed.tst ├── bidi-sec-1-c11.tst ├── bidi-sec-1.c ├── bidi-sec-1.py ├── bidi-sec-1.tst ├── bidi-sec-2-allowed.tst ├── bidi-sec-2-ascii.tst ├── bidi-sec-2-c11.tst ├── bidi-sec-2.c ├── bidi-sec-2.tst ├── bidi-sec-3.c ├── bidi-sec-4.cc ├── bopomofo-1.txt ├── bulgarian-1.txt ├── chinese-s-1.txt ├── chinese-trad-1.txt ├── cyrillic-1.txt ├── english-1.txt ├── farsi-1.txt ├── georgian-1.txt ├── greek-1.txt ├── gujarati-1.txt ├── hebrew-1.txt ├── hindi-1.txt ├── homo-1-p4.tst ├── homo-1.c ├── homo-1.tst ├── homo-sec-1-p1.tst ├── homo-sec-1.c ├── homo-sec-1.js ├── homo-sec-1.py ├── homo-sec-1.tst ├── homo-sec-2.c ├── hungarian-1.txt ├── igbo-1.txt ├── japanese-1.txt ├── japanese-2.txt ├── kannada-1.txt ├── khmer-1.txt ├── korean-1.txt ├── korean-2.txt 
├── lao-1.txt ├── latin-1.txt ├── malayalam-1.txt ├── marathi-1.txt ├── math-1.c ├── math-2.c ├── math-2.cc ├── myanmar-1.txt ├── nfc-1.c ├── nfkc-1.c ├── norm-sec-1.cperl ├── norm-sec-1.py ├── norm-sec-1.raku ├── odia-1.txt ├── punjabi-1.txt ├── result.lst ├── scots-gaelic-1.txt ├── sinhala-1.txt ├── swedish-1.txt ├── tamil-1.txt ├── telugu-1.txt ├── thai-1.txt ├── turkish-1.txt ├── urdu-1.txt └── vietnamese-1.txt ├── u8id_gc.h ├── u8id_private.h ├── u8ident.c ├── u8idlint.1 ├── u8idlint.c ├── u8idlint.test ├── u8idnorm.c ├── u8idroar.c ├── u8idroar.h ├── u8idscr.c ├── u8idscr.h ├── un8ifcan.h ├── un8ifcmb.h ├── un8ifcmp.h ├── un8ifcpt.h ├── un8ifexc.h ├── unic11.h └── unic26.h /.clang-format: -------------------------------------------------------------------------------- 1 | --- 2 | Language: Cpp 3 | BasedOnStyle: LLVM 4 | IndentPPDirectives: AfterHash 5 | SortIncludes: false 6 | SortUsingDeclarations: false 7 | ... 8 | 9 | -------------------------------------------------------------------------------- /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | # These are supported funding model platforms 2 | 3 | github: rurban 4 | patreon: rurban 5 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: Github CI 2 | on: [push, pull_request] 3 | 4 | #strategy: 5 | # matrix: 6 | # os: [ubuntu-14.04, ubuntu-18.04, ubuntu-20.04, ubuntu-latest] 7 | jobs: 8 | linux: 9 | runs-on: ubuntu-20.04 10 | timeout-minutes: 10 11 | strategy: 12 | fail-fast: false 13 | matrix: 14 | CFLAGS: 15 | - 16 | - -march=native -Wall -Wextra -O2 17 | - -march=native -O3 -flto 18 | - -march=native -O2 -fsanitize=address,undefined -fno-omit-frame-pointer 19 | - -m32 20 | CC: 21 | - gcc 22 | - clang 23 | steps: 24 | - uses: actions/checkout@v3 25 | with: 26 | fetch-depth: 1 27 | #- run: sudo apt-get 
install perl groff 28 | - run: autoreconf -fi 29 | - run: ./configure --enable-confus 30 | - run: make V=1 31 | - run: make V=1 check || (cat test-suite.log; false) 32 | - run: make V=1 check-all 33 | - if: matrix.CFLAGS == '' && matrix.CC == 'gcc' 34 | run: make distcheck 35 | - if: contains(matrix.CFLAGS, '-Wall -Wextra -O2') && matrix.CC == 'clang' 36 | run: make check-all-combinations 37 | - name: Prep-Release 38 | if: matrix.CFLAGS == '' && matrix.CC == 'gcc' && startsWith(github.ref, 'refs/tags/') 39 | run: make dist pkg && echo ${{ github.sha }} > Release.txt; sha256sum libu8ident-*-x86_64-pc-linux-gnu.tar.gz > linux.sha256 40 | - name: Release 41 | continue-on-error: true 42 | uses: softprops/action-gh-release@v1 43 | if: matrix.CFLAGS == '' && matrix.CC == 'gcc' && startsWith(github.ref, 'refs/tags/') 44 | with: 45 | files: | 46 | Release.txt 47 | linux.sha256 48 | libu8ident-*-x86_64-pc-linux-gnu.tar.gz 49 | libu8ident-*.tar.gz 50 | libu8ident-*.tar.xz 51 | macOS: 52 | name: macOS 53 | runs-on: macOS-latest 54 | steps: 55 | - name: checkout 56 | uses: actions/checkout@v3 57 | - run: brew install autoconf automake libtool m4 58 | - run: autoreconf -fi 59 | - run: ./configure --enable-confus 60 | - run: make 61 | - run: make check 62 | - run: make check-all 63 | mingw: 64 | name: mingw 65 | runs-on: windows-latest 66 | env: 67 | MSYS2_DIR: msys64 68 | MSYS2_ARCH: x86_64 69 | MSYSTEM: MINGW64 70 | ARCH: win64 71 | PLATFORM: x64 72 | #PATH: "C:\%MSYS2_DIR%\%MSYSTEM%\bin;C:\%MSYS2_DIR%\usr\bin;%PATH%" 73 | steps: 74 | # see https://github.com/msys2/setup-msys2 75 | - name: setup-msys2 76 | uses: msys2/setup-msys2@v2 77 | with: 78 | path-type: minimal 79 | update: true 80 | install: >- 81 | base-devel 82 | coreutils 83 | zip 84 | mingw-w64-x86_64-toolchain 85 | mingw-w64-x86_64-libtool 86 | mingw-w64-x86_64-perl 87 | libtool 88 | autoconf-wrapper 89 | automake-wrapper 90 | - run: reg add "HKLM\Software\Microsoft\Windows\Windows Error Reporting" /f /v DontShowUI 
/d 1 91 | - run: git config --global core.autocrlf input 92 | - name: checkout 93 | uses: actions/checkout@v3 94 | - shell: msys2 {0} 95 | #run: autoreconf -fi 96 | run: libtoolize -f && aclocal -I m4 && autoconf && automake --add-missing; autoreconf -fi 97 | continue-on-error: true 98 | # msys2 is broken for today 99 | - shell: msys2 {0} 100 | run: ./configure --enable-confus 101 | continue-on-error: true 102 | - shell: msys2 {0} 103 | run: make 104 | continue-on-error: true 105 | - shell: msys2 {0} 106 | run: make check 107 | continue-on-error: true 108 | - shell: msys2 {0} 109 | run: make pkg; sha256sum libu8ident-*-x86_64-w64-mingw32.zip > mingw.sha256 110 | continue-on-error: true 111 | - name: Release 112 | uses: softprops/action-gh-release@v1 113 | if: startsWith(github.ref, 'refs/tags/') 114 | with: 115 | files: | 116 | mingw.sha256 117 | libu8ident-*-x86_64-w64-mingw32.zip 118 | linux-cmake: 119 | runs-on: ubuntu-latest 120 | timeout-minutes: 10 121 | steps: 122 | - uses: actions/checkout@v3 123 | with: 124 | fetch-depth: 1 125 | - run: cmake . 126 | - run: make 127 | - run: make test 128 | macos-cmake: 129 | runs-on: macOS-latest 130 | timeout-minutes: 10 131 | steps: 132 | - uses: actions/checkout@v3 133 | with: 134 | fetch-depth: 1 135 | - run: cmake . 136 | - run: make 137 | - run: make test 138 | windows-cmake: 139 | runs-on: windows-latest 140 | timeout-minutes: 10 141 | steps: 142 | - uses: actions/checkout@v3 143 | with: 144 | fetch-depth: 1 145 | #- uses: microsoft/setup-msbuild@v1.1 146 | - run: cmake . 147 | - run: cmake --build . --config Release --verbose 148 | - run: cmake --build . --target RUN_TESTS --config Release --verbose 149 | - run: cmake --build . 
--target package --config Release 150 | # currently -G for "Visual Studio 16 2019" 151 | #- run: msbuild libu8ident.sln 152 | #- run: Debug\u8idtest 153 | - name: Release 154 | uses: softprops/action-gh-release@v1 155 | if: startsWith(github.ref, 'refs/tags/') 156 | with: 157 | files: | 158 | libu8ident-*-win64.exe 159 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /.gdbinit 2 | .analysis 3 | .version 4 | GTAGS 5 | GPATH 6 | GRTAGS 7 | TAGS 8 | autom4te.cache 9 | config.h 10 | include/u8ident.h.bak 11 | libu8ident.a 12 | libu8ident.so 13 | libu8ident.so.0 14 | libu8ident.so.1 15 | a.out 16 | u8idlint 17 | u8ident.o 18 | u8idnorm.o 19 | u8idscr.o 20 | u8idroar.o 21 | htable.o 22 | /test 23 | /test-asan 24 | /test-texts 25 | /mkc26 26 | /example 27 | /build 28 | /build-asan 29 | /build-[b-z0-9]*/ 30 | /Scripts.txt 31 | /ScriptExtensions.txt 32 | /PropertyValueAliases.txt 33 | /DerivedCoreProperties*.txt 34 | /DerivedNormalizationProps*.txt 35 | /IdentifierType.txt 36 | /IdentifierStatus.txt 37 | /confusables.txt 38 | /UnicodeData*.txt 39 | Unicode-Normalize 40 | libu8ident-*.tar.gz 41 | libu8ident-*.tar.xz 42 | Makefile.in 43 | TODO 44 | aclocal.m4 45 | build-aux/compile 46 | build-aux/config.guess 47 | build-aux/config.sub 48 | configure 49 | build-aux/depcomp 50 | build-aux/install-sh 51 | build-aux/ltmain.sh 52 | build-aux/missing 53 | build-aux/test-driver 54 | m4/ 55 | roaring.c 56 | roaring.h 57 | u8ident.3 58 | mkroar 59 | perf 60 | gconfus.h 61 | allowed_croar.h 62 | nfc_croar.h 63 | nfd_croar.h 64 | nfkc_croar.h 65 | nfkd_croar.h 66 | allowed_croar.bin 67 | confus_croar.bin 68 | nf*_croar.bin 69 | mark_croar.bin 70 | test-texts.test 71 | texts.tst 72 | doc/my.latex 73 | doc/appendix-h.md 74 | texts/homo-sec-2.cc 75 | homo-sec-1 76 | homo-sec-1.o 77 | Dockerfile.pandoc 78 | doc/P2528R*.pdf 79 | doc/n2932.pdf 80 | 
-------------------------------------------------------------------------------- /.mdl_style.rb: -------------------------------------------------------------------------------- 1 | all 2 | # Multiple top level headers in the same document 3 | exclude_rule 'MD025' 4 | 5 | #rule 'MD004', 'ul-style' => :sublist 6 | exclude_rule 'MD004' 7 | 8 | rule 'MD029', 'ol-prefix' => :ordered 9 | exclude_rule 'MD029' 10 | 11 | exclude_rule 'MD046' 12 | 13 | # First line in file should be a top level header. 14 | # We set an explicit title for pdf and html 15 | exclude_rule 'MD041' 16 | -------------------------------------------------------------------------------- /.mdlrc: -------------------------------------------------------------------------------- 1 | style '.mdl_style.rb' 2 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 2.9...3.13) # must precede project(); the ...3.13 policy range keeps CMake >= 4.0 from rejecting the old minimum 2 | project(libu8ident C) 3 | # Supported options: 4 | # -DBUILD_SHARED_LIBS=ON,OFF 5 | # -DU8ID_NORM=NFC,NFKC,NFD,NFKD 6 | # -DU8ID_PROFILE=2,3,4,5,6,C26_4,C11_6 7 | # -DU8ID_TR31=ALLOWED,SAFEC26,ID,XID,C11,ALLUTF8,NONE 8 | # -DHAVE_CONFUS=ON 9 | # -DHAVE_CROARING=ON 10 | # for smaller builds and lib. 
11 | 12 | if (EXISTS ".version") 13 | file(READ .version NL_PACKAGE_VERSION) 14 | else() 15 | find_package(Git) 16 | set(PACKAGE_VERSION "") 17 | execute_process(COMMAND ${GIT_EXECUTABLE} describe --long --tags --always 18 | OUTPUT_VARIABLE NL_PACKAGE_VERSION) 19 | endif() 20 | string(STRIP "${NL_PACKAGE_VERSION}" PACKAGE_VERSION) 21 | 22 | include(CheckIncludeFile) 23 | CHECK_INCLUDE_FILE("dlfcn.h" HAVE_DLFCN_H) 24 | CHECK_INCLUDE_FILE("dirent.h" HAVE_DIRENT_H) 25 | CHECK_INCLUDE_FILE("getopt.h" HAVE_GETOPT_H) 26 | CHECK_INCLUDE_FILE("inttypes.h" HAVE_INTTYPES_H) 27 | CHECK_INCLUDE_FILE("malloc.h" HAVE_MALLOC_H) 28 | CHECK_INCLUDE_FILE("memory.h" HAVE_MEMORY_H) 29 | CHECK_INCLUDE_FILE("stdbool.h" HAVE_STDBOOL_H) 30 | CHECK_INCLUDE_FILE("stddef.h" HAVE_STDDEF_H) 31 | CHECK_INCLUDE_FILE("stdint.h" HAVE_STDINT_H) 32 | CHECK_INCLUDE_FILE("stdlib.h" HAVE_STDLIB_H) 33 | CHECK_INCLUDE_FILE("string.h" HAVE_STRING_H) 34 | CHECK_INCLUDE_FILE("strings.h" HAVE_STRINGS_H) 35 | CHECK_INCLUDE_FILE("sys/stat.h" HAVE_SYS_STAT_H) 36 | CHECK_INCLUDE_FILE("sys/types.h" HAVE_SYS_TYPES_H) 37 | include(CheckFunctionExists) 38 | check_function_exists(getopt_long HAVE_GETOPT_LONG) 39 | include(CheckCSourceCompiles) 40 | # Returns one plus the index of the least significant 1-bit of x, or if x is zero, returns zero. 
41 | check_c_source_compiles(" 42 | int main() { (void)__builtin_ffs(0); return 0; } 43 | " HAVE___BUILTIN_FFS) 44 | #check_function_exists(__builtin_ffs HAVE___BUILTIN_FFS) 45 | #check_function_exists(strcmp HAVE_STRCMP) 46 | #check_function_exists(strcasestr HAVE_STRCASESTR) 47 | include(CheckTypeSize) 48 | check_type_size(size_t SIZE_T) 49 | #check_type_size(wchar_t WCHAR_T) 50 | #include(CheckCCompilerFlag) 51 | configure_file(cmakeconfig.h.in config.h) 52 | if(CMAKE_MAJOR_VERSION GREATER 2) 53 | include(TestBigEndian) 54 | test_big_endian(IS_BIG_ENDIAN) 55 | if(IS_BIG_ENDIAN) # warn but keep configuring, with -DBIG_ENDIAN defined 56 | message(WARNING "Big Endian not supported") # ERROR is not a message() mode and was printed as plain text 57 | add_definitions(-DBIG_ENDIAN) 58 | endif() 59 | endif() 60 | add_definitions(-DHAVE_CONFIG_H) 61 | 62 | if(MSVC) 63 | # Disable some overly strict MSVC warnings. 64 | add_compile_options(-wd4244 -wd4800 -wd4805 -D_CRT_SECURE_NO_WARNINGS) 65 | endif() 66 | if(MINGW) # WINDOWS AND CMAKE_COMPILER_IS_GNUCC 67 | add_compile_options(-fstack-protector) 68 | add_link_options(-fstack-protector) 69 | endif() 70 | 71 | option(BUILD_SHARED_LIBS "shared libu8ident library" ON) 72 | option(U8ID_NORM "force a single normalization: NFC,NFKC,NFD,NFKD,FCC,FCD" OFF) 73 | option(U8ID_PROFILE "force a single profile: 2-6,C26_4,C11_6" OFF) 74 | option(U8ID_TR31 "force TR31 charset: ALLOWED,SAFEC26,ID,XID,C11,ALLUTF8,NONE" OFF) 75 | option(HAVE_CONFUS "add confusables API" OFF) 76 | option(HAVE_CROARING "use CRoaring bitsets" ON) 77 | 78 | if (HAVE_CONFUS) 79 | if (HAVE_CROARING) 80 | set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}") 81 | find_package(CRoaring) 82 | if (CRoaring_FOUND) 83 | add_definitions(-DHAVE_CROARING) 84 | add_definitions(-DCROARING_PATH="${CROARING_LIBRARY}") 85 | else() 86 | unset(HAVE_CROARING) 87 | endif() 88 | endif() 89 | endif() 90 | 91 | if(NOT CMAKE_BUILD_TYPE) 92 | set(CMAKE_BUILD_TYPE "Release" 93 | CACHE STRING "Choose the type of build, options are: Debug Release 94 | RelWithDebInfo MinSizeRel 
Asan." FORCE) 95 | endif() 96 | 97 | if((CMAKE_MAJOR_VERSION EQUAL 3 AND CMAKE_MINOR_VERSION GREATER_EQUAL 9) 98 | AND (CMAKE_BUILD_TYPE STREQUAL "Release")) 99 | cmake_policy(SET CMP0069 NEW) 100 | include(CheckIPOSupported) 101 | check_ipo_supported(RESULT ipo_supported OUTPUT error) 102 | endif() 103 | if(ipo_supported) 104 | message(STATUS "IPO / LTO enabled") 105 | set_property(GLOBAL PROPERTY INTERPROCEDURAL_OPTIMIZATION True) 106 | add_definitions(-DLTO) 107 | else() 108 | message(STATUS "IPO / LTO not supported: <${error}>") 109 | endif() 110 | 111 | set(libu8ident_DOCS 112 | README.md doc/c11.md doc/n2916.md doc/P2528R0.md doc/P2528R1.md 113 | doc/tr31-bugs.md doc/n2916.patch 114 | NOTICE LICENSE) 115 | 116 | set(libu8ident_HEADERS 117 | include/u8ident.h) 118 | 119 | set(libu8ident_SOURCES 120 | u8ident.c 121 | u8idnorm.c 122 | u8idscr.c 123 | u8idroar.c) 124 | 125 | if (CRoaring_FOUND) 126 | set(roaring_HEADERS 127 | confus_croar.h) 128 | endif() 129 | 130 | set(private_HEADERS 131 | scripts.h 132 | config.h.in 133 | u8idscr.h 134 | u8idroar.h 135 | u8id_private.h 136 | un8ifcan.h un8ifcmb.h un8ifcmp.h un8ifcpt.h un8ifexc.h hangul.h 137 | mark.h unic11.h confus.h ${roaring_HEADERS}) 138 | 139 | add_library(u8ident_static STATIC 140 | ${libu8ident_HEADERS} 141 | ${libu8ident_SOURCES} 142 | ${private_HEADERS}) 143 | add_library(u8ident SHARED 144 | ${libu8ident_HEADERS} 145 | ${libu8ident_SOURCES} 146 | ${private_HEADERS}) 147 | if(MSVC) 148 | if (BUILD_SHARED_LIBS) 149 | # FIXME cmake bug: 150 | # need full path. u8idlint only gets Release\u8ident.lib, but is in a seperate path. 151 | # This is ignored because the target is built, not imported. 
152 | #set_target_properties(u8ident PROPERTIES 153 | # IMPORTED_IMPLIB_DEBUG "${CMAKE_CURRENT_BINARY_DIR}\\Debug\\u8ident.lib") 154 | #set_target_properties(u8ident PROPERTIES 155 | # IMPORTED_IMPLIB_RELEASE "${CMAKE_CURRENT_BINARY_DIR}\\Release\\u8ident.lib") 156 | # this is the default already 157 | #if (CMAKE_BUILD_TYPE STREQUAL "Debug") 158 | #set(LIBPATH "${CMAKE_CURRENT_BINARY_DIR}/Debug") 159 | #set_target_properties(u8ident PROPERTIES 160 | # LIBRARY_OUTPUT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/Debug") 161 | #else() 162 | #set(LIBPATH "${CMAKE_CURRENT_BINARY_DIR}/Release") 163 | #set_target_properties(u8ident PROPERTIES 164 | # LIBRARY_OUTPUT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/Release") 165 | #endif() 166 | endif(BUILD_SHARED_LIBS) 167 | endif(MSVC) 168 | #set_target_properties(${libu8ident_HEADERS} PROPERTIES PUBLIC_HEADER) 169 | set_target_properties(u8ident PROPERTIES PUBLIC_HEADER include/u8ident.h) 170 | if (U8ID_NORM) 171 | set_target_properties(u8ident u8ident_static PROPERTIES 172 | COMPILE_DEFINITIONS U8ID_NORM=${U8ID_NORM}) 173 | endif() 174 | if (U8ID_PROFILE) 175 | get_target_property(DEF u8ident COMPILE_DEFINITIONS) 176 | if (DEF) 177 | set_target_properties(u8ident u8ident_static PROPERTIES 178 | COMPILE_DEFINITIONS "${DEF};U8ID_PROFILE=${U8ID_PROFILE}") 179 | else() 180 | set_target_properties(u8ident u8ident_static PROPERTIES 181 | COMPILE_DEFINITIONS U8ID_PROFILE=${U8ID_PROFILE}) 182 | endif() 183 | endif() 184 | if (U8ID_TR31) 185 | get_target_property(DEF u8ident COMPILE_DEFINITIONS) 186 | if (DEF) 187 | set_target_properties(u8ident u8ident_static PROPERTIES 188 | COMPILE_DEFINITIONS "${DEF};U8ID_TR31=${U8ID_TR31}") 189 | else() 190 | set_target_properties(u8ident u8ident_static PROPERTIES 191 | COMPILE_DEFINITIONS U8ID_TR31=${U8ID_TR31}) 192 | endif() 193 | endif() 194 | 195 | target_include_directories(u8ident PRIVATE 196 | ${CMAKE_CURRENT_SOURCE_DIR} 197 | ${CMAKE_CURRENT_BINARY_DIR}) 198 | target_include_directories(u8ident 
PUBLIC 199 | ${CMAKE_CURRENT_SOURCE_DIR}/include) 200 | 201 | if (CRoaring_FOUND) 202 | add_executable(perf perf.c u8idroar.c) 203 | set_source_files_properties(perf.c 204 | PROPERTIES COMPILE_FLAGS "-DPERF_TEST") 205 | endif() 206 | 207 | add_executable(u8idlint u8idlint.c ${libu8ident_SOURCES}) 208 | # cmake MSVC bug: u8idlint only gets Release\u8ident.lib, but is in a subdir 209 | # target_link_directories(u8idlint PRIVATE ${CMAKE_CURRENT_BINARY_DIR}) 210 | # target_link_libraries(u8idlint ${CMAKE_CURRENT_BINARY_DIR}/u8ident.lib) 211 | include_directories(BEFORE 212 | ${CMAKE_CURRENT_SOURCE_DIR}/include 213 | ${CMAKE_CURRENT_SOURCE_DIR} 214 | ${CMAKE_CURRENT_BINARY_DIR}) 215 | 216 | if(NOT CMAKE_CROSSCOMPILING) 217 | enable_testing() 218 | add_executable(u8idtest test.c ${libu8ident_SOURCES}) 219 | add_test(test ./u8idtest) 220 | if (NOT MSVC) 221 | add_test(u8idlint.test "${CMAKE_CURRENT_SOURCE_DIR}/u8idlint.test") 222 | endif() 223 | add_executable(test-texts test-texts.c ${libu8ident_SOURCES}) 224 | add_test(test-texts ./test-texts) 225 | set_tests_properties(test-texts PROPERTIES ENVIRONMENT "U8IDTEST_TEXTS=${CMAKE_CURRENT_SOURCE_DIR}/texts") # set(ENV{...}) only changed the configure-time process, never the test run 226 | add_executable(mkc26 mkc26.c ${libu8ident_SOURCES}) 227 | set_source_files_properties(mkc26 228 | PROPERTIES COMPILE_FLAGS -DU8ID_PROFILE_SAFEC26) 229 | add_test(mkc26 ./mkc26) 230 | endif(NOT CMAKE_CROSSCOMPILING) 231 | 232 | add_custom_target( 233 | clang-format 234 | COMMAND clang-format -i 235 | ${CMAKE_CURRENT_SOURCE_DIR}/*.c 236 | ${CMAKE_CURRENT_SOURCE_DIR}/include/*.h 237 | ${CMAKE_CURRENT_SOURCE_DIR}/scripts.h 238 | ${CMAKE_CURRENT_SOURCE_DIR}/confus.h 239 | ${CMAKE_CURRENT_SOURCE_DIR}/u8id*.h) 240 | 241 | add_custom_target( 242 | regen-scripts 243 | COMMAND wget -N https://www.unicode.org/Public/UNIDATA/Scripts.txt 244 | COMMAND wget -N https://www.unicode.org/Public/UNIDATA/ScriptExtensions.txt 245 | COMMAND wget -N https://www.unicode.org/Public/UNIDATA/PropertyValueAliases.txt 246 | COMMAND wget -N 
https://www.unicode.org/Public/UNIDATA/DerivedCoreProperties.txt 247 | COMMAND wget -N https://www.unicode.org/Public/UNIDATA/DerivedNormalizationProps.txt 248 | COMMAND wget -N https://www.unicode.org/Public/security/latest/IdentifierType.txt 249 | COMMAND wget -N https://www.unicode.org/Public/security/latest/IdentifierStatus.txt 250 | COMMAND perl mkscripts.pl 251 | WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}) 252 | 253 | add_custom_target( 254 | regen-confus 255 | COMMAND wget -N https://www.unicode.org/Public/security/latest/confusables.txt 256 | COMMAND perl mkconfus.pl 257 | #BYPRODUCTS confus.h 258 | WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}) 259 | 260 | # Umbrella target: runs no command itself. Target-level ordering needs 261 | # add_dependencies(); MAIN_DEPENDENCY is only an add_custom_command() keyword 262 | # and was being parsed as the command to run. 263 | add_custom_target(regen-all) 264 | add_dependencies(regen-all regen-scripts regen-confus) 265 | 266 | add_custom_target( 267 | TAGS 268 | COMMAND etags --language=c++ *.c *.h include/*.h 269 | DEPENDS *.c *.h include/*.h 270 | WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}) 271 | 272 | # make install 273 | # make install/local 274 | # make install/strip 275 | if(MSVC) 276 | #untested 277 | install(TARGETS u8ident RUNTIME PUBLIC_HEADER 278 | DESTINATION libu8ident-${PACKAGE_VERSION}) 279 | else() 280 | include(GNUInstallDirs) 281 | endif() 282 | install(TARGETS u8ident 283 | LIBRARY 284 | COMPONENT u8ident) 285 | install(TARGETS RUNTIME) 286 | install(TARGETS u8idlint) 287 | 288 | if(MSVC) 289 | #untested 290 | add_custom_target(dist 291 | COMMAND zip libu8ident-${PACKAGE_VERSION}.zip libu8ident-*.dll 292 | include/u8ident.h ${libu8ident_DOCS} 293 | DEPENDS u8ident) 294 | endif() 295 | 296 | # make package 297 | # make package_source (only after distclean or git clean -dxf) 298 | set(CPACK_PACKAGE_VENDOR "Reini Urban") 299 | set(CPACK_PACKAGE_CONTACT "rurban@cpan.org") 300 | set(CPACK_PACKAGE_DESCRIPTION_SUMMARY "Check unicode security guidelines for identifiers") 301 | #set(CPACK_PACKAGE_VERSION_MAJOR ${PACKAGE_VERSION_MAJOR}) 302 | #set(CPACK_PACKAGE_VERSION_MINOR 
${PACKAGE_VERSION_MINOR}) 303 | #set(CPACK_PACKAGE_VERSION_PATCH ${PACKAGE_VERSION_PATCH}) 304 | set(CPACK_RESOURCE_FILE_LICENSE "${CMAKE_CURRENT_SOURCE_DIR}/LICENSE") 305 | set(CPACK_RESOURCE_FILE_README "${CMAKE_CURRENT_SOURCE_DIR}/README.md") 306 | 307 | set(CPACK_RPM_PACKAGE_LICENSE "${CMAKE_CURRENT_SOURCE_DIR}/LICENSE") 308 | set(CPACK_SOURCE_GENERATOR "TGZ;ZIP") 309 | include(CPack) 310 | -------------------------------------------------------------------------------- /FindCRoaring.cmake: -------------------------------------------------------------------------------- 1 | # - Try to find CRoaring 2 | # Once done this will define 3 | # CROARING_FOUND - System has roaring.c 4 | # CROARING_INCLUDE_DIRS - The roaring.h include directories 5 | # CROARING_DEFINITIONS - Compiler switches required for using roaring.c 6 | 7 | find_package(PkgConfig) 8 | pkg_check_modules(PC_CROARING QUIET roaring) 9 | set(CROARING_DEFINITIONS ${PC_CROARING_CFLAGS}) 10 | 11 | find_path(CROARING_INCLUDE_DIR roaring.h 12 | HINTS ${PC_CROARING_INCLUDEDIR} ${PC_CROARING_INCLUDE_DIRS} 13 | PATH_SUFFIXES CRoaring ) 14 | 15 | find_library(CROARING_LIBRARY NAMES CRoaring libroaring 16 | HINTS ${PC_CROARING_LIBDIR} ${PC_CROARING_LIBRARY_DIRS} ) 17 | 18 | include(FindPackageHandleStandardArgs) 19 | # handle the QUIETLY and REQUIRED arguments and set CROARING_FOUND to TRUE 20 | # if all listed variables are TRUE 21 | find_package_handle_standard_args(CRoaring DEFAULT_MSG 22 | CROARING_LIBRARY CROARING_INCLUDE_DIR) 23 | 24 | mark_as_advanced(CROARING_INCLUDE_DIR CROARING_LIBRARY ) 25 | 26 | set(CROARING_LIBRARIES ${CROARING_LIBRARY} ) 27 | set(CROARING_INCLUDE_DIRS ${CROARING_INCLUDE_DIR} ) 28 | -------------------------------------------------------------------------------- /NEWS: -------------------------------------------------------------------------------- 1 | libu8ident NEWS -- history of user-visible changes. 
-*-indented-text-*- 2 | Copyright (C) 2022 Reini Urban 3 | 4 | libu8ident 0.3 - 2022-12-03 5 | 6 | Fixed a bug with excluded scripts which are only now addable. 7 | Change is_greek_latin_confus violations with C26_4 to return ERR_CONFUS, 8 | not ERR_SCRIPTS (GH #12). 9 | Add more combining marks checks (GH #7): TR39#5.5 "Forbid sequences of base character 10 | + nonspacing mark that look the same as or confusingly similar to the base character 11 | alone". Also forbid non-spacing marks with base chars already including the 12 | non-spacing mark, like Ä with DIAERESIS. 13 | Update to Unicode version 15.0.0: 14 | - 11 new mark ranges. 15 | - 2 new Excluded scripts: Kawi, Nag Mundari. 16 | - The zero width joiner (ZWJ) and zero width non-joiner (ZWNJ) changed to IdStatus 17 | Restricted. 18 | - More id and xid chars 19 | 20 | libu8ident 0.2 - 2022-02-12: 21 | 22 | Bump SOLIB MAJOR to 1 23 | The C++23 deadline is gone, rename all c23 to c26. Esp. command-line 24 | and configure options. 25 | Fixed SAFEC26 to exclude confusable Technical, and to skip a lot of 26 | not Allowed ID_Start ranges. 27 | 28 | libu8ident 0.1 - 2022-01-24: 29 | 30 | First release, with the library, u8idlint and lots of C23 support: 31 | the optimized unic23.h, and the 2 technical papers for C23++ and C23 32 | to use TR39. 33 | -------------------------------------------------------------------------------- /NOTICE: -------------------------------------------------------------------------------- 1 | libu8ident is 2 | Copyright © 2017, 2021 Reini Urban. 3 | 4 | The normalization headers were created by perl-Unicode-Normalize. 5 | Copyright © 2001-2012 SADAHIRO Tomoyuki. Japan. 6 | Copyright © 2017 Reini Urban. 7 | 8 | Parts of the normalization code were written for cperl (u8) and the safeclib (wchar). 9 | Copyright © 2015-2016 cPanel Inc. 10 | Copyright © 2017, 2021 Reini Urban. 11 | 12 | The Unicode Database is 13 | Copyright © 2021 Unicode®, Inc. 
14 | Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries. 15 | For terms of use, see http://www.unicode.org/terms_of_use.html 16 | -------------------------------------------------------------------------------- /_config.yml: -------------------------------------------------------------------------------- 1 | theme: jekyll-theme-minimal -------------------------------------------------------------------------------- /build-aux/autogen.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | libtoolize 3 | autoreconf -fi 4 | -------------------------------------------------------------------------------- /build-aux/git-version-gen: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | # Print a version string. 3 | scriptversion=2022-01-25.11; # UTC 4 | 5 | # Copyright (C) 2007-2018 Free Software Foundation, Inc. 6 | # 7 | # This program is free software: you can redistribute it and/or modify 8 | # it under the terms of the GNU General Public License as published by 9 | # the Free Software Foundation; either version 3 of the License, or 10 | # (at your option) any later version. 11 | # 12 | # This program is distributed in the hope that it will be useful, 13 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | # GNU General Public License for more details. 16 | # 17 | # You should have received a copy of the GNU General Public License 18 | # along with this program. If not, see . 19 | 20 | # This script is derived from GIT-VERSION-GEN from GIT: https://git-scm.com/. 
21 | # It may be run two ways: 22 | # - from a git repository in which the "git describe" command below 23 | # produces useful output (thus requiring at least one signed tag) 24 | # - from a non-git-repo directory containing a .tarball-version file, which 25 | # presumes this script is invoked like "./git-version-gen .tarball-version". 26 | 27 | # In order to use intra-version strings in your project, you will need two 28 | # separate generated version string files: 29 | # 30 | # .tarball-version - present only in a distribution tarball, and not in 31 | # a checked-out repository. Created with contents that were learned at 32 | # the last time autoconf was run, and used by git-version-gen. Must not 33 | # be present in either $(srcdir) or $(builddir) for git-version-gen to 34 | # give accurate answers during normal development with a checked out tree, 35 | # but must be present in a tarball when there is no version control system. 36 | # Therefore, it cannot be used in any dependencies. GNUmakefile has 37 | # hooks to force a reconfigure at distribution time to get the value 38 | # correct, without penalizing normal development with extra reconfigures. 39 | # 40 | # .version - present in a checked-out repository and in a distribution 41 | # tarball. Usable in dependencies, particularly for files that don't 42 | # want to depend on config.h but do want to track version changes. 43 | # Delete this file prior to any autoconf run where you want to rebuild 44 | # files to pick up a version string change; and leave it stale to 45 | # minimize rebuild time after unrelated changes to configure sources. 46 | # 47 | # As with any generated file in a VC'd directory, you should add 48 | # /.version to .gitignore, so that you don't accidentally commit it. 49 | # .tarball-version is never generated in a VC'd directory, so needn't 50 | # be listed there. 
51 | # 52 | # Use the following line in your configure.ac, so that $(VERSION) will 53 | # automatically be up-to-date each time configure is run (and note that 54 | # since configure.ac no longer includes a version string, Makefile rules 55 | # should not depend on configure.ac for version updates). 56 | # 57 | # AC_INIT([GNU project], 58 | # m4_esyscmd([build-aux/git-version-gen .tarball-version]), 59 | # [bug-project@example]) 60 | # 61 | # Then use the following lines in your Makefile.am, so that .version 62 | # will be present for dependencies, and so that .version and 63 | # .tarball-version will exist in distribution tarballs. 64 | # 65 | # EXTRA_DIST = $(top_srcdir)/.version 66 | # BUILT_SOURCES = $(top_srcdir)/.version 67 | # $(top_srcdir)/.version: 68 | # echo $(VERSION) > $@-t && mv $@-t $@ 69 | # dist-hook: 70 | # echo $(VERSION) > $(distdir)/.tarball-version 71 | 72 | 73 | me=$0 74 | 75 | version="git-version-gen $scriptversion 76 | 77 | Copyright 2011 Free Software Foundation, Inc. 78 | There is NO warranty. You may redistribute this software 79 | under the terms of the GNU General Public License. 80 | For more information about these matters, see the files named COPYING." 81 | 82 | usage="\ 83 | Usage: $me [OPTION]... \$srcdir/.tarball-version [TAG-NORMALIZATION-SED-SCRIPT] 84 | Print a version string. 85 | 86 | Options: 87 | 88 | --prefix PREFIX prefix of git tags (default 'v') 89 | --fallback VERSION 90 | fallback version to use if \"git --version\" fails 91 | 92 | --help display this help and exit 93 | --version output version information and exit 94 | 95 | Running without arguments will suffice in most cases." 96 | 97 | prefix= 98 | fallback= 99 | 100 | while test $# -gt 0; do 101 | case $1 in 102 | --help) echo "$usage"; exit 0;; 103 | --version) echo "$version"; exit 0;; 104 | --prefix) shift; prefix=${1?};; 105 | --fallback) shift; fallback=${1?};; 106 | -*) 107 | echo "$0: Unknown option '$1'." 
>&2 108 | echo "$0: Try '--help' for more information." >&2 109 | exit 1;; 110 | *) 111 | if test "x$tarball_version_file" = x; then 112 | tarball_version_file="$1" 113 | elif test "x$tag_sed_script" = x; then 114 | tag_sed_script="$1" 115 | else 116 | echo "$0: extra non-option argument '$1'." >&2 117 | exit 1 118 | fi;; 119 | esac 120 | shift 121 | done 122 | 123 | if test "x$tarball_version_file" = x; then 124 | echo "$usage" 125 | exit 1 126 | fi 127 | 128 | tag_sed_script="${tag_sed_script:-s/-/_/g}" 129 | 130 | nl=' 131 | ' 132 | 133 | # Avoid meddling by environment variable of the same name. 134 | v= 135 | v_from_git= 136 | 137 | # First see if there is a tarball-only version file. 138 | # then try "git describe", then default. 139 | if test -f $tarball_version_file 140 | then 141 | v=`cat $tarball_version_file` || v= 142 | case $v in 143 | *$nl*) v= ;; # reject multi-line output 144 | [0-9]*) ;; 145 | *) v= ;; 146 | esac 147 | test "x$v" = x \ 148 | && echo "$0: WARNING: $tarball_version_file is missing or damaged" 1>&2 149 | fi 150 | 151 | if test "x$v" != x 152 | then 153 | : # use $v 154 | # Otherwise, if there is at least one git commit involving the working 155 | # directory, and "git describe" output looks sensible, use that to 156 | # derive a version string. 157 | elif test "`git log -1 --pretty=format:x . 2>&1`" = x \ 158 | && v=`git describe --tags --abbrev=4 --match="$prefix*" HEAD 2>/dev/null \ 159 | || git describe --tags --abbrev=4 HEAD 2>/dev/null` \ 160 | && v=`printf '%s\n' "$v" | sed "$tag_sed_script"` \ 161 | && case $v in 162 | $prefix[0-9]*) ;; 163 | *) (exit 1) ;; 164 | esac 165 | then 166 | # Is this a new git that lists number of commits since the last 167 | # tag or the previous older version that did not? 
168 | # Newer: v6.10_77_g0f8faeb 169 | # Older: v6.10_g0f8faeb 170 | vprefix=`expr "X$v" : 'X\(.*\)_g[^_]*$'` || vprefix=$v 171 | case $vprefix in 172 | *_*) : git describe is probably okay three part flavor ;; 173 | *) 174 | : git describe is older two part flavor 175 | # Recreate the number of commits and rewrite such that the 176 | # result is the same as if we were using the newer version 177 | # of git describe. 178 | vtag=`echo "$v" | sed 's/_.*//'` 179 | commit_list=`git rev-list "$vtag"..HEAD 2>/dev/null` \ 180 | || { commit_list=failed; 181 | echo "$0: WARNING: git rev-list failed" 1>&2; } 182 | numcommits=`echo "$commit_list" | wc -l` 183 | v=`echo "$v" | sed "s/\(.*\)_\(.*\)/\1_${numcommits}_\2/"`; 184 | test "$commit_list" = failed && v=UNKNOWN 185 | ;; 186 | esac 187 | 188 | # Change the penultimate "_" to ".", for version-comparing tools. 189 | # Remove the "g" to save a byte. 190 | # The rpm version must not contain a - 191 | v=`echo "$v" | sed 's/_\([^_]*\)_g\([^_]*\)$/.\1_\2/'`; 192 | v_from_git=1 193 | elif test "x$fallback" = x || git --version >/dev/null 2>&1; then 194 | v=UNKNOWN 195 | else 196 | v=$fallback 197 | fi 198 | 199 | v=`echo "$v" |sed "s/^$prefix//"` 200 | 201 | # Test whether to append the "_dirty" suffix only if the version 202 | # string we're using came from git. I.e., skip the test if it's "UNKNOWN" 203 | # or if it came from .tarball-version. 204 | if test "x$v_from_git" != x; then 205 | # Don't declare a version "dirty" merely because a timestamp has changed. 206 | git update-index --refresh > /dev/null 2>&1 207 | 208 | dirty=`exec 2>/dev/null;git diff-index --name-only HEAD` || dirty= 209 | case "$dirty" in 210 | '') ;; 211 | *) # Append the suffix only if there isn't one already. 212 | case $v in 213 | *_dirty) ;; 214 | *) v="${v}_dirty" ;; 215 | esac ;; 216 | esac 217 | fi 218 | 219 | # Omit the trailing newline, so that m4_esyscmd can use the result directly.
220 | printf %s "$v" 221 | 222 | # Local variables: 223 | # eval: (add-hook 'before-save-hook 'time-stamp) 224 | # time-stamp-start: "scriptversion=" 225 | # time-stamp-format: "%:y-%02m-%02d.%02H" 226 | # time-stamp-time-zone: "UTC0" 227 | # time-stamp-end: "; # UTC" 228 | # End: 229 | -------------------------------------------------------------------------------- /c11-all.h: -------------------------------------------------------------------------------- 1 | /* ex: set ro ft=c: -*- mode: c; buffer-read-only: t -*- */ 2 | /* libu8ident - Check unicode security guidelines for identifiers. 3 | Copyright 2021, 2022 Reini Urban 4 | SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later 5 | 6 | Generated by mkc26 from unic11.h. 7 | UNICODE version 16.0 8 | */ 9 | static const struct range_bool c11_start_list[] = { 10 | {'$', '$'}, {'A', 'Z'}, {'_', '_'}, {'a', 'z'}, 11 | {0x00A8, 0x00A8}, {0x00AA, 0x00AA}, 12 | {0x00AD, 0x00AD}, {0x00AF, 0x00AF}, {0x00B2, 0x00B5}, 13 | {0x00B7, 0x00BA}, {0x00BC, 0x00BE}, {0x00C0, 0x00D6}, 14 | {0x00D8, 0x00F6}, {0x00F8, 0x00FF}, 15 | // {0x0100, 0x02FF}, // Latin, 2B0-2FF: Modifiers (2EA Bopomofo) 16 | {0xFF, 0x2FF}, // Latin ÿ..˿ 17 | {0x370, 0x167F}, // Greek Ͱ..ᙿ 18 | {0x1681, 0x180D}, // Ogham (Excluded) ᚁ..᠍ 19 | {0x180F, 0x1DBF}, // Mongolian (Excluded) ᠏..ᶿ 20 | {0x1E00, 0x1FFF}, // Latin Ḁ..῿ 21 | {0x200B, 0x200D}, // Common ​..‍ 22 | {0x202A, 0x202E}, // Common ‪..‮ 23 | {0x203F, 0x2040}, // Common ‿..⁀ 24 | {0x2054, 0x2054}, // Common ⁔ 25 | {0x2060, 0x20CF}, // Common ⁠..⃏ 26 | {0x2100, 0x218F}, // Common ℀..↏ 27 | {0x2460, 0x24FF}, // Common ①..⓿ 28 | {0x2776, 0x2793}, // Common ❶..➓ 29 | {0x2C00, 0x2DFF}, // Glagolitic (Excluded) Ⰰ..ⷿ 30 | {0x2E80, 0x2FFF}, // Han ⺀..⿿ 31 | {0x3004, 0x3007}, // Common 〄..〇 32 | {0x3021, 0x302F}, // Han 〡..〯 33 | {0x3031, 0xD7FF}, // Common 〱..퟿ 34 | {0xF900, 0xFD3D}, // Han 豈..ﴽ 35 | {0xFD40, 0xFDCF}, // Arabic ﵀..﷏ 36 | {0xFDF0, 0xFE1F}, // Arabic ﷰ..︟ 37 | {0xFE30, 0xFE44}, // Common 
︰..﹄ 38 | {0xFE47, 0xFFFD}, // Common ﹇..� 39 | {0x10000, 0x1FFFD}, // Linear_B (Excluded) 𐀀..🿽 40 | {0x20000, 0x2FFFD}, // Han 𠀀..𯿽 41 | {0x30000, 0x3FFFD}, // Han 𰀀..𿿽 42 | {0x40000, 0x4FFFD}, // (null) (Limited) 񀀀..񏿽 43 | {0x50000, 0x5FFFD}, // (null) (Limited) 񐀀..񟿽 44 | {0x60000, 0x6FFFD}, // (null) (Limited) 񠀀..񯿽 45 | {0x70000, 0x7FFFD}, // (null) (Limited) 񰀀..񿿽 46 | {0x80000, 0x8FFFD}, // (null) (Limited) 򀀀..򏿽 47 | {0x90000, 0x9FFFD}, // (null) (Limited) 򐀀..򟿽 48 | {0xA0000, 0xAFFFD}, // (null) (Limited) 򠀀..򯿽 49 | {0xB0000, 0xBFFFD}, // (null) (Limited) 򰀀..򿿽 50 | {0xC0000, 0xCFFFD}, // (null) (Limited) 󀀀..󏿽 51 | {0xD0000, 0xDFFFD}, // (null) (Limited) 󐀀..󟿽 52 | {0xE0000, 0xEFFFD}, // (null) (Limited) 󠀀..󯿽 53 | }; // 36 ranges, 1 singles, 971267 codepoints 54 | static const struct range_bool c11_cont_list[] = { 55 | {'$', '$'}, 56 | {'0', '9'}, 57 | {0x300, 0x36F}, // Inherited ̀..ͯ 58 | {0x1DC0, 0x1DFF}, // Inherited ᷀..᷿ 59 | {0x20D0, 0x20FF}, // Inherited ⃐..⃿ 60 | {0xFE20, 0xFE2F}, // Inherited ︠..︯ 61 | }; // 4 ranges, 0 singles, 236 codepoints 62 | -------------------------------------------------------------------------------- /cmakeconfig.h.in: -------------------------------------------------------------------------------- 1 | /* config.h. Generated from cmakeconfig.h.in by cmake. */ 2 | 3 | #ifndef __U8ID_CONF_H__ 4 | #define __U8ID_CONF_H__ 5 | 6 | #define HAVE_CONFIG_H 1 7 | /* Defined to 1 on a cmake build */ 8 | #define CMAKE_BUILD 1 9 | 10 | /* Define to force a single normalization. */ 11 | #cmakedefine LIBU8IDENT_NORM 12 | #ifdef LIBU8IDENT_NORM 13 | # define U8ID_NORM @LIBU8IDENT_NORM@ 14 | #else 15 | # undef U8ID_NORM 16 | #endif 17 | 18 | /* Define to force a single profile. */ 19 | #cmakedefine LIBU8IDENT_PROFILE 20 | #ifdef LIBU8IDENT_PROFILE 21 | # define U8ID_PROFILE @LIBU8IDENT_PROFILE@ 22 | #else 23 | # undef U8ID_PROFILE 24 | #endif 25 | 26 | /* Define to force a single check_xid option.
*/ 27 | #cmakedefine ENABLE_CHECK_XID 28 | #cmakedefine DISABLE_CHECK_XID 29 | 30 | /* Define to enable confus.h. */ 31 | #cmakedefine HAVE_CONFUS 32 | 33 | /* Define if a shared library will be built */ 34 | #define ENABLE_SHARED @BUILD_SHARED_LIBS@ 35 | 36 | /* TODO Defined to 1 when the compiler supports c11 */ 37 | #define HAVE_C11 1 38 | 39 | /* Define if you have the header file. */ 40 | #cmakedefine HAVE_DLFCN_H 41 | 42 | /* Define if you have the header file. */ 43 | #cmakedefine HAVE_DIRENT_H 44 | 45 | /* Define if you have the header file. */ 46 | #cmakedefine HAVE_GETOPT_H 47 | 48 | /* Define if you have the `getopt_long' function. */ 49 | #cmakedefine HAVE_GETOPT_LONG 50 | 51 | /* Define if you have the header file. */ 52 | #cmakedefine HAVE_INTTYPES_H 53 | 54 | /* Define if you have the header file. */ 55 | #cmakedefine HAVE_LIBGEN_H 56 | 57 | /* Define if you have the header file. */ 58 | #cmakedefine HAVE_MALLOC_H 59 | 60 | /* Define if you have the header file. */ 61 | #cmakedefine HAVE_MEMORY_H 62 | 63 | /* Define if you have the `memset' function. */ 64 | #cmakedefine HAVE_MEMSET 65 | 66 | /* Define if stdbool.h conforms to C99. */ 67 | #cmakedefine HAVE_STDBOOL_H 68 | 69 | /* Define if you have the header file. */ 70 | #cmakedefine HAVE_STDDEF_H 71 | 72 | /* Define if you have the header file. */ 73 | #cmakedefine HAVE_STDINT_H 74 | 75 | /* Define if you have the header file. */ 76 | #cmakedefine HAVE_STDLIB_H 77 | 78 | /* Define if you have the header file. */ 79 | #cmakedefine HAVE_STRINGS_H 80 | 81 | /* Define if you have the header file. */ 82 | #cmakedefine HAVE_STRING_H 83 | 84 | /* Define if you have the header file. */ 85 | #cmakedefine HAVE_SYS_STAT_H 86 | 87 | /* Define if you have the header file. */ 88 | #cmakedefine HAVE_SYS_TYPES_H 89 | 90 | /* Define if you have the header file. */ 91 | #cmakedefine HAVE_UNISTD_H 92 | 93 | /* TODO Define if the system has the type `_Bool'. 
*/ 94 | #define HAVE__BOOL 1 95 | 96 | /* Define if the system has the `__builtin_ffs' built-in function */ 97 | #cmakedefine HAVE___BUILTIN_FFS 98 | 99 | /* The size of `size_t', as computed by sizeof. */ 100 | #define SIZEOF_SIZE_T ${SIZE_T} 101 | 102 | /* Define to the full name of this package. */ 103 | #define PACKAGE_NAME "libu8ident" 104 | 105 | /* Define to the full name and version of this package. */ 106 | #define PACKAGE_STRING "libu8ident ${PACKAGE_VERSION}" 107 | 108 | /* Define to the one symbol short name of this package. */ 109 | #define PACKAGE_TARNAME "libu8ident" 110 | 111 | /* Define to the home page for this package. */ 112 | #define PACKAGE_URL "http://github.com/rurban/libu8ident/" 113 | 114 | /* Define to the version of this package. */ 115 | #define PACKAGE_VERSION "${PACKAGE_VERSION}" 116 | 117 | /* Define for Solaris 2.5.1 so the uint32_t typedef from , 118 | , or is not used. If the typedef were allowed, the 119 | #define below would cause a syntax error. */ 120 | /* #undef _UINT32_T */ 121 | 122 | /* Define for Solaris 2.5.1 so the uint64_t typedef from , 123 | , or is not used. If the typedef were allowed, the 124 | #define below would cause a syntax error. */ 125 | /* #undef _UINT64_T */ 126 | 127 | /* Define to `__inline__' or `__inline' if that's what the C compiler 128 | calls it, or to nothing if 'inline' is not supported under any name. */ 129 | #ifndef __cplusplus 130 | /* #undef inline */ 131 | #endif 132 | 133 | /* Define to the type of a signed integer type of width exactly 16 bits if 134 | such a type exists and the standard includes do not define it. */ 135 | /* #undef int16_t */ 136 | 137 | /* Define to the type of a signed integer type of width exactly 32 bits if 138 | such a type exists and the standard includes do not define it. */ 139 | /* #undef int32_t */ 140 | 141 | /* Define to the type of a signed integer type of width exactly 64 bits if 142 | such a type exists and the standard includes do not define it. 
*/ 143 | /* #undef int64_t */ 144 | 145 | /* Define to rpl_malloc if the replacement function should be used. */ 146 | /* #undef malloc */ 147 | 148 | /* Define to rpl_realloc if the replacement function should be used. */ 149 | /* #undef realloc */ 150 | 151 | /* If restrict is broken with this compiler */ 152 | #ifdef __cplusplus 153 | #define restrict 154 | #endif 155 | 156 | /* Define to `unsigned int' if does not define. */ 157 | /* #undef size_t */ 158 | 159 | /* Define to the type of an unsigned integer type of width exactly 16 bits if 160 | such a type exists and the standard includes do not define it. */ 161 | /* #undef uint16_t */ 162 | 163 | /* Define to the type of an unsigned integer type of width exactly 32 bits if 164 | such a type exists and the standard includes do not define it. */ 165 | /* #undef uint32_t */ 166 | 167 | /* Define to the type of an unsigned integer type of width exactly 64 bits if 168 | such a type exists and the standard includes do not define it. */ 169 | /* #undef uint64_t */ 170 | 171 | #endif /* __U8ID_CONF_H__ */ 172 | -------------------------------------------------------------------------------- /config.h.in: -------------------------------------------------------------------------------- 1 | /* config.h.in. Generated from configure.ac by autoheader. */ 2 | 3 | 4 | #ifndef __U8ID_CONF_H__ 5 | #define __U8ID_CONF_H__ 6 | 7 | 8 | /* Defined to the path to the CRoaring library */ 9 | #undef CROARING_PATH 10 | 11 | /* Define to 1 if you have the header file. */ 12 | #undef HAVE_ASSERT_H 13 | 14 | /* Defined to 1 when the compiler supports c11 */ 15 | #undef HAVE_C11 16 | 17 | /* Defined to 1 to add the confusable API */ 18 | #undef HAVE_CONFUS 19 | 20 | /* Defined to 1 when to use the CRoaring library */ 21 | #undef HAVE_CROARING 22 | 23 | /* Define to 1 if you have the header file. */ 24 | #undef HAVE_DIRENT_H 25 | 26 | /* Define to 1 if you have the header file. 
*/ 27 | #undef HAVE_DLFCN_H 28 | 29 | /* Define to 1 if you have the header file. */ 30 | #undef HAVE_GETOPT_H 31 | 32 | /* Define to 1 if you have the `getopt_long' function. */ 33 | #undef HAVE_GETOPT_LONG 34 | 35 | /* Define to 1 if you have the header file. */ 36 | #undef HAVE_INTTYPES_H 37 | 38 | /* Define to 1 if you have the header file. */ 39 | #undef HAVE_LIBGEN_H 40 | 41 | /* Define to 1 if you have the header file. */ 42 | #undef HAVE_MALLOC_H 43 | 44 | /* Define to 1 if stdbool.h conforms to C99. */ 45 | #undef HAVE_STDBOOL_H 46 | 47 | /* Define to 1 if you have the header file. */ 48 | #undef HAVE_STDDEF_H 49 | 50 | /* Define to 1 if you have the header file. */ 51 | #undef HAVE_STDINT_H 52 | 53 | /* Define to 1 if you have the header file. */ 54 | #undef HAVE_STDIO_H 55 | 56 | /* Define to 1 if you have the header file. */ 57 | #undef HAVE_STDLIB_H 58 | 59 | /* Define to 1 if you have the header file. */ 60 | #undef HAVE_STRINGS_H 61 | 62 | /* Define to 1 if you have the header file. */ 63 | #undef HAVE_STRING_H 64 | 65 | /* Define to 1 if you have the header file. */ 66 | #undef HAVE_SYS_STAT_H 67 | 68 | /* Define to 1 if you have the header file. */ 69 | #undef HAVE_SYS_TYPES_H 70 | 71 | /* Define to 1 if you have the `u8_wordbreaks' function. */ 72 | #undef HAVE_U8_WORDBREAKS 73 | 74 | /* Define to 1 if you have the header file. */ 75 | #undef HAVE_UNISTD_H 76 | 77 | /* Define to 1 if you have the header file. */ 78 | #undef HAVE_UNIWBRK_H 79 | 80 | /* Define to 1 if the system has the type `_Bool'. */ 81 | #undef HAVE__BOOL 82 | 83 | /* Define to 1 if the system has the `__builtin_ffs' built-in function */ 84 | #undef HAVE___BUILTIN_FFS 85 | 86 | /* Define to the sub-directory where libtool stores uninstalled libraries. */ 87 | #undef LT_OBJDIR 88 | 89 | /* Define to the address where bug reports for this package should be sent. */ 90 | #undef PACKAGE_BUGREPORT 91 | 92 | /* Define to the full name of this package. 
*/ 93 | #undef PACKAGE_NAME 94 | 95 | /* Define to the full name and version of this package. */ 96 | #undef PACKAGE_STRING 97 | 98 | /* Define to the one symbol short name of this package. */ 99 | #undef PACKAGE_TARNAME 100 | 101 | /* Define to the home page for this package. */ 102 | #undef PACKAGE_URL 103 | 104 | /* Define to the version of this package. */ 105 | #undef PACKAGE_VERSION 106 | 107 | /* The size of `size_t', as computed by sizeof. */ 108 | #undef SIZEOF_SIZE_T 109 | 110 | /* Define to 1 if all of the C90 standard headers exist (not just the ones 111 | required in a freestanding environment). This macro is provided for 112 | backward compatibility; new code need not use it. */ 113 | #undef STDC_HEADERS 114 | 115 | /* Define for Solaris 2.5.1 so the uint32_t typedef from , 116 | , or is not used. If the typedef were allowed, the 117 | #define below would cause a syntax error. */ 118 | #undef _UINT32_T 119 | 120 | /* Define for Solaris 2.5.1 so the uint64_t typedef from , 121 | , or is not used. If the typedef were allowed, the 122 | #define below would cause a syntax error. */ 123 | #undef _UINT64_T 124 | 125 | /* Define for Solaris 2.5.1 so the uint8_t typedef from , 126 | , or is not used. If the typedef were allowed, the 127 | #define below would cause a syntax error. */ 128 | #undef _UINT8_T 129 | 130 | /* Define to empty if `const' does not conform to ANSI C. */ 131 | #undef const 132 | 133 | /* Define to `__inline__' or `__inline' if that's what the C compiler 134 | calls it, or to nothing if 'inline' is not supported under any name. */ 135 | #ifndef __cplusplus 136 | #undef inline 137 | #endif 138 | 139 | /* Define to the type of a signed integer type of width exactly 32 bits if 140 | such a type exists and the standard includes do not define it. */ 141 | #undef int32_t 142 | 143 | /* Define to the equivalent of the C99 'restrict' keyword, or to 144 | nothing if this is not supported. Do not define if restrict is 145 | supported only directly. 
*/ 146 | #undef restrict 147 | /* Work around a bug in older versions of Sun C++, which did not 148 | #define __restrict__ or support _Restrict or __restrict__ 149 | even though the corresponding Sun C compiler ended up with 150 | "#define restrict _Restrict" or "#define restrict __restrict__" 151 | in the previous line. This workaround can be removed once 152 | we assume Oracle Developer Studio 12.5 (2016) or later. */ 153 | #if defined __SUNPRO_CC && !defined __RESTRICT && !defined __restrict__ 154 | # define _Restrict 155 | # define __restrict__ 156 | #endif 157 | 158 | /* Define to `unsigned int' if does not define. */ 159 | #undef size_t 160 | 161 | /* Define to the type of an unsigned integer type of width exactly 16 bits if 162 | such a type exists and the standard includes do not define it. */ 163 | #undef uint16_t 164 | 165 | /* Define to the type of an unsigned integer type of width exactly 32 bits if 166 | such a type exists and the standard includes do not define it. */ 167 | #undef uint32_t 168 | 169 | /* Define to the type of an unsigned integer type of width exactly 64 bits if 170 | such a type exists and the standard includes do not define it. */ 171 | #undef uint64_t 172 | 173 | /* Define to the type of an unsigned integer type of width exactly 8 bits if 174 | such a type exists and the standard includes do not define it. 
*/ 175 | #undef uint8_t 176 | 177 | 178 | #endif /* __U8ID_CONF_H__ */ 179 | 180 | -------------------------------------------------------------------------------- /configure.ac: -------------------------------------------------------------------------------- 1 | # configure.ac - Process with autoconf to produce a configure script 2 | # (C) 2021-2022 Reini Urban 3 | # See LICENSE 4 | AC_PREREQ([2.69]) 5 | AC_INIT([libu8ident], 6 | m4_esyscmd([build-aux/git-version-gen .version]), 7 | [https://github.com/rurban/libu8ident/issues],, 8 | [http://github.com/rurban/libu8ident/]) 9 | AC_CONFIG_AUX_DIR([build-aux]) 10 | AC_CONFIG_SRCDIR([u8ident.c]) 11 | AC_CONFIG_MACRO_DIR([m4]) 12 | AC_SUBST([LIBU8IDENT_SO_VERSION], [1:0:0]) 13 | VERSION_FILE=".version" 14 | AC_SUBST([VERSION_FILE]) 15 | AC_MSG_CHECKING([git version]) 16 | AC_MSG_RESULT($PACKAGE_VERSION) 17 | AC_MSG_CHECKING([so version-info]) 18 | AC_MSG_RESULT($LIBU8IDENT_SO_VERSION) 19 | 20 | AM_INIT_AUTOMAKE([1.10 no-define foreign dist-xz]) 21 | m4_ifdef([AM_SILENT_RULES], [AM_SILENT_RULES([yes])]) 22 | #dnl m4_pattern_allow 23 | LT_INIT 24 | AC_CHECK_TOOLS([AR], [gcc-ar ar]) 25 | AC_CHECK_TOOLS([RANLIB], [gcc-ranlib ranlib]) 26 | AC_CHECK_TOOLS([NM], [gcc-nm nm]) 27 | #dnl Initialize pkg-config since we are installing a .pc file 28 | PKG_INSTALLDIR 29 | AM_CPPFLAGS="-I\$(top_srcdir)/include" 30 | 31 | AC_MSG_NOTICE([ 32 | 33 | Check options]) 34 | # =============================================== 35 | # hardcode certain options, to creater smaller and faster binaries 36 | 37 | # TR31 charset profile for identifiers 38 | # Recommended: --with-tr31=allowed or none 39 | AC_MSG_CHECKING([for --with-tr31=]) 40 | AC_ARG_WITH([tr31], 41 | [AS_HELP_STRING([--with-tr31={ALLOWED,SAFEC26,ID,XID,C11,C23,ALLUTF8,NONE}], 42 | [support only this TR31 identifier profile. 
@<:@default=xid@:>@ 43 | Recommended: --with-tr31=allowed])], 44 | [case "${with_tr31}" in 45 | ALLOWED|SAFEC26|ID|XID|C11|C23|ALLUTF8) U8ID_TR31=${with_tr31}; AC_MSG_RESULT([$with_tr31]) ;; 46 | NONE) CFLAGS="$CFLAGS -DDISABLE_U8ID_TR31"; AC_MSG_RESULT([$with_tr31]) ;; 47 | *) AC_MSG_ERROR([invalid ${with_tr31}: only ALLOWED,SAFEC26,ID,XID,C11,C23,ALLUTF8,NONE]) ;; 48 | esac], 49 | [ U8ID_TR31=; AC_MSG_RESULT([none. support all (default). Recommended: ALLOWED]) ] 50 | ) 51 | if test -n "$U8ID_TR31"; then 52 | CFLAGS="$CFLAGS -DU8ID_TR31=$U8ID_TR31" 53 | dnl AC_DEFINE([U8IDP_TR31],[$U8ID_TR31],[Define to ALLOWED,SAFEC26,ID,XID,C11,C23,ALLUTF8 to support only this TR31 identifier charset]) 54 | fi 55 | AC_SUBST([U8ID_TR31]) 56 | 57 | # TR39 Mixed-script security profile for identifiers 58 | # Recommended: --with-profile=4 59 | AC_MSG_CHECKING([for --with-profile=]) 60 | AC_ARG_WITH([profile], 61 | [AS_HELP_STRING([--with-profile={2,3,4,5,6,C26_4,C11_6}], 62 | [support only this TR39 Unicode security profile. @<:@default=4@:>@ 63 | Recommended: --with-profile=4])], 64 | [case "${with_profile}" in 65 | 2|3|4|5|6|C26_4|C11_6) U8ID_PROFILE=${with_profile}; AC_MSG_RESULT([$with_profile]) ;; 66 | *) AC_MSG_ERROR([invalid ${with_profile}: only 2,3,4,5,6,C26_4,C11_6]) ;; 67 | esac], 68 | [ U8ID_PROFILE=; AC_MSG_RESULT([none. support all (default). Recommended: 4]) ] 69 | ) 70 | if test -n "$U8ID_PROFILE"; then 71 | CFLAGS="$CFLAGS -DU8ID_PROFILE=$U8ID_PROFILE" 72 | dnl AC_DEFINE([U8ID_PROFILE],[$U8ID_PROFILE],[Define to 2-6,C26_4,C11_6 to support only this Unicode security profile]) 73 | fi 74 | AC_SUBST([U8ID_PROFILE]) 75 | 76 | # Recommended: --with-norm=NFC 77 | AC_MSG_CHECKING([for --with-norm=]) 78 | AC_ARG_WITH([norm], 79 | [AS_HELP_STRING([--with-norm=NF{K,}{C,D}], 80 | [support only this normalization method. 
@<:@default=@:>@ 81 | Recommended: --with-norm=NFC])], 82 | [case "${with_norm}" in 83 | NFKD|NFKC|NFC|NFD) U8ID_NORM=${with_norm}; AC_MSG_RESULT([$with_norm]) ;; 84 | FCC|FCD) U8ID_NORM=${with_norm}; AC_MSG_RESULT([$with_norm]) ;; 85 | *) AC_MSG_ERROR([invalid ${with_norm}: only NFC,NFKC,NFD,NFKD,FCC,FCD]) ;; 86 | esac], 87 | [ U8ID_NORM=; AC_MSG_RESULT([none. support all (default). Recommended: NFC]) ] 88 | ) 89 | if test -n "$U8ID_NORM"; then 90 | if test x$U8ID_NORM != xNFC && test x$U8ID_PROFILE = xC26_4 -o x$U8ID_TR31 = xSAFEC26 -o x$U8ID_TR31 = xC23 91 | then 92 | AC_MSG_ERROR([Profile $U8ID_PROFILE or TR31 $U8ID_TR31 requires NFC]) 93 | else 94 | CFLAGS="$CFLAGS -DU8ID_NORM=$U8ID_NORM" 95 | dnl AC_DEFINE([U8ID_NORM],[$U8ID_NORM],[Define to NF{K,}{C,D} to support only this Unicode normalization]) 96 | fi 97 | fi 98 | AC_SUBST([U8ID_NORM]) 99 | 100 | AC_MSG_CHECKING([for --enable-confus]) 101 | AC_ARG_ENABLE([confus], 102 | [AS_HELP_STRING([--enable-confus], 103 | [add the confusables set @<:@default=no@:>@])], 104 | [case "${enableval}" in 105 | yes) HAVE_CONFUS=1; 106 | AC_DEFINE([HAVE_CONFUS], 1, 107 | [Defined to 1 to add the confusable API]) 108 | AC_MSG_RESULT([yes]) 109 | ;; 110 | *) AC_MSG_RESULT([no]) ;; 111 | esac], [HAVE_CONFUS= ]) 112 | if test -z "$HAVE_CONFUS"; then 113 | AC_MSG_RESULT([no]) 114 | fi 115 | AC_SUBST([HAVE_CONFUS]) 116 | 117 | download_croaring() { 118 | git clone https://github.com/RoaringBitmap/CRoaring 119 | } 120 | 121 | AC_MSG_CHECKING([for --with-croaring]) 122 | AC_ARG_WITH([croaring], 123 | [AS_HELP_STRING([--with-croaring=optional_path], 124 | [Use the CRoaring library.])], 125 | [case "${with_croaring}" in 126 | yes) HAVE_CROARING=1 127 | AC_MSG_RESULT([$with_croaring]) 128 | AC_CHECK_TOOLS([XXD], [xxd]) 129 | # in a parallel dir?
130 | with_croaring="$srcdir/../CRoaring" 131 | if test -d "${with_croaring}" && \ 132 | test -e "${with_croaring}/roaring.c" && \ 133 | test -e "${with_croaring}/roaring.h" 134 | then 135 | CROARING_PATH="\"$srcdir/../CRoaring\"" 136 | AC_MSG_RESULT([found "$srcdir/../CRoaring"]) 137 | else 138 | download_croaring && CROARING_PATH="CRoaring" 139 | fi 140 | ;; 141 | *) if test -d "${with_croaring}" && \ 142 | test -e "${with_croaring}/roaring.c" && \ 143 | test -e "${with_croaring}/roaring.h" 144 | then 145 | HAVE_CROARING=1 146 | CROARING_PATH="\"${with_croaring}\"" 147 | AC_MSG_RESULT(["$with_croaring"]) 148 | else 149 | AC_MSG_ERROR([invalid ${with_croaring} path]) 150 | fi 151 | ;; 152 | esac], 153 | [ HAVE_CROARING=; AC_MSG_RESULT([no]) ] 154 | ) 155 | 156 | if test -n "$HAVE_CROARING"; then 157 | AC_DEFINE([HAVE_CROARING], 1, 158 | [Defined to 1 when to use the CRoaring library]) 159 | fi 160 | AC_SUBST([HAVE_CROARING]) 161 | AC_SUBST([CROARING_PATH]) 162 | if test -n "$CROARING_PATH"; then 163 | AC_DEFINE_UNQUOTED([CROARING_PATH], $CROARING_PATH, 164 | [Defined to the path to the CRoaring library]) 165 | fi 166 | AM_CONDITIONAL([HAVE_CONFUS], [test x$HAVE_CONFUS = x1]) 167 | AM_CONDITIONAL([HAVE_CROARING], [test x$HAVE_CROARING = x1]) 168 | 169 | AC_MSG_NOTICE([ 170 | 171 | Check programs]) 172 | # =============================================== 173 | AC_CANONICAL_HOST 174 | AC_PROG_CC 175 | AC_PROG_CPP 176 | AC_PROG_INSTALL 177 | AM_PROG_CC_C_O 178 | AC_PROG_CC 179 | dnl AC_PROG_CC_C11 180 | if test "x$ac_cv_prog_cc_c11" != "xno"; then 181 | AC_DEFINE([HAVE_C11], 1, 182 | [Defined to 1 when the compiler supports c11]) 183 | if test "x$ac_cv_prog_cc_c11" != "x"; then 184 | AC_MSG_RESULT([added $ac_cv_prog_cc_c11 to CFLAGS]) 185 | AM_CFLAGS="$AM_CFLAGS $ac_cv_prog_cc_c11" 186 | fi 187 | fi 188 | AC_SUBST(HAVE_C11) 189 | AC_CACHE_CHECK([if compiling with clang], [ax_cv_cc_target_clang], 190 | [ 191 | saved_CFLAGS="$CFLAGS" 192 | CFLAGS="$TARGET_CFLAGS" 193 |
AC_COMPILE_IFELSE( 194 | [AC_LANG_PROGRAM([], [[ 195 | #ifdef __clang__ 196 | #error "is clang" 197 | #endif 198 | ]])], 199 | [ax_cv_cc_target_clang=no], [ax_cv_cc_target_clang=yes]) 200 | CFLAGS="$saved_CFLAGS" 201 | ]) 202 | 203 | # clang has no broken return-local-addr detection 204 | if test x$ac_cv_c_compiler_gnu = xyes -a $ax_cv_cc_target_clang = xno; then 205 | case $host_os in 206 | mingw*|cygwin*|msys*) 207 | WARN_CFLAGS="-Wall -Wextra" ;; 208 | *) dnl older non-c99 g++ complain about __VA_ARGS__, for(); 209 | if test "x$ac_cv_prog_cc_c99" = "xno"; then 210 | WARN_CFLAGS="-Wall -Wextra" 211 | else 212 | WARN_CFLAGS="-Wall -Wextra -Werror -Wno-return-local-addr" 213 | fi 214 | ;; 215 | esac 216 | elif test $ax_cv_cc_target_clang = xyes; then 217 | WARN_CFLAGS="-Wall -Wextra -Werror" 218 | else 219 | WARN_CFLAGS="-Wall -Wextra" 220 | fi 221 | AC_SUBST(WARN_CFLAGS) 222 | 223 | dnl optional for packaging or maintainance 224 | AC_CHECK_TOOLS([WGET], [wget2 wget]) 225 | # dnf install rubygem-ronn-ng 226 | AC_CHECK_TOOLS([RONN], [ronn]) 227 | AC_CHECK_TOOLS([PERL], [cperl perl]) 228 | AC_CHECK_TOOLS([PANDOC], [pandoc]) 229 | AM_CONDITIONAL([HAVE_PANDOC], [test -n "$PANDOC"]) 230 | AC_CHECK_TOOLS([GPERF], [gperf]) 231 | 232 | AC_MSG_NOTICE([Check header files]) 233 | # =============================================== 234 | AC_HEADER_STDBOOL 235 | AC_CHECK_HEADERS([stdlib.h \ 236 | malloc.h \ 237 | string.h \ 238 | stddef.h \ 239 | inttypes.h \ 240 | stdint.h \ 241 | assert.h \ 242 | unistd.h \ 243 | getopt.h \ 244 | sys/types.h \ 245 | sys/stat.h \ 246 | dirent.h \ 247 | libgen.h \ 248 | uniwbrk.h \ 249 | ]) 250 | AC_SEARCH_LIBS([u8_wordbreaks],[unistring], 251 | [AC_DEFINE([HAVE_U8_WORDBREAKS], [1], [Define to 1 if you have the `u8_wordbreaks' function.]) 252 | u8idlint_LIBS=$ac_cv_search_u8_wordbreaks 253 | AC_SUBST([u8idlint_LIBS]) 254 | AC_SUBST([HAVE_U8_WORDBREAKS]) 255 | ], 256 | [if pkg-config --exists libunistring; then 257 | 
AC_DEFINE([HAVE_U8_WORDBREAKS], [1], [Define to 1 if you have the `u8_wordbreaks' function.]) 258 | U8IDLINT_CFLAGS=`pkg-config libunistring --cflags` 259 | u8idlint_LIBS=`pkg-config libunistring --libs` 260 | AC_SUBST([U8IDLINT_CFLAGS]) 261 | AC_SUBST([u8idlint_LIBS]) 262 | AC_SUBST([HAVE_U8_WORDBREAKS]) 263 | else 264 | AC_MSG_WARN([libunistring for u8idlint not found]) 265 | fi]) 266 | AM_CONDITIONAL([HAVE_LIBUNISTRING], [test x$HAVE_U8_WORDBREAKS = x1]) 267 | 268 | AC_MSG_NOTICE([Check typedefs, structures, and compiler characteristics]) 269 | # =============================================== 270 | AC_C_CONST 271 | AC_C_INLINE 272 | AC_C_RESTRICT 273 | AC_TYPE_SIZE_T 274 | AC_TYPE_INT32_T 275 | AC_TYPE_UINT8_T 276 | AC_TYPE_UINT16_T 277 | AC_TYPE_UINT32_T 278 | AC_TYPE_UINT64_T 279 | dnl AC_TYPE_UINTPTR_T 280 | AC_CHECK_SIZEOF(size_t) 281 | dnl AC_CHECK_FUNCS([ memset strcmp ]) 282 | AC_CHECK_FUNCS([getopt_long],[], 283 | AC_MSG_WARN([getopt_long not found. u8idlint will not accept long options.])) 284 | AX_GCC_BUILTIN(__builtin_ffs) 285 | 286 | case $host_os in 287 | mingw*|cygwin*|msys*) 288 | have_windows=1 289 | CFLAGS="$CFLAGS -fstack-protector" 290 | ;; 291 | esac 292 | AM_CONDITIONAL([HAVE_WINDOWS], [test x1 = x$have_windows]) 293 | 294 | AC_CHECK_TOOLS([PERL], [cperl perl]) 295 | AC_CHECK_TOOLS([WGET], [wget]) 296 | 297 | AH_TOP([ 298 | #ifndef __U8ID_CONF_H__ 299 | #define __U8ID_CONF_H__ 300 | ]) 301 | 302 | AH_BOTTOM([ 303 | #endif /* __U8ID_CONF_H__ */ 304 | ]) 305 | 306 | LTEXEC="./libtool --mode=execute" 307 | AC_SUBST(LTEXEC) 308 | 309 | AC_CONFIG_HEADERS([config.h]) 310 | AC_CONFIG_FILES([libu8ident.pc 311 | Makefile]) 312 | AC_CONFIG_FILES([test-texts.test], [chmod +x test-texts.test]) 313 | AC_OUTPUT 314 | -------------------------------------------------------------------------------- /doc/D2528R1.pdf: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/rurban/libu8ident/905e037c5fd5f9ce0ce0b78274ac2a42ff8374bd/doc/D2528R1.pdf -------------------------------------------------------------------------------- /doc/P2528R0.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rurban/libu8ident/905e037c5fd5f9ce0ce0b78274ac2a42ff8374bd/doc/P2528R0.pdf -------------------------------------------------------------------------------- /doc/c11.md: -------------------------------------------------------------------------------- 1 | # C11 2 | 3 | The unicode security guidelines for identifiers specify some rules 4 | for identifiers to stay identifiable. Most language and tool 5 | implementors ignored them when they added unicode support to their language. 6 | 7 | Only very few resisted, or actually implemented them. Java, cperl and rust 8 | properly implemented it. zig and J refused. gcc came up with it in version 9 | 10, but the feature request started in 2015. clang has had it since 3.3; 10 | msvc focuses on insecure UTF-16 instead. 11 | 12 | # The insecure C Standard C11 13 | 14 | The C11 standard started allowing a certain range of (mostly insecure) AltId 15 | codepoints, and did not define an XID profile, nor any TR39 security restrictions. 16 | Thus this is an insecure Unrestricted profile 6. 17 | In libu8ident this is `U8ID_PROFILE_C11_6`, defined by `-DU8ID_PROFILE_C11STD`. 18 | See `unic11.h` and [N2731](http://www.open-std.org/jtc1/sc22/wg14/www/docs/n2731.pdf) 19 | Annex D (p. 425) 20 | 21 | The C11 standard ignores secure IdentifierTypes, mixed-script violations and more. 22 | This allows easy Arabic right-to-left overrides, Greek + Cyrillic confusables, and 23 | does not mandate a normalization of identifiers, leading to confusables 24 | even with Latin-only C identifiers.
25 | 26 | See 27 | summarizing the 28 | [tr31#Alternative_Identifier_Syntax](http://unicode.org/reports/tr31/#Alternative_Identifier_Syntax) 29 | \[AltId\] in where anything is allowed but: white space, "syntax" characters, 30 | private use characters, surrogates, control characters, and 31 | non-characters. 32 | 33 | They hereby violated their own recommendations of 34 | to 35 | only allow Allowed Identifiers, which is to only allow the following 36 | properties: 37 | 38 | * L& The symbol "L&" indicates characters of type Lu, Ll, or Lt (see below). 39 | * Lu Letter, Uppercase 40 | * Ll Letter, Lowercase 41 | * Lt Letter, Titlecase 42 | * Lm Letter, Modifier 43 | * Lo Letter, Other 44 | * Mn Mark, Non-Spacing 45 | * Mc Mark, Spacing Combining 46 | * Nd Number, Decimal Digit 47 | * Nl Number, Letter 48 | * Pc Punctuation, Connector 49 | 50 | referring to the table TR10176-4. In practice you would also disallow 51 | Excluded and Limited Use scripts, only allow Allowed IdentifierStatus XID's, 52 | and check for mixed-script violations. 53 | 54 | The C++ Working Group detected the XID problem in 55 | [n2836](http://www.open-std.org/jtc1/sc22/wg14/www/docs/n2836.pdf) for 56 | C++23, but found that implementing TR39 would be too hard. "This proposal 57 | does not address some potential security concerns—so called homoglyph 58 | attacks—where letters that appear the same may be treated as 59 | distinct. Methods of defense against such attacks are complex and 60 | evolving, and requiring mitigation strategies would impose substantial 61 | implementation burden." [P1949R7](http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2021/p1949r7.html) 62 | 63 | Mixed script detection such as done here is certainly not as hard as 64 | TR31 mitigation and esp. NFC normalization. It is also not evolving: TR39 has been 65 | standardized since Unicode 6, with its earliest revisions in [2006](https://www.unicode.org/Public/security/revision-02/).
66 | 67 | # The SAFEC26 Extension 68 | 69 | I provide a fixed version of the C++ and C standard for secure unicode 70 | identifers with a special profile, called SAFEC26. This is an 71 | extended Moderate Profile, plus allowing **Greek** scripts with Latin, 72 | plus only allowing a special SAFEC26 TR31 charset. You realistically only 73 | get something into the C standard if you get it into C++ first. 74 | 75 | The secure extension disallows the Restricted and Limited_Use scripts 76 | and identifiers, disallows arbitrary right-to-left and left-to-right 77 | overrides within identifiers (i.e. no Latin + Arabic mixes), and 78 | disallows all the insecure mixed scripts combinations. But you can 79 | still write and compile unreadable Asian or Indian C code, or even 80 | Arabic (right-to-left), as long as you don't mix it with Latin 81 | (left-to-right). 82 | 83 | # Compilers 84 | 85 | The insecure C11 unicode identifiers were added to gcc, clang and 86 | chibicc. C99 already allowed Extended C99 Identifiers with \u and \U escape 87 | sequences. 88 | msvc claims to support them, but does not so yet, it rather focuses on insecure 89 | UTF-16 identifiers. 90 | [pcc](http://pcc.ludd.ltu.se/ftp/pub/pcc-docs/pcc-utf8-ver3.pdf) was 91 | worked on also. Don't know about other C compilers, most likely they 92 | are still secure. 93 | 94 | * gcc added these with version 10. See [PR 95 | 67224](https://gcc.gnu.org/bugzilla/show_bug.cgi?id=67224) but it 96 | generated (unrestricted) UTF-8 encoded ASM output before. So you can 97 | link against earlier GCC object files. 98 | 99 | * clang started secure Extended C99 Identifier support (`\uxxxx`) plus insecure 100 | C11 Unicode Identifiers with version 3.3. 
101 | [ReleaseNotes](https://releases.llvm.org/3.3/tools/clang/docs/ReleaseNotes.html), 102 | [godbolt clang 3.3](https://godbolt.org/z/1dzMxPhvc) 103 | 104 | * MSVC has [Unicode support in the compiler and linker](https://docs.microsoft.com/en-us/cpp/build/reference/unicode-support-in-the-compiler-and-linker?view=msvc-140) 105 | for identifiers, included filenames (apparently unsanitized, so RTL 106 | attacks are possible. Plus for macros, string and character 107 | literals. Since at least Visual Studio 2015. But there's a MSVC 108 | [Feature Request](https://developercommunity.visualstudio.com/t/feature-request-allow-unicode-characters-in-identi/821782) 109 | to add insecure UTF-8 identifiers to MSVC. Apparently it doesn't 110 | support them now. [godbolt msvc 19](https://godbolt.org/z/xrnPnGPff) 111 | 112 | * icc since version 19. [godbolt icc 18](https://godbolt.org/z/8fhsf5xhT) 113 | 114 | * chibicc also took the C11 standard ranges: 115 | [unicode.c](https://github.com/rui314/chibicc/commits/main/unicode.c). 116 | My [Unicode security improvements over 117 | C11](https://github.com/rui314/chibicc/issues/32) request is open. 118 | 119 | * pcc acknowledges that the original C11 set _"is not restrictive 120 | enough to ensure readable code"_. 121 | , 122 | but does nothing much against it. The preprocessor translates UTF-8 123 | chars to escaped `\Uxxxxxxxxx` strings though, to make them even 124 | more unreadable, but at least allows for easier tool checks. 125 | 126 | # Linters 127 | 128 | * clang-tidy can check with readability-identifier-naming for 129 | identifiers naming style mismatch (cases), but does not warn on any 130 | unicode security guideline violations. It could be easily added though. 131 | 132 | * This u8idlint started with unicode identifiers checks. 133 | 134 | * found no malicious homoglyphs 135 | in publicly available Go source files yet. 
136 | 137 | I know of no other linters or security checkers doing unicode 138 | identifiers checks, even when the C11 standard referred to linters to 139 | do such checks. A binutils linter is also needed to check compiled 140 | libraries for insecure identifiers. See my binutils-gdb repo on 141 | github for a patched readelf to perform checks on unicode symbols. 142 | 143 | # Tools 144 | 145 | I provide with libu8ident a fixed version of the C++/C standard for 146 | a secure unicode identifiers profile, called 147 | **`U8ID_PROFILE_C26_4`**, defined by `-DU8ID_PROFILE_SAFEC26`. This is 148 | an extended Moderate Profile (4), plus allowing Greek with Latin, plus 149 | only allowing Allowed IdentifierTypes. It also demands NFC normalization. 150 | See `unic11.h`, `unic26.h` and `mkc26.c`. 151 | 152 | # Others 153 | 154 | C++, perl5, perl6 (now raku), ruby, python, php, swift, julia, D, nim, crystal 155 | ... all added insecure unicode identifiers in the last years. php just by accident, 156 | not on purpose. 157 | 158 | **zig** refused to add unicode identifiers in the latest two feature requests. 159 | Not so much because of its insecurity, more of hassles with the yearly changing 160 | Unicode standard. Valid source code could suddenly become invalid. 161 | **J** also stayed with ASCII only. 162 | 163 | **Clojure** is also safe recommending only ASCII identifiers, 164 | [Symbols](https://clojure.org/reference/reader#_symbols), but you can intern 165 | arbitrary strings at will. Same as **Common Lisp** and all lisps in general. 166 | There are no identifier/symbol security considerations in place at all, similar 167 | to filesystems. 168 | 169 | **cperl** (an improved perl5) is very secure. It uses the security 170 | profile 4, does NFC normalization internally, and allows adding 171 | Limited_Use scripts, and special encodings for identifiers. 172 | 173 | **rust** is pretty secure.
174 | Support was discussed in [RFC 2457](https://github.com/rust-lang/rust/issues/55467), 175 | [GH 28979](https://github.com/rust-lang/rust/issues/28979) and 176 | [GH 2253](https://github.com/rust-lang/rust/issues/2253) 177 | and is described as [TR31 compatible](https://doc.rust-lang.org/reference/identifiers.html). 178 | Don't know if they decided on NFC or NFKC normalization. 179 | People are still pushing for _more inclusive_ (i.e. insecure) unicode identifiers. 180 | 181 | **C#** and **F#** follow TR31 and do NFC normalization, but do not check for mixed 182 | scripts nor Excluded scripts. [Identifiers](https://go.microsoft.com/fwlink/?LinkId=199552) 183 | 184 | **python** [PEP 3131](https://www.python.org/dev/peps/pep-3131/) at least 185 | does normalization (NFKC, an odd choice), but does not check for mixed 186 | scripts nor Excluded scripts. They did read TR39, but didn't understand 187 | nor follow it. 188 | 189 | **Java** 190 | [identifiers](https://docs.oracle.com/javase/specs/jls/se17/html/jls-3.html#jls-3.8) 191 | mentions only TR31, but no mixed scripts profile nor Excluded scripts 192 | checks. Comparison/lookup only ignores ignorables, but does no normalization. 193 | 194 | **perl** checks same as python for `XID_Start`/`XID_Continue` properties, 195 | which is at least a bit better than C11. It does not check for mixed 196 | scripts nor Excluded scripts, nor does it do identifier normalization. 197 | But perl5.16 started even adding support for binary names with embedded 198 | `\0` in 2012. Which is highly insecure, because names are mapped to 199 | filenames and paths when including modules, and the filesystem just 200 | stops searching at the `\0`. This way you can trivially hide shellcode 201 | in user-identifiers. 202 | Earlier perls allowed single source encoding, like Cyrillic, which was 203 | very secure, but abandoned that a few years ago. cperl kept that.
ALGOL 68 204 | also allowed encodings (a representation language), but even for its keywords. 205 | 206 | **C++** has the same insecure unicode identifiers as C. 207 | 208 | **javascript** in 209 | [ES5](https://mathiasbynens.be/notes/javascript-identifiers) used Lu, 210 | Ll, Lt, Lm, Lo or Nl for their `ID_Start`, and adds Mn, Mc, Nd and Pc 211 | to their `ID_Cont`. (i.e. TR31 profile 5), but no mixed scripts nor 212 | Excluded scripts, No identifier normalization. Same as DefId as they 213 | discussed early in C11, but abandoned for the insecure AltId 214 | variant. [ES6](https://mathiasbynens.be/notes/javascript-identifiers-es6) 215 | then went more insecure by allowing all ID\_Start and ID\_Cont 216 | characters. 217 | 218 | **Julia** [Allowed Variable 219 | Names](https://docs.julialang.org/en/v1/manual/variables/#man-allowed-variable-names) 220 | went with TR31 and profile 5, but has some special subset. No mixed scripts nor 221 | Excluded scripts, but NFC normalization and some confusable handling. 222 | 223 | **nim** [Identifiers & 224 | Keywords](https://nim-lang.org/docs/manual.html#lexical-analysis-identifiers-amp-keywords) 225 | allows everything as identifier, every unicode character above 127 is 226 | a letter, not-normalized. So they have unidentifiable identifers, and 227 | should rather use a new name for that. Maybe "name" or "symbol" 228 | instead of identifier. 229 | 230 | **crystal** [Symbol](https://crystal-lang.org/reference/1.2/syntax_and_semantics/literals/symbol.html) 231 | allows similarilty binary chunk as nim, everything above 159 is a letter for them. 232 | **Lua** and **luajit** also don't care much about unicode identifiers being identifiable. 233 | 234 | **D** also supports all raw unicode characters as identifiers. 235 | 236 | 237 | **Factor** ditto. 238 | 239 | **Go** is UTF-8 and defines its 240 | [identifers](https://go.dev/ref/spec#Identifiers) as letters, so 241 | TR31 ID. 
It does not check for mixed scripts nor Excluded scripts, nor 242 | does normalization. An exported name must begin with Lu (Uppercase Letter). 243 | 244 | **Haskell** identifiers must start with Ll (Lowercase Letter), but no further 245 | restrictions or normalization support AFAIK. 246 | 247 | **php** does not support unicode identifiers nor UTF-8 encodings, but 248 | does accidentally so, by allowing everything in 249 | `[a-zA-Z_\x7f-\xff][a-zA-Z0-9_\x7f-\xff]*`, which leads to the same 250 | insecurities as with nim and crystal, treating everything above 127 as letter. 251 | 252 | **Ada 95** supports raw unicode identifiers, but you need to specify 253 | its encoding. You can bet it's entirely insecure. 254 | 255 | Modern **Forth** as glorified stack assembler allows everything 256 | unidentifiable. Older FORTH was safe though. 257 | 258 | **Rosetta Code** has a general overview of unicode identifiers in all 259 | programming languages, but without its special insecurities and 260 | caveats: 261 | 262 | Generally they were all very fast to add insecure unicode names 263 | (ignoring the official security guidelines), and also provided a slow 264 | SipHash security theatre for their hash tables instead of proper hash 265 | table security.
266 | 267 | ---- 268 | Created Reini Urban, 28.Dec 2021 269 | -------------------------------------------------------------------------------- /doc/n2916.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rurban/libu8ident/905e037c5fd5f9ce0ce0b78274ac2a42ff8374bd/doc/n2916.pdf -------------------------------------------------------------------------------- /doc/n2932.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rurban/libu8ident/905e037c5fd5f9ce0ce0b78274ac2a42ff8374bd/doc/n2932.pdf -------------------------------------------------------------------------------- /doc/nXXXX.patch: -------------------------------------------------------------------------------- 1 | --- doc/P2528R1.md 2022-02-15 13:44:14.210682108 +0100 2 | +++ doc/n2932.md 2022-02-15 13:45:14.746174957 +0100 3 | @@ -1,20 +1,19 @@ 4 | - C++ Identifier Security using Unicode Standard Annex 39 5 | + n2932 - C Identifier Security using Unicode Standard Annex 39 v2 6 | 7 | - Document #: P2538R1 8 | - Date: 2022-02-12 9 | - Project: Programming Language C++ 10 | - Audience: SG-16 11 | - EWG 12 | - CWG 13 | + Date: 2022-02-15 14 | + Project: Programming Language C 15 | + Audience: WG14 16 | + SG-16 17 | Reply-to: Reini Urban 18 | 19 | 1 Abstract 20 | ========== 21 | 22 | -In response to [P1949R7](http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2021/p1949r7.html), and 23 | -in parallel to [n2932](http://www.open-std.org/jtc1/sc22/wg14/www/docs/n2932.htm) for C. 24 | +In response to [P1949R7](http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2021/p1949r7.html), 25 | +replaces [n2916](http://www.open-std.org/jtc1/sc22/wg14/www/docs/n2916.htm), 26 | +in parallel to [P2528R1](http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2022/P2528R1.html) for C++. 27 | 28 | -Adopt Unicode Annex 39 "Unicode Security Mechanisms" as part of C++26. 
29 | +Adopt Unicode Annex 39 "Unicode Security Mechanisms" as part of C26. 30 | 31 | * Comply to a variant of [TR39#5.2](https://www.unicode.org/reports/tr39/#Restriction_Level_Detection) 32 | Mixed-Scripts Moderately Restrictive profile, but allow some Greek letters without 33 | @@ -37,7 +36,7 @@ 34 | Recommend binutils/linker ABI identifier rules: names are UTF-8, 35 | add identifier checks. E.g. `readelf -L -Ue`. 36 | 37 | -In addition adopt this proposal as a Defect Report against C++23 and 38 | +In addition adopt this proposal as a Defect Report against C11 and 39 | earlier. The author provides the 40 | [libu8ident](https://github.com/rurban/libu8ident/) library (Apache 2 41 | licensed) and its generated tables to all implementors. 42 | @@ -48,10 +47,9 @@ 43 | 2 Changes 44 | ========= 45 | 46 | -From R0: 47 | +From [n2916](http://www.open-std.org/jtc1/sc22/wg14/www/docs/n2916.htm) 2022-01-22: 48 | 49 | -* Add internal links. 50 | -* Rename C23 to C26, it's too late for C++23. 51 | +* Rename C23 to C26, it's too late for C23, at least for C++23. 52 | * Disallow non-confusable `Technical` U+1C0..U+1C3 53 | * Fix a lot of not Allowed ID_Start ranges. safec26_start_list 54 | from 355 ranges, 115 singles, 99350 codepoints 55 | @@ -83,7 +81,7 @@ 56 | as they appeared in browsers and email. Also names in C object files: 57 | linkers, .def files, ffi's. 58 | 59 | -Implementing TR39 mixed script detection per document (C++ Header and 60 | +Implementing TR39 mixed script detection per document (C Header and 61 | Source file) forbids insecure mixes of Greek and Cyrillic, dangerous 62 | Arabic RTL bidi attacks and most confusables. You can still write in 63 | your language, but then only in commonly written languages, and not 64 | @@ -91,13 +89,13 @@ 65 | 66 | The question remains if TR39 security violations should be ill-formed 67 | (throw an compilation error or warning), or not. 
Since we do have the 68 | -`-std=c++26` option, and the issues are security relevant, an error 69 | -the issues are security relevant, an error seems to be best. 70 | -Implementations might choose to go for warnings on not-valid scripts, 71 | -mixed scripts, or invalid sequences of combining marks though, even if 72 | -the Unicode Standard recommended for decades that identifiers should 73 | -stay identifiable. If the standard committee opts for the insecure 74 | -option, they should rather rename identifiers to symbols then. 75 | +`-std=c2x` option, and the issues are security relevant, an error 76 | +seems to be best. Implementations might choose to go for warnings on 77 | +not-valid scripts, mixed scripts, or invalid sequences of combining 78 | +marks though, even if the Unicode Standard recommended for decades 79 | +that identifiers should stay identifiable. If the standard committee 80 | +opts for the insecure option, they should rather rename identifiers to 81 | +symbols then. 82 | 83 | 4 Motivation 84 | ============ 85 | @@ -109,11 +107,12 @@ 86 | , 87 | 88 | * 89 | -* with \*-sec\*.c\* 90 | +* with \*-sec\*.c 91 | 92 | These changes would fix all of the known security problems with C++/C 93 | identifiers. With C++ it is more severe as declarations are easily 94 | -confusable with initializations. 95 | +confusable with initializations, but C is still affected in unclear 96 | +situations. 97 | 98 | 5 What will this proposal change 99 | ================================ 100 | @@ -196,7 +195,7 @@ 101 | 5.3 Documents with identifiers in many multiple scripts/languages will become illegal 102 | ------------------------------------------------------------------------------------- 103 | 104 | -C++26 (and C26) will follow the TR39 Security Profile 4 **Moderately 105 | +C26 (and C26++) will follow the TR39 Security Profile 4 **Moderately 106 | Restrictive**, with an exception for Greek. 
107 | 108 | * All identifiers in a document qualify as Single Script, or 109 | @@ -213,7 +212,7 @@ 110 | 5.4 Mixed-script runs with combining marks will become illegal 111 | -------------------------------------------------------------- 112 | 113 | -C++26 will check for unlikely sequences of **combining marks**, and 114 | +C26 will check for unlikely sequences of **combining marks**, and 115 | reject some. Combining Marks have no script property per se, but a 116 | variable list of allowed SCX scripts, which need to be checked against 117 | the base character. Also 4 Japanese KATAKANA-HIRAGANA PROLONGED SOUND 118 | @@ -278,7 +277,7 @@ 119 | 8.1 SC 120 | ----- 121 | 122 | -C++ only needs to map unicode characters to a script property via a 123 | +C only needs to map unicode characters to a script property via a 124 | single byte. There are currently 161 scripts assigned, 32 of them are 125 | in common use as identifiers, hence called **Recommended** scripts. The 126 | rest is split up into 127-31 **Excluded** scripts, which are not in common 127 | @@ -389,7 +388,7 @@ 128 | SOUND MARKs, all other Lm modifiers may mix with all SCX. 129 | 130 | The list of allowed combining mark characters (with Common or Inherited 131 | -scripts) in the C++26 TR31 profile is: Lm `Modifier_Letter`, 132 | +scripts) in the C26 TR31 profile is: Lm `Modifier_Letter`, 133 | Mc `Spacing_Mark`, Mn `Nonspacing_Mark`, Me `Enclosing_Mark`. Sk and Cf are 134 | not part of XIDs. 135 | 136 | @@ -416,7 +415,7 @@ 137 | ... 
138 | ``` 139 | 140 | -From these 67 Lm plus 513 M\[cn\] ranges filtering out the non-C++26 XID 141 | +From these 67 Lm plus 513 M\[cn\] ranges filtering out the non-C26 XID 142 | candidates, only #8 Identifier_Type = Recommended, Inclusion, 143 | non-confusable Technical, plus only #4.2 Recommended Scripts, plus only codepoints 144 | with multiple SCX entries, plus only codepoints which don't decompose 145 | @@ -425,7 +424,7 @@ 146 | 147 | So some of the Common `XID_Continue` marks therefore cannot be 148 | detected with the SCX logic. But all of them do not combine with Latin 149 | -and are already filtered by the C++26 Mixed Script profile. 150 | +and are already filtered by the C26 Mixed Script profile. 151 | And all of the Combining Marks are caught by the NFC requirement. 152 | 153 | Most Lm Modifier Letters (besides the 4 Japanese PROLONGED SOUND 154 | @@ -564,11 +563,11 @@ 155 | identifiers. One could argue that a mixed-script profile is valid 156 | only for a single identifier, or it is valid for the whole source file 157 | document. And there needs to be a definition if before or after the 158 | -preprocessor, and if to treat names in private structs, classes and 159 | -local names in functions as seperate contexts. 160 | +preprocessor, and if to treat names in private structs and local names 161 | +in functions as seperate contexts. 162 | 163 | If valid for only a single identifier you could arbitralily mix up 164 | -Cyrillic with Greek identifiers in a C++ namespace, and thus these 165 | +Cyrillic with Greek identifiers in a C files, and thus these 166 | identifiers would not be identifiable anymore, as both both can render 167 | to the very same glyphs. Thus we adopt the notion of identifier 168 | contexts. 
169 | @@ -605,11 +604,10 @@ 170 | - **private**: Another argument would be that all exported names end 171 | up in the object files and library flat, which would support the 172 | seperation of private and public name contexts, where to perform the 173 | - mixed-script checks. Private contexts (e.g. static structs, private 174 | - class fields, local names in functions) should be seperated from the 175 | - rest. This would prevent from confusables in struct/class 176 | - fields/methods, and the rest is seperated by the checks for the 177 | - public names. 178 | + mixed-script checks. Private contexts (e.g. static structs fields or 179 | + local names in functions) should be seperated from the rest. This 180 | + would prevent from confusables in struct fields/methods, and the 181 | + rest is seperated by the checks for the public names. 182 | 183 | - **after-cpp**: The third, strictest variant would define the context in 184 | the file after cpp. You would not be able to include a Cyrillic-only 185 | @@ -668,13 +666,13 @@ 186 | there, and names are charset (=user) specific, whilst there are no 187 | header fields for the used charset (e.g. if SHIFT-JIS or UTF-8), nor 188 | are there any rules for name lookup (normalization). This is not 189 | -solvable here (in C nor C++), only there or the gABI. Only in the Rust 190 | +solvable here (in C), only there or the gABI. Only in the Rust 191 | ecosystem there are proper unicode identifier rules, but Rust can link 192 | -against C++/C. I haven't detected any exported unicode names in the 193 | -wild, they are only used in local symbols still. UTF-16 compilers such 194 | -as MSVC do export their UNICODE names either in the local character 195 | -set or as UTF-8. If used wildly, object files would not link anymore, 196 | -as local character sets vary, and there is no character set standard 197 | +against C. I haven't detected any exported unicode names in the wild, 198 | +they are only used in local symbols still. 
UTF-16 compilers such as 199 | +MSVC do export their UNICODE names either in the local character set 200 | +or as UTF-8. If used wildly, object files would not link anymore, as 201 | +local character sets vary, and there is no character set standard 202 | defined. 203 | 204 | The C++/C working groups should urge the binutils/linker working 205 | -------------------------------------------------------------------------------- /doc/nXXXX.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rurban/libu8ident/905e037c5fd5f9ce0ce0b78274ac2a42ff8374bd/doc/nXXXX.pdf -------------------------------------------------------------------------------- /doc/tr31-bugs.md: -------------------------------------------------------------------------------- 1 | TR31 Security Bugs (UCD Versions 1-14) 2 | ====================================== 3 | 4 | U+FF00..U+FFEF not as ID 5 | --------------------------- 6 | 7 | Most of the U+FF00..U+FFEF Full and Halfwidth letters have incorrectly 8 | `ID_Start` resp. `ID_Continue` properties. XID ditto. 9 | They should not, because they are confusable with the normal 10 | characters in the base planes. E.g. LATIN A-Z are indistuingishable 11 | from A..Z, LATIN a-z from a..z, likewise for the Katakana ヲ..ッ and 12 | ア..ン, and the Hangul ᅠ..ᄒ, ᅡ..ᅦ, ᅧ..ᅬ, ᅭ..ᅲ ᅳ..ᅵ halfwidth letters. 13 | 14 | This is esp. for TR39 a security risk. TR39 provides Identifier Type 15 | properties to exclude insecure identifiers, but I cannot find any 16 | other type property to set these U+FF21..U+FFDC IDs to, than 17 | `Not_XID`. Thus the `ID_Start`/`ID_Continue` property should be 18 | deleted for all of them. If they are not identifiable, they should not 19 | be marked as such. 
20 | 21 | Medial letters in `ID_Start`, not `ID_Continue` 22 | ------------------------------------------------- 23 | 24 | DerivedCoreProperties lists all of the Arabic and Thai MEDIAL letters, 25 | which are part of identifiers in `ID_Start`, not in `ID_Continue`. Only 26 | the Combining marks are in `ID_Continue`. Thus all unicode-aware 27 | parsers accept such MEDIAL letters incorrectly in the start 28 | position. They should only be allowed in the `ID_Continue` position, 29 | and parsers should disallow them in the end positions for identifiers. 30 | 31 | All the other medial letters (Myanmar, Canadian Aboriginal, Ahom, 32 | Dives Akuru) are not part of Recommended Scripts, so they do not 33 | affect TR39 security. But since almost nobody but Java, cperl and Rust 34 | honor TR39 it's still affecting most parsers. 35 | 36 | Other medial exceptions are noted in TR31 at 2.4 Specific Character 37 | Adjustments, but the tables DerivedCoreProperties and TR39 Identifier 38 | tables and thus all user parsers are wrong. 39 | 40 | 41 | For the proposed C++26/C26 standard no such medial characters are 42 | included. 43 | 44 | Confusable Technical IdTypes 45 | ---------------------------- 46 | 47 | DerivedCoreProperties.txt lists the three ǃ U+1C3 "LATIN LETTER ALVEOLAR CLICK" 48 | ǀ U+1C0 "LATIN LETTER DENTAL CLICK" and ǁ U+1C1 "LATIN LETTER LATERAL CLICK" 49 | as `ID_Start`. But they should not be included in ID at all, as they are 50 | confusable with common operators. 51 | 52 | TR39 IdentifierTypes.txt lists the three ǃ U+1C3 "LATIN LETTER 53 | ALVEOLAR CLICK" ǀ U+1C0 "LATIN LETTER DENTAL CLICK" and ǁ U+1C1 "LATIN 54 | LETTER LATERAL CLICK" as Technical. So for normal IdentifierStatus 55 | Allowed these would be excluded. But C26/C++26 wishes to add Technical 56 | also. Add at least an Exclusion, if not the Non_XID property for XID 57 | stability concerns. But security bugs should trump stability 58 | guarantees. 59 | 60 | See 61 | for an exploit.
62 | 63 | Arabic Presentation Forms-A: U+FB50–U+FDFF and Forms-B: U+FE70–U+FEFF not as ID 64 | ------------------------------------------------------------------------------- 65 | 66 | Forms-A contains a list of Arabic presentation forms encoded as 67 | characters primarily for compatibility reasons. Forms-B are for 68 | compatibility with preexisting standards and legacy implementations 69 | that use these forms as character. Instead of these, letters from the 70 | Arabic block (U+0600..U+06FF) should be used for identifiers. See 71 | and 72 | . The TR39 idtype of these 73 | should be changed to Obsolete. 74 | 75 | ---- 76 | Last checked against DerivedCoreProperties-15.0.0d1.txt from 2022-04-26 77 | -------------------------------------------------------------------------------- /example.c: -------------------------------------------------------------------------------- 1 | /* libu8ident - Check unicode security guidelines for identifiers. 2 | Copyright 2022 Reini Urban 3 | SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later 4 | 5 | Contrary to the tests and helpers, provide a test binary to link 6 | against the public methods and lib only. 
7 | */ 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include "u8ident.h" 14 | 15 | static void version(void) { puts("libu8ident example"); } 16 | static void usage(int exitcode) { 17 | version(); 18 | puts("Usage: example [--help|--version]"); 19 | puts("\nSEE ALSO:"); 20 | puts(" u8idlint.1"); 21 | puts("\nAUTHOR:"); 22 | puts(" Reini Urban "); 23 | exit(exitcode); 24 | } 25 | 26 | int main(int argc, char **argv) { 27 | unsigned linenr = 0U; 28 | enum u8id_options xid = U8ID_TR31_XID; 29 | enum u8id_norm norm = U8ID_NFC; 30 | enum u8id_profile profile = U8ID_PROFILE_C26_4; 31 | unsigned u8idopts = (unsigned)xid; 32 | 33 | char path[256] = {0}; 34 | static char line[1024] = {0}; 35 | 36 | if (argc > 1 && !strcmp(argv[1], "--help")) 37 | usage(0); 38 | if (argc > 1 && !strcmp(argv[1], "--version")) { 39 | version(); 40 | exit(0); 41 | } 42 | u8ident_init(profile, norm, u8idopts); 43 | if (getenv("U8IDTEST_TEXTS")) { 44 | strncpy(path, getenv("U8IDTEST_TEXTS"), 255); 45 | #ifdef _WIN32 46 | path[255] = '\0'; // only windows does not terminate 47 | #endif 48 | } 49 | if (argc > 1) { 50 | strncpy(path, argv[1], 255); 51 | #ifdef _WIN32 52 | path[255] = '\0'; // only windows does not terminate 53 | #endif 54 | } else { 55 | if (*path) 56 | strncat(path, "/myanmar-1.txt", 255); 57 | else 58 | strncpy(path, "texts/myanmar-1.txt", 255); 59 | } 60 | 61 | FILE *f = fopen(path, "r"); 62 | if (!f) { 63 | perror("fopen"); 64 | fprintf(stderr, "%s\n", path); 65 | return -1; 66 | } 67 | printf("u8ident_check %s\n", path); 68 | 69 | while (fgets(line, 1023, f)) { 70 | linenr++; 71 | // split on whitespace 72 | char *s = &line[0]; 73 | while (*s) { 74 | // we really should split on identifier boundaries, as in a real parser. 75 | // but this is just a simple example. So we get lots of unneeded XID 76 | // errors. 77 | char word[256]; 78 | char *norm = NULL; 79 | char *p = strchr(s, ' '); 80 | ptrdiff_t len = p ? 
(ptrdiff_t)(p - s) : (ptrdiff_t)strlen(s); 81 | if (len && s[len - 1] == '\n') 82 | len--; 83 | if (!len) 84 | break; 85 | int ret = u8ident_check_buf(s, len, NULL); 86 | switch (ret) { 87 | case U8ID_EOK_NORM: 88 | if (len < 256) { 89 | strncpy(word, s, len); 90 | word[len] = '\0'; 91 | u8ident_check((uint8_t *)word, &norm); 92 | printf("\"%s\".%u: NFC => \"%s\"\n", word, linenr, norm); 93 | free(norm); 94 | } else { 95 | printf("\"%.*s\".%u: NFC ...\n", (int)len, s, linenr); 96 | } 97 | break; 98 | case U8ID_ERR_XID: 99 | printf("\"%.*s\".%u: Wrong XID U+%X\n", (int)len, s, linenr, 100 | u8ident_failed_char(0)); 101 | break; 102 | case U8ID_ERR_COMBINE: 103 | printf("\"%.*s\".%u: Wrong Combining Mark U+%X\n", (int)len, s, linenr, 104 | u8ident_failed_char(0)); 105 | break; 106 | case U8ID_ERR_SCRIPT: 107 | case U8ID_ERR_SCRIPTS: { 108 | uint32_t cp = u8ident_failed_char(0); 109 | const char *scr = u8ident_failed_script_name(0); 110 | const char *scripts = u8ident_existing_scripts(0); 111 | printf("\"%.*s\".%u: Wrong SCRIPT %s for U+%X, have %s\n", (int)len, s, 112 | linenr, scr, cp, scripts); 113 | free((char*)scripts); 114 | break; 115 | } 116 | default: 117 | break; 118 | } 119 | if (p) 120 | s = p + 1; 121 | else 122 | break; 123 | } 124 | } 125 | fclose(f); 126 | 127 | u8ident_free(); 128 | return 0; 129 | } 130 | -------------------------------------------------------------------------------- /hangul.h: -------------------------------------------------------------------------------- 1 | /* 2 | * This file is built by cperl regen/regcharclass-safec.pl. 3 | * cp_high means only codepoints > 0xff 4 | */ 5 | 6 | #ifndef H_HANGUL 7 | #define H_HANGUL 1 8 | 9 | /* 10 | HANGUL (Korean) has special normalization rules. 
/* Hangul (Korean) constants and classification macros used by the
   normalization code. Every macro argument is evaluated more than once, so
   do not pass expressions with side effects. */
// clang-format off
#define Hangul_SBase  0xAC00
#define Hangul_SFinal 0xD7A3
#define Hangul_SCount 11172

#define Hangul_NCount 588

#define Hangul_LBase  0x1100
#define Hangul_LFinal 0x1112
#define Hangul_LCount 19

#define Hangul_VBase  0x1161
#define Hangul_VFinal 0x1175
#define Hangul_VCount 21

#define Hangul_TBase  0x11A7
#define Hangul_TFinal 0x11C2
#define Hangul_TCount 28

/* precomposed syllable */
#define Hangul_IsS(u)  ((Hangul_SBase <= (u)) && ((u) <= Hangul_SFinal))
/* offset from SBase is a multiple of TCount; only meaningful together with
   Hangul_IsS, see Hangul_IsLV */
#define Hangul_IsN(u)  (((u) - Hangul_SBase) % Hangul_TCount == 0)
#define Hangul_IsLV(u) (Hangul_IsS(u) && Hangul_IsN(u))
#define Hangul_IsL(u)  ((Hangul_LBase <= (u)) && ((u) <= Hangul_LFinal))
#define Hangul_IsV(u)  ((Hangul_VBase <= (u)) && ((u) <= Hangul_VFinal))
/* the lower bound is exclusive: Hangul_TBase itself is not matched */
#define Hangul_IsT(u)  ((Hangul_TBase < (u)) && ((u) <= Hangul_TFinal))

/* much better would be the simple Hangul_IsS(cp) check */
/* All Hangul codepoints > 0xff. Same ranges as the perl5/cperl generated
   macro, written as a flat OR, and with (cp) parenthesized so that an
   expression argument such as `x & 0xFFFF` is not torn apart by operator
   precedence.
   NOTE(review): the original is generated by cperl
   regen/regcharclass-safec.pl; port this fix to the generator. */
#define is_HANGUL_cp_high(cp)                    \
  ( ( 0x1100 <= (cp) && (cp) <= 0x11FF ) ||      \
    ( 0x302E <= (cp) && (cp) <= 0x302F ) ||      \
    ( 0x3131 <= (cp) && (cp) <= 0x318E ) ||      \
    ( 0x3200 <= (cp) && (cp) <= 0x321E ) ||      \
    ( 0x3260 <= (cp) && (cp) <= 0x327E ) ||      \
    ( 0xA960 <= (cp) && (cp) <= 0xA97C ) ||      \
    ( 0xAC00 <= (cp) && (cp) <= 0xD7A3 ) ||      \
    ( 0xD7B0 <= (cp) && (cp) <= 0xD7C6 ) ||      \
    ( 0xD7CB <= (cp) && (cp) <= 0xD7FB ) ||      \
    ( 0xFFA0 <= (cp) && (cp) <= 0xFFBE ) ||      \
    ( 0xFFC2 <= (cp) && (cp) <= 0xFFC7 ) ||      \
    ( 0xFFCA <= (cp) && (cp) <= 0xFFCF ) ||      \
    ( 0xFFD2 <= (cp) && (cp) <= 0xFFD7 ) ||      \
    ( 0xFFDA <= (cp) && (cp) <= 0xFFDC ) )

// clang-format on 59 | #endif /* H_HANGUL */ 60 | -------------------------------------------------------------------------------- /htable.c: -------------------------------------------------------------------------------- 1 | /* libu8ident - Check unicode security guidelines for identifiers. 2 | Copyright 2022 Reini Urban 3 | SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later 4 | 5 | A simple open-addressing string hash table for the confusables check. 6 | No deletions needed. 7 | */ 8 | 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include "htable.h" 15 | 16 | struct htable * new_htab(unsigned cap) { 17 | struct htable *ht = malloc(sizeof(struct htable)); 18 | assert(cap % 2 == 0); 19 | ht->keys = calloc(cap, sizeof(char*)); 20 | ht->values = calloc(cap, sizeof(char*)); 21 | ht->cap = cap; 22 | ht->size = 0; 23 | return ht; 24 | } 25 | 26 | void free_htab(struct htable *htab) { 27 | for (unsigned i=0; i < htab->cap; i++) { 28 | if (htab->keys[i]) { 29 | free(htab->keys[i]); 30 | free(htab->values[i]); 31 | } 32 | } 33 | free(htab->keys); 34 | free(htab->values); 35 | htab->cap = htab->size = 0; 36 | return; 37 | } 38 | 39 | static inline float fill_htab(struct htable *htab) { 40 | return htab->size / htab->cap; 41 | } 42 | 43 | static inline uint32_t hash(const char *key) { 44 | uint32_t h = 2166136261; 45 | uint8_t *p = (uint8_t *)key; 46 | uint8_t c; 47 | while ((c = *p++)) { 48 | h ^= c; 49 | h *= 16777619; 50 | } 51 | return h; 52 | } 53 | 54 | void add_htab(struct htable *htab, const char *key, const char *value) { 55 | assert(htab); 56 | uint32_t h = hash(key) & (htab->cap - 1); 57 | uint32_t oh = h; 58 | assert(key); 59 | if (fill_htab(htab) > 0.75f) { 60 | struct htable *ht2; 61 | resize: 62 | ht2 = new_htab(htab->cap * 2); 63 | //fprintf(stderr, "resize %p[%lu] to %lu\n", htab, htab->size, htab->cap * 2); 64 | for (unsigned i=0; i < htab->cap; i++) { 65 | if (htab->keys[h]) { 66 | add_htab(ht2, htab->keys[h], 
htab->values[h]); 67 | } 68 | } 69 | free_htab(htab); 70 | memcpy(htab, ht2, sizeof *htab); // copy all over, and start again. 71 | free(ht2); 72 | oh = h = hash(key) & (htab->cap - 1); 73 | } 74 | while (htab->keys[h] && strcmp(htab->keys[h], key)) { // not found 75 | h++; 76 | if (h >= htab->cap) // wraparound 77 | h = 0; 78 | if (h == oh) 79 | goto resize; 80 | } 81 | if (!htab->keys[h]) { // found a free slot 82 | //fprintf(stderr, "add %s key to %p[%lu]\n", key, htab, htab->size+1); 83 | htab->keys[h] = strdup(key); 84 | htab->values[h] = strdup(value); 85 | htab->size++; 86 | } // else already exists, ignore 87 | return; 88 | } 89 | 90 | char * find_htab(struct htable *htab, const char *key) { 91 | assert(htab); 92 | if (!htab->size) 93 | return false; 94 | uint32_t h = hash(key) & (htab->cap - 1); 95 | const uint32_t oh = h; 96 | assert(key); 97 | while (htab->keys[h] && strcmp(htab->keys[h], key)) { // not found 98 | h++; 99 | if (h >= htab->cap) // wraparound 100 | h = 0; 101 | if (h == oh) 102 | return false; 103 | } 104 | //if (htab->keys[h]) 105 | // fprintf(stderr, "find %s -> %s in %p[%lu]\n", htab->keys[h], htab->values[h], htab, htab->size); 106 | return htab->keys[h] ? htab->values[h] : NULL; 107 | } 108 | -------------------------------------------------------------------------------- /htable.h: -------------------------------------------------------------------------------- 1 | /* libu8ident - Check unicode security guidelines for identifiers. 2 | Copyright 2022 Reini Urban 3 | SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later 4 | 5 | A simple open-addressing string hash table for the confusables check. 6 | No deletion needed. 
7 | */ 8 | 9 | #pragma once 10 | #include 11 | #include 12 | 13 | struct htable { 14 | unsigned size; 15 | unsigned cap; 16 | char **keys; 17 | char **values; 18 | }; 19 | 20 | struct htable * new_htab(unsigned cap); 21 | void free_htab(struct htable *htab); 22 | // adds a copy to the htable 23 | void add_htab(struct htable *htab, const char *key, const char *value); 24 | // return value 25 | char * find_htab(struct htable *htab, const char *key); 26 | -------------------------------------------------------------------------------- /include/u8ident.h: -------------------------------------------------------------------------------- 1 | /* libu8ident - Check unicode security guidelines for identifiers. 2 | Copyright 2021,2022 Reini Urban 3 | SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later 4 | */ 5 | 6 | #include 7 | #include 8 | 9 | #define U8IDENT_VERSION_MAJOR 0 10 | #define U8IDENT_VERSION_MINOR 0 11 | #define U8IDENT_UNICODE_VERSION 15 12 | 13 | enum u8id_norm { 14 | U8ID_NFC = 0, // the default, shorter canonical composed normalization 15 | U8ID_NFD = 1, // the longer, canonical decomposed normalization, as in the 16 | // previous Apple HPFS filesystem 17 | U8ID_NFKC = 2, // the compatibility composed normalization, as in Python 3 18 | U8ID_NFKD = 3, // the longer compatibility decomposed normalization 19 | U8ID_FCD = 4, // the faster variants 20 | U8ID_FCC = 5 21 | }; 22 | enum u8id_profile { 23 | U8ID_PROFILE_1 = 1, // ASCII only 24 | U8ID_PROFILE_2 = 2, // Single Script only 25 | U8ID_PROFILE_3 = 3, // Highly Restrictive 26 | U8ID_PROFILE_4 = 4, // Moderately Restrictive 27 | U8ID_PROFILE_5 = 5, // Minimally Restrictive 28 | U8ID_PROFILE_6 = 6, // Unrestricted 29 | U8ID_PROFILE_C11_6 = 7, // The C11 std 30 | // PROFILE_4 + Greek with only Allowed ID's ("SAFEC26") 31 | U8ID_PROFILE_C26_4 = 8, 32 | }; 33 | enum u8id_options { 34 | // clang-format off 35 | // Note: The parser/tokenizer should do that. Without, the checker can be 36 | // faster. 
37 | // Can be disallowed with --u8id-tr31=NONE, hardcoded with --u8id-tr31=. 38 | U8ID_TR31_XID = 64, // ID without NFKC quirks, labelled stable, the default 39 | U8ID_TR31_ID = 65, // all letters, plus numbers, punctuation and marks. With 40 | // exotic scripts. 41 | U8ID_TR31_ALLOWED = 66, // TR39 ID with only recommended scripts. Allowed IdentifierStatus. 42 | U8ID_TR31_SAFEC26 = 67, // practical XID with TR39 security measures. see doc/D2528R1.md 43 | U8ID_TR31_C23 = 68, // XID with NFC from the upcoming C23 standard (P1949, N2828). 44 | U8ID_TR31_C11 = 69, // the stable insecure AltId ranges from the C11 standard, Annex D 45 | U8ID_TR31_ALLUTF8 = 70, // allow all > 128, e.g. D, php, nim, crystal 46 | U8ID_TR31_ASCII = 71, // only ASCII letters (as e.g. zig, j. older compilers) 47 | // room for more tr31 profiles 48 | 49 | U8ID_FOLDCASE = 128, 50 | U8ID_WARN_CONFUSABLE = 256, // requires -DHAVE_CONFUS 51 | U8ID_ERROR_CONFUSABLE = 512, // requires -DHAVE_CONFUS 52 | // clang-format on 53 | }; 54 | #define U8ID_TR31_MASK 127 55 | typedef unsigned u8id_ctx_t; 56 | 57 | #ifndef U8ID_NORM_DEFAULT 58 | # define U8ID_NORM_DEFAULT U8ID_NFC 59 | #endif 60 | #ifndef U8ID_PROFILE_DEFAULT 61 | # define U8ID_PROFILE_DEFAULT U8ID_PROFILE_4 62 | #endif 63 | 64 | #ifndef _U8ID_PRIVATE_H 65 | // from outside the dll 66 | # if defined _WIN32 || defined __CYGWIN__ 67 | # define EXTERN __declspec(dllimport) 68 | # else 69 | # define EXTERN extern 70 | # endif 71 | //# define LOCAL 72 | #else 73 | // inside the dll 74 | # if defined _WIN32 || defined __CYGWIN__ 75 | # define EXTERN __declspec(dllexport) 76 | # define LOCAL 77 | # elif __GNUC__ >= 4 78 | # define EXTERN __attribute__((visibility("default"))) 79 | # define LOCAL __attribute__((visibility("hidden"))) 80 | # else 81 | # define EXTERN 82 | # define LOCAL 83 | # endif 84 | #endif 85 | 86 | /* Initialize the library with a tr39 profile, normalization and bitmask of 87 | options, which define more performed checks. 
Recommended is 88 | `(U8ID_PROFILE_DEFAULT, U8ID_NORM_DEFAULT, 0)`. return -1 on error, 0 if 89 | options are valid. 90 | */ 91 | EXTERN int u8ident_init(enum u8id_profile, enum u8id_norm, unsigned options); 92 | 93 | /* maxlength of an identifier. Default: 1024. Beware that such longs 94 | identifiers, are not really identifiable anymore, and keep them under 80 or 95 | even less. Some filesystems do allow now 32K identifiers, which is a glaring 96 | security hole, waiting to be exploited */ 97 | EXTERN void u8ident_set_maxlength(unsigned maxlen); 98 | 99 | /* Generates a new identifier document/context/directory, which 100 | initializes a new list of seen scripts. Contexts are optional, by 101 | default all checks are done in the same context 0. With compilers 102 | and interpreters a context is a source file, with filesystems a directory, 103 | with usernames you may choose if you need to support different languages at 104 | once. 105 | I cannot think of any such usage, so better avoid contexts with usernames to 106 | avoid mixups. */ 107 | EXTERN u8id_ctx_t u8ident_new_ctx(void); 108 | 109 | /* Changes to the context previously generated with `u8ident_new_ctx`. */ 110 | EXTERN int u8ident_set_ctx(u8id_ctx_t ctx); 111 | 112 | /* Optionally adds a script to the context, if it's known or declared 113 | beforehand. Such as `use utf8 "Greek";` in cperl. 114 | 115 | All http://www.unicode.org/reports/tr31/#Table_Recommended_Scripts 116 | need not to be declared beforehand. 
117 | 118 | Common Inherited Arabic Armenian Bengali Bopomofo Cyrillic 119 | Devanagari Ethiopic Georgian Greek Gujarati Gurmukhi Hangul Han Hebrew 120 | Hiragana Katakana Kannada Khmer Lao Latin Malayalam Myanmar Oriya 121 | Sinhala Tamil Telugu Thaana Thai Tibetan 122 | 123 | All http://www.unicode.org/reports/tr31/#Table_Limited_Use_Scripts need 124 | to be or are disallowed by profile: 125 | 126 | Adlam Balinese Bamum Batak Canadian_Aboriginal Chakma Cham Cherokee 127 | Hanifi_Rohingya Javanese Kayah_Li Lepcha Limbu Lisu Mandaic 128 | Meetei_Mayek Miao New_Tai_Lue Newa Nko Nyiakeng_Puachue_Hmong Ol_Chiki 129 | Osage Saurashtra Sundanese Syloti_Nagri Syriac Tai_Le Tai_Tham 130 | Tai_Viet Tifinagh Vai Wancho Yi Unknown 131 | 132 | All others need to be added with u8ident_add_script_name(). 133 | */ 134 | EXTERN int u8ident_add_script_name(const char *name); 135 | EXTERN int u8ident_add_script(uint8_t script); 136 | 137 | EXTERN uint8_t u8ident_get_script(const uint32_t cp); 138 | EXTERN const char *u8ident_script_name(const int scr); 139 | 140 | /* Deletes the context generated with `u8ident_new_ctx`. This is 141 | optional, all remaining contexts are deleted by `u8ident_free` */ 142 | EXTERN int u8ident_free_ctx(u8id_ctx_t ctx); 143 | 144 | /* End this library, cleaning up all internal structures. */ 145 | EXTERN void u8ident_free(void); 146 | 147 | /* Returns a freshly allocated normalized string, in the option defined at 148 | `u8ident_init`. Defaults to U8ID_NFC. */ 149 | EXTERN char *u8ident_normalize(const char *buf, int len); 150 | 151 | /* 152 | Lookup if the codepoint is a confusable. Only with --enable-confus 153 | -DHAVE_CONFUS. With --with-croaring -DHAVE_CROARING this is 154 | twice as fast, and needs half the size. 
155 | */ 156 | EXTERN bool u8ident_is_confusable(const uint32_t cp); 157 | 158 | enum u8id_errors { 159 | U8ID_EOK = 0, 160 | U8ID_EOK_NORM = 1, 161 | U8ID_EOK_WARN_CONFUS = 2, 162 | U8ID_EOK_NORM_WARN_CONFUS = 3, 163 | U8ID_ERR_XID = -1, 164 | U8ID_ERR_SCRIPT = -2, 165 | U8ID_ERR_SCRIPTS = -3, 166 | U8ID_ERR_ENCODING = -4, 167 | U8ID_ERR_COMBINE = -5, 168 | U8ID_ERR_CONFUS = -6, 169 | }; 170 | 171 | /* Two variants to check if this identifier is valid. u8ident_check_buf avoids 172 | allocating a fresh string from the parsed input. buf need not be 173 | zero-terminated, its length is given by len. 174 | 175 | Return values (enum u8id_errors): 176 | * 0 - valid without need to normalize. 177 | * 1 - valid with need to normalize. 178 | * 2 - warn about confusable 179 | * 3 - warn about confusable and need to normalize 180 | * -1 - invalid xid, disallowed via IdentifierStatus.txt 181 | * -2 - invalid script 182 | * -3 - invalid mixed scripts 183 | * -4 - invalid encoding 184 | * -5 - invalid combining mark; -6 - invalid because confusable (not yet implemented) 185 | outnorm is set to a fresh normalized string if valid. 186 | 187 | Note that in the check we explicitly allow the Latin confusables: 0 1 I ` 188 | i.e. U+30, U+31, U+49, U+60 189 | */ 190 | EXTERN enum u8id_errors u8ident_check(const uint8_t *string, char **outnorm); 191 | EXTERN enum u8id_errors u8ident_check_buf(const char *buf, const int len, 192 | char **outnorm); 193 | 194 | /* A different, but much less reliable check strategy via 195 | confusables.txt only, described in TR 39, Section 4, the skeleton 196 | algorithm. Each identifier is stored in two dynamic hash tables, 197 | and for each confusable match, normalized to NFC, the first 198 | wins. Only with `--enable-confus / -DHAVE_CONFUS`. 199 | */ 200 | EXTERN enum u8id_errors u8ident_check_confusables(const char *buf, const int len); 201 | 202 | /* returns the failing codepoint, which failed in the last check.
*/ 203 | EXTERN uint32_t u8ident_failed_char(const u8id_ctx_t ctx); 204 | /* returns the constant script name, which failed in the last check. */ 205 | EXTERN const char *u8ident_failed_script_name(const u8id_ctx_t ctx); 206 | 207 | /* Returns a fresh string of the list of the seen scripts in this 208 | context whenever a mixed script error occurs. Needed for the error message 209 | "Invalid script %s, already have %s", where the 2nd %s is returned by this 210 | function. The returned string needs to be freed by the user. 211 | 212 | Usage: 213 | 214 | if (u8ident_check((const uint8_t *)"wrongᴧᴫ", NULL) == U8ID_ERR_SCRIPTS) { 215 | const char *scripts = u8ident_existing_scripts(ctx); 216 | fprintf(stdout, "Invalid script %s for U+%X, already have %s.\n", 217 | u8ident_failed_script_name(ctx), u8ident_failed_char(ctx), 218 | scripts); 219 | free((char *)scripts); 220 | } 221 | */ 222 | EXTERN const char *u8ident_existing_scripts(const u8id_ctx_t ctx); 223 | -------------------------------------------------------------------------------- /libu8ident.pc.in: -------------------------------------------------------------------------------- 1 | # @configure_input@ 2 | prefix=@prefix@ 3 | exec_prefix=@exec_prefix@ 4 | libdir=@libdir@ 5 | includedir=@includedir@ 6 | 7 | Name: @PACKAGE@ 8 | Description: check unicode security guidelines for identifiers 9 | Version: @VERSION@ 10 | URL: @PACKAGE_URL@ 11 | Libs: -L${libdir} -lu8ident 12 | Cflags: -I${includedir} 13 | -------------------------------------------------------------------------------- /m4/ax_gcc_builtin.m4: -------------------------------------------------------------------------------- 1 | # =========================================================================== 2 | # https://www.gnu.org/software/autoconf-archive/ax_gcc_builtin.html 3 | # =========================================================================== 4 | # 5 | # SYNOPSIS 6 | # 7 | # AX_GCC_BUILTIN(BUILTIN) 8 | # 9 | # DESCRIPTION 10 | # 11 | # This macro checks if the compiler
supports one of GCC's built-in 12 | # functions; many other compilers also provide those same built-ins. 13 | # 14 | # The BUILTIN parameter is the name of the built-in function. 15 | # 16 | # If BUILTIN is supported define HAVE_<BUILTIN>. Keep in mind that since 17 | # builtins usually start with two underscores they will be copied over 18 | # into the HAVE_<BUILTIN> definition (e.g. HAVE___BUILTIN_EXPECT for 19 | # __builtin_expect()). 20 | # 21 | # The macro caches its result in the ax_cv_have_<BUILTIN> variable (e.g. 22 | # ax_cv_have___builtin_expect). 23 | # 24 | # The macro currently supports the following built-in functions: 25 | # 26 | # __builtin_assume_aligned 27 | # __builtin_bswap16 28 | # __builtin_bswap32 29 | # __builtin_bswap64 30 | # __builtin_choose_expr 31 | # __builtin___clear_cache 32 | # __builtin_clrsb 33 | # __builtin_clrsbl 34 | # __builtin_clrsbll 35 | # __builtin_clz 36 | # __builtin_clzl 37 | # __builtin_clzll 38 | # __builtin_complex 39 | # __builtin_constant_p 40 | # __builtin_ctz 41 | # __builtin_ctzl 42 | # __builtin_ctzll 43 | # __builtin_expect 44 | # __builtin_ffs 45 | # __builtin_ffsl 46 | # __builtin_ffsll 47 | # __builtin_fpclassify 48 | # __builtin_huge_val 49 | # __builtin_huge_valf 50 | # __builtin_huge_vall 51 | # __builtin_inf 52 | # __builtin_infd128 53 | # __builtin_infd32 54 | # __builtin_infd64 55 | # __builtin_inff 56 | # __builtin_infl 57 | # __builtin_isinf_sign 58 | # __builtin_nan 59 | # __builtin_nand128 60 | # __builtin_nand32 61 | # __builtin_nand64 62 | # __builtin_nanf 63 | # __builtin_nanl 64 | # __builtin_nans 65 | # __builtin_nansf 66 | # __builtin_nansl 67 | # __builtin_object_size 68 | # __builtin_parity 69 | # __builtin_parityl 70 | # __builtin_parityll 71 | # __builtin_popcount 72 | # __builtin_popcountl 73 | # __builtin_popcountll 74 | # __builtin_powi 75 | # __builtin_powif 76 | # __builtin_powil 77 | # __builtin_prefetch 78 | # __builtin_trap 79 | # __builtin_types_compatible_p 80 | # __builtin_unreachable 81 | # 82 | #
Unsupported built-ins will be tested with an empty parameter set and the 83 | # result of the check might be wrong or meaningless so use with care. 84 | # 85 | # LICENSE 86 | # 87 | # Copyright (c) 2013 Gabriele Svelto 88 | # Copyright (c) 2018 Reini Urban 89 | # 90 | # Copying and distribution of this file, with or without modification, are 91 | # permitted in any medium without royalty provided the copyright notice 92 | # and this notice are preserved. This file is offered as-is, without any 93 | # warranty. 94 | 95 | #serial 7 96 | 97 | AC_DEFUN([AX_GCC_BUILTIN], [ 98 | AS_VAR_PUSHDEF([ac_var], [ax_cv_have_$1]) 99 | 100 | AC_CACHE_CHECK([for $1], [ac_var], [ 101 | AC_LINK_IFELSE([AC_LANG_PROGRAM([], [ 102 | m4_case([$1], 103 | [__builtin___mbsnrtowcs_chk], [$1("", "", 0, 0, "", 0)], 104 | [__builtin___mbsrtowcs_chk], [$1("", "", 0, "", 0)], 105 | [__builtin___mbstowcs_chk], [$1("", "", 0, 0)], 106 | [__builtin___memcpy_chk], [$1("", "", 0, 0)], 107 | [__builtin___memmove_chk], [$1("", "", 0, 0)], 108 | [__builtin___mempcpy_chk], [$1("", "", 0, 0)], 109 | [__builtin___memset_chk], [$1("", 0, 0, 0)], 110 | [__builtin___strcpy_chk], [$1("", "", 0)], 111 | [__builtin___stpcpy_chk], [$1("", "", 0)], 112 | [__builtin___stpncpy_chk], [$1("", "", 0, 0)], 113 | [__builtin___strncpy_chk], [$1("", "", 0, 0)], 114 | [__builtin___strcat_chk], [$1("", "", 0)], 115 | [__builtin___strncat_chk], [$1("", "", 0, 0)], 116 | [__builtin___printf_chk], [$1(0, "")], 117 | [__builtin___read_chk], [$1(0, "", 0, 0)], 118 | [__builtin___sprintf_chk], [$1("", 0, 0, "")], 119 | [__builtin___snprintf_chk], [$1("", 0, 0, 0, "")], 120 | [__builtin___swprintf_chk], [$1("", 0, 0, 0, "")], 121 | [__builtin___vfprintf_chk], [$1("", 0, "", 0)], 122 | [__builtin___vfwprintf_chk], [$1("", 0, "", 0)], 123 | [__builtin___vprintf_chk], [$1(0, "", 0)], 124 | [__builtin___vsnprintf_chk], [$1("", 0, 0, 0, "", 0)], 125 | [__builtin___vsprintf_chk], [$1("", 0, 0, "", 0)], 126 | [__builtin___vswprintf_chk], 
[$1("", 0, 0, 0, "", 0)], 127 | [__builtin___vwprintf_chk], [$1(0, "", 0)], 128 | [__builtin___wcrtomb_chk], [$1("", "", "", 0)], 129 | [__builtin___wcscat_chk], [$1("", "", 0)], 130 | [__builtin___wcscpy_chk], [$1("", "", 0)], 131 | [__builtin___wcsncat_chk], [$1("", "", 0, 0)], 132 | [__builtin___wcsncpy_chk], [$1("", "", 0, 0)], 133 | [__builtin___wcsnrtombs_chk], [$1("", "", 0, 0, "", 0)], 134 | [__builtin___wcsrtombs_chk], [$1("", "", 0, "", 0)], 135 | [__builtin___wcstombs_chk], [$1("", "", 0, 0)], 136 | [__builtin___wctomb_chk], [$1("", 0, 0)], 137 | [__builtin___wmemcpy_chk], [$1("", "", 0, 0)], 138 | [__builtin___wmemmove_chk], [$1("", "", 0, 0)], 139 | [__builtin___wmempcpy_chk], [$1("", "", 0, 0)], 140 | [__builtin___wmemset_chk], [$1("", "", 0, 0)], 141 | [__builtin___wprintf_chk], [$1(0, "")], 142 | [__builtin___bnd_set_ptr_bounds], [$1("", 0)], 143 | [__builtin___bnd_narrow_ptr_bounds], [$1("", "", 0)], 144 | [__builtin___bnd_copy_ptr_bounds], [$1("", "")], 145 | [__builtin___bnd_init_ptr_bounds], [$1("")], 146 | [__builtin___bnd_null_ptr_bounds], [$1("")], 147 | [__builtin___bnd_store_ptr_bounds], [$1("", "")], 148 | [__builtin___bnd_chk_ptr_lbounds], [$1("")], 149 | [__builtin___bnd_chk_ptr_ubounds], [$1("")], 150 | [__builtin___bnd_chk_ptr_bounds], [$1("", 0)], 151 | [__builtin___bnd_get_ptr_lbound], [$1("")], 152 | [__builtin___bnd_get_ptr_ubound], [$1("")], 153 | [__builtin_assume_aligned], [$1("", 0)], 154 | [__builtin_bswap16], [$1(0)], 155 | [__builtin_bswap32], [$1(0)], 156 | [__builtin_bswap64], [$1(0)], 157 | [__builtin_choose_expr], [if($1(1, 1, 0)) 1], 158 | [__builtin___clear_cache], [$1("", "")], 159 | [__builtin_clrsb], [$1(0)], 160 | [__builtin_clrsbl], [$1(0)], 161 | [__builtin_clrsbll], [$1(0)], 162 | [__builtin_clz], [$1(0)], 163 | [__builtin_clzl], [$1(0)], 164 | [__builtin_clzll], [$1(0)], 165 | [__builtin_complex], [$1(0.0, 0.0)], 166 | [__builtin_constant_p], [$1(0)], 167 | [__builtin_ctz], [$1(0)], 168 | [__builtin_ctzl], 
[$1(0)], 169 | [__builtin_ctzll], [$1(0)], 170 | [__builtin_expect], [$1(0, 0)], 171 | [__builtin_ffs], [$1(0)], 172 | [__builtin_ffsl], [$1(0)], 173 | [__builtin_ffsll], [$1(0)], 174 | [__builtin_fpclassify], [$1(0, 1, 2, 3, 4, 0.0)], 175 | [__builtin_huge_val], [$1()], 176 | [__builtin_huge_valf], [$1()], 177 | [__builtin_huge_vall], [$1()], 178 | [__builtin_inf], [$1()], 179 | [__builtin_infd128], [$1()], 180 | [__builtin_infd32], [$1()], 181 | [__builtin_infd64], [$1()], 182 | [__builtin_inff], [$1()], 183 | [__builtin_infl], [$1()], 184 | [__builtin_isinf_sign], [$1(0.0)], 185 | [__builtin_nan], [$1("")], 186 | [__builtin_nand128], [$1("")], 187 | [__builtin_nand32], [$1("")], 188 | [__builtin_nand64], [$1("")], 189 | [__builtin_nanf], [$1("")], 190 | [__builtin_nanl], [$1("")], 191 | [__builtin_nans], [$1("")], 192 | [__builtin_nansf], [$1("")], 193 | [__builtin_nansl], [$1("")], 194 | [__builtin_object_size], [$1("", 0)], 195 | [__builtin_parity], [$1(0)], 196 | [__builtin_parityl], [$1(0)], 197 | [__builtin_parityll], [$1(0)], 198 | [__builtin_popcount], [$1(0)], 199 | [__builtin_popcountl], [$1(0)], 200 | [__builtin_popcountll], [$1(0)], 201 | [__builtin_powi], [$1(0, 0)], 202 | [__builtin_powif], [$1(0, 0)], 203 | [__builtin_powil], [$1(0, 0)], 204 | [__builtin_prefetch], [$1("")], 205 | [__builtin_trap], [$1()], 206 | [__builtin_types_compatible_p], [$1(int, int)], 207 | [__builtin_unreachable], [$1()], 208 | [m4_warn([syntax], [Unsupported built-in $1, the test may fail]) 209 | $1()] 210 | ) 211 | ])], 212 | [AS_VAR_SET([ac_var], [yes])], 213 | [AS_VAR_SET([ac_var], [no])]) 214 | ]) 215 | 216 | AS_IF([test yes = AS_VAR_GET([ac_var])], 217 | [AC_DEFINE_UNQUOTED(AS_TR_CPP(HAVE_$1), 1, 218 | [Define to 1 if the system has the `$1' built-in function])], []) 219 | 220 | AS_VAR_POPDEF([ac_var]) 221 | ]) 222 | -------------------------------------------------------------------------------- /makefile.gnu: 
-------------------------------------------------------------------------------- 1 | # -*- Makefile -*- 2 | #DEFS := -DU8ID_NORM=NFKC -DU8ID_PROFILE=4 -DDISABLE_CHECK_XID 3 | DEFINES = ${DEFS} -DHAVE___BUILTIN_FFS 4 | HAVE_CONFUS := 1 5 | CC := cc 6 | CFLAGS := -Wall -Wextra 7 | AR := ar 8 | RANLIB := ranlib 9 | GPERF := gperf 10 | # dnf install rubygem-ronn-ng 11 | RONN := ronn 12 | # Maintainer only 13 | VERSION := $(shell build-aux/git-version-gen .version) 14 | SO_MAJ = 1 15 | DEFINES += -DPACKAGE_VERSION="\"$(VERSION)\"" 16 | # This should to be a recent perl, matching the target unicode version 17 | PERL := perl 18 | WGET := wget 19 | PANDOC := pandoc 20 | # End Maintainer-only 21 | 22 | HEADER = include/u8ident.h 23 | NORMHDRS = un8ifcan.h un8ifcmb.h un8ifcmp.h un8ifcpt.h un8ifexc.h 24 | HDRS = u8id_private.h u8id_gc.h scripts.h $(NORMHDRS) hangul.h \ 25 | mark.h medial.h unic11.h scripts16.h htable.h 26 | SRC = u8ident.c u8idscr.c u8idnorm.c 27 | ifeq (${HAVE_CONFUS}, 1) 28 | SRC += u8idroar.c htable.c 29 | HDRS += u8idroar.h confus.h gconfus.h 30 | DEFINES += -DHAVE_CONFUS 31 | endif 32 | ifneq (,$(wildcard /usr/include/sys/stat.h)) 33 | DEFINES += -DHAVE_SYS_STAT_H 34 | endif 35 | ifneq (,$(wildcard /usr/include/dirent.h)) 36 | DEFINES += -DHAVE_DIRENT_H 37 | endif 38 | ifneq (,$(wildcard /usr/include/getopt.h)) 39 | DEFINES += -DHAVE_GETOPT_H 40 | endif 41 | ifneq (,$(wildcard /usr/include/uniwbrk.h)) 42 | DEFINES += -DHAVE_UNIWBRK_H -DHAVE_LIBUNISTRING 43 | LIBUNISTR = -lunistring 44 | else 45 | LIBUNISTR = 46 | endif 47 | ifneq (,$(wildcard roaring.c)) 48 | DEFINES += -DHAVE_CROARING 49 | HDRS += confus_croar.h roaring.h 50 | else 51 | ifneq (,$(wildcard ../CRoaring/roaring.c)) 52 | DEFINES += -DHAVE_CROARING 53 | CFLAGS += -I../CRoaring 54 | HDRS += confus_croar.h roaring.h 55 | endif 56 | endif 57 | ALLHDRS = $(HDRS) unic26.h 58 | #OBJS = u8ident.o u8idscr.o u8idnorm.o u8idroar.o 59 | OBJS = $(SRC:.c=.o) 60 | LIB = libu8ident.a 61 | SOLIB = 
libu8ident.so 62 | PCXX = P2528R0 63 | NC = n2916 64 | DCURCXX = D2528R1 65 | NCURC = n2932 66 | DOCS = README.md NEWS NOTICE LICENSE doc/c11.md doc/$(PCXX).html doc/$(DCURCXX).html \ 67 | doc/$(PCXX).md doc/$(DCURCXX).md doc/$(NC).html doc/$(NC).md doc/$(NCURC).patch \ 68 | doc/$(NCURC).html doc/$(NCURC).md \ 69 | doc/tr31-bugs.md 70 | MAN3 = u8ident.3 71 | MAN1 = u8idlint.1 72 | MAN = $(MAN1) $(MAN3) 73 | PREFIX = usr 74 | PKG = libu8ident-$(VERSION) 75 | PKG_BIN = $(PKG)-`uname -m` 76 | 77 | CFLAGS_REL = $(CFLAGS) -O2 -DNDEBUG 78 | CFLAGS_PERF = $(CFLAGS) -O2 -DNDEBUG 79 | CFLAGS_DBG = $(CFLAGS) -g -DDEBUG 80 | LTOFLAGS = 81 | 82 | MACHINE := $(shell uname -m) 83 | ifeq (x86_64,$(MACHINE)) 84 | DEFINES += -DHAVE_SYS_STAT_H 85 | LTOFLAGS = -flto 86 | CFLAGS_REL += -march=native 87 | CFLAGS_PERF += -march=native 88 | endif 89 | 90 | RUN_LD_PATH=LD_LIBRARY_PATH=. 91 | OS := $(shell uname -s) 92 | ifeq (Darwin,$(OS)) 93 | RUN_LD_PATH=DYLD_LIBRARY_PATH=. 94 | else 95 | ifeq (AIX,$(OS)) 96 | RUN_LD_PATH=LIBPATH=. 97 | else 98 | ifeq (HP-UX,$(OS)) 99 | RUN_LD_PATH=SHLIB_PATH=. 100 | endif 101 | endif 102 | endif 103 | 104 | # cc prints name as cc, not gcc. so check for the copyright banner 105 | CC_COPY := $(shell $(CC) --version | head -n2 | tail -n1 | cut -c1-7) 106 | # gcc has "Copyright (C) 2020 Free Software Foundation, Inc" 107 | # clang would have "Target: ..." 
108 | # icc would have "Copyright (c) (C) 1985-2014 Intel Corporation" 109 | # pcc has "Portable C Compiler 1.2.0.DEVEL 20200630 for x86_64-pc-linux-gnu" on the 1st line 110 | # chibicc has no --version, just --help 111 | ifeq (Copyrig,$(CC_COPY)) 112 | IS_GCC = 1 113 | CFLAGS_REL += -Werror -Wno-return-local-addr 114 | CFLAGS_PERF += -Wno-return-local-addr 115 | CFLAGS_DBG += -Wno-return-local-addr 116 | else 117 | ifeq (Target:,$(CC_COPY)) 118 | IS_CLANG = 1 119 | CFLAGS_REL += -Werror 120 | else 121 | ifeq (Portable C Compiler,$(shell $(CC) --version | cut -c1-19)) 122 | IS_PCC = 1 123 | CFLAGS_REL += -Werror 124 | endif 125 | endif 126 | endif 127 | 128 | most: $(LIB) $(SOLIB) u8idlint doc/$(DCURCXX).html 129 | 130 | all: mkc26 most test-texts test perf docs 131 | 132 | .version: makefile.gnu 133 | build-aux/git-version-gen .version 134 | 135 | .c.o: 136 | $(CC) $(CFLAGS_REL) $(LTOFLAGS) $(DEFINES) -Iinclude -c $< -o $@ 137 | .c.i: 138 | $(CC) $(CFLAGS) $(DEFINES) -Iinclude -E -c $< -o $@ 139 | u8idnorm.o: u8idnorm.c u8id_private.h hangul.h $(NORMHDRS) $(HEADER) 140 | $(CC) $(CFLAGS_REL) $(LTOFLAGS) $(DEFINES) -Iinclude -c u8idnorm.c -o $@ 141 | u8idroar.o: u8idroar.c u8id_private.h $(HEADER) confus_croar.h 142 | $(CC) $(CFLAGS_REL) $(LTOFLAGS) $(DEFINES) -Iinclude -c u8idroar.c -o $@ 143 | 144 | $(LIB): $(SRC) $(HEADER) $(ALLHDRS) $(OBJS) 145 | $(AR) $(ARFLAGS) $@ $(OBJS) 146 | $(RANLIB) $@ 147 | 148 | $(SOLIB): $(SRC) $(HEADER) $(ALLHDRS) 149 | $(CC) $(CFLAGS_REL) $(LTOFLAGS) -shared -fPIC $(DEFINES) -Iinclude \ 150 | -Wl,-soname,$(SOLIB).$(SO_MAJ) -o $@.$(SO_MAJ) $(SRC) 151 | -rm -f $(SOLIB) 152 | ln -s $(SOLIB).$(SO_MAJ) $(SOLIB) 153 | 154 | scripts.h scripts16.h: mkscripts.pl # Scripts.txt ScriptExtensions.txt DerivedNormalizationProps.txt 155 | $(PERL) mkscripts.pl 156 | gconfus.h.in: mkconfus.pl # confusables.txt 157 | $(PERL) mkconfus.pl -c 158 | confus.h: mkconfus.pl mkroar.c # confusables.txt 159 | $(PERL) mkconfus.pl -c 160 | gconfus.h: gconfus.h.in 
161 | $(GPERF) -n gconfus.h.in > gconfus.h.tmp && sed -e's,static const unsigned int asso_values,(void)len; static const unsigned int asso_values,' <gconfus.h.tmp >gconfus.h && rm gconfus.h.tmp 162 | confus_croar.h: mkroar.c mkconfus.pl 163 | $(PERL) mkconfus.pl -c 164 | mark.h: mkmark.pl # UnicodeData.txt 165 | $(PERL) mkmark.pl 166 | u8id_gc.h: mkgc.pl # UnicodeData.txt 167 | $(PERL) mkgc.pl 168 | medial.h: mkmedial.pl # UnicodeData.txt 169 | $(PERL) mkmedial.pl 170 | allowed_croar.h nfkc_croar.h nfc_croar.h nfkd_croar.h nfd_croar.h: mkroar.c mkconfus.pl FORCE 171 | $(PERL) mkconfus.pl 172 | 173 | u8idlint: u8idlint.c unic26.h unic11.h $(LIB) 174 | $(CC) $(CFLAGS_REL) -fpie $(DEFINES) -I. -Iinclude u8idlint.c -o $@ $(LIB) $(LIBUNISTR) 175 | 176 | .PHONY: check check-all check-extra check-asan check-norms check-profiles check-tr31 check-mdl \ 177 | check-all-combinations clean regen-scripts regen-norm regen-confus regen-u8idlint-test \ 178 | install man dist-src dist clang-format docs 179 | 180 | ifeq (-DHAVE_CONFUS,$(DEFINES)) 181 | check: test test-texts u8idlint example 182 | ./test 183 | ./test-texts > texts.tst 184 | diff texts.tst texts/result.lst && rm texts.tst 185 | ./u8idlint.test 186 | $(RUN_LD_PATH) ./example 187 | else 188 | ifeq (-DHAVE_CONFUS -DHAVE_CROARING,$(DEFINES)) 189 | check: test test-texts u8idlint example 190 | ./test 191 | ./test-texts > texts.tst 192 | diff texts.tst texts/result.lst && rm texts.tst 193 | ./u8idlint.test 194 | $(RUN_LD_PATH) ./example 195 | else 196 | check: test test-texts u8idlint example 197 | ./test 198 | ./test-texts > texts.tst 199 | diff texts.tst texts/result.lst && rm texts.tst 200 | ./u8idlint.test 201 | $(RUN_LD_PATH) ./example 202 | endif 203 | endif 204 | 205 | check-all: check check-norms check-profiles check-tr31 check-asan 206 | check-extra: check-all check-all-combinations check-mdl 207 | shellcheck *.test test-all-fast.sh test-all.sh 208 | 209 | test: test.c $(SRC) $(HEADER) $(ALLHDRS) 210 | $(CC) $(CFLAGS_DBG)
$(DEFINES) -I. -Iinclude test.c $(SRC) -o test 211 | test-texts: test-texts.c $(SRC) $(HEADER) $(ALLHDRS) 212 | $(CC) $(CFLAGS_DBG) -O1 $(DEFINES) -I. -Iinclude test-texts.c $(SRC) -o test-texts $(LIBUNISTR) 213 | example: example.c $(SOLIB) 214 | $(CC) $(CFLAGS_DBG) $(DEFINES) -Iinclude example.c -o $@ -L. -lu8ident 215 | regen-u8idlint-test: u8idlint 216 | -./u8idlint -xsafec26 texts/homo-sec-1.c >texts/homo-sec-1.tst 217 | -./u8idlint -p1 -xc11 texts/homo-sec-1.c >texts/homo-sec-1-p1.tst 218 | -./u8idlint -xc11 texts/homo-1.c >texts/homo-1.tst 219 | -./u8idlint -xallowed texts/bidi-sec-1.c >texts/bidi-sec-1.tst 220 | -./u8idlint -xc11 texts/bidi-sec-1.c >texts/bidi-sec-1-c11.tst 221 | -./u8idlint -xsafec26 texts/bidi-sec-1.c >texts/bidi-sec-1-c26.tst 222 | -./u8idlint -xascii texts/bidi-sec-2.c >texts/bidi-sec-2-ascii.tst 223 | -./u8idlint -xallowed texts/bidi-sec-2.c >texts/bidi-sec-2-allowed.tst 224 | -./u8idlint -xid texts/bidi-sec-2.c >texts/bidi-sec-2.tst 225 | -./u8idlint -xc11 texts/bidi-sec-2.c >texts/bidi-sec-2-c11.tst 226 | 227 | c11-all.h unic26.h: mkc26 scripts.h mark.h medial.h 228 | ./mkc26 229 | mkc26: mkc26.c $(SRC) $(HEADER) $(HDRS) 230 | $(CC) $(CFLAGS_DBG) -O1 $(DEFINES) -DU8ID_PROFILE_SAFEC26 -I. -Iinclude mkc26.c $(SRC) -o $@ 231 | check-asan: test.c $(SRC) $(HEADER) $(ALLHDRS) 232 | $(CC) $(CFLAGS_DBG) $(DEFINES) -fsanitize=address -I. -Iinclude test.c $(SRC) -o test-asan 233 | ./test-asan 234 | # gem install mdl 235 | check-mdl: 236 | mdl *.md doc/*.md 237 | 238 | perf: perf.c u8idroar.c $(HEADER) $(ALLHDRS) \ 239 | nfkc_croar.h nfc_croar.h nfkd_croar.h nfd_croar.h allowed_croar.h confus_croar.h mark.h scripts16.h 240 | $(CC) $(CFLAGS_PERF) -Wno-unused-function $(DEFINES) -DPERF_TEST -I. 
-Iinclude \
	  perf.c u8idroar.c -o perf && \
	./perf

# Remove all build products, including the per-configuration test binaries
# which the check-* targets below leave behind when one of their runs fails.
# The names are generated with $(addprefix) instead of shell brace expansion:
# recipes are run by /bin/sh unless SHELL is overridden, and a POSIX sh does
# not expand {a,b}.
clean:
	-rm -f u8ident.o u8idnorm.o u8idscr.o u8idroar.o $(LIB) $(SOLIB) \
	       perf mkroar mkc26 u8idlint example \
	       test test-texts test-asan test-tr31 test-profiles \
	       $(addprefix test-prof,2 3 4 5 6 C26_4 C11_6 SAFEC26 C11STD) \
	       $(addprefix test-norm-,NFKC NFC FCC NFKD NFD FCD) \
	       $(addprefix test-xid-,ALLOWED SAFEC26 ID XID C11 C23 ALLUTF8 NONE)

# Maintainer-only
# Check coverage and sizes for all configure combinations
#
# Each loop below stops at the first configuration that fails to compile or
# whose test binary fails; the failing binary is kept for inspection and is
# removed by `clean`.
check-norms: $(SRC) $(HEADER) $(ALLHDRS)
	for n in NFKC NFC FCC NFKD NFD FCD; do \
	  echo $$n; \
	  $(CC) $(CFLAGS_REL) -DU8ID_NORM=$$n -Wfatal-errors -Iinclude -c u8idnorm.c -o u8idnorm.o || exit 1; \
	  ls -gGh u8idnorm.o; \
	  $(CC) $(CFLAGS_DBG) $(DEFINES) -DU8ID_NORM=$$n -I. -Iinclude test.c $(SRC) \
	    -o test-norm-$$n || exit 1; \
	  ./test-norm-$$n norm || exit 1; \
	  rm test-norm-$$n; \
	done
check-profiles: $(SRC) $(HEADER) $(ALLHDRS)
	for n in 2 3 4 5 6 C11_6 C26_4; do \
	  echo PROFILE_$${n}; \
	  $(CC) $(CFLAGS_DBG) $(DEFINES) -DU8ID_PROFILE=$$n -I. -Iinclude test.c $(SRC) \
	    -o test-prof$$n || exit 1; \
	  ./test-prof$$n profile || exit 1; \
	  rm test-prof$$n; \
	done
	for n in SAFEC26 C11STD; do \
	  echo PROFILE_$${n}; \
	  $(CC) $(CFLAGS_DBG) $(DEFINES) -DU8ID_PROFILE_$${n} -I. -Iinclude test.c $(SRC) \
	    -o test-prof$$n || exit 1; \
	  ./test-prof$$n profile || exit 1; \
	  rm test-prof$$n; \
	done
check-tr31: $(SRC) $(HEADER) $(ALLHDRS)
	for x in ALLOWED SAFEC26 ID XID C11 C23 ALLUTF8 NONE; do \
	  echo U8ID_TR31_$$x; \
	  $(CC) $(CFLAGS_DBG) $(DEFINES) -DU8ID_TR31=$$x -I. -Iinclude test.c $(SRC) \
	    -o test-xid-$$x || exit 1; \
	  ./test-xid-$$x xid || exit 1; \
	  rm test-xid-$$x; \
	done
# Builds and runs test.c for every NORM x PROFILE x TR31 combination.
# Combinations that pair a non-NFC normalization with C26_4, SAFEC26 or C23
# are only echoed as "skip".
check-all-combinations: $(SRC) $(HEADER) $(ALLHDRS)
	for n in NFKC NFC NFKD NFD FCD FCC; do \
	  for p in 2 3 4 5 6 C11_6 C26_4; do \
	    for x in ALLOWED SAFEC26 ID XID C11 C23 ALLUTF8 NONE; do \
	      if [ $$n != NFC ] && [ $$p = C26_4 -o $$x = SAFEC26 -o $$x = C23 ]; then \
	        echo "skip -DU8ID_NORM=$$n -DU8ID_PROFILE=$$p -DU8ID_TR31=$$x"; \
	      else \
	        echo "check -DU8ID_NORM=$$n -DU8ID_PROFILE=$$p -DU8ID_TR31=$$x"; \
	        $(CC) $(CFLAGS_DBG) $(DEFINES) -I. -Iinclude -DU8ID_PROFILE=$$p -DU8ID_NORM=$$n -DU8ID_TR31=$$x \
	          -Wfatal-errors test.c $(SRC) \
	          -o test-profiles && \
	        ./test-profiles || exit; \
	      fi; \
	    done; \
	  done; \
	done; \
	rm test-profiles

# Create the normalization headers via a current perl
Unicode-Normalize: un8ifcan.h
	if test -d Unicode-Normalize; then \
	  cd Unicode-Normalize && git pull --rebase && cd ..; \
	else \
	  git clone https://github.com/rurban/Unicode-Normalize; fi
regen-norm: Unicode-Normalize un8ifcan.h
	cd Unicode-Normalize && \
	  $(PERL) Makefile.PL && \
	  make && \
	  $(PERL) mkheader -ind -std && \
	  cd - && cp Unicode-Normalize/un8if*.h .
# Download some UCD files and create scripts.h
# ($(WGET) -N is invoked once per file; mkscripts.pl runs afterwards.)
regen-scripts:
	$(WGET) -N https://www.unicode.org/Public/UNIDATA/Scripts.txt
	$(WGET) -N https://www.unicode.org/Public/UNIDATA/ScriptExtensions.txt
	$(WGET) -N https://www.unicode.org/Public/UNIDATA/PropertyValueAliases.txt
	$(WGET) -N https://www.unicode.org/Public/UNIDATA/DerivedCoreProperties.txt
	$(WGET) -N https://www.unicode.org/Public/UNIDATA/DerivedNormalizationProps.txt
	$(WGET) -N https://www.unicode.org/Public/security/latest/IdentifierType.txt
	$(WGET) -N https://www.unicode.org/Public/security/latest/IdentifierStatus.txt
	$(PERL) mkscripts.pl
# Download confusables.txt and regenerate the confusables headers
regen-confus:
	$(WGET) -N https://www.unicode.org/Public/security/latest/confusables.txt
	$(PERL) mkconfus.pl

docs: $(DOCS)
# doc/P2528R0.* and doc/n2916.* are frozen
html: doc/$(DCURCXX).html doc/$(NCURC).html
pdf: doc/$(DCURCXX).pdf doc/$(NCURC).pdf
# The pandoc recipes are prefixed with `-`, so a failing or missing pandoc
# does not abort the build.
doc/$(DCURCXX).html: doc/$(DCURCXX).md
	-$(PANDOC) -s -o $@ doc/$(DCURCXX).md --metadata title="$(DCURCXX) - C++ Identifier Security using Unicode Standard Annex 39"
doc/$(DCURCXX).pdf: doc/$(DCURCXX).md
	-$(PANDOC) -s --pdf-engine=xelatex -o $@ doc/$(DCURCXX).md --variable mainfont="DejaVu Serif" --variable sansfont="DejaVu Sans" --variable monofont="DejaVu Sans Mono"
doc/$(NCURC).html: doc/$(NCURC).md
	-$(PANDOC) -s -o $@ doc/$(NCURC).md --metadata title="$(NCURC) - C Identifier Security using Unicode Standard Annex 39 v2"
doc/$(NCURC).pdf: doc/$(NCURC).md
	-$(PANDOC) -s --pdf-engine=xelatex -o $@ doc/$(NCURC).md --variable mainfont="DejaVu Serif" --variable sansfont="DejaVu Sans" --variable monofont="DejaVu Sans Mono" --metadata title="$(NCURC) - C Identifier Security using Unicode Standard Annex 39 v2"

# Writes a Dockerfile with pandoc, xetex and the DejaVu fonts, then builds the
# image tagged `pandoc` which docker-html and docker-pdf run.
Dockerfile.pandoc: makefile.gnu
	echo "FROM fedora:35" >Dockerfile.pandoc
	echo "RUN yum -y install pandoc texlive-xetex dejavu-sans-mono-fonts dejavu-serif-fonts dejavu-sans-fonts" >> Dockerfile.pandoc
	echo "RUN useradd --no-log-init -U user -u 1000 -m" >> Dockerfile.pandoc
	echo "USER user" >> Dockerfile.pandoc
	echo "WORKDIR /home/user" >> Dockerfile.pandoc
	docker build -t pandoc -f Dockerfile.pandoc .
# Both docker targets mount ./doc at /doc in the container and chown the
# produced file to $USER afterwards.
docker-html: Dockerfile.pandoc
	-docker run -u user -v `pwd`/doc:/doc -it pandoc pandoc -s -o /doc/$(DCURCXX).html /doc/$(DCURCXX).md
	chown $$USER:$$USER doc/$(DCURCXX).html
docker-pdf: Dockerfile.pandoc
	-docker run -u user -v `pwd`/doc:/doc -it pandoc pandoc -s --pdf-engine=xelatex -o /doc/$(DCURCXX).pdf /doc/$(DCURCXX).md --variable mainfont="DejaVu Serif" --variable sansfont="DejaVu Sans" --variable monofont="DejaVu Sans Mono"
	chown $$USER:$$USER doc/$(DCURCXX).pdf

# patch-c-doc applies doc/$(NCURC).patch to the C paper; regen-c-patch
# recreates that patch as the diff between the C++ and the C paper.
patch-c-doc:
	patch -f doc/$(NCURC).md doc/$(NCURC).patch
regen-c-patch:
	-diff -bu doc/$(DCURCXX).md doc/$(NCURC).md >doc/$(NCURC).patch

clang-format:
	clang-format -i *.c include/*.h scripts.h confus.h mark.h scripts16.h u8id*.h
GTAGS: $(SRC) $(HEADER) $(ALLHDRS)
	ls $(SRC) $(HEADER) $(ALLHDRS) | gtags -f -

# End Maintainer-only

man: $(MAN)

RONN_ARGS=--roff --manual "U8IDENT Manual $(VERSION)" --organization=rurban/libu8ident
# u8ident.3 is rendered from README.md, u8idlint.1 from the --help output of
# the built u8idlint binary.
u8ident.3: README.md
	$(RONN) $(RONN_ARGS) < README.md > $@
u8idlint.1: u8idlint
	LC_ALL=C help2man -N -s1 -p libu8ident --manual "U8IDENT Manual $(VERSION)" -o $@ ./u8idlint$(EXEEXT)

# Binary tarball: install into the staging directory $(PKG) and pack it.
dist: $(LIB) $(MAN) $(DOCS)
	-rm -rf $(PKG)
	$(MAKE) -f makefile.gnu install DESTDIR="$(PKG)"
	tar cfz $(PKG_BIN).tar.gz -C $(PKG) .
377 | -rm -rf $(PKG) 378 | 379 | dist-src: 380 | -rm -rf $(PKG) 381 | -mkdir -p $(PKG)/include 382 | cp $(HEADER) $(PKG)/include/ 383 | cp `git ls-tree -r --name-only HEAD` $(PKG)/ 384 | tar cfz $(PKG).tar.gz $(PKG) 385 | rm -rf $(PKG) 386 | 387 | install: $(LIB) $(MAN) 388 | -mkdir -p $(DESTDIR)/$(PREFIX)/include 389 | -mkdir -p $(DESTDIR)/$(PREFIX)/lib 390 | -mkdir -p $(DESTDIR)/$(PREFIX)/bin 391 | -mkdir -p $(DESTDIR)/$(PREFIX)/share/doc/libu8ident 392 | -mkdir -p $(DESTDIR)/$(PREFIX)/share/man/man3 393 | install -m0644 $(HEADER) $(DESTDIR)/$(PREFIX)/include 394 | install -m0644 $(LIB) $(DESTDIR)/$(PREFIX)/lib 395 | install -m0755 u8idlint $(DESTDIR)/$(PREFIX)/bin 396 | install -m0644 $(MAN1) $(DESTDIR)/$(PREFIX)/share/man/man1 397 | install -m0644 $(MAN3) $(DESTDIR)/$(PREFIX)/share/man/man3 398 | install -m0644 $(DOCS) $(DESTDIR)/$(PREFIX)/share/doc/libu8ident 399 | -------------------------------------------------------------------------------- /medial.h: -------------------------------------------------------------------------------- 1 | /* ex: set ro ft=c: -*- mode: c; buffer-read-only: t -*- */ 2 | /* libu8ident - Check unicode security guidelines for identifiers. 3 | Copyright 2022 Reini Urban 4 | SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later 5 | 6 | UCD Medial Letters and Forms, not at Start and not at End of a word. 7 | Includes MEDIAL and https://www.unicode.org/reports/tr31/#Table_Optional_Medial 8 | Generated by mkmedial.pl, do not modify. 9 | */ 10 | 11 | #ifdef EXTERN_SCRIPTS 12 | extern const struct range_bool medial_list[77]; 13 | #else 14 | const struct range_bool medial_list[] = { 15 | // clang-format off 16 | {0x27, 0x27}, // ' 17 | {0x2D, 0x2E}, // -... 
18 | {0x3A, 0x3A}, // : 19 | {0xB7, 0xB7}, // · 20 | {0x58A, 0x58A}, // ֊ 21 | {0x5F4, 0x5F4}, // ״ 22 | {0xF0B, 0xF0B}, // ་ 23 | {0x103B, 0x103E}, // ျ..ှ 24 | {0x105E, 0x1060}, // ၞ..ၠ 25 | {0x1082, 0x1082}, // ႂ 26 | {0x14EC, 0x14EC}, // ᓬ 27 | {0x1552, 0x1552}, // ᕒ 28 | {0x1A55, 0x1A56}, // ᩕ..ᩖ 29 | {0x200C, 0x200C}, // ‌ 30 | {0x2010, 0x2010}, // ‐ 31 | {0x2019, 0x2019}, // ’ 32 | {0x2027, 0x2027}, // ‧ 33 | {0x30A0, 0x30A0}, // ゠ 34 | {0x30FB, 0x30FB}, // ・ 35 | {0xFB55, 0xFB55}, // ﭕ 36 | {0xFB59, 0xFB59}, // ﭙ 37 | {0xFB5D, 0xFB5D}, // ﭝ 38 | {0xFB61, 0xFB61}, // ﭡ 39 | {0xFB65, 0xFB65}, // ﭥ 40 | {0xFB69, 0xFB69}, // ﭩ 41 | {0xFB6D, 0xFB6D}, // ﭭ 42 | {0xFB71, 0xFB71}, // ﭱ 43 | {0xFB75, 0xFB75}, // ﭵ 44 | {0xFB79, 0xFB79}, // ﭹ 45 | {0xFB7D, 0xFB7D}, // ﭽ 46 | {0xFB81, 0xFB81}, // ﮁ 47 | {0xFB91, 0xFB91}, // ﮑ 48 | {0xFB95, 0xFB95}, // ﮕ 49 | {0xFB99, 0xFB99}, // ﮙ 50 | {0xFB9D, 0xFB9D}, // ﮝ 51 | {0xFBA3, 0xFBA3}, // ﮣ 52 | {0xFBA9, 0xFBA9}, // ﮩ 53 | {0xFBAD, 0xFBAD}, // ﮭ 54 | {0xFBD6, 0xFBD6}, // ﯖ 55 | {0xFBE7, 0xFBE7}, // ﯧ 56 | {0xFBE9, 0xFBE9}, // ﯩ 57 | {0xFBFF, 0xFBFF}, // ﯿ 58 | {0xFCDF, 0xFCF4}, // ﳟ..ﳴ 59 | {0xFD34, 0xFD3B}, // ﴴ..ﴻ 60 | {0xFE77, 0xFE77}, // ﹷ 61 | {0xFE79, 0xFE79}, // ﹹ 62 | {0xFE7B, 0xFE7B}, // ﹻ 63 | {0xFE7D, 0xFE7D}, // ﹽ 64 | {0xFE7F, 0xFE7F}, // ﹿ 65 | {0xFE8C, 0xFE8C}, // ﺌ 66 | {0xFE92, 0xFE92}, // ﺒ 67 | {0xFE98, 0xFE98}, // ﺘ 68 | {0xFE9C, 0xFE9C}, // ﺜ 69 | {0xFEA0, 0xFEA0}, // ﺠ 70 | {0xFEA4, 0xFEA4}, // ﺤ 71 | {0xFEA8, 0xFEA8}, // ﺨ 72 | {0xFEB4, 0xFEB4}, // ﺴ 73 | {0xFEB8, 0xFEB8}, // ﺸ 74 | {0xFEBC, 0xFEBC}, // ﺼ 75 | {0xFEC0, 0xFEC0}, // ﻀ 76 | {0xFEC4, 0xFEC4}, // ﻄ 77 | {0xFEC8, 0xFEC8}, // ﻈ 78 | {0xFECC, 0xFECC}, // ﻌ 79 | {0xFED0, 0xFED0}, // ﻐ 80 | {0xFED4, 0xFED4}, // ﻔ 81 | {0xFED8, 0xFED8}, // ﻘ 82 | {0xFEDC, 0xFEDC}, // ﻜ 83 | {0xFEE0, 0xFEE0}, // ﻠ 84 | {0xFEE4, 0xFEE4}, // ﻤ 85 | {0xFEE8, 0xFEE8}, // ﻨ 86 | {0xFEEC, 0xFEEC}, // ﻬ 87 | {0xFEF4, 0xFEF4}, // ﻴ 88 | {0x1171D, 0x1171F}, // 𑜝..𑜟 89 | 
{0x11940, 0x11940}, // 𑥀 90 | {0x11942, 0x11942}, // 𑥂 91 | {0x1612A, 0x1612C}, // 𖄪..𖄬 92 | {0x1612E, 0x1612E}, // 𖄮 93 | // clang-format on 94 | }; // 8 ranges, 69 singles, 116 codepoints 95 | #endif 96 | -------------------------------------------------------------------------------- /mingw-cross.cmake: -------------------------------------------------------------------------------- 1 | # from linux or a bsd: 2 | # cmake . -DCMAKE_TOOLCHAIN_FILE=mingw-cross.cmake 3 | # from windows: cmake . -G "MinGW Makefiles" 4 | set(CMAKE_SYSTEM_NAME Windows) 5 | 6 | set(TOOLCHAIN_PREFIX x86_64-w64-mingw32) 7 | set(CMAKE_C_COMPILER ${TOOLCHAIN_PREFIX}-gcc) 8 | set(CMAKE_CXX_COMPILER ${TOOLCHAIN_PREFIX}-g++) 9 | 10 | # where is the target environment located 11 | set(CMAKE_FIND_ROOT_PATH /usr/${TOOLCHAIN_PREFIX}) 12 | 13 | # adjust the default behavior of the FIND_XXX() commands: 14 | # search programs in the host environment 15 | set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER) 16 | # search headers and libraries in the target environment 17 | set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY) 18 | set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY) 19 | -------------------------------------------------------------------------------- /mkconfus.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl -s 2 | # libu8ident - Check unicode security guidelines for identifiers. 3 | # Copyright 2014, 2021, 2022 Reini Urban 4 | # SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later 5 | # 6 | # Create confus.h from https://www.unicode.org/Public/security/latest/confusables.txt 7 | # with mkconfus.pl -c. 8 | # perf needs mkconfus.pl without -c to generate all headers. 9 | # 10 | # Note that the first array is just a binary-search in an unoptimized, 11 | # uncompressed array, without any values. It might be smaller and 12 | # faster with gperf or cbitset/croaring. 
13 | 14 | use vars qw($c); 15 | use strict; 16 | use Config; 17 | use utf8; 18 | use Encode (); 19 | use B 'cstring'; 20 | use Unicode::Normalize; 21 | 22 | my $confus = "confusables.txt"; 23 | for ($confus) { 24 | if (!-e $_) { 25 | system("wget -N https://www.unicode.org/Public/security/latest/$_"); 26 | } 27 | } 28 | 29 | # fixup wrong pairs. these are not really confusable 30 | my %confbugs = map { $_ => 1 } qw(0022 0025 0030 0031 0049 006D 007C 31 | 0251 028B 028F 32 | 03BD 03C3 03D1 03F1 03F4 33 | 0561 0570 0575 0584); 34 | my (@CONF, @ucd_version, $started); 35 | open my $CONF, "<", $confus or die "$confus $!"; 36 | while (<$CONF>) { 37 | if (!$started) { 38 | if (/^# Version: (\d+)\.(\d)\.(\d)/) { @ucd_version = ($1,$2,$3); } 39 | if (/^# For documentation and/) { $started++; } 40 | next unless $started; 41 | } else { 42 | if (/^([0-9A-F]{4,5}) ;\t([0-9A-F]{4,5}) ;/) { 43 | my ($from, $to1) = (hex($1), hex($2)); 44 | push @CONF, [$from, $to1]; 45 | } 46 | elsif (/^([0-9A-F]{4,5}) ;\t([0-9A-F]{4,5} [0-9A-F]{4,5} );\t/) { 47 | my ($from, $ids) = (hex($1), $2); 48 | my @ids = map{ hex $_ } split(' ',$ids); 49 | push @CONF, [$from, @ids]; 50 | } 51 | elsif (/^([0-9A-F]{4,5}) ;\t([0-9A-F]{4,5} [0-9A-F]{4,5} [0-9A-F]{4,5} );\t/) { 52 | my ($from, $ids) = (hex($1), $2); 53 | my @ids = map{ hex $_ } split(' ',$ids); 54 | push @CONF, [$from, @ids]; 55 | } 56 | elsif (/^([0-9A-F]{4,5}) ;\t([0-9A-F]{4,5} [0-9A-F]{4,5} [0-9A-F]{4,5} [0-9A-F]{4,5} );\t/) { 57 | my ($from, $ids) = (hex($1), $2); 58 | my @ids = map{ hex $_ } split(' ',$ids); 59 | push @CONF, [$from, @ids]; 60 | } 61 | if (exists $confbugs{$1}) { 62 | $CONF[$#CONF] = [ @{$CONF[$#CONF]}, "//" ]; 63 | } 64 | } 65 | } 66 | close $CONF; 67 | @CONF = sort {$a->[0] <=> $b->[0]} @CONF; 68 | printf "%u confusables\n", scalar @CONF; 69 | 70 | my $ofile1 = "confus.h"; 71 | chmod 0644, $ofile1 if -e $ofile1; 72 | open my $H1, ">:utf8", $ofile1 or die "writing $ofile1 $!"; 73 | print $H1 <<"EOF"; 74 | /* ex: set ro 
ft=c: -*- mode: c; buffer-read-only: t -*- */ 75 | /* libu8ident - Check unicode security guidelines for identifiers. 76 | Copyright 2021 Reini Urban 77 | SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later 78 | 79 | Generated by mkconfus.pl, do not modify. 80 | UNICODE version $ucd_version[0].$ucd_version[1] 81 | */ 82 | #include 83 | 84 | /* Sorted set of all confusables without mapping, 85 | from https://www.unicode.org/Public/security/latest/confusables.txt 86 | Some confusables are outcommented. 87 | */ 88 | #ifndef EXTERN_SCRIPTS 89 | const uint32_t confusables[] = { 90 | // clang-format off 91 | EOF 92 | 93 | my $ofile = "gconfus.h.in"; 94 | chmod 0644, $ofile if -e $ofile; 95 | open my $H, ">:utf8", $ofile or die "writing $ofile $!"; 96 | print $H <<"EOF"; 97 | %{/* ex: set ro ft=c: -*- mode: c; buffer-read-only: t -*- */ 98 | /* libu8ident - Check unicode security guidelines for identifiers. 99 | Copyright 2014, 2021, 2022 Reini Urban 100 | SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later 101 | 102 | generated by mkconfus.pl, do not modify. 103 | UNICODE version $ucd_version[0].$ucd_version[1] 104 | */ 105 | 106 | #include 107 | #include "u8id_private.h" 108 | 109 | // v3.1 changed len type from unsigned int to size_t (gperf d519d1a821511eaa22eae6d9019a548aea21e6) 110 | #ifdef GPERF_VERSION 111 | # if GPERF_VERSION < 301 112 | # define SIZE_TYPE unsigned int 113 | # else 114 | # define SIZE_TYPE size_t 115 | # endif 116 | #else 117 | # define SIZE_TYPE size_t 118 | #endif 119 | 120 | /* FIXME gperf to accept uint32_t keys. 121 | key is "%05X" of the unicode codepoint, fixed length 5. 122 | u8nfc is the rhs of confusables, already decomposed into NFC and encoded as UTF-8. 123 | Some confusables are outcommented. 
124 | */ 125 | %} 126 | %7bit 127 | %language=ANSI-C 128 | %struct-type 129 | %readonly-tables 130 | %null-strings 131 | 132 | struct confus_gperf { const char *const name; const char *const u8nfc; }; 133 | 134 | %% 135 | EOF 136 | my $i = 0; 137 | my $enc = Encode::find_encoding("UTF-8"); 138 | for my $c (@CONF) { 139 | if (@$c == 2) { 140 | printf $H "#-- %s -> %s\n", chr $c->[0], chr $c->[1]; 141 | printf $H1 " 0x%04X,\t/* %s -> %s */\n", $c->[0], chr $c->[0], chr $c->[1]; 142 | printf $H "%05X,\t%s\n", $c->[0], B::cstring NFC(chr $c->[1]); 143 | $i++; 144 | } else { 145 | my $f = shift @$c; 146 | my $j = @$c - 1; 147 | if ($c->[$j] eq "//") { 148 | printf $H1 "//"; 149 | pop @$c; 150 | printf $H "#-- %s -> %s\n", chr $f, join('', map {chr $_} @$c); 151 | printf $H "# "; 152 | } else { 153 | printf $H "#-- %s -> %s\n", chr $f, join('', map {chr $_} @$c); 154 | $i++; 155 | } 156 | printf $H1 " 0x%04X,\t/* %s -> ", $f, chr $f; 157 | printf $H "%05X,\t", $f; 158 | my $nfd = ""; 159 | for (@$c) { 160 | printf $H1 "%s", chr $_; 161 | $nfd .= NFC(chr $_); 162 | } 163 | printf $H1 " */\n"; 164 | printf $H "%s\n", B::cstring $nfd; 165 | } 166 | } 167 | 168 | print $H <<'EOF'; 169 | %% 170 | 171 | /* 172 | * Local variables: 173 | * c-file-style: "gnu" 174 | * End: 175 | * vim: expandtab shiftwidth=4 cinoptions='\:2=2' : 176 | */ 177 | EOF 178 | close $H; 179 | chmod 0444, $ofile; 180 | 181 | print $H1 <<"EOF"; 182 | // clang-format on 183 | }; 184 | #else 185 | extern const uint32_t confusables[$i]; 186 | #endif 187 | EOF 188 | close $H1; 189 | chmod 0444, $ofile1; 190 | 191 | print "Create serialized roaring bitmaps:\n"; 192 | my $arg = $c ? "confus" : ""; 193 | if ($^O =~ /Win32/) { 194 | system($Config{cc}." mkroar.c -I. -o mkroar.exe"); 195 | if ($c) { 196 | system("mkroar.exe", $arg); 197 | } else { 198 | system("mkroar.exe"); 199 | } 200 | # ignore vms for now 201 | } else { 202 | system($Config{cc}." mkroar.c -I. 
-o mkroar"); 203 | if ($c) { 204 | system("./mkroar", $arg); 205 | } else { 206 | system("./mkroar"); 207 | } 208 | } 209 | print "\n"; 210 | # allowed not optimized. stayed with 816 array 211 | # confus optimized from 8552 byte to 4731 byte (3 run-length encoded containers) 212 | # NFD_N, NFC_N, NFC_M, NFKD_N, NFKC_N, NFKC_M 213 | my @list = $c ? qw(confus) 214 | : qw(allowed confus nfd_n nfc_n nfc_m nfkd_n nfkc_n nfkc_m); 215 | for my $name (@list) { 216 | my $c = $name . "_croar"; 217 | my $b = $c; 218 | if ($name =~ /(nfk?[cd])_./) { 219 | $b = $1 . "_croar"; 220 | } 221 | if ($name =~ /^nfk?c_m/) { 222 | system("xxd -i $c.bin >> $b.h"); 223 | unlink "$c.bin"; 224 | } else { 225 | open my $F, '>', "$b.h"; 226 | print $F "/* generated via mkroar.c */\n"; 227 | close $F; 228 | system("xxd -i $c.bin >> $b.h"); 229 | unlink "$c.bin"; 230 | print "Created $b.h\n"; 231 | } 232 | } 233 | -------------------------------------------------------------------------------- /mkgc.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | # libu8ident - Check unicode security guidelines for identifiers. 
# Copyright 2022 Reini Urban
# SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
#
# Create u8id_gc.h General_Category API
#
# Reads UnicodeData.txt (fetched with wget when it is not present), folds
# consecutive entries of the same General_Category into [from, to, gc] runs
# and writes the category enum, its name table and the run list to u8id_gc.h.

use strict;
use Config;
use utf8;

# True for the UTF-16 surrogate code points, which must not be printed.
sub is_surrogate {
  my $cp = shift;
  return $cp >= 0xD800 && $cp <= 0xDFFF;
}

my $ucd = "UnicodeData.txt";
if (!-e $ucd) {
  # Stop on a failed download instead of parsing a missing or partial file.
  system("wget -N https://www.unicode.org/Public/UNIDATA/$ucd") == 0
    or die "fetching $ucd failed: wget status $?\n";
}

my $gc_h = "u8id_gc.h";
my (@GC, %GC);
open my $UCD, "<", $ucd or die "$ucd $!";
# $from and $oldgc describe the run that is currently open,
# $to is the code point of the last entry seen.
my ($from, $to, $oldgc) = (0, 0, 'Cc');
while (<$UCD>) {
  my @l = split ';';
  my $gc = $l[2];    # third field, taken as a scalar (was the slice @l[2])
  next unless defined $gc && $gc =~ /^\w.$/;
  $GC{$gc}++;
  $to = hex($l[0]);
  if ($oldgc ne $gc) {
    # The category changed: close the open run just before this code point.
    push @GC, [$from, $to - 1, $oldgc];
    $from  = $to;
    $oldgc = $gc;
  }
  # FIXME: a run is only closed when the category changes, so code points
  # that have no line of their own between two entries of the same category
  # are absorbed into the run.
}
push @GC, [$from, $to, $oldgc];    # the remainder: the run still open at EOF
close $UCD;

if (!-w $gc_h) {
  chmod 0644, $gc_h;
}
open my $H, ">:utf8", $gc_h or die "writing $gc_h $!";
print $H <<'EOF';
/* ex: set ro ft=c: -*- mode: c; buffer-read-only: t -*- */
/* libu8ident - Check unicode security guidelines for identifiers.
   Copyright 2022 Reini Urban
   SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later

   UCD General_Category API
   For scripts.h we are only interested for the Identifier parts in scx_list[]
   to detect illegal runs.
   But u8idlint.c/unic26.h needs the full list.
   Thanksfully this is only needed offline, to create unic26.h.
   Generated by mkgc.pl, do not modify.
*/

#include <stdint.h>

enum u8id_gc {
EOF
# One enumerator per category that occurs in the data, in sorted order,
# followed by three synthetic ones.
my @GCs = sort keys %GC;
for my $g (@GCs) {
  $g =~ s/&/amp/;
  printf $H " GC_%s,\n", $g;
}
print $H " GC_L, // is L&, all letters. (for unic26.h only)\n";
print $H " GC_V, // is varying, eg L or S. (for unic26.h only),\n";
print $H " GC_INVALID,\n";
# The name table has one entry per enumerator: the categories plus the 3 above.
printf $H <<'EOF', scalar(@GCs) + 3;
};

#ifdef EXTERN_SCRIPTS
extern const char *const u8id_gc_names[%u];
#else
LOCAL const char *const u8id_gc_names[] = {
EOF
for my $g (@GCs) {
  $g =~ s/&/amp/;
  printf $H " \"%s\",\n", $g;
}
print $H " \"L\", // L& really (for unic26.h only)\n";
print $H " \"V\", // varying (for unic26.h only)\n";
print $H " \"Zz\"\n";
print $H <<'EOF';
};
#endif

struct gc {
  uint32_t from;
  uint32_t to;
  enum u8id_gc gc;
};

EOF

printf $H <<'EOF', scalar @GC;
#ifdef EXTERN_SCRIPTS
extern const struct gc gc_list[%u];
#else
const struct gc gc_list[] = {
    // clang-format off
EOF
for my $r (@GC) {
  my ($first, $last, $cat) = @$r;
  # The trailing comment shows the boundary characters, but only for word
  # characters that are neither surrogates nor marks.
  my $u = chr $first;
  $u = '' if is_surrogate($first);
  $u = '' if $u !~ /\w/;
  $u = '' if $cat =~ /^M/;
  if ($first == $last) {
    printf $H " {0x%X, 0x%X, GC_%s},\t// %s\n", $first, $last, $cat, $u;
  } else {
    my $u1 = chr $last;
    $u1 = '' if is_surrogate($last);
    # keyed on the first character: the end is shown only if the start is
    $u1 = '' if $u !~ /\w/;
    $u1 = '' if $cat =~ /^M/;
    printf $H " {0x%X, 0x%X, GC_%s},\t// %s..%s\n", $first, $last, $cat, $u, $u1;
  }
}
print $H <<'EOF';
    // clang-format on
};
#endif
EOF

close $H;
chmod 0444, $gc_h;    # the generated header is kept read-only
print "Created $gc_h\n";

--------------------------------------------------------------------------------
/mkmark.pl:
--------------------------------------------------------------------------------
#!/usr/bin/env perl
# libu8ident - Check unicode security guidelines for identifiers.
3 | # Copyright 2014, 2021 Reini Urban 4 | # SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later 5 | # 6 | # Create mark.h Combining_Mark (Mc | Me | Mn) 7 | 8 | use strict; 9 | use Config; 10 | my $ucd = "UnicodeData.txt"; 11 | if (!-e $ucd) { 12 | system("wget -N https://www.unicode.org/Public/UNIDATA/$ucd"); 13 | } 14 | 15 | my $mark_h = "mark.h"; 16 | my $doc = "doc/appendix-h.md"; 17 | my (@MARK, @NSM, %NSM); 18 | open my $UCD, "<", $ucd or die "$ucd $!"; 19 | my ($from, $to, $oldto); 20 | while (<$UCD>) { 21 | my @l = split ';'; 22 | my $mark = $l[2]; 23 | if ($mark =~ /^M[cen]$/) { 24 | $to = hex($l[0]); 25 | if (!$from) { 26 | push @MARK, [$to, $to]; 27 | $from = $to; 28 | } else { 29 | $oldto = $MARK[$#MARK]->[1]; 30 | if ($oldto + 1 != $to) { 31 | push @MARK, [$to, $to]; 32 | $from = $to; 33 | } else { # update last 34 | $MARK[$#MARK]->[1] = $to; 35 | } 36 | } 37 | my $name = $l[1]; 38 | my $n = $l[10]; 39 | if ($name =~ /^COMBINING / && $l[4] eq 'NSM' && $n =~ /^NON-SPACING /) { 40 | $n =~ s/^NON-SPACING //; 41 | my $nsm = $n; 42 | push @NSM, [ $nsm, $to ]; 43 | } 44 | } 45 | } 46 | # $NSM{'DOT ABOVE'} = [ ord('i') ]; 47 | seek($UCD,0,0); 48 | while (<$UCD>) { 49 | my @l = split ';'; 50 | my $name = $l[1]; 51 | # Check "HIRAGANA LETTER PA;Lo;0;L;306F 309A;" NFD expansion 52 | if ($l[2] =~ /^L/ && $l[5] =~ / /) { 53 | for my $r (@NSM) { 54 | my $nsm = $r->[0]; 55 | my $n = sprintf("%04X", $r->[1]); 56 | if ($l[5] =~ / \Q$n\E$/) { 57 | my $cp = hex $l[0]; 58 | if (exists $NSM{$nsm}) { 59 | push @{$NSM{$nsm}}, $cp; 60 | } else { 61 | $NSM{$nsm} = [ $cp ]; 62 | } 63 | } 64 | } 65 | } 66 | # but also check names with NSM's, e.g. LATIN SMALL LETTER Q WITH HOOK 67 | # without any NFD expansion. => DOT ABOVE, DIAERESIS, DOT BELOW, THREE DOTS ABOVE, 68 | # FOUR DOTS ABOVE. 
69 | elsif ($name =~ /LETTER .+ WITH .+/) { 70 | for my $r (@NSM) { 71 | my $nsm = $r->[0]; 72 | if ($name =~ /LETTER .+ WITH \Q$nsm\E$/) { 73 | my $cp = hex $l[0]; 74 | if (exists $NSM{$nsm}) { 75 | push @{$NSM{$nsm}}, $cp; 76 | } else { 77 | $NSM{$nsm} = [ $cp ]; 78 | } 79 | } 80 | } 81 | } 82 | # plus all the DOTLESS i and j letters, but not arabic BEH, NOON, FEH, QAF 83 | if ($name =~ /LETTER .+ DOTLESS [IJ]/) { 84 | my $nsm = 'DOT ABOVE'; 85 | my $cp = hex $l[0]; 86 | #if (exists $NSM{$nsm}) { 87 | push @{$NSM{$nsm}}, $cp; 88 | #} else { 89 | # $NSM{$nsm} = [ $cp ]; 90 | #} 91 | } 92 | } 93 | close $UCD; 94 | # delete empty NSM's 95 | my $NUM_NSM = scalar @NSM; 96 | my @N; 97 | for my $r (@NSM) { 98 | if (exists $NSM{$r->[0]}) { 99 | push @N, $r; 100 | } else { 101 | warn "skip empty $r->[0] ",sprintf("%04X",$r->[1]),"\n"; 102 | } 103 | } 104 | @NSM = @N; 105 | 106 | chmod 0644, $mark_h if !-w $mark_h; 107 | chmod 0644, $doc if !-w $doc; 108 | open my $H, ">encoding(UTF-8)", $mark_h or die "writing $mark_h $!"; 109 | open my $DOC, ">encoding(UTF-8)", $doc or die "writing $doc $!"; 110 | print $H <<'EOF'; 111 | /* ex: set ro ft=c: -*- mode: c; buffer-read-only: t -*- */ 112 | /* libu8ident - Check unicode security guidelines for identifiers. 113 | Copyright 2014, 2021, 2022 Reini Urban 114 | SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later 115 | 116 | All Combining_Mark (Mc | Me | Mn), 117 | All letters with non-spacing combining marks. 118 | Generated by mkmark.pl, do not modify. 
119 | */ 120 | #include 121 | #include 122 | 123 | struct nsm_ws { 124 | uint32_t nsm; 125 | wchar_t *letters; 126 | }; 127 | 128 | EOF 129 | print $DOC <[0], $r->[1]; 144 | } 145 | printf $H <<'EOF'; 146 | // clang-format on 147 | }; 148 | #endif 149 | EOF 150 | 151 | printf $H <<'EOF'; 152 | 153 | /* All non-spacing combining marks, sorted */ 154 | enum nsm_marks { 155 | EOF 156 | for my $r (@NSM) { 157 | my $n = $r->[0]; 158 | $n =~ s/[ -]/_/g; 159 | printf $H " NSM_%s,\t/* %X */\n", $n, $r->[1]; 160 | } 161 | printf $H <<'EOF'; 162 | NSM_LAST 163 | }; 164 | EOF 165 | 166 | printf $H <<'EOF', scalar @NSM; 167 | 168 | /* All letters with non-spacing combining marks, sorted. 169 | The first entry is the NSM, if letters exist. 170 | */ 171 | #ifdef EXTERN_SCRIPTS 172 | extern const struct nsm_ws nsm_letters[%u]; 173 | #else 174 | const struct nsm_ws nsm_letters[] = { 175 | // clang-format off 176 | EOF 177 | printf $DOC <<'EOF', $NUM_NSM, scalar @NSM; 178 | 179 | From all %u non-spacing marks, 180 | the list of letters already including its %u non-spacing marks: 181 | 182 | EOF 183 | 184 | for my $r (@NSM) { 185 | my $nsm = $r->[0]; 186 | printf $H " { "; 187 | if ($NSM{$nsm}) { 188 | my $u = ""; 189 | if ($r->[1] >= 0x10000) { 190 | printf $H "0x%05X, /* NSM: %s %X */\n ", $r->[1], $nsm, $r->[1]; 191 | printf $DOC "- NSM: %s %05X\n\n ", $nsm, $r->[1]; 192 | } else { 193 | printf $H "0x%04X, /* NSM: %s %X */\n ", $r->[1], $nsm, $r->[1]; 194 | printf $DOC "- NSM: %s %04X\n\n ", $nsm, $r->[1]; 195 | } 196 | printf $H "L\""; 197 | my $i = 0; 198 | my @L = @{$NSM{$nsm}}; 199 | for (@L) { 200 | if ($_ != 0) { 201 | $u .= chr ($_); 202 | if ($_ >= 0x10000) { 203 | printf $H "\\U%08X", $_; 204 | printf $DOC "%05X", $_; 205 | } else { 206 | printf $H "\\u%04X", $_; 207 | printf $DOC "%04X", $_; 208 | } 209 | if (@L == $i + 1) { 210 | print $DOC "\n"; 211 | } else { 212 | print $DOC ++$i % 8 ? 
", " : "\n "; 213 | } 214 | } else { 215 | warn "0 in $nsm"; 216 | } 217 | } 218 | printf $H "\" },\n /* %s */\n", $u; 219 | printf $DOC "\n \"%s\"\n", $u; 220 | printf $DOC "\n"; 221 | } else { 222 | printf $H "0U }, /* NSM: %s %X */\n", $nsm, $r->[1]; 223 | printf $DOC "None\n"; 224 | } 225 | } 226 | printf $H <<'EOF'; 227 | // clang-format on 228 | }; 229 | #endif 230 | EOF 231 | 232 | # assume symlinked 233 | if (-e "roaring.h" && -e "roaring.c") { 234 | print "\nCreate serialized roaring bitmaps:\n"; 235 | if ($^O =~ /Win32/) { 236 | system($Config{cc}." mkroar.c -I. -o mkroar.exe"); 237 | system("mkroar.exe mark"); 238 | # ignore vms for now 239 | } else { 240 | system($Config{cc}." mkroar.c -I. -o mkroar"); 241 | system("./mkroar mark"); 242 | } 243 | print "\n"; 244 | my $c = "mark_croar"; 245 | print $H <<'EOF'; 246 | 247 | // This was just an experiment. It's slower than binary search in ranges. 248 | #ifdef HAVE_CROARING 249 | # ifndef EXTERN_SCRIPTS 250 | /* generated via mkroar.c */ 251 | EOF 252 | close $H; 253 | system("xxd -i $c.bin >> $mark_h"); 254 | unlink "$c.bin"; 255 | open $H, ">>", $mark_h or die "appending $mark_h $!"; 256 | print $H <<'EOF'; 257 | # else 258 | extern const unsigned int mark_croar_bin_len; 259 | extern const unsigned char mark_croar_bin[1219]; // checkme on updates 260 | # endif // EXTERN_SCRIPTS 261 | #endif // HAVE_CROARING 262 | EOF 263 | } 264 | 265 | close $H; 266 | close $DOC; 267 | chmod 0444, $mark_h; 268 | chmod 0444, $doc; 269 | print "Created mark.h and $doc\n"; 270 | 271 | -------------------------------------------------------------------------------- /mkmedial.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | # libu8ident - Check unicode security guidelines for identifiers. 
# Copyright 2022 Reini Urban
# SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
#
# Create medial.h, letters in MEDIAL position
#
# Collects every code point of UnicodeData.txt whose name contains " MEDIAL "
# into contiguous ranges, adds the optional medial characters of TR31 and
# writes the sorted list to medial.h.

use strict;
use Config;
use utf8;

my $ucd = "UnicodeData.txt";
if (!-e $ucd) {
  # Stop on a failed download instead of parsing a missing or partial file.
  system("wget -N https://www.unicode.org/Public/UNIDATA/$ucd") == 0
    or die "fetching $ucd failed: wget status $?\n";
}

my $medial_h = "medial.h";
my @M;
open my $UCD, "<", $ucd or die "$ucd $!";
# [$from, $to] is the range currently being extended; undef until the first hit.
my ($from, $to);
while (<$UCD>) {
  my ($m, $name) = split ';';
  next unless defined $name && $name =~ / MEDIAL /;
  my $cp = hex($m);
  if (!defined $from) {
    $from = $to = $cp;
  } elsif ($to + 1 != $cp) {
    # not adjacent to the open range: close it and start a new one
    push @M, [$from, $to];
    $from = $to = $cp;
  } else {
    $to = $cp;
  }
}
push @M, [$from, $to] if defined $from;    # the remainder
close $UCD;

# add https://www.unicode.org/reports/tr31/#Table_Optional_Medial
# They are mostly not part of any TR31 id set, but at least forbid them at the start and end.
push @M, [0x27, 0x27];     # APOSTROPHE
push @M, [0x2D, 0x2E];     # HYPHEN-MINUS..FULL-STOP
push @M, [0x3A, 0x3A];     # COLON
push @M, [0xB7, 0xB7];     # MIDDLE DOT. disallowed in SAFEC26
push @M, [0x58A, 0x58A];   # ARMENIAN HYPHEN
push @M, [0x5F4, 0x5F4];   # HEBREW PUNCTUATION GERSHAYIM
push @M, [0xF0B, 0xF0B];   # TIBETAN MARK INTERSYLLABIC TSHEG
push @M, [0x200C, 0x200C]; # ZERO WIDTH NON-JOINER
push @M, [0x2010, 0x2010]; # HYPHEN
push @M, [0x2019, 0x2019]; # RIGHT SINGLE QUOTATION MARK
push @M, [0x2027, 0x2027]; # HYPHENATION POINT
push @M, [0x30A0, 0x30A0]; # KATAKANA-HIRAGANA DOUBLE HYPHEN
push @M, [0x30FB, 0x30FB]; # KATAKANA MIDDLE DOT

@M = sort { $a->[0] <=> $b->[0] } @M;

if (!-w $medial_h) {
  chmod 0644, $medial_h;
}
open my $H, ">:utf8", $medial_h or die "writing $medial_h $!";
print $H <<'EOF';
/* ex: set ro ft=c: -*- mode: c; buffer-read-only: t -*- */
/* libu8ident - Check unicode security guidelines for identifiers.
   Copyright 2022 Reini Urban
   SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later

   UCD Medial Letters and Forms, not at Start and not at End of a word.
   Includes MEDIAL and https://www.unicode.org/reports/tr31/#Table_Optional_Medial
   Generated by mkmedial.pl, do not modify.
*/

EOF

printf $H <<'EOF', scalar @M;
#ifdef EXTERN_SCRIPTS
extern const struct range_bool medial_list[%u];
#else
const struct range_bool medial_list[] = {
    // clang-format off
EOF
# Statistics for the closing comment of the table.  Named counters are used
# because $a and $b are the special variables of sort.
my ($ranges, $singles, $codepoints) = (0, 0, 0);
for my $r (@M) {
  my ($first, $last) = @$r;
  # never print a surrogate code point into the comment
  my $u = ($first >= 0xD800 && $first <= 0xDFFF) ? '' : chr $first;
  if ($first == $last) {
    $singles++;
    $codepoints++;
    printf $H " {0x%X, 0x%X},\t// %s\n", $first, $last, $u;
  } else {
    my $u1 = ($last >= 0xD800 && $last <= 0xDFFF) ? '' : chr $last;
    $ranges++;
    $codepoints += $last + 1 - $first;
    printf $H " {0x%X, 0x%X},\t// %s..%s\n", $first, $last, $u, $u1;
  }
}
printf $H <<'EOF', $ranges, $singles, $codepoints;
    // clang-format on
}; // %u ranges, %u singles, %u codepoints
#endif
EOF

close $H;
chmod 0444, $medial_h;    # the generated header is kept read-only
print "Created $medial_h\n";

--------------------------------------------------------------------------------
/mkroar.c:
--------------------------------------------------------------------------------
/* libu8ident - Check unicode security guidelines for identifiers.
   Copyright 2021 Reini Urban
   SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later

   generate roaring bitmaps for some sets.
6 | */ 7 | 8 | #include 9 | #include "roaring.c" 10 | #include "u8id_private.h" 11 | #include "u8id_gc.h" 12 | #include "scripts.h" 13 | #include "confus.h" 14 | #include "mark.h" 15 | 16 | enum what_list { 17 | ALLOWED_ID_LIST, 18 | CONFUSABLES, 19 | MARK, 20 | NFD_N, 21 | NFC_N, 22 | NFC_M, 23 | NFKD_N, 24 | NFKC_N, 25 | NFKC_M 26 | }; 27 | 28 | int serialize(size_t size, const uint32_t *list, enum what_list what) { 29 | FILE *f; 30 | roaring_bitmap_t *rb = roaring_bitmap_create_with_capacity(size); 31 | roaring_statistics_t stat; 32 | const char *file; 33 | if (what == CONFUSABLES) { // simple uint32_t[] 34 | for (uint32_t i = 0; i < size; i++) 35 | roaring_bitmap_add(rb, list[i]); 36 | } else /* if (what == ALLOWED_ID_LIST) */ { // struct range_bool 37 | const struct range_bool *blist = (const struct range_bool *)list; 38 | for (uint32_t i = 0; i < size; i++) { 39 | for (uint32_t cp = blist[i].from; cp <= blist[i].to; cp++) { 40 | roaring_bitmap_add(rb, cp); 41 | } 42 | } 43 | } 44 | switch (what) { 45 | case ALLOWED_ID_LIST: 46 | file = "allowed_croar.bin"; 47 | break; 48 | case CONFUSABLES: 49 | file = "confus_croar.bin"; 50 | break; 51 | case MARK: 52 | file = "mark_croar.bin"; 53 | break; 54 | /* NFD_N, NFC_N, NFC_M, NFKD_N, NFKC_N, NFKC_M */ 55 | case NFD_N: 56 | file = "nfd_n_croar.bin"; 57 | break; 58 | case NFC_N: // this might be slower than binary search 59 | file = "nfc_n_croar.bin"; 60 | break; 61 | case NFC_M: 62 | file = "nfc_m_croar.bin"; 63 | break; 64 | case NFKD_N: 65 | file = "nfkd_n_croar.bin"; 66 | break; 67 | case NFKC_N: 68 | file = "nfkc_n_croar.bin"; 69 | break; 70 | case NFKC_M: 71 | file = "nfkc_m_croar.bin"; 72 | break; 73 | default: 74 | fprintf(stderr, "Unhandled case %d\n", what); 75 | exit(1); 76 | } 77 | 78 | f = fopen(file, "w"); 79 | uint32_t sizebefore = roaring_bitmap_portable_size_in_bytes(rb); 80 | roaring_bitmap_run_optimize(rb); 81 | 82 | uint32_t sizeafter = roaring_bitmap_portable_size_in_bytes(rb); 83 | char 
*serializedbytes = malloc(sizeafter); 84 | roaring_bitmap_portable_serialize(rb, serializedbytes); 85 | fwrite(serializedbytes, 1, sizeafter, f); 86 | fclose(f); 87 | free(serializedbytes); 88 | printf("\nwrote %u serialized bytes to %s\n", sizeafter, file); 89 | 90 | printf("cardinality = %d\n", (int)roaring_bitmap_get_cardinality(rb)); 91 | printf("size before/after optim: %u/%u\n", sizebefore, sizeafter); 92 | roaring_bitmap_statistics(rb, &stat); 93 | printf("n_bitset_containers = %u\n", stat.n_bitset_containers); 94 | printf("n_values_bitset_containers = %u\n", stat.n_values_bitset_containers); 95 | printf("n_bytes_bitset_containers = %u\n", stat.n_bytes_bitset_containers); 96 | printf("n_array_containers = %u\n", stat.n_array_containers); 97 | printf("n_values_array_containers = %u\n", stat.n_values_array_containers); 98 | printf("n_bytes_array_containers = %u\n", stat.n_bytes_array_containers); 99 | printf("n_run_containers = %u\n", stat.n_run_containers); 100 | printf("n_values_run_containers = %u\n", stat.n_values_run_containers); 101 | printf("n_bytes_run_containers = %u\n", stat.n_bytes_run_containers); 102 | roaring_bitmap_free(rb); 103 | return 0; 104 | } 105 | 106 | int main(int argc, char **argv) { 107 | if (argc < 2 || strcmp(argv[1], "confus") == 0) 108 | serialize(ARRAY_SIZE(confusables), confusables, CONFUSABLES); 109 | if (argc > 1 && strcmp(argv[1], "confus") == 0) 110 | exit(0); 111 | 112 | if (argc < 2 || strcmp(argv[1], "mark") == 0) 113 | serialize(ARRAY_SIZE(mark_list), (const uint32_t *)mark_list, MARK); 114 | if (argc > 1 && strcmp(argv[1], "mark") == 0) 115 | exit(0); 116 | 117 | serialize(ARRAY_SIZE(allowed_id_list), (const uint32_t *)allowed_id_list, 118 | ALLOWED_ID_LIST); 119 | /* NFD_N, NFC_N, NFC_M, NFKD_N, NFKC_N, NFKC_M */ 120 | serialize(ARRAY_SIZE(NFD_N_list), (const uint32_t *)NFD_N_list, NFD_N); 121 | serialize(ARRAY_SIZE(NFC_N_list), (const uint32_t *)NFC_N_list, NFC_N); 122 | serialize(ARRAY_SIZE(NFC_M_list), (const 
uint32_t *)NFC_M_list, NFC_M); 123 | serialize(ARRAY_SIZE(NFKD_N_list), (const uint32_t *)NFKD_N_list, NFKD_N); 124 | serialize(ARRAY_SIZE(NFKC_N_list), (const uint32_t *)NFKC_N_list, NFKC_N); 125 | serialize(ARRAY_SIZE(NFKC_M_list), (const uint32_t *)NFKC_M_list, NFKC_M); 126 | 127 | return 0; 128 | } 129 | -------------------------------------------------------------------------------- /mktest-norm.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | use Unicode::Normalize qw(NFC NFKC NFD NFKD FCD FCC); 3 | use Encode; 4 | sub wstr($) { 5 | join('',map{sprintf'\x%02x',$_} unpack 'W*', encode_utf8 $_[0]); 6 | } 7 | binmode *STDOUT, ":utf8"; 8 | # all NFC letters different from NFKC 9 | # unichars -a '\pL' 'NFC ne NFKC' 10 | # e.g. U+1C5 U+140 11 | 12 | for ("Cafe\x{301}", "Caf\x{e9}", 13 | "\x{1f87}", "\x{03B1}\x{0314}\x{0342}\x{0345}", 14 | "\x{1c5}\x{140}", "D\x{17e}l\x{b7}" 15 | ) 16 | { 17 | #"\xe1\xbe\x87" 18 | print "$_ [",wstr($_),"]:\n"; 19 | printf "NFC: %s [%s]\n", NFC($_), wstr(NFC($_)); 20 | printf "NFKC: %s [%s]\n", NFKC($_), wstr(NFKC($_)); 21 | printf "FCC: %s [%s]\n", FCC($_), wstr(FCC($_)); 22 | printf "NFD: %s [%s]\n", NFD($_), wstr(NFD($_)); 23 | printf "NFKD: %s [%s]\n", NFKD($_), wstr(NFKD($_)); 24 | printf "FCD: %s [%s]\n", FCD($_), wstr(FCD($_)); 25 | print "\n"; 26 | } 27 | -------------------------------------------------------------------------------- /test-all-fast.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | set -e 3 | make -f makefile.gnu clean 4 | echo "check -UU8ID_NORM -UU8ID_PROFILE" 5 | make -f makefile.gnu check 6 | make -f makefile.gnu check-asan 7 | make -f makefile.gnu clean 8 | for norm in NFC NFD NFKC NFKD FCD FCC; do 9 | for profile in 2 3 4 5 6 C23_4 C11_6; do 10 | echo "check -DU8ID_NORM=$norm -DU8ID_PROFILE=$profile" 11 | make -f makefile.gnu check DEFS="-DU8ID_NORM=$norm -DU8ID_PROFILE=$profile" 12 | 
#!/bin/sh
# Run the autotools test-suite for the default configuration and then for
# every normalization/profile combination.
#
# All combinations are always attempted, but a failing configure, check or
# check-asan step is remembered and reported through the exit status, so a
# broken combination can no longer be masked by the final `make clean`.
status=0

make clean
echo "check -UU8ID_NORM -UU8ID_PROFILE"
./configure && make check || status=1
make check-asan || status=1
make clean
for norm in NFKC NFC NFKD NFD FCC FCD; do
    for profile in 2 3 4 5 6; do
        echo "check -DU8ID_NORM=$norm -DU8ID_PROFILE=$profile"
        ./configure --with-norm="$norm" --with-profile="$profile" && make check || status=1
        make check-asan || status=1
        make clean
    done
done

exit $status
6 | */ 7 | #include "u8id_private.h" 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #ifdef HAVE_SYS_TYPES_H 14 | # include 15 | #endif 16 | #ifdef HAVE_SYS_STAT_H 17 | # include 18 | #endif 19 | #ifdef HAVE_DIRENT_H 20 | # include 21 | #endif 22 | #if defined HAVE_DIRENT_H && !defined _MSC_VER 23 | # include 24 | #endif 25 | #ifdef _MSC_VER 26 | # define WIN32_LEAN_AND_MEAN 27 | # include 28 | #endif 29 | #ifdef HAVE_LIBGEN_H 30 | # include 31 | #endif 32 | 33 | #include "u8ident.h" 34 | #if defined HAVE_UNIWBRK_H && defined HAVE_LIBUNISTRING 35 | # include "uniwbrk.h" 36 | #endif 37 | #include "u8idscr.h" 38 | //#undef EXTERN_SCRIPTS 39 | #include "unic11.h" 40 | 41 | int verbose = 0; 42 | // private access 43 | unsigned u8ident_options(void); 44 | enum u8id_profile u8ident_profile(void); 45 | char *enc_utf8(char *dest, size_t *lenp, const uint32_t cp); 46 | 47 | #ifdef HAVE_SYS_STAT_H 48 | static int file_exists(const char *path) { 49 | struct stat st; 50 | return (stat(path, &st) == 0) && (st.st_mode & S_IFDIR) != S_IFDIR; 51 | } 52 | #elif defined _MSC_VER 53 | static int file_exists(const char *path) { 54 | const uint16_t dwAttrib = GetFileAttributes(path); 55 | return (dwAttrib != INVALID_FILE_ATTRIBUTES && 56 | !(dwAttrib & FILE_ATTRIBUTE_DIRECTORY); 57 | } 58 | #endif 59 | 60 | int testdir(const char *dir, const char *fname) { 61 | char path[256]; 62 | static char line[1024] = {0}; 63 | unsigned ln = 0; 64 | #if defined HAVE_UNIWBRK_H && defined HAVE_LIBUNISTRING 65 | static char brks[1024] = {0}; 66 | #endif 67 | static char word[128] = {0}; 68 | if (!dir) { 69 | strncpy(path, fname, sizeof(path) - 1); 70 | } else { 71 | strncpy(path, dir, sizeof(path) - 1); 72 | path[255] = '\0'; 73 | strncat(path, "/", sizeof(path) - 1); 74 | path[255] = '\0'; 75 | strncat(path, fname, sizeof(path) - 1); 76 | path[255] = '\0'; 77 | } 78 | 79 | FILE *f = fopen(path, "r"); 80 | if (!f) { 81 | perror("fopen"); 82 | fprintf(stderr, "texts/%s\n", 
fname); 83 | return -1; 84 | } 85 | 86 | printf("-- texts/%s\n", fname); 87 | int ctx = u8ident_new_ctx(); 88 | // while (fscanf(f, " %1023s", word) == 1) 89 | // Check now also against libunistring: u8_wordbreaks 90 | while (fgets(line, 1023, f)) { 91 | char *s = &line[0]; 92 | bool prev_isword = false; 93 | char *wp = &word[0]; 94 | ln++; 95 | *word = '\0'; 96 | #if defined HAVE_UNIWBRK_H && defined HAVE_LIBUNISTRING 97 | u8_wordbreaks((uint8_t *)s, strlen(s), brks); 98 | #endif 99 | while (*s) { 100 | char *olds = s; 101 | uint32_t cp = dec_utf8(&s); 102 | if (!cp) { 103 | printf("ERROR %s illegal UTF-8 at line %u, col %lu\n", olds, ln, 104 | s - olds); 105 | exit(1); 106 | } 107 | 108 | // unicode #29 word-break, but simplified: 109 | // must not split at continuations (Combining marks). e.g. for 110 | // texts/arabic-1.txt 111 | const bool iscont = isXID_cont(cp); 112 | bool isword = prev_isword ? (isXID_start(cp) || iscont) : isXID_start(cp); 113 | char force_break = (prev_isword != isword && !iscont); 114 | #if defined HAVE_UNIWBRK_H && defined HAVE_LIBUNISTRING 115 | if (force_break != brks[s - olds] && verbose) 116 | /* break at: if libunistring found a break, but we not. 117 | no break at: if we found a break, but libunistring not. */ 118 | fprintf(stderr, "WARN: %sbreak at U+%X, line %u, col %lu\n", 119 | force_break ? "" : "no ", cp, ln, s - olds); 120 | // don't rely in the CI on an optional lib 121 | // force_break = brks[s - olds]; 122 | #endif 123 | // first, or changed from non-word to word, and is no mark (continuation) 124 | if (olds == &line[0] || force_break) { 125 | prev_isword = isword; 126 | if (isword) { 127 | int l = s - olds; 128 | if (l == 1) { 129 | *wp++ = *olds; 130 | } else if (wp + l < &word[128]) { 131 | memcpy(wp, olds, l); 132 | wp += l; 133 | } 134 | continue; // started new word 135 | } else { // word-end: fall-through to word check 136 | *wp = '\0'; 137 | } 138 | } else { // no change. 
in word or non-word 139 | if (isword) { 140 | int l = s - olds; 141 | if (l == 1) { 142 | *wp++ = *olds; 143 | } else if (wp + l < &word[128]) { 144 | memcpy(wp, olds, l); 145 | wp += l; 146 | } 147 | } 148 | if (*s != '\n') 149 | continue; 150 | } 151 | // bad case "\xd8\xa8\xd8\xb1\xd9\x88\xd8\xad" "بروح" Arabic 152 | // also bad case "أحرارًا" Arabic at pos 12 (missing basesc) 153 | if (!*wp && *word && force_break) { // non-empty word-end 154 | int ret = u8ident_check((uint8_t *)word, NULL); 155 | const char *scripts = u8ident_existing_scripts(ctx); 156 | printf("%s: %s (%s", word, u8ident_errstr(ret), scripts); 157 | if (ret < 0) { 158 | const uint32_t cp = u8ident_failed_char(ctx); 159 | const uint8_t scr = u8ident_get_script(cp); 160 | if (scr != SC_Unknown) 161 | printf(" + U+%X %s)!\n", cp, u8ident_script_name(scr)); 162 | else 163 | printf(" + U+%X)!\n", cp); 164 | } else 165 | printf(")\n"); 166 | free((char *)scripts); 167 | *word = '\0'; 168 | wp = &word[0]; 169 | } 170 | } 171 | } 172 | 173 | u8ident_free_ctx(ctx); 174 | fclose(f); 175 | return 0; 176 | } 177 | 178 | int cmp_str(const void *a, const void *b) { 179 | return strcmp(*(const char **)a, *(const char **)b); 180 | } 181 | 182 | int main(int argc, char **argv) { 183 | int i = 1; 184 | char *dirname = "texts"; 185 | u8ident_init(U8ID_PROFILE_DEFAULT, U8ID_NORM_DEFAULT, U8ID_TR31_XID); 186 | 187 | if (getenv("U8IDTEST_TEXTS")) 188 | dirname = getenv("U8IDTEST_TEXTS"); 189 | else if (argc > 1 && strEQc(argv[1], "-v")) { 190 | verbose++; 191 | i++; 192 | } 193 | 194 | #if !defined _MSC_VER && defined HAVE_SYS_STAT_H 195 | if (argc > i && file_exists(argv[i])) { 196 | while (argc > i && file_exists(argv[i])) { 197 | testdir(NULL, argv[i++]); 198 | } 199 | u8ident_free(); 200 | return 0; 201 | } 202 | if (argc > i && !file_exists(argv[i])) { 203 | dirname = argv[i]; 204 | } 205 | #endif 206 | 207 | #if defined HAVE_DIRENT_H && !defined _MSC_VER 208 | DIR *dir = opendir(dirname); 209 | if (!dir) { 
210 | perror("opendir"); 211 | exit(1); 212 | } 213 | #elif defined _MSC_VER 214 | WIN32_FIND_DATA FindFileData; 215 | HANDLE hdir; 216 | #endif 217 | 218 | const char *const exts[] = {".txt", ".c"}; 219 | for (size_t j = 0; j < ARRAY_SIZE(exts); j++) { 220 | const char *ext = exts[j]; 221 | int s = 0; 222 | size_t lext = strlen(ext); 223 | 224 | #if defined HAVE_DIRENT_H && !defined _MSC_VER 225 | struct dirent *d; 226 | # define NEXT_FILE (d = readdir(dir)) 227 | # define CUR_FILE d->d_name 228 | #elif defined _MSC_VER 229 | hdir = FindFirstFile(dirname, &FindFileData); 230 | if (hdir == INVALID_HANDLE_VALUE) 231 | continue; 232 | # define NEXT_FILE FindNextFile(hdir, &FindFileData) 233 | # define CUR_FILE FindFileData.cFileName 234 | #endif 235 | 236 | // sort the names, to compare against the result 237 | while (NEXT_FILE) { 238 | size_t l = strlen(CUR_FILE); 239 | if (l > lext && '.' != CUR_FILE[0] && strEQ(&CUR_FILE[l - lext], ext)) { 240 | s++; 241 | } 242 | } 243 | 244 | #if defined HAVE_DIRENT_H && !defined _MSC_VER 245 | rewinddir(dir); 246 | #elif defined _MSC_VER 247 | FindClose(hdir); 248 | hdir = FindFirstFile(dirname, &FindFileData); 249 | #endif 250 | 251 | const char **files = calloc(s, sizeof(char *)); 252 | int i = 0; 253 | while (NEXT_FILE) { 254 | size_t l = strlen(CUR_FILE); 255 | if (l > lext && '.' 
!= CUR_FILE[0] && strEQ(&CUR_FILE[l - lext], ext)) { 256 | if (i >= s) 257 | break; 258 | assert(i < s); 259 | files[i] = malloc(l + 1); 260 | strcpy((char *)files[i], CUR_FILE); 261 | i++; 262 | } 263 | } 264 | if (s) 265 | qsort(files, s, sizeof(char *), cmp_str); 266 | for (i = 0; i < s; i++) { 267 | // printf("%s\n", files[i]); 268 | testdir(dirname, files[i]); 269 | } 270 | for (int i = 0; i < s; i++) 271 | free((void *)files[i]); 272 | free(files); 273 | } 274 | #if defined HAVE_DIRENT_H && !defined _MSC_VER 275 | closedir(dir); 276 | #elif defined _MSC_VER 277 | FindClose(hdir); 278 | #endif 279 | 280 | u8ident_free(); 281 | return 0; 282 | } 283 | -------------------------------------------------------------------------------- /test-texts.test.in: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | set -e 3 | U8IDTEST_TEXTS="@top_srcdir@/texts" @LTEXEC@ ./test-texts@EXEEXT@ > texts.rst && \ 4 | diff "@top_srcdir@/texts/result.lst" texts.rst && rm texts.rst 5 | -------------------------------------------------------------------------------- /texts/amharic-1.txt: -------------------------------------------------------------------------------- 1 | የግዕዝ ፊደል 2 | ሁሉም የሰው ልጆች የተወለዱት በነፃነት እና በክብር እና በመብት እኩል ናቸው። የማሰብ እና የህሊና ችሎታ ተሰጥቷቸው እርስ በርሳቸው በወንድማማችነት መንፈስ መተጋገዝ አለባቸው። 3 | -------------------------------------------------------------------------------- /texts/arabic-1.c: -------------------------------------------------------------------------------- 1 | #include 2 | int الناس = 0; 3 | 4 | int الإء() { 5 | return الناس; 6 | } 7 | int main() { 8 | int ير = 1; 9 | assert(ير == 1 ); 10 | return الإء(); 11 | /* يولد جميع الناس أحرارًا ومتساوين في الكرامة والحقوق. وقد وهبوا عقلاً وضميرًا وعليهم أن يعامل بعضهم بعضًا بروح الإء. 
*/ 12 | } 13 | -------------------------------------------------------------------------------- /texts/arabic-1.txt: -------------------------------------------------------------------------------- 1 | نص عربي 2 | يولد جميع الناس أحرارًا ومتساوين في الكرامة والحقوق. وقد وهبوا عقلاً وضميرًا وعليهم أن يعامل بعضهم بعضًا بروح الإخاء. 3 | -------------------------------------------------------------------------------- /texts/armenian-1.txt: -------------------------------------------------------------------------------- 1 | Հայ գիր 2 | Բոլոր մարդիկ ծնվում են ազատ և հավասար արժանապատվության և իրավունքների մեջ: Նրանք օժտված են բանականությամբ և խղճով և պետք է միմյանց հանդեպ վարվեն եղբայրության ոգով: 3 | -------------------------------------------------------------------------------- /texts/bengali-1.txt: -------------------------------------------------------------------------------- 1 | বাংলা লিপি 2 | সমস্ত মানুষ স্বাধীনভাবে জন্মগ্রহণ করে এবং মর্যাদা ও অধিকারে সমান। তারা যুক্তি এবং বিবেক দ্বারা সমৃদ্ধ এবং ভ্রাতৃত্বের চেতনায় একে অপরের প্রতি আচরণ করা উচিত। 3 | -------------------------------------------------------------------------------- /texts/bidi-sec-1-allowed.tst: -------------------------------------------------------------------------------- 1 | texts/bidi-sec-1.c 2 | admin.: ERR_XID (Latin + U+2E Common)! 3 | -------------------------------------------------------------------------------- /texts/bidi-sec-1-c11.tst: -------------------------------------------------------------------------------- 1 | texts/bidi-sec-1.c 2 | ‮: ERR_SCRIPT (Latin + U+202E Common)! 3 | ⁦if: ERR_SCRIPT (Latin + U+2066 Common)! 4 | ⁩: ERR_SCRIPT (Latin + U+2069 Common)! 5 | ⁦: ERR_SCRIPT (Latin + U+2066 Common)! 6 | ‮: ERR_SCRIPT (Latin + U+202E Common)! 7 | ⁦: ERR_SCRIPT (Latin + U+2066 Common)! 
8 | -------------------------------------------------------------------------------- /texts/bidi-sec-1.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | // RLO U+202E, LRE U+202A, PDI U+2069, LRI U+2066 4 | 5 | int main() { 6 | bool isAdmin = false; 7 | /*‮ } ⁦if (isAdmin)⁩ ⁦ begin admins only */ 8 | printf("You are an admin.\n"); 9 | /* end admins only ‮ { ⁦*/ 10 | return 0; 11 | } 12 | -------------------------------------------------------------------------------- /texts/bidi-sec-1.py: -------------------------------------------------------------------------------- 1 | s = "א" * 100 # "א" is assigned 2 | print(s) 3 | -------------------------------------------------------------------------------- /texts/bidi-sec-1.tst: -------------------------------------------------------------------------------- 1 | texts/bidi-sec-1.c 2 | -------------------------------------------------------------------------------- /texts/bidi-sec-2-allowed.tst: -------------------------------------------------------------------------------- 1 | texts/bidi-sec-2.c 2 | ‮⁦ 3 | : ERR_BIDI (Latin + U+202E)! 4 | ⁦ 5 | : ERR_BIDI (Latin + U+2066)! 6 | ‮⁦// Check if admin⁩⁦) 7 | : ERR_BIDI (Latin + U+202E)! 8 | ⁦// Check if admin⁩⁦) 9 | : ERR_BIDI (Latin + U+2066)! 10 | ⁩⁦) 11 | : ERR_BIDI (Latin + U+2069)! 12 | ⁦) 13 | : ERR_BIDI (Latin + U+2066)! 14 | admin.: ERR_XID (Latin + U+2E Common)! 15 | -------------------------------------------------------------------------------- /texts/bidi-sec-2-ascii.tst: -------------------------------------------------------------------------------- 1 | texts/bidi-sec-2.c 2 | ‮⁦ 3 | : ERR_BIDI (Latin + U+202E)! 4 | ⁦ 5 | : ERR_BIDI (Latin + U+2066)! 6 | ‮⁦// Check if admin⁩⁦) 7 | : ERR_BIDI (Latin + U+202E)! 8 | ⁦// Check if admin⁩⁦) 9 | : ERR_BIDI (Latin + U+2066)! 10 | ⁩⁦) 11 | : ERR_BIDI (Latin + U+2069)! 12 | ⁦) 13 | : ERR_BIDI (Latin + U+2066)! 
14 | -------------------------------------------------------------------------------- /texts/bidi-sec-2-c11.tst: -------------------------------------------------------------------------------- 1 | texts/bidi-sec-2.c 2 | ‮⁦: ERR_SCRIPT (Latin + U+202E Common)! 3 | USER‮⁦: ERR_SCRIPT (Latin + U+202E Common)! 4 | admin⁩⁦: ERR_SCRIPT (Latin + U+2069 Common)! 5 | -------------------------------------------------------------------------------- /texts/bidi-sec-2.c: -------------------------------------------------------------------------------- 1 | #include 2 | #define USER 0‮⁦ 3 | #define ADMIN 1 4 | // RLO U+202E, LRE U+202A, PDI U+2069, LRI U+2066 5 | 6 | int main() { 7 | int accessLevel = USER; 8 | if (accessLevel != USER‮⁦// Check if admin⁩⁦) 9 | printf("You are an admin.\n"); 10 | return 0; 11 | } 12 | -------------------------------------------------------------------------------- /texts/bidi-sec-2.tst: -------------------------------------------------------------------------------- 1 | texts/bidi-sec-2.c 2 | ‮⁦ 3 | : ERR_BIDI (Latin + U+202E)! 4 | ⁦ 5 | : ERR_BIDI (Latin + U+2066)! 6 | ‮⁦// Check if admin⁩⁦) 7 | : ERR_BIDI (Latin + U+202E)! 8 | ⁦// Check if admin⁩⁦) 9 | : ERR_BIDI (Latin + U+2066)! 10 | ⁩⁦) 11 | : ERR_BIDI (Latin + U+2069)! 12 | ⁦) 13 | : ERR_BIDI (Latin + U+2066)! 14 | -------------------------------------------------------------------------------- /texts/bidi-sec-3.c: -------------------------------------------------------------------------------- 1 | #include 2 | // RLI U+2067 3 | 4 | int main() { 5 | /* Say hello; newline⁧ /*/ return 0 ; 6 | printf("Hello world\n"); 7 | return 0; 8 | } 9 | -------------------------------------------------------------------------------- /texts/bidi-sec-4.cc: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | int main() { 4 | constexpr char password[] = "12345"; 5 | printf(R"("א(ב)"? 
password: ")"); 6 | } 7 | -------------------------------------------------------------------------------- /texts/bopomofo-1.txt: -------------------------------------------------------------------------------- 1 | ㄅㄞˇ ㄎㄜ ㄑㄩㄢˊ ㄕㄨ 百科全書 百科全书 2 | ㄓㄨˋ ㄧㄣ ㄈㄨˊ ㄏㄠˋ 3 | -------------------------------------------------------------------------------- /texts/bulgarian-1.txt: -------------------------------------------------------------------------------- 1 | български (кирилица) 2 | Всички човешки същества се раждат свободни и равни по достойнство и права. Те са надарени с разум и съвест и трябва да се отнасят един към друг в дух на братство. 3 | -------------------------------------------------------------------------------- /texts/chinese-s-1.txt: -------------------------------------------------------------------------------- 1 | 简体中文) 2 | 人人生而自由,在尊严和权利上一律平等。 他们被赋予了理性和良知,应该本着兄弟情谊的精神相互对待。 3 | -------------------------------------------------------------------------------- /texts/chinese-trad-1.txt: -------------------------------------------------------------------------------- 1 | 中國傳統的) 2 | 人人生而自由,在尊嚴和權利上一律平等。 他們被賦予了理性和良知,應該本著兄弟情誼的精神相互對待。 3 | -------------------------------------------------------------------------------- /texts/cyrillic-1.txt: -------------------------------------------------------------------------------- 1 | Кириллица 2 | Все люди рождаются свободными и равными в своем достоинстве и правах. Они наделены разумом и совестью и должны действовать по отношению друг к другу в духе братства. 3 | -------------------------------------------------------------------------------- /texts/english-1.txt: -------------------------------------------------------------------------------- 1 | All human beings are born free and equal in dignity and rights. They are endowed with reason and conscience and should act towards one another in a spirit of brotherhood. 
2 | -------------------------------------------------------------------------------- /texts/farsi-1.txt: -------------------------------------------------------------------------------- 1 | خط فارسی 2 | همه انسانها آزاد به دنیا می آیند و از نظر حیثیت و حقوق با هم برابرند. آنها دارای عقل و وجدان هستند و باید نسبت به یکدیگر با روحیه برادری رفتار کنند. 3 | -------------------------------------------------------------------------------- /texts/georgian-1.txt: -------------------------------------------------------------------------------- 1 | ქართული დამწერლობა 2 | ყველა ადამიანი იბადება თავისუფალი და თანასწორი ღირსებითა და უფლებებით. ისინი დაჯილდოვებულნი არიან გონიერებითა და სინდისით და ერთმანეთის მიმართ ძმობის სულისკვეთებით უნდა მოიქცნენ. 3 | -------------------------------------------------------------------------------- /texts/greek-1.txt: -------------------------------------------------------------------------------- 1 | Ελληνική γραφή 2 | Όλοι οι άνθρωποι γεννιούνται ελεύθεροι και ίσοι σε αξιοπρέπεια και δικαιώματα. Είναι προικισμένοι με λογική και συνείδηση και πρέπει να ενεργούν ο ένας προς τον άλλον με πνεύμα αδελφοσύνης. 3 | -------------------------------------------------------------------------------- /texts/gujarati-1.txt: -------------------------------------------------------------------------------- 1 | ગુજરાતી લિપિ 2 | બધા મનુષ્યો સ્વતંત્ર જન્મે છે અને ગૌરવ અને અધિકારોમાં સમાન છે. તેઓ તર્ક અને વિવેકથી સંપન્ન છે અને તેઓ એકબીજા પ્રત્યે ભાઈચારાની ભાવનાથી વર્તે છે. 3 | -------------------------------------------------------------------------------- /texts/hebrew-1.txt: -------------------------------------------------------------------------------- 1 | כתב עברי 2 | כל בני האדם נולדו חופשיים ושווים בכבוד ובזכויות. הם ניחנים בשכל ובמצפון וצריכים לפעול זה כלפי זה ברוח של אחווה. 
3 | -------------------------------------------------------------------------------- /texts/hindi-1.txt: -------------------------------------------------------------------------------- 1 | देवनागरी लिपि 2 | सभी मनुष्य स्वतंत्र पैदा होते हैं और सम्मान और अधिकारों में समान होते हैं। वे तर्क और विवेक से संपन्न हैं और उन्हें भाईचारे की भावना से एक दूसरे के प्रति कार्य करना चाहिए। 3 | -------------------------------------------------------------------------------- /texts/homo-1-p4.tst: -------------------------------------------------------------------------------- 1 | texts/homo-1.c 2 | а: ERR_SCRIPTS (Latin + U+430 Cyrillic)! 3 | а: ERR_SCRIPTS (Latin + U+430 Cyrillic)! 4 | -------------------------------------------------------------------------------- /texts/homo-1.c: -------------------------------------------------------------------------------- 1 | #include 2 | int main() { 3 | int a = 2; // which is which? 4 | int а = 3; 5 | assert(a == 2); // OK 6 | assert(а == 3); // OK 7 | return 0; 8 | } 9 | -------------------------------------------------------------------------------- /texts/homo-1.tst: -------------------------------------------------------------------------------- 1 | texts/homo-1.c 2 | а: ERR_SCRIPTS (Latin + U+430 Cyrillic)! 3 | а: ERR_SCRIPTS (Latin + U+430 Cyrillic)! 4 | -------------------------------------------------------------------------------- /texts/homo-sec-1-p1.tst: -------------------------------------------------------------------------------- 1 | texts/homo-sec-1.c 2 | сhесk: ERR_XID (Latin + U+441 Cyrillic)! 3 | СНЕСК: ERR_XID (Latin + U+421 Cyrillic)! 
4 | -------------------------------------------------------------------------------- /texts/homo-sec-1.c: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | int CHECK (const char *arg) { 4 | return strcmp(arg, "check") == 0; 5 | } 6 | int СНЕСК (const char *arg) { 7 | return strcmp(arg, "сhесk") == 0; 8 | } 9 | 10 | int main () { 11 | return СНЕСК("check"); 12 | } 13 | -------------------------------------------------------------------------------- /texts/homo-sec-1.js: -------------------------------------------------------------------------------- 1 | const [ ENV_PROD, ENV_DEV ] = [ 'PRODUCTION', 'DEVELOPMENT']; 2 | /* … */ 3 | const environment = 'PRODUCTION'; 4 | /* … */ 5 | function isUserAdmin(user) { 6 | if(environmentǃ=ENV_PROD){ 7 | // bypass authZ checks in DEV 8 | return true; 9 | } 10 | 11 | /* … */ 12 | return false; 13 | } 14 | -------------------------------------------------------------------------------- /texts/homo-sec-1.py: -------------------------------------------------------------------------------- 1 | print (int('৪୨')) 2 | # => 42 3 | print ('{٥}'.format('zero', 'one', 'two', 'three', 'four', 'five')) 4 | # => five 5 | -------------------------------------------------------------------------------- /texts/homo-sec-1.tst: -------------------------------------------------------------------------------- 1 | texts/homo-sec-1.c 2 | сhесk: ERR_SCRIPTS (Latin + U+441 Cyrillic)! 3 | СНЕСК: ERR_SCRIPTS (Latin + U+421 Cyrillic)! 
4 | -------------------------------------------------------------------------------- /texts/homo-sec-2.c: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | enum env_e { 4 | ENV_PRODUCTION, 5 | ENV_DEVELOPMENT 6 | }; 7 | /* … */ 8 | const enum env_e environment = ENV_PRODUCTION; 9 | /* … */ 10 | bool isUserAdmin(const char *user) { 11 | if(environmentǃ=ENV_PRODUCTION){ 12 | // bypass authZ checks in DEVELOPMENT 13 | return true; 14 | } 15 | 16 | /* … */ 17 | return false; 18 | } 19 | -------------------------------------------------------------------------------- /texts/hungarian-1.txt: -------------------------------------------------------------------------------- 1 | Magyar 2 | Minden emberi lény szabadnak és egyenlőnek születik méltóságában és jogaiban. Ésszel és lelkiismerettel rendelkeznek, és a testvériség szellemében kell viselkedniük egymással. 3 | -------------------------------------------------------------------------------- /texts/igbo-1.txt: -------------------------------------------------------------------------------- 1 | Igbo 2 | A mụrụ mmadụ niile nweere onwe ha nhata na ugwu na ikike. Enyere ha uche na akọ na uche ma kwesị ịkpakọrịta ibe ha n'ime mmụọ nke ụmụnna. 
3 | -------------------------------------------------------------------------------- /texts/japanese-1.txt: -------------------------------------------------------------------------------- 1 | 日本語の台本 2 | すべての人間は自由に生まれ、尊厳と権利において平等です。 彼らは理性と良心に恵まれており、兄弟愛の精神でお互いに向かって行動する必要があります。 3 | -------------------------------------------------------------------------------- /texts/japanese-2.txt: -------------------------------------------------------------------------------- 1 | ワンワン 2 | Tシャツを3枚購入しました。 3 | -------------------------------------------------------------------------------- /texts/kannada-1.txt: -------------------------------------------------------------------------------- 1 | ಕನ್ನಡ ಲಿಪಿ 2 | ಎಲ್ಲಾ ಮಾನವರು ಸ್ವತಂತ್ರವಾಗಿ ಮತ್ತು ಘನತೆ ಮತ್ತು ಹಕ್ಕುಗಳಲ್ಲಿ ಸಮಾನವಾಗಿ ಹುಟ್ಟಿದ್ದಾರೆ. ಅವರು ವಿವೇಚನೆ ಮತ್ತು ಆತ್ಮಸಾಕ್ಷಿಯನ್ನು ಹೊಂದಿದ್ದಾರೆ ಮತ್ತು ಸಹೋದರತ್ವದ ಮನೋಭಾವದಿಂದ ಪರಸ್ಪರ ವರ್ತಿಸಬೇಕು. 3 | -------------------------------------------------------------------------------- /texts/khmer-1.txt: -------------------------------------------------------------------------------- 1 | អក្សរខ្មែរ 2 | មនុស្សគ្រប់រូបកើតមកមានសេរីភាព និងស្មើភាពគ្នាក្នុងសេចក្តីថ្លៃថ្នូរ និងសិទ្ធិ។ ពួកគេត្រូវបានផ្តល់ដោយហេតុផល និងមនសិការ ហើយគួរតែប្រព្រឹត្តចំពោះគ្នាទៅវិញទៅមកក្នុងស្មារតីភាតរភាព។ 3 | -------------------------------------------------------------------------------- /texts/korean-1.txt: -------------------------------------------------------------------------------- 1 | 한글 스크립트 2 | 모든 인간은 태어날 때부터 자유롭고 존엄성과 권리가 평등합니다. 그들은 이성과 양심을 부여받았으며 형제애의 정신으로 서로를 대해야 합니다. 
3 | -------------------------------------------------------------------------------- /texts/korean-2.txt: -------------------------------------------------------------------------------- 1 | ฝ오::ʉ 2 | $ฝ오::ʉ 3 | 4 | -------------------------------------------------------------------------------- /texts/lao-1.txt: -------------------------------------------------------------------------------- 1 | ອັກສອນລາວ 2 | ມະນຸດທຸກຄົນເກີດມາມີເສລີພາບ ແລະສະເໝີພາບໃນກຽດສັກສີ ແລະສິດທິ. ເຂົາ​ເຈົ້າ​ມີ​ເຫດຜົນ​ແລະ​ສະຕິ​ຮູ້ສຶກ​ຜິດ​ຊອບ ແລະ​ຄວນ​ປະຕິບັດ​ຕໍ່​ກັນ​ແລະ​ກັນ​ໃນ​ຈິດ​ໃຈ​ຂອງ​ພີ່​ນ້ອງ. 3 | -------------------------------------------------------------------------------- /texts/latin-1.txt: -------------------------------------------------------------------------------- 1 | aʙȼ1 2 | aʙȼ2 3 | aʙȼ3 -------------------------------------------------------------------------------- /texts/malayalam-1.txt: -------------------------------------------------------------------------------- 1 | മലയാളം ലിപി 2 | എല്ലാ മനുഷ്യരും സ്വതന്ത്രരും അന്തസ്സിലും അവകാശങ്ങളിലും തുല്യരായി ജനിച്ചവരാണ്. അവർ യുക്തിയും മനസ്സാക്ഷിയും ഉള്ളവരാണ്, അവർ പരസ്പരം സാഹോദര്യത്തിന്റെ മനോഭാവത്തിൽ പ്രവർത്തിക്കണം. -------------------------------------------------------------------------------- /texts/marathi-1.txt: -------------------------------------------------------------------------------- 1 | मराठी लिपी 2 | सर्व माणसं स्वतंत्रपणे जन्माला येतात आणि सन्मान आणि अधिकारांमध्ये समान असतात. ते तर्क आणि विवेकाने संपन्न आहेत आणि त्यांनी एकमेकांशी बंधुभावाच्या भावनेने वागले पाहिजे. 
3 | -------------------------------------------------------------------------------- /texts/math-1.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | int main(void) { 5 | int λ = 1; 6 | double π = M_PI; 7 | double π2 = M_PI * 2.; 8 | printf ("%d %f %f\n", λ, π, π2); 9 | } 10 | -------------------------------------------------------------------------------- /texts/math-2.c: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | int main() 4 | { 5 | double εₙ₊₁, εₙ, εₙ₋₁, β₁, β₂, β₃, Δtₙ, k; 6 | const double Δtₙ₊₁ = pow(εₙ₊₁, β₁/k) * pow(εₙ, β₂/k) * pow(εₙ₋₁, β₃/k) * Δtₙ; 7 | return 0; 8 | } 9 | -------------------------------------------------------------------------------- /texts/math-2.cc: -------------------------------------------------------------------------------- 1 | // GH 12 wrong greek ERR_SCRIPTS with subscripts 2 | #include 3 | 4 | int main() 5 | { 6 | double εₙ₊₁, εₙ, εₙ₋₁, β₁, β₂, β₃, Δtₙ, k; 7 | const auto Δtₙ₊₁ = std::pow(εₙ₊₁, β₁/k) * std::pow(εₙ, β₂/k) * std::pow(εₙ₋₁, β₃/k) * Δtₙ; 8 | return 0; 9 | } 10 | -------------------------------------------------------------------------------- /texts/myanmar-1.txt: -------------------------------------------------------------------------------- 1 | မြန်မာစာပဲဗျ။ 2 | လူသားတိုင်းသည် ဂုဏ်သိက္ခာနှင့် အခွင့်အရေးများဖြင့် လွတ်လပ်စွာ မွေးဖွားလာကြသည်။ ၎င်းတို့သည် ဆင်ခြင်တုံတရားနှင့် အသိတရားနှင့် ပြည့်စုံပြီး အချင်းချင်း ညီရင်းအစ်ကို စိတ်ဓာတ်ဖြင့် ပြုမူသင့်သည်။ 3 | -------------------------------------------------------------------------------- /texts/nfc-1.c: -------------------------------------------------------------------------------- 1 | #include 2 | int main() { 3 | int Café = 1; // with Mn. 
Cafe\u0301 is not in NFC -Wnormalized 4 | assert(Café == 1); // Caf\u00e9 without Mn, same NFC 5 | return 0; 6 | } 7 | -------------------------------------------------------------------------------- /texts/nfkc-1.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | int main() { 5 | int ℌ = 1; // which is which? 6 | int Ⅸ = 2; 7 | assert(H == 1); // same NFKC as ℌ 8 | assert(IX == 2); // same NFKC as Ⅸ 9 | double ℯ = M_E; 10 | assert(e == M_E); // same NFKC as ℯ 11 | int ℨ = 3; 12 | assert(Z == 3); // same NFKC as ℨ 13 | return 0; 14 | } 15 | -------------------------------------------------------------------------------- /texts/norm-sec-1.cperl: -------------------------------------------------------------------------------- 1 | use utf8; 2 | use feature 'unicode_strings'; 3 | $x = 8; 4 | print $x²; # => 64 5 | -------------------------------------------------------------------------------- /texts/norm-sec-1.py: -------------------------------------------------------------------------------- 1 | xⁿ = 8 2 | print(xn) # => 8 3 | -------------------------------------------------------------------------------- /texts/norm-sec-1.raku: -------------------------------------------------------------------------------- 1 | my $x = 8; 2 | print $x²; # => 64 3 | -------------------------------------------------------------------------------- /texts/odia-1.txt: -------------------------------------------------------------------------------- 1 | ଓଡିଆ ସ୍କ୍ରିପ୍ଟ | 2 | ସମସ୍ତ ମଣିଷ ମୁକ୍ତ ଏବଂ ସମ୍ମାନ ଏବଂ ଅଧିକାରରେ ସମାନ ଭାବରେ ଜନ୍ମଗ୍ରହଣ କରନ୍ତି | ସେମାନଙ୍କୁ କାରଣ ଏବଂ ବିବେକ ଦିଆଯାଇଛି ଏବଂ ଭାଇଚାରା ଭାବନାରେ ପରସ୍ପର ପ୍ରତି କାର୍ଯ୍ୟ କରିବା ଉଚିତ୍ | 3 | -------------------------------------------------------------------------------- /texts/punjabi-1.txt: -------------------------------------------------------------------------------- 1 | ਗੁਰਮੁਖੀ ਲਿਪੀ 2 | ਸਾਰੇ ਮਨੁੱਖ ਅਜ਼ਾਦ ਪੈਦਾ ਹੋਏ ਹਨ ਅਤੇ ਸਨਮਾਨ ਅਤੇ ਅਧਿਕਾਰਾਂ ਵਿੱਚ ਬਰਾਬਰ ਹਨ। ਉਹ ਤਰਕ ਅਤੇ ਜ਼ਮੀਰ ਨਾਲ 
ਸੰਪੰਨ ਹਨ ਅਤੇ ਉਹਨਾਂ ਨੂੰ ਇੱਕ ਦੂਜੇ ਪ੍ਰਤੀ ਭਾਈਚਾਰੇ ਦੀ ਭਾਵਨਾ ਨਾਲ ਕੰਮ ਕਰਨਾ ਚਾਹੀਦਾ ਹੈ। 3 | -------------------------------------------------------------------------------- /texts/scots-gaelic-1.txt: -------------------------------------------------------------------------------- 1 | Albannaich-gaelic 2 | Tha mac an duine air a bhreith an-asgaidh agus co-ionann ann an urram agus còirichean. Tha iad air an toirt seachad le adhbhar agus cogais agus bu chòir dhaibh a bhith ag obair an aghaidh a chèile ann an spiorad bràithreachas. 3 | -------------------------------------------------------------------------------- /texts/sinhala-1.txt: -------------------------------------------------------------------------------- 1 | සිංහල අකුරු 2 | සියලුම මිනිසුන් නිදහස් හා සමාන ගෞරවයෙන් හා අයිතිවාසිකම්වලින් උපත ලබයි. ඔවුන් තර්කානුකූලව හා හෘදය සාක්ෂියෙන් යුක්ත වන අතර එකිනෙකාට සහෝදරත්වයෙන් යුතුව කටයුතු කළ යුතුය. 3 | -------------------------------------------------------------------------------- /texts/swedish-1.txt: -------------------------------------------------------------------------------- 1 | latinsk skrift 2 | Alla människor är födda fria och lika i värdighet och rättigheter. De är utrustade med förnuft och samvete och bör handla mot varandra i en anda av broderskap. 3 | -------------------------------------------------------------------------------- /texts/tamil-1.txt: -------------------------------------------------------------------------------- 1 | தமிழ் எழுத்து 2 | எல்லா மனிதர்களும் சுதந்திரமாகவும் கண்ணியத்திலும் உரிமைகளிலும் சமமாகப் பிறந்தவர்கள். அவர்கள் பகுத்தறிவும் மனசாட்சியும் கொண்டவர்கள் மற்றும் சகோதரத்துவ உணர்வோடு ஒருவருக்கொருவர் செயல்பட வேண்டும். 3 | -------------------------------------------------------------------------------- /texts/telugu-1.txt: -------------------------------------------------------------------------------- 1 | తెలుగు లిపి 2 | మానవులందరూ స్వేచ్ఛగా మరియు గౌరవం మరియు హక్కులలో సమానంగా జన్మించారు. 
వారు హేతువు మరియు మనస్సాక్షిని కలిగి ఉంటారు మరియు సోదరభావంతో ఒకరితో ఒకరు వ్యవహరించాలి. 3 | -------------------------------------------------------------------------------- /texts/thai-1.txt: -------------------------------------------------------------------------------- 1 | อักษรไทย 2 | มนุษย์ทุกคนเกิดมาอย่างเสรีและเท่าเทียมกันในศักดิ์ศรีและสิทธิ พวกเขามีเหตุผลและมโนธรรมและควรปฏิบัติต่อกันด้วยจิตวิญญาณแห่งภราดรภาพ 3 | -------------------------------------------------------------------------------- /texts/turkish-1.txt: -------------------------------------------------------------------------------- 1 | Türkçe 2 | Bütün insanlar hür, haysiyet ve haklar bakımından eşit doğarlar. Akıl ve vicdan sahibidirler ve birbirlerine karşı kardeşlik ruhuyla hareket etmelidirler. 3 | -------------------------------------------------------------------------------- /texts/urdu-1.txt: -------------------------------------------------------------------------------- 1 | اردو 2 | تمام انسان آزاد پیدا ہوئے ہیں اور عزت اور حقوق میں برابر ہیں۔ وہ عقل اور ضمیر کے حامل ہیں اور انہیں بھائی چارے کے جذبے سے ایک دوسرے کے ساتھ برتاؤ کرنا چاہیے۔ 3 | -------------------------------------------------------------------------------- /texts/vietnamese-1.txt: -------------------------------------------------------------------------------- 1 | Chữ quốc ngữ 2 | Tất cả con người sinh ra đều tự do, bình đẳng về nhân phẩm và quyền. Họ được phú cho lý trí và lương tâm và nên hành động với nhau trong tinh thần anh em. 
3 | -------------------------------------------------------------------------------- /u8id_private.h: -------------------------------------------------------------------------------- 1 | #ifndef _U8ID_PRIVATE_H 2 | #define _U8ID_PRIVATE_H 3 | 4 | #ifdef HAVE_CONFIG_H 5 | # include "config.h" 6 | #endif 7 | #include 8 | #include 9 | #include 10 | 11 | #if defined _WIN32 || defined __CYGWIN__ 12 | # define EXTERN __declspec(dllexport) 13 | # define LOCAL 14 | #elif __GNUC__ >= 4 15 | # define EXTERN __attribute__((visibility("default"))) 16 | # define LOCAL __attribute__((visibility("hidden"))) 17 | #else 18 | # define EXTERN 19 | # define LOCAL 20 | #endif 21 | 22 | #ifndef PERF_TEST 23 | // they are all too slow 24 | # undef USE_ALLOWED_CROAR 25 | # undef USE_MARK_CROAR 26 | # undef USE_NORM_CROAR 27 | #else 28 | # define USE_ALLOWED_CROAR 29 | # define USE_MARK_CROAR 30 | # define USE_NORM_CROAR 31 | #endif 32 | 33 | #if __GNUC__ >= 3 34 | # define _expect(expr, value) __builtin_expect((expr), (value)) 35 | # define INLINE static inline 36 | #else 37 | # define _expect(expr, value) (expr) 38 | # define INLINE static 39 | #endif 40 | #ifndef likely 41 | # define likely(expr) _expect((long)((expr) != 0), 1) 42 | # define unlikely(expr) _expect((long)((expr) != 0), 0) 43 | #endif 44 | // fully parenthesized so each macro expands as a single operand 45 | #define ARRAY_SIZE(x) (sizeof(x) / sizeof(*(x))) 46 | #define strEQ(s1, s2) (!strcmp((s1), (s2))) 47 | #define strEQc(s1, s2) (!strcmp((s1), s2 "")) 48 | 49 | #define NFC 0 50 | #define NFD 1 51 | #define NFKC 2 52 | #define NFKD 3 53 | #define FCD 4 54 | #define FCC 5 55 | #define C11_6 7 56 | #define C26_4 8 57 | 58 | // allowed set of identifiers. TR31 --xid tokenizer options 59 | // we need XID, the default, as first for uninitialized options. 60 | enum xid_e { 61 | XID, // ID minus NFKC quirks, labelled stable, the default. 62 | ID, // all letters, plus numbers, punctuation and marks. With exotic scripts. 63 | ALLOWED, // TR39 ID with only recommended scripts.
Allowed IdentifierStatus. 64 | SAFEC26, // practical XID with TR39 security measures, see P2528R1. 65 | C23, // XID with NFC requirement from C23 (P1949, N2828). 66 | C11, // the stable insecure AltId ranges from the C11 standard, Annex D. 67 | ALLUTF8, // all > 128, e.g. D, php, nim, crystal. 68 | ASCII, // only ASCII letters. 69 | }; 70 | #define XID 0 71 | #define ID 1 72 | #define ALLOWED 2 73 | #define SAFEC26 3 74 | #define C23 4 75 | #define C11 5 76 | #define ALLUTF8 6 77 | #define ASCII 7 78 | #define NONE 8 79 | #define FIRST_XID_E XID 80 | #define LAST_XID_E ASCII 81 | 82 | #define _XSTR(s) _STR(s) 83 | #define _STR(s) #s 84 | #define CAT(a, b) a##b 85 | #define PASTE(a, b) CAT(a, b) 86 | #define JOIN(prefix, name) PASTE(prefix, PASTE(_, name)) 87 | 88 | #ifdef U8ID_NORM 89 | # if U8ID_NORM == NFC 90 | # define U8ID_NORM_DEFAULT U8ID_NFC 91 | # elif U8ID_NORM == NFD 92 | # define U8ID_NORM_DEFAULT U8ID_NFD 93 | # elif U8ID_NORM == NFKD 94 | # define U8ID_NORM_DEFAULT U8ID_NFKD 95 | # elif U8ID_NORM == NFKC 96 | # define U8ID_NORM_DEFAULT U8ID_NFKC 97 | # elif U8ID_NORM == FCC 98 | # define U8ID_NORM_DEFAULT U8ID_FCC 99 | # elif U8ID_NORM == FCD 100 | # define U8ID_NORM_DEFAULT U8ID_FCD 101 | # else 102 | # error "Invalid U8ID_NORM "_XSTR(U8ID_NORM) 103 | # endif 104 | #else 105 | # define U8ID_NORM_DEFAULT U8ID_NFC 106 | #endif 107 | 108 | #ifdef U8ID_PROFILE 109 | # if U8ID_PROFILE == 1 110 | # define U8ID_PROFILE_DEFAULT U8ID_PROFILE_1 111 | # elif U8ID_PROFILE == 2 112 | # define U8ID_PROFILE_DEFAULT U8ID_PROFILE_2 113 | # elif U8ID_PROFILE == 3 114 | # define U8ID_PROFILE_DEFAULT U8ID_PROFILE_3 115 | # elif U8ID_PROFILE == 4 116 | # define U8ID_PROFILE_DEFAULT U8ID_PROFILE_4 117 | # elif U8ID_PROFILE == 5 118 | # define U8ID_PROFILE_DEFAULT U8ID_PROFILE_5 119 | # elif U8ID_PROFILE == 6 120 | # define U8ID_PROFILE_DEFAULT U8ID_PROFILE_6 121 | # elif U8ID_PROFILE == C26_4 122 | # define U8ID_PROFILE_DEFAULT U8ID_PROFILE_C26_4 123 | # define 
U8ID_PROFILE_SAFEC26 124 | # elif U8ID_PROFILE == C11_6 125 | # define U8ID_PROFILE_DEFAULT U8ID_PROFILE_C11_6 126 | # define U8ID_PROFILE_C11STD 127 | # else 128 | # error "Invalid U8ID_PROFILE "_XSTR(U8ID_PROFILE) 129 | # endif 130 | #elif defined U8ID_PROFILE_SAFEC26 131 | # define U8ID_PROFILE_DEFAULT U8ID_PROFILE_C26_4 132 | # define U8ID_PROFILE C26_4 133 | #elif defined U8ID_PROFILE_C11STD 134 | # define U8ID_PROFILE_DEFAULT U8ID_PROFILE_C11_6 135 | # define U8ID_PROFILE C11_6 136 | #else 137 | // Moderately Restrictive 138 | # define U8ID_PROFILE_DEFAULT U8ID_PROFILE_4 139 | #endif 140 | 141 | #ifdef DISABLE_U8ID_TR31 142 | # define DISABLE_CHECK_XID 143 | # define U8ID_TR31_DEFAULT 0 144 | # undef U8ID_TR31 145 | # undef ENABLE_CHECK_XID 146 | #endif 147 | #ifdef U8ID_TR31 148 | //# pragma message("U8ID_TR31=" _XSTR(U8ID_TR31)) 149 | # if U8ID_TR31 == NONE 150 | # define DISABLE_CHECK_XID 151 | # define U8ID_TR31_DEFAULT 0 152 | # else 153 | # define ENABLE_CHECK_XID 154 | # if U8ID_TR31 == ALLOWED 155 | # define U8ID_TR31_DEFAULT U8ID_TR31_ALLOWED 156 | # elif U8ID_TR31 == ASCII 157 | # define U8ID_TR31_DEFAULT U8ID_TR31_ASCII 158 | # elif U8ID_TR31 == SAFEC26 159 | # define U8ID_TR31_DEFAULT U8ID_TR31_SAFEC26 160 | # elif U8ID_TR31 == ID 161 | # define U8ID_TR31_DEFAULT U8ID_TR31_ID 162 | # elif U8ID_TR31 == XID 163 | # define U8ID_TR31_DEFAULT U8ID_TR31_XID 164 | # elif U8ID_TR31 == C11 165 | # define U8ID_TR31_DEFAULT U8ID_TR31_C11 166 | # elif U8ID_TR31 == C23 167 | # define U8ID_TR31_DEFAULT U8ID_TR31_C23 168 | # elif U8ID_TR31 == ALLUTF8 169 | # define U8ID_TR31_DEFAULT U8ID_TR31_ALLUTF8 170 | # endif 171 | # endif 172 | #else 173 | # define U8ID_TR31_DEFAULT U8ID_TR31_XID 174 | #endif 175 | 176 | #if defined U8ID_NORM && (U8ID_NORM != NFC) 177 | # if (U8ID_TR31 == SAFEC26) || (U8ID_TR31 == C23) 178 | # error "Invalid U8ID_NORM with U8ID_TR31" 179 | # endif 180 | #endif 181 | 182 | #include "htable.h" 183 | 184 | #define U8ID_CTX_TRESH 5 185 | 
#define U8ID_SCR_TRESH 8 186 | struct ctx_t { 187 | uint8_t count; 188 | uint8_t has_han : 1; 189 | uint8_t is_japanese : 1; 190 | uint8_t is_chinese : 1; 191 | uint8_t is_korean : 1; 192 | uint8_t is_rtl : 1; // Hebrew or Arabic 193 | uint32_t last_cp; // only set on errors 194 | union { 195 | uint64_t scr64; // room for 8 scripts 196 | uint8_t scr8[U8ID_SCR_TRESH]; 197 | // we need more than 8 only with insecure 198 | // profiles, or when we manually add extra scripts. 199 | uint8_t *u8p; // or if count > 8 200 | }; 201 | struct htable *htab; 202 | struct htable *htab1; 203 | }; 204 | 205 | // clang-format off 206 | #if (defined(__GNUC__) && ((__GNUC__ * 100) + __GNUC_MINOR__) >= 480) 207 | # define GCC_DIAG_PRAGMA(x) _Pragma (#x) 208 | # define GCC_DIAG_IGNORE(x) \ 209 | GCC_DIAG_PRAGMA (GCC diagnostic ignored #x) 210 | #else 211 | # define GCC_DIAG_IGNORE(w) 212 | #endif 213 | 214 | LOCAL enum u8id_norm u8ident_norm(void); 215 | LOCAL enum u8id_profile u8ident_profile(void); 216 | LOCAL enum u8id_options u8ident_tr31(void); 217 | LOCAL unsigned u8ident_options(void); 218 | LOCAL unsigned u8ident_maxlength(void); 219 | LOCAL const char *u8ident_errstr(int errcode); 220 | // from u8idnorm.c 221 | LOCAL uint32_t dec_utf8(char **strp); 222 | LOCAL char *enc_utf8(char *dest, size_t *lenp, const uint32_t cp); 223 | 224 | #endif // _U8ID_PRIVATE_H 225 | -------------------------------------------------------------------------------- /u8idlint.1: -------------------------------------------------------------------------------- 1 | .\" DO NOT MODIFY THIS FILE! It was generated by help2man 1.49.3. 2 | .TH U8IDLINT "1" "December 2024" "u8idlint 0.3.3_d005" "U8IDENT Manual 0.3.3_d005" 3 | .SH NAME 4 | u8idlint \- manual page for u8idlint 0.3.3_d005 5 | .SH SYNOPSIS 6 | .B u8idlint 7 | [\fI\,OPTIONS\/\fR] [\fI\,dirs or files\/\fR]... 
8 | .SH DESCRIPTION 9 | u8idlint 0.3.3_d005 10 | .SS "OPTIONS:" 11 | .TP 12 | \fB\-n\fR|\-\-norm=nfc,nfkc,nfd,nfkc 13 | default: nfc 14 | .IP 15 | set to nfkd by default for python 16 | .TP 17 | \fB\-p\fR|\-\-profile=1,2,3,4,5,6,c26_4,c11_6 18 | default: c26_4 19 | .IP 20 | TR39 unicode mixed\-script security profile for identifiers: 21 | .TP 22 | 1 23 | ASCII. Sets xid ascii. 24 | .TP 25 | 2 26 | Single script 27 | .TP 28 | 3 29 | Highly Restrictive 30 | .TP 31 | 4 32 | Moderately Restrictive 33 | .TP 34 | 5 35 | Minimally Restrictive 36 | .TP 37 | 6 38 | Unrestricted 39 | .TP 40 | c11_6 41 | C11STD. Sets xid c11. 42 | .TP 43 | c26_4 44 | SAFEC26 (i.e. 4 with Greek). Sets xid safec26. 45 | .TP 46 | \fB\-x\fR|\-\-xid=ascii,allowed,safec26,id,xid,c11,c23,allutf8 47 | default: allowed 48 | .IP 49 | TR31 set of identifiers: 50 | .TP 51 | ascii 52 | only ASCII letters, punctuations. plus numbers 53 | .TP 54 | allowed 55 | tr31 with only recommended scripts, IdentifierStatus 56 | .TP 57 | safec26 58 | allowed but different Identifier_Type and NFC 59 | .TP 60 | id 61 | all letters. plus numbers, punctuations and combining marks 62 | .TP 63 | xid 64 | stable id subset, no NFKC quirks 65 | .TP 66 | c23 67 | xid with NFC requirement from C23 68 | .TP 69 | c11 70 | some AltId unicode ranges from C11 71 | .TP 72 | allutf8 73 | allow all >128. e.g. php, nim, crystal 74 | .TP 75 | \fB\-e\fR|\-\-ext=.c 76 | only this file extension 77 | .HP 78 | \fB\-r\fR|\-\-recursive 79 | .HP 80 | \fB\-v\fR|\-\-verbose 81 | .HP 82 | \fB\-q\fR|\-\-quiet 83 | .HP 84 | \fB\-\-help\fR 85 | .PP 86 | u8idlint checks all words in UTF\-8 source files for 87 | violations against various unicode security guidelines for identifiers. 88 | For special known file extensions it uses its default xid to parse identifiers. 89 | (i.e. *.c uses C11) 90 | It adds a special BIDI warning on bidi formatting chars and when the document 91 | is not in Hebrew nor Arabic. 
92 | .SS "SEE ALSO:" 93 | .IP 94 | u8ident.3 95 | .SS "AUTHOR:" 96 | .IP 97 | Reini Urban 98 | -------------------------------------------------------------------------------- /u8idlint.test: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | # cmd in pwd 3 | # texts in this dir 4 | ROOT="$(dirname "$0")" 5 | if [ "$ROOT" != "." ] && [ ! -d texts ]; then 6 | NEWTEXTS=1 7 | cp -r "$ROOT"/texts . || exit 0 8 | chmod +w texts || exit 0 9 | fi 10 | 11 | check3() { 12 | arg=$1 13 | file=texts/$2 14 | tst=texts/$3 15 | rst=$3.rst 16 | # shellcheck disable=SC2086 17 | ./u8idlint $arg "$file" > "$rst" 18 | if diff "$tst" "$rst"; then 19 | rm -- ./"$rst" 20 | else 21 | echo "$arg $3.rst failed" 22 | exit 1 # a bare exit would reuse echo's status 0 and hide the failure 23 | fi 24 | } 25 | 26 | check3 -xsafec26 homo-sec-1.c homo-sec-1.tst 27 | check3 -xc11 homo-sec-1.c homo-sec-1.tst 28 | check3 "-p1 -xc11" homo-sec-1.c homo-sec-1-p1.tst 29 | check3 "-p2 -xc11" homo-sec-1.c homo-sec-1.tst 30 | check3 "-p3 -xc11" homo-sec-1.c homo-sec-1.tst 31 | check3 "-p4 -xc11" homo-sec-1.c homo-sec-1.tst 32 | 33 | check3 -xsafec26 homo-1.c homo-1.tst 34 | check3 -xc11 homo-1.c homo-1.tst 35 | 36 | # ascii-xid ignore BIDI words here 37 | check3 -xascii bidi-sec-1.c bidi-sec-1.tst 38 | check3 -xallowed bidi-sec-1.c bidi-sec-1-allowed.tst 39 | check3 -xsafec26 bidi-sec-1.c bidi-sec-1.tst 40 | check3 -xid bidi-sec-1.c bidi-sec-1.tst 41 | check3 -xxid bidi-sec-1.c bidi-sec-1.tst 42 | check3 "" bidi-sec-1.c bidi-sec-1-c11.tst 43 | check3 -xc11 bidi-sec-1.c bidi-sec-1-c11.tst 44 | 45 | check3 -xascii bidi-sec-2.c bidi-sec-2-ascii.tst 46 | check3 -xallowed bidi-sec-2.c bidi-sec-2-allowed.tst 47 | check3 -xsafec26 bidi-sec-2.c bidi-sec-2.tst 48 | check3 -xid bidi-sec-2.c bidi-sec-2.tst 49 | check3 -xxid bidi-sec-2.c bidi-sec-2.tst 50 | check3 -xc11 bidi-sec-2.c bidi-sec-2-c11.tst 51 | check3 "" bidi-sec-2.c bidi-sec-2-c11.tst 52 | 53 | #./u8idlint -xallowed texts/homo-1.c > homo-1-allowed.rst && 54 | # diff
texts/homo-1-p4.tst homo-1-allowed.rst && rm homo-1-allowed.rst 55 | #./u8idlint -xsafec26 texts/homo-1.c > homo-1-c26.rst && 56 | # diff texts/homo-1-p4.tst homo-1-c26.rst && rm homo-1-c26.rst 57 | #./u8idlint -p4 -xc11 texts/homo-1.c > homo-1-c11.rst && 58 | # diff texts/homo-1-p4.tst homo-1-c11.rst && rm homo-1-c11.rst 59 | #./u8idlint -xxid texts/homo-1.c > homo-1-xid.rst && 60 | # diff texts/homo-1-p4.tst homo-1-xid.rst && rm homo-1-xid.rst 61 | #./u8idlint -xid texts/homo-1.c > homo-1-id.rst && 62 | # diff texts/homo-1-p4.tst homo-1-id.rst && rm homo-1-id.rst 63 | #./u8idlint -xc11 texts/bidi-sec-1.c > bidi-sec-1.rst && 64 | # diff texts/bidi-sec-1.tst bidi-sec-1.rst && rm bidi-sec-1.rst 65 | #./u8idlint -xc11 texts/bidi-sec-2.c > bidi-sec-2.rst && 66 | # diff texts/bidi-sec-2.tst bidi-sec-2.rst && rm bidi-sec-2.rst 67 | 68 | rm -f homo-sec-1.tst.rst 69 | #rm -f homo-1-p4.rst bidi-sec-1.rst bidi-sec-2.rst 70 | if [ -n "$NEWTEXTS" ]; then 71 | rm -rf texts || true 72 | fi 73 | -------------------------------------------------------------------------------- /u8idroar.c: -------------------------------------------------------------------------------- 1 | /* libu8ident - Check unicode security guidelines for identifiers. 2 | Copyright 2021 Reini Urban 3 | SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later 4 | 5 | use roaring bitmaps for some sets. 6 | Currently only confus_croar is faster than binary search in our ranges lists.
7 | */ 8 | 9 | #include "u8id_private.h" 10 | #ifdef HAVE_CROARING 11 | # include 12 | # include 13 | # include "roaring.c" 14 | # include "confus_croar.h" 15 | 16 | static roaring_bitmap_t *rc = NULL; 17 | 18 | # ifdef USE_ALLOWED_CROAR 19 | # include "allowed_croar.h" 20 | static roaring_bitmap_t *ra = NULL; 21 | # endif 22 | 23 | # ifdef USE_MARK_CROAR 24 | # define EXTERN_SCRIPTS 25 | # include "scripts.h" 26 | # undef EXTERN_SCRIPTS 27 | # include "mark.h" 28 | static roaring_bitmap_t *rm = NULL; 29 | # endif 30 | 31 | # ifdef USE_NORM_CROAR 32 | # include "nfkc_croar.h" 33 | # include "nfc_croar.h" 34 | # include "nfkd_croar.h" 35 | # include "nfd_croar.h" 36 | static roaring_bitmap_t *rnfkc_m = NULL; 37 | static roaring_bitmap_t *rnfc_m = NULL; 38 | static roaring_bitmap_t *rnfkc_n = NULL; 39 | static roaring_bitmap_t *rnfc_n = NULL; 40 | static roaring_bitmap_t *rnfkd_n = NULL; 41 | static roaring_bitmap_t *rnfd_n = NULL; 42 | # endif 43 | 44 | int u8ident_roar_init(void) { 45 | if (!rc) { 46 | rc = roaring_bitmap_portable_deserialize_safe((char *)confus_croar_bin, 47 | confus_croar_bin_len); 48 | if (!rc) 49 | return -1; 50 | } 51 | 52 | // These are disabled by default. 
Only used by perf 53 | # ifdef USE_NORM_CROAR 54 | 55 | # define DEF_DESERIALIZE_SAFE(rn, n) \ 56 | if (!rn) { \ 57 | rn = roaring_bitmap_portable_deserialize_safe( \ 58 | (char *)JOIN(n, croar_bin), JOIN(n, croar_bin_len)); \ 59 | if (!rn) \ 60 | return -1; \ 61 | } 62 | // load each bitmap from the serialized table of its own normalization form 63 | DEF_DESERIALIZE_SAFE(rnfkc_m, nfkc_m) 64 | DEF_DESERIALIZE_SAFE(rnfkc_n, nfkc_n) 65 | DEF_DESERIALIZE_SAFE(rnfc_m, nfc_m) 66 | DEF_DESERIALIZE_SAFE(rnfc_n, nfc_n) 67 | DEF_DESERIALIZE_SAFE(rnfkd_n, nfkd_n) 68 | DEF_DESERIALIZE_SAFE(rnfd_n, nfd_n) 69 | # endif 70 | # ifdef USE_ALLOWED_CROAR 71 | DEF_DESERIALIZE_SAFE(ra, allowed) 72 | # endif 73 | # ifdef USE_MARK_CROAR 74 | DEF_DESERIALIZE_SAFE(rm, mark) 75 | # endif 76 | # undef DEF_DESERIALIZE_SAFE 77 | return 0; 78 | } 79 | 80 | int u8ident_roar_free(void) { 81 | # define FREE_R(rc) \ 82 | if (rc) \ 83 | roaring_bitmap_free(rc); \ 84 | rc = NULL 85 | 86 | FREE_R(rc); 87 | 88 | # ifdef USE_ALLOWED_CROAR 89 | FREE_R(ra); 90 | # endif 91 | # ifdef USE_MARK_CROAR 92 | FREE_R(rm); 93 | # endif 94 | # ifdef USE_NORM_CROAR 95 | FREE_R(rnfkc_m); 96 | FREE_R(rnfkc_n); 97 | FREE_R(rnfc_m); 98 | FREE_R(rnfc_n); 99 | FREE_R(rnfkd_n); 100 | FREE_R(rnfd_n); 101 | # endif 102 | # undef FREE_R 103 | return 0; // matches the int prototype declared in u8idroar.h 104 | } 105 | 106 | EXTERN bool u8ident_is_confusable(const uint32_t cp) { 107 | return roaring_bitmap_contains(rc, cp); 108 | } 109 | 110 | # ifdef USE_ALLOWED_CROAR 111 | bool u8ident_roar_is_allowed(const uint32_t cp) { 112 | return roaring_bitmap_contains(ra, cp); 113 | } 114 | # endif 115 | 116 | # ifdef USE_MARK_CROAR 117 | bool u8ident_roar_is_mark(const uint32_t cp) { 118 | return roaring_bitmap_contains(rm, cp); 119 | } 120 | # endif 121 | 122 | # ifdef USE_NORM_CROAR 123 | bool u8ident_roar_maybe_nfkc(const uint32_t cp) { 124 | if (roaring_bitmap_contains(rnfkc_n, cp)) 125 | return true; 126 | return roaring_bitmap_contains(rnfkc_m, cp); 127 | } 128 | bool u8ident_roar_maybe_nfc(const uint32_t cp) { 129 | if (roaring_bitmap_contains(rnfc_n, cp)) 130
| return true; 131 | return roaring_bitmap_contains(rnfc_m, cp); 132 | } 133 | bool u8ident_roar_maybe_nfkd(const uint32_t cp) { 134 | return roaring_bitmap_contains(rnfkd_n, cp); 135 | } 136 | bool u8ident_roar_maybe_nfd(const uint32_t cp) { 137 | return roaring_bitmap_contains(rnfd_n, cp); 138 | } 139 | # endif 140 | 141 | #endif 142 | -------------------------------------------------------------------------------- /u8idroar.h: -------------------------------------------------------------------------------- 1 | /* libu8ident - Check unicode security guidelines for identifiers. 2 | Copyright 2021 Reini Urban 3 | SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later 4 | 5 | use roaring bitmaps for some sets. 6 | */ 7 | 8 | #include "u8id_private.h" 9 | #include 10 | int u8ident_roar_init(void); 11 | int u8ident_roar_free(void); 12 | EXTERN bool u8ident_is_confusable(const uint32_t cp); 13 | 14 | #ifdef USE_ALLOWED_CROAR 15 | bool u8ident_roar_is_allowed(const uint32_t cp); 16 | #endif 17 | 18 | #ifdef USE_MARK_CROAR 19 | bool u8ident_roar_is_mark(const uint32_t cp); 20 | #endif 21 | 22 | #ifdef USE_NORM_CROAR 23 | bool u8ident_roar_maybe_nfkc(const uint32_t cp); 24 | bool u8ident_roar_maybe_nfc(const uint32_t cp); 25 | bool u8ident_roar_maybe_nfkd(const uint32_t cp); 26 | bool u8ident_roar_maybe_nfd(const uint32_t cp); 27 | #endif 28 | -------------------------------------------------------------------------------- /u8idscr.h: -------------------------------------------------------------------------------- 1 | #ifndef _U8IDSCR_H 2 | #define _U8IDSCR_H 3 | 4 | #include 5 | #include 6 | #include "u8id_private.h" 7 | #define EXTERN_SCRIPTS 8 | #include "u8id_gc.h" 9 | #include "scripts.h" 10 | 11 | bool u8ident_has_script(const uint8_t scr); 12 | bool u8ident_has_script_ctx(const uint8_t scr, const struct ctx_t *ctx); 13 | int u8ident_add_script_ctx(const uint8_t scr, struct ctx_t *ctx); 14 | struct ctx_t *u8ident_ctx(void); 15 | uint8_t u8ident_get_script(const 
uint32_t cp); 16 | /* list of script indices */ 17 | const struct scx *u8ident_get_scx(const uint32_t cp); 18 | /* search for safec23 XID entry, in start or cont lists */ 19 | const struct sc_c23 *u8ident_get_safec23(const uint32_t cp); 20 | bool u8ident_is_MARK(const uint32_t cp); 21 | bool u8ident_is_MEDIAL(const uint32_t cp); 22 | // member or bidi formatting characters for reordering attacks. 23 | // Only valid with RTL scripts, such as Hebrew and Arabic. 24 | bool u8ident_is_bidi(const uint32_t cp); 25 | // Greek letters confusable with Latin 26 | bool u8ident_is_greek_latin_confus(const uint32_t cp); 27 | // bitmask of u8id_idtypes 28 | uint16_t u8ident_get_idtypes(const uint32_t cp); 29 | const char *u8ident_script_name(const int scr); 30 | // bool u8ident_is_decomposed(const uint32_t cp, const uint8_t scr); 31 | bool u8ident_maybe_normalized(const uint32_t cp); 32 | 33 | typedef bool func_tr31(const uint32_t cp); 34 | struct func_tr31_s { 35 | func_tr31 *start; 36 | func_tr31 *cont; 37 | }; 38 | bool isASCII_start(const uint32_t cp); 39 | bool isASCII_cont(const uint32_t cp); 40 | bool isALLOWED_start(const uint32_t cp); 41 | bool isALLOWED_cont(const uint32_t cp); 42 | bool isSAFEC26_start(const uint32_t cp); 43 | bool isSAFEC26_cont(const uint32_t cp); 44 | bool isID_start(const uint32_t cp); 45 | bool isID_cont(const uint32_t cp); 46 | bool isXID_start(const uint32_t cp); 47 | bool isXID_cont(const uint32_t cp); 48 | bool isC11_start(const uint32_t cp); 49 | bool isC11_cont(const uint32_t cp); 50 | bool isC23_start(const uint32_t cp); 51 | bool isC23_cont(const uint32_t cp); 52 | bool isALLUTF8_start(const uint32_t cp); 53 | bool isALLUTF8_cont(const uint32_t cp); 54 | 55 | enum u8id_gc u8ident_get_gc(const uint32_t cp); 56 | const char *u8ident_gc_name(const enum u8id_gc); 57 | 58 | #endif // _U8IDSCR_H 59 | -------------------------------------------------------------------------------- /un8ifexc.h: 
-------------------------------------------------------------------------------- 1 | /* ex: set ro ft=c: -*- buffer-read-only: t -*- 2 | * 3 | * !!!!!!! DO NOT EDIT THIS FILE !!!!!!! 4 | * This file is auto-generated by Unicode-Normalize 1.32 5 | * mkheader -ind -std 6 | * for Unicode 15.0.0 UTF-8 7 | * Any changes here will be lost! 8 | */ 9 | /* Composite exclusions */ 10 | bool isExclusion (uint32_t uv); 11 | 12 | bool isExclusion (uint32_t uv) 13 | { 14 | return 15 | (2392 <= uv && uv <= 2399) 16 | || (2524 <= uv && uv <= 2525) 17 | || uv == 2527 18 | || uv == 2611 19 | || uv == 2614 20 | || (2649 <= uv && uv <= 2651) 21 | || uv == 2654 22 | || (2908 <= uv && uv <= 2909) 23 | || uv == 3907 24 | || uv == 3917 25 | || uv == 3922 26 | || uv == 3927 27 | || uv == 3932 28 | || uv == 3945 29 | || uv == 3958 30 | || uv == 3960 31 | || uv == 3987 32 | || uv == 3997 33 | || uv == 4002 34 | || uv == 4007 35 | || uv == 4012 36 | || uv == 4025 37 | || uv == 10972 38 | || uv == 64285 39 | || uv == 64287 40 | || (64298 <= uv && uv <= 64310) 41 | || (64312 <= uv && uv <= 64316) 42 | || uv == 64318 43 | || (64320 <= uv && uv <= 64321) 44 | || (64323 <= uv && uv <= 64324) 45 | || (64326 <= uv && uv <= 64334) 46 | || (119134 <= uv && uv <= 119140) 47 | || (119227 <= uv && uv <= 119232) 48 | ? 
TRUE : FALSE; 49 | } 50 | 51 | /* Singletons */ 52 | bool isSingleton (uint32_t uv); 53 | 54 | bool isSingleton (uint32_t uv) 55 | { 56 | return 57 | (832 <= uv && uv <= 833) 58 | || uv == 835 59 | || uv == 884 60 | || uv == 894 61 | || uv == 903 62 | || uv == 8049 63 | || uv == 8051 64 | || uv == 8053 65 | || uv == 8055 66 | || uv == 8057 67 | || uv == 8059 68 | || uv == 8061 69 | || uv == 8123 70 | || uv == 8126 71 | || uv == 8137 72 | || uv == 8139 73 | || uv == 8147 74 | || uv == 8155 75 | || uv == 8163 76 | || uv == 8171 77 | || (8174 <= uv && uv <= 8175) 78 | || uv == 8185 79 | || uv == 8187 80 | || uv == 8189 81 | || (8192 <= uv && uv <= 8193) 82 | || uv == 8486 83 | || (8490 <= uv && uv <= 8491) 84 | || (9001 <= uv && uv <= 9002) 85 | || (63744 <= uv && uv <= 64013) 86 | || uv == 64016 87 | || uv == 64018 88 | || (64021 <= uv && uv <= 64030) 89 | || uv == 64032 90 | || uv == 64034 91 | || (64037 <= uv && uv <= 64038) 92 | || (64042 <= uv && uv <= 64109) 93 | || (64112 <= uv && uv <= 64217) 94 | || (194560 <= uv && uv <= 195101) 95 | ? TRUE : FALSE; 96 | } 97 | 98 | /* non-starter decompositions */ 99 | bool isNonStDecomp (uint32_t uv); 100 | 101 | bool isNonStDecomp (uint32_t uv) 102 | { 103 | return 104 | uv == 836 105 | || uv == 3955 106 | || uv == 3957 107 | || uv == 3969 108 | ? 
TRUE : FALSE; 109 | } 110 | 111 | /* may be composed with a prev char */ 112 | bool isComp2nd (uint32_t uv); 113 | 114 | bool isComp2nd (uint32_t uv) 115 | { 116 | return 117 | (768 <= uv && uv <= 772) 118 | || (774 <= uv && uv <= 780) 119 | || uv == 783 120 | || uv == 785 121 | || (787 <= uv && uv <= 788) 122 | || uv == 795 123 | || (803 <= uv && uv <= 808) 124 | || (813 <= uv && uv <= 814) 125 | || (816 <= uv && uv <= 817) 126 | || uv == 824 127 | || uv == 834 128 | || uv == 837 129 | || (1619 <= uv && uv <= 1621) 130 | || uv == 2364 131 | || uv == 2494 132 | || uv == 2519 133 | || uv == 2878 134 | || (2902 <= uv && uv <= 2903) 135 | || uv == 3006 136 | || uv == 3031 137 | || uv == 3158 138 | || uv == 3266 139 | || (3285 <= uv && uv <= 3286) 140 | || uv == 3390 141 | || uv == 3415 142 | || uv == 3530 143 | || uv == 3535 144 | || uv == 3551 145 | || uv == 4142 146 | || (4449 <= uv && uv <= 4469) 147 | || (4520 <= uv && uv <= 4546) 148 | || uv == 6965 149 | || (12441 <= uv && uv <= 12442) 150 | || uv == 69818 151 | || uv == 69927 152 | || uv == 70462 153 | || uv == 70487 154 | || uv == 70832 155 | || uv == 70842 156 | || uv == 70845 157 | || uv == 71087 158 | || uv == 71984 159 | ? TRUE : FALSE; 160 | } 161 | 162 | --------------------------------------------------------------------------------