├── .clang-format
├── .github
├── FUNDING.yml
└── workflows
│ └── ci.yml
├── .gitignore
├── .mdl_style.rb
├── .mdlrc
├── CMakeLists.txt
├── FindCRoaring.cmake
├── LICENSE
├── Makefile.am
├── NEWS
├── NOTICE
├── README.md
├── _config.yml
├── build-aux
├── autogen.sh
└── git-version-gen
├── c11-all.h
├── cmakeconfig.h.in
├── config.h.in
├── configure.ac
├── confus.h
├── confus_croar.h
├── doc
├── D2528R1.html
├── D2528R1.md
├── D2528R1.pdf
├── P2528R0.html
├── P2528R0.md
├── P2528R0.pdf
├── c11.md
├── n2916.html
├── n2916.md
├── n2916.pdf
├── n2932.html
├── n2932.md
├── n2932.pdf
├── nXXXX.html
├── nXXXX.md
├── nXXXX.patch
├── nXXXX.pdf
└── tr31-bugs.md
├── example.c
├── gconfus.h
├── gconfus.h.in
├── hangul.h
├── htable.c
├── htable.h
├── include
└── u8ident.h
├── libu8ident.pc.in
├── m4
└── ax_gcc_builtin.m4
├── makefile.gnu
├── mark.h
├── medial.h
├── mingw-cross.cmake
├── mkc26.c
├── mkconfus.pl
├── mkgc.pl
├── mkmark.pl
├── mkmedial.pl
├── mkroar.c
├── mkscripts.pl
├── mktest-norm.pl
├── perf.c
├── scripts.h
├── scripts16.h
├── test-all-fast.sh
├── test-all.sh
├── test-texts.c
├── test-texts.test.in
├── test.c
├── texts
├── amharic-1.txt
├── arabic-1.c
├── arabic-1.txt
├── armenian-1.txt
├── bengali-1.txt
├── bidi-sec-1-allowed.tst
├── bidi-sec-1-c11.tst
├── bidi-sec-1.c
├── bidi-sec-1.py
├── bidi-sec-1.tst
├── bidi-sec-2-allowed.tst
├── bidi-sec-2-ascii.tst
├── bidi-sec-2-c11.tst
├── bidi-sec-2.c
├── bidi-sec-2.tst
├── bidi-sec-3.c
├── bidi-sec-4.cc
├── bopomofo-1.txt
├── bulgarian-1.txt
├── chinese-s-1.txt
├── chinese-trad-1.txt
├── cyrillic-1.txt
├── english-1.txt
├── farsi-1.txt
├── georgian-1.txt
├── greek-1.txt
├── gujarati-1.txt
├── hebrew-1.txt
├── hindi-1.txt
├── homo-1-p4.tst
├── homo-1.c
├── homo-1.tst
├── homo-sec-1-p1.tst
├── homo-sec-1.c
├── homo-sec-1.js
├── homo-sec-1.py
├── homo-sec-1.tst
├── homo-sec-2.c
├── hungarian-1.txt
├── igbo-1.txt
├── japanese-1.txt
├── japanese-2.txt
├── kannada-1.txt
├── khmer-1.txt
├── korean-1.txt
├── korean-2.txt
├── lao-1.txt
├── latin-1.txt
├── malayalam-1.txt
├── marathi-1.txt
├── math-1.c
├── math-2.c
├── math-2.cc
├── myanmar-1.txt
├── nfc-1.c
├── nfkc-1.c
├── norm-sec-1.cperl
├── norm-sec-1.py
├── norm-sec-1.raku
├── odia-1.txt
├── punjabi-1.txt
├── result.lst
├── scots-gaelic-1.txt
├── sinhala-1.txt
├── swedish-1.txt
├── tamil-1.txt
├── telugu-1.txt
├── thai-1.txt
├── turkish-1.txt
├── urdu-1.txt
└── vietnamese-1.txt
├── u8id_gc.h
├── u8id_private.h
├── u8ident.c
├── u8idlint.1
├── u8idlint.c
├── u8idlint.test
├── u8idnorm.c
├── u8idroar.c
├── u8idroar.h
├── u8idscr.c
├── u8idscr.h
├── un8ifcan.h
├── un8ifcmb.h
├── un8ifcmp.h
├── un8ifcpt.h
├── un8ifexc.h
├── unic11.h
└── unic26.h
/.clang-format:
--------------------------------------------------------------------------------
1 | ---
2 | Language: Cpp
3 | BasedOnStyle: LLVM
4 | IndentPPDirectives: AfterHash
5 | SortIncludes: false
6 | SortUsingDeclarations: false
7 | ...
8 |
9 |
--------------------------------------------------------------------------------
/.github/FUNDING.yml:
--------------------------------------------------------------------------------
1 | # These are supported funding model platforms
2 |
3 | github: rurban
4 | patreon: rurban
5 |
--------------------------------------------------------------------------------
/.github/workflows/ci.yml:
--------------------------------------------------------------------------------
1 | name: Github CI
2 | on: [push, pull_request]
3 |
4 | #strategy:
5 | # matrix:
6 | # os: [ubuntu-14.04, ubuntu-18.04, ubuntu-20.04, ubuntu-latest]
7 | jobs:
8 | linux:  # autotools build + test on Linux: 5 CFLAGS entries x 2 compilers
9 | runs-on: ubuntu-20.04
10 | timeout-minutes: 10
11 | strategy:
12 | fail-fast: false  # let the remaining matrix cells finish when one fails
13 | matrix:  # NOTE(review): in this job matrix.CC / matrix.CFLAGS only appear in `if:` conditions; no `env:` passes them to configure or make -- confirm
14 | CFLAGS:
15 | -  # empty entry: the cell matched by `matrix.CFLAGS == ''` in the steps below
16 | - -march=native -Wall -Wextra -O2
17 | - -march=native -O3 -flto
18 | - -march=native -O2 -fsanitize=address,undefined -fno-omit-frame-pointer
19 | - -m32
20 | CC:
21 | - gcc
22 | - clang
23 | steps:
24 | - uses: actions/checkout@v3
25 | with:
26 | fetch-depth: 1
27 | #- run: sudo apt-get install perl groff
28 | - run: autoreconf -fi
29 | - run: ./configure --enable-confus
30 | - run: make V=1
31 | - run: make V=1 check || (cat test-suite.log; false)
32 | - run: make V=1 check-all
33 | - if: matrix.CFLAGS == '' && matrix.CC == 'gcc'
34 | run: make distcheck
35 | - if: contains(matrix.CFLAGS, '-Wall -Wextra -O2') && matrix.CC == 'clang'
36 | run: make check-all-combinations
37 | - name: Prep-Release  # tag builds of the empty-CFLAGS gcc cell only: build the archives and a sha256 file
38 | if: matrix.CFLAGS == '' && matrix.CC == 'gcc' && startsWith(github.ref, 'refs/tags/')
39 | run: make dist pkg && echo ${{ github.sha }} > Release.txt; sha256sum libu8ident-*-x86_64-pc-linux-gnu.tar.gz > linux.sha256
40 | - name: Release  # same condition as Prep-Release; attaches the files listed below to the GitHub release
41 | continue-on-error: true
42 | uses: softprops/action-gh-release@v1
43 | if: matrix.CFLAGS == '' && matrix.CC == 'gcc' && startsWith(github.ref, 'refs/tags/')
44 | with:
45 | files: |
46 | Release.txt
47 | linux.sha256
48 | libu8ident-*-x86_64-pc-linux-gnu.tar.gz
49 | libu8ident-*.tar.gz
50 | libu8ident-*.tar.xz
51 | macOS:  # autotools build + test on macOS, single configuration
52 | name: macOS
53 | runs-on: macOS-latest
54 | steps:  # install the autotools via brew, then configure, build, check, check-all
55 | - name: checkout
56 | uses: actions/checkout@v3
57 | - run: brew install autoconf automake libtool m4
58 | - run: autoreconf -fi
59 | - run: ./configure --enable-confus
60 | - run: make
61 | - run: make check
62 | - run: make check-all
63 | mingw:
64 | name: mingw
65 | runs-on: windows-latest
66 | env:
67 | MSYS2_DIR: msys64
68 | MSYS2_ARCH: x86_64
69 | MSYSTEM: MINGW64
70 | ARCH: win64
71 | PLATFORM: x64
72 | #PATH: "C:\%MSYS2_DIR%\%MSYSTEM%\bin;C:\%MSYS2_DIR%\usr\bin;%PATH%"
73 | steps:
74 | # see https://github.com/msys2/setup-msys2
75 | - name: setup-msys2
76 | uses: msys2/setup-msys2@v2
77 | with:
78 | path-type: minimal
79 | update: true
80 | install: >-
81 | base-devel
82 | coreutils
83 | zip
84 | mingw-w64-x86_64-toolchain
85 | mingw-w64-x86_64-libtool
86 | mingw-w64-x86_64-perl
87 | libtool
88 | autoconf-wrapper
89 | automake-wrapper
90 | - run: reg add "HKLM\Software\Microsoft\Windows\Windows Error Reporting" /f /v DontShowUI /d 1
91 | - run: git config --global core.autocrlf input
92 | - name: checkout
93 | uses: actions/checkout@v3
94 | - shell: msys2 {0}
95 | #run: autoreconf -fi
96 | run: libtoolize -f && aclocal -I m4 && autoconf && automake --add-missing; autoreconf -fi
97 | continue-on-error: true
98 | # msys2 is broken for today
99 | - shell: msys2 {0}
100 | run: ./configure --enable-confus
101 | continue-on-error: true
102 | - shell: msys2 {0}
103 | run: make
104 | continue-on-error: true
105 | - shell: msys2 {0}
106 | run: make check
107 | continue-on-error: true
108 | - shell: msys2 {0}
109 | run: make pkg; sha256sum libu8ident-*-x86_64-w64-mingw32.zip > mingw.sha256
110 | continue-on-error: true
111 | - name: Release
112 | uses: softprops/action-gh-release@v1
113 | if: startsWith(github.ref, 'refs/tags/')
114 | with:
115 | files: |
116 | mingw.sha256
117 | libu8ident-*-x86_64-w64-mingw32.zip
118 | linux-cmake:  # CMake build on Linux: in-source configure, make, then `make test`
119 | runs-on: ubuntu-latest
120 | timeout-minutes: 10
121 | steps:
122 | - uses: actions/checkout@v3
123 | with:
124 | fetch-depth: 1
125 | - run: cmake .
126 | - run: make
127 | - run: make test
128 | macos-cmake:  # CMake build on macOS: in-source configure, make, then `make test`
129 | runs-on: macOS-latest
130 | timeout-minutes: 10
131 | steps:
132 | - uses: actions/checkout@v3
133 | with:
134 | fetch-depth: 1
135 | - run: cmake .
136 | - run: make
137 | - run: make test
138 | windows-cmake:  # CMake build on Windows: Release build, RUN_TESTS target, then the `package` target
139 | runs-on: windows-latest
140 | timeout-minutes: 10
141 | steps:
142 | - uses: actions/checkout@v3
143 | with:
144 | fetch-depth: 1
145 | #- uses: microsoft/setup-msbuild@v1.1
146 | - run: cmake .
147 | - run: cmake --build . --config Release --verbose
148 | - run: cmake --build . --target RUN_TESTS --config Release --verbose
149 | - run: cmake --build . --target package --config Release
150 | # currently -G for "Visual Studio 16 2019"
151 | #- run: msbuild libu8ident.sln
152 | #- run: Debug\u8idtest
153 | - name: Release  # tag builds only: attach the libu8ident-*-win64.exe file to the GitHub release
154 | uses: softprops/action-gh-release@v1
155 | if: startsWith(github.ref, 'refs/tags/')
156 | with:
157 | files: |
158 | libu8ident-*-win64.exe
159 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | /.gdbinit
2 | .analysis
3 | .version
4 | GTAGS
5 | GPATH
6 | GRTAGS
7 | TAGS
8 | autom4te.cache
9 | config.h
10 | include/u8ident.h.bak
11 | libu8ident.a
12 | libu8ident.so
13 | libu8ident.so.0
14 | libu8ident.so.1
15 | a.out
16 | u8idlint
17 | u8ident.o
18 | u8idnorm.o
19 | u8idscr.o
20 | u8idroar.o
21 | htable.o
22 | /test
23 | /test-asan
24 | /test-texts
25 | /mkc26
26 | /example
27 | /build
28 | /build-asan
29 | /build-[b-z0-9]*/
30 | /Scripts.txt
31 | /ScriptExtensions.txt
32 | /PropertyValueAliases.txt
33 | /DerivedCoreProperties*.txt
34 | /DerivedNormalizationProps*.txt
35 | /IdentifierType.txt
36 | /IdentifierStatus.txt
37 | /confusables.txt
38 | /UnicodeData*.txt
39 | Unicode-Normalize
40 | libu8ident-*.tar.gz
41 | libu8ident-*.tar.xz
42 | Makefile.in
43 | TODO
44 | aclocal.m4
45 | build-aux/compile
46 | build-aux/config.guess
47 | build-aux/config.sub
48 | configure
49 | build-aux/depcomp
50 | build-aux/install-sh
51 | build-aux/ltmain.sh
52 | build-aux/missing
53 | build-aux/test-driver
54 | m4/
55 | roaring.c
56 | roaring.h
57 | u8ident.3
58 | mkroar
59 | perf
60 | gconfus.h
61 | allowed_croar.h
62 | nfc_croar.h
63 | nfd_croar.h
64 | nfkc_croar.h
65 | nfkd_croar.h
66 | allowed_croar.bin
67 | confus_croar.bin
68 | nf*_croar.bin
69 | mark_croar.bin
70 | test-texts.test
71 | texts.tst
72 | doc/my.latex
73 | doc/appendix-h.md
74 | texts/homo-sec-2.cc
75 | homo-sec-1
76 | homo-sec-1.o
77 | Dockerfile.pandoc
78 | doc/P2528R*.pdf
79 | doc/n2932.pdf
80 |
--------------------------------------------------------------------------------
/.mdl_style.rb:
--------------------------------------------------------------------------------
1 | all
2 | # Multiple top level headers in the same document
3 | exclude_rule 'MD025'
4 |
5 | #rule 'MD004', 'ul-style' => :sublist
6 | exclude_rule 'MD004'
7 |
8 | rule 'MD029', 'ol-prefix' => :ordered
9 | exclude_rule 'MD029'
10 |
11 | exclude_rule 'MD046'
12 |
13 | # First line in file should be a top level header.
14 | # We set an explicit title for pdf and html
15 | exclude_rule 'MD041'
16 |
--------------------------------------------------------------------------------
/.mdlrc:
--------------------------------------------------------------------------------
1 | style '.mdl_style.rb'
2 |
--------------------------------------------------------------------------------
/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | cmake_minimum_required(VERSION 2.9)  # must come before project() so policies are set first
2 | project(libu8ident C)
3 | # Supported options:
4 | # -DBUILD_SHARED_LIBS=ON,OFF
5 | # -DU8ID_NORM=NFC,NFKC,NFD,NFKD
6 | # -DU8ID_PROFILE=2,3,4,5,6,C26_4,C11_6
7 | # -DU8ID_TR31=ALLOWED,SAFEC26,ID,XID,C11,ALLUTF8,NONE
8 | # -DHAVE_CONFUS=ON
9 | # -DHAVE_CROARING=ON
10 | # for smaller builds and lib.
11 |
12 | if (EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/.version")  # if(EXISTS) is only well-defined for full paths
13 |   file(READ "${CMAKE_CURRENT_SOURCE_DIR}/.version" NL_PACKAGE_VERSION)
14 | else()  # no .version file: derive the version from git
15 |   find_package(Git)
16 |   set(PACKAGE_VERSION "")
17 |   execute_process(COMMAND ${GIT_EXECUTABLE} describe --long --tags --always
18 |     WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}" OUTPUT_VARIABLE NL_PACKAGE_VERSION)
19 | endif()
20 | string(STRIP "${NL_PACKAGE_VERSION}" PACKAGE_VERSION)  # drop the trailing newline
21 |
22 | # Probe the optional system headers; a hit sets the cache entry HAVE_<HEADER>_H.
23 | include(CheckIncludeFile)
24 | foreach(u8id_hdr
25 |     dlfcn.h dirent.h getopt.h inttypes.h malloc.h memory.h stdbool.h
26 |     stddef.h stdint.h stdlib.h string.h strings.h sys/stat.h sys/types.h)
27 |   # "sys/stat.h" -> "sys_stat_h" -> HAVE_SYS_STAT_H
28 |   string(REGEX REPLACE "[/.]" "_" u8id_hdr_id "${u8id_hdr}")
29 |   string(TOUPPER "HAVE_${u8id_hdr_id}" u8id_hdr_var)
30 |   check_include_file("${u8id_hdr}" ${u8id_hdr_var})
31 | endforeach()
32 | include(CheckFunctionExists)
33 | check_function_exists(getopt_long HAVE_GETOPT_LONG)
34 | include(CheckCSourceCompiles)
35 | # __builtin_ffs: returns one plus the index of the least significant 1-bit of x, or if x is zero, returns zero.
36 | check_c_source_compiles("
37 | int main() { (void)__builtin_ffs(0); return 0; }
38 | " HAVE___BUILTIN_FFS)
39 | #check_function_exists(__builtin_ffs HAVE___BUILTIN_FFS)
40 | #check_function_exists(strcmp HAVE_STRCMP)
41 | #check_function_exists(strcasestr HAVE_STRCASESTR)
42 | include(CheckTypeSize)
43 | check_type_size(size_t SIZE_T)
44 | #check_type_size(wchar_t WCHAR_T)
45 | #include(CheckCCompilerFlag)
46 | configure_file(cmakeconfig.h.in config.h)  # generate config.h from the probe results above
52 | if(CMAKE_MAJOR_VERSION GREATER 2)  # endianness probe, skipped on CMake 2.x
53 |   include(TestBigEndian)
54 |   test_big_endian(IS_BIG_ENDIAN)
55 |   if(IS_BIG_ENDIAN)
56 |     message(WARNING "Big Endian not supported")  # "ERROR" is not a message() mode; warn and keep going as before
57 |     add_definitions(-DBIG_ENDIAN)
58 |   endif()
59 | endif()
60 | add_definitions(-DHAVE_CONFIG_H)
61 |
62 | if(MSVC)
63 | # Disable some overly strict MSVC warnings.
64 | add_compile_options(-wd4244 -wd4800 -wd4805 -D_CRT_SECURE_NO_WARNINGS)
65 | endif()
66 | if(MINGW) # WINDOWS AND CMAKE_COMPILER_IS_GNUCC
67 | add_compile_options(-fstack-protector)
68 | add_link_options(-fstack-protector)
69 | endif()
70 |
71 | option(BUILD_SHARED_LIBS "shared libu8ident library" ON)
72 | option(U8ID_NORM "force a single normalization: NFC,NFKC,NFD,NFKD,FCC,FCD" OFF)
73 | option(U8ID_PROFILE "force a single profile: 2-6,C26_4,C11_6" OFF)
74 | option(U8ID_TR31 "force TR31 charset: ALLOWED,SAFEC26,ID,XID,C11,ALLUTF8,NONE" OFF)
75 | option(HAVE_CONFUS "add confusables API" OFF)
76 | option(HAVE_CROARING "use CRoaring bitsets" ON)
77 |
78 | if (HAVE_CONFUS)  # CRoaring is only looked for when the confusables API is enabled
79 | if (HAVE_CROARING)
80 | set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}")  # lets find_package() see FindCRoaring.cmake in the source root
81 | find_package(CRoaring)
82 | if (CRoaring_FOUND)
83 | add_definitions(-DHAVE_CROARING)
84 | add_definitions(-DCROARING_PATH="${CROARING_LIBRARY}")  # passes the path of the library that was found
85 | else()
86 | unset(HAVE_CROARING)  # not found: clear the request
87 | endif()
88 | endif()
89 | endif()
90 |
91 | if(NOT CMAKE_BUILD_TYPE)
92 | set(CMAKE_BUILD_TYPE "Release"
93 | CACHE STRING "Choose the type of build, options are: Debug Release
94 | RelWithDebInfo MinSizeRel Asan." FORCE)
95 | endif()
96 |
97 | if(NOT CMAKE_VERSION VERSION_LESS 3.9  # CheckIPOSupported needs 3.9; this test also holds for CMake 4 and later
98 |    AND CMAKE_BUILD_TYPE STREQUAL "Release")
99 |   cmake_policy(SET CMP0069 NEW)
100 |   include(CheckIPOSupported)
101 |   check_ipo_supported(RESULT ipo_supported OUTPUT error)
102 | endif()
103 | if(ipo_supported)
104 |   message(STATUS "IPO / LTO enabled")
105 |   set(CMAKE_INTERPROCEDURAL_OPTIMIZATION ON)  # initializes the per-target property for targets created below
106 |   add_definitions(-DLTO)
107 | else()
108 |   message(STATUS "IPO / LTO not supported: <${error}>")
109 | endif()
110 |
111 | set(libu8ident_DOCS  # documentation files packed by the MSVC `dist` zip target
112 |     README.md doc/c11.md doc/n2916.md doc/P2528R0.md doc/D2528R1.md
113 |     doc/tr31-bugs.md doc/nXXXX.patch
114 |     NOTICE LICENSE)
115 |
116 | set(libu8ident_HEADERS
117 | include/u8ident.h)
118 |
119 | set(libu8ident_SOURCES
120 | u8ident.c
121 | u8idnorm.c
122 | u8idscr.c
123 | u8idroar.c)
124 |
125 | if (CRoaring_FOUND)
126 | set(roaring_HEADERS
127 | confus_croar.h)
128 | endif()
129 |
130 | set(private_HEADERS
131 | scripts.h
132 | config.h.in
133 | u8idscr.h
134 | u8idroar.h
135 | u8id_private.h
136 | un8ifcan.h un8ifcmb.h un8ifcmp.h un8ifcpt.h un8ifexc.h hangul.h
137 | mark.h unic11.h confus.h ${roaring_HEADERS})
138 |
139 | add_library(u8ident_static STATIC
140 | ${libu8ident_HEADERS}
141 | ${libu8ident_SOURCES}
142 | ${private_HEADERS})
143 | add_library(u8ident SHARED
144 | ${libu8ident_HEADERS}
145 | ${libu8ident_SOURCES}
146 | ${private_HEADERS})
147 | if(MSVC)
148 | if (BUILD_SHARED_LIBS)
149 | # FIXME cmake bug:
150 | # need full path. u8idlint only gets Release\u8ident.lib, but is in a separate path.
151 | # This is ignored because the target is built, not imported.
152 | #set_target_properties(u8ident PROPERTIES
153 | # IMPORTED_IMPLIB_DEBUG "${CMAKE_CURRENT_BINARY_DIR}\\Debug\\u8ident.lib")
154 | #set_target_properties(u8ident PROPERTIES
155 | # IMPORTED_IMPLIB_RELEASE "${CMAKE_CURRENT_BINARY_DIR}\\Release\\u8ident.lib")
156 | # this is the default already
157 | #if (CMAKE_BUILD_TYPE STREQUAL "Debug")
158 | #set(LIBPATH "${CMAKE_CURRENT_BINARY_DIR}/Debug")
159 | #set_target_properties(u8ident PROPERTIES
160 | # LIBRARY_OUTPUT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/Debug")
161 | #else()
162 | #set(LIBPATH "${CMAKE_CURRENT_BINARY_DIR}/Release")
163 | #set_target_properties(u8ident PROPERTIES
164 | # LIBRARY_OUTPUT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/Release")
165 | #endif()
166 | endif(BUILD_SHARED_LIBS)
167 | endif(MSVC)
168 | #set_target_properties(${libu8ident_HEADERS} PROPERTIES PUBLIC_HEADER)
169 | # Ship include/u8ident.h as the public header of the shared library.
170 | set_target_properties(u8ident PROPERTIES PUBLIC_HEADER include/u8ident.h)
171 | # Each optional fixed setting (normalization, profile, TR31 charset) that is
172 | # switched on becomes a NAME=VALUE compile definition on both library targets.
173 | foreach(u8id_opt U8ID_NORM U8ID_PROFILE U8ID_TR31)
174 |   if (${u8id_opt})
175 |     set_property(TARGET u8ident u8ident_static APPEND PROPERTY
176 |       COMPILE_DEFINITIONS "${u8id_opt}=${${u8id_opt}}")
177 |   endif()
178 | endforeach()
194 |
195 | target_include_directories(u8ident PRIVATE
196 | ${CMAKE_CURRENT_SOURCE_DIR}
197 | ${CMAKE_CURRENT_BINARY_DIR})
198 | target_include_directories(u8ident PUBLIC
199 | ${CMAKE_CURRENT_SOURCE_DIR}/include)
200 |
201 | if (CRoaring_FOUND)
202 | add_executable(perf perf.c u8idroar.c)
203 | set_source_files_properties(perf.c
204 | PROPERTIES COMPILE_FLAGS "-DPERF_TEST")
205 | endif()
206 |
207 | add_executable(u8idlint u8idlint.c ${libu8ident_SOURCES})
208 | # cmake MSVC bug: u8idlint only gets Release\u8ident.lib, but is in a subdir
209 | # target_link_directories(u8idlint PRIVATE ${CMAKE_CURRENT_BINARY_DIR})
210 | # target_link_libraries(u8idlint ${CMAKE_CURRENT_BINARY_DIR}/u8ident.lib)
211 | include_directories(BEFORE
212 | ${CMAKE_CURRENT_SOURCE_DIR}/include
213 | ${CMAKE_CURRENT_SOURCE_DIR}
214 | ${CMAKE_CURRENT_BINARY_DIR})
215 |
216 | if(NOT CMAKE_CROSSCOMPILING)  # the test programs are run on the build host
217 |   enable_testing()
218 |   add_executable(u8idtest test.c ${libu8ident_SOURCES})
219 |   add_test(test ./u8idtest)
220 |   if (NOT MSVC)
221 |     add_test(u8idlint.test "${CMAKE_CURRENT_SOURCE_DIR}/u8idlint.test")
222 |   endif()
223 |   add_executable(test-texts test-texts.c ${libu8ident_SOURCES})
224 |   add_test(test-texts ./test-texts)
225 |   set_tests_properties(test-texts PROPERTIES  # set(ENV{...}) only reaches the configure step, never ctest
226 |     ENVIRONMENT "U8IDTEST_TEXTS=${CMAKE_CURRENT_SOURCE_DIR}/texts")
227 |   add_executable(mkc26 mkc26.c ${libu8ident_SOURCES})
228 |   set_source_files_properties(mkc26 PROPERTIES COMPILE_FLAGS -DU8ID_PROFILE_SAFEC26)  # NOTE(review): names "mkc26", but the source file is mkc26.c -- confirm the flag is applied
229 |   add_test(mkc26 ./mkc26)
230 | endif(NOT CMAKE_CROSSCOMPILING)
231 |
232 | add_custom_target(
233 | clang-format
234 | COMMAND clang-format -i
235 | ${CMAKE_CURRENT_SOURCE_DIR}/*.c
236 | ${CMAKE_CURRENT_SOURCE_DIR}/include/*.h
237 | ${CMAKE_CURRENT_SOURCE_DIR}/scripts.h
238 | ${CMAKE_CURRENT_SOURCE_DIR}/confus.h
239 | ${CMAKE_CURRENT_SOURCE_DIR}/u8id*.h)
240 |
241 | add_custom_target(
242 | regen-scripts
243 | COMMAND wget -N https://www.unicode.org/Public/UNIDATA/Scripts.txt
244 | COMMAND wget -N https://www.unicode.org/Public/UNIDATA/ScriptExtensions.txt
245 | COMMAND wget -N https://www.unicode.org/Public/UNIDATA/PropertyValueAliases.txt
246 | COMMAND wget -N https://www.unicode.org/Public/UNIDATA/DerivedCoreProperties.txt
247 | COMMAND wget -N https://www.unicode.org/Public/UNIDATA/DerivedNormalizationProps.txt
248 | COMMAND wget -N https://www.unicode.org/Public/security/latest/IdentifierType.txt
249 | COMMAND wget -N https://www.unicode.org/Public/security/latest/IdentifierStatus.txt
250 | COMMAND perl mkscripts.pl
251 | WORKING_DIRECTORY ${CMAKE_SOURCE_DIR})
252 |
253 | add_custom_target(
254 | regen-confus
255 | COMMAND wget -N https://www.unicode.org/Public/security/latest/confusables.txt
256 | COMMAND perl mkconfus.pl
257 | #BYPRODUCTS confus.h
258 | WORKING_DIRECTORY ${CMAKE_SOURCE_DIR})
259 |
260 | # Umbrella target that runs regen-scripts and regen-confus.
261 | # MAIN_DEPENDENCY belongs to add_custom_command(); ordering between targets needs add_dependencies().
262 | add_custom_target(regen-all)
263 | add_dependencies(regen-all regen-scripts regen-confus)
265 |
266 | add_custom_target(
267 |   TAGS  # custom targets always run; DEPENDS does not expand globs, so none are listed
268 |   COMMAND etags --language=c++ *.c *.h include/*.h
269 |   WORKING_DIRECTORY ${CMAKE_SOURCE_DIR})
271 |
272 | # make install
273 | # make install/local
274 | # make install/strip
275 | if(MSVC)  # MSVC: runtime + public header go into a versioned libu8ident-<version> directory
276 | #untested
277 | install(TARGETS u8ident RUNTIME PUBLIC_HEADER
278 | DESTINATION libu8ident-${PACKAGE_VERSION})
279 | else()
280 | include(GNUInstallDirs)  # elsewhere: use the standard GNU install directory variables
281 | endif()
282 | install(TARGETS u8ident
283 | LIBRARY
284 | COMPONENT u8ident)
285 | install(TARGETS RUNTIME)  # NOTE(review): names no target -- RUNTIME is an install() keyword; confirm the intent
286 | install(TARGETS u8idlint)
287 |
288 | if(MSVC)
289 | #untested
290 | add_custom_target(dist
291 | COMMAND zip libu8ident-${PACKAGE_VERSION}.zip libu8ident-*.dll
292 | include/u8ident.h ${libu8ident_DOCS}
293 | DEPENDS u8ident)
294 | endif()
295 |
296 | # make package
297 | # make package_source (only after distclean or git clean -dxf)
298 | set(CPACK_PACKAGE_VENDOR "Reini Urban")
299 | set(CPACK_PACKAGE_CONTACT "rurban@cpan.org")
300 | set(CPACK_PACKAGE_DESCRIPTION_SUMMARY "Check unicode security guidelines for identifiers")
301 | #set(CPACK_PACKAGE_VERSION_MAJOR ${PACKAGE_VERSION_MAJOR})
302 | #set(CPACK_PACKAGE_VERSION_MINOR ${PACKAGE_VERSION_MINOR})
303 | #set(CPACK_PACKAGE_VERSION_PATCH ${PACKAGE_VERSION_PATCH})
304 | set(CPACK_RESOURCE_FILE_LICENSE "${CMAKE_CURRENT_SOURCE_DIR}/LICENSE")
305 | set(CPACK_RESOURCE_FILE_README "${CMAKE_CURRENT_SOURCE_DIR}/README.md")
306 |
307 | set(CPACK_RPM_PACKAGE_LICENSE "${CMAKE_CURRENT_SOURCE_DIR}/LICENSE")  # NOTE(review): set to a file path; presumably a license name is expected here -- confirm
308 | set(CPACK_SOURCE_GENERATOR "TGZ;ZIP")  # source packages as .tar.gz and .zip
309 | include(CPack)  # placed after the CPACK_* settings above
310 |
--------------------------------------------------------------------------------
/FindCRoaring.cmake:
--------------------------------------------------------------------------------
1 | # - Try to find CRoaring
2 | # Once done this will define
3 | # CRoaring_FOUND - System has the CRoaring library and roaring.h
4 | # CROARING_INCLUDE_DIRS - The roaring.h include directories
5 | # CROARING_LIBRARIES - The CRoaring library to link against
6 | # CROARING_DEFINITIONS - Compiler switches required for using roaring.c
7 |
8 | find_package(PkgConfig)
9 | pkg_check_modules(PC_CROARING QUIET roaring)
10 | set(CROARING_DEFINITIONS ${PC_CROARING_CFLAGS})
11 |
12 | find_path(CROARING_INCLUDE_DIR roaring.h
13 |   HINTS ${PC_CROARING_INCLUDEDIR} ${PC_CROARING_INCLUDE_DIRS}
14 |   PATH_SUFFIXES CRoaring )
15 |
16 | find_library(CROARING_LIBRARY NAMES CRoaring libroaring roaring  # "roaring": bare name for the usual libroaring.so/.a
17 |   HINTS ${PC_CROARING_LIBDIR} ${PC_CROARING_LIBRARY_DIRS} )
18 |
19 | include(FindPackageHandleStandardArgs)
20 | # handle the QUIETLY and REQUIRED arguments and set CRoaring_FOUND to TRUE if all listed variables are TRUE
21 | find_package_handle_standard_args(CRoaring DEFAULT_MSG
22 |   CROARING_LIBRARY CROARING_INCLUDE_DIR)
23 |
24 | mark_as_advanced(CROARING_INCLUDE_DIR CROARING_LIBRARY )
25 |
26 | set(CROARING_LIBRARIES ${CROARING_LIBRARY} )
27 | set(CROARING_INCLUDE_DIRS ${CROARING_INCLUDE_DIR} )
28 |
--------------------------------------------------------------------------------
/NEWS:
--------------------------------------------------------------------------------
1 | libu8ident NEWS -- history of user-visible changes. -*-indented-text-*-
2 | Copyright (C) 2022 Reini Urban
3 |
4 | libu8ident 0.3 - 2022-12-03:
5 |
6 | Fixed a bug with excluded scripts which are only now addable.
7 | Change is_greek_latin_confus violations with C26_4 to return ERR_CONFUS,
8 | not ERR_SCRIPTS (GH #12).
9 | Add more combining marks checks (GH #7): TR39#5.5 "Forbid sequences of base character
10 | + nonspacing mark that look the same as or confusingly similar to the base character
11 | alone". Also forbid non-spacing marks with base chars already including the
12 | non-spacing mark, like Ä with DIAERESIS.
13 | Update to Unicode version 15.0.0:
14 | - 11 new mark ranges.
15 | - 2 new Excluded scripts: Kawi, Nag Mundari.
16 | - The zero width joiner (ZWJ) and zero width non-joiner (ZWNJ) changed to IdStatus
17 | Restricted.
18 | - More id and xid chars
19 |
20 | libu8ident 0.2 - 2022-02-12:
21 |
22 | Bump SOLIB MAJOR to 1
23 | The C++23 deadline is gone, rename all c23 to c26. Esp. command-line
24 | and configure options.
25 | Fixed SAFEC26 to exclude confusable Technical, and to skip a lot of
26 | not Allowed ID_Start ranges.
27 |
28 | libu8ident 0.1 - 2022-01-24:
29 |
30 | First release, with the library, u8idlint and lots of C23 support:
31 | the optimized unic23.h, and the 2 technical papers for C23++ and C23
32 | to use TR39.
33 |
--------------------------------------------------------------------------------
/NOTICE:
--------------------------------------------------------------------------------
1 | libu8ident is
2 | Copyright © 2017, 2021 Reini Urban.
3 |
4 | The normalization headers were created by perl-Unicode-Normalize.
5 | Copyright © 2001-2012 SADAHIRO Tomoyuki. Japan.
6 | Copyright © 2017 Reini Urban.
7 |
8 | Parts of the normalization code were written for cperl (u8) and the safeclib (wchar).
9 | Copyright © 2015-2016 cPanel Inc.
10 | Copyright © 2017, 2021 Reini Urban.
11 |
12 | The Unicode Database is
13 | Copyright © 2021 Unicode®, Inc.
14 | Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries.
15 | For terms of use, see http://www.unicode.org/terms_of_use.html
16 |
--------------------------------------------------------------------------------
/_config.yml:
--------------------------------------------------------------------------------
1 | theme: jekyll-theme-minimal
--------------------------------------------------------------------------------
/build-aux/autogen.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | libtoolize
3 | autoreconf -fi
4 |
--------------------------------------------------------------------------------
/build-aux/git-version-gen:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | # Print a version string.
3 | scriptversion=2022-01-25.11; # UTC
4 |
5 | # Copyright (C) 2007-2018 Free Software Foundation, Inc.
6 | #
7 | # This program is free software: you can redistribute it and/or modify
8 | # it under the terms of the GNU General Public License as published by
9 | # the Free Software Foundation; either version 3 of the License, or
10 | # (at your option) any later version.
11 | #
12 | # This program is distributed in the hope that it will be useful,
13 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 | # GNU General Public License for more details.
16 | #
17 | # You should have received a copy of the GNU General Public License
18 | # along with this program. If not, see <https://www.gnu.org/licenses/>.
19 |
20 | # This script is derived from GIT-VERSION-GEN from GIT: https://git-scm.com/.
21 | # It may be run two ways:
22 | # - from a git repository in which the "git describe" command below
23 | # produces useful output (thus requiring at least one signed tag)
24 | # - from a non-git-repo directory containing a .tarball-version file, which
25 | # presumes this script is invoked like "./git-version-gen .tarball-version".
26 |
27 | # In order to use intra-version strings in your project, you will need two
28 | # separate generated version string files:
29 | #
30 | # .tarball-version - present only in a distribution tarball, and not in
31 | # a checked-out repository. Created with contents that were learned at
32 | # the last time autoconf was run, and used by git-version-gen. Must not
33 | # be present in either $(srcdir) or $(builddir) for git-version-gen to
34 | # give accurate answers during normal development with a checked out tree,
35 | # but must be present in a tarball when there is no version control system.
36 | # Therefore, it cannot be used in any dependencies. GNUmakefile has
37 | # hooks to force a reconfigure at distribution time to get the value
38 | # correct, without penalizing normal development with extra reconfigures.
39 | #
40 | # .version - present in a checked-out repository and in a distribution
41 | # tarball. Usable in dependencies, particularly for files that don't
42 | # want to depend on config.h but do want to track version changes.
43 | # Delete this file prior to any autoconf run where you want to rebuild
44 | # files to pick up a version string change; and leave it stale to
45 | # minimize rebuild time after unrelated changes to configure sources.
46 | #
47 | # As with any generated file in a VC'd directory, you should add
48 | # /.version to .gitignore, so that you don't accidentally commit it.
49 | # .tarball-version is never generated in a VC'd directory, so needn't
50 | # be listed there.
51 | #
52 | # Use the following line in your configure.ac, so that $(VERSION) will
53 | # automatically be up-to-date each time configure is run (and note that
54 | # since configure.ac no longer includes a version string, Makefile rules
55 | # should not depend on configure.ac for version updates).
56 | #
57 | # AC_INIT([GNU project],
58 | # m4_esyscmd([build-aux/git-version-gen .tarball-version]),
59 | # [bug-project@example])
60 | #
61 | # Then use the following lines in your Makefile.am, so that .version
62 | # will be present for dependencies, and so that .version and
63 | # .tarball-version will exist in distribution tarballs.
64 | #
65 | # EXTRA_DIST = $(top_srcdir)/.version
66 | # BUILT_SOURCES = $(top_srcdir)/.version
67 | # $(top_srcdir)/.version:
68 | # echo $(VERSION) > $@-t && mv $@-t $@
69 | # dist-hook:
70 | # echo $(VERSION) > $(distdir)/.tarball-version
71 |
72 |
73 | me=$0
74 |
75 | version="git-version-gen $scriptversion
76 |
77 | Copyright 2011 Free Software Foundation, Inc.
78 | There is NO warranty. You may redistribute this software
79 | under the terms of the GNU General Public License.
80 | For more information about these matters, see the files named COPYING."
81 |
82 | usage="\
83 | Usage: $me [OPTION]... \$srcdir/.tarball-version [TAG-NORMALIZATION-SED-SCRIPT]
84 | Print a version string.
85 |
86 | Options:
87 |
88 | --prefix PREFIX prefix of git tags (default 'v')
89 | --fallback VERSION
90 | fallback version to use if \"git --version\" fails
91 |
92 | --help display this help and exit
93 | --version output version information and exit
94 |
95 | Running without arguments will suffice in most cases."
96 |
97 | prefix=
98 | fallback=
99 |
100 | while test $# -gt 0; do
101 | case $1 in
102 | --help) echo "$usage"; exit 0;;
103 | --version) echo "$version"; exit 0;;
104 | --prefix) shift; prefix=${1?};;
105 | --fallback) shift; fallback=${1?};;
106 | -*)
107 | echo "$0: Unknown option '$1'." >&2
108 | echo "$0: Try '--help' for more information." >&2
109 | exit 1;;
110 | *)
111 | if test "x$tarball_version_file" = x; then
112 | tarball_version_file="$1"
113 | elif test "x$tag_sed_script" = x; then
114 | tag_sed_script="$1"
115 | else
116 | echo "$0: extra non-option argument '$1'." >&2
117 | exit 1
118 | fi;;
119 | esac
120 | shift
121 | done
122 |
123 | if test "x$tarball_version_file" = x; then
124 | echo "$usage"
125 | exit 1
126 | fi
127 |
128 | tag_sed_script="${tag_sed_script:-s/-/_/g}"
129 |
130 | nl='
131 | '
132 |
133 | # Avoid meddling by environment variable of the same name.
134 | v=
135 | v_from_git=
136 |
137 | # First see if there is a tarball-only version file.
138 | # then try "git describe", then default.
139 | if test -f $tarball_version_file
140 | then
141 | v=`cat $tarball_version_file` || v=
142 | case $v in
143 | *$nl*) v= ;; # reject multi-line output
144 | [0-9]*) ;;
145 | *) v= ;;
146 | esac
147 | test "x$v" = x \
148 | && echo "$0: WARNING: $tarball_version_file is missing or damaged" 1>&2
149 | fi
150 |
151 | if test "x$v" != x
152 | then
153 | : # use $v
154 | # Otherwise, if there is at least one git commit involving the working
155 | # directory, and "git describe" output looks sensible, use that to
156 | # derive a version string.
157 | elif test "`git log -1 --pretty=format:x . 2>&1`" = x \
158 | && v=`git describe --tags --abbrev=4 --match="$prefix*" HEAD 2>/dev/null \
159 | || git describe --tags --abbrev=4 HEAD 2>/dev/null` \
160 | && v=`printf '%s\n' "$v" | sed "$tag_sed_script"` \
161 | && case $v in
162 | $prefix[0-9]*) ;;
163 | *) (exit 1) ;;
164 | esac
165 | then
166 | # Is this a new git that lists number of commits since the last
167 | # tag or the previous older version that did not?
168 | # Newer: v6.10_77_g0f8faeb
169 | # Older: v6.10_g0f8faeb
170 | vprefix=`expr "X$v" : 'X\(.*\)_g[^_]*$'` || vprefix=$v
171 | case $vprefix in
172 | *-*) : git describe is probably okay three part flavor ;;
173 | *)
174 | : git describe is older two part flavor
175 | # Recreate the number of commits and rewrite such that the
176 | # result is the same as if we were using the newer version
177 | # of git describe.
178 | vtag=`echo "$v" | sed 's/_.*//'`
179 | commit_list=`git rev-list "$vtag"..HEAD 2>/dev/null` \
180 | || { commit_list=failed;
181 | echo "$0: WARNING: git rev-list failed" 1>&2; }
182 | numcommits=`echo "$commit_list" | wc -l`
183 | v=`echo "$v" | sed "s/\(.*\)_\(.*\)/\1_$numcommits_\2/"`;
184 | test "$commit_list" = failed && v=UNKNOWN
185 | ;;
186 | esac
187 |
188 | # Change the penultimate "_" to ".", for version-comparing tools.
189 | # Remove the "g" to save a byte.
190 | # The rpm version must not contain a -
191 | v=`echo "$v" | sed 's/_\([^_]*\)_g\([^_]*\)$/.\1_\2/'`;
192 | v_from_git=1
193 | elif test "x$fallback" = x || git --version >/dev/null 2>&1; then
194 | v=UNKNOWN
195 | else
196 | v=$fallback
197 | fi
198 |
199 | v=`echo "$v" |sed "s/^$prefix//"`
200 |
201 | # Test whether to append the "_dirty" suffix only if the version
202 | # string we're using came from git. I.e., skip the test if it's "UNKNOWN"
203 | # or if it came from .tarball-version.
204 | if test "x$v_from_git" != x; then
205 | # Don't declare a version "dirty" merely because a timestamp has changed.
206 | git update-index --refresh > /dev/null 2>&1
207 |
208 | dirty=`exec 2>/dev/null;git diff-index --name-only HEAD` || dirty=
209 | case "$dirty" in
210 | '') ;;
211 | *) # Append the suffix only if there isn't one already.
212 | case $v in
213 | *_dirty) ;;
214 | *) v="${v}_dirty" ;;
215 | esac ;;
216 | esac
217 | fi
218 |
219 | # Omit the trailing newline, so that m4_esyscmd can use the result directly.
220 | printf %s "$v"
221 |
222 | # Local variables:
223 | # eval: (add-hook 'before-save-hook 'time-stamp)
224 | # time-stamp-start: "scriptversion="
225 | # time-stamp-format: "%:y-%02m-%02d.%02H"
226 | # time-stamp-time-zone: "UTC0"
227 | # time-stamp-end: "; # UTC"
228 | # End:
229 |
--------------------------------------------------------------------------------
/c11-all.h:
--------------------------------------------------------------------------------
1 | /* ex: set ro ft=c: -*- mode: c; buffer-read-only: t -*- */
2 | /* libu8ident - Check unicode security guidelines for identifiers.
3 | Copyright 2021, 2022 Reini Urban
4 | SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
5 |
6 | Generated by mkc26 from unic11.h.
7 | UNICODE version 16.0
8 | */
9 | static const struct range_bool c11_start_list[] = {
10 | {'$', '$'}, {'A', 'Z'}, {'_', '_'}, {'a', 'z'},
11 | {0x00A8, 0x00A8}, {0x00AA, 0x00AA},
12 | {0x00AD, 0x00AD}, {0x00AF, 0x00AF}, {0x00B2, 0x00B5},
13 | {0x00B7, 0x00BA}, {0x00BC, 0x00BE}, {0x00C0, 0x00D6},
14 | {0x00D8, 0x00F6}, {0x00F8, 0x00FF},
15 | // {0x0100, 0x02FF}, // Latin, 2B0-2FF: Modifiers (2EA Bopomofo)
16 | {0xFF, 0x2FF}, // Latin ÿ..˿
17 | {0x370, 0x167F}, // Greek Ͱ..ᙿ
18 | {0x1681, 0x180D}, // Ogham (Excluded) ᚁ..᠍
19 | {0x180F, 0x1DBF}, // Mongolian (Excluded) ᠏..ᶿ
20 | {0x1E00, 0x1FFF}, // Latin Ḁ..
21 | {0x200B, 0x200D}, // Common ..
22 | {0x202A, 0x202E}, // Common ..
23 | {0x203F, 0x2040}, // Common ‿..⁀
24 | {0x2054, 0x2054}, // Common ⁔
25 | {0x2060, 0x20CF}, // Common ..
26 | {0x2100, 0x218F}, // Common ℀..
27 | {0x2460, 0x24FF}, // Common ①..⓿
28 | {0x2776, 0x2793}, // Common ❶..➓
29 | {0x2C00, 0x2DFF}, // Glagolitic (Excluded) Ⰰ..ⷿ
30 | {0x2E80, 0x2FFF}, // Han ⺀..
31 | {0x3004, 0x3007}, // Common 〄..〇
32 | {0x3021, 0x302F}, // Han 〡..〯
33 | {0x3031, 0xD7FF}, // Common 〱..
34 | {0xF900, 0xFD3D}, // Han 豈..ﴽ
35 | {0xFD40, 0xFDCF}, // Arabic ﵀..﷏
36 | {0xFDF0, 0xFE1F}, // Arabic ﷰ..
37 | {0xFE30, 0xFE44}, // Common ︰..﹄
38 | {0xFE47, 0xFFFD}, // Common ﹇..�
39 | {0x10000, 0x1FFFD}, // Linear_B (Excluded) 𐀀..
40 | {0x20000, 0x2FFFD}, // Han 𠀀..
41 | {0x30000, 0x3FFFD}, // Han 𰀀..
42 | {0x40000, 0x4FFFD}, // (null) (Limited) ..
43 | {0x50000, 0x5FFFD}, // (null) (Limited) ..
44 | {0x60000, 0x6FFFD}, // (null) (Limited) ..
45 | {0x70000, 0x7FFFD}, // (null) (Limited) ..
46 | {0x80000, 0x8FFFD}, // (null) (Limited) ..
47 | {0x90000, 0x9FFFD}, // (null) (Limited) ..
48 | {0xA0000, 0xAFFFD}, // (null) (Limited) ..
49 | {0xB0000, 0xBFFFD}, // (null) (Limited) ..
50 | {0xC0000, 0xCFFFD}, // (null) (Limited) ..
51 | {0xD0000, 0xDFFFD}, // (null) (Limited) ..
52 | {0xE0000, 0xEFFFD}, // (null) (Limited) ..
53 | }; // 36 ranges, 1 singles, 971267 codepoints
54 | static const struct range_bool c11_cont_list[] = {
55 | {'$', '$'},
56 | {'0', '9'},
57 | {0x300, 0x36F}, // Inherited ̀..ͯ
58 | {0x1DC0, 0x1DFF}, // Inherited ᷀..᷿
59 | {0x20D0, 0x20FF}, // Inherited ⃐..
60 | {0xFE20, 0xFE2F}, // Inherited ︠..︯
61 | }; // 4 ranges, 0 singles, 236 codepoints
62 |
--------------------------------------------------------------------------------
/cmakeconfig.h.in:
--------------------------------------------------------------------------------
1 | /* config.h. Generated from cmakeconfig.h.in by cmake. */
2 |
3 | #ifndef __U8ID_CONF_H__
4 | #define __U8ID_CONF_H__
5 |
6 | #define HAVE_CONFIG_H 1
7 | /* Defined to 1 on a cmake build */
8 | #define CMAKE_BUILD 1
9 |
10 | /* Define to force a single normalization; U8ID_NORM expands to its name. */
11 | #cmakedefine LIBU8IDENT_NORM
12 | #ifdef LIBU8IDENT_NORM
13 | # define U8ID_NORM @LIBU8IDENT_NORM@
14 | #else
15 | # undef U8ID_NORM
16 | #endif
17 |
18 | /* Define to force a single profile; U8ID_PROFILE expands to its value. */
19 | #cmakedefine LIBU8IDENT_PROFILE
20 | #ifdef LIBU8IDENT_PROFILE
21 | # define U8ID_PROFILE @LIBU8IDENT_PROFILE@
22 | #else
23 | # undef U8ID_PROFILE
24 | #endif
25 |
26 | /* Define to force a single check_xid option. */
27 | #cmakedefine ENABLE_CHECK_XID
28 | #cmakedefine DISABLE_CHECK_XID
29 |
30 | /* Define to enable confus.h. */
31 | #cmakedefine HAVE_CONFUS
32 |
33 | /* Define if a shared library will be built */
34 | #define ENABLE_SHARED @BUILD_SHARED_LIBS@
35 |
36 | /* TODO Defined to 1 when the compiler supports c11 */
37 | #define HAVE_C11 1
38 |
39 | /* Define if you have the <dlfcn.h> header file. */
40 | #cmakedefine HAVE_DLFCN_H
41 |
42 | /* Define if you have the <dirent.h> header file. */
43 | #cmakedefine HAVE_DIRENT_H
44 |
45 | /* Define if you have the <getopt.h> header file. */
46 | #cmakedefine HAVE_GETOPT_H
47 |
48 | /* Define if you have the `getopt_long' function. */
49 | #cmakedefine HAVE_GETOPT_LONG
50 |
51 | /* Define if you have the <inttypes.h> header file. */
52 | #cmakedefine HAVE_INTTYPES_H
53 |
54 | /* Define if you have the <libgen.h> header file. */
55 | #cmakedefine HAVE_LIBGEN_H
56 |
57 | /* Define if you have the <malloc.h> header file. */
58 | #cmakedefine HAVE_MALLOC_H
59 |
60 | /* Define if you have the <memory.h> header file. */
61 | #cmakedefine HAVE_MEMORY_H
62 |
63 | /* Define if you have the `memset' function. */
64 | #cmakedefine HAVE_MEMSET
65 |
66 | /* Define if stdbool.h conforms to C99. */
67 | #cmakedefine HAVE_STDBOOL_H
68 |
69 | /* Define if you have the <stddef.h> header file. */
70 | #cmakedefine HAVE_STDDEF_H
71 |
72 | /* Define if you have the <stdint.h> header file. */
73 | #cmakedefine HAVE_STDINT_H
74 |
75 | /* Define if you have the <stdlib.h> header file. */
76 | #cmakedefine HAVE_STDLIB_H
77 |
78 | /* Define if you have the <strings.h> header file. */
79 | #cmakedefine HAVE_STRINGS_H
80 |
81 | /* Define if you have the <string.h> header file. */
82 | #cmakedefine HAVE_STRING_H
83 |
84 | /* Define if you have the <sys/stat.h> header file. */
85 | #cmakedefine HAVE_SYS_STAT_H
86 |
87 | /* Define if you have the <sys/types.h> header file. */
88 | #cmakedefine HAVE_SYS_TYPES_H
89 |
90 | /* Define if you have the <unistd.h> header file. */
91 | #cmakedefine HAVE_UNISTD_H
92 |
93 | /* TODO Define if the system has the type `_Bool'. */
94 | #define HAVE__BOOL 1
95 |
96 | /* Define if the system has the `__builtin_ffs' built-in function */
97 | #cmakedefine HAVE___BUILTIN_FFS
98 |
99 | /* The size of `size_t', as computed by sizeof. */
100 | #define SIZEOF_SIZE_T ${SIZE_T}
101 |
102 | /* Define to the full name of this package. */
103 | #define PACKAGE_NAME "libu8ident"
104 |
105 | /* Define to the full name and version of this package. */
106 | #define PACKAGE_STRING "libu8ident ${PACKAGE_VERSION}"
107 |
108 | /* Define to the one symbol short name of this package. */
109 | #define PACKAGE_TARNAME "libu8ident"
110 |
111 | /* Define to the home page for this package. */
112 | #define PACKAGE_URL "http://github.com/rurban/libu8ident/"
113 |
114 | /* Define to the version of this package. */
115 | #define PACKAGE_VERSION "${PACKAGE_VERSION}"
116 |
117 | /* Define for Solaris 2.5.1 so the uint32_t typedef from <sys/synch.h>,
118 | <pthread.h>, or <semaphore.h> is not used. If the typedef were allowed, the
119 | #define below would cause a syntax error. */
120 | /* #undef _UINT32_T */
121 |
122 | /* Define for Solaris 2.5.1 so the uint64_t typedef from <sys/synch.h>,
123 | <pthread.h>, or <semaphore.h> is not used. If the typedef were allowed, the
124 | #define below would cause a syntax error. */
125 | /* #undef _UINT64_T */
126 |
127 | /* Define to `__inline__' or `__inline' if that's what the C compiler
128 | calls it, or to nothing if 'inline' is not supported under any name. */
129 | #ifndef __cplusplus
130 | /* #undef inline */
131 | #endif
132 |
133 | /* Define to the type of a signed integer type of width exactly 16 bits if
134 | such a type exists and the standard includes do not define it. */
135 | /* #undef int16_t */
136 |
137 | /* Define to the type of a signed integer type of width exactly 32 bits if
138 | such a type exists and the standard includes do not define it. */
139 | /* #undef int32_t */
140 |
141 | /* Define to the type of a signed integer type of width exactly 64 bits if
142 | such a type exists and the standard includes do not define it. */
143 | /* #undef int64_t */
144 |
145 | /* Define to rpl_malloc if the replacement function should be used. */
146 | /* #undef malloc */
147 |
148 | /* Define to rpl_realloc if the replacement function should be used. */
149 | /* #undef realloc */
150 |
151 | /* If restrict is broken with this compiler */
152 | #ifdef __cplusplus
153 | #define restrict
154 | #endif
155 |
156 | /* Define to `unsigned int' if <sys/types.h> does not define. */
157 | /* #undef size_t */
158 |
159 | /* Define to the type of an unsigned integer type of width exactly 16 bits if
160 | such a type exists and the standard includes do not define it. */
161 | /* #undef uint16_t */
162 |
163 | /* Define to the type of an unsigned integer type of width exactly 32 bits if
164 | such a type exists and the standard includes do not define it. */
165 | /* #undef uint32_t */
166 |
167 | /* Define to the type of an unsigned integer type of width exactly 64 bits if
168 | such a type exists and the standard includes do not define it. */
169 | /* #undef uint64_t */
170 |
171 | #endif /* __U8ID_CONF_H__ */
172 |
--------------------------------------------------------------------------------
/config.h.in:
--------------------------------------------------------------------------------
1 | /* config.h.in. Generated from configure.ac by autoheader. */
2 |
3 |
4 | #ifndef __U8ID_CONF_H__
5 | #define __U8ID_CONF_H__
6 |
7 |
8 | /* Defined to the path to the CRoaring library */
9 | #undef CROARING_PATH
10 |
11 | /* Define to 1 if you have the <assert.h> header file. */
12 | #undef HAVE_ASSERT_H
13 |
14 | /* Defined to 1 when the compiler supports c11 */
15 | #undef HAVE_C11
16 |
17 | /* Defined to 1 to add the confusable API */
18 | #undef HAVE_CONFUS
19 |
20 | /* Defined to 1 when to use the CRoaring library */
21 | #undef HAVE_CROARING
22 |
23 | /* Define to 1 if you have the <dirent.h> header file. */
24 | #undef HAVE_DIRENT_H
25 |
26 | /* Define to 1 if you have the <dlfcn.h> header file. */
27 | #undef HAVE_DLFCN_H
28 |
29 | /* Define to 1 if you have the <getopt.h> header file. */
30 | #undef HAVE_GETOPT_H
31 |
32 | /* Define to 1 if you have the `getopt_long' function. */
33 | #undef HAVE_GETOPT_LONG
34 |
35 | /* Define to 1 if you have the <inttypes.h> header file. */
36 | #undef HAVE_INTTYPES_H
37 |
38 | /* Define to 1 if you have the <libgen.h> header file. */
39 | #undef HAVE_LIBGEN_H
40 |
41 | /* Define to 1 if you have the <malloc.h> header file. */
42 | #undef HAVE_MALLOC_H
43 |
44 | /* Define to 1 if stdbool.h conforms to C99. */
45 | #undef HAVE_STDBOOL_H
46 |
47 | /* Define to 1 if you have the <stddef.h> header file. */
48 | #undef HAVE_STDDEF_H
49 |
50 | /* Define to 1 if you have the <stdint.h> header file. */
51 | #undef HAVE_STDINT_H
52 |
53 | /* Define to 1 if you have the <stdio.h> header file. */
54 | #undef HAVE_STDIO_H
55 |
56 | /* Define to 1 if you have the <stdlib.h> header file. */
57 | #undef HAVE_STDLIB_H
58 |
59 | /* Define to 1 if you have the <strings.h> header file. */
60 | #undef HAVE_STRINGS_H
61 |
62 | /* Define to 1 if you have the <string.h> header file. */
63 | #undef HAVE_STRING_H
64 |
65 | /* Define to 1 if you have the <sys/stat.h> header file. */
66 | #undef HAVE_SYS_STAT_H
67 |
68 | /* Define to 1 if you have the <sys/types.h> header file. */
69 | #undef HAVE_SYS_TYPES_H
70 |
71 | /* Define to 1 if you have the `u8_wordbreaks' function. */
72 | #undef HAVE_U8_WORDBREAKS
73 |
74 | /* Define to 1 if you have the <unistd.h> header file. */
75 | #undef HAVE_UNISTD_H
76 |
77 | /* Define to 1 if you have the <uniwbrk.h> header file. */
78 | #undef HAVE_UNIWBRK_H
79 |
80 | /* Define to 1 if the system has the type `_Bool'. */
81 | #undef HAVE__BOOL
82 |
83 | /* Define to 1 if the system has the `__builtin_ffs' built-in function */
84 | #undef HAVE___BUILTIN_FFS
85 |
86 | /* Define to the sub-directory where libtool stores uninstalled libraries. */
87 | #undef LT_OBJDIR
88 |
89 | /* Define to the address where bug reports for this package should be sent. */
90 | #undef PACKAGE_BUGREPORT
91 |
92 | /* Define to the full name of this package. */
93 | #undef PACKAGE_NAME
94 |
95 | /* Define to the full name and version of this package. */
96 | #undef PACKAGE_STRING
97 |
98 | /* Define to the one symbol short name of this package. */
99 | #undef PACKAGE_TARNAME
100 |
101 | /* Define to the home page for this package. */
102 | #undef PACKAGE_URL
103 |
104 | /* Define to the version of this package. */
105 | #undef PACKAGE_VERSION
106 |
107 | /* The size of `size_t', as computed by sizeof. */
108 | #undef SIZEOF_SIZE_T
109 |
110 | /* Define to 1 if all of the C90 standard headers exist (not just the ones
111 | required in a freestanding environment). This macro is provided for
112 | backward compatibility; new code need not use it. */
113 | #undef STDC_HEADERS
114 |
115 | /* Define for Solaris 2.5.1 so the uint32_t typedef from <sys/synch.h>,
116 | <pthread.h>, or <semaphore.h> is not used. If the typedef were allowed, the
117 | #define below would cause a syntax error. */
118 | #undef _UINT32_T
119 |
120 | /* Define for Solaris 2.5.1 so the uint64_t typedef from <sys/synch.h>,
121 | <pthread.h>, or <semaphore.h> is not used. If the typedef were allowed, the
122 | #define below would cause a syntax error. */
123 | #undef _UINT64_T
124 |
125 | /* Define for Solaris 2.5.1 so the uint8_t typedef from <sys/synch.h>,
126 | <pthread.h>, or <semaphore.h> is not used. If the typedef were allowed, the
127 | #define below would cause a syntax error. */
128 | #undef _UINT8_T
129 |
130 | /* Define to empty if `const' does not conform to ANSI C. */
131 | #undef const
132 |
133 | /* Define to `__inline__' or `__inline' if that's what the C compiler
134 | calls it, or to nothing if 'inline' is not supported under any name. */
135 | #ifndef __cplusplus
136 | #undef inline
137 | #endif
138 |
139 | /* Define to the type of a signed integer type of width exactly 32 bits if
140 | such a type exists and the standard includes do not define it. */
141 | #undef int32_t
142 |
143 | /* Define to the equivalent of the C99 'restrict' keyword, or to
144 | nothing if this is not supported. Do not define if restrict is
145 | supported only directly. */
146 | #undef restrict
147 | /* Work around a bug in older versions of Sun C++, which did not
148 | #define __restrict__ or support _Restrict or __restrict__
149 | even though the corresponding Sun C compiler ended up with
150 | "#define restrict _Restrict" or "#define restrict __restrict__"
151 | in the previous line. This workaround can be removed once
152 | we assume Oracle Developer Studio 12.5 (2016) or later. */
153 | #if defined __SUNPRO_CC && !defined __RESTRICT && !defined __restrict__
154 | # define _Restrict
155 | # define __restrict__
156 | #endif
157 |
158 | /* Define to `unsigned int' if <sys/types.h> does not define. */
159 | #undef size_t
160 |
161 | /* Define to the type of an unsigned integer type of width exactly 16 bits if
162 | such a type exists and the standard includes do not define it. */
163 | #undef uint16_t
164 |
165 | /* Define to the type of an unsigned integer type of width exactly 32 bits if
166 | such a type exists and the standard includes do not define it. */
167 | #undef uint32_t
168 |
169 | /* Define to the type of an unsigned integer type of width exactly 64 bits if
170 | such a type exists and the standard includes do not define it. */
171 | #undef uint64_t
172 |
173 | /* Define to the type of an unsigned integer type of width exactly 8 bits if
174 | such a type exists and the standard includes do not define it. */
175 | #undef uint8_t
176 |
177 |
178 | #endif /* __U8ID_CONF_H__ */
179 |
180 |
--------------------------------------------------------------------------------
/configure.ac:
--------------------------------------------------------------------------------
1 | # configure.ac - Process with autoconf to produce a configure script
2 | # (C) 2021-2022 Reini Urban
3 | # See LICENSE
4 | AC_PREREQ([2.69])
5 | AC_INIT([libu8ident],
6 | m4_esyscmd([build-aux/git-version-gen .version]),
7 | [https://github.com/rurban/libu8ident/issues],,
8 | [http://github.com/rurban/libu8ident/])
9 | AC_CONFIG_AUX_DIR([build-aux])
10 | AC_CONFIG_SRCDIR([u8ident.c])
11 | AC_CONFIG_MACRO_DIR([m4])
12 | AC_SUBST([LIBU8IDENT_SO_VERSION], [1:0:0])
13 | VERSION_FILE=".version"
14 | AC_SUBST([VERSION_FILE])
15 | AC_MSG_CHECKING([git version])
16 | AC_MSG_RESULT($PACKAGE_VERSION)
17 | AC_MSG_CHECKING([so version-info])
18 | AC_MSG_RESULT($LIBU8IDENT_SO_VERSION)
19 |
20 | AM_INIT_AUTOMAKE([1.10 no-define foreign dist-xz])
21 | m4_ifdef([AM_SILENT_RULES], [AM_SILENT_RULES([yes])])
22 | #dnl m4_pattern_allow
23 | LT_INIT
24 | AC_CHECK_TOOLS([AR], [gcc-ar ar])
25 | AC_CHECK_TOOLS([RANLIB], [gcc-ranlib ranlib])
26 | AC_CHECK_TOOLS([NM], [gcc-nm nm])
27 | #dnl Initialize pkg-config since we are installing a .pc file
28 | PKG_INSTALLDIR
29 | AM_CPPFLAGS="-I\$(top_srcdir)/include"
30 |
31 | AC_MSG_NOTICE([
32 |
33 | Check options])
34 | # ===============================================
35 | # hardcode certain options, to create smaller and faster binaries
36 |
37 | # TR31 charset profile for identifiers (values are case-sensitive, uppercase)
38 | # Recommended: --with-tr31=ALLOWED or NONE
39 | AC_MSG_CHECKING([for --with-tr31=])
40 | AC_ARG_WITH([tr31],
41 | [AS_HELP_STRING([--with-tr31={ALLOWED,SAFEC26,ID,XID,C11,C23,ALLUTF8,NONE}],
42 | [support only this TR31 identifier profile. @<:@default=xid@:>@
43 | Recommended: --with-tr31=ALLOWED])],
44 | [case "${with_tr31}" in
45 | ALLOWED|SAFEC26|ID|XID|C11|C23|ALLUTF8) U8ID_TR31=${with_tr31}; AC_MSG_RESULT([$with_tr31]) ;;
46 | NONE) CFLAGS="$CFLAGS -DDISABLE_U8ID_TR31"; AC_MSG_RESULT([$with_tr31]) ;;
47 | *) AC_MSG_ERROR([invalid ${with_tr31}: only ALLOWED,SAFEC26,ID,XID,C11,C23,ALLUTF8,NONE]) ;;
48 | esac],
49 | [ U8ID_TR31=; AC_MSG_RESULT([none. support all (default). Recommended: ALLOWED]) ]
50 | )
51 | if test -n "$U8ID_TR31"; then
52 | CFLAGS="$CFLAGS -DU8ID_TR31=$U8ID_TR31"
53 | dnl AC_DEFINE([U8IDP_TR31],[$U8ID_TR31],[Define to ALLOWED,SAFEC26,ID,XID,C11,C23,ALLUTF8 to support only this TR31 identifier charset])
54 | fi
55 | AC_SUBST([U8ID_TR31])
56 |
57 | # TR39 Mixed-script security profile for identifiers
58 | # Recommended: --with-profile=4
59 | AC_MSG_CHECKING([for --with-profile=])
60 | AC_ARG_WITH([profile],
61 | [AS_HELP_STRING([--with-profile={2,3,4,5,6,C26_4,C11_6}],
62 | [support only this TR39 Unicode security profile. @<:@default=4@:>@
63 | Recommended: --with-profile=4])],
64 | [case "${with_profile}" in
65 | 2|3|4|5|6|C26_4|C11_6) U8ID_PROFILE=${with_profile}; AC_MSG_RESULT([$with_profile]) ;;
66 | *) AC_MSG_ERROR([invalid ${with_profile}: only 2,3,4,5,6,C26_4,C11_6]) ;;
67 | esac],
68 | [ U8ID_PROFILE=; AC_MSG_RESULT([none. support all (default). Recommended: 4]) ]
69 | )
70 | if test -n "$U8ID_PROFILE"; then
71 | CFLAGS="$CFLAGS -DU8ID_PROFILE=$U8ID_PROFILE"
72 | dnl AC_DEFINE([U8ID_PROFILE],[$U8ID_PROFILE],[Define to 2-6,C26_4,C11_6 to support only this Unicode security profile])
73 | fi
74 | AC_SUBST([U8ID_PROFILE])
75 |
76 | # Recommended: --with-norm=NFC (also accepted: NFKC, NFD, NFKD, FCC, FCD)
77 | AC_MSG_CHECKING([for --with-norm=])
78 | AC_ARG_WITH([norm],
79 | [AS_HELP_STRING([--with-norm=NF{K,}{C,D}],
80 | [support only this normalization method. @<:@default=@:>@
81 | Recommended: --with-norm=NFC])],
82 | [case "${with_norm}" in
83 | NFKD|NFKC|NFC|NFD) U8ID_NORM=${with_norm}; AC_MSG_RESULT([$with_norm]) ;;
84 | FCC|FCD) U8ID_NORM=${with_norm}; AC_MSG_RESULT([$with_norm]) ;;
85 | *) AC_MSG_ERROR([invalid ${with_norm}: only NFC,NFKC,NFD,NFKD,FCC,FCD]) ;;
86 | esac],
87 | [ U8ID_NORM=; AC_MSG_RESULT([none. support all (default). Recommended: NFC]) ]
88 | )
89 | if test -n "$U8ID_NORM"; then
90 | if test x$U8ID_NORM != xNFC && test x$U8ID_PROFILE = xC26_4 -o x$U8ID_TR31 = xSAFEC26 -o x$U8ID_TR31 = xC23
91 | then
92 | AC_MSG_ERROR([Profile $U8ID_PROFILE or TR31 $U8ID_TR31 requires NFC])
93 | else
94 | CFLAGS="$CFLAGS -DU8ID_NORM=$U8ID_NORM"
95 | dnl AC_DEFINE([U8ID_NORM],[$U8ID_NORM],[Define to NF{K,}{C,D} to support only this Unicode normalization])
96 | fi
97 | fi
98 | AC_SUBST([U8ID_NORM])
99 |
100 | AC_MSG_CHECKING([for --enable-confus])
101 | AC_ARG_ENABLE([confus],
102 | [AS_HELP_STRING([--enable-confus],
103 | [add the confusables set @<:@default=no@:>@])],
104 | [case "${enableval}" in
105 | yes) HAVE_CONFUS=1;
106 | AC_DEFINE([HAVE_CONFUS], 1,
107 | [Defined to 1 to add the confusable API])
108 | AC_MSG_RESULT([yes])
109 | ;;
110 | *) AC_MSG_RESULT([no]) ;;
111 | esac], [HAVE_CONFUS= ])
112 | if test -z "$HAVE_CONFUS"; then
113 | AC_MSG_RESULT([no])
114 | fi
115 | AC_SUBST([HAVE_CONFUS])
116 |
117 | download_croaring() {
118 | git clone https://github.com/RoaringBitmap/CRoaring
119 | }
120 |
121 | AC_MSG_CHECKING([for --with-croaring])
122 | AC_ARG_WITH([croaring],
123 | [AS_HELP_STRING([--with-croaring=optional_path],
124 | [Use the CRoaring library.])],
125 | [case "${with_croaring}" in
126 | yes) HAVE_CROARING=1
127 | AC_MSG_RESULT([$with_croaring])
128 | AC_CHECK_TOOLS([XXD], [xxd])
129 | # no path given: look for a checkout in a parallel dir, else clone it
130 | with_croaring="$srcdir/../CRoaring"
131 | if test -d "${with_croaring}" && \
132 | test -e "${with_croaring}/roaring.c" && \
133 | test -e "${with_croaring}/roaring.h"
134 | then
135 | CROARING_PATH="\"$srcdir/../CRoaring\""
136 | AC_MSG_RESULT([found "$srcdir/../CRoaring"])
137 | else
138 | download_croaring && CROARING_PATH="CRoaring"
139 | fi
140 | ;;
141 | *) if test -d "${with_croaring}" && \
142 | test -e "${with_croaring}/roaring.c" && \
143 | test -e "${with_croaring}/roaring.h"
144 | then
145 | HAVE_CROARING=1
146 | CROARING_PATH="\"${with_croaring}\""
147 | AC_MSG_RESULT(["$with_croaring"])
148 | else
149 | AC_MSG_ERROR([invalid ${with_croaring} path])
150 | fi
151 | ;;
152 | esac],
153 | [ HAVE_CROARING=; AC_MSG_RESULT([no]) ]
154 | )
155 |
156 | if test -n "$HAVE_CROARING"; then
157 | AC_DEFINE([HAVE_CROARING], 1,
158 | [Defined to 1 when to use the CRoaring library])
159 | fi
160 | AC_SUBST([HAVE_CROARING])
161 | AC_SUBST([CROARING_PATH])
162 | if test -n "$CROARING_PATH"; then
163 | AC_DEFINE_UNQUOTED([CROARING_PATH], $CROARING_PATH,
164 | [Defined to the path to the CRoaring library])
165 | fi
166 | AM_CONDITIONAL([HAVE_CONFUS], [test x$HAVE_CONFUS = x1])
167 | AM_CONDITIONAL([HAVE_CROARING], [test x$HAVE_CROARING = x1])
168 |
169 | AC_MSG_NOTICE([
170 |
171 | Check programs])
172 | # ===============================================
173 | AC_CANONICAL_HOST
174 | AC_PROG_CC
175 | AC_PROG_CPP
176 | AC_PROG_INSTALL
177 | AM_PROG_CC_C_O
178 | AC_PROG_CC
179 | dnl AC_PROG_CC_C11
180 | if test "x$ac_cv_prog_cc_c11" != "xno"; then
181 | AC_DEFINE([HAVE_C11], 1,
182 | [Defined to 1 when the compiler supports c11])
183 | if test "x$ac_cv_prog_cc_c11" != "x"; then
184 | AC_MSG_RESULT([added $ac_cv_prog_cc_c11 to CFLAGS])
185 | AM_CFLAGS="$AM_CFLAGS $ac_cv_prog_cc_c11"
186 | fi
187 | fi
188 | AC_SUBST(HAVE_C11)
189 | AC_CACHE_CHECK([if compiling with clang], [ax_cv_cc_target_clang],
190 | [
191 | saved_CFLAGS="$CFLAGS"
192 | CFLAGS="$TARGET_CFLAGS"
193 | AC_COMPILE_IFELSE(
194 | [AC_LANG_PROGRAM([], [[
195 | #ifdef __clang__
196 | #error "is clang"
197 | #endif
198 | ]])],
199 | [ax_cv_cc_target_clang=no], [ax_cv_cc_target_clang=yes])
200 | CFLAGS="$saved_CFLAGS"
201 | ])
202 |
203 | # clang has no broken return-local-addr detection
204 | if test x$ac_cv_c_compiler_gnu = xyes -a $ax_cv_cc_target_clang = xno; then
205 | case $host_os in
206 | mingw*|cygwin*|msys*)
207 | WARN_CFLAGS="-Wall -Wextra" ;;
208 | *) dnl older non-c99 g++ complain about __VA_ARGS__, for();
209 | if test "x$ac_cv_prog_cc_c99" = "xno"; then
210 | WARN_CFLAGS="-Wall -Wextra"
211 | else
212 | WARN_CFLAGS="-Wall -Wextra -Werror -Wno-return-local-addr"
213 | fi
214 | ;;
215 | esac
216 | elif test $ax_cv_cc_target_clang = xyes; then
217 | WARN_CFLAGS="-Wall -Wextra -Werror"
218 | else
219 | WARN_CFLAGS="-Wall -Wextra"
220 | fi
221 | AC_SUBST(WARN_CFLAGS)
222 |
223 | dnl optional for packaging or maintenance
224 | AC_CHECK_TOOLS([WGET], [wget2 wget])
225 | # dnf install rubygem-ronn-ng
226 | AC_CHECK_TOOLS([RONN], [ronn])
227 | AC_CHECK_TOOLS([PERL], [cperl perl])
228 | AC_CHECK_TOOLS([PANDOC], [pandoc])
229 | AM_CONDITIONAL([HAVE_PANDOC], [test -n "$PANDOC"])
230 | AC_CHECK_TOOLS([GPERF], [gperf])
231 |
232 | AC_MSG_NOTICE([Check header files])
233 | # ===============================================
234 | AC_HEADER_STDBOOL
235 | AC_CHECK_HEADERS([stdlib.h \
236 | malloc.h \
237 | string.h \
238 | stddef.h \
239 | inttypes.h \
240 | stdint.h \
241 | assert.h \
242 | unistd.h \
243 | getopt.h \
244 | sys/types.h \
245 | sys/stat.h \
246 | dirent.h \
247 | libgen.h \
248 | uniwbrk.h \
249 | ])
250 | AC_SEARCH_LIBS([u8_wordbreaks],[unistring],
251 | [AC_DEFINE([HAVE_U8_WORDBREAKS], [1], [Define to 1 if you have the `u8_wordbreaks' function.])
252 | u8idlint_LIBS=$ac_cv_search_u8_wordbreaks
253 | AC_SUBST([u8idlint_LIBS])
254 | AC_SUBST([HAVE_U8_WORDBREAKS])
255 | ],
256 | [if pkg-config --exists libunistring; then
257 | AC_DEFINE([HAVE_U8_WORDBREAKS], [1], [Define to 1 if you have the `u8_wordbreaks' function.])
258 | U8IDLINT_CFLAGS=`pkg-config libunistring --cflags`
259 | u8idlint_LIBS=`pkg-config libunistring --libs`
260 | AC_SUBST([U8IDLINT_CFLAGS])
261 | AC_SUBST([u8idlint_LIBS])
262 | AC_SUBST([HAVE_U8_WORDBREAKS])
263 | else
264 | AC_MSG_WARN([libunistring for u8idlint not found])
265 | fi])
266 | AM_CONDITIONAL([HAVE_LIBUNISTRING], [test x$HAVE_U8_WORDBREAKS = x1])
267 |
268 | AC_MSG_NOTICE([Check typedefs, structures, and compiler characteristics])
269 | # ===============================================
270 | AC_C_CONST
271 | AC_C_INLINE
272 | AC_C_RESTRICT
273 | AC_TYPE_SIZE_T
274 | AC_TYPE_INT32_T
275 | AC_TYPE_UINT8_T
276 | AC_TYPE_UINT16_T
277 | AC_TYPE_UINT32_T
278 | AC_TYPE_UINT64_T
279 | dnl AC_TYPE_UINTPTR_T
280 | AC_CHECK_SIZEOF(size_t)
281 | dnl AC_CHECK_FUNCS([ memset strcmp ])
282 | AC_CHECK_FUNCS([getopt_long],[],
283 | AC_MSG_WARN([getopt_long not found. u8idlint will not accept long options.]))
284 | AX_GCC_BUILTIN(__builtin_ffs)
285 |
286 | case $host_os in
287 | mingw*|cygwin*|msys*)
288 | have_windows=1
289 | CFLAGS="$CFLAGS -fstack-protector"
290 | ;;
291 | esac
292 | AM_CONDITIONAL([HAVE_WINDOWS], [test x1 = x$have_windows])
293 |
294 | AC_CHECK_TOOLS([PERL], [cperl perl])
295 | AC_CHECK_TOOLS([WGET], [wget])
296 |
297 | AH_TOP([
298 | #ifndef __U8ID_CONF_H__
299 | #define __U8ID_CONF_H__
300 | ])
301 |
302 | AH_BOTTOM([
303 | #endif /* __U8ID_CONF_H__ */
304 | ])
305 |
306 | LTEXEC="./libtool --mode=execute"
307 | AC_SUBST(LTEXEC)
308 |
309 | AC_CONFIG_HEADERS([config.h])
310 | AC_CONFIG_FILES([libu8ident.pc
311 | Makefile])
312 | AC_CONFIG_FILES([test-texts.test], [chmod +x test-texts.test])
313 | AC_OUTPUT
314 |
--------------------------------------------------------------------------------
/doc/D2528R1.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rurban/libu8ident/905e037c5fd5f9ce0ce0b78274ac2a42ff8374bd/doc/D2528R1.pdf
--------------------------------------------------------------------------------
/doc/P2528R0.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rurban/libu8ident/905e037c5fd5f9ce0ce0b78274ac2a42ff8374bd/doc/P2528R0.pdf
--------------------------------------------------------------------------------
/doc/c11.md:
--------------------------------------------------------------------------------
1 | # C11
2 |
3 | The unicode security guidelines for identifiers specify some rules
4 | for identifiers to stay identifiable. Most language and tool
5 | implementors ignored them when they added unicode support to their language.
6 |
7 | Only extremely few resisted, or did implement it. Java, cperl and rust
8 | properly implemented it. zig and J refused. gcc came up with it in version
9 | 10, but the feature request started in 2015. clang has it since 3.3,
10 | msvc focuses on insecure UTF-16 instead.
11 |
12 | # The insecure C Standard C11
13 |
14 | The C11 standard started allowing a certain range of (mostly insecure) AltId
15 | codepoints, and did not define a XID profile, nor any TR39 security restrictions.
16 | Thus this is an insecure Unrestricted profile 6.
17 | In libu8ident this is `U8ID_PROFILE_C11_6`, defined by `-DU8ID_PROFILE_C11STD`.
18 | See `unic11.h` and [N2731](http://www.open-std.org/jtc1/sc22/wg14/www/docs/n2731.pdf)
19 | Annex D (p. 425).
20 |
21 | The C11 standard ignores secure IdentifierTypes, mixed scripts violations and more.
22 | This allows easy Arabic right-to-left overrides, Greek + Cyrillic confusables, and
23 | does not mandate a normalization of identifiers, leading to confusables
24 | even with Latin-only C identifiers.
25 |
26 | See
27 | summarizing the
28 | [tr31#Alternative_Identifier_Syntax](http://unicode.org/reports/tr31/#Alternative_Identifier_Syntax)
29 | \[AltId\] in where anything is allowed but: white space, "syntax" characters,
30 | private use characters, surrogates, control characters, and
31 | non-characters.
32 |
33 | They hereby violated their own recommendations of
34 | to
35 | only allow Allowed Identifiers, which is to only allow the following
36 | properties:
37 |
38 | * L& The symbol "L&" indicates characters of type Lu, Ll, or Lt (see below).
39 | * Lu Letter, Uppercase
40 | * Ll Letter, Lowercase
41 | * Lt Letter, Titlecase
42 | * Lm Letter, Modifier
43 | * Lo Letter, Other
44 | * Mn Mark, Non-Spacing
45 | * Mc Mark, Spacing Combining
46 | * Nd Number, Decimal Digit
47 | * Nl Number, Letter
48 | * Pc Punctuation, Connector
49 |
50 | referring to the table TR10176-4. In practice you would also disallow
51 | Excluded and Limited Use scripts, only allow Allowed IdentifierStatus XID's,
52 | and check for mixed-script violations.
53 |
54 | The C++ Working Group detected the XID problem in
55 | [n2836](http://www.open-std.org/jtc1/sc22/wg14/www/docs/n2836.pdf) for
56 | C++23, but found that implementing TR39 would be too hard. "This proposal
57 | does not address some potential security concerns—so called homoglyph
58 | attacks—where letters that appear the same may be treated as
59 | distinct. Methods of defense against such attacks are complex and
60 | evolving, and requiring mitigation strategies would impose substantial
61 | implementation burden." [P1949R7](http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2021/p1949r7.html)
62 |
63 | Mixed script detection such as done here is certainly not as hard as
64 | TR31 mitigation and esp. NFC normalization. It is also not evolving, TR39 is
65 | standardized since Unicode 6, with its earliest revisions in [2006](https://www.unicode.org/Public/security/revision-02/).
66 |
67 | # The SAFEC26 Extension
68 |
69 | I provide a fixed version of the C++ and C standard for secure unicode
70 | identifiers with a special profile, called SAFEC26. This is an
71 | extended Moderate Profile, plus allowing **Greek** scripts with Latin,
72 | plus only allowing a special SAFEC26 TR31 charset. You realistically only
73 | get something into the C standard if you get it into C++ first.
74 |
75 | The secure extension disallows the Restricted and Limited_Use scripts
76 | and identifiers, disallows arbitrary right-to-left and left-to-right
77 | overrides within identifiers (i.e. no Latin + Arabic mixes), and
78 | disallows all the insecure mixed scripts combinations. But you can
79 | still write and compile unreadable Asian or Indian C code, or even
80 | Arabic (right-to-left), as long as you don't mix it with Latin
81 | (left-to-right).
82 |
83 | # Compilers
84 |
85 | The insecure C11 unicode identifiers were added to gcc, clang and
86 | chibicc. C99 already allowed Extended C99 Identifiers with \u and \U escape
87 | sequences.
88 | msvc claims to support them, but does not do so yet; it rather focuses on insecure
89 | UTF-16 identifiers.
90 | [pcc](http://pcc.ludd.ltu.se/ftp/pub/pcc-docs/pcc-utf8-ver3.pdf) was
91 | worked on also. Don't know about other C compilers, most likely they
92 | are still secure.
93 |
94 | * gcc added these with version 10. See [PR
95 | 67224](https://gcc.gnu.org/bugzilla/show_bug.cgi?id=67224) but it
96 | generated (unrestricted) UTF-8 encoded ASM output before. So you can
97 | link against earlier GCC object files.
98 |
99 | * clang started secure Extended C99 Identifier support (`\uxxxx`) plus insecure
100 | C11 Unicode Identifiers with version 3.3.
101 | [ReleaseNotes](https://releases.llvm.org/3.3/tools/clang/docs/ReleaseNotes.html),
102 | [godbolt clang 3.3](https://godbolt.org/z/1dzMxPhvc)
103 |
104 | * MSVC has [Unicode support in the compiler and linker](https://docs.microsoft.com/en-us/cpp/build/reference/unicode-support-in-the-compiler-and-linker?view=msvc-140)
105 | for identifiers, included filenames (apparently unsanitized, so RTL
106 | attacks are possible). Plus for macros, string and character
107 | literals. Since at least Visual Studio 2015. But there's a MSVC
108 | [Feature Request](https://developercommunity.visualstudio.com/t/feature-request-allow-unicode-characters-in-identi/821782)
109 | to add insecure UTF-8 identifiers to MSVC. Apparently it doesn't
110 | support them now. [godbolt msvc 19](https://godbolt.org/z/xrnPnGPff)
111 |
112 | * icc since version 19. [godbolt icc 18](https://godbolt.org/z/8fhsf5xhT)
113 |
114 | * chibicc also took the C11 standard ranges:
115 | [unicode.c](https://github.com/rui314/chibicc/commits/main/unicode.c).
116 | My [Unicode security improvements over
117 | C11](https://github.com/rui314/chibicc/issues/32) request is open.
118 |
119 | * pcc acknowledges that the original C11 set _"is not restrictive
120 | enough to ensure readable code"_.
121 | ,
122 | but does nothing much against it. The preprocessor translates UTF-8
123 | chars to escaped `\Uxxxxxxxx` strings though, to make them even
124 | more unreadable, but at least allows for easier tool checks.
125 |
126 | # Linters
127 |
128 | * clang-tidy can check with readability-identifier-naming for
129 | identifiers naming style mismatch (cases), but does not warn on any
130 | unicode security guideline violations. It could be easily added though.
131 |
132 | * This u8idlint started with unicode identifiers checks.
133 |
134 | * found no malicious homoglyphs
135 | in publicly available Go source files yet.
136 |
137 | I know of no other linters or security checkers doing unicode
138 | identifiers checks, even when the C11 standard referred to linters to
139 | do such checks. A binutils linter is also needed to check compiled
140 | libraries for insecure identifiers. See my binutils-gdb repo on
141 | github for a patched readelf to perform checks on unicode symbols.
142 |
143 | # Tools
144 |
145 | I provide with libu8ident a fixed version of the C++/C standard for
146 | a secure unicode identifiers profile, called
147 | **`U8ID_PROFILE_C26_4`**, defined by `-DU8ID_PROFILE_SAFEC26`. This is
148 | an extended Moderate Profile (4), plus allowing Greek with Latin, plus
149 | only allowing Allowed IdentifierTypes. It also demands NFC normalization.
150 | See `unic11.h`, `unic26.h` and `mkc26.c`.
151 |
152 | # Others
153 |
154 | C++, perl5, perl6 (now raku), ruby, python, php, swift, julia, D, nim, crystal
155 | ... all added insecure unicode identifiers in the last years. php just by accident,
156 | not on purpose.
157 |
158 | **zig** refused to add unicode identifiers in the latest two feature requests.
159 | Not so much because of its insecurity, more because of hassles with the yearly changing
160 | Unicode standard. Valid source code could suddenly become invalid.
161 | **J** also stayed with ASCII only.
162 |
163 | **Clojure** is also safe recommending only ASCII identifiers,
164 | [Symbols](https://clojure.org/reference/reader#_symbols), but you can intern
165 | arbitrary strings at will. Same as **Common Lisp** and all lisps in general.
166 | There are no identifier/symbol security considerations in place at all, similar
167 | to filesystems.
168 |
169 | **cperl** (an improved perl5) is very secure. It uses the security
170 | profile 4, does NFC normalization internally, and allows adding
171 | Limited_Use scripts, and special encodings for identifiers.
172 |
173 | **rust** is pretty secure.
174 | Support was discussed in [RFC 2457](https://github.com/rust-lang/rust/issues/55467),
175 | [GH 28979](https://github.com/rust-lang/rust/issues/28979) and
176 | [GH 2253](https://github.com/rust-lang/rust/issues/2253)
177 | and is described as [TR31 compatible](https://doc.rust-lang.org/reference/identifiers.html).
178 | Don't know if they decided on NFC or NFKC normalization.
179 | People are still pushing for _more inclusive_ (i.e. insecure) unicode identifiers.
180 |
181 | **C#** and **F#** follow TR31 and do NFC normalization, but do not check for mixed
182 | scripts nor Excluded scripts. [Identifiers](https://go.microsoft.com/fwlink/?LinkId=199552)
183 |
184 | **python** [PEP 3131](https://www.python.org/dev/peps/pep-3131/) at least
185 | does normalization (NFKC, an odd choice), but does not check for mixed
186 | scripts nor Excluded scripts. They did read TR39, but didn't understand
187 | nor follow it.
188 |
189 | **Java**
190 | [identifiers](https://docs.oracle.com/javase/specs/jls/se17/html/jls-3.html#jls-3.8)
191 | mentions only TR31, but no mixed scripts profile nor Excluded scripts
192 | checks. Comparison/lookup only ignores ignorables, but does no normalization.
193 |
194 | **perl** checks same as python for `XID_Start`/`XID_Continue` properties,
195 | which is at least a bit better than C11. It does not check for mixed
196 | scripts nor Excluded scripts, nor does it identifier normalization.
197 | But perl5.16 started even adding support for binary names with embedded
198 | `\0` in 2012. Which is highly insecure, because names are mapped to
199 | filenames and paths when including modules, and the filesystem just
200 | stops searching at the `\0`. This way you can trivially hide shellcode
201 | in user-identifiers.
202 | Earlier perls allowed single source encoding, like Cyrillic, which was
203 | very secure, but abandoned that a few years ago. cperl kept that. ALGOL 68
204 | also allowed encodings (a representation language), but even for its keywords.
205 |
206 | **C++** has the same insecure unicode identifiers as C.
207 |
208 | **javascript** in
209 | [ES5](https://mathiasbynens.be/notes/javascript-identifiers) used Lu,
210 | Ll, Lt, Lm, Lo or Nl for their `ID_Start`, and adds Mn, Mc, Nd and Pc
211 | to their `ID_Cont`. (i.e. TR31 profile 5), but no mixed scripts nor
212 | Excluded scripts, No identifier normalization. Same as DefId as they
213 | discussed early in C11, but abandoned for the insecure AltId
214 | variant. [ES6](https://mathiasbynens.be/notes/javascript-identifiers-es6)
215 | then went more insecure by allowing all ID\_Start and ID\_Cont
216 | characters.
217 |
218 | **Julia** [Allowed Variable
219 | Names](https://docs.julialang.org/en/v1/manual/variables/#man-allowed-variable-names)
220 | went with TR31 and profile 5, but has some special subset. No mixed scripts nor
221 | Excluded scripts, but NFC normalization and some confusable handling.
222 |
223 | **nim** [Identifiers &
224 | Keywords](https://nim-lang.org/docs/manual.html#lexical-analysis-identifiers-amp-keywords)
225 | allows everything as identifier, every unicode character above 127 is
226 | a letter, not-normalized. So they have unidentifiable identifiers, and
227 | should rather use a new name for that. Maybe "name" or "symbol"
228 | instead of identifier.
229 |
230 | **crystal** [Symbol](https://crystal-lang.org/reference/1.2/syntax_and_semantics/literals/symbol.html)
231 | allows binary chunks similarly to nim, everything above 159 is a letter for them.
232 | **Lua** and **luajit** also don't care much about unicode identifiers being identifiable.
233 |
234 | **D** also supports all raw unicode characters as identifiers.
235 |
236 |
237 | **Factor** ditto.
238 |
239 | **Go** is UTF-8 and defines its
240 | [identifiers](https://go.dev/ref/spec#Identifiers) as letters, so
241 | TR31 ID. It does not check for mixed scripts nor Excluded scripts, nor
242 | does normalization. An exported name must begin with Lu (Uppercase Letter).
243 |
244 | **Haskell** identifiers must start with Ll (Lowercase Letter), but no further
245 | restrictions or normalization support AFAIK.
246 |
247 | **php** does not support unicode identifiers nor UTF-8 encodings, but
248 | does accidentally so, by allowing everything in
249 | `[a-zA-Z_\x7f-\xff][a-zA-Z0-9_\x7f-\xff]*`, which leads to the same
250 | insecurities as with nim and crystal, treating everything above 127 as letter.
251 |
252 | **Ada 95** supports raw unicode identifiers, but you need to specify
253 | its encoding. You can bet it's entirely insecure.
254 |
255 | Modern **Forth** as glorified stack assembler allows everything
256 | unidentifiable. Older FORTH was safe though.
257 |
258 | **Rosetta Code** has a general overview of unicode identifiers in all
259 | programming languages, but without its special insecurities and
260 | caveats:
261 |
262 | Generally they were all very fast to add insecure unicode names
263 | (ignoring the official security guidelines), and also provided a slow
264 | SipHash security theatre for their hash tables instead of proper hash
265 | table security.
266 |
267 | ----
268 | Created Reini Urban, 28.Dec 2021
269 |
--------------------------------------------------------------------------------
/doc/n2916.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rurban/libu8ident/905e037c5fd5f9ce0ce0b78274ac2a42ff8374bd/doc/n2916.pdf
--------------------------------------------------------------------------------
/doc/n2932.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rurban/libu8ident/905e037c5fd5f9ce0ce0b78274ac2a42ff8374bd/doc/n2932.pdf
--------------------------------------------------------------------------------
/doc/nXXXX.patch:
--------------------------------------------------------------------------------
1 | --- doc/P2528R1.md 2022-02-15 13:44:14.210682108 +0100
2 | +++ doc/n2932.md 2022-02-15 13:45:14.746174957 +0100
3 | @@ -1,20 +1,19 @@
4 | - C++ Identifier Security using Unicode Standard Annex 39
5 | + n2932 - C Identifier Security using Unicode Standard Annex 39 v2
6 |
7 | - Document #: P2538R1
8 | - Date: 2022-02-12
9 | - Project: Programming Language C++
10 | - Audience: SG-16
11 | - EWG
12 | - CWG
13 | + Date: 2022-02-15
14 | + Project: Programming Language C
15 | + Audience: WG14
16 | + SG-16
17 | Reply-to: Reini Urban
18 |
19 | 1 Abstract
20 | ==========
21 |
22 | -In response to [P1949R7](http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2021/p1949r7.html), and
23 | -in parallel to [n2932](http://www.open-std.org/jtc1/sc22/wg14/www/docs/n2932.htm) for C.
24 | +In response to [P1949R7](http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2021/p1949r7.html),
25 | +replaces [n2916](http://www.open-std.org/jtc1/sc22/wg14/www/docs/n2916.htm),
26 | +in parallel to [P2528R1](http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2022/P2528R1.html) for C++.
27 |
28 | -Adopt Unicode Annex 39 "Unicode Security Mechanisms" as part of C++26.
29 | +Adopt Unicode Annex 39 "Unicode Security Mechanisms" as part of C26.
30 |
31 | * Comply to a variant of [TR39#5.2](https://www.unicode.org/reports/tr39/#Restriction_Level_Detection)
32 | Mixed-Scripts Moderately Restrictive profile, but allow some Greek letters without
33 | @@ -37,7 +36,7 @@
34 | Recommend binutils/linker ABI identifier rules: names are UTF-8,
35 | add identifier checks. E.g. `readelf -L -Ue`.
36 |
37 | -In addition adopt this proposal as a Defect Report against C++23 and
38 | +In addition adopt this proposal as a Defect Report against C11 and
39 | earlier. The author provides the
40 | [libu8ident](https://github.com/rurban/libu8ident/) library (Apache 2
41 | licensed) and its generated tables to all implementors.
42 | @@ -48,10 +47,9 @@
43 | 2 Changes
44 | =========
45 |
46 | -From R0:
47 | +From [n2916](http://www.open-std.org/jtc1/sc22/wg14/www/docs/n2916.htm) 2022-01-22:
48 |
49 | -* Add internal links.
50 | -* Rename C23 to C26, it's too late for C++23.
51 | +* Rename C23 to C26, it's too late for C23, at least for C++23.
52 | * Disallow non-confusable `Technical` U+1C0..U+1C3
53 | * Fix a lot of not Allowed ID_Start ranges. safec26_start_list
54 | from 355 ranges, 115 singles, 99350 codepoints
55 | @@ -83,7 +81,7 @@
56 | as they appeared in browsers and email. Also names in C object files:
57 | linkers, .def files, ffi's.
58 |
59 | -Implementing TR39 mixed script detection per document (C++ Header and
60 | +Implementing TR39 mixed script detection per document (C Header and
61 | Source file) forbids insecure mixes of Greek and Cyrillic, dangerous
62 | Arabic RTL bidi attacks and most confusables. You can still write in
63 | your language, but then only in commonly written languages, and not
64 | @@ -91,13 +89,13 @@
65 |
66 | The question remains if TR39 security violations should be ill-formed
67 | (throw an compilation error or warning), or not. Since we do have the
68 | -`-std=c++26` option, and the issues are security relevant, an error
69 | -the issues are security relevant, an error seems to be best.
70 | -Implementations might choose to go for warnings on not-valid scripts,
71 | -mixed scripts, or invalid sequences of combining marks though, even if
72 | -the Unicode Standard recommended for decades that identifiers should
73 | -stay identifiable. If the standard committee opts for the insecure
74 | -option, they should rather rename identifiers to symbols then.
75 | +`-std=c2x` option, and the issues are security relevant, an error
76 | +seems to be best. Implementations might choose to go for warnings on
77 | +not-valid scripts, mixed scripts, or invalid sequences of combining
78 | +marks though, even if the Unicode Standard recommended for decades
79 | +that identifiers should stay identifiable. If the standard committee
80 | +opts for the insecure option, they should rather rename identifiers to
81 | +symbols then.
82 |
83 | 4 Motivation
84 | ============
85 | @@ -109,11 +107,12 @@
86 | ,
87 |
88 | *
89 | -* with \*-sec\*.c\*
90 | +* with \*-sec\*.c
91 |
92 | These changes would fix all of the known security problems with C++/C
93 | identifiers. With C++ it is more severe as declarations are easily
94 | -confusable with initializations.
95 | +confusable with initializations, but C is still affected in unclear
96 | +situations.
97 |
98 | 5 What will this proposal change
99 | ================================
100 | @@ -196,7 +195,7 @@
101 | 5.3 Documents with identifiers in many multiple scripts/languages will become illegal
102 | -------------------------------------------------------------------------------------
103 |
104 | -C++26 (and C26) will follow the TR39 Security Profile 4 **Moderately
105 | +C26 (and C26++) will follow the TR39 Security Profile 4 **Moderately
106 | Restrictive**, with an exception for Greek.
107 |
108 | * All identifiers in a document qualify as Single Script, or
109 | @@ -213,7 +212,7 @@
110 | 5.4 Mixed-script runs with combining marks will become illegal
111 | --------------------------------------------------------------
112 |
113 | -C++26 will check for unlikely sequences of **combining marks**, and
114 | +C26 will check for unlikely sequences of **combining marks**, and
115 | reject some. Combining Marks have no script property per se, but a
116 | variable list of allowed SCX scripts, which need to be checked against
117 | the base character. Also 4 Japanese KATAKANA-HIRAGANA PROLONGED SOUND
118 | @@ -278,7 +277,7 @@
119 | 8.1 SC
120 | -----
121 |
122 | -C++ only needs to map unicode characters to a script property via a
123 | +C only needs to map unicode characters to a script property via a
124 | single byte. There are currently 161 scripts assigned, 32 of them are
125 | in common use as identifiers, hence called **Recommended** scripts. The
126 | rest is split up into 127-31 **Excluded** scripts, which are not in common
127 | @@ -389,7 +388,7 @@
128 | SOUND MARKs, all other Lm modifiers may mix with all SCX.
129 |
130 | The list of allowed combining mark characters (with Common or Inherited
131 | -scripts) in the C++26 TR31 profile is: Lm `Modifier_Letter`,
132 | +scripts) in the C26 TR31 profile is: Lm `Modifier_Letter`,
133 | Mc `Spacing_Mark`, Mn `Nonspacing_Mark`, Me `Enclosing_Mark`. Sk and Cf are
134 | not part of XIDs.
135 |
136 | @@ -416,7 +415,7 @@
137 | ...
138 | ```
139 |
140 | -From these 67 Lm plus 513 M\[cn\] ranges filtering out the non-C++26 XID
141 | +From these 67 Lm plus 513 M\[cn\] ranges filtering out the non-C26 XID
142 | candidates, only #8 Identifier_Type = Recommended, Inclusion,
143 | non-confusable Technical, plus only #4.2 Recommended Scripts, plus only codepoints
144 | with multiple SCX entries, plus only codepoints which don't decompose
145 | @@ -425,7 +424,7 @@
146 |
147 | So some of the Common `XID_Continue` marks therefore cannot be
148 | detected with the SCX logic. But all of them do not combine with Latin
149 | -and are already filtered by the C++26 Mixed Script profile.
150 | +and are already filtered by the C26 Mixed Script profile.
151 | And all of the Combining Marks are caught by the NFC requirement.
152 |
153 | Most Lm Modifier Letters (besides the 4 Japanese PROLONGED SOUND
154 | @@ -564,11 +563,11 @@
155 | identifiers. One could argue that a mixed-script profile is valid
156 | only for a single identifier, or it is valid for the whole source file
157 | document. And there needs to be a definition if before or after the
158 | -preprocessor, and if to treat names in private structs, classes and
159 | -local names in functions as seperate contexts.
160 | +preprocessor, and if to treat names in private structs and local names
161 | +in functions as seperate contexts.
162 |
163 | If valid for only a single identifier you could arbitralily mix up
164 | -Cyrillic with Greek identifiers in a C++ namespace, and thus these
165 | +Cyrillic with Greek identifiers in a C files, and thus these
166 | identifiers would not be identifiable anymore, as both both can render
167 | to the very same glyphs. Thus we adopt the notion of identifier
168 | contexts.
169 | @@ -605,11 +604,10 @@
170 | - **private**: Another argument would be that all exported names end
171 | up in the object files and library flat, which would support the
172 | seperation of private and public name contexts, where to perform the
173 | - mixed-script checks. Private contexts (e.g. static structs, private
174 | - class fields, local names in functions) should be seperated from the
175 | - rest. This would prevent from confusables in struct/class
176 | - fields/methods, and the rest is seperated by the checks for the
177 | - public names.
178 | + mixed-script checks. Private contexts (e.g. static structs fields or
179 | + local names in functions) should be seperated from the rest. This
180 | + would prevent from confusables in struct fields/methods, and the
181 | + rest is seperated by the checks for the public names.
182 |
183 | - **after-cpp**: The third, strictest variant would define the context in
184 | the file after cpp. You would not be able to include a Cyrillic-only
185 | @@ -668,13 +666,13 @@
186 | there, and names are charset (=user) specific, whilst there are no
187 | header fields for the used charset (e.g. if SHIFT-JIS or UTF-8), nor
188 | are there any rules for name lookup (normalization). This is not
189 | -solvable here (in C nor C++), only there or the gABI. Only in the Rust
190 | +solvable here (in C), only there or the gABI. Only in the Rust
191 | ecosystem there are proper unicode identifier rules, but Rust can link
192 | -against C++/C. I haven't detected any exported unicode names in the
193 | -wild, they are only used in local symbols still. UTF-16 compilers such
194 | -as MSVC do export their UNICODE names either in the local character
195 | -set or as UTF-8. If used wildly, object files would not link anymore,
196 | -as local character sets vary, and there is no character set standard
197 | +against C. I haven't detected any exported unicode names in the wild,
198 | +they are only used in local symbols still. UTF-16 compilers such as
199 | +MSVC do export their UNICODE names either in the local character set
200 | +or as UTF-8. If used wildly, object files would not link anymore, as
201 | +local character sets vary, and there is no character set standard
202 | defined.
203 |
204 | The C++/C working groups should urge the binutils/linker working
205 |
--------------------------------------------------------------------------------
/doc/nXXXX.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rurban/libu8ident/905e037c5fd5f9ce0ce0b78274ac2a42ff8374bd/doc/nXXXX.pdf
--------------------------------------------------------------------------------
/doc/tr31-bugs.md:
--------------------------------------------------------------------------------
1 | TR31 Security Bugs (UCD Versions 1-14)
2 | ======================================
3 |
4 | U+FF00..U+FFEF not as ID
5 | ---------------------------
6 |
7 | Most of the U+FF00..U+FFEF Full and Halfwidth letters have incorrectly
8 | `ID_Start` resp. `ID_Continue` properties. XID ditto.
9 | They should not, because they are confusable with the normal
10 | characters in the base planes. E.g. LATIN A-Z are indistinguishable
11 | from A..Z, LATIN a-z from a..z, likewise for the Katakana ヲ..ッ and
12 | ア..ン, and the Hangul ᅠ..ᄒ, ᅡ..ᅦ, ᅧ..ᅬ, ᅭ..ᅲ ᅳ..ᅵ halfwidth letters.
13 |
14 | This is esp. for TR39 a security risk. TR39 provides Identifier Type
15 | properties to exclude insecure identifiers, but I cannot find any
16 | other type property to set these U+FF21..U+FFDC IDs to, than
17 | `Not_XID`. Thus the `ID_Start`/`ID_Continue` property should be
18 | deleted for all of them. If they are not identifiable, they should not
19 | be marked as such.
20 |
21 | Medial letters in `ID_Start`, not `ID_Continue`
22 | -------------------------------------------------
23 |
24 | DerivedCoreProperties lists all of the Arabic and Thai MEDIAL letters,
25 | which are part of identifiers in `ID_Start`, not in `ID_Continue`. Only
26 | the Combining marks are in `ID_Continue`. Thus all unicode-aware
27 | parsers accept such MEDIAL letters incorrectly in the start
28 | position. They should only be allowed in the `ID_Continue` position,
29 | and parsers should disallow them in the end positions for identifiers.
30 |
31 | All the other medial letters (Myanmar, Canadian Aboriginal, Ahom,
32 | Dives Akuru) are not part of Recommended Scripts, so they do not
33 | affect TR39 security. But since almost nobody but Java, cperl and Rust
34 | honor TR39 it's still affecting most parsers.
35 |
36 | Other medial exceptions are noted in TR31 at 2.4 Specific Character
37 | Adjustments, but the tables DerivedCoreProperties and TR39 Identifier
38 | tables and thus all user parsers are wrong.
39 |
40 |
41 | For the proposed C++26/C26 standard no such medial characters are
42 | included.
43 |
44 | Confusable Technical IdTypes
45 | ----------------------------
46 |
47 | DerivedCoreProperties.txt lists the three ǃ U+1C3 "LATIN LETTER ALVEOLAR CLICK"
48 | ǀ U+1C0 "LATIN LETTER DENTAL CLICK" and ǁ U+1C1 "LATIN LETTER LATERAL CLICK"
49 | as `ID_Start`. But they should not be included in ID at all, as they are
50 | confusable with common operators.
51 |
52 | TR39 IdentifierTypes.txt lists the three ǃ U+1C3 "LATIN LETTER
53 | ALVEOLAR CLICK" ǀ U+1C0 "LATIN LETTER DENTAL CLICK" and ǁ U+1C1 "LATIN
54 | LETTER LATERAL CLICK" as Technical. So for normal IdentifierStatus
55 | Allowed these would be excluded. But C26/C++26 wishes to add Technical
56 | also. Add at least an Exclusion, if not the Non_XID property for XID
57 | stability concerns. But security bugs should trump stability
58 | guarantees.
59 |
60 | See
61 | for an exploit.
62 |
63 | Arabic Presentation Forms-A: U+FB50–U+FDFF and Forms-B: U+FE70–U+FEFF not as ID
64 | -------------------------------------------------------------------------------
65 |
66 | Forms-A contains a list of Arabic presentation forms encoded as
67 | characters primarily for compatibility reasons. Forms-B are for
68 | compatibility with preexisting standards and legacy implementations
69 | that use these forms as characters. Instead of these, letters from the
70 | Arabic block (U+0600..U+06FF) should be used for identifiers. See
71 | and
72 | . The TR39 idtype of these
73 | should be changed to Obsolete.
74 |
75 | ----
76 | Last checked against DerivedCoreProperties-15.0.0d1.txt from 2022-04-26
77 |
--------------------------------------------------------------------------------
/example.c:
--------------------------------------------------------------------------------
1 | /* libu8ident - Check unicode security guidelines for identifiers.
2 | Copyright 2022 Reini Urban
3 | SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
4 |
5 | Contrary to the tests and helpers, provide a test binary to link
6 | against the public methods and lib only.
7 | */
8 | #include
9 | #include
10 | #include
11 | #include
12 | #include
13 | #include "u8ident.h"
14 |
/* Print the one-line program banner to stdout. */
static void version(void) {
  static const char banner[] = "libu8ident example";
  puts(banner);
}
/* Print the banner followed by the command-line help to stdout, then
   terminate the process with `exitcode`. This function does not return. */
static void usage(int exitcode) {
  /* One entry per output line; puts() appends the newline. */
  static const char *const help[] = {
      "Usage: example [--help|--version]",
      "\nSEE ALSO:",
      " u8idlint.1",
      "\nAUTHOR:",
      " Reini Urban ",
  };
  unsigned i;

  version();
  for (i = 0; i < sizeof(help) / sizeof(help[0]); i++)
    puts(help[i]);
  exit(exitcode);
}
25 |
26 | int main(int argc, char **argv) {
27 | unsigned linenr = 0U;
28 | enum u8id_options xid = U8ID_TR31_XID;
29 | enum u8id_norm norm = U8ID_NFC;
30 | enum u8id_profile profile = U8ID_PROFILE_C26_4;
31 | unsigned u8idopts = (unsigned)xid;
32 |
33 | char path[256] = {0};
34 | static char line[1024] = {0};
35 |
36 | if (argc > 1 && !strcmp(argv[1], "--help"))
37 | usage(0);
38 | if (argc > 1 && !strcmp(argv[1], "--version")) {
39 | version();
40 | exit(0);
41 | }
42 | u8ident_init(profile, norm, u8idopts);
43 | if (getenv("U8IDTEST_TEXTS")) {
44 | strncpy(path, getenv("U8IDTEST_TEXTS"), 255);
45 | #ifdef _WIN32
46 | path[255] = '\0'; // only windows does not terminate
47 | #endif
48 | }
49 | if (argc > 1) {
50 | strncpy(path, argv[1], 255);
51 | #ifdef _WIN32
52 | path[255] = '\0'; // only windows does not terminate
53 | #endif
54 | } else {
55 | if (*path)
56 | strncat(path, "/myanmar-1.txt", 255);
57 | else
58 | strncpy(path, "texts/myanmar-1.txt", 255);
59 | }
60 |
61 | FILE *f = fopen(path, "r");
62 | if (!f) {
63 | perror("fopen");
64 | fprintf(stderr, "%s\n", path);
65 | return -1;
66 | }
67 | printf("u8ident_check %s\n", path);
68 |
69 | while (fgets(line, 1023, f)) {
70 | linenr++;
71 | // split on whitespace
72 | char *s = &line[0];
73 | while (*s) {
74 | // we really should split on identifier boundaries, as in a real parser.
75 | // but this is just a simple example. So we get lots of unneeded XID
76 | // errors.
77 | char word[256];
78 | char *norm = NULL;
79 | char *p = strchr(s, ' ');
80 | ptrdiff_t len = p ? (ptrdiff_t)(p - s) : (ptrdiff_t)strlen(s);
81 | if (len && s[len - 1] == '\n')
82 | len--;
83 | if (!len)
84 | break;
85 | int ret = u8ident_check_buf(s, len, NULL);
86 | switch (ret) {
87 | case U8ID_EOK_NORM:
88 | if (len < 256) {
89 | strncpy(word, s, len);
90 | word[len] = '\0';
91 | u8ident_check((uint8_t *)word, &norm);
92 | printf("\"%s\".%u: NFC => \"%s\"\n", word, linenr, norm);
93 | free(norm);
94 | } else {
95 | printf("\"%.*s\".%u: NFC ...\n", (int)len, s, linenr);
96 | }
97 | break;
98 | case U8ID_ERR_XID:
99 | printf("\"%.*s\".%u: Wrong XID U+%X\n", (int)len, s, linenr,
100 | u8ident_failed_char(0));
101 | break;
102 | case U8ID_ERR_COMBINE:
103 | printf("\"%.*s\".%u: Wrong Combining Mark U+%X\n", (int)len, s, linenr,
104 | u8ident_failed_char(0));
105 | break;
106 | case U8ID_ERR_SCRIPT:
107 | case U8ID_ERR_SCRIPTS: {
108 | uint32_t cp = u8ident_failed_char(0);
109 | const char *scr = u8ident_failed_script_name(0);
110 | const char *scripts = u8ident_existing_scripts(0);
111 | printf("\"%.*s\".%u: Wrong SCRIPT %s for U+%X, have %s\n", (int)len, s,
112 | linenr, scr, cp, scripts);
113 | free((char*)scripts);
114 | break;
115 | }
116 | default:
117 | break;
118 | }
119 | if (p)
120 | s = p + 1;
121 | else
122 | break;
123 | }
124 | }
125 | fclose(f);
126 |
127 | u8ident_free();
128 | return 0;
129 | }
130 |
--------------------------------------------------------------------------------
/hangul.h:
--------------------------------------------------------------------------------
1 | /*
2 | * This file is built by cperl regen/regcharclass-safec.pl.
3 | * cp_high means only codepoints > 0xff
4 | */
5 |
6 | #ifndef H_HANGUL
7 | #define H_HANGUL 1
8 |
9 | /*
10 | HANGUL (Korean) has special normalization rules.
11 | Unicode 9.0
12 | */
13 | // clang-format off
14 | #define Hangul_SBase 0xAC00
15 | #define Hangul_SFinal 0xD7A3
16 | #define Hangul_SCount 11172
17 |
18 | #define Hangul_NCount 588
19 |
20 | #define Hangul_LBase 0x1100
21 | #define Hangul_LFinal 0x1112
22 | #define Hangul_LCount 19
23 |
24 | #define Hangul_VBase 0x1161
25 | #define Hangul_VFinal 0x1175
26 | #define Hangul_VCount 21
27 |
28 | #define Hangul_TBase 0x11A7
29 | #define Hangul_TFinal 0x11C2
30 | #define Hangul_TCount 28
31 |
32 | #define Hangul_IsS(u) ((Hangul_SBase <= (u)) && ((u) <= Hangul_SFinal)) // u lies in [SBase, SFinal], both ends inclusive
33 | #define Hangul_IsN(u) (((u) - Hangul_SBase) % Hangul_TCount == 0) // offset from SBase is a multiple of TCount; does no range check itself
34 | #define Hangul_IsLV(u) (Hangul_IsS(u) && Hangul_IsN(u)) // inside the S range and on a TCount boundary
35 | #define Hangul_IsL(u) ((Hangul_LBase <= (u)) && ((u) <= Hangul_LFinal)) // u lies in [LBase, LFinal], both ends inclusive
36 | #define Hangul_IsV(u) ((Hangul_VBase <= (u)) && ((u) <= Hangul_VFinal)) // u lies in [VBase, VFinal], both ends inclusive
37 | #define Hangul_IsT(u) ((Hangul_TBase < (u)) && ((u) <= Hangul_TFinal)) // u lies in (TBase, TFinal]: TBase itself is excluded by the strict <
38 |
39 | /* much better would be the simple Hangul_IsS(cp) check */
40 | /*** perl5/cperl GENERATED CODE for all Hangul codepoints ***/
41 | #define is_HANGUL_cp_high(cp) \
42 | ( ( 0x1100 <= cp && cp <= 0x11FF ) || ( 0x11FF < cp && \
43 | ( 0x302E == cp || ( 0x302E < cp && \
44 | ( 0x302F == cp || ( 0x302F < cp && \
45 | ( ( 0x3131 <= cp && cp <= 0x318E ) || ( 0x318E < cp && \
46 | ( ( 0x3200 <= cp && cp <= 0x321E ) || ( 0x321E < cp && \
47 | ( ( 0x3260 <= cp && cp <= 0x327E ) || ( 0x327E < cp && \
48 | ( ( 0xA960 <= cp && cp <= 0xA97C ) || ( 0xA97C < cp && \
49 | ( ( 0xAC00 <= cp && cp <= 0xD7A3 ) || ( 0xD7A3 < cp && \
50 | ( ( 0xD7B0 <= cp && cp <= 0xD7C6 ) || ( 0xD7C6 < cp && \
51 | ( ( 0xD7CB <= cp && cp <= 0xD7FB ) || ( 0xD7FB < cp && \
52 | ( ( 0xFFA0 <= cp && cp <= 0xFFBE ) || ( 0xFFBE < cp && \
53 | ( ( 0xFFC2 <= cp && cp <= 0xFFC7 ) || ( 0xFFC7 < cp && \
54 | ( ( 0xFFCA <= cp && cp <= 0xFFCF ) || ( 0xFFCF < cp && \
55 | ( ( 0xFFD2 <= cp && cp <= 0xFFD7 ) || ( 0xFFDA <= cp && cp <= 0xFFDC ) \
56 | )))))))))))))))))))))))))) )
57 |
58 | // clang-format on
59 | #endif /* H_HANGUL */
60 |
--------------------------------------------------------------------------------
/htable.c:
--------------------------------------------------------------------------------
1 | /* libu8ident - Check unicode security guidelines for identifiers.
2 | Copyright 2022 Reini Urban
3 | SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
4 |
5 | A simple open-addressing string hash table for the confusables check.
6 | No deletions needed.
7 | */
8 |
#include <assert.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include "htable.h"
15 |
16 | struct htable * new_htab(unsigned cap) {
17 | struct htable *ht = malloc(sizeof(struct htable));
18 | assert(cap % 2 == 0);
19 | ht->keys = calloc(cap, sizeof(char*));
20 | ht->values = calloc(cap, sizeof(char*));
21 | ht->cap = cap;
22 | ht->size = 0;
23 | return ht;
24 | }
25 |
26 | void free_htab(struct htable *htab) {
27 | for (unsigned i=0; i < htab->cap; i++) {
28 | if (htab->keys[i]) {
29 | free(htab->keys[i]);
30 | free(htab->values[i]);
31 | }
32 | }
33 | free(htab->keys);
34 | free(htab->values);
35 | htab->cap = htab->size = 0;
36 | return;
37 | }
38 |
39 | static inline float fill_htab(struct htable *htab) {
40 | return htab->size / htab->cap;
41 | }
42 |
43 | static inline uint32_t hash(const char *key) {
44 | uint32_t h = 2166136261;
45 | uint8_t *p = (uint8_t *)key;
46 | uint8_t c;
47 | while ((c = *p++)) {
48 | h ^= c;
49 | h *= 16777619;
50 | }
51 | return h;
52 | }
53 |
54 | void add_htab(struct htable *htab, const char *key, const char *value) {
55 | assert(htab);
56 | uint32_t h = hash(key) & (htab->cap - 1);
57 | uint32_t oh = h;
58 | assert(key);
59 | if (fill_htab(htab) > 0.75f) {
60 | struct htable *ht2;
61 | resize:
62 | ht2 = new_htab(htab->cap * 2);
63 | //fprintf(stderr, "resize %p[%lu] to %lu\n", htab, htab->size, htab->cap * 2);
64 | for (unsigned i=0; i < htab->cap; i++) {
65 | if (htab->keys[h]) {
66 | add_htab(ht2, htab->keys[h], htab->values[h]);
67 | }
68 | }
69 | free_htab(htab);
70 | memcpy(htab, ht2, sizeof *htab); // copy all over, and start again.
71 | free(ht2);
72 | oh = h = hash(key) & (htab->cap - 1);
73 | }
74 | while (htab->keys[h] && strcmp(htab->keys[h], key)) { // not found
75 | h++;
76 | if (h >= htab->cap) // wraparound
77 | h = 0;
78 | if (h == oh)
79 | goto resize;
80 | }
81 | if (!htab->keys[h]) { // found a free slot
82 | //fprintf(stderr, "add %s key to %p[%lu]\n", key, htab, htab->size+1);
83 | htab->keys[h] = strdup(key);
84 | htab->values[h] = strdup(value);
85 | htab->size++;
86 | } // else already exists, ignore
87 | return;
88 | }
89 |
90 | char * find_htab(struct htable *htab, const char *key) {
91 | assert(htab);
92 | if (!htab->size)
93 | return false;
94 | uint32_t h = hash(key) & (htab->cap - 1);
95 | const uint32_t oh = h;
96 | assert(key);
97 | while (htab->keys[h] && strcmp(htab->keys[h], key)) { // not found
98 | h++;
99 | if (h >= htab->cap) // wraparound
100 | h = 0;
101 | if (h == oh)
102 | return false;
103 | }
104 | //if (htab->keys[h])
105 | // fprintf(stderr, "find %s -> %s in %p[%lu]\n", htab->keys[h], htab->values[h], htab, htab->size);
106 | return htab->keys[h] ? htab->values[h] : NULL;
107 | }
108 |
--------------------------------------------------------------------------------
/htable.h:
--------------------------------------------------------------------------------
1 | /* libu8ident - Check unicode security guidelines for identifiers.
2 | Copyright 2022 Reini Urban
3 | SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
4 |
5 | A simple open-addressing string hash table for the confusables check.
6 | No deletion needed.
7 | */
8 |
9 | #pragma once
10 | #include
11 | #include
12 |
13 | struct htable {
14 | unsigned size; // number of key/value pairs currently stored
15 | unsigned cap; // number of slots allocated in keys[] and values[]
16 | char **keys; // slot array of table-owned key strings; a NULL entry is a free slot
17 | char **values; // slot array of table-owned value strings, indexed in parallel with keys[]
18 | };
19 |
20 | struct htable * new_htab(unsigned cap);
21 | void free_htab(struct htable *htab);
22 | // adds a copy to the htable
23 | void add_htab(struct htable *htab, const char *key, const char *value);
24 | // returns the value stored for key, or NULL if the key is not present
25 | char * find_htab(struct htable *htab, const char *key);
26 |
--------------------------------------------------------------------------------
/include/u8ident.h:
--------------------------------------------------------------------------------
1 | /* libu8ident - Check unicode security guidelines for identifiers.
2 | Copyright 2021,2022 Reini Urban
3 | SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
4 | */
5 |
6 | #include
7 | #include
8 |
9 | #define U8IDENT_VERSION_MAJOR 0
10 | #define U8IDENT_VERSION_MINOR 0
11 | #define U8IDENT_UNICODE_VERSION 15
12 |
13 | enum u8id_norm {
14 | U8ID_NFC = 0, // the default, shorter canonical composed normalization
15 | U8ID_NFD = 1, // the longer, canonical decomposed normalization, as in the
16 | // previous Apple HPFS filesystem
17 | U8ID_NFKC = 2, // the compatibility composed normalization, as in Python 3
18 | U8ID_NFKD = 3, // the longer compatibility decomposed normalization
19 | U8ID_FCD = 4, // the faster variants
20 | U8ID_FCC = 5
21 | };
22 | enum u8id_profile {
23 | U8ID_PROFILE_1 = 1, // ASCII only
24 | U8ID_PROFILE_2 = 2, // Single Script only
25 | U8ID_PROFILE_3 = 3, // Highly Restrictive
26 | U8ID_PROFILE_4 = 4, // Moderately Restrictive
27 | U8ID_PROFILE_5 = 5, // Minimally Restrictive
28 | U8ID_PROFILE_6 = 6, // Unrestricted
29 | U8ID_PROFILE_C11_6 = 7, // The C11 std
30 | // PROFILE_4 + Greek with only Allowed ID's ("SAFEC26")
31 | U8ID_PROFILE_C26_4 = 8,
32 | };
33 | enum u8id_options {
34 | // clang-format off
35 | // Note: The parser/tokenizer should do that. Without, the checker can be
36 | // faster.
37 | // Can be disallowed with --u8id-tr31=NONE, hardcoded with --u8id-tr31=.
38 | U8ID_TR31_XID = 64, // ID without NFKC quirks, labelled stable, the default
39 | U8ID_TR31_ID = 65, // all letters, plus numbers, punctuation and marks. With
40 | // exotic scripts.
41 | U8ID_TR31_ALLOWED = 66, // TR39 ID with only recommended scripts. Allowed IdentifierStatus.
42 | U8ID_TR31_SAFEC26 = 67, // practical XID with TR39 security measures. see doc/D2528R1.md
43 | U8ID_TR31_C23 = 68, // XID with NFC from the upcoming C23 standard (P1949, N2828).
44 | U8ID_TR31_C11 = 69, // the stable insecure AltId ranges from the C11 standard, Annex D
45 | U8ID_TR31_ALLUTF8 = 70, // allow all > 128, e.g. D, php, nim, crystal
46 | U8ID_TR31_ASCII = 71, // only ASCII letters (as e.g. zig, j. older compilers)
47 | // room for more tr31 profiles
48 |
49 | U8ID_FOLDCASE = 128,
50 | U8ID_WARN_CONFUSABLE = 256, // requires -DHAVE_CONFUS
51 | U8ID_ERROR_CONFUSABLE = 512, // requires -DHAVE_CONFUS
52 | // clang-format on
53 | };
54 | #define U8ID_TR31_MASK 127
55 | typedef unsigned u8id_ctx_t;
56 |
57 | #ifndef U8ID_NORM_DEFAULT
58 | # define U8ID_NORM_DEFAULT U8ID_NFC
59 | #endif
60 | #ifndef U8ID_PROFILE_DEFAULT
61 | # define U8ID_PROFILE_DEFAULT U8ID_PROFILE_4
62 | #endif
63 |
64 | #ifndef _U8ID_PRIVATE_H
65 | // from outside the dll
66 | # if defined _WIN32 || defined __CYGWIN__
67 | # define EXTERN __declspec(dllimport)
68 | # else
69 | # define EXTERN extern
70 | # endif
71 | //# define LOCAL
72 | #else
73 | // inside the dll
74 | # if defined _WIN32 || defined __CYGWIN__
75 | # define EXTERN __declspec(dllexport)
76 | # define LOCAL
77 | # elif __GNUC__ >= 4
78 | # define EXTERN __attribute__((visibility("default")))
79 | # define LOCAL __attribute__((visibility("hidden")))
80 | # else
81 | # define EXTERN
82 | # define LOCAL
83 | # endif
84 | #endif
85 |
86 | /* Initialize the library with a tr39 profile, normalization and bitmask of
87 | options, which define more performed checks. Recommended is
88 | `(U8ID_PROFILE_DEFAULT, U8ID_NORM_DEFAULT, 0)`. return -1 on error, 0 if
89 | options are valid.
90 | */
91 | EXTERN int u8ident_init(enum u8id_profile, enum u8id_norm, unsigned options);
92 |
93 | /* maxlength of an identifier. Default: 1024. Beware that such long
94 | identifiers are not really identifiable anymore; keep them under 80 or
95 | even less. Some filesystems do allow now 32K identifiers, which is a glaring
96 | security hole, waiting to be exploited */
97 | EXTERN void u8ident_set_maxlength(unsigned maxlen);
98 |
99 | /* Generates a new identifier document/context/directory, which
100 | initializes a new list of seen scripts. Contexts are optional, by
101 | default all checks are done in the same context 0. With compilers
102 | and interpreters a context is a source file, with filesystems a directory,
103 | with usernames you may choose if you need to support different languages at
104 | once.
105 | I cannot think of any such usage, so better avoid contexts with usernames to
106 | avoid mixups. */
107 | EXTERN u8id_ctx_t u8ident_new_ctx(void);
108 |
109 | /* Changes to the context previously generated with `u8ident_new_ctx`. */
110 | EXTERN int u8ident_set_ctx(u8id_ctx_t ctx);
111 |
112 | /* Optionally adds a script to the context, if it's known or declared
113 | beforehand. Such as `use utf8 "Greek";` in cperl.
114 |
115 | All http://www.unicode.org/reports/tr31/#Table_Recommended_Scripts
116 | need not be declared beforehand.
117 |
118 | Common Inherited Arabic Armenian Bengali Bopomofo Cyrillic
119 | Devanagari Ethiopic Georgian Greek Gujarati Gurmukhi Hangul Han Hebrew
120 | Hiragana Katakana Kannada Khmer Lao Latin Malayalam Myanmar Oriya
121 | Sinhala Tamil Telugu Thaana Thai Tibetan
122 |
123 | All http://www.unicode.org/reports/tr31/#Table_Limited_Use_Scripts need
124 | to be or are disallowed by profile:
125 |
126 | Adlam Balinese Bamum Batak Canadian_Aboriginal Chakma Cham Cherokee
127 | Hanifi_Rohingya Javanese Kayah_Li Lepcha Limbu Lisu Mandaic
128 | Meetei_Mayek Miao New_Tai_Lue Newa Nko Nyiakeng_Puachue_Hmong Ol_Chiki
129 | Osage Saurashtra Sundanese Syloti_Nagri Syriac Tai_Le Tai_Tham
130 | Tai_Viet Tifinagh Vai Wancho Yi Unknown
131 |
132 | All others need to be added with u8ident_add_script_name().
133 | */
134 | EXTERN int u8ident_add_script_name(const char *name);
135 | EXTERN int u8ident_add_script(uint8_t script);
136 |
137 | EXTERN uint8_t u8ident_get_script(const uint32_t cp);
138 | EXTERN const char *u8ident_script_name(const int scr);
139 |
140 | /* Deletes the context generated with `u8ident_new_ctx`. This is
141 | optional, all remaining contexts are deleted by `u8ident_free` */
142 | EXTERN int u8ident_free_ctx(u8id_ctx_t ctx);
143 |
144 | /* End this library, cleaning up all internal structures. */
145 | EXTERN void u8ident_free(void);
146 |
147 | /* Returns a freshly allocated normalized string, in the option defined at
148 | `u8ident_init`. Defaults to U8ID_NFC. */
149 | EXTERN char *u8ident_normalize(const char *buf, int len);
150 |
151 | /*
152 | Lookup if the codepoint is a confusable. Only with --enable-confus
153 | -DHAVE_CONFUS. With --with-croaring -DHAVE_CROARING this is
154 | twice as fast, and needs half the size.
155 | */
156 | EXTERN bool u8ident_is_confusable(const uint32_t cp);
157 |
158 | enum u8id_errors {
159 | U8ID_EOK = 0,
160 | U8ID_EOK_NORM = 1,
161 | U8ID_EOK_WARN_CONFUS = 2,
162 | U8ID_EOK_NORM_WARN_CONFUS = 3,
163 | U8ID_ERR_XID = -1,
164 | U8ID_ERR_SCRIPT = -2,
165 | U8ID_ERR_SCRIPTS = -3,
166 | U8ID_ERR_ENCODING = -4,
167 | U8ID_ERR_COMBINE = -5,
168 | U8ID_ERR_CONFUS = -6,
169 | };
170 |
171 | /* Two variants to check if this identifier is valid. u8ident_check_buf avoids
172 | allocating a fresh string from the parsed input. buf need not be
173 | zero-terminated.
174 |
175 | Return values (enum u8id_errors):
176 | * 0 - valid without need to normalize.
177 | * 1 - valid with need to normalize.
178 | * 2 - warn about confusable
179 | * 3 - warn about confusable and need to normalize
180 | * -1 - invalid xid, disallowed via IdentifierStatus.txt
181 | * -2 - invalid script
182 | * -3 - invalid mixed scripts
183 | * -4 - invalid encoding
184 | * -5 - invalid combining mark; -6 - invalid because confusable (not yet implemented)
185 | outnorm is set to a fresh normalized string if valid.
186 |
187 | Note that in the check we explicitly allow the Latin confusables: 0 1 I `
188 | i.e. U+30, U+31, U+49, U+60
189 | */
190 | EXTERN enum u8id_errors u8ident_check(const uint8_t *string, char **outnorm);
191 | EXTERN enum u8id_errors u8ident_check_buf(const char *buf, const int len,
192 | char **outnorm);
193 |
194 | /* A different, but much less reliable check strategy via
195 | confusables.txt only, described in TR 39, Section 4, the skeleton
196 | algorithm. Each identifier is stored in two dynamic hash tables,
197 | and for each confusable match, normalized to NFC, the first
198 | wins. Only with `--enable-confus / -DHAVE_CONFUS`.
199 | */
200 | EXTERN enum u8id_errors u8ident_check_confusables(const char *buf, const int len);
201 |
202 | /* returns the failing codepoint, which failed in the last check. */
203 | EXTERN uint32_t u8ident_failed_char(const u8id_ctx_t ctx);
204 | /* returns the constant script name, which failed in the last check. */
205 | EXTERN const char *u8ident_failed_script_name(const u8id_ctx_t ctx);
206 |
207 | /* Returns a fresh string of the list of the seen scripts in this
208 | context whenever a mixed script error occurs. Needed for the error message
209 | "Invalid script %s, already have %s", where the 2nd %s is returned by this
210 | function. The returned string needs to be freed by the user.
211 |
212 | Usage:
213 |
214 | if (u8ident_check((const uint8_t *)"wrongᴧᴫ", &norm) == U8ID_ERR_SCRIPTS) {
215 | const char *scripts = u8ident_existing_scripts(ctx);
216 | fprintf(stdout, "Invalid script %s for U+%X, already have %s.\n",
217 | u8ident_failed_script_name(ctx), u8ident_failed_char(ctx),
218 | scripts);
219 | free((char *)scripts);
220 | }
221 | */
222 | EXTERN const char *u8ident_existing_scripts(const u8id_ctx_t ctx);
223 |
--------------------------------------------------------------------------------
/libu8ident.pc.in:
--------------------------------------------------------------------------------
1 | # @configure_input@
2 | prefix=@prefix@
3 | exec_prefix=@exec_prefix@
4 | libdir=@libdir@
5 | includedir=@includedir@
6 |
7 | Name: @PACKAGE@
8 | Description: check unicode security guidelines for identifiers
9 | Version: @VERSION@
10 | URL: @PACKAGE_URL@
11 | Libs: -L${libdir} -lu8ident
12 | Cflags: -I${includedir}
13 |
--------------------------------------------------------------------------------
/m4/ax_gcc_builtin.m4:
--------------------------------------------------------------------------------
1 | # ===========================================================================
2 | # https://www.gnu.org/software/autoconf-archive/ax_gcc_builtin.html
3 | # ===========================================================================
4 | #
5 | # SYNOPSIS
6 | #
7 | # AX_GCC_BUILTIN(BUILTIN)
8 | #
9 | # DESCRIPTION
10 | #
11 | # This macro checks if the compiler supports one of GCC's built-in
12 | # functions; many other compilers also provide those same built-ins.
13 | #
14 | # The BUILTIN parameter is the name of the built-in function.
15 | #
16 | # If BUILTIN is supported define HAVE_<BUILTIN>. Keep in mind that since
17 | # builtins usually start with two underscores they will be copied over
18 | # into the HAVE_ definition (e.g. HAVE___BUILTIN_EXPECT for
19 | # __builtin_expect()).
20 | #
21 | # The macro caches its result in the ax_cv_have_<BUILTIN> variable (e.g.
22 | # ax_cv_have___builtin_expect).
23 | #
24 | # The macro currently supports the following built-in functions:
25 | #
26 | # __builtin_assume_aligned
27 | # __builtin_bswap16
28 | # __builtin_bswap32
29 | # __builtin_bswap64
30 | # __builtin_choose_expr
31 | # __builtin___clear_cache
32 | # __builtin_clrsb
33 | # __builtin_clrsbl
34 | # __builtin_clrsbll
35 | # __builtin_clz
36 | # __builtin_clzl
37 | # __builtin_clzll
38 | # __builtin_complex
39 | # __builtin_constant_p
40 | # __builtin_ctz
41 | # __builtin_ctzl
42 | # __builtin_ctzll
43 | # __builtin_expect
44 | # __builtin_ffs
45 | # __builtin_ffsl
46 | # __builtin_ffsll
47 | # __builtin_fpclassify
48 | # __builtin_huge_val
49 | # __builtin_huge_valf
50 | # __builtin_huge_vall
51 | # __builtin_inf
52 | # __builtin_infd128
53 | # __builtin_infd32
54 | # __builtin_infd64
55 | # __builtin_inff
56 | # __builtin_infl
57 | # __builtin_isinf_sign
58 | # __builtin_nan
59 | # __builtin_nand128
60 | # __builtin_nand32
61 | # __builtin_nand64
62 | # __builtin_nanf
63 | # __builtin_nanl
64 | # __builtin_nans
65 | # __builtin_nansf
66 | # __builtin_nansl
67 | # __builtin_object_size
68 | # __builtin_parity
69 | # __builtin_parityl
70 | # __builtin_parityll
71 | # __builtin_popcount
72 | # __builtin_popcountl
73 | # __builtin_popcountll
74 | # __builtin_powi
75 | # __builtin_powif
76 | # __builtin_powil
77 | # __builtin_prefetch
78 | # __builtin_trap
79 | # __builtin_types_compatible_p
80 | # __builtin_unreachable
81 | #
82 | # Unsupported built-ins will be tested with an empty parameter set and the
83 | # result of the check might be wrong or meaningless so use with care.
84 | #
85 | # LICENSE
86 | #
87 | # Copyright (c) 2013 Gabriele Svelto
88 | # Copyright (c) 2018 Reini Urban
89 | #
90 | # Copying and distribution of this file, with or without modification, are
91 | # permitted in any medium without royalty provided the copyright notice
92 | # and this notice are preserved. This file is offered as-is, without any
93 | # warranty.
94 |
95 | #serial 7
96 |
97 | AC_DEFUN([AX_GCC_BUILTIN], [
98 | AS_VAR_PUSHDEF([ac_var], [ax_cv_have_$1])
99 |
100 | AC_CACHE_CHECK([for $1], [ac_var], [
101 | AC_LINK_IFELSE([AC_LANG_PROGRAM([], [
102 | m4_case([$1],
103 | [__builtin___mbsnrtowcs_chk], [$1("", "", 0, 0, "", 0)],
104 | [__builtin___mbsrtowcs_chk], [$1("", "", 0, "", 0)],
105 | [__builtin___mbstowcs_chk], [$1("", "", 0, 0)],
106 | [__builtin___memcpy_chk], [$1("", "", 0, 0)],
107 | [__builtin___memmove_chk], [$1("", "", 0, 0)],
108 | [__builtin___mempcpy_chk], [$1("", "", 0, 0)],
109 | [__builtin___memset_chk], [$1("", 0, 0, 0)],
110 | [__builtin___strcpy_chk], [$1("", "", 0)],
111 | [__builtin___stpcpy_chk], [$1("", "", 0)],
112 | [__builtin___stpncpy_chk], [$1("", "", 0, 0)],
113 | [__builtin___strncpy_chk], [$1("", "", 0, 0)],
114 | [__builtin___strcat_chk], [$1("", "", 0)],
115 | [__builtin___strncat_chk], [$1("", "", 0, 0)],
116 | [__builtin___printf_chk], [$1(0, "")],
117 | [__builtin___read_chk], [$1(0, "", 0, 0)],
118 | [__builtin___sprintf_chk], [$1("", 0, 0, "")],
119 | [__builtin___snprintf_chk], [$1("", 0, 0, 0, "")],
120 | [__builtin___swprintf_chk], [$1("", 0, 0, 0, "")],
121 | [__builtin___vfprintf_chk], [$1("", 0, "", 0)],
122 | [__builtin___vfwprintf_chk], [$1("", 0, "", 0)],
123 | [__builtin___vprintf_chk], [$1(0, "", 0)],
124 | [__builtin___vsnprintf_chk], [$1("", 0, 0, 0, "", 0)],
125 | [__builtin___vsprintf_chk], [$1("", 0, 0, "", 0)],
126 | [__builtin___vswprintf_chk], [$1("", 0, 0, 0, "", 0)],
127 | [__builtin___vwprintf_chk], [$1(0, "", 0)],
128 | [__builtin___wcrtomb_chk], [$1("", "", "", 0)],
129 | [__builtin___wcscat_chk], [$1("", "", 0)],
130 | [__builtin___wcscpy_chk], [$1("", "", 0)],
131 | [__builtin___wcsncat_chk], [$1("", "", 0, 0)],
132 | [__builtin___wcsncpy_chk], [$1("", "", 0, 0)],
133 | [__builtin___wcsnrtombs_chk], [$1("", "", 0, 0, "", 0)],
134 | [__builtin___wcsrtombs_chk], [$1("", "", 0, "", 0)],
135 | [__builtin___wcstombs_chk], [$1("", "", 0, 0)],
136 | [__builtin___wctomb_chk], [$1("", 0, 0)],
137 | [__builtin___wmemcpy_chk], [$1("", "", 0, 0)],
138 | [__builtin___wmemmove_chk], [$1("", "", 0, 0)],
139 | [__builtin___wmempcpy_chk], [$1("", "", 0, 0)],
140 | [__builtin___wmemset_chk], [$1("", "", 0, 0)],
141 | [__builtin___wprintf_chk], [$1(0, "")],
142 | [__builtin___bnd_set_ptr_bounds], [$1("", 0)],
143 | [__builtin___bnd_narrow_ptr_bounds], [$1("", "", 0)],
144 | [__builtin___bnd_copy_ptr_bounds], [$1("", "")],
145 | [__builtin___bnd_init_ptr_bounds], [$1("")],
146 | [__builtin___bnd_null_ptr_bounds], [$1("")],
147 | [__builtin___bnd_store_ptr_bounds], [$1("", "")],
148 | [__builtin___bnd_chk_ptr_lbounds], [$1("")],
149 | [__builtin___bnd_chk_ptr_ubounds], [$1("")],
150 | [__builtin___bnd_chk_ptr_bounds], [$1("", 0)],
151 | [__builtin___bnd_get_ptr_lbound], [$1("")],
152 | [__builtin___bnd_get_ptr_ubound], [$1("")],
153 | [__builtin_assume_aligned], [$1("", 0)],
154 | [__builtin_bswap16], [$1(0)],
155 | [__builtin_bswap32], [$1(0)],
156 | [__builtin_bswap64], [$1(0)],
157 | [__builtin_choose_expr], [if($1(1, 1, 0)) 1],
158 | [__builtin___clear_cache], [$1("", "")],
159 | [__builtin_clrsb], [$1(0)],
160 | [__builtin_clrsbl], [$1(0)],
161 | [__builtin_clrsbll], [$1(0)],
162 | [__builtin_clz], [$1(0)],
163 | [__builtin_clzl], [$1(0)],
164 | [__builtin_clzll], [$1(0)],
165 | [__builtin_complex], [$1(0.0, 0.0)],
166 | [__builtin_constant_p], [$1(0)],
167 | [__builtin_ctz], [$1(0)],
168 | [__builtin_ctzl], [$1(0)],
169 | [__builtin_ctzll], [$1(0)],
170 | [__builtin_expect], [$1(0, 0)],
171 | [__builtin_ffs], [$1(0)],
172 | [__builtin_ffsl], [$1(0)],
173 | [__builtin_ffsll], [$1(0)],
174 | [__builtin_fpclassify], [$1(0, 1, 2, 3, 4, 0.0)],
175 | [__builtin_huge_val], [$1()],
176 | [__builtin_huge_valf], [$1()],
177 | [__builtin_huge_vall], [$1()],
178 | [__builtin_inf], [$1()],
179 | [__builtin_infd128], [$1()],
180 | [__builtin_infd32], [$1()],
181 | [__builtin_infd64], [$1()],
182 | [__builtin_inff], [$1()],
183 | [__builtin_infl], [$1()],
184 | [__builtin_isinf_sign], [$1(0.0)],
185 | [__builtin_nan], [$1("")],
186 | [__builtin_nand128], [$1("")],
187 | [__builtin_nand32], [$1("")],
188 | [__builtin_nand64], [$1("")],
189 | [__builtin_nanf], [$1("")],
190 | [__builtin_nanl], [$1("")],
191 | [__builtin_nans], [$1("")],
192 | [__builtin_nansf], [$1("")],
193 | [__builtin_nansl], [$1("")],
194 | [__builtin_object_size], [$1("", 0)],
195 | [__builtin_parity], [$1(0)],
196 | [__builtin_parityl], [$1(0)],
197 | [__builtin_parityll], [$1(0)],
198 | [__builtin_popcount], [$1(0)],
199 | [__builtin_popcountl], [$1(0)],
200 | [__builtin_popcountll], [$1(0)],
201 | [__builtin_powi], [$1(0, 0)],
202 | [__builtin_powif], [$1(0, 0)],
203 | [__builtin_powil], [$1(0, 0)],
204 | [__builtin_prefetch], [$1("")],
205 | [__builtin_trap], [$1()],
206 | [__builtin_types_compatible_p], [$1(int, int)],
207 | [__builtin_unreachable], [$1()],
208 | [m4_warn([syntax], [Unsupported built-in $1, the test may fail])
209 | $1()]
210 | )
211 | ])],
212 | [AS_VAR_SET([ac_var], [yes])],
213 | [AS_VAR_SET([ac_var], [no])])
214 | ])
215 |
216 | AS_IF([test yes = AS_VAR_GET([ac_var])],
217 | [AC_DEFINE_UNQUOTED(AS_TR_CPP(HAVE_$1), 1,
218 | [Define to 1 if the system has the `$1' built-in function])], [])
219 |
220 | AS_VAR_POPDEF([ac_var])
221 | ])
222 |
--------------------------------------------------------------------------------
/makefile.gnu:
--------------------------------------------------------------------------------
1 | # -*- Makefile -*-
2 | #DEFS := -DU8ID_NORM=NFKC -DU8ID_PROFILE=4 -DDISABLE_CHECK_XID
3 | DEFINES = ${DEFS} -DHAVE___BUILTIN_FFS
4 | HAVE_CONFUS := 1
5 | CC := cc
6 | CFLAGS := -Wall -Wextra
7 | AR := ar
8 | RANLIB := ranlib
9 | GPERF := gperf
10 | # dnf install rubygem-ronn-ng
11 | RONN := ronn
12 | # Maintainer only
13 | VERSION := $(shell build-aux/git-version-gen .version)
14 | SO_MAJ = 1
15 | DEFINES += -DPACKAGE_VERSION="\"$(VERSION)\""
16 | # This should be a recent perl, matching the target unicode version
17 | PERL := perl
18 | WGET := wget
19 | PANDOC := pandoc
20 | # End Maintainer-only
21 |
22 | HEADER = include/u8ident.h
23 | NORMHDRS = un8ifcan.h un8ifcmb.h un8ifcmp.h un8ifcpt.h un8ifexc.h
24 | HDRS = u8id_private.h u8id_gc.h scripts.h $(NORMHDRS) hangul.h \
25 | mark.h medial.h unic11.h scripts16.h htable.h
26 | SRC = u8ident.c u8idscr.c u8idnorm.c
27 | ifeq (${HAVE_CONFUS}, 1)
28 | SRC += u8idroar.c htable.c
29 | HDRS += u8idroar.h confus.h gconfus.h
30 | DEFINES += -DHAVE_CONFUS
31 | endif
32 | ifneq (,$(wildcard /usr/include/sys/stat.h))
33 | DEFINES += -DHAVE_SYS_STAT_H
34 | endif
35 | ifneq (,$(wildcard /usr/include/dirent.h))
36 | DEFINES += -DHAVE_DIRENT_H
37 | endif
38 | ifneq (,$(wildcard /usr/include/getopt.h))
39 | DEFINES += -DHAVE_GETOPT_H
40 | endif
41 | ifneq (,$(wildcard /usr/include/uniwbrk.h))
42 | DEFINES += -DHAVE_UNIWBRK_H -DHAVE_LIBUNISTRING
43 | LIBUNISTR = -lunistring
44 | else
45 | LIBUNISTR =
46 | endif
47 | ifneq (,$(wildcard roaring.c))
48 | DEFINES += -DHAVE_CROARING
49 | HDRS += confus_croar.h roaring.h
50 | else
51 | ifneq (,$(wildcard ../CRoaring/roaring.c))
52 | DEFINES += -DHAVE_CROARING
53 | CFLAGS += -I../CRoaring
54 | HDRS += confus_croar.h roaring.h
55 | endif
56 | endif
57 | ALLHDRS = $(HDRS) unic26.h
58 | #OBJS = u8ident.o u8idscr.o u8idnorm.o u8idroar.o
59 | OBJS = $(SRC:.c=.o)
60 | LIB = libu8ident.a
61 | SOLIB = libu8ident.so
62 | PCXX = P2528R0
63 | NC = n2916
64 | DCURCXX = D2528R1
65 | NCURC = n2932
66 | DOCS = README.md NEWS NOTICE LICENSE doc/c11.md doc/$(PCXX).html doc/$(DCURCXX).html \
67 | doc/$(PCXX).md doc/$(DCURCXX).md doc/$(NC).html doc/$(NC).md doc/$(NCURC).patch \
68 | doc/$(NCURC).html doc/$(NCURC).md \
69 | doc/tr31-bugs.md
70 | MAN3 = u8ident.3
71 | MAN1 = u8idlint.1
72 | MAN = $(MAN1) $(MAN3)
73 | PREFIX = usr
74 | PKG = libu8ident-$(VERSION)
75 | PKG_BIN = $(PKG)-`uname -m`
76 |
77 | CFLAGS_REL = $(CFLAGS) -O2 -DNDEBUG
78 | CFLAGS_PERF = $(CFLAGS) -O2 -DNDEBUG
79 | CFLAGS_DBG = $(CFLAGS) -g -DDEBUG
80 | LTOFLAGS =
81 |
82 | MACHINE := $(shell uname -m)
83 | ifeq (x86_64,$(MACHINE))
84 | DEFINES += -DHAVE_SYS_STAT_H
85 | LTOFLAGS = -flto
86 | CFLAGS_REL += -march=native
87 | CFLAGS_PERF += -march=native
88 | endif
89 |
90 | RUN_LD_PATH=LD_LIBRARY_PATH=.
91 | OS := $(shell uname -s)
92 | ifeq (Darwin,$(OS))
93 | RUN_LD_PATH=DYLD_LIBRARY_PATH=.
94 | else
95 | ifeq (AIX,$(OS))
96 | RUN_LD_PATH=LIBPATH=.
97 | else
98 | ifeq (HP-UX,$(OS))
99 | RUN_LD_PATH=SHLIB_PATH=.
100 | endif
101 | endif
102 | endif
103 |
104 | # cc prints name as cc, not gcc. so check for the copyright banner
105 | CC_COPY := $(shell $(CC) --version | head -n2 | tail -n1 | cut -c1-7)
106 | # gcc has "Copyright (C) 2020 Free Software Foundation, Inc"
107 | # clang would have "Target: ..."
108 | # icc would have "Copyright (c) (C) 1985-2014 Intel Corporation"
109 | # pcc has "Portable C Compiler 1.2.0.DEVEL 20200630 for x86_64-pc-linux-gnu" on the 1st line
110 | # chibicc has no --version, just --help
111 | ifeq (Copyrig,$(CC_COPY))
112 | IS_GCC = 1
113 | CFLAGS_REL += -Werror -Wno-return-local-addr
114 | CFLAGS_PERF += -Wno-return-local-addr
115 | CFLAGS_DBG += -Wno-return-local-addr
116 | else
117 | ifeq (Target:,$(CC_COPY))
118 | IS_CLANG = 1
119 | CFLAGS_REL += -Werror
120 | else
121 | ifeq (Portable C Compiler,$(shell $(CC) --version | cut -c1-19))
122 | IS_PCC = 1
123 | CFLAGS_REL += -Werror
124 | endif
125 | endif
126 | endif
127 |
128 | most: $(LIB) $(SOLIB) u8idlint doc/$(DCURCXX).html
129 |
130 | all: mkc26 most test-texts test perf docs
131 |
132 | .version: makefile.gnu
133 | build-aux/git-version-gen .version
134 |
135 | .c.o:
136 | $(CC) $(CFLAGS_REL) $(LTOFLAGS) $(DEFINES) -Iinclude -c $< -o $@
137 | .c.i:
138 | $(CC) $(CFLAGS) $(DEFINES) -Iinclude -E -c $< -o $@
139 | u8idnorm.o: u8idnorm.c u8id_private.h hangul.h $(NORMHDRS) $(HEADER)
140 | $(CC) $(CFLAGS_REL) $(LTOFLAGS) $(DEFINES) -Iinclude -c u8idnorm.c -o $@
141 | u8idroar.o: u8idroar.c u8id_private.h $(HEADER) confus_croar.h
142 | $(CC) $(CFLAGS_REL) $(LTOFLAGS) $(DEFINES) -Iinclude -c u8idroar.c -o $@
143 |
144 | $(LIB): $(SRC) $(HEADER) $(ALLHDRS) $(OBJS)
145 | $(AR) $(ARFLAGS) $@ $(OBJS)
146 | $(RANLIB) $@
147 |
148 | $(SOLIB): $(SRC) $(HEADER) $(ALLHDRS)
149 | $(CC) $(CFLAGS_REL) $(LTOFLAGS) -shared -fPIC $(DEFINES) -Iinclude \
150 | -Wl,-soname,$(SOLIB).$(SO_MAJ) -o $@.$(SO_MAJ) $(SRC)
151 | -rm -f $(SOLIB)
152 | ln -s $(SOLIB).$(SO_MAJ) $(SOLIB)
153 |
154 | scripts.h scripts16.h: mkscripts.pl # Scripts.txt ScriptExtensions.txt DerivedNormalizationProps.txt
155 | $(PERL) mkscripts.pl
156 | gconfus.h.in: mkconfus.pl # confusables.txt
157 | $(PERL) mkconfus.pl -c
158 | confus.h: mkconfus.pl mkroar.c # confusables.txt
159 | $(PERL) mkconfus.pl -c
# Build the gperf lookup table, then patch the generated source with sed and
# write the result to gconfus.h. The inserted "(void)len;" presumably silences
# an unused-parameter warning in the generated hash function -- TODO confirm.
# sed must read the temporary file and write the real target; without the
# redirections the patched text only goes to stdout and gconfus.h is never
# produced.
gconfus.h: gconfus.h.in
	$(GPERF) -n gconfus.h.in > gconfus.h.tmp && \
	  sed -e's,static const unsigned int asso_values,(void)len; static const unsigned int asso_values,' \
	    <gconfus.h.tmp >gconfus.h && \
	  rm gconfus.h.tmp
162 | confus_croar.h: mkroar.c mkconfus.pl
163 | $(PERL) mkconfus.pl -c
164 | mark.h: mkmark.pl # UnicodeData.txt
165 | $(PERL) mkmark.pl
166 | u8id_gc.h: mkgc.pl # UnicodeData.txt
167 | $(PERL) mkgc.pl
168 | medial.h: mkmedial.pl # UnicodeData.txt
169 | $(PERL) mkmedial.pl
170 | allowed_croar.h nfkc_croar.h nfc_croar.h nfkd_croar.h nfd_croar.h: mkroar.c mkconfus.pl FORCE
171 | $(PERL) mkconfus.pl
172 |
173 | u8idlint: u8idlint.c unic26.h unic11.h $(LIB)
174 | $(CC) $(CFLAGS_REL) -fpie $(DEFINES) -I. -Iinclude u8idlint.c -o $@ $(LIB) $(LIBUNISTR)
175 |
176 | .PHONY: check check-all check-extra check-asan check-norms check-profiles check-tr31 check-mdl \
177 | check-all-combinations clean regen-scripts regen-norm regen-confus regen-u8idlint-test \
178 | install man dist-src dist clang-format docs
179 |
# Build and run the test programs, compare the text-corpus output against the
# stored expectations, then run the u8idlint test script and the example
# linked against the shared library. The same steps apply whichever optional
# features are enabled in DEFINES, so no conditional is needed.
check: test test-texts u8idlint example
	./test
	./test-texts > texts.tst
	diff texts.tst texts/result.lst && rm texts.tst
	./u8idlint.test
	$(RUN_LD_PATH) ./example
204 |
205 | check-all: check check-norms check-profiles check-tr31 check-asan
206 | check-extra: check-all check-all-combinations check-mdl
207 | shellcheck *.test test-all-fast.sh test-all.sh
208 |
209 | test: test.c $(SRC) $(HEADER) $(ALLHDRS)
210 | $(CC) $(CFLAGS_DBG) $(DEFINES) -I. -Iinclude test.c $(SRC) -o test
211 | test-texts: test-texts.c $(SRC) $(HEADER) $(ALLHDRS)
212 | $(CC) $(CFLAGS_DBG) -O1 $(DEFINES) -I. -Iinclude test-texts.c $(SRC) -o test-texts $(LIBUNISTR)
213 | example: example.c $(SOLIB)
214 | $(CC) $(CFLAGS_DBG) $(DEFINES) -Iinclude example.c -o $@ -L. -lu8ident
215 | regen-u8idlint-test: u8idlint
216 | -./u8idlint -xsafec26 texts/homo-sec-1.c >texts/homo-sec-1.tst
217 | -./u8idlint -p1 -xc11 texts/homo-sec-1.c >texts/homo-sec-1-p1.tst
218 | -./u8idlint -xc11 texts/homo-1.c >texts/homo-1.tst
219 | -./u8idlint -xallowed texts/bidi-sec-1.c >texts/bidi-sec-1.tst
220 | -./u8idlint -xc11 texts/bidi-sec-1.c >texts/bidi-sec-1-c11.tst
221 | -./u8idlint -xsafec26 texts/bidi-sec-1.c >texts/bidi-sec-1-c26.tst
222 | -./u8idlint -xascii texts/bidi-sec-2.c >texts/bidi-sec-2-ascii.tst
223 | -./u8idlint -xallowed texts/bidi-sec-2.c >texts/bidi-sec-2-allowed.tst
224 | -./u8idlint -xid texts/bidi-sec-2.c >texts/bidi-sec-2.tst
225 | -./u8idlint -xc11 texts/bidi-sec-2.c >texts/bidi-sec-2-c11.tst
226 |
227 | c11-all.h unic26.h: mkc26 scripts.h mark.h medial.h
228 | ./mkc26
229 | mkc26: mkc26.c $(SRC) $(HEADER) $(HDRS)
230 | $(CC) $(CFLAGS_DBG) -O1 $(DEFINES) -DU8ID_PROFILE_SAFEC26 -I. -Iinclude mkc26.c $(SRC) -o $@
231 | check-asan: test.c $(SRC) $(HEADER) $(ALLHDRS)
232 | $(CC) $(CFLAGS_DBG) $(DEFINES) -fsanitize=address -I. -Iinclude test.c $(SRC) -o test-asan
233 | ./test-asan
234 | # gem install mdl
235 | check-mdl:
236 | mdl *.md doc/*.md
237 |
238 | perf: perf.c u8idroar.c $(HEADER) $(ALLHDRS) \
239 | nfkc_croar.h nfc_croar.h nfkd_croar.h nfd_croar.h allowed_croar.h confus_croar.h mark.h scripts16.h
240 | $(CC) $(CFLAGS_PERF) -Wno-unused-function $(DEFINES) -DPERF_TEST -I. -Iinclude \
241 | perf.c u8idroar.c -o perf && \
242 | ./perf
243 |
244 | clean: # every name is spelled out, so no brace expansion is needed from the recipe shell
245 | -rm -f u8ident.o u8idnorm.o u8idscr.o u8idroar.o $(LIB) $(SOLIB) \
246 | perf mkroar mkc26 u8idlint example texts.tst \
247 | test test-texts test-asan test-tr31 test-profiles test-xid-* \
248 | test-prof2 test-prof3 test-prof4 test-prof5 test-prof6 test-profC26_4 test-profC11_6 test-profSAFEC26 test-profC11STD \
249 | test-norm-NFKC test-norm-NFC test-norm-FCC test-norm-NFKD test-norm-NFD test-norm-FCD
250 |
251 | # Maintainer-only
252 | # Check coverage and sizes for all configure combinations
253 | check-norms: $(SRC) $(HEADER) $(ALLHDRS) # stops at the first failed build or test run
254 | for n in NFKC NFC FCC NFKD NFD FCD; do \
255 | echo $$n; \
256 | $(CC) $(CFLAGS_REL) -DU8ID_NORM=$$n -Wfatal-errors -Iinclude -c u8idnorm.c -o u8idnorm.o && \
257 | ls -gGh u8idnorm.o; \
258 | $(CC) $(CFLAGS_DBG) $(DEFINES) -DU8ID_NORM=$$n -I. -Iinclude test.c $(SRC) \
259 | -o test-norm-$$n && \
260 | ./test-norm-$$n norm && rm test-norm-$$n || exit 1; \
261 | done
262 | check-profiles: $(SRC) $(HEADER) $(ALLHDRS) # stops at the first failed build or test run
263 | for n in 2 3 4 5 6 C11_6 C26_4; do \
264 | echo PROFILE_$${n}; \
265 | $(CC) $(CFLAGS_DBG) $(DEFINES) -DU8ID_PROFILE=$$n -I. -Iinclude test.c $(SRC) \
266 | -o test-prof$$n && \
267 | ./test-prof$$n profile && rm test-prof$$n || exit 1; \
268 | done
269 | for n in SAFEC26 C11STD; do \
270 | echo PROFILE_$${n}; \
271 | $(CC) $(CFLAGS_DBG) $(DEFINES) -DU8ID_PROFILE_$${n} -I. -Iinclude test.c $(SRC) \
272 | -o test-prof$$n && \
273 | ./test-prof$$n profile && rm test-prof$$n || exit 1; \
274 | done
275 | check-tr31: $(SRC) $(HEADER) $(ALLHDRS) # stops at the first failed build or test run
276 | for x in ALLOWED SAFEC26 ID XID C11 C23 ALLUTF8 NONE; do \
277 | echo U8ID_TR31_$$x; \
278 | $(CC) $(CFLAGS_DBG) $(DEFINES) -DU8ID_TR31=$$x -I. -Iinclude test.c $(SRC) \
279 | -o test-xid-$$x && \
280 | ./test-xid-$$x xid && rm test-xid-$$x || exit 1; \
281 | done
282 | check-all-combinations: $(SRC) $(HEADER) $(ALLHDRS)
283 | for n in NFKC NFC NFKD NFD FCD FCC; do \
284 | for p in 2 3 4 5 6 C11_6 C26_4; do \
285 | for x in ALLOWED SAFEC26 ID XID C11 C23 ALLUTF8 NONE; do \
286 | if [ $$n != NFC ] && [ $$p = C26_4 -o $$x = SAFEC26 -o $$x = C23 ]; then \
287 | echo "skip -DU8ID_NORM=$$n -DU8ID_PROFILE=$$p -DU8ID_TR31=$$x"; \
288 | else \
289 | echo "check -DU8ID_NORM=$$n -DU8ID_PROFILE=$$p -DU8ID_TR31=$$x"; \
290 | $(CC) $(CFLAGS_DBG) $(DEFINES) -I. -Iinclude -DU8ID_PROFILE=$$p -DU8ID_NORM=$$n -DU8ID_TR31=$$x \
291 | -Wfatal-errors test.c $(SRC) \
292 | -o test-profiles && \
293 | ./test-profiles || exit; \
294 | fi; \
295 | done; \
296 | done; \
297 | done; \
298 | rm test-profiles
299 |
300 | # Create the normalization headers via a current perl
301 | Unicode-Normalize: un8ifcan.h
302 | if test -d Unicode-Normalize; then \
303 | cd Unicode-Normalize && git pull --rebase && cd ..; \
304 | else \
305 | git clone https://github.com/rurban/Unicode-Normalize; fi
306 | regen-norm: Unicode-Normalize un8ifcan.h
307 | cd Unicode-Normalize && \
308 | $(PERL) Makefile.PL && \
309 | make && \
310 | $(PERL) mkheader -ind -std && \
311 | cd - && cp Unicode-Normalize/un8if*.h .
312 | # Download some UCD files and create scripts.h
313 | regen-scripts:
314 | $(WGET) -N https://www.unicode.org/Public/UNIDATA/Scripts.txt
315 | $(WGET) -N https://www.unicode.org/Public/UNIDATA/ScriptExtensions.txt
316 | $(WGET) -N https://www.unicode.org/Public/UNIDATA/PropertyValueAliases.txt
317 | $(WGET) -N https://www.unicode.org/Public/UNIDATA/DerivedCoreProperties.txt
318 | $(WGET) -N https://www.unicode.org/Public/UNIDATA/DerivedNormalizationProps.txt
319 | $(WGET) -N https://www.unicode.org/Public/security/latest/IdentifierType.txt
320 | $(WGET) -N https://www.unicode.org/Public/security/latest/IdentifierStatus.txt
321 | $(PERL) mkscripts.pl
322 | regen-confus:
323 | $(WGET) -N https://www.unicode.org/Public/security/latest/confusables.txt
324 | $(PERL) mkconfus.pl
325 |
326 | docs: $(DOCS)
327 | # doc/P2528R0.* and doc/n2916.* are frozen
328 | html: doc/$(DCURCXX).html doc/$(NCURC).html
329 | pdf: doc/$(DCURCXX).pdf doc/$(NCURC).pdf
330 | doc/$(DCURCXX).html: doc/$(DCURCXX).md
331 | -$(PANDOC) -s -o $@ doc/$(DCURCXX).md --metadata title="$(DCURCXX) - C++ Identifier Security using Unicode Standard Annex 39"
332 | doc/$(DCURCXX).pdf: doc/$(DCURCXX).md
333 | -$(PANDOC) -s --pdf-engine=xelatex -o $@ doc/$(DCURCXX).md --variable mainfont="DejaVu Serif" --variable sansfont="DejaVu Sans" --variable monofont="DejaVu Sans Mono"
334 | doc/$(NCURC).html: doc/$(NCURC).md
335 | -$(PANDOC) -s -o $@ doc/$(NCURC).md --metadata title="$(NCURC) - C Identifier Security using Unicode Standard Annex 39 v2"
336 | doc/$(NCURC).pdf: doc/$(NCURC).md
337 | -$(PANDOC) -s --pdf-engine=xelatex -o $@ doc/$(NCURC).md --variable mainfont="DejaVu Serif" --variable sansfont="DejaVu Sans" --variable monofont="DejaVu Sans Mono" --metadata title="$(NCURC) - C Identifier Security using Unicode Standard Annex 39 v2"
338 |
339 | Dockerfile.pandoc: makefile.gnu
340 | echo "FROM fedora:35" >Dockerfile.pandoc
341 | echo "RUN yum -y install pandoc texlive-xetex dejavu-sans-mono-fonts dejavu-serif-fonts dejavu-sans-fonts" >> Dockerfile.pandoc
342 | echo "RUN useradd --no-log-init -U user -u 1000 -m" >> Dockerfile.pandoc
343 | echo "USER user" >> Dockerfile.pandoc
344 | echo "WORKDIR /home/user" >> Dockerfile.pandoc
345 | docker build -t pandoc -f Dockerfile.pandoc .
346 | docker-html: Dockerfile.pandoc
347 | -docker run -u user -v `pwd`/doc:/doc -it pandoc pandoc -s -o /doc/$(DCURCXX).html /doc/$(DCURCXX).md
348 | chown $$USER:$$USER doc/$(DCURCXX).html
349 | docker-pdf: Dockerfile.pandoc
350 | -docker run -u user -v `pwd`/doc:/doc -it pandoc pandoc -s --pdf-engine=xelatex -o /doc/$(DCURCXX).pdf /doc/$(DCURCXX).md --variable mainfont="DejaVu Serif" --variable sansfont="DejaVu Sans" --variable monofont="DejaVu Sans Mono"
351 | chown $$USER:$$USER doc/$(DCURCXX).pdf
352 |
353 | patch-c-doc:
354 | patch -f doc/$(NCURC).md doc/$(NCURC).patch
355 | regen-c-patch:
356 | -diff -bu doc/$(DCURCXX).md doc/$(NCURC).md >doc/$(NCURC).patch
357 |
358 | clang-format:
359 | clang-format -i *.c include/*.h scripts.h confus.h mark.h scripts16.h u8id*.h
360 | GTAGS: $(SRC) $(HEADER) $(ALLHDRS)
361 | ls $(SRC) $(HEADER) $(ALLHDRS) | gtags -f -
362 |
363 | # End Maintainer-only
364 |
365 | man: $(MAN)
366 |
367 | RONN_ARGS=--roff --manual "U8IDENT Manual $(VERSION)" --organization=rurban/libu8ident
368 | u8ident.3: README.md
369 | $(RONN) $(RONN_ARGS) < README.md > $@
370 | u8idlint.1: u8idlint
371 | LC_ALL=C help2man -N -s1 -p libu8ident --manual "U8IDENT Manual $(VERSION)" -o $@ ./u8idlint$(EXEEXT)
372 |
373 | dist: $(LIB) $(MAN) $(DOCS)
374 | -rm -rf $(PKG)
375 | $(MAKE) -f makefile.gnu install DESTDIR="$(PKG)"
376 | tar cfz $(PKG_BIN).tar.gz -C $(PKG) .
377 | -rm -rf $(PKG)
378 |
379 | dist-src:
380 | -rm -rf $(PKG)
381 | -mkdir -p $(PKG)/include
382 | cp $(HEADER) $(PKG)/include/
383 | cp `git ls-tree -r --name-only HEAD` $(PKG)/
384 | tar cfz $(PKG).tar.gz $(PKG)
385 | rm -rf $(PKG)
386 |
387 | install: $(LIB) $(MAN) # copies everything below $(DESTDIR)/$(PREFIX); every target directory is created first
388 | -mkdir -p $(DESTDIR)/$(PREFIX)/include
389 | -mkdir -p $(DESTDIR)/$(PREFIX)/lib
390 | -mkdir -p $(DESTDIR)/$(PREFIX)/bin
391 | -mkdir -p $(DESTDIR)/$(PREFIX)/share/doc/libu8ident
392 | -mkdir -p $(DESTDIR)/$(PREFIX)/share/man/man1 $(DESTDIR)/$(PREFIX)/share/man/man3
393 | install -m0644 $(HEADER) $(DESTDIR)/$(PREFIX)/include
394 | install -m0644 $(LIB) $(DESTDIR)/$(PREFIX)/lib
395 | install -m0755 u8idlint $(DESTDIR)/$(PREFIX)/bin
396 | install -m0644 $(MAN1) $(DESTDIR)/$(PREFIX)/share/man/man1
397 | install -m0644 $(MAN3) $(DESTDIR)/$(PREFIX)/share/man/man3
398 | install -m0644 $(DOCS) $(DESTDIR)/$(PREFIX)/share/doc/libu8ident
399 |
--------------------------------------------------------------------------------
/medial.h:
--------------------------------------------------------------------------------
1 | /* ex: set ro ft=c: -*- mode: c; buffer-read-only: t -*- */
2 | /* libu8ident - Check unicode security guidelines for identifiers.
3 | Copyright 2022 Reini Urban
4 | SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
5 |
6 | UCD Medial Letters and Forms, not at Start and not at End of a word.
7 | Includes MEDIAL and https://www.unicode.org/reports/tr31/#Table_Optional_Medial
8 | Generated by mkmedial.pl, do not modify.
9 | */
10 |
11 | #ifdef EXTERN_SCRIPTS
12 | extern const struct range_bool medial_list[77];
13 | #else
14 | const struct range_bool medial_list[] = {
15 | // clang-format off
16 | {0x27, 0x27}, // '
17 | {0x2D, 0x2E}, // -...
18 | {0x3A, 0x3A}, // :
19 | {0xB7, 0xB7}, // ·
20 | {0x58A, 0x58A}, // ֊
21 | {0x5F4, 0x5F4}, // ״
22 | {0xF0B, 0xF0B}, // ་
23 | {0x103B, 0x103E}, // ျ..ှ
24 | {0x105E, 0x1060}, // ၞ..ၠ
25 | {0x1082, 0x1082}, // ႂ
26 | {0x14EC, 0x14EC}, // ᓬ
27 | {0x1552, 0x1552}, // ᕒ
28 | {0x1A55, 0x1A56}, // ᩕ..ᩖ
29 | {0x200C, 0x200C}, //
30 | {0x2010, 0x2010}, // ‐
31 | {0x2019, 0x2019}, // ’
32 | {0x2027, 0x2027}, // ‧
33 | {0x30A0, 0x30A0}, // ゠
34 | {0x30FB, 0x30FB}, // ・
35 | {0xFB55, 0xFB55}, // ﭕ
36 | {0xFB59, 0xFB59}, // ﭙ
37 | {0xFB5D, 0xFB5D}, // ﭝ
38 | {0xFB61, 0xFB61}, // ﭡ
39 | {0xFB65, 0xFB65}, // ﭥ
40 | {0xFB69, 0xFB69}, // ﭩ
41 | {0xFB6D, 0xFB6D}, // ﭭ
42 | {0xFB71, 0xFB71}, // ﭱ
43 | {0xFB75, 0xFB75}, // ﭵ
44 | {0xFB79, 0xFB79}, // ﭹ
45 | {0xFB7D, 0xFB7D}, // ﭽ
46 | {0xFB81, 0xFB81}, // ﮁ
47 | {0xFB91, 0xFB91}, // ﮑ
48 | {0xFB95, 0xFB95}, // ﮕ
49 | {0xFB99, 0xFB99}, // ﮙ
50 | {0xFB9D, 0xFB9D}, // ﮝ
51 | {0xFBA3, 0xFBA3}, // ﮣ
52 | {0xFBA9, 0xFBA9}, // ﮩ
53 | {0xFBAD, 0xFBAD}, // ﮭ
54 | {0xFBD6, 0xFBD6}, // ﯖ
55 | {0xFBE7, 0xFBE7}, // ﯧ
56 | {0xFBE9, 0xFBE9}, // ﯩ
57 | {0xFBFF, 0xFBFF}, // ﯿ
58 | {0xFCDF, 0xFCF4}, // ﳟ..ﳴ
59 | {0xFD34, 0xFD3B}, // ﴴ..ﴻ
60 | {0xFE77, 0xFE77}, // ﹷ
61 | {0xFE79, 0xFE79}, // ﹹ
62 | {0xFE7B, 0xFE7B}, // ﹻ
63 | {0xFE7D, 0xFE7D}, // ﹽ
64 | {0xFE7F, 0xFE7F}, // ﹿ
65 | {0xFE8C, 0xFE8C}, // ﺌ
66 | {0xFE92, 0xFE92}, // ﺒ
67 | {0xFE98, 0xFE98}, // ﺘ
68 | {0xFE9C, 0xFE9C}, // ﺜ
69 | {0xFEA0, 0xFEA0}, // ﺠ
70 | {0xFEA4, 0xFEA4}, // ﺤ
71 | {0xFEA8, 0xFEA8}, // ﺨ
72 | {0xFEB4, 0xFEB4}, // ﺴ
73 | {0xFEB8, 0xFEB8}, // ﺸ
74 | {0xFEBC, 0xFEBC}, // ﺼ
75 | {0xFEC0, 0xFEC0}, // ﻀ
76 | {0xFEC4, 0xFEC4}, // ﻄ
77 | {0xFEC8, 0xFEC8}, // ﻈ
78 | {0xFECC, 0xFECC}, // ﻌ
79 | {0xFED0, 0xFED0}, // ﻐ
80 | {0xFED4, 0xFED4}, // ﻔ
81 | {0xFED8, 0xFED8}, // ﻘ
82 | {0xFEDC, 0xFEDC}, // ﻜ
83 | {0xFEE0, 0xFEE0}, // ﻠ
84 | {0xFEE4, 0xFEE4}, // ﻤ
85 | {0xFEE8, 0xFEE8}, // ﻨ
86 | {0xFEEC, 0xFEEC}, // ﻬ
87 | {0xFEF4, 0xFEF4}, // ﻴ
88 | {0x1171D, 0x1171F}, // 𑜝..𑜟
89 | {0x11940, 0x11940}, // 𑥀
90 | {0x11942, 0x11942}, // 𑥂
91 | {0x1612A, 0x1612C}, // ..
92 | {0x1612E, 0x1612E}, //
93 | // clang-format on
94 | }; // 8 ranges, 69 singles, 116 codepoints
95 | #endif
96 |
--------------------------------------------------------------------------------
/mingw-cross.cmake:
--------------------------------------------------------------------------------
1 | # Toolchain file for building Windows binaries with the x86_64-w64-mingw32 gcc/g++.
2 | # from linux or a bsd: cmake . -DCMAKE_TOOLCHAIN_FILE=mingw-cross.cmake
3 | # from windows: cmake . -G "MinGW Makefiles"
4 | set(CMAKE_SYSTEM_NAME Windows)
5 |
6 | set(TOOLCHAIN_PREFIX x86_64-w64-mingw32)
7 | set(CMAKE_C_COMPILER ${TOOLCHAIN_PREFIX}-gcc)
8 | set(CMAKE_CXX_COMPILER ${TOOLCHAIN_PREFIX}-g++)
9 |
10 | # root directory of the target environment (the cross toolchain's own tree)
11 | set(CMAKE_FIND_ROOT_PATH /usr/${TOOLCHAIN_PREFIX})
12 |
13 | # adjust the default behavior of the FIND_XXX() commands:
14 | # programs are never searched below CMAKE_FIND_ROOT_PATH, i.e. host tools are used
15 | set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER)
16 | # headers and libraries are searched only below CMAKE_FIND_ROOT_PATH
17 | set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY)
18 | set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)
19 |
--------------------------------------------------------------------------------
/mkconfus.pl:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env perl -s
2 | # libu8ident - Check unicode security guidelines for identifiers.
3 | # Copyright 2014, 2021, 2022 Reini Urban
4 | # SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
5 | #
6 | # Create confus.h from https://www.unicode.org/Public/security/latest/confusables.txt
7 | # with mkconfus.pl -c.
8 | # perf needs mkconfus.pl without -c to generate all headers.
9 | #
10 | # Note that the first array is just a binary-search in an unoptimized,
11 | # uncompressed array, without any values. It might be smaller and
12 | # faster with gperf or cbitset/croaring.
13 |
14 | use vars qw($c);
15 | use strict;
16 | use Config;
17 | use utf8;
18 | use Encode ();
19 | use B 'cstring';
20 | use Unicode::Normalize;
21 |
22 | my $confus = "confusables.txt";
23 | for ($confus) {
24 | if (!-e $_) {
25 | system("wget -N https://www.unicode.org/Public/security/latest/$_");
26 | }
27 | }
28 |
29 | # fixup wrong pairs. these are not really confusable
30 | my %confbugs = map { $_ => 1 } qw(0022 0025 0030 0031 0049 006D 007C
31 | 0251 028B 028F
32 | 03BD 03C3 03D1 03F1 03F4
33 | 0561 0570 0575 0584);
34 | my (@CONF, @ucd_version, $started);
35 | open my $CONF, "<", $confus or die "$confus $!";
36 | while (<$CONF>) {
37 | if (!$started) {
38 | # Header: remember the UCD version; data starts after the "For documentation" line.
39 | if (/^# Version: (\d+)\.(\d+)\.(\d+)/) { @ucd_version = ($1,$2,$3); }
40 | if (/^# For documentation and/) { $started++; }
41 | next;
42 | }
43 | # Data line: "SOURCE ;\tTARGET [TARGET ...] ;..." with one to four targets of
44 | # 4-5 hex digits each. Everything else (comments, blank lines, longer
45 | # target sequences) is skipped.
46 | next unless /^([0-9A-F]{4,5}) ;\t((?:[0-9A-F]{4,5} ){1,4});/;
47 | my ($src, $targets) = ($1, $2);
48 | my @entry = (hex($src), map { hex $_ } split(' ', $targets));
49 | # Pairs listed in %confbugs get a trailing "//" marker; the writer below emits
50 | # them commented out. The lookup uses this line's own source field, so a
51 | # capture left over from an earlier line can never tag the wrong entry.
52 | push @entry, "//" if exists $confbugs{$src};
53 | push @CONF, \@entry;
54 | }
55 | close $CONF;
56 | @CONF = sort {$a->[0] <=> $b->[0]} @CONF;
57 | printf "%u confusables\n", scalar @CONF;
69 |
70 | my $ofile1 = "confus.h";
71 | chmod 0644, $ofile1 if -e $ofile1; # the header is set read-only (0444) once it is written
72 | open my $H1, ">:utf8", $ofile1 or die "writing $ofile1 $!";
73 | print $H1 <<"EOF"; # prologue of confus.h; <stdint.h> provides uint32_t for the table
74 | /* ex: set ro ft=c: -*- mode: c; buffer-read-only: t -*- */
75 | /* libu8ident - Check unicode security guidelines for identifiers.
76 | Copyright 2021 Reini Urban
77 | SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
78 |
79 | Generated by mkconfus.pl, do not modify.
80 | UNICODE version $ucd_version[0].$ucd_version[1]
81 | */
82 | #include <stdint.h>
83 |
84 | /* Sorted set of all confusables without mapping,
85 | from https://www.unicode.org/Public/security/latest/confusables.txt
86 | Some confusables are outcommented.
87 | */
88 | #ifndef EXTERN_SCRIPTS
89 | const uint32_t confusables[] = {
90 | // clang-format off
91 | EOF
92 |
93 | my $ofile = "gconfus.h.in";
94 | chmod 0644, $ofile if -e $ofile;
95 | open my $H, ">:utf8", $ofile or die "writing $ofile $!";
96 | print $H <<"EOF";
97 | %{/* ex: set ro ft=c: -*- mode: c; buffer-read-only: t -*- */
98 | /* libu8ident - Check unicode security guidelines for identifiers.
99 | Copyright 2014, 2021, 2022 Reini Urban
100 | SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
101 |
102 | generated by mkconfus.pl, do not modify.
103 | UNICODE version $ucd_version[0].$ucd_version[1]
104 | */
105 |
106 | #include
107 | #include "u8id_private.h"
108 |
109 | // v3.1 changed len type from unsigned int to size_t (gperf d519d1a821511eaa22eae6d9019a548aea21e6)
110 | #ifdef GPERF_VERSION
111 | # if GPERF_VERSION < 301
112 | # define SIZE_TYPE unsigned int
113 | # else
114 | # define SIZE_TYPE size_t
115 | # endif
116 | #else
117 | # define SIZE_TYPE size_t
118 | #endif
119 |
120 | /* FIXME gperf to accept uint32_t keys.
121 | key is "%05X" of the unicode codepoint, fixed length 5.
122 | u8nfc is the rhs of confusables, already decomposed into NFC and encoded as UTF-8.
123 | Some confusables are outcommented.
124 | */
125 | %}
126 | %7bit
127 | %language=ANSI-C
128 | %struct-type
129 | %readonly-tables
130 | %null-strings
131 |
132 | struct confus_gperf { const char *const name; const char *const u8nfc; };
133 |
134 | %%
135 | EOF
136 | my $i = 0;
137 | my $enc = Encode::find_encoding("UTF-8");
138 | for my $c (@CONF) {
139 | if (@$c == 2) {
140 | printf $H "#-- %s -> %s\n", chr $c->[0], chr $c->[1];
141 | printf $H1 " 0x%04X,\t/* %s -> %s */\n", $c->[0], chr $c->[0], chr $c->[1];
142 | printf $H "%05X,\t%s\n", $c->[0], B::cstring NFC(chr $c->[1]);
143 | $i++;
144 | } else {
145 | my $f = shift @$c;
146 | my $j = @$c - 1;
147 | if ($c->[$j] eq "//") {
148 | printf $H1 "//";
149 | pop @$c;
150 | printf $H "#-- %s -> %s\n", chr $f, join('', map {chr $_} @$c);
151 | printf $H "# ";
152 | } else {
153 | printf $H "#-- %s -> %s\n", chr $f, join('', map {chr $_} @$c);
154 | $i++;
155 | }
156 | printf $H1 " 0x%04X,\t/* %s -> ", $f, chr $f;
157 | printf $H "%05X,\t", $f;
158 | my $nfd = "";
159 | for (@$c) {
160 | printf $H1 "%s", chr $_;
161 | $nfd .= NFC(chr $_);
162 | }
163 | printf $H1 " */\n";
164 | printf $H "%s\n", B::cstring $nfd;
165 | }
166 | }
167 |
168 | print $H <<'EOF';
169 | %%
170 |
171 | /*
172 | * Local variables:
173 | * c-file-style: "gnu"
174 | * End:
175 | * vim: expandtab shiftwidth=4 cinoptions='\:2=2' :
176 | */
177 | EOF
178 | close $H;
179 | chmod 0444, $ofile;
180 |
181 | print $H1 <<"EOF";
182 | // clang-format on
183 | };
184 | #else
185 | extern const uint32_t confusables[$i];
186 | #endif
187 | EOF
188 | close $H1;
189 | chmod 0444, $ofile1;
190 |
191 | print "Create serialized roaring bitmaps:\n";
192 | my $arg = $c ? "confus" : "";
193 | if ($^O =~ /Win32/) {
194 | system($Config{cc}." mkroar.c -I. -o mkroar.exe");
195 | if ($c) {
196 | system("mkroar.exe", $arg);
197 | } else {
198 | system("mkroar.exe");
199 | }
200 | # ignore vms for now
201 | } else {
202 | system($Config{cc}." mkroar.c -I. -o mkroar");
203 | if ($c) {
204 | system("./mkroar", $arg);
205 | } else {
206 | system("./mkroar");
207 | }
208 | }
209 | print "\n";
210 | # allowed not optimized. stayed with 816 array
211 | # confus optimized from 8552 byte to 4731 byte (3 run-length encoded containers)
212 | # NFD_N, NFC_N, NFC_M, NFKD_N, NFKC_N, NFKC_M
213 | my @names = $c ? ('confus')
214 | : ('allowed', 'confus', 'nfd_n', 'nfc_n', 'nfc_m', 'nfkd_n', 'nfkc_n', 'nfkc_m');
215 | # Convert each serialized bitmap NAME_croar.bin into a C array with xxd.
216 | # The nf* bitmaps go to the header of their form (nfc_n and nfc_m => nfc_croar.h).
217 | foreach my $name (@names) {
218 | my $bin = "${name}_croar";
219 | my $hdr = $name =~ /(nfk?[cd])_./ ? "${1}_croar" : $bin;
220 | # nfc_m and nfkc_m are appended to the header that the _n entry before them started
221 | my $starts_header = $name !~ /^nfk?c_m/;
222 | if ($starts_header) {
223 | open my $F, '>', "$hdr.h";
224 | print $F "/* generated via mkroar.c */\n";
225 | close $F;
226 | }
227 | system("xxd -i $bin.bin >> $hdr.h");
228 | unlink "$bin.bin";
229 | print "Created $hdr.h\n" if $starts_header;
230 | }
233 |
--------------------------------------------------------------------------------
/mkgc.pl:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env perl
2 | # libu8ident - Check unicode security guidelines for identifiers.
3 | # Copyright 2022 Reini Urban
4 | # SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
5 | #
6 | # Create u8id_gc.h General_Category API
7 |
8 | use strict;
9 | use Config;
10 | use utf8;
11 |
12 | my $ucd = "UnicodeData.txt";
13 | if (!-e $ucd) {
14 | system("wget -N https://www.unicode.org/Public/UNIDATA/$ucd");
15 | }
16 |
17 | my $gc_h = "u8id_gc.h";
18 | my (@GC, %GC);
19 | open my $UCD, "<", $ucd or die "$ucd $!";
20 | my ($from, $to, $oldto, $oldgc, $gc) = (0,0,0,'Cc','');
21 | while (<$UCD>) { # collapse consecutive records of one category into [from, to, gc] runs
22 | my @l = split ';';
23 | $gc = $l[2]; # third ';'-separated field, used as the category code
24 | if (defined $gc && $gc =~ /^\w.$/) {
25 | $GC{$gc}++;
26 | $to = hex($l[0]);
27 | if ($oldgc ne $gc) {
28 | push @GC, [$from, $to-1, $oldgc]; # the previous run ends just before this codepoint
29 | $from = $to;
30 | $oldgc = $gc;
31 | }
32 | #else {
33 | # $oldto = $GC[$#GC]->[1];
34 | # # FIXME
35 | # if ($oldto + 1 != $to) {
36 | # push @GC, [$to, $to, $gc];
37 | # $from = $to;
38 | # } else { # update last
39 | # $GC[$#GC]->[1] = $to;
40 | # }
41 | #}
42 | }
43 | }
44 | push @GC, [$from, $to, $oldgc]; # the remainder: the run still open at the end of the file
45 | close $UCD;
46 |
47 | if (!-w $gc_h) { # the header is set read-only (0444) at the end of a run
48 | chmod 0644, $gc_h;
49 | }
50 | open my $H, ">:utf8", $gc_h or die "writing $gc_h $!";
51 | print $H <<'EOF'; # prologue of u8id_gc.h; <stdint.h> provides uint32_t for struct gc
52 | /* ex: set ro ft=c: -*- mode: c; buffer-read-only: t -*- */
53 | /* libu8ident - Check unicode security guidelines for identifiers.
54 | Copyright 2022 Reini Urban
55 | SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
56 |
57 | UCD General_Category API
58 | For scripts.h we are only interested for the Identifier parts in scx_list[]
59 | to detect illegal runs.
60 | But u8idlint.c/unic26.h needs the full list.
61 | Thanksfully this is only needed offline, to create unic26.h.
62 | Generated by mkgc.pl, do not modify.
63 | */
64 |
65 | #include <stdint.h>
66 |
67 | enum u8id_gc {
68 | EOF
69 | my @GCs = sort keys %GC;
70 | for my $g (@GCs) {
71 | $g =~ s/&/amp/;
72 | printf $H " GC_%s,\n", $g;
73 | }
74 | printf $H " GC_L, // is L&, all letters. (for unic26.h only)\n";
75 | printf $H " GC_V, // is varying, eg L or S. (for unic26.h only),\n";
76 | printf $H " GC_INVALID,\n";
77 | printf $H <<'EOF', scalar(@GCs) + 3;
78 | };
79 |
80 | #ifdef EXTERN_SCRIPTS
81 | extern const char *const u8id_gc_names[%u];
82 | #else
83 | LOCAL const char *const u8id_gc_names[] = {
84 | EOF
85 | for my $g (@GCs) {
86 | $g =~ s/&/amp/;
87 | printf $H " \"%s\",\n", $g;
88 | }
89 | printf $H " \"L\", // L& really (for unic26.h only)\n";
90 | printf $H " \"V\", // varying (for unic26.h only)\n";
91 | printf $H " \"Zz\"\n";
92 | printf $H <<'EOF';
93 | };
94 | #endif
95 |
96 | struct gc {
97 | uint32_t from;
98 | uint32_t to;
99 | enum u8id_gc gc;
100 | };
101 |
102 | EOF
103 |
104 | printf $H <<'EOF', scalar @GC;
105 | #ifdef EXTERN_SCRIPTS
106 | extern const struct gc gc_list[%u];
107 | #else
108 | const struct gc gc_list[] = {
109 | // clang-format off
110 | EOF
111 | for my $r (@GC) {
112 | my ($lo, $hi, $cat) = @$r;
113 | # Sample characters for the trailing comment: left blank for marks, for
114 | # surrogates and for anything that is not a word character. Each end of a
115 | # range is judged on its own character.
116 | my ($u, $u1) = map {
117 | my $ch = chr $_;
118 | ($cat =~ /^M/ || ($_ >= 0xD800 && $_ <= 0xDFFF) || $ch !~ /\w/) ? '' : $ch;
119 | } ($lo, $hi);
120 | if ($lo == $hi) {
121 | printf $H " {0x%X, 0x%X, GC_%s},\t// %s\n", $lo, $hi, $cat, $u;
122 | } else {
123 | printf $H " {0x%X, 0x%X, GC_%s},\t// %s..%s\n", $lo, $hi, $cat, $u, $u1;
124 | }
125 | }
126 | printf $H <<'EOF';
127 | // clang-format on
128 | };
129 | #endif
130 | EOF
131 |
132 | close $H;
133 | chmod 0444, $gc_h;
134 | print "Created $gc_h\n";
135 |
136 |
--------------------------------------------------------------------------------
/mkmark.pl:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env perl
2 | # libu8ident - Check unicode security guidelines for identifiers.
3 | # Copyright 2014, 2021 Reini Urban
4 | # SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
5 | #
6 | # Create mark.h Combining_Mark (Mc | Me | Mn)
7 |
8 | use strict;
9 | use Config;
10 | my $ucd = "UnicodeData.txt";
11 | if (!-e $ucd) {
12 | system("wget -N https://www.unicode.org/Public/UNIDATA/$ucd");
13 | }
14 |
15 | my $mark_h = "mark.h";
16 | my $doc = "doc/appendix-h.md";
17 | my (@MARK, @NSM, %NSM);
18 | open my $UCD, "<", $ucd or die "$ucd $!";
19 | my ($from, $to, $oldto);
20 | while (<$UCD>) {
21 | my @l = split ';';
22 | my $mark = $l[2];
23 | if ($mark =~ /^M[cen]$/) {
24 | $to = hex($l[0]);
25 | if (!$from) {
26 | push @MARK, [$to, $to];
27 | $from = $to;
28 | } else {
29 | $oldto = $MARK[$#MARK]->[1];
30 | if ($oldto + 1 != $to) {
31 | push @MARK, [$to, $to];
32 | $from = $to;
33 | } else { # update last
34 | $MARK[$#MARK]->[1] = $to;
35 | }
36 | }
37 | my $name = $l[1];
38 | my $n = $l[10];
39 | if ($name =~ /^COMBINING / && $l[4] eq 'NSM' && $n =~ /^NON-SPACING /) {
40 | $n =~ s/^NON-SPACING //;
41 | my $nsm = $n;
42 | push @NSM, [ $nsm, $to ];
43 | }
44 | }
45 | }
46 | # $NSM{'DOT ABOVE'} = [ ord('i') ];
47 | seek($UCD,0,0);
48 | while (<$UCD>) {
49 | my @l = split ';';
50 | my $name = $l[1];
51 | # Check "HIRAGANA LETTER PA;Lo;0;L;306F 309A;" NFD expansion
52 | if ($l[2] =~ /^L/ && $l[5] =~ / /) {
53 | for my $r (@NSM) {
54 | my $nsm = $r->[0];
55 | my $n = sprintf("%04X", $r->[1]);
56 | if ($l[5] =~ / \Q$n\E$/) {
57 | my $cp = hex $l[0];
58 | if (exists $NSM{$nsm}) {
59 | push @{$NSM{$nsm}}, $cp;
60 | } else {
61 | $NSM{$nsm} = [ $cp ];
62 | }
63 | }
64 | }
65 | }
66 | # but also check names with NSM's, e.g. LATIN SMALL LETTER Q WITH HOOK
67 | # without any NFD expansion. => DOT ABOVE, DIAERESIS, DOT BELOW, THREE DOTS ABOVE,
68 | # FOUR DOTS ABOVE.
69 | elsif ($name =~ /LETTER .+ WITH .+/) {
70 | for my $r (@NSM) {
71 | my $nsm = $r->[0];
72 | if ($name =~ /LETTER .+ WITH \Q$nsm\E$/) {
73 | my $cp = hex $l[0];
74 | if (exists $NSM{$nsm}) {
75 | push @{$NSM{$nsm}}, $cp;
76 | } else {
77 | $NSM{$nsm} = [ $cp ];
78 | }
79 | }
80 | }
81 | }
82 | # plus all the DOTLESS i and j letters, but not arabic BEH, NOON, FEH, QAF
83 | if ($name =~ /LETTER .+ DOTLESS [IJ]/) {
84 | my $nsm = 'DOT ABOVE';
85 | my $cp = hex $l[0];
86 | #if (exists $NSM{$nsm}) {
87 | push @{$NSM{$nsm}}, $cp;
88 | #} else {
89 | # $NSM{$nsm} = [ $cp ];
90 | #}
91 | }
92 | }
93 | close $UCD;
94 | # delete empty NSM's
95 | my $NUM_NSM = scalar @NSM;
96 | my @N;
97 | for my $r (@NSM) {
98 | if (exists $NSM{$r->[0]}) {
99 | push @N, $r;
100 | } else {
101 | warn "skip empty $r->[0] ",sprintf("%04X",$r->[1]),"\n";
102 | }
103 | }
104 | @NSM = @N;
105 |
106 | chmod 0644, $mark_h if !-w $mark_h; # both outputs are set read-only (0444) at the end of a run
107 | chmod 0644, $doc if !-w $doc;
108 | open my $H, ">:encoding(UTF-8)", $mark_h or die "writing $mark_h $!";
109 | open my $DOC, ">:encoding(UTF-8)", $doc or die "writing $doc $!";
110 | print $H <<'EOF'; # prologue of mark.h; the includes provide uint32_t and wchar_t for struct nsm_ws
111 | /* ex: set ro ft=c: -*- mode: c; buffer-read-only: t -*- */
112 | /* libu8ident - Check unicode security guidelines for identifiers.
113 | Copyright 2014, 2021, 2022 Reini Urban
114 | SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
115 |
116 | All Combining_Mark (Mc | Me | Mn),
117 | All letters with non-spacing combining marks.
118 | Generated by mkmark.pl, do not modify.
119 | */
120 | #include <stdint.h>
121 | #include <wchar.h>
122 |
123 | struct nsm_ws {
124 | uint32_t nsm;
125 | wchar_t *letters;
126 | };
127 |
128 | EOF
129 | print $DOC <[0], $r->[1];
144 | }
145 | printf $H <<'EOF';
146 | // clang-format on
147 | };
148 | #endif
149 | EOF
150 |
151 | printf $H <<'EOF';
152 |
153 | /* All non-spacing combining marks, sorted */
154 | enum nsm_marks {
155 | EOF
156 | for my $r (@NSM) {
157 | my $n = $r->[0];
158 | $n =~ s/[ -]/_/g;
159 | printf $H " NSM_%s,\t/* %X */\n", $n, $r->[1];
160 | }
161 | printf $H <<'EOF';
162 | NSM_LAST
163 | };
164 | EOF
165 |
166 | printf $H <<'EOF', scalar @NSM;
167 |
168 | /* All letters with non-spacing combining marks, sorted.
169 | The first entry is the NSM, if letters exist.
170 | */
171 | #ifdef EXTERN_SCRIPTS
172 | extern const struct nsm_ws nsm_letters[%u];
173 | #else
174 | const struct nsm_ws nsm_letters[] = {
175 | // clang-format off
176 | EOF
177 | printf $DOC <<'EOF', $NUM_NSM, scalar @NSM;
178 |
179 | From all %u non-spacing marks,
180 | the list of letters already including its %u non-spacing marks:
181 |
182 | EOF
183 |
184 | for my $r (@NSM) {
185 | my $nsm = $r->[0];
186 | printf $H " { ";
187 | if ($NSM{$nsm}) {
188 | my $u = "";
189 | if ($r->[1] >= 0x10000) {
190 | printf $H "0x%05X, /* NSM: %s %X */\n ", $r->[1], $nsm, $r->[1];
191 | printf $DOC "- NSM: %s %05X\n\n ", $nsm, $r->[1];
192 | } else {
193 | printf $H "0x%04X, /* NSM: %s %X */\n ", $r->[1], $nsm, $r->[1];
194 | printf $DOC "- NSM: %s %04X\n\n ", $nsm, $r->[1];
195 | }
196 | printf $H "L\"";
197 | my $i = 0;
198 | my @L = @{$NSM{$nsm}};
199 | for (@L) {
200 | if ($_ != 0) {
201 | $u .= chr ($_);
202 | if ($_ >= 0x10000) {
203 | printf $H "\\U%08X", $_;
204 | printf $DOC "%05X", $_;
205 | } else {
206 | printf $H "\\u%04X", $_;
207 | printf $DOC "%04X", $_;
208 | }
209 | if (@L == $i + 1) {
210 | print $DOC "\n";
211 | } else {
212 | print $DOC ++$i % 8 ? ", " : "\n ";
213 | }
214 | } else {
215 | warn "0 in $nsm";
216 | }
217 | }
218 | printf $H "\" },\n /* %s */\n", $u;
219 | printf $DOC "\n \"%s\"\n", $u;
220 | printf $DOC "\n";
221 | } else {
222 | printf $H "0U }, /* NSM: %s %X */\n", $nsm, $r->[1];
223 | printf $DOC "None\n";
224 | }
225 | }
226 | printf $H <<'EOF';
227 | // clang-format on
228 | };
229 | #endif
230 | EOF
231 |
232 | # assume symlinked
233 | if (-e "roaring.h" && -e "roaring.c") {
234 | print "\nCreate serialized roaring bitmaps:\n";
235 | # Build the mkroar helper (mkroar.exe on Win32; vms is ignored for now) and run it for "mark".
236 | my ($exe, $run) = $^O =~ /Win32/ ? ("mkroar.exe", "mkroar.exe") : ("mkroar", "./mkroar");
237 | system($Config{cc}." mkroar.c -I. -o $exe");
238 | system("$run mark");
239 | print "\n";
240 | my $c = "mark_croar";
241 | # The byte size of the serialized bitmap is the array length in the extern
242 | # declaration below, so it is read from the file instead of being hard-coded.
243 | my $binlen = -s "$c.bin";
244 | die "$c.bin is missing or empty, mkroar failed?\n" unless $binlen;
245 | print $H <<'EOF';
246 |
247 | // This was just an experiment. It's slower than binary search in ranges.
248 | #ifdef HAVE_CROARING
249 | # ifndef EXTERN_SCRIPTS
250 | /* generated via mkroar.c */
251 | EOF
252 | close $H;
253 | system("xxd -i $c.bin >> $mark_h");
254 | unlink "$c.bin";
255 | open $H, ">>", $mark_h or die "appending $mark_h $!";
256 | printf $H <<'EOF', $binlen;
257 | # else
258 | extern const unsigned int mark_croar_bin_len;
259 | extern const unsigned char mark_croar_bin[%u];
260 | # endif // EXTERN_SCRIPTS
261 | #endif // HAVE_CROARING
262 | EOF
263 | }
264 |
265 | close $H;
266 | close $DOC;
267 | chmod 0444, $mark_h;
268 | chmod 0444, $doc;
269 | print "Created mark.h and $doc\n";
270 |
271 |
--------------------------------------------------------------------------------
/mkmedial.pl:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env perl
2 | # libu8ident - Check unicode security guidelines for identifiers.
3 | # Copyright 2022 Reini Urban
4 | # SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
5 | #
6 | # Create medial.h, letters in MEDIAL position
7 |
8 | use strict;
9 | use Config;
10 | use utf8;
11 |
# Fetch the Unicode Character Database on demand; stop right away with a
# clear message when the download fails.
my $ucd = "UnicodeData.txt";
if (!-e $ucd) {
  system("wget -N https://www.unicode.org/Public/UNIDATA/$ucd") == 0
    or die "downloading $ucd failed: $?";
}

my $medial_h = "medial.h";
# List of [from, to] code point ranges, both ends inclusive.
my @M;
open my $UCD, "<", $ucd or die "$ucd $!";
# $from/$oldto delimit the currently open run of consecutive code points.
# 0 means "no run opened yet".
my ($from, $oldto) = (0, 0);
while (<$UCD>) {
  my ($m, $name) = split ';';
  next unless defined $name && $name =~ / MEDIAL /;
  my $cp = hex($m);
  if (!$from) {
    # very first match opens the first run
    $from = $oldto = $cp;
  } elsif ($oldto + 1 != $cp) {
    # gap: flush the finished run and open a new one
    push @M, [$from, $oldto];
    $from = $oldto = $cp;
  } else {
    # still consecutive: extend the open run
    $oldto = $cp;
  }
}
# Flush the remainder, but only if anything matched at all.
push @M, [$from, $oldto] if $from;
close $UCD;
40 |
# Extra entries from
# https://www.unicode.org/reports/tr31/#Table_Optional_Medial
# Most of them are in no TR31 id set, but listing them here at least forbids
# them at the start and at the end.
push @M, (
  [0x27, 0x27],     # APOSTROPHE
  [0x2D, 0x2E],     # HYPHEN-MINUS..FULL-STOP
  [0x3A, 0x3A],     # COLON
  [0xB7, 0xB7],     # MIDDLE DOT. disallowed in SAFEC26
  [0x58A, 0x58A],   # ARMENIAN HYPHEN
  [0x5F4, 0x5F4],   # HEBREW PUNCTUATION GERSHAYIM
  [0xF0B, 0xF0B],   # TIBETAN MARK INTERSYLLABIC TSHEG
  [0x200C, 0x200C], # ZERO WIDTH NON-JOINER
  [0x2010, 0x2010], # HYPHEN
  [0x2019, 0x2019], # RIGHT SINGLE QUOTATION MARK
  [0x2027, 0x2027], # HYPHENATION POINT
  [0x30A0, 0x30A0], # KATAKANA-HIRAGANA DOUBLE HYPHEN
  [0x30FB, 0x30FB], # KATAKANA MIDDLE DOT
);

# Order all ranges by their first code point.
sub by_range_start { $a->[0] <=> $b->[0] }
@M = sort by_range_start @M;
58 |
# A header left over from a previous run is read-only; make it writable again.
chmod 0644, $medial_h unless -w $medial_h;

open my $H, ">:utf8", $medial_h or die "writing $medial_h $!";
print $H <<'EOF';
/* ex: set ro ft=c: -*- mode: c; buffer-read-only: t -*- */
/* libu8ident - Check unicode security guidelines for identifiers.
Copyright 2022 Reini Urban
SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later

UCD Medial Letters and Forms, not at Start and not at End of a word.
Includes MEDIAL and https://www.unicode.org/reports/tr31/#Table_Optional_Medial
Generated by mkmedial.pl, do not modify.
*/

EOF

# The extern declaration carries the exact number of table entries.
printf $H <<'EOF', scalar @M;
#ifdef EXTERN_SCRIPTS
extern const struct range_bool medial_list[%u];
#else
const struct range_bool medial_list[] = {
// clang-format off
EOF

# The character shown in the trailing comment. Surrogates (U+D800..U+DFFF)
# cannot be written as UTF-8, so both range ends use the same guard.
my $sample = sub {
  my ($cp) = @_;
  return ($cp >= 0xD800 && $cp <= 0xDFFF) ? '' : chr $cp;
};

# Counters for the summary line. Deliberately not named $b, which would mask
# sort's $b for the rest of the file.
my ($ranges, $singles, $sum) = (0, 0, 0);
for my $r (@M) {
  my ($lo, $hi) = @$r;
  if ($lo == $hi) {
    $singles++;
    $sum++;
    printf $H " {0x%X, 0x%X},\t// %s\n", $lo, $hi, $sample->($lo);
  } else {
    $ranges++;
    $sum += $hi + 1 - $lo;
    printf $H " {0x%X, 0x%X},\t// %s..%s\n", $lo, $hi,
      $sample->($lo), $sample->($hi);
  }
}
printf $H <<'EOF', $ranges, $singles, $sum;
// clang-format on
}; // %u ranges, %u singles, %u codepoints
#endif
EOF

close $H;
# Generated file: mark it read-only.
chmod 0444, $medial_h;
print "Created $medial_h\n";
108 |
109 |
--------------------------------------------------------------------------------
/mkroar.c:
--------------------------------------------------------------------------------
1 | /* libu8ident - Check unicode security guidelines for identifiers.
2 | Copyright 2021 Reini Urban
3 | SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
4 |
5 | generate roaring bitmaps for some sets.
6 | */
7 |
8 | #include
9 | #include "roaring.c"
10 | #include "u8id_private.h"
11 | #include "u8id_gc.h"
12 | #include "scripts.h"
13 | #include "confus.h"
14 | #include "mark.h"
15 |
/* Selects the table handed to serialize(); serialize() maps each value to
   its own "*_croar.bin" output file. */
enum what_list {
ALLOWED_ID_LIST, // written to allowed_croar.bin
CONFUSABLES, // written to confus_croar.bin; the only plain uint32_t[] input
MARK, // written to mark_croar.bin
NFD_N, // written to nfd_n_croar.bin
NFC_N, // written to nfc_n_croar.bin
NFC_M, // written to nfc_m_croar.bin
NFKD_N, // written to nfkd_n_croar.bin
NFKC_N, // written to nfkc_n_croar.bin
NFKC_M // written to nfkc_m_croar.bin
};
27 |
28 | int serialize(size_t size, const uint32_t *list, enum what_list what) {
29 | FILE *f;
30 | roaring_bitmap_t *rb = roaring_bitmap_create_with_capacity(size);
31 | roaring_statistics_t stat;
32 | const char *file;
33 | if (what == CONFUSABLES) { // simple uint32_t[]
34 | for (uint32_t i = 0; i < size; i++)
35 | roaring_bitmap_add(rb, list[i]);
36 | } else /* if (what == ALLOWED_ID_LIST) */ { // struct range_bool
37 | const struct range_bool *blist = (const struct range_bool *)list;
38 | for (uint32_t i = 0; i < size; i++) {
39 | for (uint32_t cp = blist[i].from; cp <= blist[i].to; cp++) {
40 | roaring_bitmap_add(rb, cp);
41 | }
42 | }
43 | }
44 | switch (what) {
45 | case ALLOWED_ID_LIST:
46 | file = "allowed_croar.bin";
47 | break;
48 | case CONFUSABLES:
49 | file = "confus_croar.bin";
50 | break;
51 | case MARK:
52 | file = "mark_croar.bin";
53 | break;
54 | /* NFD_N, NFC_N, NFC_M, NFKD_N, NFKC_N, NFKC_M */
55 | case NFD_N:
56 | file = "nfd_n_croar.bin";
57 | break;
58 | case NFC_N: // this might be slower than binary search
59 | file = "nfc_n_croar.bin";
60 | break;
61 | case NFC_M:
62 | file = "nfc_m_croar.bin";
63 | break;
64 | case NFKD_N:
65 | file = "nfkd_n_croar.bin";
66 | break;
67 | case NFKC_N:
68 | file = "nfkc_n_croar.bin";
69 | break;
70 | case NFKC_M:
71 | file = "nfkc_m_croar.bin";
72 | break;
73 | default:
74 | fprintf(stderr, "Unhandled case %d\n", what);
75 | exit(1);
76 | }
77 |
78 | f = fopen(file, "w");
79 | uint32_t sizebefore = roaring_bitmap_portable_size_in_bytes(rb);
80 | roaring_bitmap_run_optimize(rb);
81 |
82 | uint32_t sizeafter = roaring_bitmap_portable_size_in_bytes(rb);
83 | char *serializedbytes = malloc(sizeafter);
84 | roaring_bitmap_portable_serialize(rb, serializedbytes);
85 | fwrite(serializedbytes, 1, sizeafter, f);
86 | fclose(f);
87 | free(serializedbytes);
88 | printf("\nwrote %u serialized bytes to %s\n", sizeafter, file);
89 |
90 | printf("cardinality = %d\n", (int)roaring_bitmap_get_cardinality(rb));
91 | printf("size before/after optim: %u/%u\n", sizebefore, sizeafter);
92 | roaring_bitmap_statistics(rb, &stat);
93 | printf("n_bitset_containers = %u\n", stat.n_bitset_containers);
94 | printf("n_values_bitset_containers = %u\n", stat.n_values_bitset_containers);
95 | printf("n_bytes_bitset_containers = %u\n", stat.n_bytes_bitset_containers);
96 | printf("n_array_containers = %u\n", stat.n_array_containers);
97 | printf("n_values_array_containers = %u\n", stat.n_values_array_containers);
98 | printf("n_bytes_array_containers = %u\n", stat.n_bytes_array_containers);
99 | printf("n_run_containers = %u\n", stat.n_run_containers);
100 | printf("n_values_run_containers = %u\n", stat.n_values_run_containers);
101 | printf("n_bytes_run_containers = %u\n", stat.n_bytes_run_containers);
102 | roaring_bitmap_free(rb);
103 | return 0;
104 | }
105 |
106 | int main(int argc, char **argv) {
107 | if (argc < 2 || strcmp(argv[1], "confus") == 0)
108 | serialize(ARRAY_SIZE(confusables), confusables, CONFUSABLES);
109 | if (argc > 1 && strcmp(argv[1], "confus") == 0)
110 | exit(0);
111 |
112 | if (argc < 2 || strcmp(argv[1], "mark") == 0)
113 | serialize(ARRAY_SIZE(mark_list), (const uint32_t *)mark_list, MARK);
114 | if (argc > 1 && strcmp(argv[1], "mark") == 0)
115 | exit(0);
116 |
117 | serialize(ARRAY_SIZE(allowed_id_list), (const uint32_t *)allowed_id_list,
118 | ALLOWED_ID_LIST);
119 | /* NFD_N, NFC_N, NFC_M, NFKD_N, NFKC_N, NFKC_M */
120 | serialize(ARRAY_SIZE(NFD_N_list), (const uint32_t *)NFD_N_list, NFD_N);
121 | serialize(ARRAY_SIZE(NFC_N_list), (const uint32_t *)NFC_N_list, NFC_N);
122 | serialize(ARRAY_SIZE(NFC_M_list), (const uint32_t *)NFC_M_list, NFC_M);
123 | serialize(ARRAY_SIZE(NFKD_N_list), (const uint32_t *)NFKD_N_list, NFKD_N);
124 | serialize(ARRAY_SIZE(NFKC_N_list), (const uint32_t *)NFKC_N_list, NFKC_N);
125 | serialize(ARRAY_SIZE(NFKC_M_list), (const uint32_t *)NFKC_M_list, NFKC_M);
126 |
127 | return 0;
128 | }
129 |
--------------------------------------------------------------------------------
/mktest-norm.pl:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env perl
2 | use Unicode::Normalize qw(NFC NFKC NFD NFKD FCD FCC);
3 | use Encode;
# Render the UTF-8 encoding of a string as a run of \xNN escapes.
sub wstr($) {
  my ($str) = @_;
  my @octets = unpack 'W*', encode_utf8($str);
  return join '', map { sprintf '\x%02x', $_ } @octets;
}
binmode *STDOUT, ":utf8";
# all NFC letters different from NFKC
# unichars -a '\pL' 'NFC ne NFKC'
# e.g. U+1C5 U+140

# Sample strings, each printed below in every normalization form.
my @samples = (
  "Cafe\x{301}", "Caf\x{e9}",
  "\x{1f87}", "\x{03B1}\x{0314}\x{0342}\x{0345}",
  "\x{1c5}\x{140}", "D\x{17e}l\x{b7}",
);
# Output format and normalizer, in print order.
my @forms = (
  ["NFC: %s [%s]\n",  \&NFC],
  ["NFKC: %s [%s]\n", \&NFKC],
  ["FCC: %s [%s]\n",  \&FCC],
  ["NFD: %s [%s]\n",  \&NFD],
  ["NFKD: %s [%s]\n", \&NFKD],
  ["FCD: %s [%s]\n",  \&FCD],
);

for my $sample (@samples) {
  #"\xe1\xbe\x87"
  print "$sample [", wstr($sample), "]:\n";
  for my $form (@forms) {
    my ($fmt, $normalize) = @$form;
    my $normalized = $normalize->($sample);
    printf $fmt, $normalized, wstr($normalized);
  }
  print "\n";
}
27 |
--------------------------------------------------------------------------------
/test-all-fast.sh:
--------------------------------------------------------------------------------
#!/bin/sh
# Run the check and check-asan targets of makefile.gnu, first with the
# default defines and then once per normalization/profile combination.
# Stops at the first failing command.
set -e
MAKE_GNU="make -f makefile.gnu"

$MAKE_GNU clean
echo "check -UU8ID_NORM -UU8ID_PROFILE"
$MAKE_GNU check
$MAKE_GNU check-asan
$MAKE_GNU clean

for norm in NFC NFD NFKC NFKD FCD FCC; do
    for profile in 2 3 4 5 6 C23_4 C11_6; do
        defs="-DU8ID_NORM=$norm -DU8ID_PROFILE=$profile"
        echo "check $defs"
        $MAKE_GNU check DEFS="$defs"
        $MAKE_GNU check-asan DEFS="$defs"
        $MAKE_GNU clean
    done
done
16 |
--------------------------------------------------------------------------------
/test-all.sh:
--------------------------------------------------------------------------------
#!/bin/sh
# Configure and run check and check-asan, first with the defaults and then
# once per normalization/profile combination.
# NOTE(review): unlike test-all-fast.sh there is no `set -e` here, so a failing
# step does not stop the run - confirm that this is intended.
make clean
echo "check -UU8ID_NORM -UU8ID_PROFILE"
if ./configure; then
    make check
fi
make check-asan
make clean

for norm in NFKC NFC NFKD NFD FCC FCD; do
    for profile in 2 3 4 5 6; do
        echo "check -DU8ID_NORM=$norm -DU8ID_PROFILE=$profile"
        if ./configure --with-norm=$norm --with-profile=$profile; then
            make check
        fi
        make check-asan
        make clean
    done
done
15 |
16 |
--------------------------------------------------------------------------------
/test-texts.c:
--------------------------------------------------------------------------------
1 | /* libu8ident - Check unicode security guidelines for identifiers.
2 | Copyright 2021, 2022 Reini Urban
3 | SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
4 |
5 | Check some files, wordbreak them to valid identifiers in some common scripts.
6 | */
7 | #include "u8id_private.h"
8 | #include
9 | #include
10 | #include
11 | #include
12 | #include
13 | #ifdef HAVE_SYS_TYPES_H
14 | # include
15 | #endif
16 | #ifdef HAVE_SYS_STAT_H
17 | # include
18 | #endif
19 | #ifdef HAVE_DIRENT_H
20 | # include
21 | #endif
22 | #if defined HAVE_DIRENT_H && !defined _MSC_VER
23 | # include
24 | #endif
25 | #ifdef _MSC_VER
26 | # define WIN32_LEAN_AND_MEAN
27 | # include
28 | #endif
29 | #ifdef HAVE_LIBGEN_H
30 | # include
31 | #endif
32 |
33 | #include "u8ident.h"
34 | #if defined HAVE_UNIWBRK_H && defined HAVE_LIBUNISTRING
35 | # include "uniwbrk.h"
36 | #endif
37 | #include "u8idscr.h"
38 | //#undef EXTERN_SCRIPTS
39 | #include "unic11.h"
40 |
41 | int verbose = 0;
42 | // private access
43 | unsigned u8ident_options(void);
44 | enum u8id_profile u8ident_profile(void);
45 | char *enc_utf8(char *dest, size_t *lenp, const uint32_t cp);
46 |
47 | #ifdef HAVE_SYS_STAT_H
48 | static int file_exists(const char *path) {
49 | struct stat st;
50 | return (stat(path, &st) == 0) && (st.st_mode & S_IFDIR) != S_IFDIR;
51 | }
52 | #elif defined _MSC_VER
53 | static int file_exists(const char *path) {
54 | const uint16_t dwAttrib = GetFileAttributes(path);
55 | return (dwAttrib != INVALID_FILE_ATTRIBUTES &&
56 | !(dwAttrib & FILE_ATTRIBUTE_DIRECTORY);
57 | }
58 | #endif
59 |
60 | int testdir(const char *dir, const char *fname) {
61 | char path[256];
62 | static char line[1024] = {0};
63 | unsigned ln = 0;
64 | #if defined HAVE_UNIWBRK_H && defined HAVE_LIBUNISTRING
65 | static char brks[1024] = {0};
66 | #endif
67 | static char word[128] = {0};
68 | if (!dir) {
69 | strncpy(path, fname, sizeof(path) - 1);
70 | } else {
71 | strncpy(path, dir, sizeof(path) - 1);
72 | path[255] = '\0';
73 | strncat(path, "/", sizeof(path) - 1);
74 | path[255] = '\0';
75 | strncat(path, fname, sizeof(path) - 1);
76 | path[255] = '\0';
77 | }
78 |
79 | FILE *f = fopen(path, "r");
80 | if (!f) {
81 | perror("fopen");
82 | fprintf(stderr, "texts/%s\n", fname);
83 | return -1;
84 | }
85 |
86 | printf("-- texts/%s\n", fname);
87 | int ctx = u8ident_new_ctx();
88 | // while (fscanf(f, " %1023s", word) == 1)
89 | // Check now also against libunistring: u8_wordbreaks
90 | while (fgets(line, 1023, f)) {
91 | char *s = &line[0];
92 | bool prev_isword = false;
93 | char *wp = &word[0];
94 | ln++;
95 | *word = '\0';
96 | #if defined HAVE_UNIWBRK_H && defined HAVE_LIBUNISTRING
97 | u8_wordbreaks((uint8_t *)s, strlen(s), brks);
98 | #endif
99 | while (*s) {
100 | char *olds = s;
101 | uint32_t cp = dec_utf8(&s);
102 | if (!cp) {
103 | printf("ERROR %s illegal UTF-8 at line %u, col %lu\n", olds, ln,
104 | s - olds);
105 | exit(1);
106 | }
107 |
108 | // unicode #29 word-break, but simplified:
109 | // must not split at continuations (Combining marks). e.g. for
110 | // texts/arabic-1.txt
111 | const bool iscont = isXID_cont(cp);
112 | bool isword = prev_isword ? (isXID_start(cp) || iscont) : isXID_start(cp);
113 | char force_break = (prev_isword != isword && !iscont);
114 | #if defined HAVE_UNIWBRK_H && defined HAVE_LIBUNISTRING
115 | if (force_break != brks[s - olds] && verbose)
116 | /* break at: if libunistring found a break, but we not.
117 | no break at: if we found a break, but libunistring not. */
118 | fprintf(stderr, "WARN: %sbreak at U+%X, line %u, col %lu\n",
119 | force_break ? "" : "no ", cp, ln, s - olds);
120 | // don't rely in the CI on an optional lib
121 | // force_break = brks[s - olds];
122 | #endif
123 | // first, or changed from non-word to word, and is no mark (continuation)
124 | if (olds == &line[0] || force_break) {
125 | prev_isword = isword;
126 | if (isword) {
127 | int l = s - olds;
128 | if (l == 1) {
129 | *wp++ = *olds;
130 | } else if (wp + l < &word[128]) {
131 | memcpy(wp, olds, l);
132 | wp += l;
133 | }
134 | continue; // started new word
135 | } else { // word-end: fall-through to word check
136 | *wp = '\0';
137 | }
138 | } else { // no change. in word or non-word
139 | if (isword) {
140 | int l = s - olds;
141 | if (l == 1) {
142 | *wp++ = *olds;
143 | } else if (wp + l < &word[128]) {
144 | memcpy(wp, olds, l);
145 | wp += l;
146 | }
147 | }
148 | if (*s != '\n')
149 | continue;
150 | }
151 | // bad case "\xd8\xa8\xd8\xb1\xd9\x88\xd8\xad" "بروح" Arabic
152 | // also bad case "أحرارًا" Arabic at pos 12 (missing basesc)
153 | if (!*wp && *word && force_break) { // non-empty word-end
154 | int ret = u8ident_check((uint8_t *)word, NULL);
155 | const char *scripts = u8ident_existing_scripts(ctx);
156 | printf("%s: %s (%s", word, u8ident_errstr(ret), scripts);
157 | if (ret < 0) {
158 | const uint32_t cp = u8ident_failed_char(ctx);
159 | const uint8_t scr = u8ident_get_script(cp);
160 | if (scr != SC_Unknown)
161 | printf(" + U+%X %s)!\n", cp, u8ident_script_name(scr));
162 | else
163 | printf(" + U+%X)!\n", cp);
164 | } else
165 | printf(")\n");
166 | free((char *)scripts);
167 | *word = '\0';
168 | wp = &word[0];
169 | }
170 | }
171 | }
172 |
173 | u8ident_free_ctx(ctx);
174 | fclose(f);
175 | return 0;
176 | }
177 |
/* qsort comparator for an array of C strings: A and B point to the array
   elements (char pointers), which are compared with strcmp. */
int cmp_str(const void *a, const void *b) {
  const char *const *lhs = (const char *const *)a;
  const char *const *rhs = (const char *const *)b;
  return strcmp(*lhs, *rhs);
}
181 |
182 | int main(int argc, char **argv) {
183 | int i = 1;
184 | char *dirname = "texts";
185 | u8ident_init(U8ID_PROFILE_DEFAULT, U8ID_NORM_DEFAULT, U8ID_TR31_XID);
186 |
187 | if (getenv("U8IDTEST_TEXTS"))
188 | dirname = getenv("U8IDTEST_TEXTS");
189 | else if (argc > 1 && strEQc(argv[1], "-v")) {
190 | verbose++;
191 | i++;
192 | }
193 |
194 | #if !defined _MSC_VER && defined HAVE_SYS_STAT_H
195 | if (argc > i && file_exists(argv[i])) {
196 | while (argc > i && file_exists(argv[i])) {
197 | testdir(NULL, argv[i++]);
198 | }
199 | u8ident_free();
200 | return 0;
201 | }
202 | if (argc > i && !file_exists(argv[i])) {
203 | dirname = argv[i];
204 | }
205 | #endif
206 |
207 | #if defined HAVE_DIRENT_H && !defined _MSC_VER
208 | DIR *dir = opendir(dirname);
209 | if (!dir) {
210 | perror("opendir");
211 | exit(1);
212 | }
213 | #elif defined _MSC_VER
214 | WIN32_FIND_DATA FindFileData;
215 | HANDLE hdir;
216 | #endif
217 |
218 | const char *const exts[] = {".txt", ".c"};
219 | for (size_t j = 0; j < ARRAY_SIZE(exts); j++) {
220 | const char *ext = exts[j];
221 | int s = 0;
222 | size_t lext = strlen(ext);
223 |
224 | #if defined HAVE_DIRENT_H && !defined _MSC_VER
225 | struct dirent *d;
226 | # define NEXT_FILE (d = readdir(dir))
227 | # define CUR_FILE d->d_name
228 | #elif defined _MSC_VER
229 | hdir = FindFirstFile(dirname, &FindFileData);
230 | if (hdir == INVALID_HANDLE_VALUE)
231 | continue;
232 | # define NEXT_FILE FindNextFile(hdir, &FindFileData)
233 | # define CUR_FILE FindFileData.cFileName
234 | #endif
235 |
236 | // sort the names, to compare against the result
237 | while (NEXT_FILE) {
238 | size_t l = strlen(CUR_FILE);
239 | if (l > lext && '.' != CUR_FILE[0] && strEQ(&CUR_FILE[l - lext], ext)) {
240 | s++;
241 | }
242 | }
243 |
244 | #if defined HAVE_DIRENT_H && !defined _MSC_VER
245 | rewinddir(dir);
246 | #elif defined _MSC_VER
247 | FindClose(hdir);
248 | hdir = FindFirstFile(dirname, &FindFileData);
249 | #endif
250 |
251 | const char **files = calloc(s, sizeof(char *));
252 | int i = 0;
253 | while (NEXT_FILE) {
254 | size_t l = strlen(CUR_FILE);
255 | if (l > lext && '.' != CUR_FILE[0] && strEQ(&CUR_FILE[l - lext], ext)) {
256 | if (i >= s)
257 | break;
258 | assert(i < s);
259 | files[i] = malloc(l + 1);
260 | strcpy((char *)files[i], CUR_FILE);
261 | i++;
262 | }
263 | }
264 | if (s)
265 | qsort(files, s, sizeof(char *), cmp_str);
266 | for (i = 0; i < s; i++) {
267 | // printf("%s\n", files[i]);
268 | testdir(dirname, files[i]);
269 | }
270 | for (int i = 0; i < s; i++)
271 | free((void *)files[i]);
272 | free(files);
273 | }
274 | #if defined HAVE_DIRENT_H && !defined _MSC_VER
275 | closedir(dir);
276 | #elif defined _MSC_VER
277 | FindClose(hdir);
278 | #endif
279 |
280 | u8ident_free();
281 | return 0;
282 | }
283 |
--------------------------------------------------------------------------------
/test-texts.test.in:
--------------------------------------------------------------------------------
#!/bin/sh
# Run test-texts on the source tree's texts directory and compare its output
# with the recorded result list. The output file is removed only when the
# comparison succeeds; set -e stops at the first failing step.
set -e
U8IDTEST_TEXTS="@top_srcdir@/texts" @LTEXEC@ ./test-texts@EXEEXT@ > texts.rst
diff "@top_srcdir@/texts/result.lst" texts.rst
rm texts.rst
5 |
--------------------------------------------------------------------------------
/texts/amharic-1.txt:
--------------------------------------------------------------------------------
1 | የግዕዝ ፊደል
2 | ሁሉም የሰው ልጆች የተወለዱት በነፃነት እና በክብር እና በመብት እኩል ናቸው። የማሰብ እና የህሊና ችሎታ ተሰጥቷቸው እርስ በርሳቸው በወንድማማችነት መንፈስ መተጋገዝ አለባቸው።
3 |
--------------------------------------------------------------------------------
/texts/arabic-1.c:
--------------------------------------------------------------------------------
1 | #include
2 | int الناس = 0;
3 |
4 | int الإء() {
5 | return الناس;
6 | }
7 | int main() {
8 | int ير = 1;
9 | assert(ير == 1 );
10 | return الإء();
11 | /* يولد جميع الناس أحرارًا ومتساوين في الكرامة والحقوق. وقد وهبوا عقلاً وضميرًا وعليهم أن يعامل بعضهم بعضًا بروح الإء. */
12 | }
13 |
--------------------------------------------------------------------------------
/texts/arabic-1.txt:
--------------------------------------------------------------------------------
1 | نص عربي
2 | يولد جميع الناس أحرارًا ومتساوين في الكرامة والحقوق. وقد وهبوا عقلاً وضميرًا وعليهم أن يعامل بعضهم بعضًا بروح الإخاء.
3 |
--------------------------------------------------------------------------------
/texts/armenian-1.txt:
--------------------------------------------------------------------------------
1 | Հայ գիր
2 | Բոլոր մարդիկ ծնվում են ազատ և հավասար արժանապատվության և իրավունքների մեջ: Նրանք օժտված են բանականությամբ և խղճով և պետք է միմյանց հանդեպ վարվեն եղբայրության ոգով:
3 |
--------------------------------------------------------------------------------
/texts/bengali-1.txt:
--------------------------------------------------------------------------------
1 | বাংলা লিপি
2 | সমস্ত মানুষ স্বাধীনভাবে জন্মগ্রহণ করে এবং মর্যাদা ও অধিকারে সমান। তারা যুক্তি এবং বিবেক দ্বারা সমৃদ্ধ এবং ভ্রাতৃত্বের চেতনায় একে অপরের প্রতি আচরণ করা উচিত।
3 |
--------------------------------------------------------------------------------
/texts/bidi-sec-1-allowed.tst:
--------------------------------------------------------------------------------
1 | texts/bidi-sec-1.c
2 | admin.: ERR_XID (Latin + U+2E Common)!
3 |
--------------------------------------------------------------------------------
/texts/bidi-sec-1-c11.tst:
--------------------------------------------------------------------------------
1 | texts/bidi-sec-1.c
2 | : ERR_SCRIPT (Latin + U+202E Common)!
3 | if: ERR_SCRIPT (Latin + U+2066 Common)!
4 | : ERR_SCRIPT (Latin + U+2069 Common)!
5 | : ERR_SCRIPT (Latin + U+2066 Common)!
6 | : ERR_SCRIPT (Latin + U+202E Common)!
7 | : ERR_SCRIPT (Latin + U+2066 Common)!
8 |
--------------------------------------------------------------------------------
/texts/bidi-sec-1.c:
--------------------------------------------------------------------------------
1 | #include
2 | #include
3 | // RLO U+202E, LRE U+202A, PDI U+2069, LRI U+2066
4 |
5 | int main() {
6 | bool isAdmin = false;
7 | /* } if (isAdmin) begin admins only */
8 | printf("You are an admin.\n");
9 | /* end admins only { */
10 | return 0;
11 | }
12 |
--------------------------------------------------------------------------------
/texts/bidi-sec-1.py:
--------------------------------------------------------------------------------
1 | s = "א" * 100 # "א" is assigned
2 | print(s)
3 |
--------------------------------------------------------------------------------
/texts/bidi-sec-1.tst:
--------------------------------------------------------------------------------
1 | texts/bidi-sec-1.c
2 |
--------------------------------------------------------------------------------
/texts/bidi-sec-2-allowed.tst:
--------------------------------------------------------------------------------
1 | texts/bidi-sec-2.c
2 |
3 | : ERR_BIDI (Latin + U+202E)!
4 |
5 | : ERR_BIDI (Latin + U+2066)!
6 | // Check if admin)
7 | : ERR_BIDI (Latin + U+202E)!
8 | // Check if admin)
9 | : ERR_BIDI (Latin + U+2066)!
10 | )
11 | : ERR_BIDI (Latin + U+2069)!
12 | )
13 | : ERR_BIDI (Latin + U+2066)!
14 | admin.: ERR_XID (Latin + U+2E Common)!
15 |
--------------------------------------------------------------------------------
/texts/bidi-sec-2-ascii.tst:
--------------------------------------------------------------------------------
1 | texts/bidi-sec-2.c
2 |
3 | : ERR_BIDI (Latin + U+202E)!
4 |
5 | : ERR_BIDI (Latin + U+2066)!
6 | // Check if admin)
7 | : ERR_BIDI (Latin + U+202E)!
8 | // Check if admin)
9 | : ERR_BIDI (Latin + U+2066)!
10 | )
11 | : ERR_BIDI (Latin + U+2069)!
12 | )
13 | : ERR_BIDI (Latin + U+2066)!
14 |
--------------------------------------------------------------------------------
/texts/bidi-sec-2-c11.tst:
--------------------------------------------------------------------------------
1 | texts/bidi-sec-2.c
2 | : ERR_SCRIPT (Latin + U+202E Common)!
3 | USER: ERR_SCRIPT (Latin + U+202E Common)!
4 | admin: ERR_SCRIPT (Latin + U+2069 Common)!
5 |
--------------------------------------------------------------------------------
/texts/bidi-sec-2.c:
--------------------------------------------------------------------------------
1 | #include
2 | #define USER 0
3 | #define ADMIN 1
4 | // RLO U+202E, LRE U+202A, PDI U+2069, LRI U+2066
5 |
6 | int main() {
7 | int accessLevel = USER;
8 | if (accessLevel != USER// Check if admin)
9 | printf("You are an admin.\n");
10 | return 0;
11 | }
12 |
--------------------------------------------------------------------------------
/texts/bidi-sec-2.tst:
--------------------------------------------------------------------------------
1 | texts/bidi-sec-2.c
2 |
3 | : ERR_BIDI (Latin + U+202E)!
4 |
5 | : ERR_BIDI (Latin + U+2066)!
6 | // Check if admin)
7 | : ERR_BIDI (Latin + U+202E)!
8 | // Check if admin)
9 | : ERR_BIDI (Latin + U+2066)!
10 | )
11 | : ERR_BIDI (Latin + U+2069)!
12 | )
13 | : ERR_BIDI (Latin + U+2066)!
14 |
--------------------------------------------------------------------------------
/texts/bidi-sec-3.c:
--------------------------------------------------------------------------------
1 | #include
2 | // RLI U+2067
3 |
4 | int main() {
5 | /* Say hello; newline /*/ return 0 ;
6 | printf("Hello world\n");
7 | return 0;
8 | }
9 |
--------------------------------------------------------------------------------
/texts/bidi-sec-4.cc:
--------------------------------------------------------------------------------
1 | #include
2 |
3 | int main() {
4 | constexpr char password[] = "12345";
5 | printf(R"("א(ב)"? password: ")");
6 | }
7 |
--------------------------------------------------------------------------------
/texts/bopomofo-1.txt:
--------------------------------------------------------------------------------
1 | ㄅㄞˇ ㄎㄜ ㄑㄩㄢˊ ㄕㄨ 百科全書 百科全书
2 | ㄓㄨˋ ㄧㄣ ㄈㄨˊ ㄏㄠˋ
3 |
--------------------------------------------------------------------------------
/texts/bulgarian-1.txt:
--------------------------------------------------------------------------------
1 | български (кирилица)
2 | Всички човешки същества се раждат свободни и равни по достойнство и права. Те са надарени с разум и съвест и трябва да се отнасят един към друг в дух на братство.
3 |
--------------------------------------------------------------------------------
/texts/chinese-s-1.txt:
--------------------------------------------------------------------------------
1 | 简体中文)
2 | 人人生而自由,在尊严和权利上一律平等。 他们被赋予了理性和良知,应该本着兄弟情谊的精神相互对待。
3 |
--------------------------------------------------------------------------------
/texts/chinese-trad-1.txt:
--------------------------------------------------------------------------------
1 | 中國傳統的)
2 | 人人生而自由,在尊嚴和權利上一律平等。 他們被賦予了理性和良知,應該本著兄弟情誼的精神相互對待。
3 |
--------------------------------------------------------------------------------
/texts/cyrillic-1.txt:
--------------------------------------------------------------------------------
1 | Кириллица
2 | Все люди рождаются свободными и равными в своем достоинстве и правах. Они наделены разумом и совестью и должны действовать по отношению друг к другу в духе братства.
3 |
--------------------------------------------------------------------------------
/texts/english-1.txt:
--------------------------------------------------------------------------------
1 | All human beings are born free and equal in dignity and rights. They are endowed with reason and conscience and should act towards one another in a spirit of brotherhood.
2 |
--------------------------------------------------------------------------------
/texts/farsi-1.txt:
--------------------------------------------------------------------------------
1 | خط فارسی
2 | همه انسانها آزاد به دنیا می آیند و از نظر حیثیت و حقوق با هم برابرند. آنها دارای عقل و وجدان هستند و باید نسبت به یکدیگر با روحیه برادری رفتار کنند.
3 |
--------------------------------------------------------------------------------
/texts/georgian-1.txt:
--------------------------------------------------------------------------------
1 | ქართული დამწერლობა
2 | ყველა ადამიანი იბადება თავისუფალი და თანასწორი ღირსებითა და უფლებებით. ისინი დაჯილდოვებულნი არიან გონიერებითა და სინდისით და ერთმანეთის მიმართ ძმობის სულისკვეთებით უნდა მოიქცნენ.
3 |
--------------------------------------------------------------------------------
/texts/greek-1.txt:
--------------------------------------------------------------------------------
1 | Ελληνική γραφή
2 | Όλοι οι άνθρωποι γεννιούνται ελεύθεροι και ίσοι σε αξιοπρέπεια και δικαιώματα. Είναι προικισμένοι με λογική και συνείδηση και πρέπει να ενεργούν ο ένας προς τον άλλον με πνεύμα αδελφοσύνης.
3 |
--------------------------------------------------------------------------------
/texts/gujarati-1.txt:
--------------------------------------------------------------------------------
1 | ગુજરાતી લિપિ
2 | બધા મનુષ્યો સ્વતંત્ર જન્મે છે અને ગૌરવ અને અધિકારોમાં સમાન છે. તેઓ તર્ક અને વિવેકથી સંપન્ન છે અને તેઓ એકબીજા પ્રત્યે ભાઈચારાની ભાવનાથી વર્તે છે.
3 |
--------------------------------------------------------------------------------
/texts/hebrew-1.txt:
--------------------------------------------------------------------------------
1 | כתב עברי
2 | כל בני האדם נולדו חופשיים ושווים בכבוד ובזכויות. הם ניחנים בשכל ובמצפון וצריכים לפעול זה כלפי זה ברוח של אחווה.
3 |
--------------------------------------------------------------------------------
/texts/hindi-1.txt:
--------------------------------------------------------------------------------
1 | देवनागरी लिपि
2 | सभी मनुष्य स्वतंत्र पैदा होते हैं और सम्मान और अधिकारों में समान होते हैं। वे तर्क और विवेक से संपन्न हैं और उन्हें भाईचारे की भावना से एक दूसरे के प्रति कार्य करना चाहिए।
3 |
--------------------------------------------------------------------------------
/texts/homo-1-p4.tst:
--------------------------------------------------------------------------------
1 | texts/homo-1.c
2 | а: ERR_SCRIPTS (Latin + U+430 Cyrillic)!
3 | а: ERR_SCRIPTS (Latin + U+430 Cyrillic)!
4 |
--------------------------------------------------------------------------------
/texts/homo-1.c:
--------------------------------------------------------------------------------
1 | #include
2 | int main() {
3 | int a = 2; // which is which?
4 | int а = 3;
5 | assert(a == 2); // OK
6 | assert(а == 3); // OK
7 | return 0;
8 | }
9 |
--------------------------------------------------------------------------------
/texts/homo-1.tst:
--------------------------------------------------------------------------------
1 | texts/homo-1.c
2 | а: ERR_SCRIPTS (Latin + U+430 Cyrillic)!
3 | а: ERR_SCRIPTS (Latin + U+430 Cyrillic)!
4 |
--------------------------------------------------------------------------------
/texts/homo-sec-1-p1.tst:
--------------------------------------------------------------------------------
1 | texts/homo-sec-1.c
2 | сhесk: ERR_XID (Latin + U+441 Cyrillic)!
3 | СНЕСК: ERR_XID (Latin + U+421 Cyrillic)!
4 |
--------------------------------------------------------------------------------
/texts/homo-sec-1.c:
--------------------------------------------------------------------------------
1 | #include
2 |
3 | int CHECK (const char *arg) {
4 | return strcmp(arg, "check") == 0;
5 | }
6 | int СНЕСК (const char *arg) {
7 | return strcmp(arg, "сhесk") == 0;
8 | }
9 |
10 | int main () {
11 | return СНЕСК("check");
12 | }
13 |
--------------------------------------------------------------------------------
/texts/homo-sec-1.js:
--------------------------------------------------------------------------------
1 | const [ ENV_PROD, ENV_DEV ] = [ 'PRODUCTION', 'DEVELOPMENT'];
2 | /* … */
3 | const environment = 'PRODUCTION';
4 | /* … */
5 | function isUserAdmin(user) {
6 | if(environmentǃ=ENV_PROD){
7 | // bypass authZ checks in DEV
8 | return true;
9 | }
10 |
11 | /* … */
12 | return false;
13 | }
14 |
--------------------------------------------------------------------------------
/texts/homo-sec-1.py:
--------------------------------------------------------------------------------
1 | print (int('৪୨'))
2 | # => 42
3 | print ('{٥}'.format('zero', 'one', 'two', 'three', 'four', 'five'))
4 | # => five
5 |
--------------------------------------------------------------------------------
/texts/homo-sec-1.tst:
--------------------------------------------------------------------------------
1 | texts/homo-sec-1.c
2 | сhесk: ERR_SCRIPTS (Latin + U+441 Cyrillic)!
3 | СНЕСК: ERR_SCRIPTS (Latin + U+421 Cyrillic)!
4 |
--------------------------------------------------------------------------------
/texts/homo-sec-2.c:
--------------------------------------------------------------------------------
1 | #include
2 |
3 | enum env_e {
4 | ENV_PRODUCTION,
5 | ENV_DEVELOPMENT
6 | };
7 | /* … */
8 | const enum env_e environment = ENV_PRODUCTION;
9 | /* … */
10 | bool isUserAdmin(const char *user) {
11 | if(environmentǃ=ENV_PRODUCTION){
12 | // bypass authZ checks in DEVELOPMENT
13 | return true;
14 | }
15 |
16 | /* … */
17 | return false;
18 | }
19 |
--------------------------------------------------------------------------------
/texts/hungarian-1.txt:
--------------------------------------------------------------------------------
1 | Magyar
2 | Minden emberi lény szabadnak és egyenlőnek születik méltóságában és jogaiban. Ésszel és lelkiismerettel rendelkeznek, és a testvériség szellemében kell viselkedniük egymással.
3 |
--------------------------------------------------------------------------------
/texts/igbo-1.txt:
--------------------------------------------------------------------------------
1 | Igbo
2 | A mụrụ mmadụ niile nweere onwe ha nhata na ugwu na ikike. Enyere ha uche na akọ na uche ma kwesị ịkpakọrịta ibe ha n'ime mmụọ nke ụmụnna.
3 |
--------------------------------------------------------------------------------
/texts/japanese-1.txt:
--------------------------------------------------------------------------------
1 | 日本語の台本
2 | すべての人間は自由に生まれ、尊厳と権利において平等です。 彼らは理性と良心に恵まれており、兄弟愛の精神でお互いに向かって行動する必要があります。
3 |
--------------------------------------------------------------------------------
/texts/japanese-2.txt:
--------------------------------------------------------------------------------
1 | ワンワン
2 | Tシャツを3枚購入しました。
3 |
--------------------------------------------------------------------------------
/texts/kannada-1.txt:
--------------------------------------------------------------------------------
1 | ಕನ್ನಡ ಲಿಪಿ
2 | ಎಲ್ಲಾ ಮಾನವರು ಸ್ವತಂತ್ರವಾಗಿ ಮತ್ತು ಘನತೆ ಮತ್ತು ಹಕ್ಕುಗಳಲ್ಲಿ ಸಮಾನವಾಗಿ ಹುಟ್ಟಿದ್ದಾರೆ. ಅವರು ವಿವೇಚನೆ ಮತ್ತು ಆತ್ಮಸಾಕ್ಷಿಯನ್ನು ಹೊಂದಿದ್ದಾರೆ ಮತ್ತು ಸಹೋದರತ್ವದ ಮನೋಭಾವದಿಂದ ಪರಸ್ಪರ ವರ್ತಿಸಬೇಕು.
3 |
--------------------------------------------------------------------------------
/texts/khmer-1.txt:
--------------------------------------------------------------------------------
1 | អក្សរខ្មែរ
2 | មនុស្សគ្រប់រូបកើតមកមានសេរីភាព និងស្មើភាពគ្នាក្នុងសេចក្តីថ្លៃថ្នូរ និងសិទ្ធិ។ ពួកគេត្រូវបានផ្តល់ដោយហេតុផល និងមនសិការ ហើយគួរតែប្រព្រឹត្តចំពោះគ្នាទៅវិញទៅមកក្នុងស្មារតីភាតរភាព។
3 |
--------------------------------------------------------------------------------
/texts/korean-1.txt:
--------------------------------------------------------------------------------
1 | 한글 스크립트
2 | 모든 인간은 태어날 때부터 자유롭고 존엄성과 권리가 평등합니다. 그들은 이성과 양심을 부여받았으며 형제애의 정신으로 서로를 대해야 합니다.
3 |
--------------------------------------------------------------------------------
/texts/korean-2.txt:
--------------------------------------------------------------------------------
1 | ฝ오::ʉ
2 | $ฝ오::ʉ
3 |
4 |
--------------------------------------------------------------------------------
/texts/lao-1.txt:
--------------------------------------------------------------------------------
1 | ອັກສອນລາວ
2 | ມະນຸດທຸກຄົນເກີດມາມີເສລີພາບ ແລະສະເໝີພາບໃນກຽດສັກສີ ແລະສິດທິ. ເຂົາເຈົ້າມີເຫດຜົນແລະສະຕິຮູ້ສຶກຜິດຊອບ ແລະຄວນປະຕິບັດຕໍ່ກັນແລະກັນໃນຈິດໃຈຂອງພີ່ນ້ອງ.
3 |
--------------------------------------------------------------------------------
/texts/latin-1.txt:
--------------------------------------------------------------------------------
1 | aʙȼ1
2 | aʙȼ2
3 | aʙȼ3
--------------------------------------------------------------------------------
/texts/malayalam-1.txt:
--------------------------------------------------------------------------------
1 | മലയാളം ലിപി
2 | എല്ലാ മനുഷ്യരും സ്വതന്ത്രരും അന്തസ്സിലും അവകാശങ്ങളിലും തുല്യരായി ജനിച്ചവരാണ്. അവർ യുക്തിയും മനസ്സാക്ഷിയും ഉള്ളവരാണ്, അവർ പരസ്പരം സാഹോദര്യത്തിന്റെ മനോഭാവത്തിൽ പ്രവർത്തിക്കണം.
--------------------------------------------------------------------------------
/texts/marathi-1.txt:
--------------------------------------------------------------------------------
1 | मराठी लिपी
2 | सर्व माणसं स्वतंत्रपणे जन्माला येतात आणि सन्मान आणि अधिकारांमध्ये समान असतात. ते तर्क आणि विवेकाने संपन्न आहेत आणि त्यांनी एकमेकांशी बंधुभावाच्या भावनेने वागले पाहिजे.
3 |
--------------------------------------------------------------------------------
/texts/math-1.c:
--------------------------------------------------------------------------------
1 | #include
2 | #include
3 |
4 | int main(void) {
5 | int λ = 1;
6 | double π = M_PI;
7 | double π2 = M_PI * 2.;
8 | printf ("%d %f %f\n", λ, π, π2);
9 | }
10 |
--------------------------------------------------------------------------------
/texts/math-2.c:
--------------------------------------------------------------------------------
1 | #include
2 |
3 | int main()
4 | {
5 | double εₙ₊₁, εₙ, εₙ₋₁, β₁, β₂, β₃, Δtₙ, k;
6 | const double Δtₙ₊₁ = pow(εₙ₊₁, β₁/k) * pow(εₙ, β₂/k) * pow(εₙ₋₁, β₃/k) * Δtₙ;
7 | return 0;
8 | }
9 |
--------------------------------------------------------------------------------
/texts/math-2.cc:
--------------------------------------------------------------------------------
1 | // GH 12 wrong greek ERR_SCRIPTS with subscripts
2 | #include
3 |
4 | int main()
5 | {
6 | double εₙ₊₁, εₙ, εₙ₋₁, β₁, β₂, β₃, Δtₙ, k;
7 | const auto Δtₙ₊₁ = std::pow(εₙ₊₁, β₁/k) * std::pow(εₙ, β₂/k) * std::pow(εₙ₋₁, β₃/k) * Δtₙ;
8 | return 0;
9 | }
10 |
--------------------------------------------------------------------------------
/texts/myanmar-1.txt:
--------------------------------------------------------------------------------
1 | မြန်မာစာပဲဗျ။
2 | လူသားတိုင်းသည် ဂုဏ်သိက္ခာနှင့် အခွင့်အရေးများဖြင့် လွတ်လပ်စွာ မွေးဖွားလာကြသည်။ ၎င်းတို့သည် ဆင်ခြင်တုံတရားနှင့် အသိတရားနှင့် ပြည့်စုံပြီး အချင်းချင်း ညီရင်းအစ်ကို စိတ်ဓာတ်ဖြင့် ပြုမူသင့်သည်။
3 |
--------------------------------------------------------------------------------
/texts/nfc-1.c:
--------------------------------------------------------------------------------
1 | #include
2 | int main() {
3 | int Café = 1; // with Mn. Cafe\u0301 is not in NFC -Wnormalized
4 | assert(Café == 1); // Caf\u00e9 without Mn, same NFC
5 | return 0;
6 | }
7 |
--------------------------------------------------------------------------------
/texts/nfkc-1.c:
--------------------------------------------------------------------------------
1 | #include
2 | #include
3 |
4 | int main() {
5 | int ℌ = 1; // which is which?
6 | int Ⅸ = 2;
7 | assert(H == 1); // same NFKC as ℌ
8 | assert(IX == 2); // same NFKC as Ⅸ
9 | double ℯ = M_E;
10 | assert(e == M_E); // same NFKC as ℯ
11 | int ℨ = 3;
12 | assert(Z == 3); // same NFKC as ℨ
13 | return 0;
14 | }
15 |
--------------------------------------------------------------------------------
/texts/norm-sec-1.cperl:
--------------------------------------------------------------------------------
1 | use utf8;
2 | use feature 'unicode_strings';
3 | $x = 8;
4 | print $x²; # => 64
5 |
--------------------------------------------------------------------------------
/texts/norm-sec-1.py:
--------------------------------------------------------------------------------
1 | xⁿ = 8
2 | print(xn) # => 8
3 |
--------------------------------------------------------------------------------
/texts/norm-sec-1.raku:
--------------------------------------------------------------------------------
1 | my $x = 8;
2 | print $x²; # => 64
3 |
--------------------------------------------------------------------------------
/texts/odia-1.txt:
--------------------------------------------------------------------------------
1 | ଓଡିଆ ସ୍କ୍ରିପ୍ଟ |
2 | ସମସ୍ତ ମଣିଷ ମୁକ୍ତ ଏବଂ ସମ୍ମାନ ଏବଂ ଅଧିକାରରେ ସମାନ ଭାବରେ ଜନ୍ମଗ୍ରହଣ କରନ୍ତି | ସେମାନଙ୍କୁ କାରଣ ଏବଂ ବିବେକ ଦିଆଯାଇଛି ଏବଂ ଭାଇଚାରା ଭାବନାରେ ପରସ୍ପର ପ୍ରତି କାର୍ଯ୍ୟ କରିବା ଉଚିତ୍ |
3 |
--------------------------------------------------------------------------------
/texts/punjabi-1.txt:
--------------------------------------------------------------------------------
1 | ਗੁਰਮੁਖੀ ਲਿਪੀ
2 | ਸਾਰੇ ਮਨੁੱਖ ਅਜ਼ਾਦ ਪੈਦਾ ਹੋਏ ਹਨ ਅਤੇ ਸਨਮਾਨ ਅਤੇ ਅਧਿਕਾਰਾਂ ਵਿੱਚ ਬਰਾਬਰ ਹਨ। ਉਹ ਤਰਕ ਅਤੇ ਜ਼ਮੀਰ ਨਾਲ ਸੰਪੰਨ ਹਨ ਅਤੇ ਉਹਨਾਂ ਨੂੰ ਇੱਕ ਦੂਜੇ ਪ੍ਰਤੀ ਭਾਈਚਾਰੇ ਦੀ ਭਾਵਨਾ ਨਾਲ ਕੰਮ ਕਰਨਾ ਚਾਹੀਦਾ ਹੈ।
3 |
--------------------------------------------------------------------------------
/texts/scots-gaelic-1.txt:
--------------------------------------------------------------------------------
1 | Albannaich-gaelic
2 | Tha mac an duine air a bhreith an-asgaidh agus co-ionann ann an urram agus còirichean. Tha iad air an toirt seachad le adhbhar agus cogais agus bu chòir dhaibh a bhith ag obair an aghaidh a chèile ann an spiorad bràithreachas.
3 |
--------------------------------------------------------------------------------
/texts/sinhala-1.txt:
--------------------------------------------------------------------------------
1 | සිංහල අකුරු
2 | සියලුම මිනිසුන් නිදහස් හා සමාන ගෞරවයෙන් හා අයිතිවාසිකම්වලින් උපත ලබයි. ඔවුන් තර්කානුකූලව හා හෘදය සාක්ෂියෙන් යුක්ත වන අතර එකිනෙකාට සහෝදරත්වයෙන් යුතුව කටයුතු කළ යුතුය.
3 |
--------------------------------------------------------------------------------
/texts/swedish-1.txt:
--------------------------------------------------------------------------------
1 | latinsk skrift
2 | Alla människor är födda fria och lika i värdighet och rättigheter. De är utrustade med förnuft och samvete och bör handla mot varandra i en anda av broderskap.
3 |
--------------------------------------------------------------------------------
/texts/tamil-1.txt:
--------------------------------------------------------------------------------
1 | தமிழ் எழுத்து
2 | எல்லா மனிதர்களும் சுதந்திரமாகவும் கண்ணியத்திலும் உரிமைகளிலும் சமமாகப் பிறந்தவர்கள். அவர்கள் பகுத்தறிவும் மனசாட்சியும் கொண்டவர்கள் மற்றும் சகோதரத்துவ உணர்வோடு ஒருவருக்கொருவர் செயல்பட வேண்டும்.
3 |
--------------------------------------------------------------------------------
/texts/telugu-1.txt:
--------------------------------------------------------------------------------
1 | తెలుగు లిపి
2 | మానవులందరూ స్వేచ్ఛగా మరియు గౌరవం మరియు హక్కులలో సమానంగా జన్మించారు. వారు హేతువు మరియు మనస్సాక్షిని కలిగి ఉంటారు మరియు సోదరభావంతో ఒకరితో ఒకరు వ్యవహరించాలి.
3 |
--------------------------------------------------------------------------------
/texts/thai-1.txt:
--------------------------------------------------------------------------------
1 | อักษรไทย
2 | มนุษย์ทุกคนเกิดมาอย่างเสรีและเท่าเทียมกันในศักดิ์ศรีและสิทธิ พวกเขามีเหตุผลและมโนธรรมและควรปฏิบัติต่อกันด้วยจิตวิญญาณแห่งภราดรภาพ
3 |
--------------------------------------------------------------------------------
/texts/turkish-1.txt:
--------------------------------------------------------------------------------
1 | Türkçe
2 | Bütün insanlar hür, haysiyet ve haklar bakımından eşit doğarlar. Akıl ve vicdan sahibidirler ve birbirlerine karşı kardeşlik ruhuyla hareket etmelidirler.
3 |
--------------------------------------------------------------------------------
/texts/urdu-1.txt:
--------------------------------------------------------------------------------
1 | اردو
2 | تمام انسان آزاد پیدا ہوئے ہیں اور عزت اور حقوق میں برابر ہیں۔ وہ عقل اور ضمیر کے حامل ہیں اور انہیں بھائی چارے کے جذبے سے ایک دوسرے کے ساتھ برتاؤ کرنا چاہیے۔
3 |
--------------------------------------------------------------------------------
/texts/vietnamese-1.txt:
--------------------------------------------------------------------------------
1 | Chữ quốc ngữ
2 | Tất cả con người sinh ra đều tự do, bình đẳng về nhân phẩm và quyền. Họ được phú cho lý trí và lương tâm và nên hành động với nhau trong tinh thần anh em.
3 |
--------------------------------------------------------------------------------
/u8id_private.h:
--------------------------------------------------------------------------------
1 | #ifndef _U8ID_PRIVATE_H
2 | #define _U8ID_PRIVATE_H
3 |
4 | #ifdef HAVE_CONFIG_H
5 | # include "config.h"
6 | #endif
7 | #include
8 | #include
9 | #include
10 |
11 | #if defined _WIN32 || defined __CYGWIN__
12 | # define EXTERN __declspec(dllexport)
13 | # define LOCAL
14 | #elif __GNUC__ >= 4
15 | # define EXTERN __attribute__((visibility("default")))
16 | # define LOCAL __attribute__((visibility("hidden")))
17 | #else
18 | # define EXTERN
19 | # define LOCAL
20 | #endif
21 |
22 | #ifndef PERF_TEST
23 | // they are all too slow
24 | # undef USE_ALLOWED_CROAR
25 | # undef USE_MARK_CROAR
26 | # undef USE_NORM_CROAR
27 | #else
28 | # define USE_ALLOWED_CROAR
29 | # define USE_MARK_CROAR
30 | # define USE_NORM_CROAR
31 | #endif
32 |
33 | #if __GNUC__ >= 3
34 | # define _expect(expr, value) __builtin_expect((expr), (value))
35 | # define INLINE static inline
36 | #else
37 | # define _expect(expr, value) (expr)
38 | # define INLINE static
39 | #endif
40 | #ifndef likely
41 | # define likely(expr) _expect((long)((expr) != 0), 1)
42 | # define unlikely(expr) _expect((long)((expr) != 0), 0)
43 | #endif
44 |
45 | #define ARRAY_SIZE(x) (sizeof(x) / sizeof(*(x))) // element count of a true array; parenthesized so it composes inside larger expressions
46 | #define strEQ(s1, s2) (!strcmp((s1), (s2))) // non-zero when both strings compare equal
47 | #define strEQc(s1, s2) (!strcmp((s1), s2 "")) // as strEQ; the "" concatenation only compiles when s2 is a string literal
48 |
49 | #define NFC 0
50 | #define NFD 1
51 | #define NFKC 2
52 | #define NFKD 3
53 | #define FCD 4
54 | #define FCC 5
55 | #define C11_6 7
56 | #define C26_4 8
57 |
58 | // Allowed sets of identifiers: the TR31 --xid tokenizer options.
59 | // XID, the default, must stay first so that uninitialized (zero) options select it.
60 | enum xid_e {
61 | XID, // ID minus NFKC quirks, labelled stable, the default.
62 | ID, // all letters, plus numbers, punctuation and marks. With exotic scripts.
63 | ALLOWED, // TR39 ID with only recommended scripts. Allowed IdentifierStatus.
64 | SAFEC26, // practical XID with TR39 security measures, see P2528R1.
65 | C23, // XID with NFC requirement from C23 (P1949, N2828).
66 | C11, // the stable insecure AltId ranges from the C11 standard, Annex D.
67 | ALLUTF8, // all > 128, e.g. D, php, nim, crystal.
68 | ASCII, // only ASCII letters.
69 | };
70 | #define XID 0 // integer mirrors of the enumerators above, in the same order, so they can be compared in #if
71 | #define ID 1
72 | #define ALLOWED 2
73 | #define SAFEC26 3
74 | #define C23 4
75 | #define C11 5
76 | #define ALLUTF8 6
77 | #define ASCII 7
78 | #define NONE 8 // has no enumerator; U8ID_TR31 == NONE defines DISABLE_CHECK_XID further down
79 | #define FIRST_XID_E XID
80 | #define LAST_XID_E ASCII
81 |
82 | #define _XSTR(s) _STR(s)
83 | #define _STR(s) #s
84 | #define CAT(a, b) a##b
85 | #define PASTE(a, b) CAT(a, b)
86 | #define JOIN(prefix, name) PASTE(prefix, PASTE(_, name))
87 |
88 | #ifdef U8ID_NORM
89 | # if U8ID_NORM == NFC
90 | # define U8ID_NORM_DEFAULT U8ID_NFC
91 | # elif U8ID_NORM == NFD
92 | # define U8ID_NORM_DEFAULT U8ID_NFD
93 | # elif U8ID_NORM == NFKD
94 | # define U8ID_NORM_DEFAULT U8ID_NFKD
95 | # elif U8ID_NORM == NFKC
96 | # define U8ID_NORM_DEFAULT U8ID_NFKC
97 | # elif U8ID_NORM == FCC
98 | # define U8ID_NORM_DEFAULT U8ID_FCC
99 | # elif U8ID_NORM == FCD
100 | # define U8ID_NORM_DEFAULT U8ID_FCD
101 | # else
102 | # error "Invalid U8ID_NORM "_XSTR(U8ID_NORM)
103 | # endif
104 | #else
105 | # define U8ID_NORM_DEFAULT U8ID_NFC
106 | #endif
107 |
108 | #ifdef U8ID_PROFILE
109 | # if U8ID_PROFILE == 1
110 | # define U8ID_PROFILE_DEFAULT U8ID_PROFILE_1
111 | # elif U8ID_PROFILE == 2
112 | # define U8ID_PROFILE_DEFAULT U8ID_PROFILE_2
113 | # elif U8ID_PROFILE == 3
114 | # define U8ID_PROFILE_DEFAULT U8ID_PROFILE_3
115 | # elif U8ID_PROFILE == 4
116 | # define U8ID_PROFILE_DEFAULT U8ID_PROFILE_4
117 | # elif U8ID_PROFILE == 5
118 | # define U8ID_PROFILE_DEFAULT U8ID_PROFILE_5
119 | # elif U8ID_PROFILE == 6
120 | # define U8ID_PROFILE_DEFAULT U8ID_PROFILE_6
121 | # elif U8ID_PROFILE == C26_4
122 | # define U8ID_PROFILE_DEFAULT U8ID_PROFILE_C26_4
123 | # define U8ID_PROFILE_SAFEC26
124 | # elif U8ID_PROFILE == C11_6
125 | # define U8ID_PROFILE_DEFAULT U8ID_PROFILE_C11_6
126 | # define U8ID_PROFILE_C11STD
127 | # else
128 | # error "Invalid U8ID_PROFILE "_XSTR(U8ID_PROFILE)
129 | # endif
130 | #elif defined U8ID_PROFILE_SAFEC26
131 | # define U8ID_PROFILE_DEFAULT U8ID_PROFILE_C26_4
132 | # define U8ID_PROFILE C26_4
133 | #elif defined U8ID_PROFILE_C11STD
134 | # define U8ID_PROFILE_DEFAULT U8ID_PROFILE_C11_6
135 | # define U8ID_PROFILE C11_6
136 | #else
137 | // Moderately Restrictive
138 | # define U8ID_PROFILE_DEFAULT U8ID_PROFILE_4
139 | #endif
140 |
141 | #ifdef DISABLE_U8ID_TR31
142 | # define DISABLE_CHECK_XID
143 | # define U8ID_TR31_DEFAULT 0
144 | # undef U8ID_TR31
145 | # undef ENABLE_CHECK_XID
146 | #endif
147 | #ifdef U8ID_TR31
148 | //# pragma message("U8ID_TR31=" _XSTR(U8ID_TR31))
149 | # if U8ID_TR31 == NONE
150 | # define DISABLE_CHECK_XID
151 | # define U8ID_TR31_DEFAULT 0
152 | # else
153 | # define ENABLE_CHECK_XID
154 | # if U8ID_TR31 == ALLOWED
155 | # define U8ID_TR31_DEFAULT U8ID_TR31_ALLOWED
156 | # elif U8ID_TR31 == ASCII
157 | # define U8ID_TR31_DEFAULT U8ID_TR31_ASCII
158 | # elif U8ID_TR31 == SAFEC26
159 | # define U8ID_TR31_DEFAULT U8ID_TR31_SAFEC26
160 | # elif U8ID_TR31 == ID
161 | # define U8ID_TR31_DEFAULT U8ID_TR31_ID
162 | # elif U8ID_TR31 == XID
163 | # define U8ID_TR31_DEFAULT U8ID_TR31_XID
164 | # elif U8ID_TR31 == C11
165 | # define U8ID_TR31_DEFAULT U8ID_TR31_C11
166 | # elif U8ID_TR31 == C23
167 | # define U8ID_TR31_DEFAULT U8ID_TR31_C23
168 | # elif U8ID_TR31 == ALLUTF8
169 | # define U8ID_TR31_DEFAULT U8ID_TR31_ALLUTF8
170 | # endif
171 | # endif
172 | #else
173 | # define U8ID_TR31_DEFAULT U8ID_TR31_XID
174 | #endif
175 |
176 | #if defined U8ID_NORM && (U8ID_NORM != NFC)
177 | # if (U8ID_TR31 == SAFEC26) || (U8ID_TR31 == C23)
178 | # error "Invalid U8ID_NORM with U8ID_TR31"
179 | # endif
180 | #endif
181 |
182 | #include "htable.h"
183 |
184 | #define U8ID_CTX_TRESH 5
185 | #define U8ID_SCR_TRESH 8 // length of the inline scr8[] array in struct ctx_t
186 | struct ctx_t { // state of one context; u8ident_ctx() in u8idscr.h returns a pointer to it
187 | uint8_t count; // presumably the number of scripts stored in the union below — TODO confirm
188 | uint8_t has_han : 1;
189 | uint8_t is_japanese : 1;
190 | uint8_t is_chinese : 1;
191 | uint8_t is_korean : 1;
192 | uint8_t is_rtl : 1; // Hebrew or Arabic
193 | uint32_t last_cp; // only set on errors
194 | union {
195 | uint64_t scr64; // room for 8 scripts
196 | uint8_t scr8[U8ID_SCR_TRESH]; // the same 8 bytes as scr64, addressed one uint8_t at a time
197 | // we need more than 8 only with insecure
198 | // profiles, or when we manually add extra scripts.
199 | uint8_t *u8p; // or if count > 8
200 | };
201 | struct htable *htab;
202 | struct htable *htab1;
203 | };
204 |
205 | // clang-format off
206 | #if (defined(__GNUC__) && ((__GNUC__ * 100) + __GNUC_MINOR__) >= 480)
207 | # define GCC_DIAG_PRAGMA(x) _Pragma (#x)
208 | # define GCC_DIAG_IGNORE(x) \
209 | GCC_DIAG_PRAGMA (GCC diagnostic ignored #x)
210 | #else
211 | # define GCC_DIAG_IGNORE(w)
212 | #endif
213 |
214 | LOCAL enum u8id_norm u8ident_norm(void);
215 | LOCAL enum u8id_profile u8ident_profile(void);
216 | LOCAL enum u8id_options u8ident_tr31(void);
217 | LOCAL unsigned u8ident_options(void);
218 | LOCAL unsigned u8ident_maxlength(void);
219 | LOCAL const char *u8ident_errstr(int errcode);
220 | // from u8idnorm.c
221 | LOCAL uint32_t dec_utf8(char **strp);
222 | LOCAL char *enc_utf8(char *dest, size_t *lenp, const uint32_t cp);
223 |
224 | #endif // _U8ID_PRIVATE_H
225 |
--------------------------------------------------------------------------------
/u8idlint.1:
--------------------------------------------------------------------------------
1 | .\" DO NOT MODIFY THIS FILE! It was generated by help2man 1.49.3.
2 | .TH U8IDLINT "1" "December 2024" "u8idlint 0.3.3_d005" "U8IDENT Manual 0.3.3_d005"
3 | .SH NAME
4 | u8idlint \- manual page for u8idlint 0.3.3_d005
5 | .SH SYNOPSIS
6 | .B u8idlint
7 | [\fI\,OPTIONS\/\fR] [\fI\,dirs or files\/\fR]...
8 | .SH DESCRIPTION
9 | u8idlint 0.3.3_d005
10 | .SS "OPTIONS:"
11 | .TP
12 | \fB\-n\fR|\-\-norm=nfc,nfkc,nfd,nfkd
13 | default: nfc
14 | .IP
15 | set to nfkd by default for python
16 | .TP
17 | \fB\-p\fR|\-\-profile=1,2,3,4,5,6,c26_4,c11_6
18 | default: c26_4
19 | .IP
20 | TR39 unicode mixed\-script security profile for identifiers:
21 | .TP
22 | 1
23 | ASCII. Sets xid ascii.
24 | .TP
25 | 2
26 | Single script
27 | .TP
28 | 3
29 | Highly Restrictive
30 | .TP
31 | 4
32 | Moderately Restrictive
33 | .TP
34 | 5
35 | Minimally Restrictive
36 | .TP
37 | 6
38 | Unrestricted
39 | .TP
40 | c11_6
41 | C11STD. Sets xid c11.
42 | .TP
43 | c26_4
44 | SAFEC26 (i.e. 4 with Greek). Sets xid safec26.
45 | .TP
46 | \fB\-x\fR|\-\-xid=ascii,allowed,safec26,id,xid,c11,c23,allutf8
47 | default: allowed
48 | .IP
49 | TR31 set of identifiers:
50 | .TP
51 | ascii
52 | only ASCII letters and punctuation, plus numbers
53 | .TP
54 | allowed
55 | tr31 with only recommended scripts, IdentifierStatus
56 | .TP
57 | safec26
58 | allowed but different Identifier_Type and NFC
59 | .TP
60 | id
61 | all letters, plus numbers, punctuation and combining marks
62 | .TP
63 | xid
64 | stable id subset, no NFKC quirks
65 | .TP
66 | c23
67 | xid with NFC requirement from C23
68 | .TP
69 | c11
70 | some AltId unicode ranges from C11
71 | .TP
72 | allutf8
73 | allow all >128. e.g. php, nim, crystal
74 | .TP
75 | \fB\-e\fR|\-\-ext=.c
76 | only this file extension
77 | .HP
78 | \fB\-r\fR|\-\-recursive
79 | .HP
80 | \fB\-v\fR|\-\-verbose
81 | .HP
82 | \fB\-q\fR|\-\-quiet
83 | .HP
84 | \fB\-\-help\fR
85 | .PP
86 | u8idlint checks all words in UTF\-8 source files for
87 | violations against various unicode security guidelines for identifiers.
88 | For special known file extensions it uses its default xid to parse identifiers.
89 | (e.g. *.c uses C11)
90 | It adds a special BIDI warning on bidi formatting chars when the document
91 | is in neither Hebrew nor Arabic.
92 | .SS "SEE ALSO:"
93 | .IP
94 | u8ident.3
95 | .SS "AUTHOR:"
96 | .IP
97 | Reini Urban
98 |
--------------------------------------------------------------------------------
/u8idlint.test:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | # cmd in pwd
3 | # texts in this dir
4 | ROOT="$(dirname "$0")"
5 | if [ "$ROOT" != "." ] && [ ! -d texts ]; then
6 | NEWTEXTS=1
7 | cp -r "$ROOT"/texts . || exit 0
8 | chmod +w texts || exit 0
9 | fi
10 |
11 | check3() { # usage: check3 "<u8idlint options>" <input file in texts/> <expected output in texts/>
12 | arg=$1 # option string; deliberately left unquoted below so it word-splits into separate options
13 | file=texts/$2
14 | tst=texts/$3
15 | rst=$3.rst # actual output, written to the current directory and kept when the comparison fails
16 | # shellcheck disable=SC2086
17 | ./u8idlint $arg "$file" > "$rst"
18 | if diff "$tst" "$rst"; then
19 | rm -- ./"$rst"
20 | else
21 | echo "$arg $3.rst failed"
22 | exit 1 # a bare `exit` would return echo's status (0) and report the failed comparison as a pass
23 | fi
24 | }
25 |
26 | check3 -xsafec26 homo-sec-1.c homo-sec-1.tst
27 | check3 -xc11 homo-sec-1.c homo-sec-1.tst
28 | check3 "-p1 -xc11" homo-sec-1.c homo-sec-1-p1.tst
29 | check3 "-p2 -xc11" homo-sec-1.c homo-sec-1.tst
30 | check3 "-p3 -xc11" homo-sec-1.c homo-sec-1.tst
31 | check3 "-p4 -xc11" homo-sec-1.c homo-sec-1.tst
32 |
33 | check3 -xsafec26 homo-1.c homo-1.tst
34 | check3 -xc11 homo-1.c homo-1.tst
35 |
36 | # ascii-xid ignore BIDI words here
37 | check3 -xascii bidi-sec-1.c bidi-sec-1.tst
38 | check3 -xallowed bidi-sec-1.c bidi-sec-1-allowed.tst
39 | check3 -xsafec26 bidi-sec-1.c bidi-sec-1.tst
40 | check3 -xid bidi-sec-1.c bidi-sec-1.tst
41 | check3 -xxid bidi-sec-1.c bidi-sec-1.tst
42 | check3 "" bidi-sec-1.c bidi-sec-1-c11.tst
43 | check3 -xc11 bidi-sec-1.c bidi-sec-1-c11.tst
44 |
45 | check3 -xascii bidi-sec-2.c bidi-sec-2-ascii.tst
46 | check3 -xallowed bidi-sec-2.c bidi-sec-2-allowed.tst
47 | check3 -xsafec26 bidi-sec-2.c bidi-sec-2.tst
48 | check3 -xid bidi-sec-2.c bidi-sec-2.tst
49 | check3 -xxid bidi-sec-2.c bidi-sec-2.tst
50 | check3 -xc11 bidi-sec-2.c bidi-sec-2-c11.tst
51 | check3 "" bidi-sec-2.c bidi-sec-2-c11.tst
52 |
53 | #./u8idlint -xallowed texts/homo-1.c > homo-1-allowed.rst &&
54 | # diff texts/homo-1-p4.tst homo-1-allowed.rst && homo-1-allowed.rst
55 | #./u8idlint -xsafec26 texts/homo-1.c > homo-1-c26.rst &&
56 | # diff texts/homo-1-p4.tst homo-1-c26.rst && rm homo-1-c26.rst
57 | #./u8idlint -p4 -xc11 texts/homo-1.c > homo-1-c11.rst &&
58 | # diff texts/homo-1-p4.tst homo-1-c11.rst && rm homo-1-c11.rst
59 | #./u8idlint -xxid texts/homo-1.c > homo-1-xid.rst &&
60 | # diff texts/homo-1-p4st homo-1-xid.rst && homo-1-xid.rst
61 | #./u8idlint -xid texts/homo-1.c > homo-1-id.rst &&
62 | # diff texts/homo-1-p4st homo-1-id.rst && rm homo-1-id.rst
63 | #./u8idlint -xc11 texts/bidi-sec-1.c > bidi-sec-1.rst &&
64 | # diff texts/bidi-sec-1.tst bidi-sec-1.rst && rm bidi-sec-1.rst
65 | #./u8idlint -xc11 texts/bidi-sec-2.c > bidi-sec-2.rst &&
66 | # diff texts/bidi-sec-2.tst bidi-sec-2.rst && rm bidi-sec-2.rst
67 |
68 | rm -f homo-sec-1.tst.rst
69 | #rm -f homo-1-p4.rst bidi-sec-1.rst bidi-sec-2.rst
70 | if [ -n "$NEWTEXTS" ]; then
71 | rm -rf texts || true
72 | fi
73 |
--------------------------------------------------------------------------------
/u8idroar.c:
--------------------------------------------------------------------------------
1 | /* libu8ident - Check unicode security guidelines for identifiers.
2 | Copyright 2021 Reini Urban
3 | SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
4 |
5 | use roaring bitmaps for some sets.
6 | Currently only confus_croar is faster than binary search in our ranges lists.
7 | */
8 |
9 | #include "u8id_private.h"
10 | #ifdef HAVE_CROARING
11 | # include
12 | # include
13 | # include "roaring.c"
14 | # include "confus_croar.h"
15 |
16 | static roaring_bitmap_t *rc = NULL;
17 |
18 | # ifdef USE_ALLOWED_CROAR
19 | # include "allowed_croar.h"
20 | static roaring_bitmap_t *ra = NULL;
21 | # endif
22 |
23 | # ifdef USE_MARK_CROAR
24 | # define EXTERN_SCRIPTS
25 | # include "scripts.h"
26 | # undef EXTERN_SCRIPTS
27 | # include "mark.h"
28 | static roaring_bitmap_t *rm = NULL;
29 | # endif
30 |
31 | # ifdef USE_NORM_CROAR
32 | # include "nfkc_croar.h"
33 | # include "nfc_croar.h"
34 | # include "nfkd_croar.h"
35 | # include "nfd_croar.h"
36 | static roaring_bitmap_t *rnfkc_m = NULL;
37 | static roaring_bitmap_t *rnfc_m = NULL;
38 | static roaring_bitmap_t *rnfkc_n = NULL;
39 | static roaring_bitmap_t *rnfc_n = NULL;
40 | static roaring_bitmap_t *rnfkd_n = NULL;
41 | static roaring_bitmap_t *rnfd_n = NULL;
42 | # endif
43 |
44 | int u8ident_roar_init(void) { // deserialize each bitmap that is still NULL; 0 on success, -1 as soon as one fails
45 | if (!rc) {
46 | rc = roaring_bitmap_portable_deserialize_safe((char *)confus_croar_bin,
47 | confus_croar_bin_len);
48 | if (!rc)
49 | return -1;
50 | }
51 |
52 | // Defined outside the USE_NORM_CROAR guard: the ALLOWED and MARK sections below use it too.
53 | # define DEF_DESERIALIZE_SAFE(rn, n) \
54 | if (!rn) { \
55 | rn = roaring_bitmap_portable_deserialize_safe( \
56 | (char *)JOIN(n, croar_bin), JOIN(n, croar_bin_len)); \
57 | if (!rn) \
58 | return -1; \
59 | }
60 |
61 | // These are disabled by default. Only used by perf
62 | # ifdef USE_NORM_CROAR
63 | DEF_DESERIALIZE_SAFE(rnfkc_m, nfkc_m)
64 | DEF_DESERIALIZE_SAFE(rnfkc_n, nfkc_n)
65 | DEF_DESERIALIZE_SAFE(rnfc_m, nfc_m)
66 | DEF_DESERIALIZE_SAFE(rnfc_n, nfc_n)
67 | DEF_DESERIALIZE_SAFE(rnfkd_n, nfkd_n) // was nfkc_n, which loaded a second copy of the NFKC set into the NFKD bitmap
68 | DEF_DESERIALIZE_SAFE(rnfd_n, nfd_n)
69 | # endif
70 | # ifdef USE_ALLOWED_CROAR
71 | DEF_DESERIALIZE_SAFE(ra, allowed)
72 | # endif
73 | # ifdef USE_MARK_CROAR
74 | DEF_DESERIALIZE_SAFE(rm, mark)
75 | # endif
76 | # undef DEF_DESERIALIZE_SAFE
77 | return 0;
78 | }
79 |
80 | int u8ident_roar_free(void) { // returns int to match the prototype in u8idroar.h; the result is always 0
81 | // Free one bitmap if it is allocated, then reset its pointer so u8ident_roar_init() can load it again.
82 | # define FREE_R(r) \
83 | if (r) \
84 | roaring_bitmap_free(r); \
85 | r = NULL
86 |
87 | FREE_R(rc);
88 | # ifdef USE_ALLOWED_CROAR
89 | FREE_R(ra);
90 | # endif
91 | # ifdef USE_MARK_CROAR
92 | FREE_R(rm);
93 | # endif
94 | # ifdef USE_NORM_CROAR
95 | FREE_R(rnfkc_m);
96 | FREE_R(rnfkc_n);
97 | FREE_R(rnfc_m);
98 | FREE_R(rnfc_n);
99 | FREE_R(rnfkd_n);
100 | FREE_R(rnfd_n);
101 | # endif
102 | # undef FREE_R
103 | return 0;
104 | }
105 |
106 | EXTERN bool u8ident_is_confusable(const uint32_t cp) {
107 | return roaring_bitmap_contains(rc, cp); // rc is the bitmap u8ident_roar_init() deserializes from confus_croar_bin
108 | }
109 |
110 | # ifdef USE_ALLOWED_CROAR
111 | bool u8ident_roar_is_allowed(const uint32_t cp) {
112 | return roaring_bitmap_contains(ra, cp); // ra is loaded from the `allowed` data in u8ident_roar_init()
113 | }
114 | # endif
115 |
116 | # ifdef USE_MARK_CROAR
117 | bool u8ident_roar_is_mark(const uint32_t cp) {
118 | return roaring_bitmap_contains(rm, cp); // rm is loaded from the `mark` data in u8ident_roar_init()
119 | }
120 | # endif
121 |
122 | # ifdef USE_NORM_CROAR
123 | bool u8ident_roar_maybe_nfkc(const uint32_t cp) {
124 | // True when cp is in either NFKC bitmap; rnfkc_m is consulted
125 | // only if rnfkc_n does not contain cp.
126 | return roaring_bitmap_contains(rnfkc_n, cp) || roaring_bitmap_contains(rnfkc_m, cp);
127 | }
128 | bool u8ident_roar_maybe_nfc(const uint32_t cp) {
129 | // True when cp is in either NFC bitmap; rnfc_m is consulted
130 | // only if rnfc_n does not contain cp.
131 | return roaring_bitmap_contains(rnfc_n, cp) || roaring_bitmap_contains(rnfc_m, cp);
132 | }
133 | bool u8ident_roar_maybe_nfkd(const uint32_t cp) {
134 | return roaring_bitmap_contains(rnfkd_n, cp); // single membership test in the rnfkd_n bitmap
135 | }
136 | bool u8ident_roar_maybe_nfd(const uint32_t cp) {
137 | return roaring_bitmap_contains(rnfd_n, cp); // single membership test in the rnfd_n bitmap
138 | }
139 | # endif
140 |
141 | #endif
142 |
--------------------------------------------------------------------------------
/u8idroar.h:
--------------------------------------------------------------------------------
1 | /* libu8ident - Check unicode security guidelines for identifiers.
2 | Copyright 2021 Reini Urban
3 | SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
4 |
5 | use roaring bitmaps for some sets.
6 | */
7 |
8 | #include "u8id_private.h"
9 | #include
10 | int u8ident_roar_init(void);
11 | int u8ident_roar_free(void);
12 | EXTERN bool u8ident_is_confusable(const uint32_t cp);
13 |
14 | #ifdef USE_ALLOWED_CROAR
15 | bool u8ident_roar_is_allowed(const uint32_t cp);
16 | #endif
17 |
18 | #ifdef USE_MARK_CROAR
19 | bool u8ident_roar_is_mark(const uint32_t cp);
20 | #endif
21 |
22 | #ifdef USE_NORM_CROAR
23 | bool u8ident_roar_maybe_nfkc(const uint32_t cp);
24 | bool u8ident_roar_maybe_nfc(const uint32_t cp);
25 | bool u8ident_roar_maybe_nfkd(const uint32_t cp);
26 | bool u8ident_roar_maybe_nfd(const uint32_t cp);
27 | #endif
28 |
--------------------------------------------------------------------------------
/u8idscr.h:
--------------------------------------------------------------------------------
1 | #ifndef _U8IDSCR_H
2 | #define _U8IDSCR_H
3 |
4 | #include
5 | #include
6 | #include "u8id_private.h"
7 | #define EXTERN_SCRIPTS
8 | #include "u8id_gc.h"
9 | #include "scripts.h"
10 |
11 | bool u8ident_has_script(const uint8_t scr);
12 | bool u8ident_has_script_ctx(const uint8_t scr, const struct ctx_t *ctx);
13 | int u8ident_add_script_ctx(const uint8_t scr, struct ctx_t *ctx);
14 | struct ctx_t *u8ident_ctx(void);
15 | uint8_t u8ident_get_script(const uint32_t cp);
16 | /* list of script indices */
17 | const struct scx *u8ident_get_scx(const uint32_t cp);
18 | /* search for safec23 XID entry, in start or cont lists */
19 | const struct sc_c23 *u8ident_get_safec23(const uint32_t cp);
20 | bool u8ident_is_MARK(const uint32_t cp);
21 | bool u8ident_is_MEDIAL(const uint32_t cp);
22 | // member of the bidi formatting characters usable for reordering attacks.
23 | // Only valid with RTL scripts, such as Hebrew and Arabic.
24 | bool u8ident_is_bidi(const uint32_t cp);
25 | // Greek letters confusable with Latin
26 | bool u8ident_is_greek_latin_confus(const uint32_t cp);
27 | // bitmask of u8id_idtypes
28 | uint16_t u8ident_get_idtypes(const uint32_t cp);
29 | const char *u8ident_script_name(const int scr);
30 | // bool u8ident_is_decomposed(const uint32_t cp, const uint8_t scr);
31 | bool u8ident_maybe_normalized(const uint32_t cp);
32 |
33 | typedef bool func_tr31(const uint32_t cp); // predicate over a single code point
34 | struct func_tr31_s { // one start/cont predicate pair
35 | func_tr31 *start; // presumably one of the is*_start functions declared below — TODO confirm
36 | func_tr31 *cont; // presumably the matching is*_cont function — TODO confirm
37 | };
38 | bool isASCII_start(const uint32_t cp);
39 | bool isASCII_cont(const uint32_t cp);
40 | bool isALLOWED_start(const uint32_t cp);
41 | bool isALLOWED_cont(const uint32_t cp);
42 | bool isSAFEC26_start(const uint32_t cp);
43 | bool isSAFEC26_cont(const uint32_t cp);
44 | bool isID_start(const uint32_t cp);
45 | bool isID_cont(const uint32_t cp);
46 | bool isXID_start(const uint32_t cp);
47 | bool isXID_cont(const uint32_t cp);
48 | bool isC11_start(const uint32_t cp);
49 | bool isC11_cont(const uint32_t cp);
50 | bool isC23_start(const uint32_t cp);
51 | bool isC23_cont(const uint32_t cp);
52 | bool isALLUTF8_start(const uint32_t cp);
53 | bool isALLUTF8_cont(const uint32_t cp);
54 |
55 | enum u8id_gc u8ident_get_gc(const uint32_t cp);
56 | const char *u8ident_gc_name(const enum u8id_gc);
57 |
58 | #endif // _U8IDSCR_H
59 |
--------------------------------------------------------------------------------
/un8ifexc.h:
--------------------------------------------------------------------------------
1 | /* ex: set ro ft=c: -*- buffer-read-only: t -*-
2 | *
3 | * !!!!!!! DO NOT EDIT THIS FILE !!!!!!!
4 | * This file is auto-generated by Unicode-Normalize 1.32
5 | * mkheader -ind -std
6 | * for Unicode 15.0.0 UTF-8
7 | * Any changes here will be lost!
8 | */
9 | /* Composite exclusions: TRUE iff uv is one of the code points listed below (decimal in code, hex in the trailing comments). */
10 | bool isExclusion (uint32_t uv);
11 |
12 | bool isExclusion (uint32_t uv) /* NOTE(review): non-static definition in a header - presumably included by exactly one .c file; confirm */
13 | {
14 | return /* a single OR-chain of equality tests and inclusive range tests on uv */
15 | (2392 <= uv && uv <= 2399) /* U+0958..U+095F */
16 | || (2524 <= uv && uv <= 2525) /* U+09DC..U+09DD */
17 | || uv == 2527 /* U+09DF */
18 | || uv == 2611 /* U+0A33 */
19 | || uv == 2614 /* U+0A36 */
20 | || (2649 <= uv && uv <= 2651) /* U+0A59..U+0A5B */
21 | || uv == 2654 /* U+0A5E */
22 | || (2908 <= uv && uv <= 2909) /* U+0B5C..U+0B5D */
23 | || uv == 3907 /* U+0F43 */
24 | || uv == 3917 /* U+0F4D */
25 | || uv == 3922 /* U+0F52 */
26 | || uv == 3927 /* U+0F57 */
27 | || uv == 3932 /* U+0F5C */
28 | || uv == 3945 /* U+0F69 */
29 | || uv == 3958 /* U+0F76 */
30 | || uv == 3960 /* U+0F78 */
31 | || uv == 3987 /* U+0F93 */
32 | || uv == 3997 /* U+0F9D */
33 | || uv == 4002 /* U+0FA2 */
34 | || uv == 4007 /* U+0FA7 */
35 | || uv == 4012 /* U+0FAC */
36 | || uv == 4025 /* U+0FB9 */
37 | || uv == 10972 /* U+2ADC */
38 | || uv == 64285 /* U+FB1D */
39 | || uv == 64287 /* U+FB1F */
40 | || (64298 <= uv && uv <= 64310) /* U+FB2A..U+FB36 */
41 | || (64312 <= uv && uv <= 64316) /* U+FB38..U+FB3C */
42 | || uv == 64318 /* U+FB3E */
43 | || (64320 <= uv && uv <= 64321) /* U+FB40..U+FB41 */
44 | || (64323 <= uv && uv <= 64324) /* U+FB43..U+FB44 */
45 | || (64326 <= uv && uv <= 64334) /* U+FB46..U+FB4E */
46 | || (119134 <= uv && uv <= 119140) /* U+1D15E..U+1D164 */
47 | || (119227 <= uv && uv <= 119232) /* U+1D1BB..U+1D1C0 */
48 | ? TRUE : FALSE; /* TRUE/FALSE are not defined in this file; presumably supplied by the including source - confirm */
49 | }
50 |
51 | /* Singletons: TRUE iff uv is in the generated list below. NOTE(review): presumably the code points with a one-character canonical decomposition - confirm against the generator. */
52 | bool isSingleton (uint32_t uv);
53 |
54 | bool isSingleton (uint32_t uv) /* NOTE(review): non-static definition in a header - presumably included by exactly one .c file; confirm */
55 | {
56 | return /* a single OR-chain of equality tests and inclusive range tests on uv */
57 | (832 <= uv && uv <= 833) /* U+0340..U+0341 */
58 | || uv == 835 /* U+0343 */
59 | || uv == 884 /* U+0374 */
60 | || uv == 894 /* U+037E */
61 | || uv == 903 /* U+0387 */
62 | || uv == 8049 /* U+1F71 */
63 | || uv == 8051 /* U+1F73 */
64 | || uv == 8053 /* U+1F75 */
65 | || uv == 8055 /* U+1F77 */
66 | || uv == 8057 /* U+1F79 */
67 | || uv == 8059 /* U+1F7B */
68 | || uv == 8061 /* U+1F7D */
69 | || uv == 8123 /* U+1FBB */
70 | || uv == 8126 /* U+1FBE */
71 | || uv == 8137 /* U+1FC9 */
72 | || uv == 8139 /* U+1FCB */
73 | || uv == 8147 /* U+1FD3 */
74 | || uv == 8155 /* U+1FDB */
75 | || uv == 8163 /* U+1FE3 */
76 | || uv == 8171 /* U+1FEB */
77 | || (8174 <= uv && uv <= 8175) /* U+1FEE..U+1FEF */
78 | || uv == 8185 /* U+1FF9 */
79 | || uv == 8187 /* U+1FFB */
80 | || uv == 8189 /* U+1FFD */
81 | || (8192 <= uv && uv <= 8193) /* U+2000..U+2001 */
82 | || uv == 8486 /* U+2126 */
83 | || (8490 <= uv && uv <= 8491) /* U+212A..U+212B */
84 | || (9001 <= uv && uv <= 9002) /* U+2329..U+232A */
85 | || (63744 <= uv && uv <= 64013) /* U+F900..U+FA0D */
86 | || uv == 64016 /* U+FA10 */
87 | || uv == 64018 /* U+FA12 */
88 | || (64021 <= uv && uv <= 64030) /* U+FA15..U+FA1E */
89 | || uv == 64032 /* U+FA20 */
90 | || uv == 64034 /* U+FA22 */
91 | || (64037 <= uv && uv <= 64038) /* U+FA25..U+FA26 */
92 | || (64042 <= uv && uv <= 64109) /* U+FA2A..U+FA6D */
93 | || (64112 <= uv && uv <= 64217) /* U+FA70..U+FAD9 */
94 | || (194560 <= uv && uv <= 195101) /* U+2F800..U+2FA1D */
95 | ? TRUE : FALSE; /* TRUE/FALSE are not defined in this file; presumably supplied by the including source - confirm */
96 | }
97 |
98 | /* non-starter decompositions: TRUE iff uv is one of the four code points listed below (decimal in code, hex in the trailing comments). */
99 | bool isNonStDecomp (uint32_t uv);
100 |
101 | bool isNonStDecomp (uint32_t uv) /* NOTE(review): non-static definition in a header - presumably included by exactly one .c file; confirm */
102 | {
103 | return
104 | uv == 836 /* U+0344 */
105 | || uv == 3955 /* U+0F73 */
106 | || uv == 3957 /* U+0F75 */
107 | || uv == 3969 /* U+0F81 */
108 | ? TRUE : FALSE; /* TRUE/FALSE are not defined in this file; presumably supplied by the including source - confirm */
109 | }
110 |
111 | /* may be composed with a prev char: TRUE iff uv is in the generated list below (decimal in code, hex in the trailing comments). */
112 | bool isComp2nd (uint32_t uv);
113 |
114 | bool isComp2nd (uint32_t uv) /* NOTE(review): non-static definition in a header - presumably included by exactly one .c file; confirm */
115 | {
116 | return /* a single OR-chain of equality tests and inclusive range tests on uv */
117 | (768 <= uv && uv <= 772) /* U+0300..U+0304 */
118 | || (774 <= uv && uv <= 780) /* U+0306..U+030C */
119 | || uv == 783 /* U+030F */
120 | || uv == 785 /* U+0311 */
121 | || (787 <= uv && uv <= 788) /* U+0313..U+0314 */
122 | || uv == 795 /* U+031B */
123 | || (803 <= uv && uv <= 808) /* U+0323..U+0328 */
124 | || (813 <= uv && uv <= 814) /* U+032D..U+032E */
125 | || (816 <= uv && uv <= 817) /* U+0330..U+0331 */
126 | || uv == 824 /* U+0338 */
127 | || uv == 834 /* U+0342 */
128 | || uv == 837 /* U+0345 */
129 | || (1619 <= uv && uv <= 1621) /* U+0653..U+0655 */
130 | || uv == 2364 /* U+093C */
131 | || uv == 2494 /* U+09BE */
132 | || uv == 2519 /* U+09D7 */
133 | || uv == 2878 /* U+0B3E */
134 | || (2902 <= uv && uv <= 2903) /* U+0B56..U+0B57 */
135 | || uv == 3006 /* U+0BBE */
136 | || uv == 3031 /* U+0BD7 */
137 | || uv == 3158 /* U+0C56 */
138 | || uv == 3266 /* U+0CC2 */
139 | || (3285 <= uv && uv <= 3286) /* U+0CD5..U+0CD6 */
140 | || uv == 3390 /* U+0D3E */
141 | || uv == 3415 /* U+0D57 */
142 | || uv == 3530 /* U+0DCA */
143 | || uv == 3535 /* U+0DCF */
144 | || uv == 3551 /* U+0DDF */
145 | || uv == 4142 /* U+102E */
146 | || (4449 <= uv && uv <= 4469) /* U+1161..U+1175 */
147 | || (4520 <= uv && uv <= 4546) /* U+11A8..U+11C2 */
148 | || uv == 6965 /* U+1B35 */
149 | || (12441 <= uv && uv <= 12442) /* U+3099..U+309A */
150 | || uv == 69818 /* U+110BA */
151 | || uv == 69927 /* U+11127 */
152 | || uv == 70462 /* U+1133E */
153 | || uv == 70487 /* U+11357 */
154 | || uv == 70832 /* U+114B0 */
155 | || uv == 70842 /* U+114BA */
156 | || uv == 70845 /* U+114BD */
157 | || uv == 71087 /* U+115AF */
158 | || uv == 71984 /* U+11930 */
159 | ? TRUE : FALSE; /* TRUE/FALSE are not defined in this file; presumably supplied by the including source - confirm */
160 | }
161 |
162 |
--------------------------------------------------------------------------------