├── data ├── Project.toml ├── Manifest.toml ├── Makefile └── data_generator.jl ├── .github ├── dependabot.yml └── workflows │ ├── ci-fuzz.yml │ ├── make.yml │ └── cmake.yml ├── MANIFEST ├── libutf8proc.pc.in ├── libutf8proc.pc.cmakein ├── bench ├── util.h ├── util.c ├── Makefile ├── unistring.c ├── bench.c └── icu.c ├── test ├── ossfuzz.sh ├── tests.h ├── custom.c ├── fuzz_main.c ├── valid.c ├── tests.c ├── normtest.c ├── iscase.c ├── printproperty.c ├── misc.c ├── fuzzer.c ├── case.c ├── charwidth.c ├── iterate.c └── graphemetest.c ├── .gitignore ├── utils.cmake ├── lump.md ├── appveyor.yml ├── CMakeLists.txt ├── LICENSE.md ├── README.md ├── Makefile ├── NEWS.md ├── utf8proc.c └── utf8proc.h /data/Project.toml: -------------------------------------------------------------------------------- 1 | [deps] 2 | OffsetArrays = "6fe1bfb0-de20-5000-8ca7-80f57d26f881" 3 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | updates: 3 | - package-ecosystem: "github-actions" 4 | directory: "/" 5 | schedule: 6 | interval: "daily" 7 | commit-message: 8 | prefix: "ci" 9 | -------------------------------------------------------------------------------- /MANIFEST: -------------------------------------------------------------------------------- 1 | include/ 2 | include/utf8proc.h 3 | lib/ 4 | lib/libutf8proc.a 5 | lib/libutf8proc.so -> libutf8proc.so.3.1.0 6 | lib/libutf8proc.so.2 -> libutf8proc.so.3.1.0 7 | lib/libutf8proc.so.3.1.0 8 | lib/pkgconfig/ 9 | lib/pkgconfig/libutf8proc.pc 10 | -------------------------------------------------------------------------------- /libutf8proc.pc.in: -------------------------------------------------------------------------------- 1 | prefix=PREFIX 2 | exec_prefix=${prefix} 3 | libdir=${prefix}/LIBDIR 4 | includedir=${prefix}/INCLUDEDIR 5 | 6 | Name: libutf8proc 7 | Description: UTF8 processing 
8 | Version: VERSION 9 | Libs: -L${libdir} -lutf8proc 10 | Cflags: -I${includedir} -DUTF8PROC_EXPORTS 11 | -------------------------------------------------------------------------------- /libutf8proc.pc.cmakein: -------------------------------------------------------------------------------- 1 | prefix=@CMAKE_INSTALL_PREFIX@ 2 | exec_prefix=@CMAKE_INSTALL_FULL_BINDIR@ 3 | libdir=@CMAKE_INSTALL_FULL_LIBDIR@ 4 | includedir=@CMAKE_INSTALL_FULL_INCLUDEDIR@ 5 | 6 | Name: libutf8proc 7 | Description: UTF8 processing 8 | Version: @PROJECT_VERSION@ 9 | Libs: -L${libdir} -lutf8proc 10 | Cflags: -I${includedir} -DUTF8PROC_EXPORTS 11 | -------------------------------------------------------------------------------- /bench/util.h: -------------------------------------------------------------------------------- 1 | #ifndef UTIL_H 2 | #define UTIL_H 1 3 | 4 | #include 5 | #include 6 | #include 7 | 8 | #ifdef __cplusplus 9 | extern "C" { 10 | #endif 11 | 12 | uint8_t *readfile(const char *filename, size_t *len); 13 | 14 | typedef struct timeval mytime; 15 | mytime gettime(void); 16 | double elapsed(mytime t1, mytime t0); 17 | 18 | #ifdef __cplusplus 19 | } 20 | #endif 21 | 22 | #endif /* UTIL_H */ 23 | -------------------------------------------------------------------------------- /test/ossfuzz.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -eu 2 | # This script is meant to be run by 3 | # https://github.com/google/oss-fuzz/blob/master/projects/utf8proc/Dockerfile 4 | 5 | mkdir build 6 | cd build 7 | cmake .. 
-DUTF8PROC_ENABLE_TESTING=ON -DLIB_FUZZING_ENGINE="$LIB_FUZZING_ENGINE" 8 | make -j$(nproc) 9 | 10 | cp $SRC/utf8proc/build/fuzzer $OUT/utf8proc_fuzzer 11 | 12 | find $SRC/utf8proc/test -name "*.txt" | \ 13 | xargs zip $OUT/utf8proc_fuzzer_seed_corpus.zip 14 | -------------------------------------------------------------------------------- /data/Manifest.toml: -------------------------------------------------------------------------------- 1 | # This file is machine-generated - editing it directly is not advised 2 | 3 | julia_version = "1.11.2" 4 | manifest_format = "2.0" 5 | project_hash = "bc0740aa2247b17bd49ba693fb87f41bbbddead6" 6 | 7 | [[deps.OffsetArrays]] 8 | git-tree-sha1 = "5e1897147d1ff8d98883cda2be2187dcf57d8f0c" 9 | uuid = "6fe1bfb0-de20-5000-8ca7-80f57d26f881" 10 | version = "1.15.0" 11 | 12 | [deps.OffsetArrays.extensions] 13 | OffsetArraysAdaptExt = "Adapt" 14 | 15 | [deps.OffsetArrays.weakdeps] 16 | Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e" 17 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.tar.gz 2 | *.exe 3 | *.dll 4 | *.do 5 | *.o 6 | *.so* 7 | *.a 8 | *.dll 9 | *.dylib 10 | *.dSYM 11 | *.out 12 | *.new 13 | .vscode 14 | /data/*.txt 15 | /data/*.ttf 16 | /data/*.sfd 17 | /docs/ 18 | /bench/bench 19 | /bench/icu 20 | /bench/unistring 21 | /test/normtest 22 | /test/graphemetest 23 | /test/printproperty 24 | /test/charwidth 25 | /test/misc 26 | /test/valid 27 | /test/iterate 28 | /test/case 29 | /test/iscase 30 | /test/custom 31 | /tmp/ 32 | /mingw_static/ 33 | /mingw_shared/ 34 | /msvc_shared/ 35 | /msvc_static/ 36 | /build/ 37 | NEWS-update.jl 38 | libutf8proc.pc 39 | 40 | # clangd 41 | /.cache/ 42 | /compile_commands.json 43 | -------------------------------------------------------------------------------- /utils.cmake: -------------------------------------------------------------------------------- 1 | 2 | 
function (disallow_intree_builds) 3 | # Adapted from LLVM's toplevel CMakeLists.txt file 4 | if( CMAKE_SOURCE_DIR STREQUAL CMAKE_BINARY_DIR AND NOT MSVC_IDE ) 5 | message(FATAL_ERROR " 6 | In-source builds are not allowed. CMake would overwrite the 7 | makefiles distributed with utf8proc. Please create a directory 8 | and run cmake from there. Building in a subdirectory is 9 | fine, e.g.: 10 | 11 | mkdir build 12 | cd build 13 | cmake .. 14 | 15 | This process created the file `CMakeCache.txt' and the 16 | directory `CMakeFiles'. Please delete them. 17 | 18 | ") 19 | endif() 20 | endfunction() 21 | -------------------------------------------------------------------------------- /.github/workflows/ci-fuzz.yml: -------------------------------------------------------------------------------- 1 | name: CIFuzz 2 | on: [pull_request] 3 | jobs: 4 | Fuzzing: 5 | runs-on: ubuntu-latest 6 | steps: 7 | - name: Build Fuzzers 8 | uses: google/oss-fuzz/infra/cifuzz/actions/build_fuzzers@master 9 | with: 10 | oss-fuzz-project-name: 'utf8proc' 11 | dry-run: false 12 | - name: Run Fuzzers 13 | uses: google/oss-fuzz/infra/cifuzz/actions/run_fuzzers@master 14 | with: 15 | oss-fuzz-project-name: 'utf8proc' 16 | fuzz-seconds: 600 17 | dry-run: false 18 | - name: Upload Crash 19 | uses: actions/upload-artifact@v4 20 | if: failure() 21 | with: 22 | name: artifacts 23 | path: ./out/artifacts 24 | -------------------------------------------------------------------------------- /test/tests.h: -------------------------------------------------------------------------------- 1 | /* Common functions and includes for our test programs. */ 2 | 3 | /* 4 | * Set feature macro to enable wcwidth(). 
/* Read the whole file named FILENAME into a freshly malloc'ed buffer.
 *
 * filename: path of the file to read.
 * len:      out-parameter; receives the number of bytes in the returned
 *           buffer on success and 0 on failure.
 *
 * Returns the buffer (release it with free()), or NULL on error, i.e. when
 * stat(), fopen(), malloc() or fread() fails.  An empty file is not an
 * error: it yields a valid non-NULL pointer with *len == 0. */
uint8_t *readfile(const char *filename, size_t *len)
{
    struct stat st;
    FILE *f;
    uint8_t *s;
    size_t size;

    *len = 0;
    if (0 != stat(filename, &st) || st.st_size < 0) return NULL;
    size = (size_t) st.st_size;

    /* binary mode: the byte count reported by stat() must match what
       fread() delivers, which text mode does not guarantee everywhere */
    f = fopen(filename, "rb");
    if (!f) return NULL;

    /* never request 0 bytes: malloc(0) may legitimately return NULL,
       which callers would misread as an error */
    s = (uint8_t *) malloc(size ? size : 1);
    if (s && fread(s, 1, size, f) != size) {
        free(s);
        s = NULL;
    }
    fclose(f); /* closed on every path, including allocation failure */

    if (s) *len = size;
    return s;
}
-std=c99 -pedantic -Wall 6 | 7 | all: bench 8 | 9 | LIBUTF8PROC = ../utf8proc.o 10 | 11 | bench: bench.o util.o $(LIBUTF8PROC) 12 | $(CC) $(CFLAGS) $(LDFLAGS) -o $@ bench.o util.o $(LIBUTF8PROC) 13 | 14 | DATAURL = https://raw.githubusercontent.com/duerst/eprun/master/benchmark 15 | DATAFILES = Deutsch_.txt Japanese_.txt Korean_.txt Vietnamese_.txt 16 | 17 | $(DATAFILES): 18 | $(CURL) -O $(DATAURL)/$@ 19 | 20 | bench.out: $(DATAFILES) bench 21 | ./bench -nfkc $(DATAFILES) > $@ 22 | 23 | # you may need make CPPFLAGS=... LDFLAGS=... to help it find ICU 24 | icu: icu.o util.o 25 | $(CC) $(CFLAGS) $(LDFLAGS) -o $@ icu.o util.o -licuuc 26 | 27 | icu.out: $(DATAFILES) icu 28 | ./icu $(DATAFILES) > $@ 29 | 30 | unistring: unistring.o util.o 31 | $(CC) $(CFLAGS) $(LDFLAGS) -o $@ unistring.o util.o -lunistring 32 | 33 | unistring.out: $(DATAFILES) unistring 34 | ./unistring $(DATAFILES) > $@ 35 | 36 | .c.o: 37 | $(CC) $(CPPFLAGS) -I.. $(CFLAGS) -c -o $@ $< 38 | 39 | clean: 40 | rm -rf *.o *.txt bench *.out icu unistring 41 | -------------------------------------------------------------------------------- /test/custom.c: -------------------------------------------------------------------------------- 1 | #include "tests.h" 2 | 3 | static int thunk_test = 1; 4 | 5 | static utf8proc_int32_t custom(utf8proc_int32_t codepoint, void *thunk) 6 | { 7 | check(((int *) thunk) == &thunk_test, "unexpected thunk passed"); 8 | if (codepoint == 'a') 9 | return 'b'; 10 | if (codepoint == 'S') 11 | return 0x00df; /* ß */ 12 | return codepoint; 13 | } 14 | 15 | int main(void) 16 | { 17 | utf8proc_uint8_t input[] = {0x41,0x61,0x53,0x62,0xef,0xbd,0x81,0x00}; /* "AaSb\uff41" */ 18 | utf8proc_uint8_t correct[] = {0x61,0x62,0x73,0x73,0x62,0x61,0x00}; /* "abssba" */ 19 | utf8proc_uint8_t *output; 20 | utf8proc_map_custom(input, 0, &output, UTF8PROC_CASEFOLD | UTF8PROC_COMPOSE | UTF8PROC_COMPAT | UTF8PROC_NULLTERM, 21 | custom, &thunk_test); 22 | printf("mapped \"%s\" -> \"%s\"\n", (char*)input, 
/* Fuzz target entry point, works without libFuzzer */

int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size);

/* Stand-alone driver for the fuzz target: reads the file named by argv[1]
 * into memory and passes it to LLVMFuzzerTestOneInput() exactly once.
 *
 * Always returns 0, including when the input is missing, empty or cannot
 * be read (a diagnostic is printed to stderr in those cases). */
int main(int argc, char **argv)
{
    FILE *f = NULL;
    char *buf = NULL;
    long siz_buf;

    if(argc < 2)
    {
        fprintf(stderr, "no input file\n");
        goto err;
    }

    f = fopen(argv[1], "rb");
    if(f == NULL)
    {
        fprintf(stderr, "error opening input file %s\n", argv[1]);
        goto err;
    }

    if(fseek(f, 0, SEEK_END) != 0)
    {
        fprintf(stderr, "fseek() failed\n");
        goto err;
    }

    siz_buf = ftell(f);
    rewind(f);

    /* covers both an ftell() failure (-1) and an empty file */
    if(siz_buf < 1) goto err;

    buf = (char*)malloc((size_t)siz_buf);
    if(buf == NULL)
    {
        fprintf(stderr, "malloc() failed\n");
        goto err;
    }

    if(fread(buf, (size_t)siz_buf, 1, f) != 1)
    {
        fprintf(stderr, "fread() failed\n");
        goto err;
    }

    (void)LLVMFuzzerTestOneInput((const uint8_t*)buf, (size_t)siz_buf);

err:
    /* single exit path: release the stream (if opened) and the buffer */
    if(f != NULL) fclose(f);
    free(buf);

    return 0;
}
-------------------------------------------------------------------------------- 1 | ``` 2 | U+0020 <-- all space characters (general category Zs) 3 | U+0027 ' <-- left/right single quotation mark U+2018..2019, 4 | modifier letter apostrophe U+02BC, 5 | modifier letter vertical line U+02C8 6 | U+002D - <-- all dash characters (general category Pd), 7 | minus U+2212 8 | U+002F / <-- fraction slash U+2044, 9 | division slash U+2215 10 | U+003A : <-- ratio U+2236 11 | U+003C < <-- single left-pointing angle quotation mark U+2039, 12 | left-pointing angle bracket U+2329, 13 | left angle bracket U+3008 14 | U+003E > <-- single right-pointing angle quotation mark U+203A, 15 | right-pointing angle bracket U+232A, 16 | right angle bracket U+3009 17 | U+005C \ <-- set minus U+2216 18 | U+005E ^ <-- modifier letter up arrowhead U+02C4, 19 | modifier letter circumflex accent U+02C6, 20 | caret U+2038, 21 | up arrowhead U+2303 22 | U+005F _ <-- all connector characters (general category Pc), 23 | modifier letter low macron U+02CD 24 | U+0060 ` <-- modifier letter grave accent U+02CB 25 | U+007C | <-- divides U+2223 26 | U+007E ~ <-- tilde operator U+223C 27 | ``` 28 | -------------------------------------------------------------------------------- /bench/unistring.c: -------------------------------------------------------------------------------- 1 | /* comparative benchmark of GNU libunistring */ 2 | 3 | #include 4 | #include 5 | #include 6 | 7 | /* libunistring */ 8 | #include 9 | #include 10 | 11 | #include "util.h" 12 | 13 | int main(int argc, char **argv) 14 | { 15 | int i; 16 | uninorm_t nf = UNINORM_NFKC; 17 | 18 | for (i = 1; i < argc; ++i) { 19 | if (!strcmp(argv[i], "-nfkc")) { 20 | nf = UNINORM_NFKC; 21 | continue; 22 | } 23 | if (!strcmp(argv[i], "-nfkd")) { 24 | nf = UNINORM_NFKD; 25 | continue; 26 | } 27 | if (!strcmp(argv[i], "-nfc")) { 28 | nf = UNINORM_NFC; 29 | continue; 30 | } 31 | if (!strcmp(argv[i], "-nfd")) { 32 | nf = UNINORM_NFD; 33 | continue; 34 | } 
35 | if (argv[i][0] == '-') { 36 | fprintf(stderr, "unrecognized option: %s\n", argv[i]); 37 | return EXIT_FAILURE; 38 | } 39 | 40 | size_t len; 41 | uint8_t *src = readfile(argv[i], &len); 42 | if (!src) { 43 | fprintf(stderr, "error reading %s\n", argv[i]); 44 | return EXIT_FAILURE; 45 | } 46 | 47 | size_t destlen; 48 | uint8_t *dest; 49 | mytime start = gettime(); 50 | for (int i = 0; i < 100; ++i) { 51 | dest = u8_normalize(nf, src, len, NULL, &destlen); 52 | if (!dest) return EXIT_FAILURE; 53 | free(dest); 54 | } 55 | printf("%s: %g\n", argv[i], elapsed(gettime(), start) / 100); 56 | free(src); 57 | } 58 | 59 | return EXIT_SUCCESS; 60 | } 61 | -------------------------------------------------------------------------------- /bench/bench.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #include "utf8proc.h" 6 | #include "util.h" 7 | 8 | int main(int argc, char **argv) 9 | { 10 | int i, j; 11 | int options = 0; 12 | 13 | for (i = 1; i < argc; ++i) { 14 | if (!strcmp(argv[i], "-nfkc")) { 15 | options |= UTF8PROC_STABLE|UTF8PROC_COMPOSE|UTF8PROC_COMPAT; 16 | continue; 17 | } 18 | if (!strcmp(argv[i], "-nfkd")) { 19 | options |= UTF8PROC_STABLE|UTF8PROC_DECOMPOSE|UTF8PROC_COMPAT; 20 | continue; 21 | } 22 | if (!strcmp(argv[i], "-nfc")) { 23 | options |= UTF8PROC_STABLE|UTF8PROC_COMPOSE; 24 | continue; 25 | } 26 | if (!strcmp(argv[i], "-nfd")) { 27 | options |= UTF8PROC_STABLE|UTF8PROC_DECOMPOSE; 28 | continue; 29 | } 30 | if (!strcmp(argv[i], "-casefold")) { 31 | options |= UTF8PROC_CASEFOLD; 32 | continue; 33 | } 34 | if (argv[i][0] == '-') { 35 | fprintf(stderr, "unrecognized option: %s\n", argv[i]); 36 | return EXIT_FAILURE; 37 | } 38 | 39 | size_t len; 40 | uint8_t *src = readfile(argv[i], &len); 41 | if (!src) { 42 | fprintf(stderr, "error reading %s\n", argv[i]); 43 | return EXIT_FAILURE; 44 | } 45 | uint8_t *dest; 46 | mytime start = gettime(); 47 | for (j = 0; j < 100; ++j) { 
48 | utf8proc_map(src, len, &dest, options); 49 | free(dest); 50 | } 51 | printf("%s: %g\n", argv[i], elapsed(gettime(), start) / 100); 52 | free(src); 53 | } 54 | 55 | return EXIT_SUCCESS; 56 | } 57 | -------------------------------------------------------------------------------- /appveyor.yml: -------------------------------------------------------------------------------- 1 | branches: 2 | only: 3 | - master 4 | - /release-.*/ 5 | 6 | notifications: 7 | - provider: Email 8 | on_build_success: false 9 | on_build_failure: false 10 | on_build_status_changed: false 11 | 12 | build_script: 13 | - ps: if ($env:APPVEYOR_PULL_REQUEST_NUMBER -and $env:APPVEYOR_BUILD_NUMBER -ne ((Invoke-RestMethod ` 14 | https://ci.appveyor.com/api/projects/$env:APPVEYOR_ACCOUNT_NAME/$env:APPVEYOR_PROJECT_SLUG/history?recordsNumber=50).builds | ` 15 | Where-Object pullRequestId -eq $env:APPVEYOR_PULL_REQUEST_NUMBER)[0].buildNumber) { ` 16 | throw "There are newer queued builds for this pull request, failing early." } 17 | - mkdir msvc_static 18 | - cd msvc_static 19 | - cmake .. -DUTF8PROC_ENABLE_TESTING=On 20 | - cmake --build . 21 | - ctest 22 | - mkdir ..\msvc_shared 23 | - cd ..\msvc_shared 24 | - cmake .. -DBUILD_SHARED_LIBS=ON -DUTF8PROC_ENABLE_TESTING=On 25 | - cmake --build . 26 | - ctest 27 | - set PATH=C:\MinGW\bin;%PATH% 28 | - C:\MinGW\msys\1.0\bin\sh --login -c " 29 | cd /c/projects/utf8proc && 30 | mkdir mingw_static && 31 | cd mingw_static && 32 | cmake .. -DCMAKE_VERBOSE_MAKEFILE=ON -DUTF8PROC_ENABLE_TESTING=On -G'MSYS Makefiles' && 33 | make && 34 | ctest && 35 | mkdir ../mingw_shared && 36 | cd ../mingw_shared && 37 | cmake .. -DCMAKE_VERBOSE_MAKEFILE=ON -DBUILD_SHARED_LIBS=ON -DUTF8PROC_ENABLE_TESTING=On -G'MSYS Makefiles' && 38 | make && 39 | ctest 40 | " 41 | 42 | on_finish: 43 | # Uncomment the following line for interactive debugging, which 44 | # will print login data for a temporary remote session after the 45 | # build. 
This requires an RDP version 6 client, e.g., FreeRDP. 46 | #- ps: $blockRdp = $true; iex ((new-object net.webclient).DownloadString('https://raw.githubusercontent.com/appveyor/ci/master/scripts/enable-rdp.ps1')) 47 | -------------------------------------------------------------------------------- /.github/workflows/cmake.yml: -------------------------------------------------------------------------------- 1 | name: CMake 2 | 3 | on: 4 | push: 5 | branches: 6 | - master 7 | - 'release-*' 8 | pull_request: 9 | # run on all pr 10 | 11 | jobs: 12 | build: 13 | strategy: 14 | matrix: 15 | os: [ubuntu-latest, windows-latest, macOS-latest] 16 | shared: ["ON", "OFF"] 17 | runs-on: ${{ matrix.os }} 18 | name: ${{ matrix.os }} - shared=${{ matrix.shared }} 19 | steps: 20 | - uses: actions/checkout@v4 21 | - name: Build 22 | run: | 23 | mkdir build 24 | cmake -S . -B build -DBUILD_SHARED_LIBS=${{ matrix.shared }} -DUTF8PROC_ENABLE_TESTING=ON 25 | cmake --build build 26 | - name: Run Test 27 | run: ctest --test-dir build -V 28 | - name: Upload shared lib 29 | if: matrix.shared == 'ON' 30 | uses: actions/upload-artifact@v4 31 | with: 32 | name: ${{ matrix.os }} 33 | path: | 34 | build/libutf8proc.* 35 | build/Debug/utf8proc.* 36 | 37 | mingw: 38 | strategy: 39 | matrix: 40 | os: [windows-latest] 41 | shared: ["ON", "OFF"] 42 | runs-on: ${{ matrix.os }} 43 | name: mingw64 - shared=${{ matrix.shared }} 44 | defaults: 45 | run: 46 | shell: msys2 {0} 47 | steps: 48 | - uses: actions/checkout@v4 49 | - uses: msys2/setup-msys2@v2 50 | with: 51 | install: gcc make mingw-w64-x86_64-cmake 52 | - name: Build 53 | run: | 54 | mkdir build 55 | cmake -S . 
-B build -DBUILD_SHARED_LIBS=${{ matrix.shared }} -DUTF8PROC_ENABLE_TESTING=ON -G'MSYS Makefiles' 56 | cmake --build build 57 | - name: Run Test 58 | run: ctest --test-dir build -V 59 | - name: Upload shared lib 60 | if: matrix.shared == 'ON' 61 | uses: actions/upload-artifact@v4 62 | with: 63 | name: windows-mingw64 64 | path: build/libutf8proc.* 65 | -------------------------------------------------------------------------------- /bench/icu.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | /* ICU4C */ 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | #include "util.h" 11 | 12 | int main(int argc, char **argv) 13 | { 14 | int i; 15 | 16 | UErrorCode err; 17 | UConverter *uc = ucnv_open("UTF8", &err); 18 | if (U_FAILURE(err)) return EXIT_FAILURE; 19 | 20 | const UNormalizer2 *NFKC = unorm2_getNFKCInstance(&err); 21 | if (U_FAILURE(err)) return EXIT_FAILURE; 22 | 23 | for (i = 1; i < argc; ++i) { 24 | if (argv[i][0] == '-') { 25 | fprintf(stderr, "unrecognized option: %s\n", argv[i]); 26 | return EXIT_FAILURE; 27 | } 28 | 29 | size_t len; 30 | uint8_t *src = readfile(argv[i], &len); 31 | if (!src) { 32 | fprintf(stderr, "error reading %s\n", argv[i]); 33 | return EXIT_FAILURE; 34 | } 35 | 36 | /* convert UTF8 data to ICU's UTF16 */ 37 | UChar *usrc = (UChar*) malloc(2*len * sizeof(UChar)); 38 | ucnv_toUChars(uc, usrc, 2*len, (char*) src, len, &err); 39 | if (U_FAILURE(err)) return EXIT_FAILURE; 40 | size_t ulen = u_strlen(usrc); 41 | 42 | /* ICU's insane normalization API requires you to 43 | know the size of the destination buffer in advance, 44 | or alternatively to repeatedly try normalizing and 45 | double the buffer size until it succeeds. Here, I just 46 | allocate a huge destination buffer to avoid the issue. 
*/ 47 | UChar *udest = (UChar*) malloc(10*ulen * sizeof(UChar)); 48 | 49 | mytime start = gettime(); 50 | for (int i = 0; i < 100; ++i) { 51 | unorm2_normalize(NFKC, usrc, ulen, udest, 10*ulen, &err); 52 | if (U_FAILURE(err)) return EXIT_FAILURE; 53 | } 54 | printf("%s: %g\n", argv[i], elapsed(gettime(), start) / 100); 55 | free(udest); 56 | free(usrc); 57 | free(src); 58 | } 59 | 60 | return EXIT_SUCCESS; 61 | } 62 | -------------------------------------------------------------------------------- /test/tests.c: -------------------------------------------------------------------------------- 1 | /* Common functions for our test programs. */ 2 | 3 | #include "tests.h" 4 | 5 | size_t lineno = 0; 6 | 7 | void check(int cond, const char *format, ...) 8 | { 9 | if (!cond) { 10 | va_list args; 11 | fprintf(stderr, "line %zd: ", lineno); 12 | va_start(args, format); 13 | vfprintf(stderr, format, args); 14 | va_end(args); 15 | fprintf(stderr, "\n"); 16 | exit(1); 17 | } 18 | } 19 | 20 | size_t skipspaces(const unsigned char *buf, size_t i) 21 | { 22 | while (isspace(buf[i])) ++i; 23 | return i; 24 | } 25 | 26 | /* if buf points to a sequence of codepoints encoded as hexadecimal strings, 27 | separated by whitespace, and terminated by any character not in 28 | [0-9a-fA-F] or whitespace, then stores the corresponding utf8 string 29 | in dest, returning the number of bytes read from buf */ 30 | size_t encode(unsigned char *dest, const unsigned char *buf) 31 | { 32 | size_t i = 0, j; 33 | utf8proc_ssize_t d = 0; 34 | for (;;) { 35 | int c; 36 | i = skipspaces(buf, i); 37 | for (j=i; buf[j] && strchr("0123456789abcdef", tolower(buf[j])); ++j) 38 | ; /* find end of hex input */ 39 | if (j == i) { /* no codepoint found */ 40 | dest[d] = 0; /* NUL-terminate destination string */ 41 | return i + 1; 42 | } 43 | check(sscanf((char *) (buf + i), "%x", (unsigned int *)&c) == 1, "invalid hex input %s", buf+i); 44 | i = j; /* skip to char after hex input */ 45 | d += 
/* Minimal, portable stand-in for getline(), good enough for the tests.
 *
 * Copies bytes from f into buf until a newline, end of file, or 8191 bytes
 * have been stored, whichever comes first.  The newline is consumed but
 * not stored, and buf is always NUL-terminated.  Returns the number of
 * bytes stored, so both a blank line and end of file return 0. */
size_t simple_getline(unsigned char buf[8192], FILE *f) {
    unsigned char *out = buf;
    unsigned char *const limit = buf + 8191; /* keep one byte for the NUL */
    int ch;

    for (; out != limit; ++out) {
        ch = getc(f);
        if (ch == '\n' || ch == EOF)
            break;
        *out = (unsigned char) ch;
    }
    *out = 0;
    return (size_t) (out - buf);
}
CHECK_NORM(NFD, NFKD, NFKD); 47 | 48 | CHECK_NORM(NFKC, NFKC, source); 49 | CHECK_NORM(NFKC, NFKC, NFC); 50 | CHECK_NORM(NFKC, NFKC, NFD); 51 | CHECK_NORM(NFKC, NFKC, NFKC); 52 | CHECK_NORM(NFKC, NFKC, NFKD); 53 | 54 | CHECK_NORM(NFKD, NFKD, source); 55 | CHECK_NORM(NFKD, NFKD, NFC); 56 | CHECK_NORM(NFKD, NFKD, NFD); 57 | CHECK_NORM(NFKD, NFKD, NFKC); 58 | CHECK_NORM(NFKD, NFKD, NFKD); 59 | } 60 | fclose(f); 61 | printf("Passed tests after %zd lines!\n", lineno); 62 | return 0; 63 | } 64 | -------------------------------------------------------------------------------- /test/iscase.c: -------------------------------------------------------------------------------- 1 | #include "tests.h" 2 | 3 | int read_range(FILE *f, utf8proc_int32_t *start, utf8proc_int32_t *end) 4 | { 5 | unsigned char buf[8192]; 6 | size_t len = simple_getline(buf, f); 7 | size_t pos = skipspaces(buf, 0); 8 | unsigned char s[16]; 9 | if (pos == len || buf[pos] == '#') return 0; 10 | pos += encode(s, buf + pos) - 1; 11 | check(s[0], "invalid line %s in data", buf); 12 | utf8proc_iterate((utf8proc_uint8_t*) s, -1, start); 13 | if (buf[pos] == '.' 
&& buf[pos+1] == '.') { 14 | encode(s, buf + pos + 2); 15 | check(s[0], "invalid line %s in data", buf); 16 | utf8proc_iterate((utf8proc_uint8_t*) s, -1, end); 17 | } 18 | else 19 | *end = *start; 20 | return 1; 21 | } 22 | 23 | int test_iscase(const char *fname, int (*iscase)(utf8proc_int32_t), 24 | utf8proc_int32_t (*thatcase)(utf8proc_int32_t)) 25 | { 26 | FILE *f = fopen(fname, "r"); 27 | int lines = 0, tests = 0, success = 1; 28 | utf8proc_int32_t c = 0; 29 | 30 | check(f != NULL, "error opening data file \"%s\"\n", fname); 31 | 32 | while (success && !feof(f)) { 33 | utf8proc_int32_t start, end; 34 | if (read_range(f, &start, &end)) { 35 | for (; c < start; ++c) { 36 | check(!iscase(c), "failed !iscase(%04x) in %s\n", c, fname); 37 | } 38 | for (; c <= end; ++c) { 39 | check(iscase(c), "failed iscase(%04x) in %s\n", c, fname); 40 | check(thatcase(c) == c, "inconsistent thatcase(%04x) in %s\n", c, fname); 41 | ++tests; 42 | } 43 | } 44 | ++lines; 45 | } 46 | for (; c <= 0x110000; ++c) { 47 | check(!iscase(c), "failed !iscase(%04x) in %s\n", c, fname); 48 | } 49 | 50 | printf("Checked %d characters from %d lines of %s\n", tests, lines, fname); 51 | fclose(f); 52 | return success; 53 | } 54 | 55 | int main(int argc, char **argv) 56 | { 57 | check(argc == 3, "Expected Lowercase.txt and Uppercase.txt as arguments"); 58 | check(test_iscase(argv[1], utf8proc_islower, utf8proc_tolower), "Lowercase tests failed"); 59 | check(test_iscase(argv[2], utf8proc_isupper, utf8proc_toupper), "Uppercase tests failed"); 60 | printf("utf8proc iscase tests SUCCEEDED.\n"); 61 | return 0; 62 | } 63 | -------------------------------------------------------------------------------- /test/printproperty.c: -------------------------------------------------------------------------------- 1 | /* simple test program to print out the utf8proc properties for a codepoint */ 2 | 3 | #include "tests.h" 4 | 5 | int main(int argc, char **argv) 6 | { 7 | int i; 8 | 9 | for (i = 1; i < argc; ++i) { 
10 | utf8proc_uint8_t cstr[16], *map; 11 | utf8proc_uint32_t x; 12 | utf8proc_int32_t c; 13 | if (!strcmp(argv[i], "-V")) { 14 | printf("utf8proc version %s\n", utf8proc_version()); 15 | continue; 16 | } 17 | check(sscanf(argv[i],"%x", &x) == 1, "invalid hex input %s", argv[i]); 18 | c = (utf8proc_int32_t)x; 19 | const utf8proc_property_t *p = utf8proc_get_property(c); 20 | 21 | if (utf8proc_codepoint_valid(c)) 22 | cstr[utf8proc_encode_char(c, cstr)] = 0; 23 | else 24 | strcat((char*)cstr, "N/A"); 25 | utf8proc_map(cstr, 0, &map, UTF8PROC_NULLTERM | UTF8PROC_CASEFOLD); 26 | 27 | printf("U+%s: %s\n" 28 | " category = %s\n" 29 | " combining_class = %d\n" 30 | " bidi_class = %d\n" 31 | " decomp_type = %d\n" 32 | " uppercase_mapping = %04x (seqindex %04x)%s\n" 33 | " lowercase_mapping = %04x (seqindex %04x)%s\n" 34 | " titlecase_mapping = %04x (seqindex %04x)\n" 35 | " casefold = %s\n" 36 | " comb_index = %d\n" 37 | " comb_length = %d\n" 38 | " comb_issecond = %d\n" 39 | " bidi_mirrored = %d\n" 40 | " comp_exclusion = %d\n" 41 | " ignorable = %d\n" 42 | " control_boundary = %d\n" 43 | " boundclass = %d\n" 44 | " indic_conjunct_break = %d\n" 45 | " charwidth = %d\n", 46 | argv[i], (char*) cstr, 47 | utf8proc_category_string(c), 48 | p->combining_class, 49 | p->bidi_class, 50 | p->decomp_type, 51 | utf8proc_toupper(c), p->uppercase_seqindex, utf8proc_isupper(c) ? " (isupper)" : "", 52 | utf8proc_tolower(c), p->lowercase_seqindex, utf8proc_islower(c) ? 
" (islower)" : "", 53 | utf8proc_totitle(c), p->titlecase_seqindex, 54 | (char *) map, 55 | p->comb_index, 56 | p->comb_length, 57 | p->comb_issecond, 58 | p->bidi_mirrored, 59 | p->comp_exclusion, 60 | p->ignorable, 61 | p->control_boundary, 62 | p->boundclass, 63 | p->indic_conjunct_break, 64 | utf8proc_charwidth(c)); 65 | free(map); 66 | } 67 | return 0; 68 | } 69 | -------------------------------------------------------------------------------- /test/misc.c: -------------------------------------------------------------------------------- 1 | /* Miscellaneous tests, e.g. regression tests */ 2 | 3 | #include "tests.h" 4 | 5 | static void issue128(void) /* #128 */ 6 | { 7 | utf8proc_uint8_t input[] = {0x72, 0xcc, 0x87, 0xcc, 0xa3, 0x00}; /* "r\u0307\u0323" */ 8 | utf8proc_uint8_t nfc[] = {0xe1, 0xb9, 0x9b, 0xcc, 0x87, 0x00}; /* "\u1E5B\u0307" */ 9 | utf8proc_uint8_t nfd[] = {0x72, 0xcc, 0xa3, 0xcc, 0x87, 0x00}; /* "r\u0323\u0307" */ 10 | utf8proc_uint8_t *nfc_out, *nfd_out; 11 | nfc_out = utf8proc_NFC(input); 12 | printf("NFC \"%s\" -> \"%s\" vs. \"%s\"\n", (char*)input, (char*)nfc_out, (char*)nfc); 13 | check(strlen((char*) nfc_out) == 5, "incorrect nfc length"); 14 | check(!memcmp(nfc, nfc_out, 6), "incorrect nfc data"); 15 | nfd_out = utf8proc_NFD(input); 16 | printf("NFD \"%s\" -> \"%s\" vs. 
\"%s\"\n", (char*)input, (char*)nfd_out, (char*)nfd); 17 | check(strlen((char*) nfd_out) == 5, "incorrect nfd length"); 18 | check(!memcmp(nfd, nfd_out, 6), "incorrect nfd data"); 19 | free(nfd_out); free(nfc_out); 20 | } 21 | 22 | static void issue102(void) /* #128 */ 23 | { 24 | utf8proc_uint8_t input[] = {0x58, 0xe2, 0x81, 0xa5, 0x45, 0xcc, 0x80, 0xc2, 0xad, 0xe1, 0xb4, 0xac, 0x00}; /* "X\u2065E\u0300\u00ad\u1d2c" */ 25 | utf8proc_uint8_t stripna[] = {0x78, 0xc3, 0xa8, 0x61, 0x00}; /* "x\u00e8a" */ 26 | utf8proc_uint8_t correct[] = {0x78, 0xe2, 0x81, 0xa5, 0xc3, 0xa8, 0x61, 0x00}; /* "x\u2065\u00e8a" */ 27 | utf8proc_uint8_t *output; 28 | utf8proc_map(input, 0, &output, UTF8PROC_NULLTERM | UTF8PROC_STABLE | 29 | UTF8PROC_COMPOSE | UTF8PROC_COMPAT | UTF8PROC_CASEFOLD | UTF8PROC_IGNORE | UTF8PROC_STRIPNA); 30 | printf("NFKC_Casefold \"%s\" -> \"%s\" vs. \"%s\"\n", (char*)input, (char*)output, (char*)stripna); 31 | check(strlen((char*) output) == 4, "incorrect NFKC_Casefold+stripna length"); 32 | check(!memcmp(stripna, output, 5), "incorrect NFKC_Casefold+stripna data"); 33 | free(output); 34 | output = utf8proc_NFKC_Casefold(input); 35 | printf("NFKC_Casefold \"%s\" -> \"%s\" vs. 
\"%s\"\n", (char*)input, (char*)output, (char*)correct); 36 | check(strlen((char*) output) == 7, "incorrect NFKC_Casefold length"); 37 | check(!memcmp(correct, output, 8), "incorrect NFKC_Casefold data"); 38 | free(output); 39 | } 40 | 41 | int main(void) 42 | { 43 | issue128(); 44 | issue102(); 45 | #ifdef UNICODE_VERSION 46 | printf("Unicode version: Makefile has %s, has API %s\n", UNICODE_VERSION, utf8proc_unicode_version()); 47 | check(!strcmp(UNICODE_VERSION, utf8proc_unicode_version()), "utf8proc_unicode_version mismatch"); 48 | #endif 49 | printf("Misc tests SUCCEEDED.\n"); 50 | return 0; 51 | } 52 | -------------------------------------------------------------------------------- /data/Makefile: -------------------------------------------------------------------------------- 1 | # Unicode data generation rules. Except for the test data files, most 2 | # users will not use these Makefile rules, which are primarily to re-generate 3 | # unicode_data.c when we get a new Unicode version or charwidth data; they 4 | # require julia to be installed. 5 | 6 | # programs 7 | CURL=curl 8 | PERL=perl 9 | MAKE=make 10 | JULIA=julia 11 | CURLFLAGS = --retry 5 --location 12 | 13 | .PHONY: clean 14 | 15 | .DELETE_ON_ERROR: 16 | 17 | RAWDATA = UnicodeData.txt GraphemeBreakProperty.txt DerivedCoreProperties.txt CompositionExclusions.txt CaseFolding.txt EastAsianWidth.txt emoji-data.txt 18 | 19 | utf8proc_data.c.new: data_generator.jl $(RAWDATA) 20 | $(JULIA) --project=. -e 'using Pkg; Pkg.instantiate()' 21 | $(JULIA) --project=. 
data_generator.jl > $@ 22 | 23 | # Unicode data version (must also update utf8proc_unicode_version function) 24 | UNICODE_VERSION=16.0.0 25 | # URLCACHE: optional prefix prepended to each download URL; it is not defined in this Makefile, so it expands to nothing unless the caller supplies it 26 | UnicodeData.txt: 27 | $(CURL) $(CURLFLAGS) -o $@ $(URLCACHE)https://www.unicode.org/Public/$(UNICODE_VERSION)/ucd/UnicodeData.txt 28 | 29 | EastAsianWidth.txt: 30 | $(CURL) $(CURLFLAGS) -o $@ $(URLCACHE)https://www.unicode.org/Public/$(UNICODE_VERSION)/ucd/EastAsianWidth.txt 31 | 32 | GraphemeBreakProperty.txt: 33 | $(CURL) $(CURLFLAGS) -o $@ $(URLCACHE)https://www.unicode.org/Public/$(UNICODE_VERSION)/ucd/auxiliary/GraphemeBreakProperty.txt 34 | 35 | DerivedCoreProperties.txt: 36 | $(CURL) $(CURLFLAGS) -o $@ $(URLCACHE)https://www.unicode.org/Public/$(UNICODE_VERSION)/ucd/DerivedCoreProperties.txt 37 | 38 | CompositionExclusions.txt: 39 | $(CURL) $(CURLFLAGS) -o $@ $(URLCACHE)https://www.unicode.org/Public/$(UNICODE_VERSION)/ucd/CompositionExclusions.txt 40 | 41 | CaseFolding.txt: 42 | $(CURL) $(CURLFLAGS) -o $@ $(URLCACHE)https://www.unicode.org/Public/$(UNICODE_VERSION)/ucd/CaseFolding.txt 43 | 44 | NormalizationTest.txt: 45 | $(CURL) $(CURLFLAGS) -o $@ $(URLCACHE)https://www.unicode.org/Public/$(UNICODE_VERSION)/ucd/NormalizationTest.txt 46 | 47 | GraphemeBreakTest.txt: 48 | $(CURL) $(CURLFLAGS) -o $@ $(URLCACHE)https://www.unicode.org/Public/$(UNICODE_VERSION)/ucd/auxiliary/GraphemeBreakTest.txt 49 | 50 | emoji-data.txt: 51 | $(CURL) $(CURLFLAGS) -o $@ $(URLCACHE)https://unicode.org/Public/$(UNICODE_VERSION)/ucd/emoji/emoji-data.txt 52 | 53 | Uppercase.txt: DerivedCoreProperties.txt 54 | $(JULIA) -e 'print(match(r"# Derived Property: Uppercase.*?# Total code points:"s, read("DerivedCoreProperties.txt", String)).match)' > $@ 55 | 56 | Lowercase.txt: DerivedCoreProperties.txt 57 | $(JULIA) -e 'print(match(r"# Derived Property: Lowercase.*?# Total code points:"s, read("DerivedCoreProperties.txt", String)).match)' > $@ 58 | 59 | clean: 60 | rm -f $(RAWDATA) NormalizationTest.txt GraphemeBreakTest.txt 61 | rm -f Uppercase.txt 
Lowercase.txt 62 | rm -f utf8proc_data.c.new 63 | -------------------------------------------------------------------------------- /test/fuzzer.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) 5 | { 6 | if(size < 1) return 0; 7 | 8 | /* Avoid timeout with long inputs */ 9 | if(size > (64 * 1024)) return 0; 10 | 11 | if(data[size-1] != '\0') return 0; 12 | 13 | const uint8_t* ptr = data; 14 | utf8proc_int32_t c = 0, c_prev = 0, state = 0; 15 | utf8proc_option_t options; 16 | utf8proc_ssize_t ret, bytes = 0; 17 | utf8proc_uint8_t *str = NULL; 18 | size_t len = strlen((const char*)data); 19 | 20 | while(bytes != len) 21 | { 22 | ret = utf8proc_iterate(ptr, -1, &c); 23 | 24 | if(ret < 0 || ret == 0) break; 25 | 26 | bytes += ret; 27 | ptr += ret; 28 | 29 | utf8proc_tolower(c); 30 | utf8proc_toupper(c); 31 | utf8proc_totitle(c); 32 | utf8proc_islower(c); 33 | utf8proc_isupper(c); 34 | utf8proc_charwidth(c); 35 | utf8proc_category(c); 36 | utf8proc_category_string(c); 37 | utf8proc_codepoint_valid(c); 38 | 39 | utf8proc_grapheme_break(c_prev, c); 40 | utf8proc_grapheme_break_stateful(c_prev, c, &state); 41 | 42 | c_prev = c; 43 | } 44 | 45 | utf8proc_int32_t *copy = size >= 4 ? 
NULL : malloc(size); 46 | 47 | if(copy) 48 | { 49 | size /= 4; 50 | 51 | options = UTF8PROC_STRIPCC | UTF8PROC_NLF2LS | UTF8PROC_NLF2PS; 52 | memcpy(copy, data, size); 53 | utf8proc_normalize_utf32(copy, size, options); 54 | 55 | options = UTF8PROC_STRIPCC | UTF8PROC_NLF2LS; 56 | memcpy(copy, data, size); 57 | utf8proc_normalize_utf32(copy, size, options); 58 | 59 | options = UTF8PROC_STRIPCC | UTF8PROC_NLF2PS; 60 | memcpy(copy, data, size); 61 | utf8proc_normalize_utf32(copy, size, options); 62 | 63 | options = UTF8PROC_STRIPCC; 64 | memcpy(copy, data, size); 65 | utf8proc_normalize_utf32(copy, size, options); 66 | 67 | options = UTF8PROC_LUMP; 68 | memcpy(copy, data, size); 69 | utf8proc_normalize_utf32(copy, size, options); 70 | 71 | options = 0; 72 | memcpy(copy, data, size); 73 | utf8proc_normalize_utf32(copy, size, options); 74 | 75 | free(copy); 76 | } 77 | 78 | free(utf8proc_NFD(data)); 79 | free(utf8proc_NFC(data)); 80 | free(utf8proc_NFKD(data)); 81 | free(utf8proc_NFKC(data)); 82 | free(utf8proc_NFKC_Casefold(data)); 83 | 84 | utf8proc_map(data, len, &str, UTF8PROC_CHARBOUND | UTF8PROC_STRIPNA); 85 | free(str); 86 | 87 | utf8proc_map(data, len, &str, UTF8PROC_LUMP | UTF8PROC_NLF2LS | UTF8PROC_NLF2PS); 88 | free(str); 89 | 90 | utf8proc_map(data, len, &str, UTF8PROC_COMPOSE | UTF8PROC_STRIPMARK); 91 | free(str); 92 | 93 | return 0; 94 | } -------------------------------------------------------------------------------- /test/case.c: -------------------------------------------------------------------------------- 1 | #include "tests.h" 2 | #include 3 | 4 | int main(int argc, char **argv) 5 | { 6 | int error = 0, better = 0; 7 | utf8proc_int32_t c; 8 | 9 | (void) argc; /* unused */ 10 | (void) argv; /* unused */ 11 | 12 | /* some simple sanity tests of the character widths */ 13 | for (c = 0; c <= 0x110000; ++c) { 14 | utf8proc_int32_t l = utf8proc_tolower(c); 15 | utf8proc_int32_t u = utf8proc_toupper(c); 16 | utf8proc_int32_t t = utf8proc_totitle(c); 17 | 
18 | check(l == c || utf8proc_codepoint_valid(l), "invalid tolower"); 19 | check(u == c || utf8proc_codepoint_valid(u), "invalid toupper"); 20 | check(t == c || utf8proc_codepoint_valid(t), "invalid totitle"); 21 | 22 | if (utf8proc_codepoint_valid(c) && (l == u) != (l == t) && 23 | /* Unicode 11: Georgian Mkhedruli chars have uppercase but no titlecase. */ 24 | !(((c >= 0x10d0 && c <= 0x10fa) || c >= (0x10fd && c <= 0x10ff)) && l != u)) { 25 | fprintf(stderr, "unexpected titlecase %x for lowercase %x / uppercase %x\n", t, l, c); 26 | ++error; 27 | } 28 | 29 | if (sizeof(wint_t) > 2 || (c < (1<<16) && u < (1<<16) && l < (1<<16))) { 30 | wint_t l0 = towlower((wint_t)c), u0 = towupper((wint_t)c); 31 | 32 | /* OS unicode tables may be out of date. But if they 33 | do have a lower/uppercase mapping, hopefully it 34 | is correct? */ 35 | if (l0 != (wint_t)c && l0 != (wint_t)l) { 36 | fprintf(stderr, "MISMATCH %x != towlower(%x) == %x\n", 37 | l, c, l0); 38 | ++error; 39 | } 40 | else if (l0 != (wint_t)l) { /* often true for out-of-date OS unicode */ 41 | ++better; 42 | /* printf("%x != towlower(%x) == %x\n", l, c, l0); */ 43 | } 44 | if (u0 != (wint_t)c && u0 != (wint_t)u) { 45 | fprintf(stderr, "MISMATCH %x != towupper(%x) == %x\n", 46 | u, c, u0); 47 | ++error; 48 | } 49 | else if (u0 != (wint_t)u) { /* often true for out-of-date OS unicode */ 50 | ++better; 51 | /* printf("%x != towupper(%x) == %x\n", u, c, u0); */ 52 | } 53 | } 54 | } 55 | check(!error, "utf8proc case conversion FAILED %d tests.", error); 56 | 57 | /* issue #130 */ 58 | check(utf8proc_toupper(0x00df) == 0x1e9e && 59 | utf8proc_totitle(0x00df) == 0x1e9e && 60 | utf8proc_tolower(0x00df) == 0x00df && 61 | utf8proc_tolower(0x1e9e) == 0x00df && 62 | utf8proc_toupper(0x1e9e) == 0x1e9e, 63 | "incorrect 0x00df/0x1e9e case conversions"); 64 | utf8proc_uint8_t str_00df[] = {0xc3, 0x9f, 0x00}; 65 | utf8proc_uint8_t str_1e9e[] = {0xe1, 0xba, 0x9e, 0x00}; 66 | utf8proc_uint8_t *s1 = 
utf8proc_NFKC_Casefold(str_00df); 67 | utf8proc_uint8_t *s2 = utf8proc_NFKC_Casefold(str_1e9e); 68 | check(!strcmp((char*)s1, "ss") && 69 | !strcmp((char*)s2, "ss"), 70 | "incorrect 0x00df/0x1e9e casefold normalization"); 71 | free(s1); 72 | free(s2); 73 | printf("More up-to-date than OS unicode tables for %d tests.\n", better); 74 | printf("utf8proc case conversion tests SUCCEEDED.\n"); 75 | return 0; 76 | } 77 | -------------------------------------------------------------------------------- /test/charwidth.c: -------------------------------------------------------------------------------- 1 | #include "tests.h" 2 | #include 3 | #include 4 | 5 | static int my_unassigned(int c) { 6 | int cat = utf8proc_get_property(c)->category; 7 | return (cat == UTF8PROC_CATEGORY_CN) || (cat == UTF8PROC_CATEGORY_CO); 8 | } 9 | 10 | static int my_isprint(int c) { 11 | int cat = utf8proc_get_property(c)->category; 12 | return (UTF8PROC_CATEGORY_LU <= cat && cat <= UTF8PROC_CATEGORY_ZS) || 13 | (c == 0x0601 || c == 0x0602 || c == 0x0603 || c == 0x06dd || c == 0x00ad) || 14 | (cat == UTF8PROC_CATEGORY_CN) || (cat == UTF8PROC_CATEGORY_CO); 15 | } 16 | 17 | int main(int argc, char **argv) 18 | { 19 | int c, error = 0, updates = 0; 20 | 21 | (void) argc; /* unused */ 22 | (void) argv; /* unused */ 23 | 24 | /* some simple sanity tests of the character widths */ 25 | for (c = 0; c <= 0x110000; ++c) { 26 | int cat = utf8proc_get_property(c)->category; 27 | int w = utf8proc_charwidth(c); 28 | int ambiguous = utf8proc_charwidth_ambiguous(c); 29 | if ((cat == UTF8PROC_CATEGORY_MN || cat == UTF8PROC_CATEGORY_ME) && w > 0) { 30 | fprintf(stderr, "nonzero width %d for combining char %x\n", w, c); 31 | error += 1; 32 | } 33 | if (w == 0 && 34 | ((cat >= UTF8PROC_CATEGORY_LU && cat <= UTF8PROC_CATEGORY_LO) || 35 | (cat >= UTF8PROC_CATEGORY_ND && cat <= UTF8PROC_CATEGORY_SC) || 36 | (cat >= UTF8PROC_CATEGORY_SO && cat <= UTF8PROC_CATEGORY_ZS))) { 37 | fprintf(stderr, "zero width for symbol-like 
char %x\n", c); 38 | error += 1; 39 | } 40 | if (c <= 127 && ((!isprint(c) && w > 0) || (isprint(c) && wcwidth(c) != w))) { 41 | fprintf(stderr, "wcwidth %d mismatch %d for %s ASCII %x\n", 42 | wcwidth(c), w, 43 | isprint(c) ? "printable" : "non-printable", c); 44 | error += 1; 45 | } 46 | if (c <= 127 && utf8proc_charwidth_ambiguous(c)) { 47 | fprintf(stderr, "ambiwith set for ASCII %x\n", c); 48 | error += 1; 49 | } 50 | if (!my_isprint(c) && w > 0) { 51 | fprintf(stderr, "non-printing %x had width %d\n", c, w); 52 | error += 1; 53 | } 54 | if (my_unassigned(c) && w != 1) { 55 | fprintf(stderr, "unexpected width %d for unassigned char %x\n", w, c); 56 | error += 1; 57 | } 58 | if (ambiguous && w >= 2) { 59 | fprintf(stderr, "char %x is both doublewidth and ambiguous\n", c); 60 | error += 1; 61 | } 62 | } 63 | check(!error, "utf8proc_charwidth FAILED %d tests.", error); 64 | 65 | check(utf8proc_charwidth(0x00ad) == 1, "incorrect width for U+00AD (soft hyphen)"); 66 | check(utf8proc_charwidth_ambiguous(0x00ad) , "incorrect ambiguous width for U+00AD (soft hyphen)"); 67 | check(utf8proc_charwidth(0xe000) == 1, "incorrect width for U+e000 (PUA)"); 68 | check(utf8proc_charwidth_ambiguous(0xe000), "incorrect ambiguous width for U+e000 (PUA)"); 69 | 70 | check(utf8proc_charwidth_ambiguous(0x00A1), "incorrect ambiguous width for U+00A1 (inverted exclamation mark)"); 71 | check(!utf8proc_charwidth_ambiguous(0x00A2), "incorrect ambiguous width for U+00A2 (cent sign)"); 72 | 73 | /* print some other information by compariing with system wcwidth */ 74 | printf("Mismatches with system wcwidth (not necessarily errors):\n"); 75 | for (c = 0; c <= 0x110000; ++c) { 76 | int w = utf8proc_charwidth(c); 77 | int wc = wcwidth(c); 78 | if (sizeof(wchar_t) == 2 && c >= (1<<16)) continue; 79 | /* lots of these errors for out-of-date system unicode tables */ 80 | if (wc == -1 && my_isprint(c) && !my_unassigned(c) && w > 0) 81 | updates += 1; 82 | if (wc == -1 && !my_isprint(c) && w > 0) 
83 | printf(" wcwidth(%x) = -1 for non-printable width-%d char\n", c, w); 84 | if (wc >= 0 && wc != w) 85 | printf(" wcwidth(%x) = %d != charwidth %d\n", c, wc, w); 86 | } 87 | printf(" ... (positive widths for %d chars unknown to wcwidth) ...\n", updates); 88 | printf("Character-width tests SUCCEEDED.\n"); 89 | 90 | return 0; 91 | } 92 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required (VERSION 3.10) 2 | 3 | include (utils.cmake) 4 | 5 | disallow_intree_builds() 6 | 7 | # API version - be sure to update utf8proc.h and Makefile, too! 8 | project (utf8proc VERSION 2.10.0 LANGUAGES C) 9 | 10 | # This is the ABI version number, which may differ from the 11 | # API version number (defined in utf8proc.h and above). 12 | # Be sure to also update these in Makefile and MANIFEST! 13 | set(SO_MAJOR 3) 14 | set(SO_MINOR 1) 15 | set(SO_PATCH 0) 16 | 17 | option(UTF8PROC_INSTALL "Enable installation of utf8proc" On) 18 | option(UTF8PROC_ENABLE_TESTING "Enable testing of utf8proc" Off) 19 | option(LIB_FUZZING_ENGINE "Fuzzing engine to link against" Off) 20 | 21 | add_library (utf8proc 22 | utf8proc.c 23 | utf8proc.h 24 | ) 25 | 26 | # expose header path, for when this is part of a larger cmake project 27 | target_include_directories(utf8proc PUBLIC .) 
28 | 29 | if (BUILD_SHARED_LIBS) 30 | # Building shared library 31 | else() 32 | # Building static library 33 | target_compile_definitions(utf8proc PUBLIC "UTF8PROC_STATIC") 34 | if (MSVC) 35 | set_target_properties(utf8proc PROPERTIES OUTPUT_NAME "utf8proc_static") 36 | endif() 37 | endif() 38 | 39 | target_compile_definitions(utf8proc PRIVATE "UTF8PROC_EXPORTS") 40 | 41 | if (NOT MSVC) 42 | set_target_properties( 43 | utf8proc PROPERTIES 44 | COMPILE_FLAGS "-O2 -std=c99 -pedantic -Wall" 45 | ) 46 | endif () 47 | 48 | set_target_properties (utf8proc PROPERTIES 49 | POSITION_INDEPENDENT_CODE ON 50 | VERSION "${SO_MAJOR}.${SO_MINOR}.${SO_PATCH}" 51 | SOVERSION ${SO_MAJOR} 52 | ) 53 | 54 | if (UTF8PROC_INSTALL) 55 | include(GNUInstallDirs) 56 | install(FILES utf8proc.h DESTINATION "${CMAKE_INSTALL_FULL_INCLUDEDIR}") 57 | install(TARGETS utf8proc 58 | ARCHIVE DESTINATION "${CMAKE_INSTALL_FULL_LIBDIR}" 59 | LIBRARY DESTINATION "${CMAKE_INSTALL_FULL_LIBDIR}" 60 | RUNTIME DESTINATION "${CMAKE_INSTALL_FULL_BINDIR}" 61 | ) 62 | configure_file(libutf8proc.pc.cmakein libutf8proc.pc @ONLY) 63 | install(FILES "${CMAKE_CURRENT_BINARY_DIR}/libutf8proc.pc" DESTINATION "${CMAKE_INSTALL_FULL_LIBDIR}/pkgconfig") 64 | endif() 65 | 66 | if(UTF8PROC_ENABLE_TESTING) 67 | enable_testing() 68 | file(MAKE_DIRECTORY data) 69 | set(UNICODE_VERSION 16.0.0) 70 | file(DOWNLOAD https://www.unicode.org/Public/${UNICODE_VERSION}/ucd/NormalizationTest.txt ${CMAKE_BINARY_DIR}/data/NormalizationTest.txt SHOW_PROGRESS) 71 | file(DOWNLOAD https://www.unicode.org/Public/${UNICODE_VERSION}/ucd/auxiliary/GraphemeBreakTest.txt ${CMAKE_BINARY_DIR}/data/GraphemeBreakTest.txt SHOW_PROGRESS) 72 | add_executable(case test/tests.h test/tests.c utf8proc.h test/case.c) 73 | target_link_libraries(case utf8proc) 74 | add_executable(custom test/tests.h test/tests.c utf8proc.h test/custom.c) 75 | target_link_libraries(custom utf8proc) 76 | add_executable(iterate test/tests.h test/tests.c utf8proc.h test/iterate.c) 77 
| target_link_libraries(iterate utf8proc) 78 | add_executable(misc test/tests.h test/tests.c utf8proc.h test/misc.c) 79 | target_link_libraries(misc utf8proc) 80 | add_executable(printproperty test/tests.h test/tests.c utf8proc.h test/printproperty.c) 81 | target_link_libraries(printproperty utf8proc) 82 | add_executable(valid test/tests.h test/tests.c utf8proc.h test/valid.c) 83 | target_link_libraries(valid utf8proc) 84 | add_test(utf8proc.testcase case) 85 | add_test(utf8proc.testcustom custom) 86 | add_test(utf8proc.testiterate iterate) 87 | add_test(utf8proc.testmisc misc) 88 | add_test(utf8proc.testprintproperty printproperty) 89 | add_test(utf8proc.testvalid valid) 90 | 91 | if (NOT WIN32) 92 | # no wcwidth function on Windows 93 | add_executable(charwidth test/tests.h test/tests.c utf8proc.h test/charwidth.c) 94 | target_link_libraries(charwidth utf8proc) 95 | add_test(utf8proc.testcharwidth charwidth) 96 | endif() 97 | add_executable(graphemetest test/tests.h test/tests.c utf8proc.h test/graphemetest.c) 98 | target_link_libraries(graphemetest utf8proc) 99 | add_executable(normtest test/tests.h test/tests.c utf8proc.h test/normtest.c) 100 | target_link_libraries(normtest utf8proc) 101 | add_test(utf8proc.testgraphemetest graphemetest data/GraphemeBreakTest.txt) 102 | add_test(utf8proc.testnormtest normtest data/NormalizationTest.txt) 103 | 104 | if(LIB_FUZZING_ENGINE) 105 | add_executable(fuzzer utf8proc.h test/fuzzer.c) 106 | target_link_libraries(fuzzer ${LIB_FUZZING_ENGINE} utf8proc) 107 | else() 108 | add_executable(fuzzer utf8proc.h test/fuzz_main.c test/fuzzer.c) 109 | target_link_libraries(fuzzer utf8proc) 110 | endif() 111 | endif() 112 | -------------------------------------------------------------------------------- /test/iterate.c: -------------------------------------------------------------------------------- 1 | #include "tests.h" 2 | #include 3 | #include 4 | 5 | static int tests; 6 | static int error; 7 | 8 | #define CHECKVALID(pos, val, 
len) buf[pos] = val; testbytes(buf,len,len,__LINE__) 9 | #define CHECKINVALID(pos, val, len) buf[pos] = val; testbytes(buf,len,UTF8PROC_ERROR_INVALIDUTF8,__LINE__) 10 | /* Decode the first len bytes of buf with utf8proc_iterate and record an error (printing the bytes and the caller's line) if the return value differs from retval. */ 11 | static void testbytes(utf8proc_uint8_t *buf, utf8proc_ssize_t len, utf8proc_ssize_t retval, int line) 12 | { 13 | utf8proc_int32_t out[16]; 14 | utf8proc_ssize_t ret; 15 | 16 | /* Make a copy to ensure that memory is left uninitialized after "len" 17 | * bytes. This way, Valgrind can detect overreads. 18 | */ 19 | utf8proc_uint8_t tmp[16]; 20 | memcpy(tmp, buf, (unsigned long int)len); 21 | 22 | tests++; 23 | if ((ret = utf8proc_iterate(tmp, len, out)) != retval) { 24 | fprintf(stderr, "Failed (%d):", line); 25 | for (utf8proc_ssize_t i = 0; i < len ; i++) { 26 | fprintf(stderr, " 0x%02x", tmp[i]); 27 | } 28 | fprintf(stderr, " -> %zd\n", ret); 29 | error++; 30 | } 31 | } 32 | 33 | int main(int argc, char **argv) 34 | { 35 | utf8proc_int32_t byt; 36 | utf8proc_uint8_t buf[16]; 37 | 38 | (void) argc; (void) argv; /* unused */ 39 | 40 | tests = error = 0; 41 | 42 | // Check valid sequences that were considered invalid erroneously before 43 | buf[0] = 0xef; 44 | buf[1] = 0xb7; 45 | for (byt = 0x90; byt < 0xa0; byt++) { 46 | CHECKVALID(2, byt, 3); 47 | } 48 | // Check 0xfffe and 0xffff 49 | buf[1] = 0xbf; 50 | CHECKVALID(2, 0xbe, 3); 51 | CHECKVALID(2, 0xbf, 3); 52 | // Check 0x??fffe & 0x??ffff 53 | for (byt = 0x1fffe; byt < 0x110000; byt += 0x10000) { 54 | buf[0] = 0xf0 | (byt >> 18); 55 | buf[1] = 0x80 | ((byt >> 12) & 0x3f); 56 | CHECKVALID(3, 0xbe, 4); 57 | CHECKVALID(3, 0xbf, 4); 58 | } 59 | 60 | // Continuation byte not after lead 61 | for (byt = 0x80; byt < 0xc0; byt++) { 62 | CHECKINVALID(0, byt, 1); 63 | } 64 | 65 | // Continuation byte not after lead (same loop as above, repeated) 66 | for (byt = 0x80; byt < 0xc0; byt++) { 67 | CHECKINVALID(0, byt, 1); 68 | } 69 | 70 | // Test lead bytes 71 | for (byt = 0xc0; byt <= 0xff; byt++) { 72 | // Single lead byte at end of string 73 | CHECKINVALID(0, byt, 1); 74 | // Lead 
followed by non-continuation character < 0x80 75 | CHECKINVALID(1, 65, 2); 76 | // Lead followed by non-continuation character > 0xbf 77 | CHECKINVALID(1, 0xc0, 2); 78 | } 79 | 80 | // Test overlong 2-byte 81 | buf[0] = 0xc0; 82 | for (byt = 0x81; byt <= 0xbf; byt++) { 83 | CHECKINVALID(1, byt, 2); 84 | } 85 | buf[0] = 0xc1; 86 | for (byt = 0x80; byt <= 0xbf; byt++) { 87 | CHECKINVALID(1, byt, 2); 88 | } 89 | 90 | // Test overlong 3-byte 91 | buf[0] = 0xe0; 92 | buf[2] = 0x80; 93 | for (byt = 0x80; byt <= 0x9f; byt++) { 94 | CHECKINVALID(1, byt, 3); 95 | } 96 | 97 | // Test overlong 4-byte 98 | buf[0] = 0xf0; 99 | buf[2] = 0x80; 100 | buf[3] = 0x80; 101 | for (byt = 0x80; byt <= 0x8f; byt++) { 102 | CHECKINVALID(1, byt, 4); 103 | } 104 | 105 | // Test 4-byte > 0x10ffff 106 | buf[0] = 0xf4; 107 | buf[2] = 0x80; 108 | buf[3] = 0x80; 109 | for (byt = 0x90; byt <= 0xbf; byt++) { 110 | CHECKINVALID(1, byt, 4); 111 | } 112 | buf[1] = 0x80; 113 | for (byt = 0xf5; byt <= 0xf7; byt++) { 114 | CHECKINVALID(0, byt, 4); 115 | } 116 | 117 | // Test 5-byte 118 | buf[4] = 0x80; 119 | for (byt = 0xf8; byt <= 0xfb; byt++) { 120 | CHECKINVALID(0, byt, 5); 121 | } 122 | 123 | // Test 6-byte 124 | buf[5] = 0x80; 125 | for (byt = 0xfc; byt <= 0xfd; byt++) { 126 | CHECKINVALID(0, byt, 6); 127 | } 128 | 129 | // Test 7-byte 130 | buf[6] = 0x80; 131 | CHECKINVALID(0, 0xfe, 7); 132 | 133 | // Three and above byte sequences 134 | for (byt = 0xe0; byt < 0xf0; byt++) { 135 | // Lead followed by only 1 continuation byte 136 | CHECKINVALID(0, byt, 2); 137 | // Lead ended by non-continuation character < 0x80 138 | CHECKINVALID(2, 65, 3); 139 | // Lead ended by non-continuation character > 0xbf 140 | CHECKINVALID(2, 0xc0, 3); 141 | } 142 | 143 | // 3-byte encoded surrogate character(s) 144 | buf[0] = 0xed; buf[2] = 0x80; 145 | // Single surrogate 146 | CHECKINVALID(1, 0xa0, 3); 147 | // Trailing surrogate first 148 | CHECKINVALID(1, 0xb0, 3); 149 | 150 | // Four byte sequences 151 | buf[1] = 
0x80; 152 | for (byt = 0xf0; byt < 0xf5; byt++) { 153 | // Lead followed by only 1 continuation bytes 154 | CHECKINVALID(0, byt, 2); 155 | // Lead followed by only 2 continuation bytes 156 | CHECKINVALID(0, byt, 3); 157 | // Lead followed by non-continuation character < 0x80 158 | CHECKINVALID(3, 65, 4); 159 | // Lead followed by non-continuation character > 0xbf 160 | CHECKINVALID(3, 0xc0, 4); 161 | 162 | } 163 | 164 | check(!error, "utf8proc_iterate FAILED %d tests out of %d", error, tests); 165 | printf("utf8proc_iterate tests SUCCEEDED, (%d) tests passed.\n", tests); 166 | 167 | return 0; 168 | } 169 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | ## utf8proc license ## 2 | 3 | **utf8proc** is a software package originally developed 4 | by Jan Behrens and the rest of the Public Software Group, who 5 | deserve nearly all of the credit for this library, that is now maintained by the Julia-language developers. Like the original utf8proc, 6 | whose copyright and license statements are reproduced below, all new 7 | work on the utf8proc library is licensed under the [MIT "expat" 8 | license](http://opensource.org/licenses/MIT): 9 | 10 | *Copyright © 2014-2021 by Steven G. 
Johnson, Jiahao Chen, Tony Kelman, Jonas Fonseca, and other contributors listed in the git history.* 11 | 12 | Permission is hereby granted, free of charge, to any person obtaining a 13 | copy of this software and associated documentation files (the "Software"), 14 | to deal in the Software without restriction, including without limitation 15 | the rights to use, copy, modify, merge, publish, distribute, sublicense, 16 | and/or sell copies of the Software, and to permit persons to whom the 17 | Software is furnished to do so, subject to the following conditions: 18 | 19 | The above copyright notice and this permission notice shall be included in 20 | all copies or substantial portions of the Software. 21 | 22 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 23 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 24 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 25 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 26 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 27 | FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 28 | DEALINGS IN THE SOFTWARE. 29 | 30 | ## Original utf8proc license ## 31 | 32 | *Copyright (c) 2009, 2013 Public Software Group e. V., Berlin, Germany* 33 | 34 | Permission is hereby granted, free of charge, to any person obtaining a 35 | copy of this software and associated documentation files (the "Software"), 36 | to deal in the Software without restriction, including without limitation 37 | the rights to use, copy, modify, merge, publish, distribute, sublicense, 38 | and/or sell copies of the Software, and to permit persons to whom the 39 | Software is furnished to do so, subject to the following conditions: 40 | 41 | The above copyright notice and this permission notice shall be included in 42 | all copies or substantial portions of the Software. 
43 | 44 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 45 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 46 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 47 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 48 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 49 | FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 50 | DEALINGS IN THE SOFTWARE. 51 | 52 | ## Unicode data license ## 53 | 54 | This software contains data (`utf8proc_data.c`) derived from processing 55 | the Unicode data files. The following license applies to that data: 56 | 57 | **COPYRIGHT AND PERMISSION NOTICE** 58 | 59 | *Copyright (c) 1991-2007 Unicode, Inc. All rights reserved. Distributed 60 | under the Terms of Use in http://www.unicode.org/copyright.html.* 61 | 62 | Permission is hereby granted, free of charge, to any person obtaining a 63 | copy of the Unicode data files and any associated documentation (the "Data 64 | Files") or Unicode software and any associated documentation (the 65 | "Software") to deal in the Data Files or Software without restriction, 66 | including without limitation the rights to use, copy, modify, merge, 67 | publish, distribute, and/or sell copies of the Data Files or Software, and 68 | to permit persons to whom the Data Files or Software are furnished to do 69 | so, provided that (a) the above copyright notice(s) and this permission 70 | notice appear with all copies of the Data Files or Software, (b) both the 71 | above copyright notice(s) and this permission notice appear in associated 72 | documentation, and (c) there is clear notice in each modified Data File or 73 | in the Software as well as in the documentation associated with the Data 74 | File(s) or Software that the data or software has been modified. 
75 | 76 | THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY 77 | KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 78 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF 79 | THIRD PARTY RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS 80 | INCLUDED IN THIS NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR 81 | CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF 82 | USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER 83 | TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR 84 | PERFORMANCE OF THE DATA FILES OR SOFTWARE. 85 | 86 | Except as contained in this notice, the name of a copyright holder shall 87 | not be used in advertising or otherwise to promote the sale, use or other 88 | dealings in these Data Files or Software without prior written 89 | authorization of the copyright holder. 90 | 91 | Unicode and the Unicode logo are trademarks of Unicode, Inc., and may be 92 | registered in some jurisdictions. All other trademarks and registered 93 | trademarks mentioned herein are the property of their respective owners. 94 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # utf8proc 2 | [![CI](https://github.com/JuliaStrings/utf8proc/actions/workflows/make.yml/badge.svg)](https://github.com/JuliaStrings/utf8proc/actions/workflows/make.yml) 3 | [![AppVeyor status](https://ci.appveyor.com/api/projects/status/ivaa0v6ikxrmm5r6?svg=true)](https://ci.appveyor.com/project/StevenGJohnson/utf8proc) 4 | 5 | [utf8proc](http://juliastrings.github.io/utf8proc/) is a small, clean C 6 | library that provides Unicode normalization, case-folding, and other 7 | operations for data in the [UTF-8 8 | encoding](http://en.wikipedia.org/wiki/UTF-8). 
It was [initially 9 | developed](http://www.public-software-group.org/utf8proc) by Jan 10 | Behrens and the rest of the [Public Software 11 | Group](http://www.public-software-group.org/), who deserve *nearly all 12 | of the credit* for this package. With the blessing of the Public 13 | Software Group, the [Julia developers](http://julialang.org/) have 14 | taken over development of utf8proc, since the original developers have 15 | moved to other projects. 16 | 17 | (utf8proc is used for basic Unicode 18 | support in the [Julia language](http://julialang.org/), and the Julia 19 | developers became involved because they wanted to add Unicode 7 support and other features.) 20 | 21 | (The original utf8proc package also includes Ruby and PostgreSQL plug-ins. 22 | We removed those from utf8proc in order to focus exclusively on the C 23 | library.) 24 | 25 | The utf8proc package is licensed under the 26 | free/open-source [MIT "expat" 27 | license](http://opensource.org/licenses/MIT) (plus certain Unicode 28 | data governed by the similarly permissive [Unicode data 29 | license](http://www.unicode.org/copyright.html#Exhibit1)); please see 30 | the included `LICENSE.md` file for more detailed information. 31 | 32 | ## Quick Start 33 | 34 | Typical users should download a [utf8proc release](http://juliastrings.github.io/utf8proc/releases/) rather than cloning directly from GitHub. 35 | 36 | For compilation of the C library, run `make`. You can also install the library and header file with `make install` (by default into `/usr/local/lib` and `/usr/local/include`, but this can be changed by `make prefix=/some/dir`). `make check` runs some tests, and `make clean` deletes all of the generated files. 37 | 38 | Alternatively, you can compile with `cmake`, e.g. by 39 | ```sh 40 | mkdir build 41 | cmake -S . 
-B build 42 | cmake --build build 43 | ``` 44 | 45 | ### Using other compilers 46 | The included `Makefile` supports GNU/Linux flavors and MacOS with `gcc`-like compilers; Windows users will typically use `cmake`. 47 | 48 | For other Unix-like systems and other compilers, you may need to pass modified settings to `make` in order to use the correct compilation flags for building shared libraries on your system. 49 | 50 | For HP-UX with HP's `aCC` compiler and GNU Make (installed as `gmake`), you can compile with 51 | ``` 52 | gmake CC=/opt/aCC/bin/aCC CFLAGS="+O2" PICFLAG="+z" C99FLAG="-Ae" WCFLAGS="+w" LDFLAG_SHARED="-b" SOFLAG="-Wl,+h" 53 | ``` 54 | To run `gmake install` you will need GNU coreutils for the `install` command, and you may want to pass `prefix=/opt libdir=/opt/lib/hpux32` or similar to change the installation location. 55 | 56 | ## General Information 57 | 58 | The C library is found in this directory after successful compilation 59 | and is named `libutf8proc.a` (for the static library) and 60 | `libutf8proc.so` (for the dynamic library). 61 | 62 | The Unicode version supported is 16.0.0. 63 | 64 | For Unicode normalizations, the following options are used: 65 | 66 | * Normalization Form C: `STABLE`, `COMPOSE` 67 | * Normalization Form D: `STABLE`, `DECOMPOSE` 68 | * Normalization Form KC: `STABLE`, `COMPOSE`, `COMPAT` 69 | * Normalization Form KD: `STABLE`, `DECOMPOSE`, `COMPAT` 70 | 71 | ## C Library 72 | 73 | The documentation for the C library is found in the `utf8proc.h` header file. 74 | `utf8proc_map` is the function you will most likely be using for mapping UTF-8 75 | strings, unless you want to allocate memory yourself. 76 | 77 | ## To Do 78 | 79 | See the Github [issues list](https://github.com/JuliaStrings/utf8proc/issues). 80 | 81 | ## Contact 82 | 83 | Bug reports, feature requests, and other queries can be filed at 84 | the [utf8proc issues page on Github](https://github.com/JuliaStrings/utf8proc/issues). 
85 | 86 | ## See also 87 | 88 | An independent Lua translation of this library, [lua-mojibake](https://github.com/differentprogramming/lua-mojibake), is also available. 89 | 90 | ## Examples 91 | 92 | ### Convert codepoint to string 93 | ```c 94 | // Convert codepoint `a` to utf8 string `str` 95 | utf8proc_int32_t a = 223; 96 | utf8proc_uint8_t str[16] = { 0 }; 97 | utf8proc_encode_char(a, str); 98 | printf("%s\n", str); 99 | // ß 100 | ``` 101 | 102 | ### Convert string to codepoint 103 | ```c 104 | // Decode the first codepoint of string `str` into `a` 105 | utf8proc_uint8_t str[] = "ß"; 106 | utf8proc_int32_t a; 107 | utf8proc_iterate(str, -1, &a); 108 | printf("%d\n", a); 109 | // 223 110 | ``` 111 | 112 | ### Casefold 113 | 114 | ```c 115 | // Convert "ß" (U+00DF) to its casefold variant "ss" 116 | utf8proc_uint8_t str[] = "ß"; 117 | utf8proc_uint8_t *fold_str; 118 | utf8proc_map(str, 0, &fold_str, UTF8PROC_NULLTERM | UTF8PROC_CASEFOLD); 119 | printf("%s\n", fold_str); 120 | // ss 121 | free(fold_str); 122 | ``` 123 | 124 | ### Normalization Form C/D (NFC/NFD) 125 | ```c 126 | // Decompose "\u00e4\u00f6\u00fc" = "äöü" into "a\u0308o\u0308u\u0308" (= "äöü" via combining char U+0308) 127 | utf8proc_uint8_t input[] = {0xc3, 0xa4, 0xc3, 0xb6, 0xc3, 0xbc, 0x00}; // "\u00e4\u00f6\u00fc" = "äöü" in UTF-8, NUL-terminated 128 | utf8proc_uint8_t *nfd= utf8proc_NFD(input); // = {0x61, 0xcc, 0x88, 0x6f, 0xcc, 0x88, 0x75, 0xcc, 0x88} 129 | 130 | // Compose "a\u0308o\u0308u\u0308" into "\u00e4\u00f6\u00fc" (= "äöü" via precomposed characters) 131 | utf8proc_uint8_t *nfc= utf8proc_NFC(nfd); 132 | 133 | free(nfd); 134 | free(nfc); 135 | ``` 136 | -------------------------------------------------------------------------------- /test/graphemetest.c: -------------------------------------------------------------------------------- 1 | #include "tests.h" 2 | 3 | /* check one line in the format of GraphemeBreakTest.txt */ 4 | void checkline(const char *_buf, bool verbose) { 5 | size_t bi = 0, si = 0; 6 | 
utf8proc_uint8_t src[1024]; /* more than long enough for all of our tests */ 7 | const unsigned char *buf = (const unsigned char *) _buf; 8 | 9 | while (buf[bi]) { 10 | bi = skipspaces(buf, bi); 11 | if (buf[bi] == 0xc3 && buf[bi+1] == 0xb7) { /* U+00f7 = grapheme break */ 12 | src[si++] = '/'; 13 | bi += 2; 14 | } 15 | else if (buf[bi] == 0xc3 && buf[bi+1] == 0x97) { /* U+00d7 = no break */ 16 | bi += 2; 17 | } 18 | else if (buf[bi] == '#') { /* start of comments */ 19 | break; 20 | } 21 | else if (buf[bi] == '/') { /* for convenience, also accept / as grapheme break */ 22 | src[si++] = '/'; 23 | bi += 1; 24 | } 25 | else { /* hex-encoded codepoint */ 26 | size_t len = encode((unsigned char*) (src + si), buf + bi) - 1; 27 | while (src[si]) ++si; /* advance to NUL termination */ 28 | bi += len; 29 | } 30 | } 31 | if (si && src[si-1] == '/') 32 | --si; /* no break after final grapheme */ 33 | src[si] = 0; /* NUL-terminate */ 34 | 35 | if (si) { /* test utf8proc_map */ 36 | utf8proc_uint8_t utf8[1024]; /* copy src without 0xff grapheme separators */ 37 | size_t i = 0, j = 0; 38 | utf8proc_ssize_t glen, k; 39 | utf8proc_uint8_t *g; /* utf8proc_map grapheme results */ 40 | while (i < si) { 41 | if (src[i] != '/') 42 | utf8[j++] = src[i++]; 43 | else 44 | i++; 45 | } 46 | glen = utf8proc_map(utf8, (utf8proc_ssize_t)j, &g, UTF8PROC_CHARBOUND); 47 | if (glen == UTF8PROC_ERROR_INVALIDUTF8) { 48 | /* the test file contains surrogate codepoints, which are only for UTF-16 */ 49 | printf("line %zd: ignoring invalid UTF-8 codepoints\n", lineno); 50 | } 51 | else { 52 | check(glen >= 0, "utf8proc_map error = %s", 53 | utf8proc_errmsg(glen)); 54 | for (k = 0; k <= glen; ++k) 55 | if (g[k] == 0xff) 56 | g[k] = '/'; /* easier-to-read output (/ is not in test strings) */ 57 | check(!strcmp((char*)g, (char*)src), 58 | "grapheme mismatch: \"%s\" instead of \"%s\"", (char*)g, (char*)src); 59 | } 60 | free(g); 61 | } 62 | 63 | if (si) { /* test manual calls to 
utf8proc_grapheme_break_stateful */ 64 | utf8proc_int32_t state = 0, prev_codepoint = 0; 65 | size_t i = 0; 66 | utf8proc_bool expectbreak = false; 67 | do { 68 | utf8proc_int32_t codepoint; 69 | i += (size_t)utf8proc_iterate(src + i, (utf8proc_ssize_t)(si - i), &codepoint); 70 | check(codepoint >= 0, "invalid UTF-8 data"); 71 | if (codepoint == 0x002F) 72 | expectbreak = true; 73 | else { 74 | if (prev_codepoint != 0) { 75 | check(expectbreak == utf8proc_grapheme_break_stateful(prev_codepoint, codepoint, &state), 76 | "grapheme mismatch: between 0x%04x and 0x%04x in \"%s\"", prev_codepoint, codepoint, (char*) src); 77 | } 78 | expectbreak = false; 79 | prev_codepoint = codepoint; 80 | } 81 | } while (i < si); 82 | } 83 | 84 | if (verbose) 85 | printf("passed grapheme test: \"%s\"\n", (char*) src); 86 | } 87 | 88 | int main(int argc, char **argv) 89 | { 90 | unsigned char buf[8192]; 91 | FILE *f = argc > 1 ? fopen(argv[1], "r") : NULL; 92 | 93 | check(f != NULL, "error opening GraphemeBreakTest.txt"); 94 | while (simple_getline(buf, f) > 0) { 95 | if ((++lineno) % 100 == 0) 96 | printf("checking line %zd...\n", lineno); 97 | if (buf[0] == '#') continue; 98 | checkline((char *) buf, false); 99 | } 100 | fclose(f); 101 | printf("Passed tests after %zd lines!\n", lineno); 102 | 103 | printf("Performing regression tests...\n"); 104 | 105 | /* issue 144 */ 106 | { 107 | utf8proc_uint8_t input[] = {0xef,0xbf,0xbf,0xef,0xbf,0xbe,0x00}; /* "\uffff\ufffe" */ 108 | utf8proc_uint8_t output[] = {0xff,0xef,0xbf,0xbf,0xff,0xef,0xbf,0xbe,0x00}; /* with 0xff grapheme markers */ 109 | utf8proc_ssize_t glen; 110 | utf8proc_uint8_t *g; 111 | glen = utf8proc_map(input, 6, &g, UTF8PROC_CHARBOUND); 112 | check(!strcmp((char*)g, (char*)output), "mishandled u+ffff and u+fffe grapheme breaks"); 113 | check(glen != 6, "mishandled u+ffff and u+fffe grapheme breaks"); 114 | free(g); 115 | }; 116 | 117 | /* https://github.com/JuliaLang/julia/issues/37680 */ 118 | checkline("/ 1f1f8 1f1ea / 
1f1f8 1f1ea /", true); /* Two swedish flags after each other */ 119 | checkline("/ 1f926 1f3fc 200d 2642 fe0f /", true); /* facepalm + pale skin + zwj + male sign + FE0F */ 120 | checkline("/ 1f468 1f3fb 200d 1f91d 200d 1f468 1f3fd /", true); /* man face + pale skin + zwj + hand holding + zwj + man face + dark skin */ 121 | 122 | /* more GB9c tests */ 123 | checkline("/ 0915 0300 094d 0300 0924 / 0915 /", true); 124 | checkline("/ 0915 0300 094d 0300 094d 0924 / 0915 /", true); 125 | checkline("/ 0915 0300 0300 / 0924 / 0915 /", true); 126 | checkline("/ 0915 0300 094d 0300 / 0078 /", true); 127 | checkline("/ 0300 094d 0300 / 0924 / 0915 /", true); 128 | 129 | check(utf8proc_grapheme_break(0x03b1, 0x03b2), "failed 03b1 / 03b2 test"); 130 | check(!utf8proc_grapheme_break(0x03b1, 0x0302), "failed 03b1 0302 test"); 131 | 132 | printf("Passed regression tests!\n"); 133 | 134 | return 0; 135 | } 136 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | # libutf8proc Makefile 2 | 3 | # programs 4 | AR?=ar 5 | CC?=gcc 6 | INSTALL=install 7 | FIND=find 8 | PERL=perl 9 | 10 | # compiler settings 11 | CFLAGS ?= -O2 12 | PICFLAG = -fPIC 13 | C99FLAG = -std=c99 14 | WCFLAGS = -Wsign-conversion -Wall -Wextra -pedantic 15 | UCFLAGS = $(CPPFLAGS) $(CFLAGS) $(PICFLAG) $(C99FLAG) $(WCFLAGS) -DUTF8PROC_EXPORTS $(UTF8PROC_DEFINES) 16 | LDFLAG_SHARED = -shared 17 | SOFLAG = -Wl,-soname 18 | 19 | # shared-library version MAJOR.MINOR.PATCH ... this may be *different* 20 | # from the utf8proc version number because it indicates ABI compatibility, 21 | # not API compatibility: MAJOR should be incremented whenever *binary* 22 | # compatibility is broken, even if the API is backward-compatible. 23 | # The API version number is defined in utf8proc.h. 24 | # Be sure to also update these ABI versions in MANIFEST and CMakeLists.txt! 
25 | MAJOR=3 26 | MINOR=1 27 | PATCH=0 28 | 29 | # api version (also in utf8proc.h and CMakeLists.txt) 30 | VERSION=2.10.0 31 | 32 | OS := $(shell uname) 33 | ifeq ($(OS),Darwin) # MacOS X 34 | SHLIB_EXT = dylib 35 | SHLIB_VERS_EXT = $(MAJOR).dylib 36 | else # GNU/Linux, at least (Windows should probably use cmake) 37 | SHLIB_EXT = so 38 | SHLIB_VERS_EXT = so.$(MAJOR).$(MINOR).$(PATCH) 39 | endif 40 | 41 | # installation directories (for 'make install') 42 | prefix=/usr/local 43 | libdir=$(prefix)/lib 44 | includedir=$(prefix)/include 45 | pkgconfigdir=$(libdir)/pkgconfig 46 | 47 | pkglibdir=$(libdir:$(prefix)/%=%) 48 | pkgincludedir=$(includedir:$(prefix)/%=%) 49 | 50 | # meta targets 51 | 52 | .PHONY: all clean data update manifest install 53 | 54 | all: libutf8proc.a libutf8proc.$(SHLIB_EXT) 55 | 56 | clean: 57 | rm -f utf8proc.o libutf8proc.a libutf8proc.$(SHLIB_VERS_EXT) libutf8proc.$(SHLIB_EXT) 58 | rm -f libutf8proc.pc 59 | ifneq ($(OS),Darwin) 60 | rm -f libutf8proc.so.$(MAJOR) 61 | endif 62 | rm -f test/tests.o test/normtest test/graphemetest test/printproperty test/charwidth test/valid test/iterate test/case test/custom test/misc test/iscase 63 | rm -rf MANIFEST.new tmp 64 | $(MAKE) -C bench clean 65 | $(MAKE) -C data clean 66 | 67 | data: data/utf8proc_data.c.new 68 | 69 | update: data/utf8proc_data.c.new 70 | cp -f data/utf8proc_data.c.new utf8proc_data.c 71 | 72 | manifest: MANIFEST.new 73 | 74 | # real targets 75 | 76 | data/utf8proc_data.c.new: libutf8proc.$(SHLIB_EXT) data/data_generator.jl 77 | $(MAKE) -C data utf8proc_data.c.new 78 | 79 | utf8proc.o: utf8proc.h utf8proc.c utf8proc_data.c 80 | $(CC) $(UCFLAGS) -c -o utf8proc.o utf8proc.c 81 | 82 | libutf8proc.a: utf8proc.o 83 | rm -f libutf8proc.a 84 | $(AR) crs libutf8proc.a utf8proc.o 85 | 86 | libutf8proc.so.$(MAJOR).$(MINOR).$(PATCH): utf8proc.o 87 | $(CC) $(LDFLAGS) $(LDFLAG_SHARED) -o $@ $(SOFLAG) -Wl,libutf8proc.so.$(MAJOR) utf8proc.o 88 | chmod a-x $@ 89 | 90 | libutf8proc.so: 
libutf8proc.so.$(MAJOR).$(MINOR).$(PATCH) 91 | ln -f -s libutf8proc.so.$(MAJOR).$(MINOR).$(PATCH) $@ 92 | ln -f -s libutf8proc.so.$(MAJOR).$(MINOR).$(PATCH) $@.$(MAJOR) 93 | 94 | libutf8proc.$(MAJOR).dylib: utf8proc.o 95 | $(CC) $(LDFLAGS) -dynamiclib -o $@ $^ -install_name $(libdir)/$@ -Wl,-compatibility_version -Wl,$(MAJOR) -Wl,-current_version -Wl,$(MAJOR).$(MINOR).$(PATCH) 96 | 97 | libutf8proc.dylib: libutf8proc.$(MAJOR).dylib 98 | ln -f -s libutf8proc.$(MAJOR).dylib $@ 99 | 100 | libutf8proc.pc: libutf8proc.pc.in 101 | sed \ 102 | -e 's#PREFIX#$(prefix)#' \ 103 | -e 's#LIBDIR#$(pkglibdir)#' \ 104 | -e 's#INCLUDEDIR#$(pkgincludedir)#' \ 105 | -e 's#VERSION#$(MAJOR).$(MINOR).$(PATCH)#' \ 106 | libutf8proc.pc.in > libutf8proc.pc 107 | 108 | install: libutf8proc.a libutf8proc.$(SHLIB_EXT) libutf8proc.$(SHLIB_VERS_EXT) libutf8proc.pc 109 | mkdir -m 755 -p $(DESTDIR)$(includedir) 110 | $(INSTALL) -m 644 utf8proc.h $(DESTDIR)$(includedir) 111 | mkdir -m 755 -p $(DESTDIR)$(libdir) 112 | $(INSTALL) -m 644 libutf8proc.a $(DESTDIR)$(libdir) 113 | $(INSTALL) -m 755 libutf8proc.$(SHLIB_VERS_EXT) $(DESTDIR)$(libdir) 114 | mkdir -m 755 -p $(DESTDIR)$(pkgconfigdir) 115 | $(INSTALL) -m 644 libutf8proc.pc $(DESTDIR)$(pkgconfigdir)/libutf8proc.pc 116 | ln -f -s libutf8proc.$(SHLIB_VERS_EXT) $(DESTDIR)$(libdir)/libutf8proc.$(SHLIB_EXT) 117 | ifneq ($(OS),Darwin) 118 | ln -f -s libutf8proc.$(SHLIB_VERS_EXT) $(DESTDIR)$(libdir)/libutf8proc.so.$(MAJOR) 119 | endif 120 | 121 | MANIFEST.new: 122 | rm -rf tmp 123 | $(MAKE) install prefix=/usr DESTDIR=$(PWD)/tmp 124 | $(FIND) tmp/usr -mindepth 1 -type l -printf "%P -> %l\n" -or -type f -printf "%P\n" -or -type d -printf "%P/\n" | LC_ALL=C sort > $@ 125 | rm -rf tmp 126 | 127 | # Test programs 128 | 129 | data/NormalizationTest.txt: 130 | $(MAKE) -C data NormalizationTest.txt 131 | 132 | data/GraphemeBreakTest.txt: 133 | $(MAKE) -C data GraphemeBreakTest.txt 134 | 135 | data/Lowercase.txt: 136 | $(MAKE) -C data Lowercase.txt 137 | 138 | 
data/Uppercase.txt: 139 | $(MAKE) -C data Uppercase.txt 140 | 141 | test/tests.o: test/tests.c test/tests.h utf8proc.h 142 | $(CC) $(UCFLAGS) -c -o test/tests.o test/tests.c 143 | 144 | test/normtest: test/normtest.c test/tests.o utf8proc.o utf8proc.h test/tests.h 145 | $(CC) $(UCFLAGS) $(LDFLAGS) test/normtest.c test/tests.o utf8proc.o -o $@ 146 | 147 | test/graphemetest: test/graphemetest.c test/tests.o utf8proc.o utf8proc.h test/tests.h 148 | $(CC) $(UCFLAGS) $(LDFLAGS) test/graphemetest.c test/tests.o utf8proc.o -o $@ 149 | 150 | test/printproperty: test/printproperty.c test/tests.o utf8proc.o utf8proc.h test/tests.h 151 | $(CC) $(UCFLAGS) $(LDFLAGS) test/printproperty.c test/tests.o utf8proc.o -o $@ 152 | 153 | test/charwidth: test/charwidth.c test/tests.o utf8proc.o utf8proc.h test/tests.h 154 | $(CC) $(UCFLAGS) $(LDFLAGS) test/charwidth.c test/tests.o utf8proc.o -o $@ 155 | 156 | test/valid: test/valid.c test/tests.o utf8proc.o utf8proc.h test/tests.h 157 | $(CC) $(UCFLAGS) $(LDFLAGS) test/valid.c test/tests.o utf8proc.o -o $@ 158 | 159 | test/iterate: test/iterate.c test/tests.o utf8proc.o utf8proc.h test/tests.h 160 | $(CC) $(UCFLAGS) $(LDFLAGS) test/iterate.c test/tests.o utf8proc.o -o $@ 161 | 162 | test/iscase: test/iscase.c test/tests.o utf8proc.o utf8proc.h test/tests.h 163 | $(CC) $(UCFLAGS) $(LDFLAGS) test/iscase.c test/tests.o utf8proc.o -o $@ 164 | 165 | test/case: test/case.c test/tests.o utf8proc.o utf8proc.h test/tests.h 166 | $(CC) $(UCFLAGS) $(LDFLAGS) test/case.c test/tests.o utf8proc.o -o $@ 167 | 168 | test/custom: test/custom.c test/tests.o utf8proc.o utf8proc.h test/tests.h 169 | $(CC) $(UCFLAGS) $(LDFLAGS) test/custom.c test/tests.o utf8proc.o -o $@ 170 | 171 | test/misc: test/misc.c test/tests.o utf8proc.o utf8proc.h test/tests.h 172 | $(CC) $(UCFLAGS) $(LDFLAGS) -DUNICODE_VERSION='"'`$(PERL) -ne "/^UNICODE_VERSION=/ and print $$';" data/Makefile`'"' test/misc.c test/tests.o utf8proc.o -o $@ 173 | 174 | # make release tarball from 
master branch 175 | dist: 176 | git archive master --prefix=utf8proc-$(VERSION)/ -o utf8proc-$(VERSION).tar.gz 177 | 178 | # build tarball, make sure it passes checks, and make sure version numbers are consistent 179 | distcheck: dist 180 | test `grep UTF8PROC_VERSION utf8proc.h | cut -d' ' -f3 | tr '\n' .` = $(VERSION). || exit 1 181 | test `grep "utf8proc VERSION" CMakeLists.txt |cut -d' ' -f 4` = $(VERSION) || exit 1 182 | test `grep libutf8proc.so.$(MAJOR).$(MINOR).$(PATCH) MANIFEST | wc -l` = 3 || exit 1 183 | test `grep 'set(SO_' CMakeLists.txt |cut -d' ' -f2 | tr -d ')' | tr '\n' '.'` = $(MAJOR).$(MINOR).$(PATCH). || exit 1 184 | rm -rf utf8proc-$(VERSION) 185 | tar xzf utf8proc-$(VERSION).tar.gz 186 | make -C utf8proc-$(VERSION) check 187 | rm -rf utf8proc-$(VERSION) 188 | 189 | check: test/normtest data/NormalizationTest.txt data/Lowercase.txt data/Uppercase.txt test/graphemetest data/GraphemeBreakTest.txt test/printproperty test/case test/iscase test/custom test/charwidth test/misc test/valid test/iterate bench/bench.c bench/util.c bench/util.h utf8proc.o 190 | $(MAKE) -C bench 191 | test/normtest data/NormalizationTest.txt 192 | test/graphemetest data/GraphemeBreakTest.txt 193 | test/charwidth 194 | test/misc 195 | test/valid 196 | test/iterate 197 | test/case 198 | test/iscase data/Lowercase.txt data/Uppercase.txt 199 | test/custom 200 | -------------------------------------------------------------------------------- /NEWS.md: -------------------------------------------------------------------------------- 1 | # utf8proc release history # 2 | 3 | ## Version 2.10.0 ## 4 | 5 | 2024-12-31 6 | 7 | - Unicode 16 support ([#277]). 8 | - New `utf8proc_charwidth_ambiguous` function to return whether a character has 9 | East Asian width class A (Ambiguous) ([#270]). 10 | 11 | ## Version 2.9.0 ## 12 | 13 | 2023-10-20 14 | 15 | - Unicode 15.1 support ([#253]). 16 | 17 | ## Version 2.8.0 ## 18 | 19 | 2022-10-30 20 | 21 | - Unicode 15 support ([#247]). 
22 | 23 | ## Version 2.7.0 ## 24 | 25 | 2021-12-16 26 | 27 | - Unicode 14 support ([#233]). 28 | 29 | - Support `GNUInstallDirs` in CMake build ([#159]). 30 | 31 | - `cmake` build now installs `pkg-config` file ([#224]). 32 | 33 | - Various build and portability improvements. 34 | 35 | ## Version 2.6.1 ## 36 | 37 | 2020-12-15 38 | 39 | - Bugfix in `utf8proc_grapheme_break_stateful` for `NULL` state argument, which 40 | also broke `utf8proc_grapheme_break`. 41 | 42 | ## Version 2.6 ## 43 | 44 | 2020-11-23 45 | 46 | - New `utf8proc_islower` and `utf8proc_isupper` functions ([#196]). 47 | 48 | - Bugfix for manual calls to `grapheme_break_extended` for initial characters ([#205]). 49 | 50 | - Various build and portability improvements. 51 | 52 | ## Version 2.5 ## 53 | 54 | 2020-03-27 55 | 56 | - Unicode 13 support ([#179]). 57 | 58 | - No longer report zero width for category Sk ([#167]). 59 | 60 | - `cmake` support improvements ([#173]). 61 | 62 | ## Version 2.4 ## 63 | 64 | 2019-05-10 65 | 66 | - Unicode 12.1 support ([#156]). 67 | 68 | - New `-DUTF8PROC_INSTALL=No` option for `cmake` builds to disable installation ([#152]). 69 | 70 | - Better `make` support for HP-UX ([#154]). 71 | 72 | - Fixed incorrect `UTF8PROC_VERSION_MINOR` version number in header and bumped shared-library version. 73 | 74 | ## Version 2.3 ## 75 | 76 | 2019-03-30 77 | 78 | - Unicode 12 support ([#148]). 79 | 80 | - New function `utf8proc_unicode_version` to return the supported Unicode version ([#151]). 81 | 82 | - Simpler character-width computation that no longer uses GNU Unifont metrics: East-Asian wide 83 | characters have width 2, and all other printable characters have width 1 ([#150]). 84 | 85 | - Fix `CHARBOUND` option for `utf8proc_map` to preserve U+FFFE and U+FFFF non-characters ([#149]). 86 | 87 | - Various build-system improvements ([#141], [#142], [#147]). 88 | 89 | ## Version 2.2 ## 90 | 91 | 2018-07-24 92 | 93 | - Unicode 11 support ([#132] and [#140]). 
94 | 95 | - `utf8proc_NFKC_Casefold` convenience function for `NFKC_Casefold` 96 | normalization ([#133]). 97 | 98 | - `UTF8PROC_STRIPNA` option to strip unassigned codepoints ([#133]). 99 | 100 | - Support building static libraries on Windows (callers need to 101 | `#define UTF8PROC_STATIC`) ([#123]). 102 | 103 | - `cmake` fix to avoid defining `UTF8PROC_EXPORTS` globally ([#121]). 104 | 105 | - `toupper` of ß (U+00df) now yields ẞ (U+1E9E) ([#134]), similar to musl; 106 | case-folding still yields the standard "ss" mapping. 107 | 108 | - `utf8proc_charwidth` now returns `1` for U+00AD (soft hyphen) and 109 | for unassigned/PUA codepoints ([#135]). 110 | 111 | ## Version 2.1.1 ## 112 | 113 | 2018-04-27 114 | 115 | - Fixed composition bug ([#128]). 116 | 117 | - Minor build fixes ([#94], [#99], [#113], [#125]). 118 | 119 | ## Version 2.1 ## 120 | 121 | 2016-12-26: 122 | 123 | - New functions `utf8proc_map_custom` and `utf8proc_decompose_custom` 124 | to allow user-supplied transformations of codepoints, in conjunction 125 | with other transformations ([#89]). 126 | 127 | - New function `utf8proc_normalize_utf32` to apply normalizations 128 | directly to UTF-32 data (not just UTF-8) ([#88]). 129 | 130 | - Fixed stack overflow that could occur due to incorrect definition 131 | of `UINT16_MAX` with some compilers ([#84]). 132 | 133 | - Fixed conflict with `stdbool.h` in Visual Studio ([#90]). 134 | 135 | - Updated font metrics to use Unifont 9.0.04. 136 | 137 | ## Version 2.0.2 ## 138 | 139 | 2016-07-27: 140 | 141 | - Move `-Wmissing-prototypes` warning flag from `Makefile` to `.travis.yml` 142 | since MSVC does not understand this flag and it is occasionally useful to 143 | build using MSVC through the `Makefile` ([#79]). 144 | 145 | - Use a different variable name for a nested loop in `bench/bench.c`, and 146 | declare it in a C89 way rather than inside the `for` to avoid "error: 147 | 'for' loop initial declarations are only allowed in C99 mode" ([#80]). 
148 | 149 | ## Version 2.0.1 ## 150 | 151 | 2016-07-13: 152 | 153 | - Bug fix in `utf8proc_grapheme_break_stateful` ([#77]). 154 | 155 | - Tests now use versioned Unicode files, so they will no longer 156 | break when a new version of Unicode is released ([#78]). 157 | 158 | ## Version 2.0 ## 159 | 160 | 2016-07-13: 161 | 162 | - Updated for Unicode 9.0 ([#70]). 163 | 164 | - New `utf8proc_grapheme_break_stateful` to handle the complicated 165 | grapheme-breaking rules in Unicode 9. The old `utf8proc_grapheme_break` 166 | is still provided, but may incorrectly identify grapheme breaks 167 | in some Unicode-9 sequences. 168 | 169 | - Smaller Unicode tables ([#62], [#68]). This required changes 170 | in the `utf8proc_property_t` structure, which breaks backward 171 | compatibility if you access this `struct` directly. The 172 | functions in the API remain backward-compatible, however. 173 | 174 | - Buffer overrun fix ([#66]). 175 | 176 | ## Version 1.3.1 ## 177 | 178 | 2015-11-02: 179 | 180 | - Do not export symbol for internal function `unsafe_encode_char()` ([#55]). 181 | 182 | - Install relative symbolic links for shared libraries ([#58]). 183 | 184 | - Enable and fix compiler warnings ([#55], [#58]). 185 | 186 | - Add missing files to `make clean` ([#58]). 187 | 188 | ## Version 1.3 ## 189 | 190 | 2015-07-06: 191 | 192 | - Updated for Unicode 8.0 ([#45]). 193 | 194 | - New `utf8proc_tolower` and `utf8proc_toupper` functions, portable 195 | replacements for `towlower` and `towupper` in the C library ([#40]). 196 | 197 | - Don't treat Unicode "non-characters" as invalid, and improved 198 | validity checking in general ([#35]). 199 | 200 | - Prefix all typedefs with `utf8proc_`, e.g. `utf8proc_int32_t`, 201 | to avoid collisions with other libraries ([#32]). 202 | 203 | - Rename `DLLEXPORT` to `UTF8PROC_DLLEXPORT` to prevent collisions. 204 | 205 | - Fix build breakage in the benchmark routines. 
206 | 207 | - More fine-grained Makefile variables (`PICFLAG` etcetera), so that 208 | compilation flags can be selectively overridden, and in particular 209 | so that `CFLAGS` can be changed without accidentally eliminating 210 | necessary flags like `-fPIC` and `-std=c99` ([#43]). 211 | 212 | - Updated character-width tables based on Unifont 8.0.01 ([#51]) and 213 | the Unicode 8 character categories ([#47]). 214 | 215 | ## Version 1.2 ## 216 | 217 | 2015-03-28: 218 | 219 | - Updated for Unicode 7.0 ([#6]). 220 | 221 | - New function `utf8proc_grapheme_break(c1,c2)` that returns whether 222 | there is a grapheme break between `c1` and `c2` ([#20]). 223 | 224 | - New function `utf8proc_charwidth(c)` that returns the number of 225 | column-positions that should be required for `c`; essentially a 226 | portable replacement for `wcwidth(c)` ([#27]). 227 | 228 | - New function `utf8proc_category(c)` that returns the Unicode 229 | category of `c` (as one of the constants `UTF8PROC_CATEGORY_xx`). 230 | Also, a function `utf8proc_category_string(c)` that returns the Unicode 231 | category of `c` as a two-character string. 232 | 233 | - `cmake` script `CMakeLists.txt`, in addition to `Makefile`, for 234 | easier compilation on Windows ([#28]). 235 | 236 | - Various `Makefile` improvements: a `make check` target to perform 237 | tests ([#13]), `make install`, a rule to automate updating the Unicode 238 | tables, etcetera. 239 | 240 | - The shared library is now versioned (e.g. has a soname on GNU/Linux) ([#24]). 241 | 242 | - C++/MSVC compatibility ([#17]). 243 | 244 | - Most `#defined` constants are now `enums` ([#29]). 245 | 246 | - New preprocessor constants `UTF8PROC_VERSION_MAJOR`, 247 | `UTF8PROC_VERSION_MINOR`, and `UTF8PROC_VERSION_PATCH` for compile-time 248 | detection of the API version. 249 | 250 | - Doxygen-formatted documentation ([#29]). 251 | 252 | - The Ruby and PostgreSQL plugins have been removed due to lack of testing ([#22]). 
253 | 254 | ## Version 1.1.6 ## 255 | 256 | 2013-11-27: 257 | 258 | - PostgreSQL 9.2 and 9.3 compatibility (lowercase `c` language name) 259 | 260 | ## Version 1.1.5 ## 261 | 262 | 2009-08-20: 263 | 264 | - Use `RSTRING_PTR()` and `RSTRING_LEN()` instead of `RSTRING()->ptr` and 265 | `RSTRING()->len` for ruby1.9 compatibility (and `#define` them, if not 266 | existent) 267 | 268 | 2009-10-02: 269 | 270 | - Patches for compatibility with Microsoft Visual Studio 271 | 272 | 2009-10-08: 273 | 274 | - Fixes to make utf8proc usable in C++ programs 275 | 276 | 2009-10-16: 277 | 278 | ## Version 1.1.4 ## 279 | 280 | 2009-06-14: 281 | 282 | - replaced C++ style comments for compatibility reasons 283 | - added typecasts to suppress compiler warnings 284 | - removed redundant source files for ruby-gemfile generation 285 | 286 | 2009-08-19: 287 | 288 | - Changed copyright notice for Public Software Group e. V. 289 | - Minor changes in the `README` file 290 | 291 | ## Version 1.1.3 ## 292 | 293 | 2008-10-04: 294 | 295 | - Added a function `utf8proc_version` returning a string containing the version 296 | number of the library. 297 | - Included a target `libutf8proc.dylib` for MacOSX. 298 | 299 | 2009-05-01: 300 | - PostgreSQL 8.3 compatibility (use of `SET_VARSIZE` macro) 301 | 302 | ## Version 1.1.2 ## 303 | 304 | 2007-07-25: 305 | 306 | - Fixed a serious bug in the data file generator, which caused characters 307 | being treated incorrectly, when stripping default ignorable characters or 308 | calculating grapheme cluster boundaries. 309 | 310 | ## Version 1.1.1 ## 311 | 312 | 2007-06-25: 313 | 314 | - Added a new PostgreSQL function `unistrip`, which behaves like `unifold`, 315 | but also removes all character marks (e.g. accents). 316 | 317 | 2007-07-22: 318 | 319 | - Changed license from BSD to MIT style. 320 | - Added a new function `utf8proc_codepoint_valid` to the C library. 
321 | - Changed compiler flags in `Makefile` from `-g -O0` to `-O2` 322 | - The ruby script, which was used to build the `utf8proc_data.c` file, is now 323 | included in the distribution. 324 | 325 | ## Version 1.0.3 ## 326 | 327 | 2007-03-16: 328 | 329 | - Fixed a bug in the ruby library, which caused an error, when splitting an 330 | empty string at grapheme cluster boundaries (method `String#utf8chars`). 331 | 332 | ## Version 1.0.2 ## 333 | 334 | 2006-09-21: 335 | 336 | - included a check in `Integer#utf8`, which raises an exception, if the given 337 | code-point is invalid because of being too high (this was missing yet) 338 | 339 | 2006-12-26: 340 | 341 | - added support for PostgreSQL version 8.2 342 | 343 | ## Version 1.0.1 ## 344 | 345 | 2006-09-20: 346 | 347 | - included a gem file for the ruby version of the library 348 | 349 | Release of version 1.0.1 350 | 351 | ## Version 1.0 ## 352 | 353 | 2006-09-17: 354 | 355 | - added the `LUMP` option, which lumps certain characters together (see `lump.md`) (also used for the PostgreSQL `unifold` function) 356 | - added the `STRIPMARK` option, which strips marking characters (or marks of composed characters) 357 | - deprecated ruby method `String#char_ary` in favour of `String#utf8chars` 358 | 359 | ## Version 0.3 ## 360 | 361 | 2006-07-18: 362 | 363 | - changed normalization from NFC to NFKC for postgresql unifold function 364 | 365 | 2006-08-04: 366 | 367 | - added support to mark the beginning of a grapheme cluster with 0xFF (option: `CHARBOUND`) 368 | - added the ruby method `String#chars`, which is returning an array of UTF-8 encoded grapheme clusters 369 | - added `NLF2LF` transformation in postgresql `unifold` function 370 | - added the `DECOMPOSE` option, if you neither use `COMPOSE` or `DECOMPOSE`, no normalization will be performed (different from previous versions) 371 | - using integer constants rather than C-strings for character properties 372 | - fixed (hopefully) a problem with the ruby library on 
Mac OS X, which occurred when compiler optimization was switched on 373 | 374 | ## Version 0.2 ## 375 | 376 | 2006-06-05: 377 | 378 | - changed behaviour of PostgreSQL function to return NULL in case of invalid input, rather than raising an exceptional condition 379 | - improved efficiency of PostgreSQL function (no transformation to C string is done) 380 | 381 | 2006-06-20: 382 | 383 | - added -fpic compiler flag in Makefile 384 | - fixed bug in the C code for the ruby library (usage of non-existent function) 385 | 386 | ## Version 0.1 ## 387 | 388 | 2006-06-02: initial release of version 0.1 389 | 390 | 391 | 392 | [#6]: https://github.com/JuliaStrings/utf8proc/issues/6 393 | [#13]: https://github.com/JuliaStrings/utf8proc/issues/13 394 | [#17]: https://github.com/JuliaStrings/utf8proc/issues/17 395 | [#20]: https://github.com/JuliaStrings/utf8proc/issues/20 396 | [#22]: https://github.com/JuliaStrings/utf8proc/issues/22 397 | [#24]: https://github.com/JuliaStrings/utf8proc/issues/24 398 | [#27]: https://github.com/JuliaStrings/utf8proc/issues/27 399 | [#28]: https://github.com/JuliaStrings/utf8proc/issues/28 400 | [#29]: https://github.com/JuliaStrings/utf8proc/issues/29 401 | [#32]: https://github.com/JuliaStrings/utf8proc/issues/32 402 | [#35]: https://github.com/JuliaStrings/utf8proc/issues/35 403 | [#40]: https://github.com/JuliaStrings/utf8proc/issues/40 404 | [#43]: https://github.com/JuliaStrings/utf8proc/issues/43 405 | [#45]: https://github.com/JuliaStrings/utf8proc/issues/45 406 | [#47]: https://github.com/JuliaStrings/utf8proc/issues/47 407 | [#51]: https://github.com/JuliaStrings/utf8proc/issues/51 408 | [#55]: https://github.com/JuliaStrings/utf8proc/issues/55 409 | [#58]: https://github.com/JuliaStrings/utf8proc/issues/58 410 | [#62]: https://github.com/JuliaStrings/utf8proc/issues/62 411 | [#66]: https://github.com/JuliaStrings/utf8proc/issues/66 412 | [#68]: https://github.com/JuliaStrings/utf8proc/issues/68 413 | [#70]: 
https://github.com/JuliaStrings/utf8proc/issues/70 414 | [#77]: https://github.com/JuliaStrings/utf8proc/issues/77 415 | [#78]: https://github.com/JuliaStrings/utf8proc/issues/78 416 | [#79]: https://github.com/JuliaStrings/utf8proc/issues/79 417 | [#80]: https://github.com/JuliaStrings/utf8proc/issues/80 418 | [#84]: https://github.com/JuliaStrings/utf8proc/issues/84 419 | [#88]: https://github.com/JuliaStrings/utf8proc/issues/88 420 | [#89]: https://github.com/JuliaStrings/utf8proc/issues/89 421 | [#90]: https://github.com/JuliaStrings/utf8proc/issues/90 422 | [#94]: https://github.com/JuliaStrings/utf8proc/issues/94 423 | [#99]: https://github.com/JuliaStrings/utf8proc/issues/99 424 | [#113]: https://github.com/JuliaStrings/utf8proc/issues/113 425 | [#121]: https://github.com/JuliaStrings/utf8proc/issues/121 426 | [#123]: https://github.com/JuliaStrings/utf8proc/issues/123 427 | [#125]: https://github.com/JuliaStrings/utf8proc/issues/125 428 | [#128]: https://github.com/JuliaStrings/utf8proc/issues/128 429 | [#132]: https://github.com/JuliaStrings/utf8proc/issues/132 430 | [#133]: https://github.com/JuliaStrings/utf8proc/issues/133 431 | [#134]: https://github.com/JuliaStrings/utf8proc/issues/134 432 | [#135]: https://github.com/JuliaStrings/utf8proc/issues/135 433 | [#140]: https://github.com/JuliaStrings/utf8proc/issues/140 434 | [#141]: https://github.com/JuliaStrings/utf8proc/issues/141 435 | [#142]: https://github.com/JuliaStrings/utf8proc/issues/142 436 | [#147]: https://github.com/JuliaStrings/utf8proc/issues/147 437 | [#148]: https://github.com/JuliaStrings/utf8proc/issues/148 438 | [#149]: https://github.com/JuliaStrings/utf8proc/issues/149 439 | [#150]: https://github.com/JuliaStrings/utf8proc/issues/150 440 | [#151]: https://github.com/JuliaStrings/utf8proc/issues/151 441 | [#152]: https://github.com/JuliaStrings/utf8proc/issues/152 442 | [#154]: https://github.com/JuliaStrings/utf8proc/issues/154 443 | [#156]: 
https://github.com/JuliaStrings/utf8proc/issues/156 444 | [#159]: https://github.com/JuliaStrings/utf8proc/issues/159 445 | [#167]: https://github.com/JuliaStrings/utf8proc/issues/167 446 | [#173]: https://github.com/JuliaStrings/utf8proc/issues/173 447 | [#179]: https://github.com/JuliaStrings/utf8proc/issues/179 448 | [#196]: https://github.com/JuliaStrings/utf8proc/issues/196 449 | [#205]: https://github.com/JuliaStrings/utf8proc/issues/205 450 | [#224]: https://github.com/JuliaStrings/utf8proc/issues/224 451 | [#233]: https://github.com/JuliaStrings/utf8proc/issues/233 452 | [#247]: https://github.com/JuliaStrings/utf8proc/issues/247 453 | [#253]: https://github.com/JuliaStrings/utf8proc/issues/253 454 | [#270]: https://github.com/JuliaStrings/utf8proc/issues/270 455 | [#277]: https://github.com/JuliaStrings/utf8proc/issues/277 456 | -------------------------------------------------------------------------------- /data/data_generator.jl: -------------------------------------------------------------------------------- 1 | using OffsetArrays: Origin 2 | 3 | parsehex(str) = parse(UInt32, str, base=16) 4 | 5 | function parse_hex_range(line) 6 | m = match(r"^([0-9A-F]+)(\.\.([0-9A-F]+))? +; +([^#]+)", line) 7 | if isnothing(m) 8 | return nothing 9 | end 10 | i = parsehex(m[1]) 11 | j = !isnothing(m[3]) ? 
parsehex(m[3]) : i 12 | desc = rstrip(m[4]) 13 | return (i:j, desc) 14 | end 15 | 16 | function read_hex_ranges(filename) 17 | [r for r in parse_hex_range.(readlines(filename)) if !isnothing(r)] 18 | end 19 | 20 | function collect_codepoints(range_desc, description) 21 | list = UInt32[] 22 | for (r,d) in range_desc 23 | if d == description 24 | append!(list, r) 25 | end 26 | end 27 | list 28 | end 29 | 30 | function set_all!(d, keys, value) 31 | for k in keys 32 | d[k] = value 33 | end 34 | end 35 | 36 | #------------------------------------------------------------------------------- 37 | 38 | derived_core_properties = read_hex_ranges("DerivedCoreProperties.txt") 39 | 40 | ignorable = Set(collect_codepoints(derived_core_properties, "Default_Ignorable_Code_Point")) 41 | uppercase = Set(collect_codepoints(derived_core_properties, "Uppercase")) 42 | lowercase = Set(collect_codepoints(derived_core_properties, "Lowercase")) 43 | 44 | 45 | #------------------------------------------------------------------------------- 46 | function derive_indic_conjunct_break(derived_core_properties) 47 | props = Dict{UInt32, String}() 48 | set_all!(props, collect_codepoints(derived_core_properties, "InCB; Linker"), "LINKER") 49 | set_all!(props, collect_codepoints(derived_core_properties, "InCB; Consonant"), "CONSONANT") 50 | set_all!(props, collect_codepoints(derived_core_properties, "InCB; Extend"), "EXTEND") 51 | props 52 | end 53 | 54 | let indic_conjunct_break = derive_indic_conjunct_break(derived_core_properties) 55 | global function get_indic_conjunct_break(code) 56 | get(indic_conjunct_break, code, "NONE") 57 | end 58 | end 59 | 60 | #------------------------------------------------------------------------------- 61 | function read_grapheme_boundclasses(grapheme_break_filename, emoji_data_filename) 62 | grapheme_boundclass = Dict{UInt32, String}() 63 | for (r,desc) in read_hex_ranges(grapheme_break_filename) 64 | set_all!(grapheme_boundclass, r, Base.uppercase(desc)) 65 | end 
66 | for (r,desc) in read_hex_ranges(emoji_data_filename) 67 | if desc == "Extended_Pictographic" 68 | set_all!(grapheme_boundclass, r, "EXTENDED_PICTOGRAPHIC") 69 | elseif desc == "Emoji_Modifier" 70 | set_all!(grapheme_boundclass, r, "EXTEND") 71 | end 72 | end 73 | return grapheme_boundclass 74 | end 75 | 76 | let grapheme_boundclasses = read_grapheme_boundclasses("GraphemeBreakProperty.txt", "emoji-data.txt") 77 | global function get_grapheme_boundclass(code) 78 | get(grapheme_boundclasses, code, "OTHER") 79 | end 80 | end 81 | 82 | #------------------------------------------------------------------------------- 83 | function read_composition_exclusions(pattern) 84 | section = match(pattern, read("CompositionExclusions.txt",String)).match 85 | es = UInt32[] 86 | for line in split(section, '\n') 87 | m = match(r"^([0-9A-F]+) +#"i, line) 88 | if !isnothing(m) 89 | push!(es, parsehex(m[1])) 90 | end 91 | end 92 | es 93 | end 94 | 95 | exclusions = Set(read_composition_exclusions(r"# \(1\) Script Specifics.*?# Total code points:"s)) 96 | excl_version = Set(read_composition_exclusions(r"# \(2\) Post Composition Version precomposed characters.*?# Total code points:"s)) 97 | 98 | #------------------------------------------------------------------------------- 99 | function read_case_folding(filename) 100 | case_folding = Dict{UInt32,Vector{UInt32}}() 101 | for line in readlines(filename) 102 | m = match(r"^([0-9A-F]+); [CF]; ([0-9A-F ]+);"i, line) 103 | !isnothing(m) || continue 104 | case_folding[parsehex(m[1])] = parsehex.(split(m[2])) 105 | end 106 | case_folding 107 | end 108 | 109 | let case_folding = read_case_folding("CaseFolding.txt") 110 | global function get_case_folding(code) 111 | get(case_folding, code, nothing) 112 | end 113 | end 114 | 115 | #------------------------------------------------------------------------------- 116 | # Utilities for reading per-char properties from UnicodeData.txt 117 | function split_unicode_data_line(line) 118 | m = 
match(r""" 119 | ([0-9A-F]+); # code 120 | ([^;]+); # name 121 | ([A-Z]+); # general category 122 | ([0-9]+); # canonical combining class 123 | ([A-Z]+); # bidi class 124 | (<([A-Z]*)>)? # decomposition type 125 | ((\ ?[0-9A-F]+)*); # decompomposition mapping 126 | ([0-9]*); # decimal digit 127 | ([0-9]*); # digit 128 | ([^;]*); # numeric 129 | ([YN]*); # bidi mirrored 130 | ([^;]*); # unicode 1.0 name 131 | ([^;]*); # iso comment 132 | ([0-9A-F]*); # simple uppercase mapping 133 | ([0-9A-F]*); # simple lowercase mapping 134 | ([0-9A-F]*)$ # simple titlecase mapping 135 | """ix, line) 136 | @assert !isnothing(m) 137 | code = parse(UInt32, m[1], base=16) 138 | (code = code, 139 | name = m[2], 140 | category = m[3], 141 | combining_class = parse(Int, m[4]), 142 | bidi_class = m[5], 143 | decomp_type = m[7], 144 | decomp_mapping = m[8] == "" ? nothing : parsehex.(split(m[8])), 145 | bidi_mirrored = m[13] == "Y", 146 | # issue #130: use nonstandard uppercase ß -> ẞ 147 | # issue #195: if character is uppercase but has no lowercase mapping, 148 | # then make lowercase mapping = itself (vice versa for lowercase) 149 | uppercase_mapping = m[16] != "" ? parsehex(m[16]) : 150 | code == 0x000000df ? 0x00001e9e : 151 | m[17] == "" && code in lowercase ? code : 152 | nothing, 153 | lowercase_mapping = m[17] != "" ? parsehex(m[17]) : 154 | m[16] == "" && code in uppercase ? code : 155 | nothing, 156 | titlecase_mapping = m[18] != "" ? parsehex(m[18]) : 157 | code == 0x000000df ? 
0x00001e9e : 158 | nothing, 159 | ) 160 | end 161 | 162 | function read_unicode_data(filename) 163 | raw_char_props = split_unicode_data_line.(readlines(filename)) 164 | char_props = Origin(0)(Vector{eltype(raw_char_props)}()) 165 | @assert issorted(raw_char_props, by=c->c.code) 166 | raw_char_props = Iterators.Stateful(raw_char_props) 167 | while !isempty(raw_char_props) 168 | c = popfirst!(raw_char_props) 169 | if occursin(", First>", c.name) 170 | nc = popfirst!(raw_char_props) 171 | @assert occursin(", Last>", nc.name) 172 | name = replace(c.name, ", First"=>"") 173 | for i in c.code:nc.code 174 | push!(char_props, (; c..., name=name, code=i)) 175 | end 176 | else 177 | push!(char_props, c) 178 | end 179 | end 180 | return char_props 181 | end 182 | 183 | char_props = read_unicode_data("UnicodeData.txt") 184 | char_hash = Dict(c.code=>c for c in char_props) 185 | 186 | #------------------------------------------------------------------------------- 187 | # Read character widths from UAX #11: East Asian Width 188 | function read_east_asian_widths(filename) 189 | ea_widths = Dict{UInt32,Int}() 190 | for (rng,widthcode) in read_hex_ranges(filename) 191 | w = widthcode == "W" || widthcode == "F" ? 2 : # wide or full 192 | widthcode == "Na"|| widthcode == "H" ? 1 : # narrow or half-width 193 | widthcode == "A" ? 
-1 : # ambiguous width 194 | nothing 195 | if !isnothing(w) 196 | set_all!(ea_widths, rng, w) 197 | end 198 | end 199 | return ea_widths 200 | end 201 | 202 | let ea_widths = read_east_asian_widths("EastAsianWidth.txt") 203 | # Following work by @jiahao, we compute character widths using a combination of 204 | # * character category 205 | # * UAX 11: East Asian Width 206 | # * a few exceptions as needed 207 | # Adapted from http://nbviewer.ipython.org/gist/jiahao/07e8b08bf6d8671e9734 208 | global function derive_char_width(code, category) 209 | # Use a default width of 1 for all character categories that are 210 | # letter/symbol/number-like, as well as for unassigned/private-use chars. 211 | # This provides a useful nonzero fallback for new codepoints when a new 212 | # Unicode version has been released. 213 | width = 1 214 | 215 | # Various zero-width categories 216 | # 217 | # "Sk" not included in zero width - see issue #167 218 | if category in ("Mn", "Mc", "Me", "Zl", "Zp", "Cc", "Cf", "Cs") 219 | width = 0 220 | end 221 | 222 | # Widths from UAX #11: East Asian Width 223 | eaw = get(ea_widths, code, nothing) 224 | if !isnothing(eaw) 225 | width = eaw < 0 ? 1 : eaw 226 | end 227 | 228 | # A few exceptional cases, found by manual comparison to other wcwidth 229 | # functions and similar checks. 230 | if category == "Mn" 231 | width = 0 232 | end 233 | 234 | if code == 0x00ad 235 | # Soft hyphen is typically printed as a hyphen (-) in terminals. 
236 | width = 1 237 | elseif code == 0x2028 || code == 0x2029 238 | #By definition, should have zero width (on the same line) 239 | #0x002028 '\u2028' category: Zl name: LINE SEPARATOR/ 240 | #0x002029 '\u2029' category: Zp name: PARAGRAPH SEPARATOR/ 241 | width = 0 242 | end 243 | 244 | return width 245 | end 246 | global function is_ambiguous_width(code) 247 | return get(ea_widths, code, 0) < 0 248 | end 249 | end 250 | 251 | #------------------------------------------------------------------------------- 252 | # Construct data tables which will drive libutf8proc 253 | # 254 | # These tables are "compressed" with an ad-hoc compression scheme (largely some 255 | # simple deduplication and indexing) which can easily and efficiently be 256 | # decompressed on the C side at runtime. 257 | 258 | # Inverse decomposition mapping tables for combining two characters into a single one. 259 | comb_mapping = Dict{UInt32, Dict{UInt32, UInt32}}() 260 | comb_issecond = Set{UInt32}() 261 | for char in char_props 262 | # What happens with decompositions that are longer than 2? 
263 | if isnothing(char.decomp_type) && !isnothing(char.decomp_mapping) && 264 | length(char.decomp_mapping) == 2 && haskey(char_hash, char.decomp_mapping[1]) && # Dict indexing throws KeyError on a missing key, so test membership with haskey 265 | char_hash[char.decomp_mapping[1]].combining_class == 0 && 266 | (char.code ∉ exclusions && char.code ∉ excl_version) 267 | dm0 = char.decomp_mapping[1] 268 | dm1 = char.decomp_mapping[2] 269 | if !haskey(comb_mapping, dm0) 270 | comb_mapping[dm0] = Dict{UInt32, UInt32}() 271 | end 272 | comb_mapping[dm0][dm1] = char.code 273 | push!(comb_issecond, dm1) 274 | end 275 | end 276 | 277 | comb_index = Dict{UInt32, UInt32}() 278 | comb_length = Dict{UInt32, UInt32}() 279 | let 280 | ind = 0 281 | for dm0 in sort!(collect(keys(comb_mapping))) 282 | comb_index[dm0] = ind 283 | len = length(comb_mapping[dm0]) 284 | comb_length[dm0] = len 285 | ind += len 286 | end 287 | end 288 | 289 | utf16_encode(utf32_seq) = transcode(UInt16, transcode(String, utf32_seq)) 290 | 291 | # Utility for packing all UTF-16 encoded sequences into one big array 292 | struct UTF16Sequences 293 | storage::Vector{UInt16} 294 | indices::Dict{Vector{UInt16},Int} 295 | end 296 | UTF16Sequences() = UTF16Sequences(UInt16[], Dict{Vector{UInt16},Int}()) 297 | 298 | """ 299 | Return "sequence code" (seqindex in the C code) for a sequence: a UInt16 where 300 | * The 14 low bits are the index into the `sequences.storage` array where the 301 | sequence resides 302 | * The two top bits are the length of the sequence, or if equal to 3, the first 303 | entry of the sequence itself contains the length. 304 | """ 305 | function encode_sequence!(sequences::UTF16Sequences, utf32_seq::Vector) 306 | if length(utf32_seq) == 0 307 | return typemax(UInt16) 308 | end 309 | # lencode contains the length of the UTF-32 sequence after decoding 310 | # No sequence has len 0, so we encode len 1 as 0, len 2 as 1. 311 | # We have only 2 bits for the length, though, so longer sequences are 312 | # encoded in the sequence data itself. 
313 | seq_lencode = length(utf32_seq) - 1 314 | utf16_seq = utf16_encode(utf32_seq) 315 | idx = get!(sequences.indices, utf16_seq) do 316 | i = length(sequences.storage) 317 | utf16_seq_enc = seq_lencode < 3 ? utf16_seq : 318 | pushfirst!(copy(utf16_seq), seq_lencode) 319 | append!(sequences.storage, utf16_seq_enc) 320 | i 321 | end 322 | @assert idx <= 0x3FFF 323 | seq_code = idx | (min(seq_lencode, 3) << 14) 324 | return seq_code 325 | end 326 | 327 | function encode_sequence!(sequences::UTF16Sequences, code::Integer) 328 | encode_sequence!(sequences, [code]) 329 | end 330 | 331 | function encode_sequence!(sequences::UTF16Sequences, ::Nothing) 332 | return typemax(UInt16) 333 | end 334 | 335 | function char_table_properties!(sequences, char) 336 | code = char.code 337 | 338 | return ( 339 | category = char.category, 340 | combining_class = char.combining_class, 341 | bidi_class = char.bidi_class, 342 | decomp_type = char.decomp_type, 343 | decomp_seqindex = encode_sequence!(sequences, char.decomp_mapping), 344 | casefold_seqindex = encode_sequence!(sequences, get_case_folding(code)), 345 | uppercase_seqindex = encode_sequence!(sequences, char.uppercase_mapping), 346 | lowercase_seqindex = encode_sequence!(sequences, char.lowercase_mapping), 347 | titlecase_seqindex = encode_sequence!(sequences, char.titlecase_mapping), 348 | comb_index = get(comb_index, code, 0x3FF), # see utf8proc_property_struct::comb_index 349 | comb_length = get(comb_length, code, 0), 350 | comb_issecond = code in comb_issecond, 351 | bidi_mirrored = char.bidi_mirrored, 352 | comp_exclusion = code in exclusions || code in excl_version, 353 | ignorable = code in ignorable, 354 | control_boundary = char.category in ("Zl", "Zp", "Cc", "Cf") && 355 | !(char.code in (0x200C, 0x200D)), 356 | charwidth = derive_char_width(code, char.category), 357 | ambiguous_width = is_ambiguous_width(code), 358 | boundclass = get_grapheme_boundclass(code), 359 | indic_conjunct_break = 
get_indic_conjunct_break(code), 360 | ) 361 | end 362 | 363 | # Many character properties are duplicates. Deduplicate them, constructing a 364 | # per-character array of indices into the properties array 365 | sequences = UTF16Sequences() 366 | char_table_props = [char_table_properties!(sequences, cp) for cp in char_props] 367 | 368 | deduplicated_props = Origin(0)(Vector{eltype(char_table_props)}()) 369 | char_property_indices = Origin(0)(zeros(Int, 0x00110000)) 370 | let index_map = Dict{eltype(char_table_props),Int}() 371 | for (char, table_props) in zip(char_props, char_table_props) 372 | entry_idx = get!(index_map, table_props) do 373 | idx = length(deduplicated_props) 374 | push!(deduplicated_props, table_props) 375 | idx 376 | end 377 | # Add 1 because unassigned codes occupy slot at index 0 378 | char_property_indices[char.code] = entry_idx + 1 379 | end 380 | end 381 | 382 | # Now compress char_property_indices by breaking it into pages and 383 | # deduplicating those (this works as compression because there are large 384 | # contiguous ranges of code space with identical properties) 385 | prop_page_indices = Int[] 386 | prop_pages = Int[] 387 | let 388 | page_size = 0x100 389 | page_index_map = Dict{Vector{Int}, Int}() 390 | for page in Iterators.partition(char_property_indices, page_size) 391 | page_idx = get!(page_index_map, page) do 392 | idx = length(prop_pages) 393 | append!(prop_pages, page) 394 | idx 395 | end 396 | push!(prop_page_indices, page_idx) 397 | end 398 | end 399 | 400 | #------------------------------------------------------------------------------- 401 | function write_c_index_array(io, array, linelen) 402 | print(io, "{\n ") 403 | i = 0 404 | for x in array 405 | i += 1 406 | if i == linelen 407 | i = 0 408 | print(io, "\n ") 409 | end 410 | print(io, x, ", ") 411 | end 412 | print(io, "};\n\n") 413 | end 414 | 415 | function c_enum_name(prefix, str) 416 | if isnothing(str) 417 | return "0" 418 | else 419 | return 
"UTF8PROC_$(prefix)_$(Base.uppercase(str))" 420 | end 421 | end 422 | 423 | function c_uint16(seqindex) 424 | if seqindex == typemax(UInt16) 425 | return "UINT16_MAX" 426 | else 427 | return string(seqindex) 428 | end 429 | end 430 | 431 | function print_c_data_tables(io, sequences, prop_page_indices, prop_pages, deduplicated_props, 432 | comb_index, comb_length, comb_issecond) 433 | print(io, "static const utf8proc_uint16_t utf8proc_sequences[] = ") 434 | write_c_index_array(io, sequences.storage, 8) 435 | print(io, "static const utf8proc_uint16_t utf8proc_stage1table[] = ") 436 | write_c_index_array(io, prop_page_indices, 8) 437 | print(io, "static const utf8proc_uint16_t utf8proc_stage2table[] = ") 438 | write_c_index_array(io, prop_pages, 8) 439 | 440 | print(io, """ 441 | static const utf8proc_property_t utf8proc_properties[] = { 442 | {0, 0, 0, 0, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, 0x3FF,0,false, false,false,false,false, 1, 0, 0, UTF8PROC_BOUNDCLASS_OTHER, UTF8PROC_INDIC_CONJUNCT_BREAK_NONE}, 443 | """) 444 | for prop in deduplicated_props 445 | print(io, " {", 446 | c_enum_name("CATEGORY", prop.category), ", ", 447 | prop.combining_class, ", ", 448 | c_enum_name("BIDI_CLASS", prop.bidi_class), ", ", 449 | c_enum_name("DECOMP_TYPE", prop.decomp_type), ", ", 450 | c_uint16(prop.decomp_seqindex), ", ", 451 | c_uint16(prop.casefold_seqindex), ", ", 452 | c_uint16(prop.uppercase_seqindex), ", ", 453 | c_uint16(prop.lowercase_seqindex), ", ", 454 | c_uint16(prop.titlecase_seqindex), ", ", 455 | c_uint16(prop.comb_index), ", ", 456 | c_uint16(prop.comb_length), ", ", 457 | prop.comb_issecond, ", ", 458 | prop.bidi_mirrored, ", ", 459 | prop.comp_exclusion, ", ", 460 | prop.ignorable, ", ", 461 | prop.control_boundary, ", ", 462 | prop.charwidth, ", ", 463 | prop.ambiguous_width, ", ", 464 | "0, ", # bitfield padding 465 | c_enum_name("BOUNDCLASS", prop.boundclass), ", ", 466 | c_enum_name("INDIC_CONJUNCT_BREAK", prop.indic_conjunct_break), 
467 | "},\n" 468 | ) 469 | end 470 | print(io, "};\n\n") 471 | 472 | print(io, "static const utf8proc_int32_t utf8proc_combinations_second[] = {\n") 473 | for dm0 in sort!(collect(keys(comb_mapping))) 474 | print(io, " "); 475 | for dm1 in sort!(collect(keys(comb_mapping[dm0]))) 476 | print(io, " ", dm1, ",") 477 | end 478 | print(io, "\n"); 479 | end 480 | print(io, "};\n\n") 481 | 482 | print(io, "static const utf8proc_int32_t utf8proc_combinations_combined[] = {\n") 483 | for dm0 in sort!(collect(keys(comb_mapping))) 484 | print(io, " "); 485 | for dm1 in sort!(collect(keys(comb_mapping[dm0]))) 486 | code = comb_mapping[dm0][dm1] 487 | print(io, " ", code, ",") 488 | end 489 | print(io, "\n"); 490 | end 491 | print(io, "};\n\n") 492 | end 493 | 494 | 495 | if !isinteractive() 496 | print_c_data_tables(stdout, sequences, prop_page_indices, prop_pages, deduplicated_props, 497 | comb_index, comb_length, comb_issecond) 498 | end 499 | -------------------------------------------------------------------------------- /utf8proc.c: -------------------------------------------------------------------------------- 1 | /* -*- mode: c; c-basic-offset: 2; tab-width: 2; indent-tabs-mode: nil -*- */ 2 | /* 3 | * Copyright (c) 2014-2021 Steven G. Johnson, Jiahao Chen, Peter Colberg, Tony Kelman, Scott P. Jones, and other contributors. 4 | * Copyright (c) 2009 Public Software Group e. 
V., Berlin, Germany 5 | * 6 | * Permission is hereby granted, free of charge, to any person obtaining a 7 | * copy of this software and associated documentation files (the "Software"), 8 | * to deal in the Software without restriction, including without limitation 9 | * the rights to use, copy, modify, merge, publish, distribute, sublicense, 10 | * and/or sell copies of the Software, and to permit persons to whom the 11 | * Software is furnished to do so, subject to the following conditions: 12 | * 13 | * The above copyright notice and this permission notice shall be included in 14 | * all copies or substantial portions of the Software. 15 | * 16 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 21 | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 22 | * DEALINGS IN THE SOFTWARE. 23 | */ 24 | 25 | /* 26 | * This library contains derived data from a modified version of the 27 | * Unicode data files. 28 | * 29 | * The original data files are available at 30 | * https://www.unicode.org/Public/UNIDATA/ 31 | * 32 | * Please notice the copyright statement in the file "utf8proc_data.c". 33 | */ 34 | 35 | 36 | /* 37 | * File name: utf8proc.c 38 | * 39 | * Description: 40 | * Implementation of libutf8proc. 
41 | */ 42 | 43 | 44 | #include "utf8proc.h" 45 | 46 | #ifndef SSIZE_MAX 47 | #define SSIZE_MAX ((size_t)SIZE_MAX/2) 48 | #endif 49 | #ifndef UINT16_MAX 50 | # define UINT16_MAX 65535U 51 | #endif 52 | 53 | #include "utf8proc_data.c" 54 | 55 | 56 | UTF8PROC_DLLEXPORT const utf8proc_int8_t utf8proc_utf8class[256] = { 57 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 58 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 59 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 60 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 61 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 62 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 63 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 64 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 65 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 66 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 67 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 68 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 69 | 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 70 | 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 71 | 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 72 | 4, 4, 4, 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0 }; 73 | 74 | #define UTF8PROC_HANGUL_SBASE 0xAC00 75 | #define UTF8PROC_HANGUL_LBASE 0x1100 76 | #define UTF8PROC_HANGUL_VBASE 0x1161 77 | #define UTF8PROC_HANGUL_TBASE 0x11A7 78 | #define UTF8PROC_HANGUL_LCOUNT 19 79 | #define UTF8PROC_HANGUL_VCOUNT 21 80 | #define UTF8PROC_HANGUL_TCOUNT 28 81 | #define UTF8PROC_HANGUL_NCOUNT 588 82 | #define UTF8PROC_HANGUL_SCOUNT 11172 83 | /* END is exclusive */ 84 | #define UTF8PROC_HANGUL_L_START 0x1100 85 | #define UTF8PROC_HANGUL_L_END 0x115A 86 | #define UTF8PROC_HANGUL_L_FILLER 0x115F 87 | #define UTF8PROC_HANGUL_V_START 0x1160 88 | #define UTF8PROC_HANGUL_V_END 0x11A3 89 | #define UTF8PROC_HANGUL_T_START 0x11A8 90 | #define UTF8PROC_HANGUL_T_END 0x11FA 91 | #define UTF8PROC_HANGUL_S_START 0xAC00 92 | #define UTF8PROC_HANGUL_S_END 0xD7A4 93 | 94 | /* Should follow 
semantic-versioning rules (semver.org) based on API 95 | compatibility. (Note that the shared-library version number will 96 | be different, being based on ABI compatibility.): */ 97 | #define STRINGIZEx(x) #x 98 | #define STRINGIZE(x) STRINGIZEx(x) 99 | UTF8PROC_DLLEXPORT const char *utf8proc_version(void) { 100 | return STRINGIZE(UTF8PROC_VERSION_MAJOR) "." STRINGIZE(UTF8PROC_VERSION_MINOR) "." STRINGIZE(UTF8PROC_VERSION_PATCH) ""; 101 | } 102 | 103 | UTF8PROC_DLLEXPORT const char *utf8proc_unicode_version(void) { 104 | return "16.0.0"; 105 | } 106 | 107 | UTF8PROC_DLLEXPORT const char *utf8proc_errmsg(utf8proc_ssize_t errcode) { 108 | switch (errcode) { 109 | case UTF8PROC_ERROR_NOMEM: 110 | return "Memory for processing UTF-8 data could not be allocated."; 111 | case UTF8PROC_ERROR_OVERFLOW: 112 | return "UTF-8 string is too long to be processed."; 113 | case UTF8PROC_ERROR_INVALIDUTF8: 114 | return "Invalid UTF-8 string"; 115 | case UTF8PROC_ERROR_NOTASSIGNED: 116 | return "Unassigned Unicode code point found in UTF-8 string."; 117 | case UTF8PROC_ERROR_INVALIDOPTS: 118 | return "Invalid options for UTF-8 processing chosen."; 119 | default: 120 | return "An unknown error occurred while processing UTF-8 data."; 121 | } 122 | } 123 | 124 | #define utf_cont(ch) (((ch) & 0xc0) == 0x80) 125 | UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_iterate( 126 | const utf8proc_uint8_t *str, utf8proc_ssize_t strlen, utf8proc_int32_t *dst 127 | ) { 128 | utf8proc_int32_t uc; 129 | const utf8proc_uint8_t *end; 130 | 131 | *dst = -1; 132 | if (!strlen) return 0; 133 | end = str + ((strlen < 0) ? 
4 : strlen); 134 | uc = *str++; 135 | if (uc < 0x80) { 136 | *dst = uc; 137 | return 1; 138 | } 139 | // Must be between 0xc2 and 0xf4 inclusive to be valid 140 | if ((utf8proc_uint32_t)(uc - 0xc2) > (0xf4-0xc2)) return UTF8PROC_ERROR_INVALIDUTF8; 141 | if (uc < 0xe0) { // 2-byte sequence 142 | // Must have valid continuation character 143 | if (str >= end || !utf_cont(*str)) return UTF8PROC_ERROR_INVALIDUTF8; 144 | *dst = ((uc & 0x1f)<<6) | (*str & 0x3f); 145 | return 2; 146 | } 147 | if (uc < 0xf0) { // 3-byte sequence 148 | if ((str + 1 >= end) || !utf_cont(*str) || !utf_cont(str[1])) 149 | return UTF8PROC_ERROR_INVALIDUTF8; 150 | // Check for surrogate chars 151 | if (uc == 0xed && *str > 0x9f) 152 | return UTF8PROC_ERROR_INVALIDUTF8; 153 | uc = ((uc & 0xf)<<12) | ((*str & 0x3f)<<6) | (str[1] & 0x3f); 154 | if (uc < 0x800) 155 | return UTF8PROC_ERROR_INVALIDUTF8; 156 | *dst = uc; 157 | return 3; 158 | } 159 | // 4-byte sequence 160 | // Must have 3 valid continuation characters 161 | if ((str + 2 >= end) || !utf_cont(*str) || !utf_cont(str[1]) || !utf_cont(str[2])) 162 | return UTF8PROC_ERROR_INVALIDUTF8; 163 | // Make sure in correct range (0x10000 - 0x10ffff) 164 | if (uc == 0xf0) { 165 | if (*str < 0x90) return UTF8PROC_ERROR_INVALIDUTF8; 166 | } else if (uc == 0xf4) { 167 | if (*str > 0x8f) return UTF8PROC_ERROR_INVALIDUTF8; 168 | } 169 | *dst = ((uc & 7)<<18) | ((*str & 0x3f)<<12) | ((str[1] & 0x3f)<<6) | (str[2] & 0x3f); 170 | return 4; 171 | } 172 | 173 | UTF8PROC_DLLEXPORT utf8proc_bool utf8proc_codepoint_valid(utf8proc_int32_t uc) { 174 | return (((utf8proc_uint32_t)uc)-0xd800 > 0x07ff) && ((utf8proc_uint32_t)uc < 0x110000); 175 | } 176 | 177 | UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_encode_char(utf8proc_int32_t uc, utf8proc_uint8_t *dst) { 178 | if (uc < 0x00) { 179 | return 0; 180 | } else if (uc < 0x80) { 181 | dst[0] = (utf8proc_uint8_t) uc; 182 | return 1; 183 | } else if (uc < 0x800) { 184 | dst[0] = (utf8proc_uint8_t)(0xC0 + (uc >> 6)); 185 
| dst[1] = (utf8proc_uint8_t)(0x80 + (uc & 0x3F)); 186 | return 2; 187 | // Note: we allow encoding 0xd800-0xdfff here, so as not to change 188 | // the API, however, these are actually invalid in UTF-8 189 | } else if (uc < 0x10000) { 190 | dst[0] = (utf8proc_uint8_t)(0xE0 + (uc >> 12)); 191 | dst[1] = (utf8proc_uint8_t)(0x80 + ((uc >> 6) & 0x3F)); 192 | dst[2] = (utf8proc_uint8_t)(0x80 + (uc & 0x3F)); 193 | return 3; 194 | } else if (uc < 0x110000) { 195 | dst[0] = (utf8proc_uint8_t)(0xF0 + (uc >> 18)); 196 | dst[1] = (utf8proc_uint8_t)(0x80 + ((uc >> 12) & 0x3F)); 197 | dst[2] = (utf8proc_uint8_t)(0x80 + ((uc >> 6) & 0x3F)); 198 | dst[3] = (utf8proc_uint8_t)(0x80 + (uc & 0x3F)); 199 | return 4; 200 | } else return 0; 201 | } 202 | 203 | /* internal version used for inserting 0xff bytes between graphemes */ 204 | static utf8proc_ssize_t charbound_encode_char(utf8proc_int32_t uc, utf8proc_uint8_t *dst) { 205 | if (uc < 0x00) { 206 | if (uc == -1) { /* internal value used for grapheme breaks */ 207 | dst[0] = (utf8proc_uint8_t)0xFF; 208 | return 1; 209 | } 210 | return 0; 211 | } else if (uc < 0x80) { 212 | dst[0] = (utf8proc_uint8_t)uc; 213 | return 1; 214 | } else if (uc < 0x800) { 215 | dst[0] = (utf8proc_uint8_t)(0xC0 + (uc >> 6)); 216 | dst[1] = (utf8proc_uint8_t)(0x80 + (uc & 0x3F)); 217 | return 2; 218 | } else if (uc < 0x10000) { 219 | dst[0] = (utf8proc_uint8_t)(0xE0 + (uc >> 12)); 220 | dst[1] = (utf8proc_uint8_t)(0x80 + ((uc >> 6) & 0x3F)); 221 | dst[2] = (utf8proc_uint8_t)(0x80 + (uc & 0x3F)); 222 | return 3; 223 | } else if (uc < 0x110000) { 224 | dst[0] = (utf8proc_uint8_t)(0xF0 + (uc >> 18)); 225 | dst[1] = (utf8proc_uint8_t)(0x80 + ((uc >> 12) & 0x3F)); 226 | dst[2] = (utf8proc_uint8_t)(0x80 + ((uc >> 6) & 0x3F)); 227 | dst[3] = (utf8proc_uint8_t)(0x80 + (uc & 0x3F)); 228 | return 4; 229 | } else return 0; 230 | } 231 | 232 | /* internal "unsafe" version that does not check whether uc is in range */ 233 | static const utf8proc_property_t 
*unsafe_get_property(utf8proc_int32_t uc) { 234 | /* ASSERT: uc >= 0 && uc < 0x110000 */ 235 | return utf8proc_properties + ( 236 | utf8proc_stage2table[ 237 | utf8proc_stage1table[uc >> 8] + (uc & 0xFF) 238 | ] 239 | ); 240 | } 241 | 242 | UTF8PROC_DLLEXPORT const utf8proc_property_t *utf8proc_get_property(utf8proc_int32_t uc) { 243 | return uc < 0 || uc >= 0x110000 ? utf8proc_properties : unsafe_get_property(uc); 244 | } 245 | 246 | /* return whether there is a grapheme break between boundclasses lbc and tbc 247 | (according to the definition of extended grapheme clusters) 248 | 249 | Rule numbering refers to TR29 Version 29 (Unicode 9.0.0): 250 | http://www.unicode.org/reports/tr29/tr29-29.html 251 | 252 | CAVEATS: 253 | Please note that evaluation of GB10 (grapheme breaks between emoji zwj sequences) 254 | and GB 12/13 (regional indicator code points) require knowledge of previous characters 255 | and are thus not handled by this function. This may result in an incorrect break before 256 | an E_Modifier class codepoint and an incorrectly missing break between two 257 | REGIONAL_INDICATOR class code points if such support does not exist in the caller. 258 | 259 | See the special support in grapheme_break_extended, for required bookkeeping by the caller. 260 | */ 261 | static utf8proc_bool grapheme_break_simple(int lbc, int tbc) { 262 | return 263 | (lbc == UTF8PROC_BOUNDCLASS_START) ? true : // GB1 264 | (lbc == UTF8PROC_BOUNDCLASS_CR && // GB3 265 | tbc == UTF8PROC_BOUNDCLASS_LF) ? false : // --- 266 | (lbc >= UTF8PROC_BOUNDCLASS_CR && lbc <= UTF8PROC_BOUNDCLASS_CONTROL) ? true : // GB4 267 | (tbc >= UTF8PROC_BOUNDCLASS_CR && tbc <= UTF8PROC_BOUNDCLASS_CONTROL) ? true : // GB5 268 | (lbc == UTF8PROC_BOUNDCLASS_L && // GB6 269 | (tbc == UTF8PROC_BOUNDCLASS_L || // --- 270 | tbc == UTF8PROC_BOUNDCLASS_V || // --- 271 | tbc == UTF8PROC_BOUNDCLASS_LV || // --- 272 | tbc == UTF8PROC_BOUNDCLASS_LVT)) ? 
false : // --- 273 | ((lbc == UTF8PROC_BOUNDCLASS_LV || // GB7 274 | lbc == UTF8PROC_BOUNDCLASS_V) && // --- 275 | (tbc == UTF8PROC_BOUNDCLASS_V || // --- 276 | tbc == UTF8PROC_BOUNDCLASS_T)) ? false : // --- 277 | ((lbc == UTF8PROC_BOUNDCLASS_LVT || // GB8 278 | lbc == UTF8PROC_BOUNDCLASS_T) && // --- 279 | tbc == UTF8PROC_BOUNDCLASS_T) ? false : // --- 280 | (tbc == UTF8PROC_BOUNDCLASS_EXTEND || // GB9 281 | tbc == UTF8PROC_BOUNDCLASS_ZWJ || // --- 282 | tbc == UTF8PROC_BOUNDCLASS_SPACINGMARK || // GB9a 283 | lbc == UTF8PROC_BOUNDCLASS_PREPEND) ? false : // GB9b 284 | (lbc == UTF8PROC_BOUNDCLASS_E_ZWG && // GB11 (requires additional handling below) 285 | tbc == UTF8PROC_BOUNDCLASS_EXTENDED_PICTOGRAPHIC) ? false : // ---- 286 | (lbc == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR && // GB12/13 (requires additional handling below) 287 | tbc == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR) ? false : // ---- 288 | true; // GB999 289 | } 290 | 291 | static utf8proc_bool grapheme_break_extended(int lbc, int tbc, int licb, int ticb, utf8proc_int32_t *state) 292 | { 293 | if (state) { 294 | int state_bc, state_icb; /* boundclass and indic_conjunct_break state */ 295 | if (*state == 0) { /* state initialization */ 296 | state_bc = lbc; 297 | state_icb = licb == UTF8PROC_INDIC_CONJUNCT_BREAK_CONSONANT ? licb : UTF8PROC_INDIC_CONJUNCT_BREAK_NONE; 298 | } 299 | else { /* lbc and licb are already encoded in *state */ 300 | state_bc = *state & 0xff; // 1st byte of state is bound class 301 | state_icb = *state >> 8; // 2nd byte of state is indic conjunct break 302 | } 303 | 304 | utf8proc_bool break_permitted = grapheme_break_simple(state_bc, tbc) && 305 | !(state_icb == UTF8PROC_INDIC_CONJUNCT_BREAK_LINKER 306 | && ticb == UTF8PROC_INDIC_CONJUNCT_BREAK_CONSONANT); // GB9c 307 | 308 | // Special support for GB9c. Don't break between two consonants 309 | // separated by 1+ linker characters and 0+ extend characters in any order. 
310 | // After a consonant, we enter LINKER state after at least one linker. 311 | if (ticb == UTF8PROC_INDIC_CONJUNCT_BREAK_CONSONANT 312 | || state_icb == UTF8PROC_INDIC_CONJUNCT_BREAK_CONSONANT 313 | || state_icb == UTF8PROC_INDIC_CONJUNCT_BREAK_EXTEND) 314 | state_icb = ticb; 315 | else if (state_icb == UTF8PROC_INDIC_CONJUNCT_BREAK_LINKER) 316 | state_icb = ticb == UTF8PROC_INDIC_CONJUNCT_BREAK_EXTEND ? 317 | UTF8PROC_INDIC_CONJUNCT_BREAK_LINKER : ticb; 318 | 319 | // Special support for GB 12/13 made possible by GB999. After two RI 320 | // class codepoints we want to force a break. Do this by resetting the 321 | // second RI's bound class to UTF8PROC_BOUNDCLASS_OTHER, to force a break 322 | // after that character according to GB999 (unless of course such a break is 323 | // forbidden by a different rule such as GB9). 324 | if (state_bc == tbc && tbc == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR) 325 | state_bc = UTF8PROC_BOUNDCLASS_OTHER; 326 | // Special support for GB11 (emoji extend* zwj / emoji) 327 | else if (state_bc == UTF8PROC_BOUNDCLASS_EXTENDED_PICTOGRAPHIC) { 328 | if (tbc == UTF8PROC_BOUNDCLASS_EXTEND) // fold EXTEND codepoints into emoji 329 | state_bc = UTF8PROC_BOUNDCLASS_EXTENDED_PICTOGRAPHIC; 330 | else if (tbc == UTF8PROC_BOUNDCLASS_ZWJ) 331 | state_bc = UTF8PROC_BOUNDCLASS_E_ZWG; // state to record emoji+zwg combo 332 | else 333 | state_bc = tbc; 334 | } 335 | else 336 | state_bc = tbc; 337 | 338 | *state = state_bc + (state_icb << 8); 339 | return break_permitted; 340 | } 341 | else 342 | return grapheme_break_simple(lbc, tbc); 343 | } 344 | 345 | UTF8PROC_DLLEXPORT utf8proc_bool utf8proc_grapheme_break_stateful( 346 | utf8proc_int32_t c1, utf8proc_int32_t c2, utf8proc_int32_t *state) { 347 | 348 | const utf8proc_property_t *p1 = utf8proc_get_property(c1); 349 | const utf8proc_property_t *p2 = utf8proc_get_property(c2); 350 | return grapheme_break_extended(p1->boundclass, 351 | p2->boundclass, 352 | p1->indic_conjunct_break, 353 | 
p2->indic_conjunct_break, 354 | state); 355 | } 356 | 357 | 358 | UTF8PROC_DLLEXPORT utf8proc_bool utf8proc_grapheme_break( 359 | utf8proc_int32_t c1, utf8proc_int32_t c2) { 360 | return utf8proc_grapheme_break_stateful(c1, c2, NULL); 361 | } 362 | 363 | static utf8proc_int32_t seqindex_decode_entry(const utf8proc_uint16_t **entry) 364 | { 365 | utf8proc_int32_t entry_cp = **entry; 366 | if ((entry_cp & 0xF800) == 0xD800) { 367 | *entry = *entry + 1; 368 | entry_cp = ((entry_cp & 0x03FF) << 10) | (**entry & 0x03FF); 369 | entry_cp += 0x10000; 370 | } 371 | return entry_cp; 372 | } 373 | 374 | static utf8proc_int32_t seqindex_decode_index(const utf8proc_uint32_t seqindex) 375 | { 376 | const utf8proc_uint16_t *entry = &utf8proc_sequences[seqindex]; 377 | return seqindex_decode_entry(&entry); 378 | } 379 | 380 | static utf8proc_ssize_t seqindex_write_char_decomposed(utf8proc_uint16_t seqindex, utf8proc_int32_t *dst, utf8proc_ssize_t bufsize, utf8proc_option_t options, int *last_boundclass) { 381 | utf8proc_ssize_t written = 0; 382 | const utf8proc_uint16_t *entry = &utf8proc_sequences[seqindex & 0x3FFF]; 383 | int len = seqindex >> 14; 384 | if (len >= 3) { 385 | len = *entry; 386 | entry++; 387 | } 388 | for (; len >= 0; entry++, len--) { 389 | utf8proc_int32_t entry_cp = seqindex_decode_entry(&entry); 390 | 391 | written += utf8proc_decompose_char(entry_cp, dst+written, 392 | (bufsize > written) ? (bufsize - written) : 0, options, 393 | last_boundclass); 394 | if (written < 0) return UTF8PROC_ERROR_OVERFLOW; 395 | } 396 | return written; 397 | } 398 | 399 | UTF8PROC_DLLEXPORT utf8proc_int32_t utf8proc_tolower(utf8proc_int32_t c) 400 | { 401 | utf8proc_int32_t cl = utf8proc_get_property(c)->lowercase_seqindex; 402 | return cl != UINT16_MAX ? 
seqindex_decode_index((utf8proc_uint32_t)cl) : c; 403 | } 404 | 405 | UTF8PROC_DLLEXPORT utf8proc_int32_t utf8proc_toupper(utf8proc_int32_t c) 406 | { 407 | utf8proc_int32_t cu = utf8proc_get_property(c)->uppercase_seqindex; 408 | return cu != UINT16_MAX ? seqindex_decode_index((utf8proc_uint32_t)cu) : c; 409 | } 410 | 411 | UTF8PROC_DLLEXPORT utf8proc_int32_t utf8proc_totitle(utf8proc_int32_t c) 412 | { 413 | utf8proc_int32_t cu = utf8proc_get_property(c)->titlecase_seqindex; 414 | return cu != UINT16_MAX ? seqindex_decode_index((utf8proc_uint32_t)cu) : c; 415 | } 416 | 417 | UTF8PROC_DLLEXPORT int utf8proc_islower(utf8proc_int32_t c) 418 | { 419 | const utf8proc_property_t *p = utf8proc_get_property(c); 420 | return p->lowercase_seqindex != p->uppercase_seqindex && p->lowercase_seqindex == UINT16_MAX; 421 | } 422 | 423 | UTF8PROC_DLLEXPORT int utf8proc_isupper(utf8proc_int32_t c) 424 | { 425 | const utf8proc_property_t *p = utf8proc_get_property(c); 426 | return p->lowercase_seqindex != p->uppercase_seqindex && p->uppercase_seqindex == UINT16_MAX && p->category != UTF8PROC_CATEGORY_LT; 427 | } 428 | 429 | /* return a character width analogous to wcwidth (except portable and 430 | hopefully less buggy than most system wcwidth functions). 
*/ 431 | UTF8PROC_DLLEXPORT int utf8proc_charwidth(utf8proc_int32_t c) { 432 | return utf8proc_get_property(c)->charwidth; 433 | } 434 | 435 | UTF8PROC_DLLEXPORT utf8proc_bool utf8proc_charwidth_ambiguous(utf8proc_int32_t c) { 436 | return utf8proc_get_property(c)->ambiguous_width; 437 | } 438 | 439 | UTF8PROC_DLLEXPORT utf8proc_category_t utf8proc_category(utf8proc_int32_t c) { 440 | return (utf8proc_category_t) utf8proc_get_property(c)->category; 441 | } 442 | 443 | UTF8PROC_DLLEXPORT const char *utf8proc_category_string(utf8proc_int32_t c) { 444 | static const char s[][3] = {"Cn","Lu","Ll","Lt","Lm","Lo","Mn","Mc","Me","Nd","Nl","No","Pc","Pd","Ps","Pe","Pi","Pf","Po","Sm","Sc","Sk","So","Zs","Zl","Zp","Cc","Cf","Cs","Co"}; 445 | return s[utf8proc_category(c)]; 446 | } 447 | 448 | #define utf8proc_decompose_lump(replacement_uc) \ 449 | return utf8proc_decompose_char((replacement_uc), dst, bufsize, \ 450 | options & ~(unsigned int)UTF8PROC_LUMP, last_boundclass) 451 | 452 | UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose_char(utf8proc_int32_t uc, utf8proc_int32_t *dst, utf8proc_ssize_t bufsize, utf8proc_option_t options, int *last_boundclass) { 453 | const utf8proc_property_t *property; 454 | utf8proc_propval_t category; 455 | utf8proc_int32_t hangul_sindex; 456 | if (uc < 0 || uc >= 0x110000) return UTF8PROC_ERROR_NOTASSIGNED; 457 | property = unsafe_get_property(uc); 458 | category = property->category; 459 | hangul_sindex = uc - UTF8PROC_HANGUL_SBASE; 460 | if (options & (UTF8PROC_COMPOSE|UTF8PROC_DECOMPOSE)) { 461 | if (hangul_sindex >= 0 && hangul_sindex < UTF8PROC_HANGUL_SCOUNT) { 462 | utf8proc_int32_t hangul_tindex; 463 | if (bufsize >= 1) { 464 | dst[0] = UTF8PROC_HANGUL_LBASE + 465 | hangul_sindex / UTF8PROC_HANGUL_NCOUNT; 466 | if (bufsize >= 2) dst[1] = UTF8PROC_HANGUL_VBASE + 467 | (hangul_sindex % UTF8PROC_HANGUL_NCOUNT) / UTF8PROC_HANGUL_TCOUNT; 468 | } 469 | hangul_tindex = hangul_sindex % UTF8PROC_HANGUL_TCOUNT; 470 | if (!hangul_tindex) 
return 2; 471 | if (bufsize >= 3) dst[2] = UTF8PROC_HANGUL_TBASE + hangul_tindex; 472 | return 3; 473 | } 474 | } 475 | if (options & UTF8PROC_REJECTNA) { 476 | if (!category) return UTF8PROC_ERROR_NOTASSIGNED; 477 | } 478 | if (options & UTF8PROC_IGNORE) { 479 | if (property->ignorable) return 0; 480 | } 481 | if (options & UTF8PROC_STRIPNA) { 482 | if (!category) return 0; 483 | } 484 | if (options & UTF8PROC_LUMP) { 485 | if (category == UTF8PROC_CATEGORY_ZS) utf8proc_decompose_lump(0x0020); 486 | if (uc == 0x2018 || uc == 0x2019 || uc == 0x02BC || uc == 0x02C8) 487 | utf8proc_decompose_lump(0x0027); 488 | if (category == UTF8PROC_CATEGORY_PD || uc == 0x2212) 489 | utf8proc_decompose_lump(0x002D); 490 | if (uc == 0x2044 || uc == 0x2215) utf8proc_decompose_lump(0x002F); 491 | if (uc == 0x2236) utf8proc_decompose_lump(0x003A); 492 | if (uc == 0x2039 || uc == 0x2329 || uc == 0x3008) 493 | utf8proc_decompose_lump(0x003C); 494 | if (uc == 0x203A || uc == 0x232A || uc == 0x3009) 495 | utf8proc_decompose_lump(0x003E); 496 | if (uc == 0x2216) utf8proc_decompose_lump(0x005C); 497 | if (uc == 0x02C4 || uc == 0x02C6 || uc == 0x2038 || uc == 0x2303) 498 | utf8proc_decompose_lump(0x005E); 499 | if (category == UTF8PROC_CATEGORY_PC || uc == 0x02CD) 500 | utf8proc_decompose_lump(0x005F); 501 | if (uc == 0x02CB) utf8proc_decompose_lump(0x0060); 502 | if (uc == 0x2223) utf8proc_decompose_lump(0x007C); 503 | if (uc == 0x223C) utf8proc_decompose_lump(0x007E); 504 | if ((options & UTF8PROC_NLF2LS) && (options & UTF8PROC_NLF2PS)) { 505 | if (category == UTF8PROC_CATEGORY_ZL || 506 | category == UTF8PROC_CATEGORY_ZP) 507 | utf8proc_decompose_lump(0x000A); 508 | } 509 | } 510 | if (options & UTF8PROC_STRIPMARK) { 511 | if (category == UTF8PROC_CATEGORY_MN || 512 | category == UTF8PROC_CATEGORY_MC || 513 | category == UTF8PROC_CATEGORY_ME) return 0; 514 | } 515 | if (options & UTF8PROC_CASEFOLD) { 516 | if (property->casefold_seqindex != UINT16_MAX) { 517 | return 
seqindex_write_char_decomposed(property->casefold_seqindex, dst, bufsize, options, last_boundclass); 518 | } 519 | } 520 | if (options & (UTF8PROC_COMPOSE|UTF8PROC_DECOMPOSE)) { 521 | if (property->decomp_seqindex != UINT16_MAX && 522 | (!property->decomp_type || (options & UTF8PROC_COMPAT))) { 523 | return seqindex_write_char_decomposed(property->decomp_seqindex, dst, bufsize, options, last_boundclass); 524 | } 525 | } 526 | if (options & UTF8PROC_CHARBOUND) { 527 | utf8proc_bool boundary; 528 | boundary = grapheme_break_extended(0, property->boundclass, 0, property->indic_conjunct_break, 529 | last_boundclass); 530 | if (boundary) { 531 | if (bufsize >= 1) dst[0] = -1; /* sentinel value for grapheme break */ 532 | if (bufsize >= 2) dst[1] = uc; 533 | return 2; 534 | } 535 | } 536 | if (bufsize >= 1) *dst = uc; 537 | return 1; 538 | } 539 | 540 | UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose( 541 | const utf8proc_uint8_t *str, utf8proc_ssize_t strlen, 542 | utf8proc_int32_t *buffer, utf8proc_ssize_t bufsize, utf8proc_option_t options 543 | ) { 544 | return utf8proc_decompose_custom(str, strlen, buffer, bufsize, options, NULL, NULL); 545 | } 546 | 547 | UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose_custom( 548 | const utf8proc_uint8_t *str, utf8proc_ssize_t strlen, 549 | utf8proc_int32_t *buffer, utf8proc_ssize_t bufsize, utf8proc_option_t options, 550 | utf8proc_custom_func custom_func, void *custom_data 551 | ) { 552 | /* strlen will be ignored, if UTF8PROC_NULLTERM is set in options */ 553 | utf8proc_ssize_t wpos = 0; 554 | if ((options & UTF8PROC_COMPOSE) && (options & UTF8PROC_DECOMPOSE)) 555 | return UTF8PROC_ERROR_INVALIDOPTS; 556 | if ((options & UTF8PROC_STRIPMARK) && 557 | !(options & UTF8PROC_COMPOSE) && !(options & UTF8PROC_DECOMPOSE)) 558 | return UTF8PROC_ERROR_INVALIDOPTS; 559 | { 560 | utf8proc_int32_t uc; 561 | utf8proc_ssize_t rpos = 0; 562 | utf8proc_ssize_t decomp_result; 563 | int boundclass = UTF8PROC_BOUNDCLASS_START; 564 | 
while (1) { 565 | if (options & UTF8PROC_NULLTERM) { 566 | rpos += utf8proc_iterate(str + rpos, -1, &uc); 567 | /* checking of return value is not necessary, 568 | as 'uc' is < 0 in case of error */ 569 | if (uc < 0) return UTF8PROC_ERROR_INVALIDUTF8; 570 | if (rpos < 0) return UTF8PROC_ERROR_OVERFLOW; 571 | if (uc == 0) break; 572 | } else { 573 | if (rpos >= strlen) break; 574 | rpos += utf8proc_iterate(str + rpos, strlen - rpos, &uc); 575 | if (uc < 0) return UTF8PROC_ERROR_INVALIDUTF8; 576 | } 577 | if (custom_func != NULL) { 578 | uc = custom_func(uc, custom_data); /* user-specified custom mapping */ 579 | } 580 | decomp_result = utf8proc_decompose_char( 581 | uc, buffer + wpos, (bufsize > wpos) ? (bufsize - wpos) : 0, options, 582 | &boundclass 583 | ); 584 | if (decomp_result < 0) return decomp_result; 585 | wpos += decomp_result; 586 | /* prohibiting integer overflows due to too long strings: */ 587 | if (wpos < 0 || 588 | wpos > (utf8proc_ssize_t)(SSIZE_MAX/sizeof(utf8proc_int32_t)/2)) 589 | return UTF8PROC_ERROR_OVERFLOW; 590 | } 591 | } 592 | if ((options & (UTF8PROC_COMPOSE|UTF8PROC_DECOMPOSE)) && bufsize >= wpos) { 593 | utf8proc_ssize_t pos = 0; 594 | while (pos < wpos-1) { 595 | utf8proc_int32_t uc1, uc2; 596 | const utf8proc_property_t *property1, *property2; 597 | uc1 = buffer[pos]; 598 | uc2 = buffer[pos+1]; 599 | property1 = unsafe_get_property(uc1); 600 | property2 = unsafe_get_property(uc2); 601 | if (property1->combining_class > property2->combining_class && 602 | property2->combining_class > 0) { 603 | buffer[pos] = uc2; 604 | buffer[pos+1] = uc1; 605 | if (pos > 0) pos--; else pos++; 606 | } else { 607 | pos++; 608 | } 609 | } 610 | } 611 | return wpos; 612 | } 613 | 614 | UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_normalize_utf32(utf8proc_int32_t *buffer, utf8proc_ssize_t length, utf8proc_option_t options) { 615 | /* UTF8PROC_NULLTERM option will be ignored, 'length' is never ignored */ 616 | if (options & (UTF8PROC_NLF2LS | 
UTF8PROC_NLF2PS | UTF8PROC_STRIPCC)) { 617 | utf8proc_ssize_t rpos; 618 | utf8proc_ssize_t wpos = 0; 619 | utf8proc_int32_t uc; 620 | for (rpos = 0; rpos < length; rpos++) { 621 | uc = buffer[rpos]; 622 | if (uc == 0x000D && rpos < length-1 && buffer[rpos+1] == 0x000A) rpos++; /* treat CR LF as a single newline function */ 623 | if (uc == 0x000A || uc == 0x000D || uc == 0x0085 || 624 | ((options & UTF8PROC_STRIPCC) && (uc == 0x000B || uc == 0x000C))) { /* LF, CR, NEL; VT and FF only count as newlines under STRIPCC */ 625 | if (options & UTF8PROC_NLF2LS) { 626 | if (options & UTF8PROC_NLF2PS) { 627 | buffer[wpos++] = 0x000A; 628 | } else { 629 | buffer[wpos++] = 0x2028; 630 | } 631 | } else { 632 | if (options & UTF8PROC_NLF2PS) { 633 | buffer[wpos++] = 0x2029; 634 | } else { 635 | buffer[wpos++] = 0x0020; 636 | } 637 | } 638 | } else if ((options & UTF8PROC_STRIPCC) && 639 | (uc < 0x0020 || (uc >= 0x007F && uc < 0x00A0))) { 640 | if (uc == 0x0009) buffer[wpos++] = 0x0020; /* HT becomes a space, every other control character is dropped */ 641 | } else { 642 | buffer[wpos++] = uc; 643 | } 644 | } 645 | length = wpos; 646 | } 647 | if (options & UTF8PROC_COMPOSE) { /* in-place composition: each character is tried against the most recent starter (combining class 0) */ 648 | utf8proc_int32_t *starter = NULL; 649 | const utf8proc_property_t *starter_property = NULL; 650 | utf8proc_propval_t max_combining_class = -1; 651 | utf8proc_ssize_t rpos; 652 | utf8proc_ssize_t wpos = 0; 653 | for (rpos = 0; rpos < length; rpos++) { 654 | utf8proc_int32_t current_char = buffer[rpos]; 655 | const utf8proc_property_t *current_property = utf8proc_get_property(current_char); /* range-checked: the caller-supplied buffer may hold -1 grapheme-break sentinels or other values that are not code points */ 656 | if (starter && current_property->combining_class > max_combining_class) { 657 | /* combination perhaps possible */ 658 | utf8proc_int32_t hangul_lindex; 659 | utf8proc_int32_t hangul_sindex; 660 | hangul_lindex = *starter - UTF8PROC_HANGUL_LBASE; 661 | if (hangul_lindex >= 0 && hangul_lindex < UTF8PROC_HANGUL_LCOUNT) { 662 | utf8proc_int32_t hangul_vindex; 663 | hangul_vindex = current_char - UTF8PROC_HANGUL_VBASE; 664 | if (hangul_vindex >= 0 && hangul_vindex < UTF8PROC_HANGUL_VCOUNT) { 665 | *starter = UTF8PROC_HANGUL_SBASE + 666 | (hangul_lindex * UTF8PROC_HANGUL_VCOUNT + hangul_vindex)
* 667 | UTF8PROC_HANGUL_TCOUNT; 668 | starter_property = NULL; 669 | continue; 670 | } 671 | } 672 | hangul_sindex = *starter - UTF8PROC_HANGUL_SBASE; 673 | if (hangul_sindex >= 0 && hangul_sindex < UTF8PROC_HANGUL_SCOUNT && 674 | (hangul_sindex % UTF8PROC_HANGUL_TCOUNT) == 0) { 675 | utf8proc_int32_t hangul_tindex; 676 | hangul_tindex = current_char - UTF8PROC_HANGUL_TBASE; 677 | if (hangul_tindex > 0 && hangul_tindex < UTF8PROC_HANGUL_TCOUNT) { /* index 0 means "no trailing consonant" (see utf8proc_decompose_char); accepting it would silently drop the character at TBASE */ 678 | *starter += hangul_tindex; 679 | starter_property = NULL; 680 | continue; 681 | } 682 | } 683 | if (!starter_property) { 684 | starter_property = utf8proc_get_property(*starter); /* range-checked for the same reason as current_property: the starter may be a sentinel */ 685 | } 686 | int idx = starter_property->comb_index; 687 | if (idx < 0x3FF && current_property->comb_issecond) { /* 0x3FF marks "starter is not the first character of any pair" */ 688 | int len = starter_property->comb_length; 689 | utf8proc_int32_t max_second = utf8proc_combinations_second[idx + len - 1]; 690 | if (current_char <= max_second) { 691 | // TODO: binary search? arithmetic search? 692 | for (int off = 0; off < len; ++off) { 693 | utf8proc_int32_t second = utf8proc_combinations_second[idx + off]; 694 | if (current_char < second) { 695 | /* not found */ 696 | break; 697 | } 698 | if (current_char == second) { 699 | /* found */ 700 | utf8proc_int32_t composition = utf8proc_combinations_combined[idx + off]; 701 | *starter = composition; 702 | starter_property = NULL; 703 | break; 704 | } 705 | } 706 | if (starter_property == NULL) { 707 | /* found */ 708 | continue; 709 | } 710 | } 711 | } 712 | } 713 | buffer[wpos] = current_char; 714 | if (current_property->combining_class) { 715 | if (current_property->combining_class > max_combining_class) { 716 | max_combining_class = current_property->combining_class; 717 | } 718 | } else { 719 | starter = buffer + wpos; 720 | starter_property = NULL; 721 | max_combining_class = -1; 722 | } 723 | wpos++; 724 | } 725 | length = wpos; 726 | } 727 | return length; 728 | } 729 | 730 | UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_reencode(utf8proc_int32_t *buffer,
utf8proc_ssize_t length, utf8proc_option_t options) { 731 | /* UTF8PROC_NULLTERM option will be ignored, 'length' is never ignored 732 | ASSERT: 'buffer' has one spare byte of free space at the end! */ 733 | length = utf8proc_normalize_utf32(buffer, length, options); 734 | if (length < 0) return length; 735 | { 736 | utf8proc_ssize_t rpos, wpos = 0; 737 | utf8proc_int32_t uc; 738 | if (options & UTF8PROC_CHARBOUND) { 739 | for (rpos = 0; rpos < length; rpos++) { 740 | uc = buffer[rpos]; 741 | wpos += charbound_encode_char(uc, ((utf8proc_uint8_t *)buffer) + wpos); 742 | } 743 | } else { 744 | for (rpos = 0; rpos < length; rpos++) { 745 | uc = buffer[rpos]; 746 | wpos += utf8proc_encode_char(uc, ((utf8proc_uint8_t *)buffer) + wpos); 747 | } 748 | } 749 | ((utf8proc_uint8_t *)buffer)[wpos] = 0; 750 | return wpos; 751 | } 752 | } 753 | 754 | UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_map( 755 | const utf8proc_uint8_t *str, utf8proc_ssize_t strlen, utf8proc_uint8_t **dstptr, utf8proc_option_t options 756 | ) { 757 | return utf8proc_map_custom(str, strlen, dstptr, options, NULL, NULL); 758 | } 759 | 760 | UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_map_custom( 761 | const utf8proc_uint8_t *str, utf8proc_ssize_t strlen, utf8proc_uint8_t **dstptr, utf8proc_option_t options, 762 | utf8proc_custom_func custom_func, void *custom_data 763 | ) { 764 | utf8proc_int32_t *buffer; 765 | utf8proc_ssize_t result; 766 | *dstptr = NULL; 767 | result = utf8proc_decompose_custom(str, strlen, NULL, 0, options, custom_func, custom_data); 768 | if (result < 0) return result; 769 | buffer = (utf8proc_int32_t *) malloc(((utf8proc_size_t)result) * sizeof(utf8proc_int32_t) + 1); 770 | if (!buffer) return UTF8PROC_ERROR_NOMEM; 771 | result = utf8proc_decompose_custom(str, strlen, buffer, result, options, custom_func, custom_data); 772 | if (result < 0) { 773 | free(buffer); 774 | return result; 775 | } 776 | result = utf8proc_reencode(buffer, result, options); 777 | if (result < 0) { 778 | 
free(buffer); 779 | return result; 780 | } 781 | { 782 | utf8proc_int32_t *newptr; 783 | newptr = (utf8proc_int32_t *) realloc(buffer, (size_t)result+1); 784 | if (newptr) buffer = newptr; 785 | } 786 | *dstptr = (utf8proc_uint8_t *)buffer; 787 | return result; 788 | } 789 | 790 | UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFD(const utf8proc_uint8_t *str) { 791 | utf8proc_uint8_t *retval; 792 | utf8proc_map(str, 0, &retval, UTF8PROC_NULLTERM | UTF8PROC_STABLE | 793 | UTF8PROC_DECOMPOSE); 794 | return retval; 795 | } 796 | 797 | UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFC(const utf8proc_uint8_t *str) { 798 | utf8proc_uint8_t *retval; 799 | utf8proc_map(str, 0, &retval, UTF8PROC_NULLTERM | UTF8PROC_STABLE | 800 | UTF8PROC_COMPOSE); 801 | return retval; 802 | } 803 | 804 | UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFKD(const utf8proc_uint8_t *str) { 805 | utf8proc_uint8_t *retval; 806 | utf8proc_map(str, 0, &retval, UTF8PROC_NULLTERM | UTF8PROC_STABLE | 807 | UTF8PROC_DECOMPOSE | UTF8PROC_COMPAT); 808 | return retval; 809 | } 810 | 811 | UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFKC(const utf8proc_uint8_t *str) { 812 | utf8proc_uint8_t *retval; 813 | utf8proc_map(str, 0, &retval, UTF8PROC_NULLTERM | UTF8PROC_STABLE | 814 | UTF8PROC_COMPOSE | UTF8PROC_COMPAT); 815 | return retval; 816 | } 817 | 818 | UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFKC_Casefold(const utf8proc_uint8_t *str) { 819 | utf8proc_uint8_t *retval; 820 | utf8proc_map(str, 0, &retval, UTF8PROC_NULLTERM | UTF8PROC_STABLE | 821 | UTF8PROC_COMPOSE | UTF8PROC_COMPAT | UTF8PROC_CASEFOLD | UTF8PROC_IGNORE); 822 | return retval; 823 | } 824 | -------------------------------------------------------------------------------- /utf8proc.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2014-2021 Steven G. Johnson, Jiahao Chen, Peter Colberg, Tony Kelman, Scott P. Jones, and other contributors. 
3 | * Copyright (c) 2009 Public Software Group e. V., Berlin, Germany 4 | * 5 | * Permission is hereby granted, free of charge, to any person obtaining a 6 | * copy of this software and associated documentation files (the "Software"), 7 | * to deal in the Software without restriction, including without limitation 8 | * the rights to use, copy, modify, merge, publish, distribute, sublicense, 9 | * and/or sell copies of the Software, and to permit persons to whom the 10 | * Software is furnished to do so, subject to the following conditions: 11 | * 12 | * The above copyright notice and this permission notice shall be included in 13 | * all copies or substantial portions of the Software. 14 | * 15 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 21 | * DEALINGS IN THE SOFTWARE. 22 | */ 23 | 24 | 25 | /** 26 | * @mainpage 27 | * 28 | * utf8proc is a free/open-source (MIT/expat licensed) C library 29 | * providing Unicode normalization, case-folding, and other operations 30 | * for strings in the UTF-8 encoding, supporting up-to-date Unicode versions. 31 | * See the utf8proc home page (http://julialang.org/utf8proc/) 32 | * for downloads and other information, or the source code on github 33 | * (https://github.com/JuliaLang/utf8proc). 
34 | * 35 | * For the utf8proc API documentation, see: @ref utf8proc.h 36 | * 37 | * The features of utf8proc include: 38 | * 39 | * - Transformation of strings (utf8proc_map()) to: 40 | * - decompose (@ref UTF8PROC_DECOMPOSE) or compose (@ref UTF8PROC_COMPOSE) Unicode combining characters (http://en.wikipedia.org/wiki/Combining_character) 41 | * - canonicalize Unicode compatibility characters (@ref UTF8PROC_COMPAT) 42 | * - strip "ignorable" (@ref UTF8PROC_IGNORE) characters, control characters (@ref UTF8PROC_STRIPCC), or combining characters such as accents (@ref UTF8PROC_STRIPMARK) 43 | * - case-folding (@ref UTF8PROC_CASEFOLD) 44 | * - Unicode normalization: utf8proc_NFD(), utf8proc_NFC(), utf8proc_NFKD(), utf8proc_NFKC() 45 | * - Detecting grapheme boundaries (utf8proc_grapheme_break() and @ref UTF8PROC_CHARBOUND) 46 | * - Character-width computation: utf8proc_charwidth() 47 | * - Classification of characters by Unicode category: utf8proc_category() and utf8proc_category_string() 48 | * - Encode (utf8proc_encode_char()) and decode (utf8proc_iterate()) Unicode codepoints to/from UTF-8. 49 | */ 50 | 51 | /** @file */ 52 | 53 | #ifndef UTF8PROC_H 54 | #define UTF8PROC_H 55 | 56 | /** @name API version 57 | * 58 | * The utf8proc API version MAJOR.MINOR.PATCH, following 59 | * semantic-versioning rules (http://semver.org) based on API 60 | * compatibility. 61 | * 62 | * This is also returned at runtime by utf8proc_version(); however, the 63 | * runtime version may append a string like "-dev" to the version number 64 | * for prerelease versions. 65 | * 66 | * @note The shared-library version number in the Makefile 67 | * (and CMakeLists.txt, and MANIFEST) may be different, 68 | * being based on ABI compatibility rather than API compatibility. 69 | */ 70 | /** @{ */ 71 | /** The MAJOR version number (increased when backwards API compatibility is broken). 
*/ 72 | #define UTF8PROC_VERSION_MAJOR 2 73 | /** The MINOR version number (increased when new functionality is added in a backwards-compatible manner). */ 74 | #define UTF8PROC_VERSION_MINOR 10 75 | /** The PATCH version (increased for fixes that do not change the API). */ 76 | #define UTF8PROC_VERSION_PATCH 0 77 | /** @} */ 78 | 79 | #include <stdlib.h> 80 | 81 | #if defined(_MSC_VER) && _MSC_VER < 1800 82 | // MSVC prior to 2013 lacked stdbool.h and stdint.h 83 | typedef signed char utf8proc_int8_t; 84 | typedef unsigned char utf8proc_uint8_t; 85 | typedef short utf8proc_int16_t; 86 | typedef unsigned short utf8proc_uint16_t; 87 | typedef int utf8proc_int32_t; 88 | typedef unsigned int utf8proc_uint32_t; 89 | # ifdef _WIN64 90 | typedef __int64 utf8proc_ssize_t; 91 | typedef unsigned __int64 utf8proc_size_t; 92 | # else 93 | typedef int utf8proc_ssize_t; 94 | typedef unsigned int utf8proc_size_t; 95 | # endif 96 | # ifndef __cplusplus 97 | // emulate C99 bool 98 | typedef unsigned char utf8proc_bool; 99 | # ifndef __bool_true_false_are_defined 100 | # define false 0 101 | # define true 1 102 | # define __bool_true_false_are_defined 1 103 | # endif 104 | # else 105 | typedef bool utf8proc_bool; 106 | # endif 107 | #else 108 | # include <stddef.h> 109 | # include <stdbool.h> 110 | # include <inttypes.h> 111 | typedef int8_t utf8proc_int8_t; 112 | typedef uint8_t utf8proc_uint8_t; 113 | typedef int16_t utf8proc_int16_t; 114 | typedef uint16_t utf8proc_uint16_t; 115 | typedef int32_t utf8proc_int32_t; 116 | typedef uint32_t utf8proc_uint32_t; 117 | typedef size_t utf8proc_size_t; 118 | typedef ptrdiff_t utf8proc_ssize_t; 119 | typedef bool utf8proc_bool; 120 | #endif 121 | #include <limits.h> 122 | 123 | #ifdef UTF8PROC_STATIC 124 | # define UTF8PROC_DLLEXPORT 125 | #else 126 | # ifdef _WIN32 127 | # ifdef UTF8PROC_EXPORTS 128 | # define UTF8PROC_DLLEXPORT __declspec(dllexport) 129 | # else 130 | # define UTF8PROC_DLLEXPORT __declspec(dllimport) 131 | # endif 132 | # elif __GNUC__ >= 4 133 | # define UTF8PROC_DLLEXPORT
__attribute__ ((visibility("default"))) 134 | # else 135 | # define UTF8PROC_DLLEXPORT 136 | # endif 137 | #endif 138 | 139 | #ifdef __cplusplus 140 | extern "C" { 141 | #endif 142 | 143 | /** 144 | * Option flags used by several functions in the library. 145 | */ 146 | typedef enum { 147 | /** The given UTF-8 input is NULL terminated. */ 148 | UTF8PROC_NULLTERM = (1<<0), 149 | /** Unicode Versioning Stability has to be respected. */ 150 | UTF8PROC_STABLE = (1<<1), 151 | /** Compatibility decomposition (i.e. formatting information is lost). */ 152 | UTF8PROC_COMPAT = (1<<2), 153 | /** Return a result with composed characters (cannot be combined with @ref UTF8PROC_DECOMPOSE). */ 154 | UTF8PROC_COMPOSE = (1<<3), 155 | /** Return a result with decomposed characters (cannot be combined with @ref UTF8PROC_COMPOSE). */ 156 | UTF8PROC_DECOMPOSE = (1<<4), 157 | /** Strip "default ignorable characters" such as SOFT-HYPHEN or ZERO-WIDTH-SPACE. */ 158 | UTF8PROC_IGNORE = (1<<5), 159 | /** Return an error, if the input contains unassigned codepoints. */ 160 | UTF8PROC_REJECTNA = (1<<6), 161 | /** 162 | * Indicating that NLF-sequences (LF, CRLF, CR, NEL) are representing a 163 | * line break, and should be converted to the codepoint for line 164 | * separation (LS). 165 | */ 166 | UTF8PROC_NLF2LS = (1<<7), 167 | /** 168 | * Indicating that NLF-sequences are representing a paragraph break, and 169 | * should be converted to the codepoint for paragraph separation 170 | * (PS). 171 | */ 172 | UTF8PROC_NLF2PS = (1<<8), 173 | /** Indicating that the meaning of NLF-sequences is unknown; they are converted to line-feed (LF). */ 174 | UTF8PROC_NLF2LF = (UTF8PROC_NLF2LS | UTF8PROC_NLF2PS), 175 | /** Strips and/or converts control characters. 176 | * 177 | * NLF-sequences are transformed into space, except if one of the 178 | * NLF2LS/PS/LF options is given. VerticalTab (VT) and FormFeed (FF) 179 | * are treated as a NLF-sequence in this case, and HorizontalTab (HT) is converted to a space. All other control 180 | * characters are simply removed.
181 | */ 182 | UTF8PROC_STRIPCC = (1<<9), 183 | /** 184 | * Performs unicode case folding, to be able to do a case-insensitive 185 | * string comparison. 186 | */ 187 | UTF8PROC_CASEFOLD = (1<<10), 188 | /** 189 | * Inserts 0xFF bytes at the beginning of each sequence which is 190 | * representing a single grapheme cluster (see UAX#29). 191 | */ 192 | UTF8PROC_CHARBOUND = (1<<11), 193 | /** Lumps certain characters together. 194 | * 195 | * E.g. HYPHEN U+2010 and MINUS U+2212 to ASCII "-". See lump.md for details. 196 | * 197 | * If NLF2LF is set, this includes a transformation of paragraph and 198 | * line separators to ASCII line-feed (LF). 199 | */ 200 | UTF8PROC_LUMP = (1<<12), 201 | /** Strips all character markings. 202 | * 203 | * This includes non-spacing, spacing and enclosing (i.e. accents). 204 | * @note This option works only with @ref UTF8PROC_COMPOSE or 205 | * @ref UTF8PROC_DECOMPOSE 206 | */ 207 | UTF8PROC_STRIPMARK = (1<<13), 208 | /** 209 | * Strip unassigned codepoints. 210 | */ 211 | UTF8PROC_STRIPNA = (1<<14), 212 | } utf8proc_option_t; 213 | 214 | /** @name Error codes 215 | * Error codes being returned by almost all functions. 216 | */ 217 | /** @{ */ 218 | /** Memory could not be allocated. */ 219 | #define UTF8PROC_ERROR_NOMEM -1 220 | /** The given string is too long to be processed. */ 221 | #define UTF8PROC_ERROR_OVERFLOW -2 222 | /** The given string is not a legal UTF-8 string. */ 223 | #define UTF8PROC_ERROR_INVALIDUTF8 -3 224 | /** The @ref UTF8PROC_REJECTNA flag was set and an unassigned codepoint was found. */ 225 | #define UTF8PROC_ERROR_NOTASSIGNED -4 226 | /** Invalid options have been used. */ 227 | #define UTF8PROC_ERROR_INVALIDOPTS -5 228 | /** @} */ 229 | 230 | /* @name Types */ 231 | 232 | /** Holds the value of a property. */ 233 | typedef utf8proc_int16_t utf8proc_propval_t; 234 | 235 | /** Struct containing information about a codepoint. 
*/ 236 | typedef struct utf8proc_property_struct { 237 | /** 238 | * Unicode category. 239 | * @see utf8proc_category_t. 240 | */ 241 | utf8proc_propval_t category; 242 | utf8proc_propval_t combining_class; /**< Canonical combining class; 0 marks a starter during composition. */ 243 | /** 244 | * Bidirectional class. 245 | * @see utf8proc_bidi_class_t. 246 | */ 247 | utf8proc_propval_t bidi_class; 248 | /** 249 | * @anchor Decomposition type. 250 | * @see utf8proc_decomp_type_t. 251 | */ 252 | utf8proc_propval_t decomp_type; 253 | utf8proc_uint16_t decomp_seqindex; /**< Decomposition sequence reference, or UINT16_MAX if none: low 14 bits index utf8proc_sequences, top 2 bits encode the length. */ 254 | utf8proc_uint16_t casefold_seqindex; /**< Case-folding sequence reference in the same encoding, or UINT16_MAX if none. */ 255 | utf8proc_uint16_t uppercase_seqindex; /**< Index into utf8proc_sequences of the uppercase mapping, or UINT16_MAX if none. */ 256 | utf8proc_uint16_t lowercase_seqindex; /**< Index into utf8proc_sequences of the lowercase mapping, or UINT16_MAX if none. */ 257 | utf8proc_uint16_t titlecase_seqindex; /**< Index into utf8proc_sequences of the titlecase mapping, or UINT16_MAX if none. */ 258 | /** 259 | * Character combining table. 260 | * 261 | * The character combining table is formally indexed by two 262 | * characters, the first and second character that might form a 263 | * combining pair. The table entry then contains the combined 264 | * character. Most character pairs cannot be combined. There are 265 | * about 1,000 characters that can be the first character in a 266 | * combining pair, and for most, there are only a handful of 267 | * possible second characters. 268 | * 269 | * The combining table is stored as a sparse matrix in the CSR 270 | * (compressed sparse row) format. That is, it is stored as two 271 | * arrays, `utf8proc_uint32_t utf8proc_combinations_second[]` and 272 | * `utf8proc_uint32_t utf8proc_combinations_combined[]`. These 273 | * contain the second combining characters and the combined 274 | * character of every combining pair. 275 | * 276 | * - `comb_index`: Index into the combining table if this character 277 | * is the first character in a combining pair, else 0x3ff 278 | * 279 | * - `comb_length`: Number of table entries for this first character 280 | * 281 | * - `comb_issecond`: As an optimization we also record whether this 282 | * character is the second combining character in any pair. If 283 | * not, we can skip the table lookup.
284 | * 285 | * A table lookup starts from a given character pair. It first 286 | * checks whether the first character is stored in the table 287 | * (i.e. that its `comb_index` is not 0x3ff) and whether the second 288 | * character is stored in the table (looking at `comb_issecond`). If 289 | * so, the `comb_length` table entries will be checked sequentially 290 | * for a match. 291 | */ 292 | utf8proc_uint16_t comb_index:10; 293 | utf8proc_uint16_t comb_length:5; 294 | utf8proc_uint16_t comb_issecond:1; 295 | unsigned bidi_mirrored:1; 296 | unsigned comp_exclusion:1; 297 | /** 298 | * Can this codepoint be ignored? 299 | * 300 | * Used by utf8proc_decompose_char() when @ref UTF8PROC_IGNORE is 301 | * passed as an option. 302 | */ 303 | unsigned ignorable:1; 304 | unsigned control_boundary:1; 305 | /** The width of the codepoint. */ 306 | unsigned charwidth:2; 307 | /** East Asian width class A */ 308 | unsigned ambiguous_width:1; 309 | unsigned pad:1; 310 | /** 311 | * Boundclass. 312 | * @see utf8proc_boundclass_t. 313 | */ 314 | unsigned boundclass:6; 315 | unsigned indic_conjunct_break:2; /**< Indic conjunct break class, consulted by the GB9c grapheme-break handling. */ 316 | } utf8proc_property_t; 317 | 318 | /** Unicode categories.
*/ 319 | typedef enum { 320 | UTF8PROC_CATEGORY_CN = 0, /**< Other, not assigned (utf8proc_get_property() also reports this category for invalid codepoints) */ 321 | UTF8PROC_CATEGORY_LU = 1, /**< Letter, uppercase */ 322 | UTF8PROC_CATEGORY_LL = 2, /**< Letter, lowercase */ 323 | UTF8PROC_CATEGORY_LT = 3, /**< Letter, titlecase */ 324 | UTF8PROC_CATEGORY_LM = 4, /**< Letter, modifier */ 325 | UTF8PROC_CATEGORY_LO = 5, /**< Letter, other */ 326 | UTF8PROC_CATEGORY_MN = 6, /**< Mark, nonspacing */ 327 | UTF8PROC_CATEGORY_MC = 7, /**< Mark, spacing combining */ 328 | UTF8PROC_CATEGORY_ME = 8, /**< Mark, enclosing */ 329 | UTF8PROC_CATEGORY_ND = 9, /**< Number, decimal digit */ 330 | UTF8PROC_CATEGORY_NL = 10, /**< Number, letter */ 331 | UTF8PROC_CATEGORY_NO = 11, /**< Number, other */ 332 | UTF8PROC_CATEGORY_PC = 12, /**< Punctuation, connector */ 333 | UTF8PROC_CATEGORY_PD = 13, /**< Punctuation, dash */ 334 | UTF8PROC_CATEGORY_PS = 14, /**< Punctuation, open */ 335 | UTF8PROC_CATEGORY_PE = 15, /**< Punctuation, close */ 336 | UTF8PROC_CATEGORY_PI = 16, /**< Punctuation, initial quote */ 337 | UTF8PROC_CATEGORY_PF = 17, /**< Punctuation, final quote */ 338 | UTF8PROC_CATEGORY_PO = 18, /**< Punctuation, other */ 339 | UTF8PROC_CATEGORY_SM = 19, /**< Symbol, math */ 340 | UTF8PROC_CATEGORY_SC = 20, /**< Symbol, currency */ 341 | UTF8PROC_CATEGORY_SK = 21, /**< Symbol, modifier */ 342 | UTF8PROC_CATEGORY_SO = 22, /**< Symbol, other */ 343 | UTF8PROC_CATEGORY_ZS = 23, /**< Separator, space */ 344 | UTF8PROC_CATEGORY_ZL = 24, /**< Separator, line */ 345 | UTF8PROC_CATEGORY_ZP = 25, /**< Separator, paragraph */ 346 | UTF8PROC_CATEGORY_CC = 26, /**< Other, control */ 347 | UTF8PROC_CATEGORY_CF = 27, /**< Other, format */ 348 | UTF8PROC_CATEGORY_CS = 28, /**< Other, surrogate */ 349 | UTF8PROC_CATEGORY_CO = 29, /**< Other, private use */ 350 | } utf8proc_category_t; 351 | 352 | /** Bidirectional character classes.
*/ 353 | typedef enum { 354 | UTF8PROC_BIDI_CLASS_L = 1, /**< Left-to-Right */ 355 | UTF8PROC_BIDI_CLASS_LRE = 2, /**< Left-to-Right Embedding */ 356 | UTF8PROC_BIDI_CLASS_LRO = 3, /**< Left-to-Right Override */ 357 | UTF8PROC_BIDI_CLASS_R = 4, /**< Right-to-Left */ 358 | UTF8PROC_BIDI_CLASS_AL = 5, /**< Right-to-Left Arabic */ 359 | UTF8PROC_BIDI_CLASS_RLE = 6, /**< Right-to-Left Embedding */ 360 | UTF8PROC_BIDI_CLASS_RLO = 7, /**< Right-to-Left Override */ 361 | UTF8PROC_BIDI_CLASS_PDF = 8, /**< Pop Directional Format */ 362 | UTF8PROC_BIDI_CLASS_EN = 9, /**< European Number */ 363 | UTF8PROC_BIDI_CLASS_ES = 10, /**< European Separator */ 364 | UTF8PROC_BIDI_CLASS_ET = 11, /**< European Number Terminator */ 365 | UTF8PROC_BIDI_CLASS_AN = 12, /**< Arabic Number */ 366 | UTF8PROC_BIDI_CLASS_CS = 13, /**< Common Number Separator */ 367 | UTF8PROC_BIDI_CLASS_NSM = 14, /**< Nonspacing Mark */ 368 | UTF8PROC_BIDI_CLASS_BN = 15, /**< Boundary Neutral */ 369 | UTF8PROC_BIDI_CLASS_B = 16, /**< Paragraph Separator */ 370 | UTF8PROC_BIDI_CLASS_S = 17, /**< Segment Separator */ 371 | UTF8PROC_BIDI_CLASS_WS = 18, /**< Whitespace */ 372 | UTF8PROC_BIDI_CLASS_ON = 19, /**< Other Neutrals */ 373 | UTF8PROC_BIDI_CLASS_LRI = 20, /**< Left-to-Right Isolate */ 374 | UTF8PROC_BIDI_CLASS_RLI = 21, /**< Right-to-Left Isolate */ 375 | UTF8PROC_BIDI_CLASS_FSI = 22, /**< First Strong Isolate */ 376 | UTF8PROC_BIDI_CLASS_PDI = 23, /**< Pop Directional Isolate */ 377 | } utf8proc_bidi_class_t; 378 | 379 | /** Decomposition type. 
*/ 380 | typedef enum { 381 | UTF8PROC_DECOMP_TYPE_FONT = 1, /**< Font */ 382 | UTF8PROC_DECOMP_TYPE_NOBREAK = 2, /**< Nobreak */ 383 | UTF8PROC_DECOMP_TYPE_INITIAL = 3, /**< Initial */ 384 | UTF8PROC_DECOMP_TYPE_MEDIAL = 4, /**< Medial */ 385 | UTF8PROC_DECOMP_TYPE_FINAL = 5, /**< Final */ 386 | UTF8PROC_DECOMP_TYPE_ISOLATED = 6, /**< Isolated */ 387 | UTF8PROC_DECOMP_TYPE_CIRCLE = 7, /**< Circle */ 388 | UTF8PROC_DECOMP_TYPE_SUPER = 8, /**< Super */ 389 | UTF8PROC_DECOMP_TYPE_SUB = 9, /**< Sub */ 390 | UTF8PROC_DECOMP_TYPE_VERTICAL = 10, /**< Vertical */ 391 | UTF8PROC_DECOMP_TYPE_WIDE = 11, /**< Wide */ 392 | UTF8PROC_DECOMP_TYPE_NARROW = 12, /**< Narrow */ 393 | UTF8PROC_DECOMP_TYPE_SMALL = 13, /**< Small */ 394 | UTF8PROC_DECOMP_TYPE_SQUARE = 14, /**< Square */ 395 | UTF8PROC_DECOMP_TYPE_FRACTION = 15, /**< Fraction */ 396 | UTF8PROC_DECOMP_TYPE_COMPAT = 16, /**< Compat */ 397 | } utf8proc_decomp_type_t; 398 | 399 | /** Boundclass property. (TR29) */ 400 | typedef enum { 401 | UTF8PROC_BOUNDCLASS_START = 0, /**< Start */ 402 | UTF8PROC_BOUNDCLASS_OTHER = 1, /**< Other */ 403 | UTF8PROC_BOUNDCLASS_CR = 2, /**< Cr */ 404 | UTF8PROC_BOUNDCLASS_LF = 3, /**< Lf */ 405 | UTF8PROC_BOUNDCLASS_CONTROL = 4, /**< Control */ 406 | UTF8PROC_BOUNDCLASS_EXTEND = 5, /**< Extend */ 407 | UTF8PROC_BOUNDCLASS_L = 6, /**< L */ 408 | UTF8PROC_BOUNDCLASS_V = 7, /**< V */ 409 | UTF8PROC_BOUNDCLASS_T = 8, /**< T */ 410 | UTF8PROC_BOUNDCLASS_LV = 9, /**< Lv */ 411 | UTF8PROC_BOUNDCLASS_LVT = 10, /**< Lvt */ 412 | UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR = 11, /**< Regional indicator */ 413 | UTF8PROC_BOUNDCLASS_SPACINGMARK = 12, /**< Spacingmark */ 414 | UTF8PROC_BOUNDCLASS_PREPEND = 13, /**< Prepend */ 415 | UTF8PROC_BOUNDCLASS_ZWJ = 14, /**< Zero Width Joiner */ 416 | 417 | /* the following are no longer used in Unicode 11, but we keep 418 | the constants here for backward compatibility */ 419 | UTF8PROC_BOUNDCLASS_E_BASE = 15, /**< Emoji Base */ 420 | UTF8PROC_BOUNDCLASS_E_MODIFIER =
16, /**< Emoji Modifier */ 421 | UTF8PROC_BOUNDCLASS_GLUE_AFTER_ZWJ = 17, /**< Glue_After_ZWJ */ 422 | UTF8PROC_BOUNDCLASS_E_BASE_GAZ = 18, /**< E_BASE + GLUE_AFTER_ZWJ */ 423 | 424 | /* the Extended_Pictographic property is used in the Unicode 11 425 | grapheme-boundary rules, so we store it in the boundclass field */ 426 | UTF8PROC_BOUNDCLASS_EXTENDED_PICTOGRAPHIC = 19, /**< Extended_Pictographic */ 427 | UTF8PROC_BOUNDCLASS_E_ZWG = 20, /**< UTF8PROC_BOUNDCLASS_EXTENDED_PICTOGRAPHIC + ZWJ */ 428 | } utf8proc_boundclass_t; 429 | 430 | /** Indic_Conjunct_Break property. (TR44) */ 431 | typedef enum { 432 | UTF8PROC_INDIC_CONJUNCT_BREAK_NONE = 0, /**< None */ 433 | UTF8PROC_INDIC_CONJUNCT_BREAK_LINKER = 1, /**< Linker */ 434 | UTF8PROC_INDIC_CONJUNCT_BREAK_CONSONANT = 2, /**< Consonant */ 435 | UTF8PROC_INDIC_CONJUNCT_BREAK_EXTEND = 3, /**< Extend */ 436 | } utf8proc_indic_conjunct_break_t; 437 | 438 | /** 439 | * Function pointer type passed to utf8proc_map_custom() and 440 | * utf8proc_decompose_custom(), which is used to specify a user-defined 441 | * mapping of codepoints to be applied in conjunction with other mappings. 442 | */ 443 | typedef utf8proc_int32_t (*utf8proc_custom_func)(utf8proc_int32_t codepoint, void *data); 444 | 445 | /** 446 | * Array containing the byte lengths of a UTF-8 encoded codepoint based 447 | * on the first byte. 448 | */ 449 | UTF8PROC_DLLEXPORT extern const utf8proc_int8_t utf8proc_utf8class[256]; 450 | 451 | /** 452 | * Returns the utf8proc API version as a string MAJOR.MINOR.PATCH 453 | * (http://semver.org format), possibly with a "-dev" suffix for 454 | * development versions. 455 | */ 456 | UTF8PROC_DLLEXPORT const char *utf8proc_version(void); 457 | 458 | /** 459 | * Returns the utf8proc supported Unicode version as a string MAJOR.MINOR.PATCH. 460 | */ 461 | UTF8PROC_DLLEXPORT const char *utf8proc_unicode_version(void); 462 | 463 | /** 464 | * Returns an informative error string for the given utf8proc error code 465 | * (e.g. the error codes returned by utf8proc_map()).
466 | */ 467 | UTF8PROC_DLLEXPORT const char *utf8proc_errmsg(utf8proc_ssize_t errcode); 468 | 469 | /** 470 | * Reads a single codepoint from the UTF-8 sequence being pointed to by `str`. 471 | * The maximum number of bytes read is `strlen`, unless `strlen` is 472 | * negative (in which case up to 4 bytes are read). 473 | * 474 | * If a valid codepoint could be read, it is stored in the variable 475 | * pointed to by `codepoint_ref`, otherwise that variable will be set to -1. 476 | * In case of success, the number of bytes read is returned; otherwise, a 477 | * negative error code is returned. 478 | */ 479 | UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_iterate(const utf8proc_uint8_t *str, utf8proc_ssize_t strlen, utf8proc_int32_t *codepoint_ref); 480 | 481 | /** 482 | * Check if a codepoint is valid (regardless of whether it has been 483 | * assigned a value by the current Unicode standard). 484 | * 485 | * @return 1 if the given `codepoint` is valid, and 0 otherwise. 486 | */ 487 | UTF8PROC_DLLEXPORT utf8proc_bool utf8proc_codepoint_valid(utf8proc_int32_t codepoint); 488 | 489 | /** 490 | * Encodes the codepoint as a UTF-8 string in the byte array pointed 491 | * to by `dst`. This array must be at least 4 bytes long. 492 | * 493 | * In case of success the number of bytes written is returned, and 494 | * otherwise 0 is returned. 495 | * 496 | * This function does not check whether `codepoint` is valid Unicode. 497 | */ 498 | UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_encode_char(utf8proc_int32_t codepoint, utf8proc_uint8_t *dst); 499 | 500 | /** 501 | * Look up the properties for a given codepoint. 502 | * 503 | * @param codepoint The Unicode codepoint. 504 | * 505 | * @returns 506 | * A pointer to a (constant) struct containing information about 507 | * the codepoint. 508 | * @par 509 | * If the codepoint is unassigned or invalid, a pointer to a special struct is 510 | * returned in which `category` is 0 (@ref UTF8PROC_CATEGORY_CN).
511 | */ 512 | UTF8PROC_DLLEXPORT const utf8proc_property_t *utf8proc_get_property(utf8proc_int32_t codepoint); 513 | 514 | /** Decompose a codepoint into an array of codepoints. 515 | * 516 | * @param codepoint the codepoint. 517 | * @param dst the destination buffer. 518 | * @param bufsize the size of the destination buffer (in codepoints). 519 | * @param options one or more of the following flags: 520 | * - @ref UTF8PROC_REJECTNA - return an error if `codepoint` is unassigned 521 | * - @ref UTF8PROC_IGNORE - strip "default ignorable" codepoints 522 | * - @ref UTF8PROC_CASEFOLD - apply Unicode casefolding 523 | * - @ref UTF8PROC_COMPAT - replace certain codepoints with their 524 | * compatibility decomposition 525 | * - @ref UTF8PROC_CHARBOUND - insert 0xFF bytes before each grapheme cluster 526 | * - @ref UTF8PROC_LUMP - lump certain different codepoints together 527 | * - @ref UTF8PROC_STRIPMARK - remove all character marks 528 | * - @ref UTF8PROC_STRIPNA - remove unassigned codepoints 529 | * @param last_boundclass 530 | * Pointer to an integer variable containing 531 | * the previous codepoint's (boundclass + indic_conjunct_break << 1) if the @ref UTF8PROC_CHARBOUND 532 | * option is used. If the string is being processed in order, this can be initialized to 0 for 533 | * the beginning of the string, and is thereafter updated automatically. Otherwise, this parameter is ignored. 534 | * 535 | * @return 536 | * In case of success, the number of codepoints written is returned; in case 537 | * of an error, a negative error code is returned (utf8proc_errmsg()). 538 | * @par 539 | * If the number of written codepoints would be bigger than `bufsize`, the 540 | * required buffer size is returned, while the buffer will be overwritten with 541 | * undefined data.
542 | */ 543 | UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose_char( 544 | utf8proc_int32_t codepoint, utf8proc_int32_t *dst, utf8proc_ssize_t bufsize, 545 | utf8proc_option_t options, int *last_boundclass 546 | ); 547 | 548 | /** 549 | * The same as utf8proc_decompose_char(), but acts on a whole UTF-8 550 | * string and orders the decomposed sequences correctly. 551 | * 552 | * If the @ref UTF8PROC_NULLTERM flag in `options` is set, processing 553 | * will be stopped when a NUL byte is encountered; otherwise `strlen` 554 | * bytes are processed. The result (in the form of 32-bit unicode 555 | * codepoints) is written into the buffer being pointed to by 556 | * `buffer` (which must contain at least `bufsize` entries). In case of 557 | * success, the number of codepoints written is returned; in case of an 558 | * error, a negative error code is returned (utf8proc_errmsg()). 559 | * See utf8proc_decompose_custom() to supply additional transformations. 560 | * 561 | * If the number of written codepoints would be bigger than `bufsize`, the 562 | * required buffer size is returned, while the buffer will be overwritten with 563 | * undefined data. 564 | */ 565 | UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose( 566 | const utf8proc_uint8_t *str, utf8proc_ssize_t strlen, 567 | utf8proc_int32_t *buffer, utf8proc_ssize_t bufsize, utf8proc_option_t options 568 | ); 569 | 570 | /** 571 | * The same as utf8proc_decompose(), but also takes a `custom_func` mapping function 572 | * that is called on each codepoint in `str` before any other transformations 573 | * (along with a `custom_data` pointer that is passed through to `custom_func`). 574 | * The `custom_func` argument is ignored if it is `NULL`. See also utf8proc_map_custom().
575 | */ 576 | UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose_custom( 577 | const utf8proc_uint8_t *str, utf8proc_ssize_t strlen, 578 | utf8proc_int32_t *buffer, utf8proc_ssize_t bufsize, utf8proc_option_t options, 579 | utf8proc_custom_func custom_func, void *custom_data 580 | ); 581 | 582 | /** 583 | * Normalizes the sequence of `length` codepoints pointed to by `buffer` 584 | * in-place (i.e., the result is also stored in `buffer`). 585 | * 586 | * @param buffer the (native-endian UTF-32) unicode codepoints to normalize. 587 | * @param length the length (in codepoints) of the buffer. 588 | * @param options a bitwise or (`|`) of one or more of the following flags: 589 | * - @ref UTF8PROC_NLF2LS - convert LF, CRLF, CR and NEL into LS 590 | * - @ref UTF8PROC_NLF2PS - convert LF, CRLF, CR and NEL into PS 591 | * - @ref UTF8PROC_NLF2LF - convert LF, CRLF, CR and NEL into LF 592 | * - @ref UTF8PROC_STRIPCC - strip or convert all non-affected control characters 593 | * - @ref UTF8PROC_COMPOSE - try to combine decomposed codepoints into composite 594 | * codepoints 595 | * - @ref UTF8PROC_STABLE - prohibit combining characters that would violate 596 | * the unicode versioning stability 597 | * 598 | * @return 599 | * In case of success, the length (in codepoints) of the normalized UTF-32 string is 600 | * returned; otherwise, a negative error code is returned (utf8proc_errmsg()). 601 | * 602 | * @warning The entries of the array pointed to by `buffer` have to be in the 603 | * range `0x0000` to `0x10FFFF`. Otherwise, the program might crash! 604 | */ 605 | UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_normalize_utf32(utf8proc_int32_t *buffer, utf8proc_ssize_t length, utf8proc_option_t options); 606 | 607 | /** 608 | * Reencodes the sequence of `length` codepoints pointed to by `buffer` 609 | * as UTF-8 data in-place (i.e., the result is also stored in `buffer`). 610 | * Can optionally normalize the UTF-32 sequence prior to UTF-8 conversion.
611 | * 612 | * @param buffer the (native-endian UTF-32) unicode codepoints to re-encode. 613 | * @param length the length (in codepoints) of the buffer. 614 | * @param options a bitwise or (`|`) of one or more of the following flags: 615 | * - @ref UTF8PROC_NLF2LS - convert LF, CRLF, CR and NEL into LS 616 | * - @ref UTF8PROC_NLF2PS - convert LF, CRLF, CR and NEL into PS 617 | * - @ref UTF8PROC_NLF2LF - convert LF, CRLF, CR and NEL into LF 618 | * - @ref UTF8PROC_STRIPCC - strip or convert all non-affected control characters 619 | * - @ref UTF8PROC_COMPOSE - try to combine decomposed codepoints into composite 620 | * codepoints 621 | * - @ref UTF8PROC_STABLE - prohibit combining characters that would violate 622 | * the unicode versioning stability 623 | * - @ref UTF8PROC_CHARBOUND - insert 0xFF bytes before each grapheme cluster 624 | * 625 | * @return 626 | * In case of success, the length (in bytes) of the resulting nul-terminated 627 | * UTF-8 string is returned; otherwise, a negative error code is returned 628 | * (utf8proc_errmsg()). 629 | * 630 | * @warning The amount of free space pointed to by `buffer` must 631 | * exceed the amount of the input data by one byte, and the 632 | * entries of the array pointed to by `buffer` have to be in the 633 | * range `0x0000` to `0x10FFFF`. Otherwise, the program might crash! 634 | */ 635 | UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_reencode(utf8proc_int32_t *buffer, utf8proc_ssize_t length, utf8proc_option_t options); 636 | 637 | /** 638 | * Given a pair of consecutive codepoints, return whether a grapheme break is 639 | * permitted between them (as defined by the extended grapheme clusters in UAX#29). 640 | * 641 | * @param codepoint1 The first codepoint. 642 | * @param codepoint2 The second codepoint, occurring consecutively after `codepoint1`. 643 | * @param state Beginning with Version 29 (Unicode 9.0.0), this algorithm requires 644 | * state to break graphemes.
This state can be passed in as a pointer 645 | * in the `state` argument and should initially be set to 0. If the 646 | * state is not passed in (i.e. a null pointer is passed), UAX#29 rules 647 | * GB10/12/13 which require this state will not be applied, essentially 648 | * matching the rules in Unicode 8.0.0. 649 | * 650 | * @warning If the state parameter is used, `utf8proc_grapheme_break_stateful` must 651 | * be called IN ORDER on ALL potential breaks in a string. However, it 652 | * is safe to reset the state to zero after a grapheme break. 653 | */ 654 | UTF8PROC_DLLEXPORT utf8proc_bool utf8proc_grapheme_break_stateful( 655 | utf8proc_int32_t codepoint1, utf8proc_int32_t codepoint2, utf8proc_int32_t *state); 656 | 657 | /** 658 | * Same as utf8proc_grapheme_break_stateful(), except without support for the 659 | * Unicode 9 additions to the algorithm. Supported for legacy reasons. 660 | */ 661 | UTF8PROC_DLLEXPORT utf8proc_bool utf8proc_grapheme_break( 662 | utf8proc_int32_t codepoint1, utf8proc_int32_t codepoint2); 663 | 664 | 665 | /** 666 | * Given a codepoint `c`, return the codepoint of the corresponding 667 | * lower-case character, if any; otherwise (if there is no lower-case 668 | * variant, or if `c` is not a valid codepoint) return `c`. 669 | */ 670 | UTF8PROC_DLLEXPORT utf8proc_int32_t utf8proc_tolower(utf8proc_int32_t c); 671 | 672 | /** 673 | * Given a codepoint `c`, return the codepoint of the corresponding 674 | * upper-case character, if any; otherwise (if there is no upper-case 675 | * variant, or if `c` is not a valid codepoint) return `c`. 676 | */ 677 | UTF8PROC_DLLEXPORT utf8proc_int32_t utf8proc_toupper(utf8proc_int32_t c); 678 | 679 | /** 680 | * Given a codepoint `c`, return the codepoint of the corresponding 681 | * title-case character, if any; otherwise (if there is no title-case 682 | * variant, or if `c` is not a valid codepoint) return `c`. 
683 | */ 684 | UTF8PROC_DLLEXPORT utf8proc_int32_t utf8proc_totitle(utf8proc_int32_t c); 685 | 686 | /** 687 | * Given a codepoint `c`, return `1` if the codepoint corresponds to a lower-case character 688 | * and `0` otherwise. 689 | */ 690 | UTF8PROC_DLLEXPORT int utf8proc_islower(utf8proc_int32_t c); 691 | 692 | /** 693 | * Given a codepoint `c`, return `1` if the codepoint corresponds to an upper-case character 694 | * and `0` otherwise. 695 | */ 696 | UTF8PROC_DLLEXPORT int utf8proc_isupper(utf8proc_int32_t c); 697 | 698 | /** 699 | * Given a codepoint, return a character width analogous to `wcwidth(codepoint)`, 700 | * except that a width of 0 is returned for non-printable codepoints 701 | * instead of -1 as in `wcwidth`. 702 | * 703 | * @note 704 | * If you want to check for particular types of non-printable characters 705 | * (analogous to `isprint` or `iscntrl`), use utf8proc_category(). */ 706 | UTF8PROC_DLLEXPORT int utf8proc_charwidth(utf8proc_int32_t codepoint); 707 | 708 | /** 709 | * Given a codepoint, return whether it has East Asian width class A (Ambiguous). 710 | * 711 | * Codepoints with this property are considered to have charwidth 1 (if they are printable) 712 | * but some East Asian fonts render them as double width. 713 | */ 714 | UTF8PROC_DLLEXPORT utf8proc_bool utf8proc_charwidth_ambiguous(utf8proc_int32_t codepoint); 715 | 716 | /** 717 | * Return the Unicode category for the codepoint (one of the 718 | * @ref utf8proc_category_t constants.) 719 | */ 720 | UTF8PROC_DLLEXPORT utf8proc_category_t utf8proc_category(utf8proc_int32_t codepoint); 721 | 722 | /** 723 | * Return the two-letter (nul-terminated) Unicode category string for 724 | * the codepoint (e.g. `"Lu"` or `"Co"`). 725 | */ 726 | UTF8PROC_DLLEXPORT const char *utf8proc_category_string(utf8proc_int32_t codepoint); 727 | 728 | /** 729 | * Maps the given UTF-8 string pointed to by `str` to a new UTF-8 730 | * string, allocated dynamically by `malloc` and returned via `dstptr`.
731 | * 732 | * If the @ref UTF8PROC_NULLTERM flag in the `options` field is set, 733 | * the length is determined by a NULL terminator, otherwise the 734 | * parameter `strlen` is evaluated to determine the string length, but 735 | * in any case the result will be NULL terminated (though it might 736 | * contain NULL characters within the string if `str` contained NULL 737 | * characters). Other flags in the `options` field are passed to the 738 | * functions defined above, and regarded as described. See also 739 | * utf8proc_map_custom() to supply a custom codepoint transformation. 740 | * 741 | * In case of success the length of the new string is returned, 742 | * otherwise a negative error code is returned. 743 | * 744 | * @note The memory of the new UTF-8 string will have been allocated 745 | * with `malloc`, and should therefore be deallocated with `free`. 746 | */ 747 | UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_map( 748 | const utf8proc_uint8_t *str, utf8proc_ssize_t strlen, utf8proc_uint8_t **dstptr, utf8proc_option_t options 749 | ); 750 | 751 | /** 752 | * Like utf8proc_map(), but also takes a `custom_func` mapping function 753 | * that is called on each codepoint in `str` before any other transformations 754 | * (along with a `custom_data` pointer that is passed through to `custom_func`). 755 | * The `custom_func` argument is ignored if it is `NULL`. 756 | */ 757 | UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_map_custom( 758 | const utf8proc_uint8_t *str, utf8proc_ssize_t strlen, utf8proc_uint8_t **dstptr, utf8proc_option_t options, 759 | utf8proc_custom_func custom_func, void *custom_data 760 | ); 761 | 762 | /** @name Unicode normalization 763 | * 764 | * Returns a pointer to newly allocated memory of an NFD, NFC, NFKD, NFKC or 765 | * NFKC_Casefold normalized version of the null-terminated string `str`.
These 766 | * are shortcuts to calling utf8proc_map() with @ref UTF8PROC_NULLTERM 767 | * combined with @ref UTF8PROC_STABLE and flags indicating the normalization. As with utf8proc_map(), the new string is allocated with `malloc` and should be deallocated with `free`. 768 | */ 769 | /** @{ */ 770 | /** NFD normalization (@ref UTF8PROC_DECOMPOSE). */ 771 | UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFD(const utf8proc_uint8_t *str); 772 | /** NFC normalization (@ref UTF8PROC_COMPOSE). */ 773 | UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFC(const utf8proc_uint8_t *str); 774 | /** NFKD normalization (@ref UTF8PROC_DECOMPOSE and @ref UTF8PROC_COMPAT). */ 775 | UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFKD(const utf8proc_uint8_t *str); 776 | /** NFKC normalization (@ref UTF8PROC_COMPOSE and @ref UTF8PROC_COMPAT). */ 777 | UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFKC(const utf8proc_uint8_t *str); 778 | /** 779 | * NFKC_Casefold normalization (@ref UTF8PROC_COMPOSE and @ref UTF8PROC_COMPAT 780 | * and @ref UTF8PROC_CASEFOLD and @ref UTF8PROC_IGNORE). 781 | **/ 782 | UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFKC_Casefold(const utf8proc_uint8_t *str); 783 | /** @} */ 784 | 785 | #ifdef __cplusplus 786 | } 787 | #endif 788 | 789 | #endif 790 | --------------------------------------------------------------------------------