├── scripts ├── __init__.py ├── Makefile.am ├── version.py ├── compression.py ├── basics.py ├── ccc.py ├── feature.py ├── general_category.py ├── encoding.py ├── numeric_value.py ├── collation.py ├── decomposition.py ├── composition.py ├── quickcheck.py ├── case_folding.py ├── case_conversion.py └── segmentation.py ├── .gitattributes ├── include └── Makefile.am ├── unicode.bin ├── examples ├── .gitignore ├── CMakeLists.txt ├── compress_text.c ├── decode_utf8.c ├── decode_utf16.c ├── decode_utf32.c ├── segment_text.c ├── Makefile.am ├── case_convert.c ├── encode_character.c ├── case_fold.c ├── compare_text.c ├── collate_text.c └── character_properties.c ├── Makefile.am ├── unicorn.pc.in ├── tests └── README.md ├── autogen.sh ├── .github ├── pull_request_template.md ├── ISSUE_TEMPLATE │ └── config.yml └── workflows │ └── build.yml ├── man ├── unisize.3 ├── uni_utf8.3 ├── uni_utf32.3 ├── uni_utf16.3 ├── unicasefold.3 ├── unicaseconv.3 ├── uni_scalar.3 ├── unierrfunc.3 ├── uni_big.3 ├── uni_little.3 ├── Makefile.am ├── uni_native.3 ├── uni_is.3 ├── CMakeLists.txt ├── uni_getversion.3 ├── uni_ccc.3 ├── uni_tolower.3 ├── uni_toupper.3 ├── uni_totitle.3 ├── uni_getucdversion.3 ├── uni_trust.3 ├── unimemfunc.3 ├── uni_normchk.3 ├── uninormform.3 ├── unichar.3 ├── uni_gc.3 ├── uni_prevbrk.3 ├── uni_validate.3 ├── uni_nulify.3 ├── uninormchk.3 ├── uni_caseconvchk.3 ├── uni_numval.3 ├── unistrength.3 ├── uni_casefoldchk.3 ├── uniweighting.3 ├── uni_seterrfunc.3 ├── uni_prev.3 ├── uni_compress.3 ├── uni_nextbrk.3 ├── uni_decompress.3 ├── uni_casefoldcmp.3 ├── uni_normqchk.3 ├── uniattr.3 ├── uni_sortkeycmp.3 ├── uni_setmemfunc.3 ├── uni_next.3 ├── uni_caseconv.3 ├── unistat.3 ├── uni_norm.3 ├── uni_convert.3 ├── uni_encode.3 ├── uni_casefold.3 ├── uni_sortkeymk.3 ├── uni_collate.3 ├── uni_normcmp.3 └── unibreak.3 ├── src ├── logger.c ├── Makefile.am ├── ducet.h ├── normalize.h ├── charvec.h ├── charbuf.h ├── byteswap.h ├── memory.c ├── common.h ├── CMakeLists.txt ├── charvec.c 
├── properties.c └── unicorn.c ├── .gitignore ├── features.json ├── configure.ac ├── UnicornConfig.cmake.in └── CMakeLists.txt /scripts/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | * text=auto eol=lf 2 | -------------------------------------------------------------------------------- /include/Makefile.am: -------------------------------------------------------------------------------- 1 | include_HEADERS = unicorn.h 2 | -------------------------------------------------------------------------------- /scripts/Makefile.am: -------------------------------------------------------------------------------- 1 | EXTRA_DIST = $(wildcard *.py) 2 | -------------------------------------------------------------------------------- /unicode.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/railgunlabs/unicorn/HEAD/unicode.bin -------------------------------------------------------------------------------- /examples/.gitignore: -------------------------------------------------------------------------------- 1 | caseconv 2 | casefold 3 | charprops 4 | collate 5 | compare 6 | compress 7 | decode-* 8 | encode-* 9 | segmentation 10 | -------------------------------------------------------------------------------- /Makefile.am: -------------------------------------------------------------------------------- 1 | SUBDIRS = man src include examples scripts 2 | 3 | EXTRA_DIST = unicode.bin features.json autogen.sh LICENSE CMakeLists.txt UnicornConfig.cmake.in README.md 4 | 5 | AUTOMAKE_OPTIONS = subdir-objects 6 | 7 | pkgconfigdir = $(libdir)/pkgconfig 8 | pkgconfig_DATA = unicorn.pc 9 | -------------------------------------------------------------------------------- /unicorn.pc.in: 
-------------------------------------------------------------------------------- 1 | prefix=@prefix@ 2 | exec_prefix=@exec_prefix@ 3 | libdir=@libdir@ 4 | includedir=@includedir@ 5 | 6 | Name: unicorn 7 | Description: Embeddable Unicode algorithms. 8 | Version: @VERSION@ 9 | URL: https://railgunlabs.com/unicorn/ 10 | 11 | Cflags: -I${includedir} 12 | Libs: -L${libdir} -lunicorn 13 | -------------------------------------------------------------------------------- /tests/README.md: -------------------------------------------------------------------------------- 1 | # Looking for Tests? 2 | 3 | Unicorn is extensively tested; however, the tests and the code for generating the Unicode® data tables are **not** open source. 4 | Both are exclusively available to commercial licensees. 5 | 6 | See [https://railgunlabs.com/unicorn/license/](https://railgunlabs.com/unicorn/license/) for licensing details. 7 | -------------------------------------------------------------------------------- /autogen.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | echo "Generating build information using autoreconf" 4 | echo "This may take a while ..." 
5 | 6 | if hash autoreconf 2>/dev/null; then 7 | autoreconf --verbose --install --force 8 | echo "Now you are ready to run ./configure" 9 | else 10 | echo "Couldn't find autoreconf, aborting" 11 | echo "Please install autoreconf and try again" 12 | fi 13 | -------------------------------------------------------------------------------- /.github/pull_request_template.md: -------------------------------------------------------------------------------- 1 | 7 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/config.yml: -------------------------------------------------------------------------------- 1 | blank_issues_enabled: false 2 | contact_links: 3 | - name: 📚 Documentation 4 | url: https://railgunlabs.com/unicorn/manual 5 | about: Review documentation and code examples. 6 | - name: 🔹 Basic Support 7 | url: https://railgunlabs.com/contribute 8 | about: Submit unsolicited bug reports and code contributions. 9 | - name: 💎 Premium Support 10 | url: https://railgunlabs.com/services 11 | about: Priority support for bug reports and personal Q&A assistance. 12 | -------------------------------------------------------------------------------- /scripts/version.py: -------------------------------------------------------------------------------- 1 | # Unicorn - Embeddable Unicode Algorithms 2 | # Copyright (c) 2024-2025 Railgun Labs 3 | # 4 | # This software is dual-licensed: you can redistribute it and/or modify 5 | # it under the terms of the GNU General Public License version 3 as 6 | # published by the Free Software Foundation. For the terms of this 7 | # license, see . 8 | # 9 | # Alternatively, you can license this software under a proprietary 10 | # license, as set out in . 
11 | 12 | UNICODE_VERSION = "17.0.0" 13 | -------------------------------------------------------------------------------- /examples/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | link_libraries(unicorn) 2 | add_executable(example_case_convert case_convert.c) 3 | add_executable(example_case_fold case_fold.c) 4 | add_executable(example_character_properties character_properties.c) 5 | add_executable(example_collate_text collate_text.c) 6 | add_executable(example_compare_text compare_text.c) 7 | add_executable(example_decode_utf8 decode_utf8.c) 8 | add_executable(example_decode_utf16 decode_utf16.c) 9 | add_executable(example_decode_utf32 decode_utf32.c) 10 | add_executable(example_encode_character encode_character.c) 11 | add_executable(example_segment_text segment_text.c) 12 | add_executable(example_compress_text compress_text.c) 13 | -------------------------------------------------------------------------------- /man/unisize.3: -------------------------------------------------------------------------------- 1 | .TH "UNICORN" "3" "Dec 6th 2025" "Unicorn 1.3.1" 2 | .SH NAME 3 | unisize \- code unit count 4 | .SH LIBRARY 5 | Embeddable Unicode Algorithms (libunicorn, -lunicorn) 6 | .SH SYNOPSIS 7 | .nf 8 | .B #include 9 | .PP 10 | .BI "typedef int32_t unisize;" 11 | .fi 12 | .SH DESCRIPTION 13 | An integer type whose storage size represents the maximum potential length (in code units) of encoded text. 14 | .SH AUTHOR 15 | .UR https://railgunlabs.com 16 | Railgun Labs 17 | .UE . 18 | .SH INTERNET RESOURCES 19 | The online documentation is published on the 20 | .UR https://railgunlabs.com/unicorn 21 | Railgun Labs website 22 | .UE . 23 | .SH LICENSING 24 | Unicorn is distributed with its end-user license agreement (EULA). 25 | Please review the agreement for information on terms & conditions for accessing or otherwise using Unicorn and for a DISCLAIMER OF ALL WARRANTIES. 
26 | -------------------------------------------------------------------------------- /man/uni_utf8.3: -------------------------------------------------------------------------------- 1 | .TH "UNICORN" "3" "Dec 6th 2025" "Unicorn 1.3.1" 2 | .SH NAME 3 | UNI_UTF8 \- UTF-8 encoding form 4 | .SH LIBRARY 5 | Embeddable Unicode Algorithms (libunicorn, -lunicorn) 6 | .SH SYNOPSIS 7 | .nf 8 | .B #include 9 | .PP 10 | .B #define UNI_UTF8 0x2u 11 | .fi 12 | .SH DESCRIPTION 13 | Text attribute bit flag that indicates the text is encoded as UTF-8. 14 | The implementation uses an unsigned 8-bit integer (\f[C]uint8_t\f[R]) to represent UTF-8 code units. 15 | .SH AUTHOR 16 | .UR https://railgunlabs.com 17 | Railgun Labs 18 | .UE . 19 | .SH INTERNET RESOURCES 20 | The online documentation is published on the 21 | .UR https://railgunlabs.com/unicorn 22 | Railgun Labs website 23 | .UE . 24 | .SH LICENSING 25 | Unicorn is distributed with its end-user license agreement (EULA). 26 | Please review the agreement for information on terms & conditions for accessing or otherwise using Unicorn and for a DISCLAIMER OF ALL WARRANTIES. 27 | -------------------------------------------------------------------------------- /man/uni_utf32.3: -------------------------------------------------------------------------------- 1 | .TH "UNICORN" "3" "Dec 6th 2025" "Unicorn 1.3.1" 2 | .SH NAME 3 | UNI_UTF32 \- UTF-32 encoding form 4 | .SH LIBRARY 5 | Embeddable Unicode Algorithms (libunicorn, -lunicorn) 6 | .SH SYNOPSIS 7 | .nf 8 | .B #include 9 | .PP 10 | .B #define UNI_UTF32 0x8u 11 | .fi 12 | .SH DESCRIPTION 13 | Text attribute bit flag that indicates text is encoded as UTF-32. 14 | The implementation uses an unsigned 32-bit integer (\f[C]uint32_t\f[R]) to represent UTF-32 code units. 15 | .SH AUTHOR 16 | .UR https://railgunlabs.com 17 | Railgun Labs 18 | .UE . 
19 | .SH INTERNET RESOURCES 20 | The online documentation is published on the 21 | .UR https://railgunlabs.com/unicorn 22 | Railgun Labs website 23 | .UE . 24 | .SH LICENSING 25 | Unicorn is distributed with its end-user license agreement (EULA). 26 | Please review the agreement for information on terms & conditions for accessing or otherwise using Unicorn and for a DISCLAIMER OF ALL WARRANTIES. 27 | -------------------------------------------------------------------------------- /man/uni_utf16.3: -------------------------------------------------------------------------------- 1 | .TH "UNICORN" "3" "Dec 6th 2025" "Unicorn 1.3.1" 2 | .SH NAME 3 | UNI_UTF16 \- UTF-16 encoding form 4 | .SH LIBRARY 5 | Embeddable Unicode Algorithms (libunicorn, -lunicorn) 6 | .SH SYNOPSIS 7 | .nf 8 | .B #include 9 | .PP 10 | .B #define UNI_UTF16 0x4u 11 | .fi 12 | .SH DESCRIPTION 13 | Text attribute bit flag that indicates the text is encoded as UTF-16. 14 | The implementation uses an unsigned 16-bit integer (\f[C]uint16_t\f[R]) to represent UTF-16 code units. 15 | .SH AUTHOR 16 | .UR https://railgunlabs.com 17 | Railgun Labs 18 | .UE . 19 | .SH INTERNET RESOURCES 20 | The online documentation is published on the 21 | .UR https://railgunlabs.com/unicorn 22 | Railgun Labs website 23 | .UE . 24 | .SH LICENSING 25 | Unicorn is distributed with its end-user license agreement (EULA). 26 | Please review the agreement for information on terms & conditions for accessing or otherwise using Unicorn and for a DISCLAIMER OF ALL WARRANTIES. 
27 | -------------------------------------------------------------------------------- /scripts/compression.py: -------------------------------------------------------------------------------- 1 | # Unicorn - Embeddable Unicode Algorithms 2 | # Copyright (c) 2024-2025 Railgun Labs 3 | # 4 | # This software is dual-licensed: you can redistribute it and/or modify 5 | # it under the terms of the GNU General Public License version 3 as 6 | # published by the Free Software Foundation. For the terms of this 7 | # license, see . 8 | # 9 | # Alternatively, you can license this software under a proprietary 10 | # license, as set out in . 11 | 12 | import zipfile 13 | 14 | from .config import Config 15 | from .tools import Codespace 16 | from .feature import Feature 17 | from .basics import SerializationData 18 | 19 | class ShortStringCompression(Feature): 20 | def process(self, archive: zipfile.ZipFile, codespace: Codespace, config: Config) -> SerializationData: 21 | public_header = "#define UNICORN_FEATURE_COMPRESSION\n" 22 | return SerializationData(public_header=public_header) 23 | -------------------------------------------------------------------------------- /.github/workflows/build.yml: -------------------------------------------------------------------------------- 1 | name: Build Status 2 | on: [push, pull_request, workflow_dispatch] 3 | jobs: 4 | linux: 5 | name: "Linux - ${{ matrix.compiler.display }}" 6 | runs-on: ubuntu-latest 7 | strategy: 8 | matrix: 9 | compiler: 10 | - { CC: gcc, display: GCC } 11 | - { CC: clang, display: Clang } 12 | steps: 13 | - name: Checkout Repository 14 | uses: actions/checkout@v4 15 | - uses: actions/setup-python@v5 16 | with: 17 | python-version: '3.10' 18 | - name: CMake Build 19 | env: 20 | CC: ${{ matrix.compiler.CC }} 21 | CFLAGS: ${{ matrix.compiler.CFLAGS }} 22 | run: | 23 | cmake -B build -DCMAKE_BUILD_TYPE=Release -DUNICORN_BUILD_EXAMPLES=ON 24 | cmake --build build 25 | - name: Autotools Build 26 | env: 27 | CC: ${{ 
matrix.compiler.CC }} 28 | CFLAGS: ${{ matrix.compiler.CFLAGS }} 29 | run: | 30 | ./autogen.sh 31 | ./configure --enable-examples 32 | make 33 | make distcheck 34 | -------------------------------------------------------------------------------- /scripts/basics.py: -------------------------------------------------------------------------------- 1 | # Unicorn - Embeddable Unicode Algorithms 2 | # Copyright (c) 2024-2025 Railgun Labs 3 | # 4 | # This software is dual-licensed: you can redistribute it and/or modify 5 | # it under the terms of the GNU General Public License version 3 as 6 | # published by the Free Software Foundation. For the terms of this 7 | # license, see . 8 | # 9 | # Alternatively, you can license this software under a proprietary 10 | # license, as set out in . 11 | 12 | from enum import IntEnum 13 | from dataclasses import dataclass 14 | 15 | class StorageSize(IntEnum): 16 | UINT32 = 32 17 | UINT16 = 16 18 | UINT8 = 8 19 | 20 | def byte_size(self) -> int: 21 | return int(self / 8) 22 | 23 | @dataclass 24 | class SerializationData: 25 | def __init__(self, source: str = "", header: str = "", public_header: str = "", size: int =0) -> None: 26 | self.source = source 27 | self.header = header 28 | self.public_header = public_header 29 | self.size = size 30 | -------------------------------------------------------------------------------- /src/logger.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Unicorn - Embeddable Unicode Algorithms 3 | * Copyright (c) 2024-2025 Railgun Labs 4 | * 5 | * This software is dual-licensed: you can redistribute it and/or modify 6 | * it under the terms of the GNU General Public License version 3 as 7 | * published by the Free Software Foundation. For the terms of this 8 | * license, see . 9 | * 10 | * Alternatively, you can license this software under a proprietary 11 | * license, as set out in . 
12 | */ 13 | 14 | #include "common.h" 15 | 16 | static unierrfunc unicorn_logger_cb; 17 | static void *unicorn_logger_ud; 18 | 19 | UNICORN_API void uni_seterrfunc(void *user_data, unierrfunc callback) // cppcheck-suppress misra-c2012-8.7 ; This is supposed to have external linkage. 20 | { 21 | unicorn_logger_ud = user_data; 22 | unicorn_logger_cb = callback; 23 | } 24 | 25 | void uni_message(const char *msg) 26 | { 27 | if (unicorn_logger_cb != NULL) 28 | { 29 | unicorn_logger_cb(unicorn_logger_ud, msg); 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /man/unicasefold.3: -------------------------------------------------------------------------------- 1 | .TH "UNICORN" "3" "Dec 6th 2025" "Unicorn 1.3.1" 2 | .SH NAME 3 | unicasefold \- case folding operations 4 | .SH LIBRARY 5 | Embeddable Unicode Algorithms (libunicorn, -lunicorn) 6 | .SH SYNOPSIS 7 | .nf 8 | .B #include 9 | .PP 10 | .B enum unicasefold { 11 | .RS 12 | .B UNI_DEFAULT, 13 | .B UNI_CANONICAL, 14 | .RE 15 | .B }; 16 | .fi 17 | .SH DESCRIPTION 18 | This enumeration defines the supported case folding operations. 19 | .SH CONSTANTS 20 | .TP 21 | .BR UNI_DEFAULT 22 | Case folding for binary caseless string comparison. 23 | .TP 24 | .BR UNI_CANONICAL 25 | Case folding for canonical caseless string comparison. 26 | .SH AUTHOR 27 | .UR https://railgunlabs.com 28 | Railgun Labs 29 | .UE . 30 | .SH INTERNET RESOURCES 31 | The online documentation is published on the 32 | .UR https://railgunlabs.com/unicorn 33 | Railgun Labs website 34 | .UE . 35 | .SH LICENSING 36 | Unicorn is distributed with its end-user license agreement (EULA). 37 | Please review the agreement for information on terms & conditions for accessing or otherwise using Unicorn and for a DISCLAIMER OF ALL WARRANTIES. 
38 | -------------------------------------------------------------------------------- /man/unicaseconv.3: -------------------------------------------------------------------------------- 1 | .TH "UNICORN" "3" "Dec 6th 2025" "Unicorn 1.3.1" 2 | .SH NAME 3 | unicaseconv \- case conversion operations 4 | .SH LIBRARY 5 | Embeddable Unicode Algorithms (libunicorn, -lunicorn) 6 | .SH SYNOPSIS 7 | .nf 8 | .B #include 9 | .PP 10 | .B enum unicaseconv { 11 | .RS 12 | .B UNI_LOWER, 13 | .B UNI_TITLE, 14 | .B UNI_UPPER, 15 | .RE 16 | .B }; 17 | .fi 18 | .SH DESCRIPTION 19 | This enumeration defines the supported case conversion operations. 20 | .SH CONSTANTS 21 | .TP 22 | .BR UNI_LOWER 23 | Lowercase case conversion. 24 | .TP 25 | .BR UNI_TITLE 26 | Titlecase case conversion. 27 | .TP 28 | .BR UNI_UPPER 29 | Uppercase case conversion. 30 | .SH AUTHOR 31 | .UR https://railgunlabs.com 32 | Railgun Labs 33 | .UE . 34 | .SH INTERNET RESOURCES 35 | The online documentation is published on the 36 | .UR https://railgunlabs.com/unicorn 37 | Railgun Labs website 38 | .UE . 39 | .SH LICENSING 40 | Unicorn is distributed with its end-user license agreement (EULA). 41 | Please review the agreement for information on terms & conditions for accessing or otherwise using Unicorn and for a DISCLAIMER OF ALL WARRANTIES. 
42 | -------------------------------------------------------------------------------- /src/Makefile.am: -------------------------------------------------------------------------------- 1 | AUTOMAKE_OPTIONS = subdir-objects 2 | 3 | config_json = @CONFIG_JSON@ 4 | 5 | include_HEADERS = _unicorn.h 6 | 7 | lib_LTLIBRARIES = libunicorn.la 8 | 9 | libunicorn_la_SOURCES = \ 10 | $(top_srcdir)/include/unicorn.h \ 11 | unicorn.c \ 12 | logger.c \ 13 | memory.c \ 14 | common.h \ 15 | byteswap.h \ 16 | encoding.c \ 17 | segmentation.c \ 18 | normalize.c \ 19 | normalize.h \ 20 | collation.c \ 21 | ducet.c \ 22 | ducet.h \ 23 | compression.c \ 24 | properties.c \ 25 | caseconv.c \ 26 | casefold.c \ 27 | charbuf.c \ 28 | charbuf.h \ 29 | charvec.c \ 30 | charvec.h \ 31 | unidata.c \ 32 | unidata.h \ 33 | _unicorn.h 34 | 35 | libunicorn_la_CFLAGS = -I$(top_srcdir)/include 36 | libunicorn_la_LDFLAGS = -no-undefined -version-info 0:0:0 37 | 38 | unidata.c: 39 | cd $(top_srcdir) && $(PYTHON) -m scripts --config $(config_json) --output $(abs_top_builddir)/src 40 | 41 | unidata.h: unidata.c 42 | 43 | _unicorn.h: unidata.c 44 | 45 | noinst_HEADERS = unidata.h 46 | 47 | BUILT_SOURCES = \ 48 | unidata.c \ 49 | unidata.h \ 50 | _unicorn.h 51 | 52 | CLEANFILES = \ 53 | unidata.c \ 54 | unidata.h \ 55 | _unicorn.h 56 | -------------------------------------------------------------------------------- /man/uni_scalar.3: -------------------------------------------------------------------------------- 1 | .TH "UNICORN" "3" "Dec 6th 2025" "Unicorn 1.3.1" 2 | .SH NAME 3 | UNI_SCALAR \- unicode scalar value 4 | .SH LIBRARY 5 | Embeddable Unicode Algorithms (libunicorn, -lunicorn) 6 | .SH SYNOPSIS 7 | .nf 8 | .B #include 9 | .PP 10 | .B #define UNI_SCALAR 0x1u 11 | .fi 12 | .SH DESCRIPTION 13 | Text attribute bit flag that indicates the text is represented by an integer type large enough to accommodate Unicode scalar values. 
14 | .PP 15 | This is a pseudo-encoding that uses the \f[B]unichar\f[R](3) type for storage. 16 | .PP 17 | Unicode scalars are not the same as code points. 18 | The former excludes surrogate characters whereas the latter does not. 19 | .SH SEE ALSO 20 | .BR unichar (3) 21 | .SH AUTHOR 22 | .UR https://railgunlabs.com 23 | Railgun Labs 24 | .UE . 25 | .SH INTERNET RESOURCES 26 | The online documentation is published on the 27 | .UR https://railgunlabs.com/unicorn 28 | Railgun Labs website 29 | .UE . 30 | .SH LICENSING 31 | Unicorn is distributed with its end-user license agreement (EULA). 32 | Please review the agreement for information on terms & conditions for accessing or otherwise using Unicorn and for a DISCLAIMER OF ALL WARRANTIES. 33 | -------------------------------------------------------------------------------- /man/unierrfunc.3: -------------------------------------------------------------------------------- 1 | .TH "UNICORN" "3" "Dec 6th 2025" "Unicorn 1.3.1" 2 | .SH NAME 3 | unierrfunc \- diagnostic function 4 | .SH LIBRARY 5 | Embeddable Unicode Algorithms (libunicorn, -lunicorn) 6 | .SH SYNOPSIS 7 | .nf 8 | .B #include 9 | .PP 10 | .BI "typedef void(*unierrfunc)(void *" user_data ", const char *" message ");" 11 | .fi 12 | .SH DESCRIPTION 13 | Defines the function signature for a custom error callback function. 14 | The resulting message, given by \f[I]message\f[R], is UTF-8 encoded and null-terminated. 15 | It is a message intended for programmers, written in US English, for debugging purposes. 16 | It must never be displayed to an end user. 17 | .SH SEE ALSO 18 | .BR uni_seterrfunc (3) 19 | .SH AUTHOR 20 | .UR https://railgunlabs.com 21 | Railgun Labs 22 | .UE . 23 | .SH INTERNET RESOURCES 24 | The online documentation is published on the 25 | .UR https://railgunlabs.com/unicorn 26 | Railgun Labs website 27 | .UE . 28 | .SH LICENSING 29 | Unicorn is distributed with its end-user license agreement (EULA). 
30 | Please review the agreement for information on terms & conditions for accessing or otherwise using Unicorn and for a DISCLAIMER OF ALL WARRANTIES. 31 | -------------------------------------------------------------------------------- /examples/compress_text.c: -------------------------------------------------------------------------------- 1 | /* 2 | * This file is public domain. 3 | * You may use, modify, and distribute it without restriction. 4 | */ 5 | 6 | // Examples are also documented online at: 7 | // . 8 | 9 | #include 10 | #include 11 | 12 | int main(int argc, char *argv[]) 13 | { 14 | // This code example compresses text using the BOCU-1 compression algorithm. 15 | // This algorithm is specifically designed for the compression of short 16 | // Unicode text. Once strings become longer, then a general purpose 17 | // compressor is preferable. 18 | 19 | const char src[] = u8"素早い茶色のキツネが怠け者の犬を飛び越えます"; 20 | 21 | char buf[64]; 22 | size_t buf_len = 64; 23 | 24 | // The following function compresses the input text 'src' and writes the 25 | // result to 'buf'. Its function signature is documented online at: 26 | // . 27 | if (uni_compress(src, -1, UNI_UTF8, buf, &buf_len) == UNI_OK) 28 | { 29 | printf("Uncompressed : %zu bytes\n", sizeof(src)); 30 | printf("Compressed : %zu bytes\n", buf_len); 31 | } 32 | return 0; 33 | } 34 | -------------------------------------------------------------------------------- /man/uni_big.3: -------------------------------------------------------------------------------- 1 | .TH "UNICORN" "3" "Dec 6th 2025" "Unicorn 1.3.1" 2 | .SH NAME 3 | UNI_BIG \- big endian byte order 4 | .SH LIBRARY 5 | Embeddable Unicode Algorithms (libunicorn, -lunicorn) 6 | .SH SYNOPSIS 7 | .nf 8 | .B #include 9 | .PP 10 | .B #define UNI_BIG 0x10u 11 | .fi 12 | .SH DESCRIPTION 13 | Text attribute bit flag that indicates the text is encoded in big endian byte order. 14 | It's incompatible with the \f[B]UNI_LITTLE\f[R](3) flag. 
15 | .PP 16 | The flag is intended for use with \f[B]UNI_UTF16\f[R](3) and \f[B]UNI_UTF32\f[R](3). 17 | It has no effect when used with \f[B]UNI_UTF8\f[R](3) or \f[B]UNI_SCALAR\f[R](3). 18 | .SH SEE ALSO 19 | .BR UNI_LITTLE (3), 20 | .BR UNI_UTF16 (3), 21 | .BR UNI_UTF32 (3), 22 | .BR UNI_UTF8 (3), 23 | .BR UNI_SCALAR (3) 24 | .SH AUTHOR 25 | .UR https://railgunlabs.com 26 | Railgun Labs 27 | .UE . 28 | .SH INTERNET RESOURCES 29 | The online documentation is published on the 30 | .UR https://railgunlabs.com/unicorn 31 | Railgun Labs website 32 | .UE . 33 | .SH LICENSING 34 | Unicorn is distributed with its end-user license agreement (EULA). 35 | Please review the agreement for information on terms & conditions for accessing or otherwise using Unicorn and for a DISCLAIMER OF ALL WARRANTIES. 36 | -------------------------------------------------------------------------------- /man/uni_little.3: -------------------------------------------------------------------------------- 1 | .TH "UNICORN" "3" "Dec 6th 2025" "Unicorn 1.3.1" 2 | .SH NAME 3 | UNI_LITTLE \- little endian byte order 4 | .SH LIBRARY 5 | Embeddable Unicode Algorithms (libunicorn, -lunicorn) 6 | .SH SYNOPSIS 7 | .nf 8 | .B #include 9 | .PP 10 | .B #define UNI_LITTLE 0x20u 11 | .fi 12 | .SH DESCRIPTION 13 | Text attribute bit flag that indicates the text is encoded in little endian byte order. 14 | It's incompatible with the \f[B]UNI_BIG\f[R](3) flag. 15 | .PP 16 | The flag is intended for use with \f[B]UNI_UTF16\f[R](3) and \f[B]UNI_UTF32\f[R](3). 17 | It has no effect when used with \f[B]UNI_UTF8\f[R](3) or \f[B]UNI_SCALAR\f[R](3). 18 | .SH SEE ALSO 19 | .BR UNI_BIG (3), 20 | .BR UNI_UTF16 (3), 21 | .BR UNI_UTF32 (3), 22 | .BR UNI_UTF8 (3), 23 | .BR UNI_SCALAR (3) 24 | .SH AUTHOR 25 | .UR https://railgunlabs.com 26 | Railgun Labs 27 | .UE . 28 | .SH INTERNET RESOURCES 29 | The online documentation is published on the 30 | .UR https://railgunlabs.com/unicorn 31 | Railgun Labs website 32 | .UE . 
33 | .SH LICENSING 34 | Unicorn is distributed with its end-user license agreement (EULA). 35 | Please review the agreement for information on terms & conditions for accessing or otherwise using Unicorn and for a DISCLAIMER OF ALL WARRANTIES. 36 | -------------------------------------------------------------------------------- /man/Makefile.am: -------------------------------------------------------------------------------- 1 | man3_MANS = \ 2 | uni_convert.3 \ 3 | uni_decompress.3 \ 4 | uninormchk.3 \ 5 | uni_normqchk.3 \ 6 | uni_norm.3 \ 7 | unisize.3 \ 8 | uni_normcmp.3 \ 9 | uni_caseconv.3 \ 10 | uni_casefoldcmp.3 \ 11 | uniattr.3 \ 12 | unimemfunc.3 \ 13 | uninormform.3 \ 14 | uni_nextbrk.3 \ 15 | uniweighting.3 \ 16 | uni_setmemfunc.3 \ 17 | uni_nulify.3 \ 18 | uni_getucdversion.3 \ 19 | uni_ccc.3 \ 20 | uni_utf16.3 \ 21 | unierrfunc.3 \ 22 | uni_sortkeymk.3 \ 23 | uni_encode.3 \ 24 | uni_compress.3 \ 25 | uni_utf8.3 \ 26 | unistat.3 \ 27 | uni_big.3 \ 28 | uni_normchk.3 \ 29 | uni_collate.3 \ 30 | uni_trust.3 \ 31 | uni_next.3 \ 32 | uni_caseconvchk.3 \ 33 | uni_getversion.3 \ 34 | uni_validate.3 \ 35 | uni_native.3 \ 36 | uni_scalar.3 \ 37 | unicaseconv.3 \ 38 | uni_utf32.3 \ 39 | uni_little.3 \ 40 | unibp.3 \ 41 | unistrength.3 \ 42 | uni_totitle.3 \ 43 | uni_sortkeycmp.3 \ 44 | unicorn.3 \ 45 | uni_toupper.3 \ 46 | uni_prev.3 \ 47 | unicasefold.3 \ 48 | uni_casefold.3 \ 49 | uni_gc.3 \ 50 | uni_prevbrk.3 \ 51 | unichar.3 \ 52 | uni_numval.3 \ 53 | uni_tolower.3 \ 54 | unigc.3 \ 55 | uni_seterrfunc.3 \ 56 | uni_casefoldchk.3 \ 57 | uni_is.3 \ 58 | unibreak.3 59 | EXTRA_DIST = CMakeLists.txt $(man3_MANS) 60 | -------------------------------------------------------------------------------- /examples/decode_utf8.c: -------------------------------------------------------------------------------- 1 | /* 2 | * This file is public domain. 3 | * You may use, modify, and distribute it without restriction. 
4 | */ 5 | 6 | // Examples are also documented online at: 7 | // . 8 | 9 | #include 10 | #include 11 | 12 | int main(int argc, char *argv[]) 13 | { 14 | // This example demonstrates decoding the code points of a UTF-8 encoded string. 15 | // It prints each code point on its own line. 16 | 17 | const char str[] = u8"I 🕵️."; // I spy 18 | unisize i = 0; 19 | for (;;) 20 | { 21 | unichar cp = 0x0; 22 | unistat s = uni_next(str, -1, UNI_UTF8, &i, &cp); 23 | if (s == UNI_OK) 24 | { 25 | printf("U+%04X\n", cp); 26 | } 27 | else if (s == UNI_DONE) 28 | { 29 | break; 30 | } 31 | else if (s == UNI_BAD_ENCODING) 32 | { 33 | printf("malformed character at code unit %d\n", i); 34 | return 1; 35 | } 36 | else 37 | { 38 | puts("an internal error occured"); 39 | return 2; // Something else went wrong (check the status code). 40 | } 41 | } 42 | return 0; 43 | } 44 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Object files 2 | *.o 3 | *.ko 4 | *.obj 5 | *.elf 6 | 7 | # Libraries 8 | *.lib 9 | *.a 10 | 11 | # Shared objects (inc. 
Windows DLLs) 12 | *.dll 13 | *.so 14 | *.dylib 15 | 16 | # Executables 17 | *.exe 18 | *.out 19 | *.app 20 | *.i*86 21 | *.x86_64 22 | *.hex 23 | *.msi 24 | 25 | # Visual Studio temporary files 26 | *.suo 27 | *.user 28 | *.sbr 29 | *.pdb 30 | *.ncb 31 | *.sdf 32 | *.idb 33 | 34 | # Mac OS X 35 | .DS_Store 36 | 37 | # CMake out-of-source build files 38 | build 39 | Win32 40 | _CPack_Packages 41 | 42 | # Automake 43 | Makefile 44 | Makefile.in 45 | /ar-lib 46 | /mdate-sh 47 | /py-compile 48 | /test-driver 49 | /ylwrap 50 | .deps/ 51 | .dirstamp 52 | libtool 53 | ltmain.sh 54 | *.lo 55 | *.la 56 | *.lai 57 | *.so* 58 | *.libs 59 | 60 | # Autoconf 61 | autom4te.cache 62 | configure~ 63 | /autoscan.log 64 | /autoscan-*.log 65 | /aclocal.m4 66 | /compile 67 | /config.cache 68 | /config.guess 69 | /config.h.in 70 | /config.log 71 | /config.status 72 | /config.sub 73 | /configure 74 | /configure.scan 75 | /depcomp 76 | /install-sh 77 | /missing 78 | /stamp-h1 79 | *.tar.gz 80 | *.zip 81 | 82 | # Generated pkg-config file 83 | unicorn.pc 84 | 85 | # Python files 86 | *.pyc 87 | 88 | # Generate C source code 89 | unidata.c 90 | unidata.h 91 | _unicorn.h 92 | -------------------------------------------------------------------------------- /examples/decode_utf16.c: -------------------------------------------------------------------------------- 1 | /* 2 | * This file is public domain. 3 | * You may use, modify, and distribute it without restriction. 4 | */ 5 | 6 | // Examples are also documented online at: 7 | // . 8 | 9 | #include 10 | #include 11 | #include 12 | 13 | int main(int argc, char *argv[]) 14 | { 15 | // This example demonstrates decoding the code points of a UTF-16 encoded string. 16 | // It prints each code point on its own line. 
17 | 18 | const char16_t str[] = u"I 🕵️."; // I spy 19 | unisize i = 0; 20 | for (;;) 21 | { 22 | unichar cp = 0x0; 23 | unistat s = uni_next(str, -1, UNI_UTF16, &i, &cp); 24 | if (s == UNI_OK) 25 | { 26 | printf("U+%04X\n", cp); 27 | } 28 | else if (s == UNI_DONE) 29 | { 30 | break; 31 | } 32 | else if (s == UNI_BAD_ENCODING) 33 | { 34 | printf("malformed character at code unit %d\n", i); 35 | return 1; 36 | } 37 | else 38 | { 39 | puts("an internal error occured"); 40 | return 2; // Something else went wrong (check the status code). 41 | } 42 | } 43 | return 0; 44 | } 45 | -------------------------------------------------------------------------------- /examples/decode_utf32.c: -------------------------------------------------------------------------------- 1 | /* 2 | * This file is public domain. 3 | * You may use, modify, and distribute it without restriction. 4 | */ 5 | 6 | // Examples are also documented online at: 7 | // . 8 | 9 | #include 10 | #include 11 | #include 12 | 13 | int main(int argc, char *argv[]) 14 | { 15 | // This example demonstrates decoding the code points of a UTF-32 encoded string. 16 | // It prints each code point on its own line. 17 | 18 | const char32_t str[] = U"I 🕵️."; // I spy 19 | unisize i = 0; 20 | for (;;) 21 | { 22 | unichar cp = 0x0; 23 | unistat s = uni_next(str, -1, UNI_UTF32, &i, &cp); 24 | if (s == UNI_OK) 25 | { 26 | printf("U+%04X\n", cp); 27 | } 28 | else if (s == UNI_DONE) 29 | { 30 | break; 31 | } 32 | else if (s == UNI_BAD_ENCODING) 33 | { 34 | printf("malformed character at code unit %d\n", i); 35 | return 1; 36 | } 37 | else 38 | { 39 | puts("an internal error occured"); 40 | return 2; // Something else went wrong (check the status code). 
41 | } 42 | } 43 | return 0; 44 | } 45 | -------------------------------------------------------------------------------- /man/uni_native.3: -------------------------------------------------------------------------------- 1 | .TH "UNICORN" "3" "Dec 6th 2025" "Unicorn 1.3.1" 2 | .SH NAME 3 | UNI_NATIVE \- native endian byte order 4 | .SH LIBRARY 5 | Embeddable Unicode Algorithms (libunicorn, -lunicorn) 6 | .SH SYNOPSIS 7 | .nf 8 | .B #include 9 | .PP 10 | .B #define UNI_NATIVE 11 | .fi 12 | .SH DESCRIPTION 13 | Text attribute bit flag that indicates the text is encoded in native byte order. 14 | The value of this flag will be identical to either \f[B]UNI_BIG\f[R](3) or \f[B]UNI_LITTLE\f[R](3). 15 | .PP 16 | The flag is intended for use with \f[B]UNI_UTF16\f[R](3) and \f[B]UNI_UTF32\f[R](3). 17 | It has no effect when used with \f[B]UNI_UTF8\f[R](3) or \f[B]UNI_SCALAR\f[R](3). 18 | .SH SEE ALSO 19 | .BR UNI_BIG (3), 20 | .BR UNI_LITTLE (3), 21 | .BR UNI_UTF16 (3), 22 | .BR UNI_UTF32 (3), 23 | .BR UNI_UTF8 (3), 24 | .BR UNI_SCALAR (3) 25 | .SH AUTHOR 26 | .UR https://railgunlabs.com 27 | Railgun Labs 28 | .UE . 29 | .SH INTERNET RESOURCES 30 | The online documentation is published on the 31 | .UR https://railgunlabs.com/unicorn 32 | Railgun Labs website 33 | .UE . 34 | .SH LICENSING 35 | Unicorn is distributed with its end-user license agreement (EULA). 36 | Please review the agreement for information on terms & conditions for accessing or otherwise using Unicorn and for a DISCLAIMER OF ALL WARRANTIES. 
37 | -------------------------------------------------------------------------------- /man/uni_is.3: -------------------------------------------------------------------------------- 1 | .TH "UNICORN" "3" "Dec 6th 2025" "Unicorn 1.3.1" 2 | .SH NAME 3 | uni_is \- binary property value 4 | .SH LIBRARY 5 | Embeddable Unicode Algorithms (libunicorn, -lunicorn) 6 | .SH SYNOPSIS 7 | .nf 8 | .B #include 9 | .PP 10 | .BI "bool uni_is(unichar " c ", unibp " p ");" 11 | .fi 12 | .SH DESCRIPTION 13 | This function retrieves the value of the binary property \f[I]p\f[R] for the character \f[I]c\f[R]. 14 | .PP 15 | The binary character property \f[I]p\f[R] must be enabled in the JSON configuration file otherwise \f[C]false\f[R] is always returned. 16 | Instructions for enabling it are documented alongside property \f[I]p\f[R] in the \f[B]unibp\f[R](3) enumeration. 17 | .SH RETURN VALUE 18 | Returns \f[C]true\f[R] if character \f[I]c\f[R] has property \f[I]p\f[R]. 19 | .SH SEE ALSO 20 | .BR unibp (3), 21 | .BR unichar (3) 22 | .SH AUTHOR 23 | .UR https://railgunlabs.com 24 | Railgun Labs 25 | .UE . 26 | .SH INTERNET RESOURCES 27 | The online documentation is published on the 28 | .UR https://railgunlabs.com/unicorn 29 | Railgun Labs website 30 | .UE . 31 | .SH LICENSING 32 | Unicorn is distributed with its end-user license agreement (EULA). 33 | Please review the agreement for information on terms & conditions for accessing or otherwise using Unicorn and for a DISCLAIMER OF ALL WARRANTIES. 
34 | -------------------------------------------------------------------------------- /man/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(MAN 2 | uni_convert.3 3 | uni_decompress.3 4 | uninormchk.3 5 | uni_normqchk.3 6 | uni_norm.3 7 | unisize.3 8 | uni_normcmp.3 9 | uni_caseconv.3 10 | uni_casefoldcmp.3 11 | uniattr.3 12 | unimemfunc.3 13 | uninormform.3 14 | uni_nextbrk.3 15 | uniweighting.3 16 | uni_setmemfunc.3 17 | uni_nulify.3 18 | uni_getucdversion.3 19 | uni_ccc.3 20 | uni_utf16.3 21 | unierrfunc.3 22 | uni_sortkeymk.3 23 | uni_encode.3 24 | uni_compress.3 25 | uni_utf8.3 26 | unistat.3 27 | uni_big.3 28 | uni_normchk.3 29 | uni_collate.3 30 | uni_trust.3 31 | uni_next.3 32 | uni_caseconvchk.3 33 | uni_getversion.3 34 | uni_validate.3 35 | uni_native.3 36 | uni_scalar.3 37 | unicaseconv.3 38 | uni_utf32.3 39 | uni_little.3 40 | unibp.3 41 | unistrength.3 42 | uni_totitle.3 43 | uni_sortkeycmp.3 44 | unicorn.3 45 | uni_toupper.3 46 | uni_prev.3 47 | unicasefold.3 48 | uni_casefold.3 49 | uni_gc.3 50 | uni_prevbrk.3 51 | unichar.3 52 | uni_numval.3 53 | uni_tolower.3 54 | unigc.3 55 | uni_seterrfunc.3 56 | uni_casefoldchk.3 57 | uni_is.3 58 | unibreak.3) 59 | install(FILES ${MAN} DESTINATION man/man3 COMPONENT doc) 60 | -------------------------------------------------------------------------------- /src/ducet.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Unicorn - Embeddable Unicode Algorithms 3 | * Copyright (c) 2024-2025 Railgun Labs 4 | * 5 | * This software is dual-licensed: you can redistribute it and/or modify 6 | * it under the terms of the GNU General Public License version 3 as 7 | * published by the Free Software Foundation. For the terms of this 8 | * license, see . 9 | * 10 | * Alternatively, you can license this software under a proprietary 11 | * license, as set out in . 
12 | */ 13 | 14 | #ifndef DUCET_H 15 | #define DUCET_H 16 | 17 | #include "normalize.h" 18 | #include "unidata.h" 19 | 20 | #if defined(UNICORN_FEATURE_COLLATION) 21 | 22 | typedef struct CE 23 | { 24 | uint16_t primary; 25 | uint16_t secondary; 26 | uint16_t tertiary; 27 | } CE; 28 | 29 | struct CEDecoder 30 | { 31 | int32_t ces_index; 32 | int32_t ces_count; 33 | struct NormalizeState normbuf; 34 | struct CharVec charvec; 35 | CE ces[UNICORN_MAX_COLLATION]; 36 | }; 37 | 38 | void uni_cebuf_init(struct CEDecoder *state); 39 | void uni_cebuf_free(struct CEDecoder *state); 40 | bool uni_cebuf_is_empty(const struct CEDecoder *state); 41 | CE uni_cebuf_pop(struct CEDecoder *state); 42 | unistat uni_cebuf_append_run(struct CEDecoder *state, struct unitext *text); 43 | 44 | #endif 45 | 46 | #endif // DUCET_H 47 | -------------------------------------------------------------------------------- /man/uni_getversion.3: -------------------------------------------------------------------------------- 1 | .TH "UNICORN" "3" "Dec 6th 2025" "Unicorn 1.3.1" 2 | .SH NAME 3 | uni_getversion \- library version 4 | .SH LIBRARY 5 | Embeddable Unicode Algorithms (libunicorn, -lunicorn) 6 | .SH SYNOPSIS 7 | .nf 8 | .B #include 9 | .PP 10 | .BI "void uni_getversion(int32_t *" major ", int32_t *" minor ", int32_t *" patch ");" 11 | .fi 12 | .SH DESCRIPTION 13 | Writes the major, minor, and patch version of the Unicorn library to \f[I]major\f[R], \f[I]minor\f[R], and \f[I]patch\f[R], respectively. 14 | Null arguments are ignored. 15 | .SH EXAMPLES 16 | This example prints the major, minor, and patch version of the library to stdout. 17 | .PP 18 | .in +4n 19 | .EX 20 | #include 21 | #include 22 | 23 | int main(void) 24 | { 25 | int32_t major, minor, patch; 26 | uni_getversion(&major, &minor, &patch); 27 | printf("%d.%d.%d", major, minor, patch); 28 | return 0; 29 | } 30 | .EE 31 | .in 32 | .SH AUTHOR 33 | .UR https://railgunlabs.com 34 | Railgun Labs 35 | .UE . 
36 | .SH INTERNET RESOURCES 37 | The online documentation is published on the 38 | .UR https://railgunlabs.com/unicorn 39 | Railgun Labs website 40 | .UE . 41 | .SH LICENSING 42 | Unicorn is distributed with its end-user license agreement (EULA). 43 | Please review the agreement for information on terms & conditions for accessing or otherwise using Unicorn and for a DISCLAIMER OF ALL WARRANTIES. 44 | -------------------------------------------------------------------------------- /man/uni_ccc.3: -------------------------------------------------------------------------------- 1 | .TH "UNICORN" "3" "Dec 6th 2025" "Unicorn 1.3.1" 2 | .SH NAME 3 | uni_ccc \- canonical combining class 4 | .SH LIBRARY 5 | Embeddable Unicode Algorithms (libunicorn, -lunicorn) 6 | .SH SYNOPSIS 7 | .nf 8 | .B #include 9 | .PP 10 | .BI "uint8_t uni_ccc(unichar " c ");" 11 | .fi 12 | .SH DESCRIPTION 13 | This function returns the \f[C]Canonical_Combining_Class\f[R] property for the code point \f[I]c\f[R]. 14 | The canonical combining class property is used during the Unicode normalization algorithm to sort combining characters. 15 | .PP 16 | Support for the canonical combining class property must be enabled in the JSON configuration file otherwise the function will always return zero. 17 | .PP 18 | .in +4n 19 | .EX 20 | { 21 | "characterProperties": [ 22 | "Canonical_Combining_Class" 23 | ] 24 | } 25 | .EE 26 | .in 27 | .SH RETURN VALUE 28 | The canonical combining class of \f[I]c\f[R]. 29 | .SH SEE ALSO 30 | .BR unichar (3) 31 | .SH AUTHOR 32 | .UR https://railgunlabs.com 33 | Railgun Labs 34 | .UE . 35 | .SH INTERNET RESOURCES 36 | The online documentation is published on the 37 | .UR https://railgunlabs.com/unicorn 38 | Railgun Labs website 39 | .UE . 40 | .SH LICENSING 41 | Unicorn is distributed with its end-user license agreement (EULA). 42 | Please review the agreement for information on terms & conditions for accessing or otherwise using Unicorn and for a DISCLAIMER OF ALL WARRANTIES. 
43 | -------------------------------------------------------------------------------- /man/uni_tolower.3: -------------------------------------------------------------------------------- 1 | .TH "UNICORN" "3" "Dec 6th 2025" "Unicorn 1.3.1" 2 | .SH NAME 3 | uni_tolower \- simple lower case mapping 4 | .SH LIBRARY 5 | Embeddable Unicode Algorithms (libunicorn, -lunicorn) 6 | .SH SYNOPSIS 7 | .nf 8 | .B #include 9 | .PP 10 | .BI "unichar uni_tolower(unichar " c ");" 11 | .fi 12 | .SH DESCRIPTION 13 | This function returns the simple lower case mapping for the code point \f[I]c\f[R]. 14 | For the full lowercase mappings use \f[B]uni_caseconv\f[R](3) with \f[B]UNI_LOWER\f[R]. 15 | .PP 16 | Support for simple lower case mappings must be enabled in the JSON configuration file otherwise the function will always return \f[I]c\f[R]. 17 | .PP 18 | .in +4n 19 | .EX 20 | { 21 | "characterProperties": [ 22 | "Simple_Lowercase_Mapping" 23 | ] 24 | } 25 | .EE 26 | .in 27 | .SH RETURN VALUE 28 | The simple lower case mapping for \f[I]c\f[R]. 29 | .SH SEE ALSO 30 | .BR uni_caseconv (3), 31 | .BR unicaseconv (3), 32 | .BR unichar (3) 33 | .SH AUTHOR 34 | .UR https://railgunlabs.com 35 | Railgun Labs 36 | .UE . 37 | .SH INTERNET RESOURCES 38 | The online documentation is published on the 39 | .UR https://railgunlabs.com/unicorn 40 | Railgun Labs website 41 | .UE . 42 | .SH LICENSING 43 | Unicorn is distributed with its end-user license agreement (EULA). 44 | Please review the agreement for information on terms & conditions for accessing or otherwise using Unicorn and for a DISCLAIMER OF ALL WARRANTIES. 
45 | -------------------------------------------------------------------------------- /man/uni_toupper.3: -------------------------------------------------------------------------------- 1 | .TH "UNICORN" "3" "Dec 6th 2025" "Unicorn 1.3.1" 2 | .SH NAME 3 | uni_toupper \- simple upper case mapping 4 | .SH LIBRARY 5 | Embeddable Unicode Algorithms (libunicorn, -lunicorn) 6 | .SH SYNOPSIS 7 | .nf 8 | .B #include 9 | .PP 10 | .BI "unichar uni_toupper(unichar " c ");" 11 | .fi 12 | .SH DESCRIPTION 13 | This function returns the simple upper case mapping for the code point \f[I]c\f[R]. 14 | For the full uppercase mappings use \f[B]uni_caseconv\f[R](3) with \f[B]UNI_UPPER\f[R]. 15 | .PP 16 | Support for simple upper case mappings must be enabled in the JSON configuration file otherwise the function will always return \f[I]c\f[R]. 17 | .PP 18 | .in +4n 19 | .EX 20 | { 21 | "characterProperties": [ 22 | "Simple_Uppercase_Mapping" 23 | ] 24 | } 25 | .EE 26 | .in 27 | .SH RETURN VALUE 28 | The simple upper case mapping for \f[I]c\f[R]. 29 | .SH SEE ALSO 30 | .BR uni_caseconv (3), 31 | .BR unicaseconv (3), 32 | .BR unichar (3) 33 | .SH AUTHOR 34 | .UR https://railgunlabs.com 35 | Railgun Labs 36 | .UE . 37 | .SH INTERNET RESOURCES 38 | The online documentation is published on the 39 | .UR https://railgunlabs.com/unicorn 40 | Railgun Labs website 41 | .UE . 42 | .SH LICENSING 43 | Unicorn is distributed with its end-user license agreement (EULA). 44 | Please review the agreement for information on terms & conditions for accessing or otherwise using Unicorn and for a DISCLAIMER OF ALL WARRANTIES. 
45 | -------------------------------------------------------------------------------- /man/uni_totitle.3: -------------------------------------------------------------------------------- 1 | .TH "UNICORN" "3" "Dec 6th 2025" "Unicorn 1.3.1" 2 | .SH NAME 3 | uni_totitle \- simple title case mapping 4 | .SH LIBRARY 5 | Embeddable Unicode Algorithms (libunicorn, -lunicorn) 6 | .SH SYNOPSIS 7 | .nf 8 | .B #include 9 | .PP 10 | .BI "unichar uni_totitle(unichar " c ");" 11 | .fi 12 | .SH DESCRIPTION 13 | This function returns the simple title case mapping for the code point \f[I]c\f[R]. 14 | For the full title case mappings use \f[B]uni_caseconv\f[R](3) with \f[B]UNI_TITLE\f[R]. 15 | .PP 16 | Support for simple title case mappings must be enabled in the JSON configuration file otherwise the function will always return \f[I]c\f[R]. 17 | .PP 18 | .in +4n 19 | .EX 20 | { 21 | "characterProperties": [ 22 | "Simple_Titlecase_Mapping" 23 | ] 24 | } 25 | .EE 26 | .in 27 | .SH RETURN VALUE 28 | The simple title case mapping for \f[I]c\f[R]. 29 | .SH SEE ALSO 30 | .BR uni_caseconv (3), 31 | .BR unicaseconv (3), 32 | .BR unichar (3) 33 | .SH AUTHOR 34 | .UR https://railgunlabs.com 35 | Railgun Labs 36 | .UE . 37 | .SH INTERNET RESOURCES 38 | The online documentation is published on the 39 | .UR https://railgunlabs.com/unicorn 40 | Railgun Labs website 41 | .UE . 42 | .SH LICENSING 43 | Unicorn is distributed with its end-user license agreement (EULA). 44 | Please review the agreement for information on terms & conditions for accessing or otherwise using Unicorn and for a DISCLAIMER OF ALL WARRANTIES. 
45 | -------------------------------------------------------------------------------- /man/uni_getucdversion.3: -------------------------------------------------------------------------------- 1 | .TH "UNICORN" "3" "Dec 6th 2025" "Unicorn 1.3.1" 2 | .SH NAME 3 | uni_getucdversion \- unicode standard version 4 | .SH LIBRARY 5 | Embeddable Unicode Algorithms (libunicorn, -lunicorn) 6 | .SH SYNOPSIS 7 | .nf 8 | .B #include 9 | .PP 10 | .BI "void uni_getucdversion(int32_t *" major ", int32_t *" minor ", int32_t *" patch ");" 11 | .fi 12 | .SH DESCRIPTION 13 | Writes the major, minor, and patch version of the Unicode Standard to \f[I]major\f[R], \f[I]minor\f[R], and \f[I]patch\f[R], respectively. 14 | Null arguments are ignored. 15 | .SH EXAMPLES 16 | This example prints the major, minor, and patch version of the Unicode Standard implemented by the library to stdout. 17 | .PP 18 | .in +4n 19 | .EX 20 | #include 21 | #include 22 | 23 | int main(void) 24 | { 25 | int32_t major, minor, patch; 26 | uni_getucdversion(&major, &minor, &patch); 27 | printf("%d.%d.%d", major, minor, patch); 28 | return 0; 29 | } 30 | .EE 31 | .in 32 | .SH AUTHOR 33 | .UR https://railgunlabs.com 34 | Railgun Labs 35 | .UE . 36 | .SH INTERNET RESOURCES 37 | The online documentation is published on the 38 | .UR https://railgunlabs.com/unicorn 39 | Railgun Labs website 40 | .UE . 41 | .SH LICENSING 42 | Unicorn is distributed with its end-user license agreement (EULA). 43 | Please review the agreement for information on terms & conditions for accessing or otherwise using Unicorn and for a DISCLAIMER OF ALL WARRANTIES. 
44 | -------------------------------------------------------------------------------- /src/normalize.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Unicorn - Embeddable Unicode Algorithms 3 | * Copyright (c) 2024-2025 Railgun Labs 4 | * 5 | * This software is dual-licensed: you can redistribute it and/or modify 6 | * it under the terms of the GNU General Public License version 3 as 7 | * published by the Free Software Foundation. For the terms of this 8 | * license, see . 9 | * 10 | * Alternatively, you can license this software under a proprietary 11 | * license, as set out in . 12 | */ 13 | 14 | #ifndef NORMALIZE_H 15 | #define NORMALIZE_H 16 | 17 | #include "common.h" 18 | #include "charvec.h" 19 | 20 | typedef bool (*IsStable)(unichar cp); 21 | 22 | // Incremental normalization state book keeping. 23 | struct NormalizeState 24 | { 25 | int32_t offset; 26 | int32_t r; 27 | IsStable is_stable; 28 | 29 | // Tracks an unstable span of characters within unnormalized text. 30 | // This is the span of text that will be normalized. 31 | struct CharVec span; 32 | 33 | // Buffer with decomposed characters. 
34 | struct CharVec decomp; 35 | }; 36 | 37 | void uni_norm_init(struct NormalizeState *state, IsStable is_stable); 38 | void uni_norm_free(struct NormalizeState *state); 39 | 40 | void uni_norm_reset(struct NormalizeState *ns); 41 | 42 | unistat uni_norm_append_run(struct NormalizeState *state, struct unitext *it); 43 | 44 | bool uni_is_stable_nfd(unichar cp); 45 | 46 | #endif // NORMALIZE_H 47 | -------------------------------------------------------------------------------- /scripts/ccc.py: -------------------------------------------------------------------------------- 1 | # Unicorn - Embeddable Unicode Algorithms 2 | # Copyright (c) 2024-2025 Railgun Labs 3 | # 4 | # This software is dual-licensed: you can redistribute it and/or modify 5 | # it under the terms of the GNU General Public License version 3 as 6 | # published by the Free Software Foundation. For the terms of this 7 | # license, see . 8 | # 9 | # Alternatively, you can license this software under a proprietary 10 | # license, as set out in . 
11 | 12 | from typing import Dict 13 | import zipfile 14 | import json 15 | 16 | from .config import Config 17 | from .tools import Codespace 18 | from .feature import Feature 19 | from .basics import StorageSize, SerializationData 20 | 21 | class CanonicalCombiningClass(Feature): 22 | def pre_process(self, codespace: Codespace) -> None: 23 | codespace.register_property("canonical_combining_class", 0, StorageSize.UINT8) 24 | 25 | def process(self, archive: zipfile.ZipFile, codespace: Codespace, config: Config) -> SerializationData: 26 | file = archive.open('ccc.json') 27 | mappings: Dict[str,int] = json.loads(file.read())["ccc"] 28 | file.close() 29 | 30 | ccc_property = codespace.get("canonical_combining_class") 31 | for cp, ccc in mappings.items(): 32 | codespace.set(int(cp,16), ccc_property, ccc) 33 | 34 | public_header = "#define UNICORN_FEATURE_CCC\n" 35 | return SerializationData(public_header=public_header) 36 | -------------------------------------------------------------------------------- /man/uni_trust.3: -------------------------------------------------------------------------------- 1 | .TH "UNICORN" "3" "Dec 6th 2025" "Unicorn 1.3.1" 2 | .SH NAME 3 | UNI_TRUST \- well-formed text 4 | .SH LIBRARY 5 | Embeddable Unicode Algorithms (libunicorn, -lunicorn) 6 | .SH SYNOPSIS 7 | .nf 8 | .B #include 9 | .PP 10 | .B #define UNI_TRUST 0x40u 11 | .fi 12 | .SH DESCRIPTION 13 | Text attribute bit flag that indicates the encoded text is well-formed. 14 | .PP 15 | Under normal usage Unicorn processes text with the assumption that it might be malformed. 16 | This means it performs safety checks which are redundant if the text has already been validated. 17 | The purpose of this flag is to inform Unicorn that it can skip redundant validation checks thereby improving performance. 18 | .PP 19 | Usage of this flag means functions will never return \f[B]UNI_BAD_ENCODING\f[R]. 
20 | .PP 21 | Some functions, like \f[B]uni_norm\f[R](3), accept an input buffer and an output buffer. 22 | Using this flag with an output buffer will produce \f[B]UNI_BAD_OPERATION\f[R]. 23 | .SH SEE ALSO 24 | .BR unistat (3), 25 | .BR uni_norm (3), 26 | .BR uni_validate (3) 27 | .SH AUTHOR 28 | .UR https://railgunlabs.com 29 | Railgun Labs 30 | .UE . 31 | .SH INTERNET RESOURCES 32 | The online documentation is published on the 33 | .UR https://railgunlabs.com/unicorn 34 | Railgun Labs website 35 | .UE . 36 | .SH LICENSING 37 | Unicorn is distributed with its end-user license agreement (EULA). 38 | Please review the agreement for information on terms & conditions for accessing or otherwise using Unicorn and for a DISCLAIMER OF ALL WARRANTIES. 39 | -------------------------------------------------------------------------------- /man/unimemfunc.3: -------------------------------------------------------------------------------- 1 | .TH "UNICORN" "3" "Dec 6th 2025" "Unicorn 1.3.1" 2 | .SH NAME 3 | unimemfunc \- memory allocation function 4 | .SH LIBRARY 5 | Embeddable Unicode Algorithms (libunicorn, -lunicorn) 6 | .SH SYNOPSIS 7 | .nf 8 | .B #include 9 | .PP 10 | .BI "typedef void *(*unimemfunc)(void *" user_data ", void *" ptr ", size_t " old_size ", size_t " new_size ");" 11 | .fi 12 | .SH DESCRIPTION 13 | Defines the function signature for a custom memory management implementation. 14 | The memory manager is responsible for allocating, reallocating, and freeing memory. 15 | The implementation must behave like \f[B]realloc\f[R](3) or \f[B]free\f[R](3) depending upon its arguments. 16 | .PP 17 | When \f[I]new_size\f[R] is non-zero, the allocator must behave like \f[B]realloc\f[R](3). 18 | If \f[I]ptr\f[R] is NULL and \f[I]old_size\f[R] is zero, the allocator must behave like \f[B]malloc\f[R](3). 19 | .PP 20 | When \f[I]new_size\f[R] is zero, the allocator must behave like \f[B]free\f[R](3). 21 | It must also return NULL to the caller. 
22 | .SH SEE ALSO 23 | .BR uni_setmemfunc (3) 24 | .SH AUTHOR 25 | .UR https://railgunlabs.com 26 | Railgun Labs 27 | .UE . 28 | .SH INTERNET RESOURCES 29 | The online documentation is published on the 30 | .UR https://railgunlabs.com/unicorn 31 | Railgun Labs website 32 | .UE . 33 | .SH LICENSING 34 | Unicorn is distributed with its end-user license agreement (EULA). 35 | Please review the agreement for information on terms & conditions for accessing or otherwise using Unicorn and for a DISCLAIMER OF ALL WARRANTIES. 36 | -------------------------------------------------------------------------------- /scripts/feature.py: -------------------------------------------------------------------------------- 1 | # Unicorn - Embeddable Unicode Algorithms 2 | # Copyright (c) 2024-2025 Railgun Labs 3 | # 4 | # This software is dual-licensed: you can redistribute it and/or modify 5 | # it under the terms of the GNU General Public License version 3 as 6 | # published by the Free Software Foundation. For the terms of this 7 | # license, see . 8 | # 9 | # Alternatively, you can license this software under a proprietary 10 | # license, as set out in . 
11 | 12 | from typing import Set, Tuple, Type 13 | import zipfile 14 | 15 | from .basics import StorageSize, SerializationData 16 | from .tools import Codespace 17 | from .config import Config 18 | 19 | class Feature(object): 20 | def __init__(self) -> None: 21 | self.total_size = 0 22 | def pre_process(self, codespace: Codespace) -> None: 23 | pass 24 | def process(self, archive: zipfile.ZipFile, codespace: Codespace, config: Config) -> SerializationData: 25 | return SerializationData() 26 | def post_process(self, archive: zipfile.ZipFile, codespace: Codespace, config: Config) -> SerializationData: 27 | return SerializationData() 28 | @staticmethod 29 | def dependencies() -> Set[Type['Feature']]: 30 | return set() 31 | 32 | FLAGS_PROPERTY: Tuple[str, int, StorageSize] = ("flags", 0, StorageSize.UINT8) 33 | 34 | IS_COMPOSABLE = 0x1 35 | IS_CASED = 0x2 36 | IS_CASE_IGNORABLE = 0x4 37 | IS_NORMALIZATION_NEEDED = 0x8 38 | IS_CHANGES_WHEN_CASEFOLDED = 0x10 39 | -------------------------------------------------------------------------------- /examples/segment_text.c: -------------------------------------------------------------------------------- 1 | /* 2 | * This file is public domain. 3 | * You may use, modify, and distribute it without restriction. 4 | */ 5 | 6 | // Examples are also documented online at: 7 | // . 8 | 9 | #include 10 | #include 11 | 12 | int main(int argc, char *argv[]) 13 | { 14 | // This example prints the code unit indices of the extended grapheme clusters in the string. 15 | // A grapheme cluster is a user-perceived character. It corresponds to a single graphical 16 | // character on the end-user's display. A grapheme can consist of multiple code points. 17 | 18 | // Change 'UNI_GRAPHEME' to 'UNI_WORD' or 'UNI_SENTENCE' and observe how it affects the 19 | // printed break positions. 
20 | 21 | const char str[] = u8"👨🏼‍🚀👨🏽‍🚀 landed on the 🌕"; 22 | unisize break_at = 0; 23 | for (;;) 24 | { 25 | unistat s = uni_nextbrk(UNI_GRAPHEME, str, -1, UNI_UTF8, &break_at); 26 | if (s == UNI_OK) 27 | { 28 | printf("%d\n", break_at); 29 | } 30 | else if (s == UNI_DONE) 31 | { 32 | break; 33 | } 34 | else if (s == UNI_BAD_ENCODING) 35 | { 36 | printf("malformed character at %d\n", break_at); 37 | return 1; 38 | } 39 | else 40 | { 41 | puts("an internal error occured"); 42 | return 2; // Something else went wrong (check the status code). 43 | } 44 | } 45 | return 0; 46 | } 47 | -------------------------------------------------------------------------------- /scripts/general_category.py: -------------------------------------------------------------------------------- 1 | # Unicorn - Embeddable Unicode Algorithms 2 | # Copyright (c) 2024-2025 Railgun Labs 3 | # 4 | # This software is dual-licensed: you can redistribute it and/or modify 5 | # it under the terms of the GNU General Public License version 3 as 6 | # published by the Free Software Foundation. For the terms of this 7 | # license, see . 8 | # 9 | # Alternatively, you can license this software under a proprietary 10 | # license, as set out in . 
11 | 12 | from typing import Dict 13 | import zipfile 14 | import json 15 | 16 | from .config import Config 17 | from .tools import Codespace 18 | from .feature import Feature 19 | from .basics import StorageSize, SerializationData 20 | 21 | class GeneralCategoryProperty(Feature): 22 | def pre_process(self, codespace: Codespace) -> None: 23 | codespace.register_property("general_category", 29, StorageSize.UINT8) 24 | 25 | def process(self, archive: zipfile.ZipFile, codespace: Codespace, config: Config) -> SerializationData: 26 | file = archive.open('gc.json') 27 | data = json.loads(file.read()) 28 | characters: Dict[str,int] = data["characters"] 29 | file.close() 30 | 31 | gc_property = codespace.get("general_category") 32 | for cp, value in characters.items(): 33 | codespace.set(int(cp,16), gc_property, value) 34 | 35 | public_header = "#define UNICORN_FEATURE_GC\n" 36 | return SerializationData(public_header=public_header) 37 | -------------------------------------------------------------------------------- /features.json: -------------------------------------------------------------------------------- 1 | { 2 | "version": "1.0", 3 | "endian": "native", 4 | "hasStandardAllocators": true, 5 | "characterStorage": "uint32_t", 6 | "optimizeFor": "speed", 7 | "stackBufferSize": 32, 8 | "algorithms": { 9 | "normalization": [ 10 | "NFC", 11 | "NFD" 12 | ], 13 | "normalizationQuickCheck": true, 14 | "caseConversion": [ 15 | "lower", 16 | "title", 17 | "upper" 18 | ], 19 | "caseFolding": [ 20 | "default", 21 | "canonical" 22 | ], 23 | "segmentation": [ 24 | "grapheme", 25 | "word", 26 | "sentence" 27 | ], 28 | "compression": true, 29 | "collation": true 30 | }, 31 | "characterProperties": [ 32 | "Alphabetic", 33 | "Canonical_Combining_Class", 34 | "Dash", 35 | "Diacritic", 36 | "Extender", 37 | "General_Category", 38 | "Hex_Digit", 39 | "Ideographic", 40 | "Lowercase", 41 | "Math", 42 | "Noncharacter_Code_Point", 43 | "Numeric_Value", 44 | "Quotation_Mark", 45 | 
"Simple_Lowercase_Mapping", 46 | "Simple_Titlecase_Mapping", 47 | "Simple_Uppercase_Mapping", 48 | "Terminal_Punctuation", 49 | "Unified_Ideograph", 50 | "Uppercase", 51 | "White_Space" 52 | ], 53 | "encodingForms": [ 54 | "UTF-8", 55 | "UTF-16", 56 | "UTF-32" 57 | ] 58 | } 59 | -------------------------------------------------------------------------------- /man/uni_normchk.3: -------------------------------------------------------------------------------- 1 | .TH "UNICORN" "3" "Dec 6th 2025" "Unicorn 1.3.1" 2 | .SH NAME 3 | uni_normchk \- normalization check 4 | .SH LIBRARY 5 | Embeddable Unicode Algorithms (libunicorn, -lunicorn) 6 | .SH SYNOPSIS 7 | .nf 8 | .B #include 9 | .PP 10 | .BI "unistat uni_normchk(uninormform " form ", const void *" text ", unisize " text_len ", uniattr " text_attr ", bool *" result ");" 11 | .fi 12 | .SH DESCRIPTION 13 | This function checks if \f[I]text\f[R] is normalized. 14 | It's more "expensive" than \f[B]uni_normqchk\f[R](3), but returns a definitive yes/no answer. 15 | .SH RETURN VALUE 16 | .TP 17 | UNI_OK 18 | On success. 19 | .TP 20 | UNI_BAD_OPERATION 21 | If \f[I]text\f[R] or \f[I]result\f[R] is null. 22 | .TP 23 | UNI_BAD_ENCODING 24 | If \f[I]text\f[R] is malformed; this is never returned if \f[I]text_attr\f[R] has \f[B]UNI_TRUST\f[R](3). 25 | .TP 26 | UNI_FEATURE_DISABLED 27 | If Unicorn was built without support for \f[I]form\f[R]. 28 | .SH SEE ALSO 29 | .BR uni_normqchk (3), 30 | .BR uninormchk (3), 31 | .BR UNI_TRUST (3), 32 | .BR uninormform (3), 33 | .BR unisize (3), 34 | .BR uniattr (3) 35 | .SH AUTHOR 36 | .UR https://railgunlabs.com 37 | Railgun Labs 38 | .UE . 39 | .SH INTERNET RESOURCES 40 | The online documentation is published on the 41 | .UR https://railgunlabs.com/unicorn 42 | Railgun Labs website 43 | .UE . 44 | .SH LICENSING 45 | Unicorn is distributed with its end-user license agreement (EULA). 
46 | Please review the agreement for information on terms & conditions for accessing or otherwise using Unicorn and for a DISCLAIMER OF ALL WARRANTIES. 47 | -------------------------------------------------------------------------------- /scripts/encoding.py: -------------------------------------------------------------------------------- 1 | # Unicorn - Embeddable Unicode Algorithms 2 | # Copyright (c) 2024-2025 Railgun Labs 3 | # 4 | # This software is dual-licensed: you can redistribute it and/or modify 5 | # it under the terms of the GNU General Public License version 3 as 6 | # published by the Free Software Foundation. For the terms of this 7 | # license, see . 8 | # 9 | # Alternatively, you can license this software under a proprietary 10 | # license, as set out in . 11 | 12 | import zipfile 13 | 14 | from .config import Config 15 | from .tools import Codespace 16 | from .feature import Feature 17 | from .basics import SerializationData 18 | 19 | class EncodingUTF8(Feature): 20 | def process(self, archive: zipfile.ZipFile, codespace: Codespace, config: Config) -> SerializationData: 21 | public_header = "#define UNICORN_FEATURE_ENCODING_UTF8\n" 22 | return SerializationData(public_header=public_header) 23 | 24 | class EncodingUTF16(Feature): 25 | def process(self, archive: zipfile.ZipFile, codespace: Codespace, config: Config) -> SerializationData: 26 | public_header = "#define UNICORN_FEATURE_ENCODING_UTF16\n" 27 | return SerializationData(public_header=public_header) 28 | 29 | class EncodingUTF32(Feature): 30 | def process(self, archive: zipfile.ZipFile, codespace: Codespace, config: Config) -> SerializationData: 31 | public_header = "#define UNICORN_FEATURE_ENCODING_UTF32\n" 32 | return SerializationData(public_header=public_header) 33 | -------------------------------------------------------------------------------- /man/uninormform.3: -------------------------------------------------------------------------------- 1 | .TH "UNICORN" "3" "Dec 6th 2025" 
"Unicorn 1.3.1" 2 | .SH NAME 3 | uninormform \- unicode normalization forms 4 | .SH LIBRARY 5 | Embeddable Unicode Algorithms (libunicorn, -lunicorn) 6 | .SH SYNOPSIS 7 | .nf 8 | .B #include 9 | .PP 10 | .B enum uninormform { 11 | .RS 12 | .B UNI_NFC, 13 | .B UNI_NFD, 14 | .RE 15 | .B }; 16 | .fi 17 | .SH DESCRIPTION 18 | Normalization forms define how strings should be transformed for the purposes of testing equivalence. 19 | .SH CONSTANTS 20 | .TP 21 | .BR UNI_NFC 22 | Performs Canonical Decomposition followed by Canonical Composition. 23 | .IP 24 | Support for canonical composition can be enabled in the JSON configuration file. 25 | .IP 26 | .in +4n 27 | .EX 28 | { 29 | "algorithms": { 30 | "normalization": ["NFC"] 31 | } 32 | } 33 | .EE 34 | .in 35 | .TP 36 | .BR UNI_NFD 37 | Performs Canonical Decomposition. 38 | .IP 39 | Support for canonical decomposition can be enabled in the JSON configuration file. 40 | .IP 41 | .in +4n 42 | .EX 43 | { 44 | "algorithms": { 45 | "normalization": ["NFD"] 46 | } 47 | } 48 | .EE 49 | .in 50 | .SH AUTHOR 51 | .UR https://railgunlabs.com 52 | Railgun Labs 53 | .UE . 54 | .SH INTERNET RESOURCES 55 | The online documentation is published on the 56 | .UR https://railgunlabs.com/unicorn 57 | Railgun Labs website 58 | .UE . 59 | .SH LICENSING 60 | Unicorn is distributed with its end-user license agreement (EULA). 61 | Please review the agreement for information on terms & conditions for accessing or otherwise using Unicorn and for a DISCLAIMER OF ALL WARRANTIES. 
62 | -------------------------------------------------------------------------------- /man/unichar.3: -------------------------------------------------------------------------------- 1 | .TH "UNICORN" "3" "Dec 6th 2025" "Unicorn 1.3.1" 2 | .SH NAME 3 | unichar \- code point 4 | .SH LIBRARY 5 | Embeddable Unicode Algorithms (libunicorn, -lunicorn) 6 | .SH SYNOPSIS 7 | .nf 8 | .B #include 9 | .PP 10 | .BI "typedef uint32_t unichar;" 11 | .fi 12 | .SH DESCRIPTION 13 | This integer represents a Unicode code point or Unicode scalar value depending on how it's used. 14 | .PP 15 | The integer type used as the underlying storage for this type is configurable. 16 | This helps simplify integration with existing applications which may already have defined their own Unicode character type. 17 | The type chosen can be signed or unsigned, but it must be large enough to accommodate the entire Unicode character repertoire. 18 | As of Unicode 17.0, that means an integer with at least 21 bits of storage. 19 | .PP 20 | The snippet below demonstrates how to redefine \f[B]unichar\f[R] as a signed 64-bit integer. 21 | Using a 64-bit integer is wasteful, as you only need 21 bits; it was chosen for example purposes only. 22 | .PP 23 | .in +4n 24 | .EX 25 | { 26 | "characterStorage": "int64_t" 27 | } 28 | .EE 29 | .in 30 | .SH AUTHOR 31 | .UR https://railgunlabs.com 32 | Railgun Labs 33 | .UE . 34 | .SH INTERNET RESOURCES 35 | The online documentation is published on the 36 | .UR https://railgunlabs.com/unicorn 37 | Railgun Labs website 38 | .UE . 39 | .SH LICENSING 40 | Unicorn is distributed with its end-user license agreement (EULA). 41 | Please review the agreement for information on terms & conditions for accessing or otherwise using Unicorn and for a DISCLAIMER OF ALL WARRANTIES. 
42 | -------------------------------------------------------------------------------- /src/charvec.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Unicorn - Embeddable Unicode Algorithms 3 | * Copyright (c) 2024-2025 Railgun Labs 4 | * 5 | * This software is dual-licensed: you can redistribute it and/or modify 6 | * it under the terms of the GNU General Public License version 3 as 7 | * published by the Free Software Foundation. For the terms of this 8 | * license, see . 9 | * 10 | * Alternatively, you can license this software under a proprietary 11 | * license, as set out in . 12 | */ 13 | 14 | // Implements a heap-allocated growable character buffer. 15 | 16 | #ifndef CHARVEC_H 17 | #define CHARVEC_H 18 | 19 | #include "common.h" 20 | 21 | struct CharVec 22 | { 23 | unisize length; 24 | unisize capacity; 25 | unichar *chars; 26 | unichar scratch[UNICORN_STACK_BUFFER_SIZE]; 27 | }; 28 | 29 | void uni_charvec_init(struct CharVec *buffer); 30 | void uni_charvec_free(struct CharVec *buffer); 31 | void uni_charvec_reset(struct CharVec *buffer); 32 | unistat uni_charvec_append(struct CharVec *buffer, const unichar *chars, unisize chars_count); 33 | unistat uni_charvec_reserve(struct CharVec *buffer, unisize new_capacity); 34 | void uni_charvec_remove(struct CharVec *buf, unisize i); 35 | void uni_charvec_append_unsafe(struct CharVec *cb, const unichar *chars, unisize chars_count); 36 | 37 | static inline unisize uni_charvec_length(const struct CharVec *buffer) 38 | { 39 | return buffer->length; 40 | } 41 | 42 | static inline unisize uni_charvec_capacity(const struct CharVec *buffer) 43 | { 44 | return buffer->capacity; 45 | } 46 | 47 | #endif // CHARVEC_H 48 | -------------------------------------------------------------------------------- /man/uni_gc.3: -------------------------------------------------------------------------------- 1 | .TH "UNICORN" "3" "Dec 6th 2025" "Unicorn 1.3.1" 2 | .SH NAME 3 | uni_gc \- general 
category 4 | .SH LIBRARY 5 | Embeddable Unicode Algorithms (libunicorn, -lunicorn) 6 | .SH SYNOPSIS 7 | .nf 8 | .B #include 9 | .PP 10 | .BI "unigc uni_gc(unichar " c ");" 11 | .fi 12 | .SH DESCRIPTION 13 | This function returns the \f[C]General_Category\f[R] character property for the code point \f[I]c\f[R]. 14 | This property describes a character's general classification. 15 | For example, the Latin Capital Letter A (\f[C]U+0041\f[R]) has a general category value of \f[B]UNI_UPPERCASE_LETTER\f[R] which indicates it's an uppercase letter. 16 | .PP 17 | For a complete list of all possible general category values, see \f[B]unigc\f[R](3). 18 | .PP 19 | Support for the \f[C]General_Category\f[R] character property must be enabled in the JSON configuration file otherwise the function will always return \f[B]UNI_UNASSIGNED\f[R]. 20 | .PP 21 | .in +4n 22 | .EX 23 | { 24 | "characterProperties": [ 25 | "General_Category" 26 | ] 27 | } 28 | .EE 29 | .in 30 | .SH RETURN VALUE 31 | The general category of \f[I]c\f[R]. 32 | .SH SEE ALSO 33 | .BR unigc (3), 34 | .BR unichar (3) 35 | .SH AUTHOR 36 | .UR https://railgunlabs.com 37 | Railgun Labs 38 | .UE . 39 | .SH INTERNET RESOURCES 40 | The online documentation is published on the 41 | .UR https://railgunlabs.com/unicorn 42 | Railgun Labs website 43 | .UE . 44 | .SH LICENSING 45 | Unicorn is distributed with its end-user license agreement (EULA). 46 | Please review the agreement for information on terms & conditions for accessing or otherwise using Unicorn and for a DISCLAIMER OF ALL WARRANTIES. 
47 | -------------------------------------------------------------------------------- /man/uni_prevbrk.3: -------------------------------------------------------------------------------- 1 | .TH "UNICORN" "3" "Dec 6th 2025" "Unicorn 1.3.1" 2 | .SH NAME 3 | uni_prevbrk \- compute preceding boundary 4 | .SH LIBRARY 5 | Embeddable Unicode Algorithms (libunicorn, -lunicorn) 6 | .SH SYNOPSIS 7 | .nf 8 | .B #include 9 | .PP 10 | .BI "unistat uni_prevbrk(unibreak " boundary ", const void *" text ", unisize " text_len ", uniattr " text_attr ", unisize *" index ");" 11 | .fi 12 | .SH DESCRIPTION 13 | This function computes the preceding \f[I]boundary\f[R] for \f[I]text\f[R] starting from a known code point specified by code unit \f[I]index\f[R]. 14 | The implementation sets \f[I]index\f[R] to the code unit offset of the preceding \f[I]boundary\f[R]. 15 | .SH RETURN VALUE 16 | .TP 17 | UNI_OK 18 | If the break iterator was successfully repositioned. 19 | .TP 20 | UNI_DONE 21 | If \f[I]index\f[R] is at the beginning of \f[I]text\f[R]. 22 | .TP 23 | UNI_BAD_OPERATION 24 | If \f[I]text\f[R] or \f[I]index\f[R] is null. 25 | .TP 26 | UNI_BAD_ENCODING 27 | If \f[I]text\f[R] is malformed; this is never returned if \f[I]text_attr\f[R] has \f[B]UNI_TRUST\f[R](3). 28 | .SH SEE ALSO 29 | .BR UNI_TRUST (3), 30 | .BR unibreak (3), 31 | .BR unisize (3), 32 | .BR uniattr (3) 33 | .SH AUTHOR 34 | .UR https://railgunlabs.com 35 | Railgun Labs 36 | .UE . 37 | .SH INTERNET RESOURCES 38 | The online documentation is published on the 39 | .UR https://railgunlabs.com/unicorn 40 | Railgun Labs website 41 | .UE . 42 | .SH LICENSING 43 | Unicorn is distributed with its end-user license agreement (EULA). 44 | Please review the agreement for information on terms & conditions for accessing or otherwise using Unicorn and for a DISCLAIMER OF ALL WARRANTIES. 
45 | -------------------------------------------------------------------------------- /man/uni_validate.3: -------------------------------------------------------------------------------- 1 | .TH "UNICORN" "3" "Dec 6th 2025" "Unicorn 1.3.1" 2 | .SH NAME 3 | uni_validate \- validate text 4 | .SH LIBRARY 5 | Embeddable Unicode Algorithms (libunicorn, -lunicorn) 6 | .SH SYNOPSIS 7 | .nf 8 | .B #include 9 | .PP 10 | .BI "unistat uni_validate(const void *" text ", unisize " text_len ", uniattr " text_attr ");" 11 | .fi 12 | .SH DESCRIPTION 13 | This function checks if \f[I]text\f[R] is well-formed. 14 | The length of \f[I]text\f[R] is given in code units by \f[I]text_len\f[R] and its encoding form is specified by \f[I]text_attr\f[R]. 15 | .PP 16 | The \f[B]UNI_TRUST\f[R](3) flag has no effect when used with \f[I]text_attr\f[R] as the entire purpose of this function is to verify \f[I]text\f[R] is well-formed and assuming it is well-formed would defeat the purpose. 17 | .SH RETURN VALUE 18 | .TP 19 | UNI_OK 20 | If \f[I]text\f[R] is well-formed. 21 | .TP 22 | UNI_BAD_OPERATION 23 | If \f[I]text\f[R] is null. 24 | .TP 25 | UNI_BAD_ENCODING 26 | If the encoding of \f[I]text\f[R] is malformed. 27 | .TP 28 | UNI_FEATURE_DISABLED 29 | If the encoding form flagged in \f[I]text_attr\f[R] is disabled. 30 | .SH SEE ALSO 31 | .BR UNI_TRUST (3), 32 | .BR unisize (3), 33 | .BR uniattr (3) 34 | .SH AUTHOR 35 | .UR https://railgunlabs.com 36 | Railgun Labs 37 | .UE . 38 | .SH INTERNET RESOURCES 39 | The online documentation is published on the 40 | .UR https://railgunlabs.com/unicorn 41 | Railgun Labs website 42 | .UE . 43 | .SH LICENSING 44 | Unicorn is distributed with its end-user license agreement (EULA). 45 | Please review the agreement for information on terms & conditions for accessing or otherwise using Unicorn and for a DISCLAIMER OF ALL WARRANTIES. 
46 | -------------------------------------------------------------------------------- /man/uni_nulify.3: -------------------------------------------------------------------------------- 1 | .TH "UNICORN" "3" "Dec 6th 2025" "Unicorn 1.3.1" 2 | .SH NAME 3 | UNI_NULIFY \- null-terminated output 4 | .SH LIBRARY 5 | Embeddable Unicode Algorithms (libunicorn, -lunicorn) 6 | .SH SYNOPSIS 7 | .nf 8 | .B #include 9 | .PP 10 | .B #define UNI_NULIFY 0x80u 11 | .fi 12 | .SH DESCRIPTION 13 | Text attribute bit flag that indicates that the output buffer must be null-terminated by the implementation. 14 | .PP 15 | Some functions, like \f[B]uni_norm\f[R](3), accept an input buffer and an output buffer. 16 | The output buffer is the destination buffer that the implementation writes encoded characters to. 17 | When this flag is applied to the output buffer the implementation guarantees a NUL character (U+0000) is appended, even if it means truncating the last non-null character. 18 | Conceptually, this behavior is like the BSD function \f[C]strlcpy\f[R] which also guarantees the destination buffer is null-terminated. 19 | .PP 20 | The implementation will never append an extraneous null character if the output buffer is already null-terminated. 21 | .PP 22 | Using this flag with an input buffer will produce \f[B]UNI_BAD_OPERATION\f[R]. 23 | .SH SEE ALSO 24 | .BR uni_norm (3), 25 | .BR unistat (3) 26 | .SH AUTHOR 27 | .UR https://railgunlabs.com 28 | Railgun Labs 29 | .UE . 30 | .SH INTERNET RESOURCES 31 | The online documentation is published on the 32 | .UR https://railgunlabs.com/unicorn 33 | Railgun Labs website 34 | .UE . 35 | .SH LICENSING 36 | Unicorn is distributed with its end-user license agreement (EULA). 37 | Please review the agreement for information on terms & conditions for accessing or otherwise using Unicorn and for a DISCLAIMER OF ALL WARRANTIES. 
38 | -------------------------------------------------------------------------------- /scripts/numeric_value.py: -------------------------------------------------------------------------------- 1 | # Unicorn - Embeddable Unicode Algorithms 2 | # Copyright (c) 2024-2025 Railgun Labs 3 | # 4 | # This software is dual-licensed: you can redistribute it and/or modify 5 | # it under the terms of the GNU General Public License version 3 as 6 | # published by the Free Software Foundation. For the terms of this 7 | # license, see . 8 | # 9 | # Alternatively, you can license this software under a proprietary 10 | # license, as set out in . 11 | 12 | from typing import Dict 13 | import zipfile 14 | import json 15 | 16 | from .config import Config 17 | from .tools import Codespace 18 | from .feature import Feature 19 | from .basics import StorageSize, SerializationData 20 | 21 | class NumericValueProperty(Feature): 22 | def pre_process(self, codespace: Codespace) -> None: 23 | codespace.register_property("numeric_value_offset", 0, StorageSize.UINT8) 24 | 25 | def process(self, archive: zipfile.ZipFile, codespace: Codespace, config: Config) -> SerializationData: 26 | file = archive.open('numeric_values.json') 27 | data = json.loads(file.read()) 28 | mappings: Dict[str,int] = data["mappings"] 29 | source: str = data["source"] 30 | header: str = data["header"] 31 | size: int = data["size"] 32 | file.close() 33 | 34 | numval_property = codespace.get("numeric_value_offset") 35 | for cp, index in mappings.items(): 36 | codespace.set(int(cp,16), numval_property, index) 37 | 38 | public_header = "#define UNICORN_FEATURE_NUMERIC_VALUE\n" 39 | return SerializationData(source, header, public_header, size) 40 | -------------------------------------------------------------------------------- /examples/Makefile.am: -------------------------------------------------------------------------------- 1 | AUTOMAKE_OPTIONS = subdir-objects 2 | 3 | EXTRA_DIST = CMakeLists.txt 4 | 5 | if 
BUILD_EXAMPLES 6 | 7 | PROG_LDADD = ../src/libunicorn.la 8 | PROG_CFLAGS = -I$(top_srcdir)/include -I$(abs_top_builddir)/src 9 | 10 | bin_PROGRAMS = caseconv casefold charprops collate compare compress decode-utf8 decode-utf16 decode-utf32 encode-char segmentation 11 | 12 | caseconv_SOURCES = case_convert.c 13 | caseconv_LDADD = $(PROG_LDADD) 14 | caseconv_CFLAGS = $(PROG_CFLAGS) 15 | 16 | casefold_SOURCES = case_fold.c 17 | casefold_LDADD = $(PROG_LDADD) 18 | casefold_CFLAGS = $(PROG_CFLAGS) 19 | 20 | charprops_SOURCES = character_properties.c 21 | charprops_LDADD = $(PROG_LDADD) 22 | charprops_CFLAGS = $(PROG_CFLAGS) 23 | 24 | collate_SOURCES = collate_text.c 25 | collate_LDADD = $(PROG_LDADD) 26 | collate_CFLAGS = $(PROG_CFLAGS) 27 | 28 | compare_SOURCES = compare_text.c 29 | compare_LDADD = $(PROG_LDADD) 30 | compare_CFLAGS = $(PROG_CFLAGS) 31 | 32 | compress_SOURCES = compress_text.c 33 | compress_LDADD = $(PROG_LDADD) 34 | compress_CFLAGS = $(PROG_CFLAGS) 35 | 36 | decode_utf8_SOURCES = decode_utf8.c 37 | decode_utf8_LDADD = $(PROG_LDADD) 38 | decode_utf8_CFLAGS = $(PROG_CFLAGS) 39 | 40 | decode_utf16_SOURCES = decode_utf16.c 41 | decode_utf16_LDADD = $(PROG_LDADD) 42 | decode_utf16_CFLAGS = $(PROG_CFLAGS) 43 | 44 | decode_utf32_SOURCES = decode_utf32.c 45 | decode_utf32_LDADD = $(PROG_LDADD) 46 | decode_utf32_CFLAGS = $(PROG_CFLAGS) 47 | 48 | encode_char_SOURCES = encode_character.c 49 | encode_char_LDADD = $(PROG_LDADD) 50 | encode_char_CFLAGS = $(PROG_CFLAGS) 51 | 52 | segmentation_SOURCES = segment_text.c 53 | segmentation_LDADD = $(PROG_LDADD) 54 | segmentation_CFLAGS = $(PROG_CFLAGS) 55 | 56 | endif 57 | -------------------------------------------------------------------------------- /man/uninormchk.3: -------------------------------------------------------------------------------- 1 | .TH "UNICORN" "3" "Dec 6th 2025" "Unicorn 1.3.1" 2 | .SH NAME 3 | uninormchk \- quick check constants 4 | .SH LIBRARY 5 | Embeddable Unicode Algorithms (libunicorn, 
-lunicorn) 6 | .SH SYNOPSIS 7 | .nf 8 | .B #include 9 | .PP 10 | .B enum uninormchk { 11 | .RS 12 | .B UNI_YES, 13 | .B UNI_MAYBE, 14 | .B UNI_NO, 15 | .RE 16 | .B }; 17 | .fi 18 | .SH DESCRIPTION 19 | These properties indicate whether a character can or cannot appear in a given normalization form. 20 | .PP 21 | When used with \f[B]uni_normqchk\f[R](3) these properties indicate whether some input string is possibly in the desired normalization form. 22 | This may make it possible to bypass the more time-consuming call to run the complete Unicode Normalization Algorithm. 23 | .PP 24 | Users can alternatively use \f[B]uni_normchk\f[R](3) which will definitively check if text is normalized. 25 | .SH CONSTANTS 26 | .TP 27 | .BR UNI_YES 28 | Characters that can occur in the respective normalization form. 29 | .TP 30 | .BR UNI_MAYBE 31 | Characters that may occur in the respective normalization, depending on the context. 32 | .TP 33 | .BR UNI_NO 34 | Characters that cannot occur in the respective normalization form. 35 | .SH SEE ALSO 36 | .BR uni_normqchk (3), 37 | .BR uni_normchk (3) 38 | .SH AUTHOR 39 | .UR https://railgunlabs.com 40 | Railgun Labs 41 | .UE . 42 | .SH INTERNET RESOURCES 43 | The online documentation is published on the 44 | .UR https://railgunlabs.com/unicorn 45 | Railgun Labs website 46 | .UE . 47 | .SH LICENSING 48 | Unicorn is distributed with its end-user license agreement (EULA). 49 | Please review the agreement for information on terms & conditions for accessing or otherwise using Unicorn and for a DISCLAIMER OF ALL WARRANTIES. 
50 | -------------------------------------------------------------------------------- /src/charbuf.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Unicorn - Embeddable Unicode Algorithms 3 | * Copyright (c) 2024-2025 Railgun Labs 4 | * 5 | * This software is dual-licensed: you can redistribute it and/or modify 6 | * it under the terms of the GNU General Public License version 3 as 7 | * published by the Free Software Foundation. For the terms of this 8 | * license, see . 9 | * 10 | * Alternatively, you can license this software under a proprietary 11 | * license, as set out in . 12 | */ 13 | 14 | #ifndef CHARBUF_H 15 | #define CHARBUF_H 16 | 17 | #include "common.h" 18 | 19 | struct CharBufImpl; 20 | 21 | struct CharBuf 22 | { 23 | void *storage; 24 | 25 | // This is where the next character will be written. 26 | // When finished this is the number of characters that needed to be 27 | // written, but not necessarily how many were actually written. 28 | unisize length; 29 | 30 | // Number of characters actually written. 
31 | unisize written; 32 | 33 | unisize capacity; 34 | unisize *result; 35 | bool null_terminate; 36 | bool is_null_terminated; 37 | const struct CharBufImpl *impl; 38 | }; 39 | 40 | struct CharBufImpl 41 | { 42 | void (*append)(struct CharBuf *buf, const unichar *chars, unisize chars_count); 43 | void (*nullterminate)(struct CharBuf *buf); 44 | }; 45 | 46 | unistat uni_charbuf_init(struct CharBuf *buf, void *text, unisize *capacity, uniattr attributes); 47 | unistat uni_charbuf_finalize(struct CharBuf *buf); 48 | 49 | void uni_charbuf_append(struct CharBuf *buf, const unichar *chars, unisize chars_count); 50 | void uni_charbuf_appendchar(struct CharBuf *buf, unichar ch); 51 | 52 | #endif // CHARBUF_H 53 | -------------------------------------------------------------------------------- /man/uni_caseconvchk.3: -------------------------------------------------------------------------------- 1 | .TH "UNICORN" "3" "Dec 6th 2025" "Unicorn 1.3.1" 2 | .SH NAME 3 | uni_caseconvchk \- check case status 4 | .SH LIBRARY 5 | Embeddable Unicode Algorithms (libunicorn, -lunicorn) 6 | .SH SYNOPSIS 7 | .nf 8 | .B #include 9 | .PP 10 | .BI "unistat uni_caseconvchk(unicaseconv " casing ", const void *" text ", unisize " text_len ", uniattr " text_attr ", bool *" result ");" 11 | .fi 12 | .SH DESCRIPTION 13 | This function checks if \f[I]text\f[R] is in casing form \f[I]casing\f[R]. 14 | If it is, then the implementation writes \f[C]true\f[R] to \f[I]result\f[R] else it writes \f[C]false\f[R]. 15 | .PP 16 | The text attributes of \f[I]text\f[R] are specified by \f[I]text_attr\f[R]. 17 | If \f[I]text_len\f[R] is -1, then the implementation assumes \f[I]text\f[R] is null-terminated. 18 | .SH RETURN VALUE 19 | .TP 20 | UNI_OK 21 | On success. 22 | .TP 23 | UNI_BAD_OPERATION 24 | If \f[I]text\f[R] or \f[I]result\f[R] is null. 25 | .TP 26 | UNI_BAD_ENCODING 27 | If \f[I]text\f[R] is not well-formed (checks are omitted if \f[I]text_attr\f[R] has \f[B]UNI_TRUST\f[R](3)). 
28 | .TP 29 | UNI_FEATURE_DISABLED 30 | If Unicorn was built without support for case conversion. 31 | .SH SEE ALSO 32 | .BR UNI_TRUST (3), 33 | .BR unicaseconv (3), 34 | .BR unisize (3), 35 | .BR uniattr (3) 36 | .SH AUTHOR 37 | .UR https://railgunlabs.com 38 | Railgun Labs 39 | .UE . 40 | .SH INTERNET RESOURCES 41 | The online documentation is published on the 42 | .UR https://railgunlabs.com/unicorn 43 | Railgun Labs website 44 | .UE . 45 | .SH LICENSING 46 | Unicorn is distributed with its end-user license agreement (EULA). 47 | Please review the agreement for information on terms & conditions for accessing or otherwise using Unicorn and for a DISCLAIMER OF ALL WARRANTIES. 48 | -------------------------------------------------------------------------------- /examples/case_convert.c: -------------------------------------------------------------------------------- 1 | /* 2 | * This file is public domain. 3 | * You may use, modify, and distribute it without restriction. 4 | */ 5 | 6 | // Examples are also documented online at: 7 | // . 8 | 9 | #include 10 | #include 11 | 12 | int main(int argc, char *argv[]) 13 | { 14 | // This code example case converts text to upper case, e.g. it would 15 | // convert the text "hello world" to "HELLO WORLD". Unicorn also 16 | // supports lower and title case conversion. 17 | 18 | const char *src = u8"Straße"; 19 | 20 | char dest[64]; 21 | unisize dest_len = 64; 22 | 23 | // The following function case converts a string to uppercase. 24 | // The resulting string can be presented to an end-user. 25 | // The signature for the function is documented online at: 26 | // . 27 | if (uni_caseconv(UNI_UPPER, src, -1, UNI_UTF8, dest, &dest_len, UNI_UTF8 | UNI_NULIFY) == UNI_OK) 28 | { 29 | printf("Case converted: %s\n", dest); 30 | } 31 | 32 | // If you don't know how big the destination buffer will be, you can first 33 | // call uni_caseconv() with a null and zero-sized destination buffer. 
34 | // When called this way, the implementation will write to 'dest_len' the 35 | // number of code units needed to accommodate the output. You can then 36 | // allocate a buffer with the appropriate size and call the function again. 37 | dest_len = 0; 38 | if (uni_caseconv(UNI_UPPER, src, -1, UNI_UTF8, NULL, &dest_len, UNI_UTF8 | UNI_NULIFY) == UNI_OK) 39 | { 40 | printf("Code units needed: %d\n", dest_len); 41 | } 42 | return 0; 43 | } 44 | -------------------------------------------------------------------------------- /man/uni_numval.3: -------------------------------------------------------------------------------- 1 | .TH "UNICORN" "3" "Dec 6th 2025" "Unicorn 1.3.1" 2 | .SH NAME 3 | uni_numval \- numeric property 4 | .SH LIBRARY 5 | Embeddable Unicode Algorithms (libunicorn, -lunicorn) 6 | .SH SYNOPSIS 7 | .nf 8 | .B #include 9 | .PP 10 | .BI "const char *uni_numval(unichar " c ");" 11 | .fi 12 | .SH DESCRIPTION 13 | This function returns the \f[C]Numeric_Value\f[R] character property for the code point \f[I]c\f[R]. 14 | If the input character does not have a numeric value, then null is returned. 15 | .PP 16 | The numeric value is returned as a null-terminated C string in decimal notation. 17 | That means if the character has a numeric value of -1/2, which is the case for TIBETAN DIGIT HALF ZERO (U+0F33), it would be returned as the string "-0.5". 18 | The caller can parse the string into an integer or floating-point storage format as needed. 19 | .PP 20 | Support for this character property must be enabled in the JSON configuration file otherwise the function will always return null. 21 | .PP 22 | .in +4n 23 | .EX 24 | { 25 | "characterProperties": [ 26 | "Numeric_Value" 27 | ] 28 | } 29 | .EE 30 | .in 31 | .SH RETURN VALUE 32 | The value of the \f[C]Numeric_Value\f[R] character property in decimal notation or null if the character does not have a numeric value. 
33 | .SH SEE ALSO 34 | .BR unichar (3) 35 | .SH AUTHOR 36 | .UR https://railgunlabs.com 37 | Railgun Labs 38 | .UE . 39 | .SH INTERNET RESOURCES 40 | The online documentation is published on the 41 | .UR https://railgunlabs.com/unicorn 42 | Railgun Labs website 43 | .UE . 44 | .SH LICENSING 45 | Unicorn is distributed with its end-user license agreement (EULA). 46 | Please review the agreement for information on terms & conditions for accessing or otherwise using Unicorn and for a DISCLAIMER OF ALL WARRANTIES. 47 | -------------------------------------------------------------------------------- /man/unistrength.3: -------------------------------------------------------------------------------- 1 | .TH "UNICORN" "3" "Dec 6th 2025" "Unicorn 1.3.1" 2 | .SH NAME 3 | unistrength \- collation comparison levels 4 | .SH LIBRARY 5 | Embeddable Unicode Algorithms (libunicorn, -lunicorn) 6 | .SH SYNOPSIS 7 | .nf 8 | .B #include 9 | .PP 10 | .B enum unistrength { 11 | .RS 12 | .B UNI_PRIMARY, 13 | .B UNI_SECONDARY, 14 | .B UNI_TERTIARY, 15 | .B UNI_QUATERNARY, 16 | .RE 17 | .B }; 18 | .fi 19 | .SH DESCRIPTION 20 | The Unicode collation algorithm is a multilevel comparison algorithm. 21 | The number of levels that are considered in comparison is known as the collation strength. 22 | This enumeration defines constants for each level. 23 | .PP 24 | In comparing two strings, the most important feature is the identity of the base letters. 25 | For example, the difference between an A and a B. If the base letters differ, accent differences are typically ignored. 26 | If the base letters or their accents differ, case differences (uppercase versus lowercase) are typically ignored. 27 | .SH CONSTANTS 28 | .TP 29 | .BR UNI_PRIMARY 30 | Represents differences in the base letter or symbol. 31 | .TP 32 | .BR UNI_SECONDARY 33 | Represents differences in accents. 34 | .TP 35 | .BR UNI_TERTIARY 36 | Represents differences in case or variants of symbols. 
37 | .TP 38 | .BR UNI_QUATERNARY 39 | Represents differences in punctuation. 40 | .SH AUTHOR 41 | .UR https://railgunlabs.com 42 | Railgun Labs 43 | .UE . 44 | .SH INTERNET RESOURCES 45 | The online documentation is published on the 46 | .UR https://railgunlabs.com/unicorn 47 | Railgun Labs website 48 | .UE . 49 | .SH LICENSING 50 | Unicorn is distributed with its end-user license agreement (EULA). 51 | Please review the agreement for information on terms & conditions for accessing or otherwise using Unicorn and for a DISCLAIMER OF ALL WARRANTIES. 52 | -------------------------------------------------------------------------------- /configure.ac: -------------------------------------------------------------------------------- 1 | # Specify the project and version + the minimum Autoconf version required. 2 | AC_PREREQ(2.66) 3 | AC_INIT([unicorn], [1.3.1]) 4 | 5 | # Sanity check to verify this is indeed the source directory. 6 | AC_CONFIG_SRCDIR(src/unicorn.c) 7 | 8 | # Initialize automake and use "foreign" so it doesn't attempt to look for or 9 | # create a GNU project structure: COPYING, NEWS, etc... 10 | AM_INIT_AUTOMAKE([foreign]) 11 | 12 | # Check for the --with-unicorn-config option. 13 | AC_ARG_WITH([unicorn-config], 14 | [AS_HELP_STRING([--with-unicorn-config=PATH], 15 | [Path to a JSON file that configures the Unicorn feature set. (default: features.json)])], 16 | [CONFIG_JSON="$withval"], 17 | [CONFIG_JSON="features.json"]) 18 | AC_SUBST([CONFIG_JSON]) 19 | 20 | # Check for the --enable-examples option. 21 | AC_ARG_ENABLE([examples], 22 | [AS_HELP_STRING([--enable-examples], [Build the example programs (disabled by default)])], 23 | [enable_examples=$enableval], 24 | [enable_examples=no]) # Do not build the examples by default. 25 | AM_CONDITIONAL([BUILD_EXAMPLES], [test "x$enable_examples" = "xyes"]) 26 | 27 | # Checks for toolchain programs. 
28 | AC_PROG_CC 29 | AM_PROG_AR 30 | AC_PROG_CPP 31 | AC_PROG_INSTALL 32 | LT_INIT([win32-dll]) 33 | 34 | # Verify the expected C headers are available. 35 | AC_SUBST([PYTHON],[$PYTHON]) 36 | AC_CHECK_HEADERS([string.h assert.h stdbool.h], [], [AC_MSG_ERROR([could not find required header])]) 37 | 38 | # Check for Python (required to build the project). 39 | AM_PATH_PYTHON([3.10]) 40 | AC_SUBST([PYTHON],[$PYTHON]) 41 | 42 | AC_CONFIG_FILES([ 43 | Makefile 44 | src/Makefile 45 | include/Makefile 46 | examples/Makefile 47 | scripts/Makefile 48 | man/Makefile 49 | unicorn.pc 50 | ]) 51 | AC_OUTPUT 52 | -------------------------------------------------------------------------------- /man/uni_casefoldchk.3: -------------------------------------------------------------------------------- 1 | .TH "UNICORN" "3" "Dec 6th 2025" "Unicorn 1.3.1" 2 | .SH NAME 3 | uni_casefoldchk \- check case fold status 4 | .SH LIBRARY 5 | Embeddable Unicode Algorithms (libunicorn, -lunicorn) 6 | .SH SYNOPSIS 7 | .nf 8 | .B #include 9 | .PP 10 | .BI "unistat uni_casefoldchk(unicasefold " casing ", const void *" text ", unisize " text_len ", uniattr " text_attr ", bool *" result ");" 11 | .fi 12 | .SH DESCRIPTION 13 | This function checks if \f[I]text\f[R] is in casing form \f[I]casing\f[R]. 14 | If it is, then the implementation writes \f[C]true\f[R] to \f[I]result\f[R] else it writes \f[C]false\f[R]. 15 | .PP 16 | The text attributes of \f[I]text\f[R] are specified by \f[I]text_attr\f[R]. 17 | If \f[I]text_len\f[R] is -1, then the implementation assumes \f[I]text\f[R] is null-terminated. 18 | .SH RETURN VALUE 19 | .TP 20 | UNI_OK 21 | On success. 22 | .TP 23 | UNI_BAD_OPERATION 24 | If \f[I]text\f[R] or \f[I]result\f[R] is null. 25 | .TP 26 | UNI_BAD_ENCODING 27 | If \f[I]text\f[R] is not well-formed (checks are omitted if \f[I]text_attr\f[R] has \f[B]UNI_TRUST\f[R](3)). 28 | .TP 29 | UNI_FEATURE_DISABLED 30 | If Unicorn was built without support for case folding. 
31 | .TP 32 | UNI_NO_MEMORY 33 | If dynamic memory allocation failed. 34 | .SH SEE ALSO 35 | .BR UNI_TRUST (3), 36 | .BR unicasefold (3), 37 | .BR unisize (3), 38 | .BR uniattr (3) 39 | .SH AUTHOR 40 | .UR https://railgunlabs.com 41 | Railgun Labs 42 | .UE . 43 | .SH INTERNET RESOURCES 44 | The online documentation is published on the 45 | .UR https://railgunlabs.com/unicorn 46 | Railgun Labs website 47 | .UE . 48 | .SH LICENSING 49 | Unicorn is distributed with its end-user license agreement (EULA). 50 | Please review the agreement for information on terms & conditions for accessing or otherwise using Unicorn and for a DISCLAIMER OF ALL WARRANTIES. 51 | -------------------------------------------------------------------------------- /src/byteswap.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Unicorn - Embeddable Unicode Algorithms 3 | * Copyright (c) 2024-2025 Railgun Labs 4 | * 5 | * This software is dual-licensed: you can redistribute it and/or modify 6 | * it under the terms of the GNU General Public License version 3 as 7 | * published by the Free Software Foundation. For the terms of this 8 | * license, see . 9 | * 10 | * Alternatively, you can license this software under a proprietary 11 | * license, as set out in . 
12 | */ 13 | 14 | #ifndef BYTESWAP_H 15 | #define BYTESWAP_H 16 | 17 | #include "common.h" 18 | 19 | static inline uint32_t uni_swap32(uint32_t val) 20 | { 21 | uint32_t valm = (val << 8u) & 0xFF00FF00u; 22 | valm |= (val >> 8u) & 0xFF00FFu; 23 | return (valm << 16u) | (valm >> 16u); 24 | } 25 | 26 | static inline uint16_t uni_swap16(uint16_t val) 27 | { 28 | const uint32_t byte1 = (uint32_t)val << 8u; 29 | const uint32_t byte2 = (uint32_t)val >> 8u; 30 | return (uint16_t)(byte1 | byte2); 31 | } 32 | 33 | static inline uint32_t uni_swap32_le(uint32_t val) 34 | { 35 | #if defined(UNICORN_BIG_ENDIAN) 36 | return uni_swap32(val); 37 | #else 38 | return val; 39 | #endif 40 | } 41 | 42 | static inline uint16_t uni_swap16_le(uint16_t val) 43 | { 44 | #if defined(UNICORN_BIG_ENDIAN) 45 | return uni_swap16(val); 46 | #else 47 | return val; 48 | #endif 49 | } 50 | 51 | static inline uint32_t uni_swap32_be(uint32_t val) 52 | { 53 | #if defined(UNICORN_LITTLE_ENDIAN) 54 | return uni_swap32(val); 55 | #else 56 | return val; 57 | #endif 58 | } 59 | 60 | static inline uint16_t uni_swap16_be(uint16_t val) 61 | { 62 | #if defined(UNICORN_LITTLE_ENDIAN) 63 | return uni_swap16(val); 64 | #else 65 | return val; 66 | #endif 67 | } 68 | 69 | #endif // BYTESWAP_H 70 | -------------------------------------------------------------------------------- /man/uniweighting.3: -------------------------------------------------------------------------------- 1 | .TH "UNICORN" "3" "Dec 6th 2025" "Unicorn 1.3.1" 2 | .SH NAME 3 | uniweighting \- collation weighting algorithm 4 | .SH LIBRARY 5 | Embeddable Unicode Algorithms (libunicorn, -lunicorn) 6 | .SH SYNOPSIS 7 | .nf 8 | .B #include 9 | .PP 10 | .B enum uniweighting { 11 | .RS 12 | .B UNI_NON_IGNORABLE, 13 | .B UNI_SHIFTED, 14 | .RE 15 | .B }; 16 | .fi 17 | .SH DESCRIPTION 18 | This enumeration defines constants representing the possible options for variable weighted characters. 
19 | .PP 20 | The Unicode Collation Algorithm defines the collation for some characters as variable. 21 | This typically includes punctuation characters and symbols. 22 | Based on the variable-weighting setting, these characters can be interpreted as having a quaternary level. 23 | .SH CONSTANTS 24 | .TP 25 | .BR UNI_NON_IGNORABLE 26 | The words with hyphen-minus or hyphen are grouped together, but before all letters in the third position. 27 | This is because they are not ignorable, and have primary values that differ from the letters. 28 | The symbols ☠ and ♡ have primary differences. 29 | .TP 30 | .BR UNI_SHIFTED 31 | The hyphen-minus and hyphen are grouped together, and their differences are less significant than the casing differences in the letter "l". 32 | This grouping results from the fact that they are ignorable, but their fourth level differences are according to the original primary order, which is more intuitive than Unicode order. 33 | The symbols ☠ and ♡ are ignored on levels 1-3. 34 | .SH SEE ALSO 35 | .BR unistrength (3) 36 | .SH AUTHOR 37 | .UR https://railgunlabs.com 38 | Railgun Labs 39 | .UE . 40 | .SH INTERNET RESOURCES 41 | The online documentation is published on the 42 | .UR https://railgunlabs.com/unicorn 43 | Railgun Labs website 44 | .UE . 45 | .SH LICENSING 46 | Unicorn is distributed with its end-user license agreement (EULA). 47 | Please review the agreement for information on terms & conditions for accessing or otherwise using Unicorn and for a DISCLAIMER OF ALL WARRANTIES. 
48 | -------------------------------------------------------------------------------- /scripts/collation.py: -------------------------------------------------------------------------------- 1 | # Unicorn - Embeddable Unicode Algorithms 2 | # Copyright (c) 2024-2025 Railgun Labs 3 | # 4 | # This software is dual-licensed: you can redistribute it and/or modify 5 | # it under the terms of the GNU General Public License version 3 as 6 | # published by the Free Software Foundation. For the terms of this 7 | # license, see . 8 | # 9 | # Alternatively, you can license this software under a proprietary 10 | # license, as set out in . 11 | 12 | from typing import Dict, Set, Type 13 | import zipfile 14 | import json 15 | 16 | from .config import Config 17 | from .tools import Codespace 18 | from .feature import Feature 19 | from .basics import StorageSize, SerializationData 20 | from .decomposition import CanonicalDecompositionFeature 21 | 22 | class CollationFeature(Feature): 23 | @staticmethod 24 | def dependencies() -> Set[Type[Feature]]: 25 | return set([CanonicalDecompositionFeature]) 26 | 27 | def pre_process(self, codespace: Codespace) -> None: 28 | codespace.register_property("collation_subtype", 0, StorageSize.UINT8) 29 | 30 | def process(self, archive: zipfile.ZipFile, codespace: Codespace, config: Config) -> SerializationData: 31 | file = archive.open('collation.json') 32 | data = json.loads(file.read()) 33 | subtypes: Dict[str,int] = data["subtypes"] 34 | size: int = data["size"] 35 | header: str = data["header"] 36 | source: str = data["source"] 37 | file.close() 38 | 39 | # Associate collation subtypes with code points. 
40 | subtype_property = codespace.get("collation_subtype") 41 | for key,subtype in subtypes.items(): 42 | codepoint = int(key, 16) 43 | codespace.set(codepoint, subtype_property, subtype) 44 | 45 | public_header = "#define UNICORN_FEATURE_COLLATION\n" 46 | return SerializationData(source, header, public_header, size) 47 | -------------------------------------------------------------------------------- /man/uni_seterrfunc.3: -------------------------------------------------------------------------------- 1 | .TH "UNICORN" "3" "Dec 6th 2025" "Unicorn 1.3.1" 2 | .SH NAME 3 | uni_seterrfunc \- receive diagnostic events 4 | .SH LIBRARY 5 | Embeddable Unicode Algorithms (libunicorn, -lunicorn) 6 | .SH SYNOPSIS 7 | .nf 8 | .B #include 9 | .PP 10 | .BI "void uni_seterrfunc(void *" user_data ", unierrfunc " callback ");" 11 | .fi 12 | .SH DESCRIPTION 13 | Associate the function \f[I]callback\f[R] with the library's internal error logger. 14 | Anytime a function returns an error \f[I]callback\f[R] will be invoked with a description of the error. 15 | It is up to the implementation of \f[I]callback\f[R] to be thread safe. 16 | .PP 17 | The implementation generally invokes \f[I]callback\f[R] at the moment the error occurs. 18 | Callers are encouraged to set breakpoints in their implementation of \f[I]callback\f[R] and view the stack trace to discover the exact cause of the error. 19 | .SH EXAMPLES 20 | This example registers a custom error callback with Unicorn. 21 | This callback will be invoked when an error occurs in Unicorn. 22 | In practice, you can set a breakpoint within the function for your debugger to pause on and then troubleshoot the error by inspecting the error message. 
23 | .PP 24 | .in +4n 25 | .EX 26 | #include 27 | #include 28 | 29 | static void on_error(void *user_data, const char *message) 30 | { 31 | puts(message); 32 | } 33 | 34 | int main(void) 35 | { 36 | uni_seterrfunc(NULL, on_error); 37 | return 0; 38 | } 39 | .EE 40 | .in 41 | .SH SEE ALSO 42 | .BR unimemfunc (3), 43 | .BR unierrfunc (3) 44 | .SH AUTHOR 45 | .UR https://railgunlabs.com 46 | Railgun Labs 47 | .UE . 48 | .SH INTERNET RESOURCES 49 | The online documentation is published on the 50 | .UR https://railgunlabs.com/unicorn 51 | Railgun Labs website 52 | .UE . 53 | .SH LICENSING 54 | Unicorn is distributed with its end-user license agreement (EULA). 55 | Please review the agreement for information on terms & conditions for accessing or otherwise using Unicorn and for a DISCLAIMER OF ALL WARRANTIES. 56 | -------------------------------------------------------------------------------- /man/uni_prev.3: -------------------------------------------------------------------------------- 1 | .TH "UNICORN" "3" "Dec 6th 2025" "Unicorn 1.3.1" 2 | .SH NAME 3 | uni_prev \- decode the previous scalar value 4 | .SH LIBRARY 5 | Embeddable Unicode Algorithms (libunicorn, -lunicorn) 6 | .SH SYNOPSIS 7 | .nf 8 | .B #include 9 | .PP 10 | .BI "unistat uni_prev(const void *" text ", unisize " text_len ", uniattr " text_attr ", unisize *" index ", unichar *" cp ");" 11 | .fi 12 | .SH DESCRIPTION 13 | This function decodes the preceding Unicode scalar value from \f[I]text\f[R] at code unit \f[I]index\f[R] and writes the result to \f[I]cp\f[R]. 14 | The \f[I]index\f[R] parameter is updated by the implementation to refer to the code unit beginning the preceding scalar. 15 | .PP 16 | This function returns \f[B]UNI_DONE\f[R] when iteration has reached the end of \f[I]text\f[R], indicating there are no more characters. 17 | If the implementation detects \f[I]text\f[R] is malformed, then it returns \f[B]UNI_BAD_ENCODING\f[R]. 
18 | .PP 19 | The \f[I]index\f[R] parameter must refer to a code point boundary otherwise the behavior is undefined. 20 | .SH RETURN VALUE 21 | .TP 22 | UNI_OK 23 | If the character was successfully decoded. 24 | .TP 25 | UNI_DONE 26 | If the end of \f[I]text\f[R] was reached. 27 | .TP 28 | UNI_BAD_ENCODING 29 | If \f[I]text\f[R] is malformed; this is never returned if \f[I]text_attr\f[R] has \f[B]UNI_TRUST\f[R](3). 30 | .TP 31 | UNI_BAD_OPERATION 32 | If \f[I]text\f[R], \f[I]index\f[R], or \f[I]cp\f[R] are NULL. 33 | .SH SEE ALSO 34 | .BR unistat (3), 35 | .BR UNI_TRUST (3), 36 | .BR unisize (3), 37 | .BR uniattr (3), 38 | .BR unichar (3) 39 | .SH AUTHOR 40 | .UR https://railgunlabs.com 41 | Railgun Labs 42 | .UE . 43 | .SH INTERNET RESOURCES 44 | The online documentation is published on the 45 | .UR https://railgunlabs.com/unicorn 46 | Railgun Labs website 47 | .UE . 48 | .SH LICENSING 49 | Unicorn is distributed with its end-user license agreement (EULA). 50 | Please review the agreement for information on terms & conditions for accessing or otherwise using Unicorn and for a DISCLAIMER OF ALL WARRANTIES. 51 | -------------------------------------------------------------------------------- /man/uni_compress.3: -------------------------------------------------------------------------------- 1 | .TH "UNICORN" "3" "Dec 6th 2025" "Unicorn 1.3.1" 2 | .SH NAME 3 | uni_compress \- compress text 4 | .SH LIBRARY 5 | Embeddable Unicode Algorithms (libunicorn, -lunicorn) 6 | .SH SYNOPSIS 7 | .nf 8 | .B #include 9 | .PP 10 | .BI "unistat uni_compress(const void *" text ", unisize " text_len ", uniattr " text_attr ", uint8_t *" buffer ", size_t *" buffer_length ");" 11 | .fi 12 | .SH DESCRIPTION 13 | This function compresses \f[I]text\f[R] using Binary Ordered Compression for Unicode (BOCU-1) and writes the results to \f[I]buffer\f[R]. 14 | Text is compressed based on its code points, i.e. UTF-8, 16, and 32 will compress to the same byte sequence. 
15 | .PP 16 | The capacity of \f[I]buffer\f[R] is specified by \f[I]buffer_length\f[R]. 17 | The implementation always writes to \f[I]buffer_length\f[R] the total number of bytes in the fully compressed text. 18 | If the capacity of \f[I]buffer\f[R] is insufficient, then \f[B]UNI_NO_SPACE\f[R] is returned. 19 | If \f[I]buffer\f[R] is null, then \f[I]buffer_length\f[R] must be zero. 20 | .SH RETURN VALUE 21 | .TP 22 | UNI_OK 23 | On success. 24 | .TP 25 | UNI_NO_SPACE 26 | If \f[I]buffer\f[R] is not large enough. 27 | .TP 28 | UNI_BAD_OPERATION 29 | If \f[I]text\f[R] or \f[I]buffer_length\f[R] is null. 30 | .TP 31 | UNI_BAD_ENCODING 32 | If \f[I]text\f[R] is not well-formed (checks are omitted if \f[I]text_attr\f[R] has \f[B]UNI_TRUST\f[R](3)). 33 | .TP 34 | UNI_FEATURE_DISABLED 35 | If Unicorn was configured without support for the compression algorithm. 36 | .SH SEE ALSO 37 | .BR unistat (3), 38 | .BR UNI_TRUST (3), 39 | .BR unisize (3), 40 | .BR uniattr (3) 41 | .SH AUTHOR 42 | .UR https://railgunlabs.com 43 | Railgun Labs 44 | .UE . 45 | .SH INTERNET RESOURCES 46 | The online documentation is published on the 47 | .UR https://railgunlabs.com/unicorn 48 | Railgun Labs website 49 | .UE . 50 | .SH LICENSING 51 | Unicorn is distributed with its end-user license agreement (EULA). 52 | Please review the agreement for information on terms & conditions for accessing or otherwise using Unicorn and for a DISCLAIMER OF ALL WARRANTIES. 
53 | -------------------------------------------------------------------------------- /man/uni_nextbrk.3: -------------------------------------------------------------------------------- 1 | .TH "UNICORN" "3" "Dec 6th 2025" "Unicorn 1.3.1" 2 | .SH NAME 3 | uni_nextbrk \- compute next boundary 4 | .SH LIBRARY 5 | Embeddable Unicode Algorithms (libunicorn, -lunicorn) 6 | .SH SYNOPSIS 7 | .nf 8 | .B #include 9 | .PP 10 | .BI "unistat uni_nextbrk(unibreak " boundary ", const void *" text ", unisize " text_len ", uniattr " text_attr ", unisize *" index ");" 11 | .fi 12 | .SH DESCRIPTION 13 | This function computes the next \f[I]boundary\f[R] for \f[I]text\f[R] starting from a known code point specified by code unit \f[I]index\f[R]. 14 | The implementation sets \f[I]index\f[R] to the code unit offset of the next \f[I]boundary\f[R]. 15 | .SH RETURN VALUE 16 | .TP 17 | UNI_OK 18 | If the break iterator was successfully repositioned. 19 | .TP 20 | UNI_DONE 21 | If \f[I]index\f[R] is beyond the last character of \f[I]text\f[R]. 22 | .TP 23 | UNI_BAD_OPERATION 24 | If \f[I]text\f[R] or \f[I]index\f[R] is null. 25 | .TP 26 | UNI_BAD_ENCODING 27 | If \f[I]text\f[R] is malformed; this is never returned if \f[I]text_attr\f[R] has \f[B]UNI_TRUST\f[R](3). 28 | .SH EXAMPLES 29 | This example iterates the extended grapheme clusters of a UTF-8 encoded string and prints the indices where each grapheme begins. 30 | .PP 31 | .in +4n 32 | .EX 33 | #include 34 | #include 35 | 36 | int main(void) 37 | { 38 | const char *string = u8"Hi, 世界"; 39 | unisize index = 0; 40 | while (uni_nextbrk(UNI_GRAPHEME, string, -1, UNI_UTF8, &index) == UNI_OK) 41 | { 42 | printf("%d\\n", index); // prints '1', '2', '3', '4', '7', '10' 43 | } 44 | return 0; 45 | } 46 | .EE 47 | .in 48 | .SH SEE ALSO 49 | .BR UNI_TRUST (3), 50 | .BR unibreak (3), 51 | .BR unisize (3), 52 | .BR uniattr (3) 53 | .SH AUTHOR 54 | .UR https://railgunlabs.com 55 | Railgun Labs 56 | .UE . 
57 | .SH INTERNET RESOURCES 58 | The online documentation is published on the 59 | .UR https://railgunlabs.com/unicorn 60 | Railgun Labs website 61 | .UE . 62 | .SH LICENSING 63 | Unicorn is distributed with its end-user license agreement (EULA). 64 | Please review the agreement for information on terms & conditions for accessing or otherwise using Unicorn and for a DISCLAIMER OF ALL WARRANTIES. 65 | -------------------------------------------------------------------------------- /man/uni_decompress.3: -------------------------------------------------------------------------------- 1 | .TH "UNICORN" "3" "Dec 6th 2025" "Unicorn 1.3.1" 2 | .SH NAME 3 | uni_decompress \- decompress text 4 | .SH LIBRARY 5 | Embeddable Unicode Algorithms (libunicorn, -lunicorn) 6 | .SH SYNOPSIS 7 | .nf 8 | .B #include 9 | .PP 10 | .BI "unistat uni_decompress(const uint8_t *" buffer ", size_t " buffer_length ", void *" text ", unisize *" text_len ", uniattr " text_attr ");" 11 | .fi 12 | .SH DESCRIPTION 13 | This function decompresses \f[I]buffer\f[R] and writes the result to \f[I]text\f[R]. 14 | The encoding of \f[I]text_attr\f[R] does not need to match the encoding of the original uncompressed text. 15 | .PP 16 | The capacity of \f[I]text\f[R] is specified by \f[I]text_len\f[R]. 17 | If \f[I]text\f[R] is not null, then the implementation writes to \f[I]text_len\f[R] the total number of code units written to \f[I]text\f[R]. 18 | If the capacity of \f[I]text\f[R] is insufficient, then \f[B]UNI_NO_SPACE\f[R] is returned. 19 | .PP 20 | If \f[I]text\f[R] is null and \f[I]text_len\f[R] is zero, then the implementation writes to \f[I]text_len\f[R] the number of code units in the fully decompressed text and returns \f[B]UNI_OK\f[R]. 21 | Call the function this way to first compute the total length of the uncompressed text before calling it again with a sufficiently sized buffer. 22 | .SH RETURN VALUE 23 | .TP 24 | UNI_OK 25 | On success. 
26 | .TP 27 | UNI_NO_SPACE 28 | If \f[I]text\f[R] is not large enough. 29 | .TP 30 | UNI_BAD_OPERATION 31 | If \f[I]buffer\f[R] is null, or if \f[I]text\f[R] is null and \f[I]text_len\f[R] is non-zero. 32 | .TP 33 | UNI_FEATURE_DISABLED 34 | If Unicorn was configured without support for the compression algorithm. 35 | .SH SEE ALSO 36 | .BR unistat (3), 37 | .BR uni_compress (3), 38 | .BR unisize (3), 39 | .BR uniattr (3) 40 | .SH AUTHOR 41 | .UR https://railgunlabs.com 42 | Railgun Labs 43 | .UE . 44 | .SH INTERNET RESOURCES 45 | The online documentation is published on the 46 | .UR https://railgunlabs.com/unicorn 47 | Railgun Labs website 48 | .UE . 49 | .SH LICENSING 50 | Unicorn is distributed with its end-user license agreement (EULA). 51 | Please review the agreement for information on terms & conditions for accessing or otherwise using Unicorn and for a DISCLAIMER OF ALL WARRANTIES. 52 | -------------------------------------------------------------------------------- /man/uni_casefoldcmp.3: -------------------------------------------------------------------------------- 1 | .TH "UNICORN" "3" "Dec 6th 2025" "Unicorn 1.3.1" 2 | .SH NAME 3 | uni_casefoldcmp \- case-insensitive string comparison 4 | .SH LIBRARY 5 | Embeddable Unicode Algorithms (libunicorn, -lunicorn) 6 | .SH SYNOPSIS 7 | .nf 8 | .B #include 9 | .PP 10 | .BI "unistat uni_casefoldcmp(unicasefold " casing ", const void *" s1 ", unisize " s1_len ", uniattr " s1_attr ", const void *" s2 ", unisize " s2_len ", uniattr " s2_attr ", bool *" result ");" 11 | .fi 12 | .SH DESCRIPTION 13 | This function checks if \f[I]s1\f[R] and \f[I]s2\f[R] are a caseless match in casing form \f[I]casing\f[R]. 14 | If they match, then the implementation writes \f[C]true\f[R] to \f[I]result\f[R] else it writes \f[C]false\f[R]. 15 | .PP 16 | The length and text attributes for \f[I]s1\f[R] are specified by \f[I]s1_len\f[R] and \f[I]s1_attr\f[R]. 
17 | The length and text attributes for \f[I]s2\f[R] are specified by \f[I]s2_len\f[R] and \f[I]s2_attr\f[R]. 18 | If \f[I]s1_len\f[R] is -1, then the implementation assumes \f[I]s1\f[R] is null-terminated. 19 | If \f[I]s2_len\f[R] is -1, then the implementation assumes \f[I]s2\f[R] is null-terminated. 20 | .SH RETURN VALUE 21 | .TP 22 | UNI_OK 23 | If \f[I]s1\f[R] and \f[I]s2\f[R] were compared successfully. 24 | .TP 25 | UNI_BAD_OPERATION 26 | If \f[I]s1\f[R] or \f[I]s2\f[R] are null, or if \f[I]result\f[R] is null. 27 | .TP 28 | UNI_BAD_ENCODING 29 | If \f[I]s1\f[R] or \f[I]s2\f[R] is not well-formed (checks are omitted if the corresponding \f[B]uniattr\f[R](3) has \f[B]UNI_TRUST\f[R](3)). 30 | .TP 31 | UNI_FEATURE_DISABLED 32 | If the library was built without support for case folding. 33 | .TP 34 | UNI_NO_MEMORY 35 | If dynamic memory allocation failed. 36 | .SH SEE ALSO 37 | .BR uniattr (3), 38 | .BR UNI_TRUST (3), 39 | .BR unicasefold (3), 40 | .BR unisize (3) 41 | .SH AUTHOR 42 | .UR https://railgunlabs.com 43 | Railgun Labs 44 | .UE . 45 | .SH INTERNET RESOURCES 46 | The online documentation is published on the 47 | .UR https://railgunlabs.com/unicorn 48 | Railgun Labs website 49 | .UE . 50 | .SH LICENSING 51 | Unicorn is distributed with its end-user license agreement (EULA). 52 | Please review the agreement for information on terms & conditions for accessing or otherwise using Unicorn and for a DISCLAIMER OF ALL WARRANTIES. 
53 | -------------------------------------------------------------------------------- /man/uni_normqchk.3: -------------------------------------------------------------------------------- 1 | .TH "UNICORN" "3" "Dec 6th 2025" "Unicorn 1.3.1" 2 | .SH NAME 3 | uni_normqchk \- normalization quick check 4 | .SH LIBRARY 5 | Embeddable Unicode Algorithms (libunicorn, -lunicorn) 6 | .SH SYNOPSIS 7 | .nf 8 | .B #include 9 | .PP 10 | .BI "unistat uni_normqchk(uninormform " form ", const void *" text ", unisize " text_len ", uniattr " text_attr ", uninormchk *" result ");" 11 | .fi 12 | .SH DESCRIPTION 13 | This function performs a quick check to see if \f[I]text\f[R] is, is not, or might be normalized. 14 | For a definitive yes/no answer, use \f[B]uni_normchk\f[R](3). 15 | .PP 16 | The implementation behaves as follows: 17 | .RS 18 | .IP \[bu] 2 19 | \f[I]result\f[R] is set to \f[B]UNI_YES\f[R] if all characters in \f[I]text\f[R] quick check to yes. 20 | .IP \[bu] 2 21 | \f[I]result\f[R] is set to \f[B]UNI_NO\f[R] if at least one character in \f[I]text\f[R] quick checks to no. 22 | .IP \[bu] 2 23 | \f[I]result\f[R] is set to \f[B]UNI_MAYBE\f[R] if at least one character in \f[I]text\f[R] quick checks to maybe and no characters quick check to no. 24 | .RE 25 | .PP 26 | Support for normalization quick check must be enabled in the JSON configuration file. 27 | .PP 28 | .in +4n 29 | .EX 30 | { 31 | "algorithms": { 32 | "normalizationQuickCheck": true 33 | } 34 | } 35 | .EE 36 | .in 37 | .SH RETURN VALUE 38 | .TP 39 | UNI_OK 40 | On success. 41 | .TP 42 | UNI_BAD_OPERATION 43 | If \f[I]text\f[R] or \f[I]result\f[R] is null. 44 | .TP 45 | UNI_BAD_ENCODING 46 | If \f[I]text\f[R] is malformed; this is never returned if \f[I]text_attr\f[R] has \f[B]UNI_TRUST\f[R](3). 47 | .TP 48 | UNI_FEATURE_DISABLED 49 | If Unicorn was built without support for \f[I]form\f[R]. 
50 | .SH SEE ALSO 51 | .BR uni_normchk (3), 52 | .BR uninormchk (3), 53 | .BR UNI_TRUST (3), 54 | .BR uninormform (3), 55 | .BR unisize (3), 56 | .BR uniattr (3) 57 | .SH AUTHOR 58 | .UR https://railgunlabs.com 59 | Railgun Labs 60 | .UE . 61 | .SH INTERNET RESOURCES 62 | The online documentation is published on the 63 | .UR https://railgunlabs.com/unicorn 64 | Railgun Labs website 65 | .UE . 66 | .SH LICENSING 67 | Unicorn is distributed with its end-user license agreement (EULA). 68 | Please review the agreement for information on terms & conditions for accessing or otherwise using Unicorn and for a DISCLAIMER OF ALL WARRANTIES. 69 | -------------------------------------------------------------------------------- /examples/encode_character.c: -------------------------------------------------------------------------------- 1 | /* 2 | * This file is public domain. 3 | * You may use, modify, and distribute it without restriction. 4 | */ 5 | 6 | // Examples are also documented online at: 7 | // . 8 | 9 | #include 10 | #include 11 | 12 | int main(int argc, char *argv[]) 13 | { 14 | // This example demonstrates how to encode a code point as UTF-8, UTF-16, and UTF-32. 15 | // After encoding, the code units are printed as hexadecimal. 16 | 17 | // The following variable defines the Unicode code point that will be encoded. 18 | // In this example, it's Unicode character U+1F600. Change this value to a 19 | // different character and notice how the printed code units change. 20 | unichar cp = 0x1F600; 21 | 22 | printf("Code point: U+%04X\n", cp); 23 | 24 | // 25 | // Encode the code point as UTF-8. UTF-8 is a variable length encoding and will encode 26 | // to either 1, 2, 3, or 4 code units. 
27 | // 28 | 29 | uint8_t u8[4]; 30 | unisize u8_len = 4; 31 | 32 | if (uni_encode(cp, u8, &u8_len, UNI_UTF8) == UNI_OK) 33 | { 34 | printf("UTF-8 :"); 35 | for (unisize i = 0; i < u8_len; i++) 36 | { 37 | printf(" 0x%02X", u8[i]); 38 | } 39 | putchar('\n'); 40 | } 41 | 42 | // 43 | // Encode the code point as UTF-16. UTF-16 is a variable length encoding and will encode 44 | // to either 1 or 2 code units. 45 | // 46 | 47 | uint16_t u16[2]; 48 | unisize u16_len = 2; 49 | 50 | if (uni_encode(cp, u16, &u16_len, UNI_UTF16) == UNI_OK) 51 | { 52 | printf("UTF-16 :"); 53 | for (unisize i = 0; i < u16_len; i++) 54 | { 55 | printf(" 0x%04X", u16[i]); 56 | } 57 | putchar('\n'); 58 | } 59 | 60 | // 61 | // Encode the code point as UTF-32. Code points are 21-bit integers and UTF-32 stores 62 | // them in a 32-bit integer. UTF-32 is not variable length, unlike UTF-8 and UTF-16. 63 | // 64 | 65 | uint32_t u32[1]; 66 | unisize u32_len = 1; 67 | 68 | if (uni_encode(cp, u32, &u32_len, UNI_UTF32) == UNI_OK) 69 | { 70 | printf("UTF-32 :"); 71 | for (unisize i = 0; i < u32_len; i++) 72 | { 73 | printf(" 0x%08X", u32[i]); 74 | } 75 | putchar('\n'); 76 | } 77 | 78 | return 0; 79 | } 80 | -------------------------------------------------------------------------------- /examples/case_fold.c: -------------------------------------------------------------------------------- 1 | /* 2 | * This file is public domain. 3 | * You may use, modify, and distribute it without restriction. 4 | */ 5 | 6 | // Examples are also documented online at: 7 | // . 8 | 9 | #include 10 | #include 11 | #include 12 | #include 13 | 14 | int main(int argc, char *argv[]) 15 | { 16 | // This code example demonstrates case folding strings. Case folding 17 | // transforms strings for the purposes of caseless string comparison. 18 | // 19 | // You are not supposed to display case folded strings to the end-user. 20 | // They are for internal caseless string comparisons only. 
21 | 22 | const char *src1 = u8"Straße"; 23 | const char *src2 = u8"STrasse"; 24 | 25 | char dst1[16]; 26 | char dst2[16]; 27 | 28 | unisize dst1_len = 16; 29 | unisize dst2_len = 16; 30 | 31 | bool is_equal = false; 32 | 33 | // The following function case folds two strings, performs the comparison, and 34 | // reports the result. This approach is recommended when two strings are being 35 | // compared in a one-off comparison check. If you'll be repeatedly comparing 36 | // the same strings over and over again, then the approach in the next code 37 | // fragment is preferred. 38 | if (uni_casefoldcmp(UNI_CANONICAL, src1, -1, UNI_UTF8, src2, -1, UNI_UTF8, &is_equal) == UNI_OK) 39 | { 40 | if (is_equal) 41 | { 42 | puts("equal"); 43 | } 44 | else 45 | { 46 | puts("not equal"); 47 | } 48 | } 49 | 50 | // The following code fragment case folds two strings and compares the results. 51 | // This more manual approach is recommended when you'll be comparing the same 52 | // strings over and over again because you can case fold them _once_ and then 53 | // re-use the results for each subsequent comparison. 54 | if (uni_casefold(UNI_CANONICAL, src1, -1, UNI_UTF8, dst1, &dst1_len, UNI_UTF8 | UNI_NULIFY) == UNI_OK) 55 | { 56 | if (uni_casefold(UNI_CANONICAL, src2, -1, UNI_UTF8, dst2, &dst2_len, UNI_UTF8 | UNI_NULIFY) == UNI_OK) 57 | { 58 | if (strcmp(dst1, dst2) == 0) 59 | { 60 | puts("equal"); 61 | } 62 | else 63 | { 64 | puts("not equal"); 65 | } 66 | } 67 | } 68 | return 0; 69 | } 70 | -------------------------------------------------------------------------------- /examples/compare_text.c: -------------------------------------------------------------------------------- 1 | /* 2 | * This file is public domain. 3 | * You may use, modify, and distribute it without restriction. 4 | */ 5 | 6 | // Examples are also documented online at: 7 | // . 
8 | 9 | #include 10 | #include 11 | #include 12 | #include 13 | 14 | int main(int argc, char *argv[]) 15 | { 16 | // This code example demonstrates comparing strings for canonical equivalence. 17 | // Canonical equivalence means the graphemes are compared, not the code points 18 | // or code units. It interprets precomposed and decomposed characters that 19 | // represent the same grapheme as equivalent. 20 | 21 | const char *src1 = u8"ma\u0301scara"; // 'a' + U+0301 = á (decomposed) 22 | const char *src2 = u8"m\u00E1scara"; // U+00E1 = á (precomposed) 23 | 24 | char dst1[16]; 25 | char dst2[16]; 26 | 27 | unisize dst1_len = 16; 28 | unisize dst2_len = 16; 29 | 30 | bool is_equal = false; 31 | 32 | // The following function normalizes two strings, performs the comparison, and 33 | // prints the result. This approach is recommended when two strings are compared 34 | // in a one-off comparison check. If you'll be repeatedly comparing the same 35 | // strings over and over again, then the approach in the next fragment is preferred. 36 | if (uni_normcmp(src1, -1, UNI_UTF8, src2, -1, UNI_UTF8, &is_equal) == UNI_OK) 37 | { 38 | if (is_equal) 39 | { 40 | puts("equal"); 41 | } 42 | else 43 | { 44 | puts("not equal"); 45 | } 46 | } 47 | 48 | // The following lines normalize two strings and compare the results. 49 | // This more manual approach is recommended when you'll be comparing the same 50 | // strings over and over again because you can normalize them _once_ and then 51 | // re-use the results for each subsequent comparison. 
52 | if (uni_norm(UNI_NFD, src1, -1, UNI_UTF8, dst1, &dst1_len, UNI_UTF8 | UNI_NULIFY) == UNI_OK) 53 | { 54 | if (uni_norm(UNI_NFD, src2, -1, UNI_UTF8, dst2, &dst2_len, UNI_UTF8 | UNI_NULIFY) == UNI_OK) 55 | { 56 | if (strcmp(dst1, dst2) == 0) 57 | { 58 | puts("equal"); 59 | } 60 | else 61 | { 62 | puts("not equal"); 63 | } 64 | } 65 | } 66 | return 0; 67 | } 68 | -------------------------------------------------------------------------------- /man/uniattr.3: -------------------------------------------------------------------------------- 1 | .TH "UNICORN" "3" "Dec 6th 2025" "Unicorn 1.3.1" 2 | .SH NAME 3 | uniattr \- text attributes 4 | .SH LIBRARY 5 | Embeddable Unicode Algorithms (libunicorn, -lunicorn) 6 | .SH SYNOPSIS 7 | .nf 8 | .B #include 9 | .PP 10 | .BI "typedef uint32_t uniattr;" 11 | .fi 12 | .SH DESCRIPTION 13 | This type represents a bit mask of text attributes. 14 | Unless documented otherwise, all functions that accept a \f[B]uniattr\f[R] must observe the following rules: 15 | .PP 16 | There must be exactly one of the following character encoding flags present. 17 | .PP 18 | .RS 19 | .IP \[bu] 2 20 | \f[B]UNI_SCALAR\f[R](3) 21 | .IP \[bu] 2 22 | \f[B]UNI_UTF8\f[R](3) 23 | .IP \[bu] 2 24 | \f[B]UNI_UTF16\f[R](3) 25 | .IP \[bu] 2 26 | \f[B]UNI_UTF32\f[R](3) 27 | .RE 28 | .PP 29 | There can optionally be one of the following endian flags. 30 | If none are present, then native byte order is assumed. 31 | These flags are incompatible with \f[B]UNI_SCALAR\f[R](3) which is always in native byte order. 32 | These flags have no effect with \f[B]UNI_UTF8\f[R](3) because UTF-8 is endian independent. 33 | .PP 34 | .RS 35 | .IP \[bu] 2 36 | \f[B]UNI_NATIVE\f[R](3) 37 | .IP \[bu] 2 38 | \f[B]UNI_LITTLE\f[R](3) 39 | .IP \[bu] 2 40 | \f[B]UNI_BIG\f[R](3) 41 | .RE 42 | .PP 43 | There can optionally be any combination of the following flags. 
44 | .PP 45 | .RS 46 | .IP \[bu] 2 47 | \f[B]UNI_TRUST\f[R](3) 48 | .IP \[bu] 2 49 | \f[B]UNI_NULIFY\f[R](3) 50 | .RE 51 | .PP 52 | The following example demonstrates using these flags with \f[B]uni_next\f[R](3) to parse the first code point of a UTF-16 big endian string. 53 | .PP 54 | .in +4n 55 | .EX 56 | const char16_t text[] = u"Hello, World!"; 57 | int index = 0; 58 | unichar cp; 59 | uni_next(text, -1, UNI_UTF16 | UNI_BIG, &index, &cp); 60 | .EE 61 | .in 62 | .SH SEE ALSO 63 | .BR UNI_SCALAR (3), 64 | .BR UNI_UTF8 (3), 65 | .BR UNI_UTF16 (3), 66 | .BR UNI_UTF32 (3), 67 | .BR UNI_NATIVE (3), 68 | .BR UNI_LITTLE (3), 69 | .BR UNI_BIG (3), 70 | .BR UNI_TRUST (3), 71 | .BR UNI_NULIFY (3), 72 | .BR uni_next (3) 73 | .SH AUTHOR 74 | .UR https://railgunlabs.com 75 | Railgun Labs 76 | .UE . 77 | .SH INTERNET RESOURCES 78 | The online documentation is published on the 79 | .UR https://railgunlabs.com/unicorn 80 | Railgun Labs website 81 | .UE . 82 | .SH LICENSING 83 | Unicorn is distributed with its end-user license agreement (EULA). 84 | Please review the agreement for information on terms & conditions for accessing or otherwise using Unicorn and for a DISCLAIMER OF ALL WARRANTIES. 85 | -------------------------------------------------------------------------------- /scripts/decomposition.py: -------------------------------------------------------------------------------- 1 | # Unicorn - Embeddable Unicode Algorithms 2 | # Copyright (c) 2024-2025 Railgun Labs 3 | # 4 | # This software is dual-licensed: you can redistribute it and/or modify 5 | # it under the terms of the GNU General Public License version 3 as 6 | # published by the Free Software Foundation. For the terms of this 7 | # license, see . 8 | # 9 | # Alternatively, you can license this software under a proprietary 10 | # license, as set out in . 
11 | 12 | from typing import List, Set, Type, Tuple 13 | import zipfile 14 | import json 15 | 16 | from .config import Config, OptimizeFor 17 | from .tools import Codepoint, Codespace 18 | from .feature import Feature 19 | from .basics import StorageSize, SerializationData 20 | from .ccc import CanonicalCombiningClass 21 | from .encoding import EncodingUTF8 22 | 23 | class CanonicalDecompositionFeature(Feature): 24 | @staticmethod 25 | def dependencies() -> Set[Type[Feature]]: 26 | return set([CanonicalCombiningClass, EncodingUTF8]) 27 | 28 | def pre_process(self, codespace: Codespace) -> None: 29 | codespace.register_property("canonical_decomposition_mapping_offset", 0, StorageSize.UINT16) 30 | 31 | def process(self, archive: zipfile.ZipFile, codespace: Codespace, config: Config) -> SerializationData: 32 | decomp_property = codespace.get("canonical_decomposition_mapping_offset") 33 | if config.optimize == OptimizeFor.SPEED: 34 | file = archive.open('normalize_for_speed.json') 35 | else: 36 | file = archive.open('normalize_for_size.json') 37 | data = json.loads(file.read()) 38 | mappings: List[Tuple[Codepoint,int]] = data["mappings"] 39 | source: str = data["source"] 40 | header: str = data["header"] 41 | size: int = data["size"] 42 | file.close() 43 | 44 | # Compute the size (in bytes) occupied by the decomposition mapping data. 45 | if config.optimize == OptimizeFor.SPEED: 46 | size *= config.character_storage_bytes() 47 | 48 | # Associate code points with their canonical decomposition mapping. 
49 | for mapping in mappings: 50 | cp = mapping[0] 51 | offset = mapping[1] 52 | codespace.set(cp, decomp_property, offset) 53 | 54 | public_header = "#define UNICORN_FEATURE_NFD\n" 55 | return SerializationData(source, header, public_header, size) 56 | -------------------------------------------------------------------------------- /scripts/composition.py: -------------------------------------------------------------------------------- 1 | # Unicorn - Embeddable Unicode Algorithms 2 | # Copyright (c) 2024-2025 Railgun Labs 3 | # 4 | # This software is dual-licensed: you can redistribute it and/or modify 5 | # it under the terms of the GNU General Public License version 3 as 6 | # published by the Free Software Foundation. For the terms of this 7 | # license, see . 8 | # 9 | # Alternatively, you can license this software under a proprietary 10 | # license, as set out in . 11 | 12 | from typing import List, Set, Tuple, Type 13 | import zipfile 14 | import json 15 | 16 | from .config import Config 17 | from .tools import Codepoint, Codespace 18 | from .feature import Feature, IS_COMPOSABLE, FLAGS_PROPERTY 19 | from .basics import StorageSize, SerializationData 20 | from .decomposition import CanonicalDecompositionFeature 21 | 22 | class CanonicalCompositionFeature(Feature): 23 | @staticmethod 24 | def dependencies() -> Set[Type[Feature]]: 25 | return set([CanonicalDecompositionFeature]) 26 | 27 | def pre_process(self, codespace: Codespace) -> None: 28 | codespace.register_property("canonical_composition_mapping_offset", 0, StorageSize.UINT16) 29 | codespace.register_property("canonical_composition_mapping_count", 0, StorageSize.UINT8) 30 | codespace.register_property(*FLAGS_PROPERTY) 31 | 32 | def process(self, archive: zipfile.ZipFile, codespace: Codespace, config: Config) -> SerializationData: 33 | file = archive.open('composition.json') 34 | data = json.loads(file.read()) 35 | mappings: List[Tuple[Codepoint,int,int]] = data["compositionMappings"] 36 | size: int = 
data["compositionSize"] 37 | source: str = data["compositionSource"] 38 | header: str = data["compositionHeader"] 39 | file.close() 40 | 41 | offset_property = codespace.get("canonical_composition_mapping_offset") 42 | count_property = codespace.get("canonical_composition_mapping_count") 43 | flags_property = codespace.get(FLAGS_PROPERTY[0]) 44 | 45 | for value_triplet in mappings: 46 | first = value_triplet[0] 47 | offset = value_triplet[1] 48 | count = value_triplet[2] 49 | codespace.set(first, offset_property, offset) 50 | codespace.set(first, count_property, count) 51 | codespace.set_bitwise(first, flags_property, IS_COMPOSABLE) 52 | 53 | public_header = "#define UNICORN_FEATURE_NFC\n" 54 | return SerializationData(source, header, public_header, size) 55 | -------------------------------------------------------------------------------- /man/uni_sortkeycmp.3: -------------------------------------------------------------------------------- 1 | .TH "UNICORN" "3" "Dec 6th 2025" "Unicorn 1.3.1" 2 | .SH NAME 3 | uni_sortkeycmp \- compare sort keys 4 | .SH LIBRARY 5 | Embeddable Unicode Algorithms (libunicorn, -lunicorn) 6 | .SH SYNOPSIS 7 | .nf 8 | .B #include 9 | .PP 10 | .BI "unistat uni_sortkeycmp(const uint16_t *" sk1 ", size_t " sk1_len ", const uint16_t *" sk2 ", size_t " sk2_len ", int32_t *" result ");" 11 | .fi 12 | .SH DESCRIPTION 13 | This function compares sort keys \f[I]sk1\f[R] and \f[I]sk2\f[R] and writes either -1, 0, or 1 to \f[I]result\f[R] depending on if \f[C]sk1 < sk2\f[R], \f[C]sk1 = sk2\f[R], or \f[C]sk1 > sk2\f[R]. 14 | The sort keys must be generated with \f[B]uni_sortkeymk\f[R](3). 15 | .SH RETURN VALUE 16 | .TP 17 | UNI_OK 18 | If the collation algorithm executed successfully. 19 | .TP 20 | UNI_BAD_OPERATION 21 | If \f[I]sk1\f[R], \f[I]sk2\f[R], or \f[I]result\f[R] are null. 22 | .SH EXAMPLES 23 | This example constructs two sort keys and compares them. 24 | Sort keys must be generated with the same settings for their order to make sense. 
25 | In this example, the sort keys are constructed with \f[B]UNI_SHIFTED\f[R] weighting and \f[B]UNI_PRIMARY\f[R] strength. 26 | In a real application you would cache the sort keys for future comparisons. 27 | For one-off comparisons, use \f[B]uni_collate\f[R](3). 28 | .PP 29 | .in +4n 30 | .EX 31 | #include 32 | #include 33 | 34 | int main(void) 35 | { 36 | const char *s1 = "hello", *s2 = "Hi"; 37 | uint16_t sk1[32] = {0}, sk2[16] = {0}; 38 | size_t sk1_len = 32, sk2_len = 16; 39 | 40 | if (uni_sortkeymk(s1, -1, UNI_UTF8, UNI_SHIFTED, UNI_PRIMARY, sk1, &sk1_len) != UNI_OK || 41 | uni_sortkeymk(s2, -1, UNI_UTF8, UNI_SHIFTED, UNI_PRIMARY, sk2, &sk2_len) != UNI_OK) 42 | { 43 | puts("failed to construct sort keys"); 44 | return 1; 45 | } 46 | 47 | int result; 48 | if (uni_sortkeycmp(sk1, sk1_len, sk2, sk2_len, &result) != UNI_OK) 49 | { 50 | puts("failed to compare sort keys"); 51 | return 1; 52 | } 53 | 54 | printf("%d", result); 55 | return 0; 56 | } 57 | .EE 58 | .in 59 | .SH SEE ALSO 60 | .BR uni_sortkeymk (3) 61 | .SH AUTHOR 62 | .UR https://railgunlabs.com 63 | Railgun Labs 64 | .UE . 65 | .SH INTERNET RESOURCES 66 | The online documentation is published on the 67 | .UR https://railgunlabs.com/unicorn 68 | Railgun Labs website 69 | .UE . 70 | .SH LICENSING 71 | Unicorn is distributed with its end-user license agreement (EULA). 72 | Please review the agreement for information on terms & conditions for accessing or otherwise using Unicorn and for a DISCLAIMER OF ALL WARRANTIES. 
73 | -------------------------------------------------------------------------------- /man/uni_setmemfunc.3: -------------------------------------------------------------------------------- 1 | .TH "UNICORN" "3" "Dec 6th 2025" "Unicorn 1.3.1" 2 | .SH NAME 3 | uni_setmemfunc \- register a memory allocator 4 | .SH LIBRARY 5 | Embeddable Unicode Algorithms (libunicorn, -lunicorn) 6 | .SH SYNOPSIS 7 | .nf 8 | .B #include 9 | .PP 10 | .BI "unistat uni_setmemfunc(void *" user_data ", unimemfunc " allocf ");" 11 | .fi 12 | .SH DESCRIPTION 13 | Sets \f[I]allocf\f[R] as the implementation for dynamic memory allocations. 14 | If \f[I]allocf\f[R] is null then the implementation reverts to its default allocator which may be the C standard allocator or a dummy allocator that always fails to allocate memory. 15 | The latter is only present when the C standard library allocators are disabled. 16 | .PP 17 | Support for C standard library allocators must be enabled in the JSON configuration file. 18 | If the standard allocators are not enabled, then dummy allocators that always return NULL are used. 19 | In this case, a custom allocator must be supplied. 20 | .PP 21 | .in +4n 22 | .EX 23 | { 24 | "hasStandardAllocators": true 25 | } 26 | .EE 27 | .in 28 | .SH RETURN VALUE 29 | .TP 30 | UNI_OK 31 | If the memory allocator was modified. 32 | .TP 33 | UNI_FEATURE_DISABLED 34 | If \f[I]allocf\f[R] is null and Unicorn was built without support for the C standard library memory allocator. 35 | .SH EXAMPLES 36 | This example registers a custom memory allocator with Unicorn. 37 | In the example, the custom allocator wraps the C standard memory allocators, however, this is redundant because Unicorn already initializes with a custom allocator that wraps the C standard allocators. 38 | They are wrapped here for example purposes only. 39 | An actual implementation would implement and wrap its own custom memory allocator. 
40 | .PP 41 | .in +4n 42 | .EX 43 | #include 44 | #include 45 | 46 | static void *allocfree(void *ud, void *ptr, size_t osize, size_t nsize) 47 | { 48 | if (nsize == 0) 49 | { 50 | free(ptr); 51 | return NULL; 52 | } 53 | else 54 | { 55 | return realloc(ptr, nsize); 56 | } 57 | } 58 | 59 | int main(void) 60 | { 61 | uni_setmemfunc(NULL, allocfree); 62 | return 0; 63 | } 64 | .EE 65 | .in 66 | .SH SEE ALSO 67 | .BR unimemfunc (3) 68 | .SH AUTHOR 69 | .UR https://railgunlabs.com 70 | Railgun Labs 71 | .UE . 72 | .SH INTERNET RESOURCES 73 | The online documentation is published on the 74 | .UR https://railgunlabs.com/unicorn 75 | Railgun Labs website 76 | .UE . 77 | .SH LICENSING 78 | Unicorn is distributed with its end-user license agreement (EULA). 79 | Please review the agreement for information on terms & conditions for accessing or otherwise using Unicorn and for a DISCLAIMER OF ALL WARRANTIES. 80 | -------------------------------------------------------------------------------- /scripts/quickcheck.py: -------------------------------------------------------------------------------- 1 | # Unicorn - Embeddable Unicode Algorithms 2 | # Copyright (c) 2024-2025 Railgun Labs 3 | # 4 | # This software is dual-licensed: you can redistribute it and/or modify 5 | # it under the terms of the GNU General Public License version 3 as 6 | # published by the Free Software Foundation. For the terms of this 7 | # license, see . 8 | # 9 | # Alternatively, you can license this software under a proprietary 10 | # license, as set out in . 
11 | 12 | from typing import Dict, Set, Type 13 | import zipfile 14 | import json 15 | 16 | from .config import Config 17 | from .tools import Codespace 18 | from .feature import Feature 19 | from .basics import StorageSize, SerializationData 20 | from .ccc import CanonicalCombiningClass 21 | 22 | class NFCQuickCheck(Feature): 23 | @staticmethod 24 | def dependencies() -> Set[Type[Feature]]: 25 | return set([CanonicalCombiningClass]) 26 | 27 | def pre_process(self, codespace: Codespace) -> None: 28 | codespace.register_property("quick_check_flags", 0, StorageSize.UINT8) 29 | 30 | def process(self, archive: zipfile.ZipFile, codespace: Codespace, _: Config) -> SerializationData: 31 | file = archive.open('quickcheck.json') 32 | data = json.loads(file.read()) 33 | nfc_quick_check: Dict[str,int] = data["quickCheckNFC"] 34 | file.close() 35 | 36 | qc_flags_property = codespace.get("quick_check_flags") 37 | for cp, qc_value in nfc_quick_check.items(): 38 | codespace.set_bitwise(int(cp,16), qc_flags_property, qc_value << 4) 39 | 40 | public_header = "#define UNICORN_FEATURE_NFC_QUICK_CHECK\n" 41 | return SerializationData(public_header=public_header) 42 | 43 | class NFDQuickCheck(Feature): 44 | @staticmethod 45 | def dependencies() -> Set[Type[Feature]]: 46 | return set([CanonicalCombiningClass]) 47 | 48 | def pre_process(self, codespace: Codespace) -> None: 49 | codespace.register_property("quick_check_flags", 0, StorageSize.UINT8) 50 | 51 | def process(self, archive: zipfile.ZipFile, codespace: Codespace, _: Config) -> SerializationData: 52 | file = archive.open('quickcheck.json') 53 | data = json.loads(file.read()) 54 | nfd_quick_check: Dict[str,int] = data["quickCheckNFD"] 55 | file.close() 56 | 57 | qc_flags_property = codespace.get("quick_check_flags") 58 | for cp, qc_value in nfd_quick_check.items(): 59 | codespace.set_bitwise(int(cp,16), qc_flags_property, qc_value) 60 | 61 | public_header = "#define UNICORN_FEATURE_NFD_QUICK_CHECK\n" 62 | return 
SerializationData(public_header=public_header) 63 | -------------------------------------------------------------------------------- /man/uni_next.3: -------------------------------------------------------------------------------- 1 | .TH "UNICORN" "3" "Dec 6th 2025" "Unicorn 1.3.1" 2 | .SH NAME 3 | uni_next \- decode a scalar value 4 | .SH LIBRARY 5 | Embeddable Unicode Algorithms (libunicorn, -lunicorn) 6 | .SH SYNOPSIS 7 | .nf 8 | .B #include 9 | .PP 10 | .BI "unistat uni_next(const void *" text ", unisize " text_len ", uniattr " text_attr ", unisize *" index ", unichar *" cp ");" 11 | .fi 12 | .SH DESCRIPTION 13 | This function decodes one Unicode scalar value from \f[I]text\f[R] at code unit \f[I]index\f[R] and writes the result to \f[I]cp\f[R]. 14 | The \f[I]index\f[R] parameter is updated by the implementation to refer to the code unit beginning the next scalar. 15 | .PP 16 | It returns \f[B]UNI_DONE\f[R] when iteration has reached the end of \f[I]text\f[R] indicating there are no more characters. 17 | If the implementation detects \f[I]text\f[R] is malformed, then it returns \f[B]UNI_BAD_ENCODING\f[R]. 18 | .PP 19 | The \f[I]index\f[R] parameter must refer to a code point boundary otherwise the behavior is undefined. 20 | .SH RETURN VALUE 21 | .TP 22 | UNI_OK 23 | If the scalar was successfully decoded. 24 | .TP 25 | UNI_DONE 26 | If the end of \f[I]text\f[R] was reached. 27 | .TP 28 | UNI_BAD_ENCODING 29 | If \f[I]text\f[R] is malformed; this is never returned if \f[I]text_attr\f[R] has \f[B]UNI_TRUST\f[R](3). 30 | .TP 31 | UNI_BAD_OPERATION 32 | If \f[I]text\f[R], \f[I]index\f[R], or \f[I]cp\f[R] are NULL. 33 | .SH EXAMPLES 34 | This example prints each Unicode scalar value from a text string encoded as UTF-8. 
35 | .PP 36 | .in +4n 37 | .EX 38 | #include 39 | #include 40 | 41 | int main(void) 42 | { 43 | const char str[] = u8"I 🕵️."; // I spy 44 | unisize i = 0; 45 | for (;;) 46 | { 47 | unichar cp; 48 | unistat r = uni_next(str, -1, UNI_UTF8, &i, &cp); 49 | if (r == UNI_DONE) 50 | { 51 | break; 52 | } 53 | else if (r == UNI_BAD_ENCODING) 54 | { 55 | // malformed character 56 | } 57 | else 58 | { 59 | printf("U+%04X\\n", cp); // print scalar 60 | } 61 | } 62 | return 0; 63 | } 64 | .EE 65 | .in 66 | .SH SEE ALSO 67 | .BR unistat (3), 68 | .BR UNI_TRUST (3), 69 | .BR unisize (3), 70 | .BR uniattr (3), 71 | .BR unichar (3) 72 | .SH AUTHOR 73 | .UR https://railgunlabs.com 74 | Railgun Labs 75 | .UE . 76 | .SH INTERNET RESOURCES 77 | The online documentation is published on the 78 | .UR https://railgunlabs.com/unicorn 79 | Railgun Labs website 80 | .UE . 81 | .SH LICENSING 82 | Unicorn is distributed with its end-user license agreement (EULA). 83 | Please review the agreement for information on terms & conditions for accessing or otherwise using Unicorn and for a DISCLAIMER OF ALL WARRANTIES. 84 | -------------------------------------------------------------------------------- /examples/collate_text.c: -------------------------------------------------------------------------------- 1 | /* 2 | * This file is public domain. 3 | * You may use, modify, and distribute it without restriction. 4 | */ 5 | 6 | // Examples are also documented online at: 7 | // . 8 | 9 | #include 10 | #include 11 | 12 | int main(int argc, char *argv[]) 13 | { 14 | // This code example demonstrates how to collate strings (compare strings for sorting). 15 | // The online documentation is available here: 16 | // . 
17 | 18 | const char *s1 = u8"Hello World"; 19 | const char *s2 = u8"こんにちは世界"; 20 | 21 | uint16_t sk1[64] = {0}; 22 | uint16_t sk2[64] = {0}; 23 | 24 | size_t sk1_len = 64; 25 | size_t sk2_len = 64; 26 | 27 | int32_t result = 0; 28 | 29 | // The following snippet collates two strings and prints the result of the comparison. 30 | // This function, conceptually, builds sort keys for the input strings and compares them. 31 | // Building a sort key is expensive. If strings are collated multiple times, it is recommended 32 | // to build the sort keys once and use them for subsequent comparisons. 33 | if (uni_collate(s1, -1, UNI_UTF8, s2, -1, UNI_UTF8, UNI_NON_IGNORABLE, UNI_PRIMARY, &result) == UNI_OK) 34 | { 35 | if (result < 0) 36 | { 37 | puts("s1 < s2"); 38 | } 39 | else if (result > 0) 40 | { 41 | puts("s1 > s2"); 42 | } 43 | else 44 | { 45 | puts("s1 = s2"); 46 | } 47 | } 48 | 49 | // The following snippet collates strings the manual way. The first step is to 50 | // generate a "sort key" for both strings. The sort key encodes the sorting 51 | // weights of the string. 52 | // 53 | // Constructing a sort key is the computationally expensive part of the process. 54 | // Once the sort keys are constructed, they are compared which is inexpensive. 
55 | if (uni_sortkeymk(s1, -1, UNI_UTF8, UNI_NON_IGNORABLE, UNI_PRIMARY, sk1, &sk1_len) == UNI_OK) 56 | { 57 | if (uni_sortkeymk(s2, -1, UNI_UTF8, UNI_NON_IGNORABLE, UNI_PRIMARY, sk2, &sk2_len) == UNI_OK) 58 | { 59 | if (uni_sortkeycmp(sk1, sk1_len, sk2, sk2_len, &result) == UNI_OK) 60 | { 61 | if (result < 0) 62 | { 63 | puts("s1 < s2"); 64 | } 65 | else if (result > 0) 66 | { 67 | puts("s1 > s2"); 68 | } 69 | else 70 | { 71 | puts("s1 = s2"); 72 | } 73 | } 74 | } 75 | } 76 | return 0; 77 | } 78 | -------------------------------------------------------------------------------- /src/memory.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Unicorn - Embeddable Unicode Algorithms 3 | * Copyright (c) 2024-2025 Railgun Labs 4 | * 5 | * This software is dual-licensed: you can redistribute it and/or modify 6 | * it under the terms of the GNU General Public License version 3 as 7 | * published by the Free Software Foundation. For the terms of this 8 | * license, see . 9 | * 10 | * Alternatively, you can license this software under a proprietary 11 | * license, as set out in . 12 | */ 13 | 14 | #include "common.h" 15 | #include "unidata.h" 16 | 17 | #if defined(UNICORN_HAVE_C_MEMORY_ROUTINES) 18 | #include 19 | static void *default_allocf(void *ud, void *ptr, size_t osize, size_t nsize) 20 | { 21 | (void)ud; 22 | (void)osize; 23 | void *mem = NULL; 24 | if (nsize == (size_t)0) 25 | { 26 | free(ptr); // cppcheck-suppress misra-c2012-21.3 ; This function is optional. 27 | } 28 | else 29 | { 30 | mem = realloc(ptr, nsize); // cppcheck-suppress misra-c2012-21.3 ; This function is optional. 31 | } 32 | return mem; 33 | } 34 | static unimemfunc unicorn_allocf = &default_allocf; 35 | #else 36 | static unimemfunc unicorn_allocf; 37 | #endif 38 | 39 | static void *unicorn_ud; 40 | 41 | UNICORN_API unistat uni_setmemfunc(void *user_data, unimemfunc allocf) // cppcheck-suppress misra-c2012-8.7 ; This is supposed to have external linkage. 
42 | { 43 | unistat status = UNI_OK; 44 | if (allocf == NULL) 45 | { 46 | unicorn_ud = NULL; 47 | #if defined(UNICORN_HAVE_C_MEMORY_ROUTINES) 48 | unicorn_allocf = &default_allocf; 49 | #else 50 | unicorn_allocf = NULL; 51 | status = UNI_FEATURE_DISABLED; 52 | #endif 53 | } 54 | else 55 | { 56 | unicorn_ud = user_data; 57 | unicorn_allocf = allocf; 58 | } 59 | return status; 60 | } 61 | 62 | void *uni_malloc(size_t size) 63 | { 64 | void *ptr = NULL; 65 | if (unicorn_allocf != NULL) // LCOV_EXCL_BR_LINE: Branch taken only by the features.json combinations tests. 66 | { 67 | ptr = unicorn_allocf(unicorn_ud, NULL, 0, size); 68 | } 69 | return ptr; 70 | } 71 | 72 | void *uni_realloc(void *old_ptr, size_t old_size, size_t new_size) 73 | { 74 | void *new_ptr = NULL; 75 | if (unicorn_allocf != NULL) // LCOV_EXCL_BR_LINE: Branch taken only by the features.json combinations tests. 76 | { 77 | new_ptr = unicorn_allocf(unicorn_ud, old_ptr, old_size, new_size); 78 | } 79 | return new_ptr; 80 | } 81 | 82 | void uni_free(void *ptr, size_t size) 83 | { 84 | if (unicorn_allocf != NULL) // LCOV_EXCL_BR_LINE: Branch taken only by the features.json combinations tests. 85 | { 86 | (void)unicorn_allocf(unicorn_ud, ptr, size, 0); 87 | } 88 | } 89 | -------------------------------------------------------------------------------- /UnicornConfig.cmake.in: -------------------------------------------------------------------------------- 1 | #[=======================================================================[.rst: 2 | FindUnicorn 3 | ------------ 4 | 5 | Finds the Unicorn library. 6 | 7 | Imported Targets 8 | ^^^^^^^^^^^^^^^^ 9 | 10 | This module provides the following imported targets, if found: 11 | 12 | ``Railgun::Unicorn`` 13 | The Unicorn library 14 | 15 | Result Variables 16 | ^^^^^^^^^^^^^^^^ 17 | 18 | This will define the following variables: 19 | 20 | ``UNICORN_FOUND`` 21 | True if the system has the Unicorn library. 
22 | ``UNICORN_VERSION`` 23 | The version of the Unicorn library which was found. 24 | ``UNICORN_INCLUDE_DIRS`` 25 | Include directories needed to use Unicorn. 26 | ``UNICORN_LIBRARIES`` 27 | Libraries needed to link to Unicorn. 28 | 29 | Cache Variables 30 | ^^^^^^^^^^^^^^^ 31 | 32 | The following cache variables may also be set: 33 | 34 | ``UNICORN_INCLUDE_DIR`` 35 | The directory containing ``unicorn.h``. 36 | ``UNICORN_LIBRARY`` 37 | The path to the Unicorn library. 38 | 39 | #]=======================================================================] 40 | 41 | # On Windows the header and library are installed in the same directory as this 42 | # script; therefore, the following function is called to obtain its path. 43 | get_filename_component(UNICORN_CONFIG_DIR "${CMAKE_CURRENT_LIST_FILE}" DIRECTORY) 44 | 45 | # Find header. 46 | find_path(UNICORN_INCLUDE_DIR unicorn.h 47 | HINTS /usr/local/include /usr/include ${UNICORN_CONFIG_DIR}) 48 | 49 | # Find library. 50 | find_library(UNICORN_LIBRARY 51 | NAMES Unicorn unicorn 52 | HINTS /usr/local/lib /usr/lib ${UNICORN_CONFIG_DIR}) 53 | 54 | # Not needed anymore. 55 | unset(UNICORN_CONFIG_DIR) 56 | 57 | # Use version specified in top-level CMakeLists.txt 58 | set(UNICORN_VERSION "@PROJECT_VERSION@") 59 | 60 | # Handle the QUIETLY and REQUIRED arguments and set UNICORN_FOUND to TRUE if all listed variables are TRUE. 61 | include(FindPackageHandleStandardArgs) 62 | FIND_PACKAGE_HANDLE_STANDARD_ARGS(Unicorn 63 | FOUND_VAR UNICORN_FOUND 64 | REQUIRED_VARS UNICORN_LIBRARY UNICORN_INCLUDE_DIR 65 | VERSION_VAR UNICORN_VERSION) 66 | 67 | # Set result variables. 68 | if(UNICORN_FOUND) 69 | set(UNICORN_LIBRARIES ${UNICORN_LIBRARY}) 70 | set(UNICORN_INCLUDE_DIRS ${UNICORN_INCLUDE_DIR}) 71 | endif() 72 | 73 | # Export a module. 
74 | if(UNICORN_FOUND AND NOT TARGET Railgun::Unicorn) 75 | add_library(Railgun::Unicorn UNKNOWN IMPORTED) 76 | set_target_properties(Railgun::Unicorn PROPERTIES 77 | IMPORTED_LOCATION "${UNICORN_LIBRARY}" 78 | INTERFACE_INCLUDE_DIRECTORIES "${UNICORN_INCLUDE_DIR}") 79 | endif() 80 | 81 | # Cached variables should be hidden in the CMake interface unless the user explicitly asks to edit them. 82 | mark_as_advanced(UNICORN_INCLUDE_DIR UNICORN_LIBRARY) 83 | -------------------------------------------------------------------------------- /man/uni_caseconv.3: -------------------------------------------------------------------------------- 1 | .TH "UNICORN" "3" "Dec 6th 2025" "Unicorn 1.3.1" 2 | .SH NAME 3 | uni_caseconv \- case convert a string 4 | .SH LIBRARY 5 | Embeddable Unicode Algorithms (libunicorn, -lunicorn) 6 | .SH SYNOPSIS 7 | .nf 8 | .B #include 9 | .PP 10 | .BI "unistat uni_caseconv(unicaseconv " casing ", const void *" src ", unisize " src_len ", uniattr " src_attr ", void *" dst ", unisize *" dst_len ", uniattr " dst_attr ");" 11 | .fi 12 | .SH DESCRIPTION 13 | This function case converts \f[I]src\f[R] to casing form \f[I]casing\f[R] and writes the result to \f[I]dst\f[R]. 14 | The capacity of \f[I]dst\f[R] is specified by \f[I]dst_len\f[R]. 15 | .PP 16 | If \f[I]dst\f[R] is not null, then the implementation writes to \f[I]dst_len\f[R] the total number of code units written to \f[I]dst\f[R]. 17 | If the capacity of \f[I]dst\f[R] is insufficient, then \f[B]UNI_NO_SPACE\f[R] is returned. 18 | .PP 19 | If \f[I]dst\f[R] is null and \f[I]dst_len\f[R] is zero, then the implementation writes to \f[I]dst_len\f[R] the number of code units in the fully case converted text and returns \f[B]UNI_OK\f[R]. 20 | Call the function this way to first compute the total length of the destination buffer before calling it again with a sufficiently sized buffer. 21 | .SH RETURN VALUE 22 | .TP 23 | UNI_OK 24 | If \f[I]src\f[R] was case converted successfully. 
25 | .TP 26 | UNI_BAD_OPERATION 27 | If \f[I]src\f[R] is null, if \f[I]dst_len\f[R] is null, or if \f[I]dst\f[R] is null and \f[I]dst_len\f[R] is non-zero. 28 | .TP 29 | UNI_BAD_ENCODING 30 | If \f[I]src\f[R] is not well-formed (checks are omitted if \f[I]src_attr\f[R] has \f[B]UNI_TRUST\f[R](3)). 31 | .TP 32 | UNI_NO_SPACE 33 | If \f[I]dst\f[R] was not large enough to accommodate the case converted text. 34 | .TP 35 | UNI_FEATURE_DISABLED 36 | If the library was built without support for case conversion. 37 | .SH EXAMPLES 38 | This example case converts a string to uppercase. 39 | .PP 40 | .in +4n 41 | .EX 42 | #include 43 | #include 44 | 45 | int main(void) 46 | { 47 | const char *in = u8"hello, 世界"; 48 | char out[32]; 49 | unisize outlen = sizeof(out); 50 | 51 | if (uni_caseconv(UNI_UPPER, in, -1, UNI_UTF8, out, &outlen, UNI_UTF8) != UNI_OK) 52 | { 53 | // something went wrong 54 | return 1; 55 | } 56 | 57 | printf("%.*s", outlen, out); // prints "HELLO, 世界" 58 | return 0; 59 | } 60 | .EE 61 | .in 62 | .SH SEE ALSO 63 | .BR unistat (3), 64 | .BR UNI_TRUST (3), 65 | .BR unicaseconv (3), 66 | .BR unisize (3), 67 | .BR uniattr (3) 68 | .SH AUTHOR 69 | .UR https://railgunlabs.com 70 | Railgun Labs 71 | .UE . 72 | .SH INTERNET RESOURCES 73 | The online documentation is published on the 74 | .UR https://railgunlabs.com/unicorn 75 | Railgun Labs website 76 | .UE . 77 | .SH LICENSING 78 | Unicorn is distributed with its end-user license agreement (EULA). 79 | Please review the agreement for information on terms & conditions for accessing or otherwise using Unicorn and for a DISCLAIMER OF ALL WARRANTIES. 
80 | -------------------------------------------------------------------------------- /scripts/case_folding.py: -------------------------------------------------------------------------------- 1 | # Unicorn - Embeddable Unicode Algorithms 2 | # Copyright (c) 2024-2025 Railgun Labs 3 | # 4 | # This software is dual-licensed: you can redistribute it and/or modify 5 | # it under the terms of the GNU General Public License version 3 as 6 | # published by the Free Software Foundation. For the terms of this 7 | # license, see . 8 | # 9 | # Alternatively, you can license this software under a proprietary 10 | # license, as set out in . 11 | 12 | from typing import List, Set, Type, Tuple 13 | import zipfile 14 | import json 15 | 16 | from .config import Config 17 | from .tools import Codespace, Codepoint 18 | from .feature import Feature, FLAGS_PROPERTY, IS_NORMALIZATION_NEEDED, IS_CHANGES_WHEN_CASEFOLDED 19 | from .basics import StorageSize, SerializationData 20 | from .decomposition import CanonicalDecompositionFeature 21 | 22 | class DefaultCaseFolding(Feature): 23 | def pre_process(self, codespace: Codespace) -> None: 24 | codespace.register_property("full_casefold_mapping_offset", 0, StorageSize.UINT16) 25 | codespace.register_property(*FLAGS_PROPERTY) 26 | 27 | def process(self, archive: zipfile.ZipFile, codespace: Codespace, config: Config) -> SerializationData: 28 | file = archive.open('casefold.json') 29 | data = json.loads(file.read()) 30 | source: str = data["source"] 31 | header: str = data["header"] 32 | size: int = data["size"] 33 | has_greek: List[Codepoint] = data["hasGreek"] 34 | case_foldings: List[Tuple[Codepoint,int]] = data["caseFoldings"] 35 | changes_when_casefolded: List[Codepoint] = data["changesWhenCasefolded"] 36 | file.close() 37 | 38 | flags_property = codespace.get(FLAGS_PROPERTY[0]) 39 | casefold_property = codespace.get("full_casefold_mapping_offset") 40 | 41 | for cp in has_greek: 42 | codespace.set_bitwise(cp, flags_property, 
IS_NORMALIZATION_NEEDED) 43 | 44 | for cp in changes_when_casefolded: 45 | codespace.set_bitwise(cp, flags_property, IS_CHANGES_WHEN_CASEFOLDED) 46 | 47 | for cp,offset in case_foldings: 48 | codespace.set(cp, casefold_property, offset) 49 | 50 | header += "#define UNICORN_CHAR_NEEDS_NORMALIZATION {0}u\n".format(IS_NORMALIZATION_NEEDED) 51 | header += "#define UNICORN_CHAR_CHANGES_WHEN_CASEFOLDED {0}u\n".format(IS_CHANGES_WHEN_CASEFOLDED) 52 | 53 | public_header = "#define UNICORN_FEATURE_CASEFOLD_DEFAULT\n" 54 | return SerializationData(source, header, public_header, size) 55 | 56 | class CanonicalCaseFolding(Feature): 57 | @staticmethod 58 | def dependencies() -> Set[Type[Feature]]: 59 | return set([DefaultCaseFolding, CanonicalDecompositionFeature]) 60 | 61 | def process(self, archive: zipfile.ZipFile, codespace: Codespace, config: Config) -> SerializationData: 62 | public_header = "#define UNICORN_FEATURE_CASEFOLD_CANONICAL\n" 63 | return SerializationData(public_header=public_header) 64 | -------------------------------------------------------------------------------- /man/unistat.3: -------------------------------------------------------------------------------- 1 | .TH "UNICORN" "3" "Dec 6th 2025" "Unicorn 1.3.1" 2 | .SH NAME 3 | unistat \- status code 4 | .SH LIBRARY 5 | Embeddable Unicode Algorithms (libunicorn, -lunicorn) 6 | .SH SYNOPSIS 7 | .nf 8 | .B #include 9 | .PP 10 | .B enum unistat { 11 | .RS 12 | .B UNI_OK, 13 | .B UNI_DONE, 14 | .B UNI_NO_MEMORY, 15 | .B UNI_NO_SPACE, 16 | .B UNI_BAD_ENCODING, 17 | .B UNI_BAD_OPERATION, 18 | .B UNI_FEATURE_DISABLED, 19 | .B UNI_MALFUNCTION, 20 | .RE 21 | .B }; 22 | .fi 23 | .SH DESCRIPTION 24 | Most functions in the Unicorn library return a constant from this enumeration. 25 | Of the constants, only \f[B]UNI_OK\f[R] and \f[B]UNI_DONE\f[R] represent a success status whereas the others represent failures. 26 | .SH CONSTANTS 27 | .TP 28 | .BR UNI_OK 29 | Represents the successful execution of an operation. 
30 | .TP 31 | .BR UNI_DONE 32 | Represents the successful completion of an operation. 33 | An example of a function that returns this status code is \f[B]uni_next\f[R](3) which returns it when the end of the input string has been reached. 34 | .TP 35 | .BR UNI_NO_MEMORY 36 | Functions that can dynamically allocate memory will list this constant as one of their potential return values. 37 | Functions that omit this constant as a return value do not perform dynamic memory allocation. 38 | .TP 39 | .BR UNI_NO_SPACE 40 | This failure code indicates the destination buffer is too small to receive the content and therefore a larger buffer must be used. 41 | .TP 42 | .BR UNI_BAD_ENCODING 43 | This failure code indicates a malformed character sequence was encountered when decoding text. 44 | Examples of malformed character sequences would be overlong sequences in UTF-8 or unpaired surrogate characters in UTF-16. 45 | .TP 46 | .BR UNI_BAD_OPERATION 47 | This failure code is returned by functions when they are called with invalid arguments. 48 | For example, a function might return this code when given a null pointer when a non-null pointer was expected. 49 | .TP 50 | .BR UNI_FEATURE_DISABLED 51 | This failure code indicates the requested operation cannot be performed because the feature is disabled. 52 | Unicorn allows consumers to customize the Unicode features it's built with. 53 | This is useful for reducing the footprint of the library. 54 | If a user attempts to call a function that uses a disabled feature of the library, then this status code will be returned. 55 | .TP 56 | .BR UNI_MALFUNCTION 57 | This failure code indicates a defect with the implementation. 58 | In a working version of Unicorn, an application will never see this status code. 59 | If an application encounters this code, it means there is a bug in Unicorn. 60 | .SH SEE ALSO 61 | .BR uni_next (3) 62 | .SH AUTHOR 63 | .UR https://railgunlabs.com 64 | Railgun Labs 65 | .UE . 
66 | .SH INTERNET RESOURCES 67 | The online documentation is published on the 68 | .UR https://railgunlabs.com/unicorn 69 | Railgun Labs website 70 | .UE . 71 | .SH LICENSING 72 | Unicorn is distributed with its end-user license agreement (EULA). 73 | Please review the agreement for information on terms & conditions for accessing or otherwise using Unicorn and for a DISCLAIMER OF ALL WARRANTIES. 74 | -------------------------------------------------------------------------------- /man/uni_norm.3: -------------------------------------------------------------------------------- 1 | .TH "UNICORN" "3" "Dec 6th 2025" "Unicorn 1.3.1" 2 | .SH NAME 3 | uni_norm \- normalize text 4 | .SH LIBRARY 5 | Embeddable Unicode Algorithms (libunicorn, -lunicorn) 6 | .SH SYNOPSIS 7 | .nf 8 | .B #include 9 | .PP 10 | .BI "unistat uni_norm(uninormform " form ", const void *" src ", unisize " src_len ", uniattr " src_attr ", void *" dst ", unisize *" dst_len ", uniattr " dst_attr ");" 11 | .fi 12 | .SH DESCRIPTION 13 | This function normalizes \f[I]src\f[R] and writes the result to the \f[I]dst\f[R] buffer. 14 | .PP 15 | The implementation writes to \f[I]dst_len\f[R] the total number of code units written to \f[I]dst\f[R]. 16 | If the capacity of \f[I]dst\f[R] is insufficient, then \f[B]UNI_NO_SPACE\f[R] is returned. 17 | .PP 18 | If \f[I]dst\f[R] is null and \f[I]dst_len\f[R] is zero, then the function writes to \f[I]dst_len\f[R] the number of code units in the fully normalized text and returns \f[B]UNI_OK\f[R]. 19 | The function can be called this way to first compute the total size of the destination buffer, then called again with a sufficiently sized buffer. 20 | .PP 21 | If \f[I]src_len\f[R] is -1, then \f[I]src\f[R] is assumed to be null-terminated. 22 | .SH RETURN VALUE 23 | .TP 24 | UNI_OK 25 | On success. 26 | .TP 27 | UNI_BAD_OPERATION 28 | If \f[I]src\f[R] is null, if \f[I]dst_len\f[R] is negative, or if \f[I]dst\f[R] is NULL and \f[I]dst_len\f[R] is greater than zero. 
29 | .TP 30 | UNI_BAD_ENCODING 31 | If \f[I]src\f[R] is malformed; this is never returned if \f[I]src_attr\f[R] has \f[B]UNI_TRUST\f[R](3). 32 | .TP 33 | UNI_NO_SPACE 34 | If \f[I]dst\f[R] lacks the capacity to store the normalization of \f[I]src\f[R]. 35 | .TP 36 | UNI_NO_MEMORY 37 | If dynamic memory allocation failed. 38 | .TP 39 | UNI_FEATURE_DISABLED 40 | If Unicorn was built without support for normalizing to \f[I]form\f[R]. 41 | .SH EXAMPLES 42 | This example normalizes the input string to Normalization Form D. This form is ideal for in-memory string comparison because it is quick to compute. 43 | For persistent storage or transmission, Normalization Form C is preferred. 44 | .PP 45 | .in +4n 46 | .EX 47 | #include 48 | #include 49 | 50 | int main(void) 51 | { 52 | const char *in = u8"Åström"; 53 | char out[32]; 54 | unisize outlen = sizeof(out); 55 | 56 | if (uni_norm(UNI_NFD, in, -1, UNI_UTF8, out, &outlen, UNI_UTF8) != UNI_OK) 57 | { 58 | // something went wrong 59 | return 1; 60 | } 61 | 62 | printf("%.*s", outlen, out); 63 | return 0; 64 | } 65 | .EE 66 | .in 67 | .SH SEE ALSO 68 | .BR unistat (3), 69 | .BR UNI_TRUST (3), 70 | .BR uninormform (3), 71 | .BR unisize (3), 72 | .BR uniattr (3) 73 | .SH AUTHOR 74 | .UR https://railgunlabs.com 75 | Railgun Labs 76 | .UE . 77 | .SH INTERNET RESOURCES 78 | The online documentation is published on the 79 | .UR https://railgunlabs.com/unicorn 80 | Railgun Labs website 81 | .UE . 82 | .SH LICENSING 83 | Unicorn is distributed with its end-user license agreement (EULA). 84 | Please review the agreement for information on terms & conditions for accessing or otherwise using Unicorn and for a DISCLAIMER OF ALL WARRANTIES. 
85 | -------------------------------------------------------------------------------- /scripts/case_conversion.py: -------------------------------------------------------------------------------- 1 | # Unicorn - Embeddable Unicode Algorithms 2 | # Copyright (c) 2024-2025 Railgun Labs 3 | # 4 | # This software is dual-licensed: you can redistribute it and/or modify 5 | # it under the terms of the GNU General Public License version 3 as 6 | # published by the Free Software Foundation. For the terms of this 7 | # license, see . 8 | # 9 | # Alternatively, you can license this software under a proprietary 10 | # license, as set out in . 11 | 12 | from typing import Dict, Set, Type 13 | import zipfile 14 | import json 15 | 16 | from .config import Config 17 | from .tools import Codespace 18 | from .feature import Feature, FLAGS_PROPERTY 19 | from .simple_case_mappings import SimpleLowercaseMapping, SimpleTitlecaseMapping, SimpleUppercaseMapping 20 | from .segmentation import WordBreak 21 | from .ccc import CanonicalCombiningClass 22 | from .basics import SerializationData 23 | 24 | class _CaseConversion(Feature): 25 | def pre_process(self, codespace: Codespace) -> None: 26 | codespace.register_property(*FLAGS_PROPERTY) 27 | 28 | def process(self, archive: zipfile.ZipFile, codespace: Codespace, config: Config) -> SerializationData: 29 | file = archive.open('special_case_mappings.json') 30 | data = json.loads(file.read()) 31 | source: str = data["source"] 32 | header: str = data["header"] 33 | size: int = data["size"] 34 | mappings: Dict[str,int] = data["characterFlags"] 35 | file.close() 36 | 37 | flags_property = codespace.get(FLAGS_PROPERTY[0]) 38 | 39 | for cp, flags in mappings.items(): 40 | codespace.set_bitwise(int(cp,16), flags_property, flags) 41 | 42 | return SerializationData(source=source, header=header, size=size) 43 | 44 | class LowercaseConversion(Feature): 45 | @staticmethod 46 | def dependencies() -> Set[Type[Feature]]: 47 | return 
set([SimpleLowercaseMapping, _CaseConversion]) 48 | 49 | def process(self, _: zipfile.ZipFile, __: Codespace, ___: Config) -> SerializationData: 50 | public_header = "#define UNICORN_FEATURE_LOWERCASE_CONVERT\n" 51 | return SerializationData(public_header=public_header) 52 | 53 | class UppercaseConversion(Feature): 54 | @staticmethod 55 | def dependencies() -> Set[Type[Feature]]: 56 | return set([SimpleUppercaseMapping, _CaseConversion]) 57 | 58 | def process(self, _: zipfile.ZipFile, __: Codespace, ___: Config) -> SerializationData: 59 | public_header = "#define UNICORN_FEATURE_UPPERCASE_CONVERT\n" 60 | return SerializationData(public_header=public_header) 61 | 62 | class TitlecaseConversion(Feature): 63 | @staticmethod 64 | def dependencies() -> Set[Type[Feature]]: 65 | return set([SimpleTitlecaseMapping, WordBreak, CanonicalCombiningClass, LowercaseConversion]) 66 | 67 | def process(self, _: zipfile.ZipFile, __: Codespace, ___: Config) -> SerializationData: 68 | public_header = "#define UNICORN_FEATURE_TITLECASE_CONVERT\n" 69 | return SerializationData(public_header=public_header) 70 | -------------------------------------------------------------------------------- /man/uni_convert.3: -------------------------------------------------------------------------------- 1 | .TH "UNICORN" "3" "Dec 6th 2025" "Unicorn 1.3.1" 2 | .SH NAME 3 | uni_convert \- convert encoding forms 4 | .SH LIBRARY 5 | Embeddable Unicode Algorithms (libunicorn, -lunicorn) 6 | .SH SYNOPSIS 7 | .nf 8 | .B #include 9 | .PP 10 | .BI "unistat uni_convert(const void *" src ", unisize " src_len ", uniattr " src_attr ", void *" dst ", unisize *" dst_len ", uniattr " dst_attr ");" 11 | .fi 12 | .SH DESCRIPTION 13 | This function converts the text pointed to by \f[I]src\f[R], which is in the encoding form specified by \f[I]src_attr\f[R], to the encoding specified by \f[I]dst_attr\f[R] and writes the result to \f[I]dst\f[R]. 
14 | If \f[I]src_len\f[R] is -1, then \f[I]src\f[R] is assumed to be null-terminated. 15 | .PP 16 | If \f[I]dst\f[R] isn't large enough to contain the re-encoded \f[I]src\f[R] text, then it will be truncated to the nearest code point boundary and \f[B]UNI_NO_SPACE\f[R] will be returned. 17 | If \f[I]dst\f[R] is large enough, then \f[B]UNI_OK\f[R] is returned. 18 | .PP 19 | If \f[I]dst\f[R] is null and \f[I]dst_len\f[R] is zero, then the implementation writes to \f[I]dst_len\f[R] the number of code units in the fully re-encoded text and returns \f[B]UNI_OK\f[R]. 20 | Call the function this way to first compute the total length of the destination buffer before calling it again with a sufficiently sized buffer. 21 | .PP 22 | If \f[I]src_attr\f[R] and \f[I]dst_attr\f[R] are identical, then this function behaves like \f[B]memcpy\f[R](3). 23 | .SH RETURN VALUE 24 | .TP 25 | UNI_OK 26 | On success. 27 | .TP 28 | UNI_BAD_OPERATION 29 | If \f[I]src\f[R] or \f[I]dst_len\f[R] is null. 30 | .TP 31 | UNI_BAD_ENCODING 32 | If \f[I]src\f[R] is not well-formed (checks are omitted if \f[I]src_attr\f[R] has \f[B]UNI_TRUST\f[R](3)). 33 | .TP 34 | UNI_NO_SPACE 35 | If \f[I]dst\f[R] is too small. 36 | .TP 37 | UNI_FEATURE_DISABLED 38 | If the \f[I]src_attr\f[R] or \f[I]dst_attr\f[R] encoding form is disabled. 39 | .SH EXAMPLES 40 | This example re-encodes text from UTF-32 to UTF-8. 41 | It uses \f[C]char32_t\f[R] and 'U' literal strings from C11.
42 | .PP 43 | .in +4n 44 | .EX 45 | #include 46 | #include 47 | #include 48 | 49 | int main(void) 50 | { 51 | const char32_t in[] = U"👨🏻‍🚀🧑🏼‍🚀 landed on the 🌕 in 1969."; 52 | char out[64]; 53 | unisize outlen = sizeof(out); 54 | 55 | if (uni_convert(in, -1, UNI_UTF32, out, &outlen, UNI_UTF8) != UNI_OK) 56 | { 57 | // something went wrong 58 | return 1; 59 | } 60 | 61 | printf("%.*s", outlen, out); 62 | return 0; 63 | } 64 | .EE 65 | .in 66 | .SH SEE ALSO 67 | .BR unistat (3), 68 | .BR UNI_TRUST (3), 69 | .BR unisize (3), 70 | .BR uniattr (3) 71 | .SH AUTHOR 72 | .UR https://railgunlabs.com 73 | Railgun Labs 74 | .UE . 75 | .SH INTERNET RESOURCES 76 | The online documentation is published on the 77 | .UR https://railgunlabs.com/unicorn 78 | Railgun Labs website 79 | .UE . 80 | .SH LICENSING 81 | Unicorn is distributed with its end-user license agreement (EULA). 82 | Please review the agreement for information on terms & conditions for accessing or otherwise using Unicorn and for a DISCLAIMER OF ALL WARRANTIES. 83 | -------------------------------------------------------------------------------- /man/uni_encode.3: -------------------------------------------------------------------------------- 1 | .TH "UNICORN" "3" "Dec 6th 2025" "Unicorn 1.3.1" 2 | .SH NAME 3 | uni_encode \- encode a scalar value 4 | .SH LIBRARY 5 | Embeddable Unicode Algorithms (libunicorn, -lunicorn) 6 | .SH SYNOPSIS 7 | .nf 8 | .B #include 9 | .PP 10 | .BI "unistat uni_encode(unichar " cp ", void *" dst ", unisize *" dst_len ", uniattr " dst_attr ");" 11 | .fi 12 | .SH DESCRIPTION 13 | This function encodes the Unicode scalar value \f[I]cp\f[R] into the encoding form \f[I]dst_attr\f[R] and writes the resulting code units to \f[I]dst\f[R]. 14 | The code unit capacity of \f[I]dst\f[R] is specified by \f[I]dst_len\f[R]. 15 | The implementation will write the number of code units needed to encode \f[I]cp\f[R] to \f[I]dst_len\f[R] on output. 
16 | .PP 17 | If the capacity of \f[I]dst\f[R] is insufficient, then \f[B]UNI_NO_SPACE\f[R] is returned. 18 | .PP 19 | To ensure \f[I]dst\f[R] is sufficiently sized, the following table lists the number of code units needed to encode any character within its respective encoding form. 20 | .PP 21 | .TS 22 | allbox tab(|); 23 | ll. 24 | \fBEncoding Form\fR|\fBLongest Code Unit Sequence\fR 25 | T{ 26 | UTF-8 27 | T}|T{ 28 | 4 29 | T} 30 | T{ 31 | UTF-16 32 | T}|T{ 33 | 2 34 | T} 35 | T{ 36 | UTF-32 37 | T}|T{ 38 | 1 39 | T} 40 | .TE 41 | .PP 42 | If \f[I]dst\f[R] is null and \f[I]dst_len\f[R] is zero, then the number of code units needed will be computed by the implementation and written to \f[I]dst_len\f[R] and \f[B]UNI_OK\f[R] is returned. 43 | .SH RETURN VALUE 44 | .TP 45 | UNI_OK 46 | If the character was encoded successfully. 47 | .TP 48 | UNI_BAD_OPERATION 49 | If \f[I]cp\f[R] is not a Unicode scalar value, if \f[I]dst_len\f[R] is null, or if \f[I]dst\f[R] is null and \f[I]dst_len\f[R] is non-zero. 50 | .TP 51 | UNI_NO_SPACE 52 | If \f[I]dst\f[R] is too small. 53 | .TP 54 | UNI_FEATURE_DISABLED 55 | If Unicorn was built without support for the encoding form specified by \f[I]dst_attr\f[R]. 56 | .SH EXAMPLES 57 | This example encodes the Unicode scalar value for GRINNING FACE (U+1F600) as UTF-8 encoded code units. 58 | The code units are printed to stdout as hexadecimal numbers.
59 | .PP 60 | .in +4n 61 | .EX 62 | #include 63 | #include 64 | 65 | int main(void) 66 | { 67 | unichar cp = 0x1F600; 68 | 69 | uint8_t u8[4]; 70 | unisize u8_len = 4; 71 | 72 | if (uni_encode(cp, u8, &u8_len, UNI_UTF8) != UNI_OK) 73 | { 74 | puts("failed to encode character"); 75 | } 76 | 77 | for (unisize i = 0; i < u8_len; i++) 78 | { 79 | printf("0x%02X ", u8[i]); 80 | } 81 | return 0; 82 | } 83 | .EE 84 | .in 85 | .SH SEE ALSO 86 | .BR unistat (3), 87 | .BR unichar (3), 88 | .BR unisize (3), 89 | .BR uniattr (3) 90 | .SH AUTHOR 91 | .UR https://railgunlabs.com 92 | Railgun Labs 93 | .UE . 94 | .SH INTERNET RESOURCES 95 | The online documentation is published on the 96 | .UR https://railgunlabs.com/unicorn 97 | Railgun Labs website 98 | .UE . 99 | .SH LICENSING 100 | Unicorn is distributed with its end-user license agreement (EULA). 101 | Please review the agreement for information on terms & conditions for accessing or otherwise using Unicorn and for a DISCLAIMER OF ALL WARRANTIES. 102 | -------------------------------------------------------------------------------- /src/common.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Unicorn - Embeddable Unicode Algorithms 3 | * Copyright (c) 2024-2025 Railgun Labs 4 | * 5 | * This software is dual-licensed: you can redistribute it and/or modify 6 | * it under the terms of the GNU General Public License version 3 as 7 | * published by the Free Software Foundation. For the terms of this 8 | * license, see . 9 | * 10 | * Alternatively, you can license this software under a proprietary 11 | * license, as set out in . 
12 | */ 13 | 14 | #ifndef COMMON_H 15 | #define COMMON_H 16 | 17 | #include "unicorn.h" 18 | #include 19 | #include 20 | 21 | #define GET_ENCODING(FLAGS) ((FLAGS) & (UNI_SCALAR | UNI_UTF8 | UNI_UTF16 | UNI_UTF32)) 22 | 23 | #define UNICHAR_C(X) ((unichar)(X)) 24 | #define UNISIZE_C(X) ((unisize)(X)) 25 | #define UNIHASH_C(X) ((unihash)(X)) 26 | 27 | #define UNICORN_LARGEST_CODE_POINT UNICHAR_C(0x10FFFFL) 28 | #define FIRST_ILLEGAL_CODE_POINT UNICHAR_C(0x110000L) 29 | 30 | #define COUNT_OF(array) (sizeof(array) / sizeof((array)[0])) 31 | 32 | typedef uint8_t UChar8; 33 | typedef uint16_t UChar16; 34 | typedef uint32_t UChar32; 35 | 36 | typedef uint32_t unihash; 37 | 38 | typedef uint16_t(*ByteSwap16)(uint16_t val); 39 | typedef uint32_t(*ByteSwap32)(uint32_t val); 40 | 41 | struct unitext 42 | { 43 | const void *data; 44 | unisize index; 45 | unisize length; 46 | uniattr encoding; 47 | }; 48 | 49 | void *uni_malloc(size_t size); 50 | void *uni_realloc(void *old_ptr, size_t old_size, size_t new_size); 51 | void uni_free(void *ptr, size_t size); 52 | 53 | unisize unichar_to_u8(unichar codepoint, UChar8 bytes[4]); 54 | unisize unichar_to_u16(unichar codepoint, UChar16 words[2], ByteSwap16 swap); // cppcheck-suppress premium-misra-c-2012-17.3 ; This is a false positive. 
55 | 56 | unisize uni_prev_UTF8_seqlen(const UChar8 *start, unisize offset); 57 | 58 | unistat uni_check_input_encoding(const void *text, unisize length, uniattr *encoding); 59 | unistat uni_check_output_encoding(const void *buffer, const unisize *capacity, uniattr *encoding); 60 | 61 | void uni_message(const char *msg); 62 | 63 | static inline bool is_low_surrogate(unichar c) 64 | { 65 | bool is_low; 66 | if (c < UNICHAR_C(0xDC00)) 67 | { 68 | is_low = false; 69 | } 70 | else if (c > UNICHAR_C(0xDFFF)) 71 | { 72 | is_low = false; 73 | } 74 | else 75 | { 76 | is_low = true; 77 | } 78 | return is_low; 79 | } 80 | 81 | static inline bool is_high_surrogate(unichar c) 82 | { 83 | bool is_high; 84 | if (c < UNICHAR_C(0xD800)) 85 | { 86 | is_high = false; 87 | } 88 | else if (c > UNICHAR_C(0xDBFF)) 89 | { 90 | is_high = false; 91 | } 92 | else 93 | { 94 | is_high = true; 95 | } 96 | return is_high; 97 | } 98 | 99 | #if defined(_MSC_VER) 100 | #define UNREACHABLE __assume(false) 101 | #elif defined(__GNUC__) || defined(__clang__) 102 | #define UNREACHABLE __builtin_unreachable() 103 | #else 104 | #define UNREACHABLE assert(0 && "unreachable code") // LCOV_EXCL_BR_LINE 105 | #endif 106 | 107 | #endif // COMMON_H 108 | -------------------------------------------------------------------------------- /man/uni_casefold.3: -------------------------------------------------------------------------------- 1 | .TH "UNICORN" "3" "Dec 6th 2025" "Unicorn 1.3.1" 2 | .SH NAME 3 | uni_casefold \- case fold a string 4 | .SH LIBRARY 5 | Embeddable Unicode Algorithms (libunicorn, -lunicorn) 6 | .SH SYNOPSIS 7 | .nf 8 | .B #include 9 | .PP 10 | .BI "unistat uni_casefold(unicasefold " casing ", const void *" src ", unisize " src_len ", uniattr " src_attr ", void *" dst ", unisize *" dst_len ", uniattr " dst_attr ");" 11 | .fi 12 | .SH DESCRIPTION 13 | This function case folds \f[I]src\f[R] to casing form \f[I]casing\f[R] and writes the result to \f[I]dst\f[R]. 
14 | The capacity of \f[I]dst\f[R] is specified by \f[I]dst_len\f[R]. 15 | .PP 16 | If \f[I]dst\f[R] is not null, then the implementation writes to \f[I]dst_len\f[R] the total number of code units written to \f[I]dst\f[R]. 17 | If the capacity of \f[I]dst\f[R] is insufficient, then \f[B]UNI_NO_SPACE\f[R] is returned. 18 | .PP 19 | If \f[I]dst\f[R] is null and \f[I]dst_len\f[R] is zero, then the implementation writes to \f[I]dst_len\f[R] the number of code units in the fully case folded text and returns \f[B]UNI_OK\f[R]. 20 | Call the function this way to first compute the total length of the destination buffer before calling it again with a sufficiently sized buffer. 21 | .SH RETURN VALUE 22 | .TP 23 | UNI_OK 24 | If \f[I]src\f[R] was case folded successfully. 25 | .TP 26 | UNI_BAD_OPERATION 27 | If \f[I]src\f[R] or \f[I]dst_len\f[R] is null. 28 | .TP 29 | UNI_BAD_ENCODING 30 | If \f[I]src\f[R] is not well-formed (checks are omitted if \f[I]src_attr\f[R] has \f[B]UNI_TRUST\f[R](3)). 31 | .TP 32 | UNI_NO_SPACE 33 | If \f[I]dst\f[R] was not large enough. 34 | .TP 35 | UNI_FEATURE_DISABLED 36 | If the library was built without support for case folding. 37 | .TP 38 | UNI_NO_MEMORY 39 | If dynamic memory allocation failed. 40 | .SH EXAMPLES 41 | This example casefolds a string for canonical caseless comparison. 42 | .PP 43 | The example will produce an output string that is longer than the input string. 44 | This is because the German 'ß' (U+00DF) case folds to 'ss' (U+0073 U+0073) and the Latin small letter 'ö' with diaeresis (U+00F6) canonically normalizes to an 'o' (U+006F) followed by a combining diaeresis character (U+0308).
45 | .PP 46 | .in +4n 47 | .EX 48 | #include 49 | #include 50 | 51 | int main(void) 52 | { 53 | const char *in = u8"Stößen"; 54 | char out[32]; 55 | unisize outlen = sizeof(out); 56 | 57 | if (uni_casefold(UNI_CANONICAL, in, -1, UNI_UTF8, out, &outlen, UNI_UTF8) != UNI_OK) 58 | { 59 | // something went wrong 60 | return 1; 61 | } 62 | 63 | printf("%.*s", outlen, out); // prints "stössen" 64 | return 0; 65 | } 66 | .EE 67 | .in 68 | .SH SEE ALSO 69 | .BR unistat (3), 70 | .BR UNI_TRUST (3), 71 | .BR unicasefold (3), 72 | .BR unisize (3), 73 | .BR uniattr (3) 74 | .SH AUTHOR 75 | .UR https://railgunlabs.com 76 | Railgun Labs 77 | .UE . 78 | .SH INTERNET RESOURCES 79 | The online documentation is published on the 80 | .UR https://railgunlabs.com/unicorn 81 | Railgun Labs website 82 | .UE . 83 | .SH LICENSING 84 | Unicorn is distributed with its end-user license agreement (EULA). 85 | Please review the agreement for information on terms & conditions for accessing or otherwise using Unicorn and for a DISCLAIMER OF ALL WARRANTIES. 86 | -------------------------------------------------------------------------------- /man/uni_sortkeymk.3: -------------------------------------------------------------------------------- 1 | .TH "UNICORN" "3" "Dec 6th 2025" "Unicorn 1.3.1" 2 | .SH NAME 3 | uni_sortkeymk \- make a sort key 4 | .SH LIBRARY 5 | Embeddable Unicode Algorithms (libunicorn, -lunicorn) 6 | .SH SYNOPSIS 7 | .nf 8 | .B #include 9 | .PP 10 | .BI "unistat uni_sortkeymk(const void *" text ", unisize " text_len ", uniattr " text_attr ", uniweighting " weighting ", unistrength " strength ", uint16_t *" sortkey ", size_t *" sortkey_cap ");" 11 | .fi 12 | .SH DESCRIPTION 13 | This function constructs a sort key for \f[I]text\f[R] and writes the result to \f[I]sortkey\f[R]. 14 | .PP 15 | The number of code units in \f[I]text\f[R] is specified by \f[I]text_len\f[R]. 16 | If \f[I]text_len\f[R] is -1, then the implementation assumes \f[I]text\f[R] is null-terminated. 
17 | .PP 18 | The capacity of \f[I]sortkey\f[R] is specified by \f[I]sortkey_cap\f[R]. 19 | The implementation always writes to \f[I]sortkey_cap\f[R] the total number of weights needed for \f[I]text\f[R]. 20 | If the capacity of \f[I]sortkey\f[R] is insufficient, then \f[B]UNI_NO_SPACE\f[R] is returned. 21 | .SH RETURN VALUE 22 | .TP 23 | UNI_OK 24 | If the sort key was successfully constructed. 25 | .TP 26 | UNI_NO_SPACE 27 | If \f[I]sortkey\f[R] lacks capacity for the collation elements. 28 | .TP 29 | UNI_BAD_OPERATION 30 | If \f[I]text\f[R] or \f[I]sortkey_cap\f[R] are NULL. 31 | .TP 32 | UNI_BAD_ENCODING 33 | If \f[I]text\f[R] is not well-formed (checks are omitted if \f[I]text_attr\f[R] has \f[B]UNI_TRUST\f[R](3)). 34 | .TP 35 | UNI_NO_MEMORY 36 | If dynamic memory allocation failed. 37 | .SH EXAMPLES 38 | This example constructs two sort keys and compares them. 39 | Sort keys must be generated with the same settings for their order to make sense. 40 | In this example, the sort keys are constructed with \f[B]UNI_SHIFTED\f[R] weighting and \f[B]UNI_PRIMARY\f[R] strength. 41 | In a real application you would cache the sort keys for future comparisons. 42 | For one-off comparisons, use \f[B]uni_collate\f[R](3).
43 | .PP 44 | .in +4n 45 | .EX 46 | #include 47 | #include 48 | 49 | int main(void) 50 | { 51 | const char *s1 = "hello", *s2 = "Hi"; 52 | uint16_t sk1[32] = {0}, sk2[16] = {0}; 53 | size_t sk1_len = 32, sk2_len = 16; 54 | 55 | if (uni_sortkeymk(s1, -1, UNI_UTF8, UNI_SHIFTED, UNI_PRIMARY, sk1, &sk1_len) != UNI_OK || 56 | uni_sortkeymk(s2, -1, UNI_UTF8, UNI_SHIFTED, UNI_PRIMARY, sk2, &sk2_len) != UNI_OK) 57 | { 58 | puts("failed to construct sort keys"); 59 | return 1; 60 | } 61 | 62 | int result; 63 | if (uni_sortkeycmp(sk1, sk1_len, sk2, sk2_len, &result) != UNI_OK) 64 | { 65 | puts("failed to compare sort keys"); 66 | return 1; 67 | } 68 | 69 | printf("%d", result); 70 | return 0; 71 | } 72 | .EE 73 | .in 74 | .SH SEE ALSO 75 | .BR unistat (3), 76 | .BR uniweighting (3), 77 | .BR unistrength (3), 78 | .BR UNI_TRUST (3), 79 | .BR unisize (3), 80 | .BR uniattr (3) 81 | .SH AUTHOR 82 | .UR https://railgunlabs.com 83 | Railgun Labs 84 | .UE . 85 | .SH INTERNET RESOURCES 86 | The online documentation is published on the 87 | .UR https://railgunlabs.com/unicorn 88 | Railgun Labs website 89 | .UE . 90 | .SH LICENSING 91 | Unicorn is distributed with its end-user license agreement (EULA). 92 | Please review the agreement for information on terms & conditions for accessing or otherwise using Unicorn and for a DISCLAIMER OF ALL WARRANTIES. 
93 | -------------------------------------------------------------------------------- /man/uni_collate.3: -------------------------------------------------------------------------------- 1 | .TH "UNICORN" "3" "Dec 6th 2025" "Unicorn 1.3.1" 2 | .SH NAME 3 | uni_collate \- compare strings for sorting 4 | .SH LIBRARY 5 | Embeddable Unicode Algorithms (libunicorn, -lunicorn) 6 | .SH SYNOPSIS 7 | .nf 8 | .B #include 9 | .PP 10 | .BI "unistat uni_collate(const void *" s1 ", unisize " s1_len ", uniattr " s1_attr ", const void *" s2 ", unisize " s2_len ", uniattr " s2_attr ", uniweighting " weighting ", unistrength " strength ", int32_t *" result ");" 11 | .fi 12 | .SH DESCRIPTION 13 | This function compares \f[I]s1\f[R] and \f[I]s2\f[R] for sorting and writes -1, 0, or 1 to \f[I]result\f[R] depending on if \f[C]s1 < s2\f[R], \f[C]s1 = s2\f[R], or \f[C]s1 > s2\f[R]. 14 | .PP 15 | This function is intended for one-off string comparisons. 16 | If a string will be compared multiple times, then it's recommended to construct a sort key once with \f[B]uni_sortkeymk\f[R](3) and perform comparisons with \f[B]uni_sortkeycmp\f[R](3). 17 | .PP 18 | The length and text attributes for \f[I]s1\f[R] are specified by \f[I]s1_len\f[R] and \f[I]s1_attr\f[R]. 19 | If \f[I]s1_len\f[R] is -1, then the implementation assumes \f[I]s1\f[R] is null-terminated. 20 | .PP 21 | The length and text attributes for \f[I]s2\f[R] are specified by \f[I]s2_len\f[R] and \f[I]s2_attr\f[R]. 22 | If \f[I]s2_len\f[R] is -1, then the implementation assumes \f[I]s2\f[R] is null-terminated. 23 | .SH RETURN VALUE 24 | .TP 25 | UNI_OK 26 | On success. 27 | .TP 28 | UNI_BAD_OPERATION 29 | If \f[I]s1\f[R] or \f[I]s2\f[R] are null, or if \f[I]result\f[R] is null. 30 | .TP 31 | UNI_BAD_ENCODING 32 | If \f[I]s1\f[R] or \f[I]s2\f[R] is not well-formed (checks are omitted if the corresponding \f[B]uniattr\f[R](3) has \f[B]UNI_TRUST\f[R](3)). 
33 | .TP 34 | UNI_FEATURE_DISABLED 35 | If Unicorn was built without support for collation. 36 | .TP 37 | UNI_NO_MEMORY 38 | If dynamic memory allocation failed. 39 | .SH EXAMPLES 40 | This example collates two strings. 41 | The function conceptually constructs a sort key for the input strings and then compares them. 42 | Constructing a sort key is expensive. 43 | If a string will be collated multiple times, then it is recommended to construct a sort key once and cache it for future calculations. 44 | See \f[B]uni_sortkeymk\f[R](3) for details. 45 | .PP 46 | .in +4n 47 | .EX 48 | #include 49 | #include 50 | 51 | int main(void) 52 | { 53 | const char *s1 = u8"Hello World"; 54 | const char *s2 = u8"こんにちは世界"; 55 | int result; 56 | 57 | if (uni_collate(s1, -1, UNI_UTF8, 58 | s2, -1, UNI_UTF8, 59 | UNI_NON_IGNORABLE, UNI_PRIMARY, &result) != UNI_OK) 60 | { 61 | puts("failed to collate strings"); 62 | return 1; 63 | } 64 | 65 | printf("%d", result); 66 | return 0; 67 | } 68 | .EE 69 | .in 70 | .SH SEE ALSO 71 | .BR uni_sortkeymk (3), 72 | .BR uni_sortkeycmp (3), 73 | .BR uniweighting (3), 74 | .BR unistrength (3), 75 | .BR uniattr (3), 76 | .BR UNI_TRUST (3), 77 | .BR unisize (3) 78 | .SH AUTHOR 79 | .UR https://railgunlabs.com 80 | Railgun Labs 81 | .UE . 82 | .SH INTERNET RESOURCES 83 | The online documentation is published on the 84 | .UR https://railgunlabs.com/unicorn 85 | Railgun Labs website 86 | .UE . 87 | .SH LICENSING 88 | Unicorn is distributed with its end-user license agreement (EULA). 89 | Please review the agreement for information on terms & conditions for accessing or otherwise using Unicorn and for a DISCLAIMER OF ALL WARRANTIES. 
90 | -------------------------------------------------------------------------------- /src/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(SOURCES 2 | ../include/unicorn.h 3 | unicorn.c 4 | logger.c 5 | memory.c 6 | common.h 7 | byteswap.h 8 | encoding.c 9 | segmentation.c 10 | normalize.c 11 | normalize.h 12 | collation.c 13 | ducet.c 14 | ducet.h 15 | compression.c 16 | properties.c 17 | caseconv.c 18 | casefold.c 19 | charbuf.c 20 | charbuf.h 21 | charvec.c 22 | charvec.h 23 | ${CMAKE_BINARY_DIR}/unidata.c 24 | ${CMAKE_BINARY_DIR}/unidata.h 25 | ) 26 | 27 | if (UNICORN_BUILD_STATIC) 28 | add_library(unicorn_static STATIC ${SOURCES}) 29 | target_compile_definitions(unicorn_static PUBLIC UNICORN_STATIC) 30 | set_property(TARGET unicorn_static PROPERTY C_STANDARD 99) 31 | set_target_properties(unicorn_static PROPERTIES PUBLIC_HEADER ${CMAKE_SOURCE_DIR}/unicorn.h) 32 | set_target_properties(unicorn_static PROPERTIES PUBLIC_HEADER ${CMAKE_BINARY_DIR}/_unicorn.h) 33 | add_dependencies(unicorn_static unicorn_generate) 34 | install(TARGETS unicorn_static ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} PUBLIC_HEADER DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) 35 | endif () 36 | 37 | if (UNICORN_BUILD_SHARED OR UNICORN_BUILD_EXAMPLES) 38 | add_library(unicorn SHARED ${SOURCES}) 39 | target_compile_definitions(unicorn PRIVATE DLL_EXPORT) 40 | set_property(TARGET unicorn PROPERTY C_STANDARD 99) 41 | set_target_properties(unicorn PROPERTIES PUBLIC_HEADER ${CMAKE_SOURCE_DIR}/unicorn.h) 42 | set_target_properties(unicorn PROPERTIES PUBLIC_HEADER ${CMAKE_BINARY_DIR}/_unicorn.h) 43 | add_dependencies(unicorn unicorn_generate) 44 | 45 | if (CMAKE_C_COMPILER_ID MATCHES "Clang" OR 46 | CMAKE_C_COMPILER_ID MATCHES "GNU") 47 | target_compile_options(unicorn PRIVATE -pedantic) 48 | endif () 49 | 50 | if (CMAKE_C_COMPILER_ID MATCHES "Clang") 51 | target_compile_options(unicorn PRIVATE -Wall -Wextra -Wconversion 
-Wno-missing-field-initializers 52 | -Wdouble-promotion -Wno-unused-parameter -Wno-unused-function -Wno-sign-conversion) 53 | endif () 54 | 55 | install(TARGETS unicorn LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} PUBLIC_HEADER DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) 56 | endif () 57 | 58 | # Define a special build of Unicorn specifically for unit testing. 59 | # This build is either the same as the static library build OR it's 60 | # a special dynamic library build that optionally adds code 61 | # coverage instrumentation. 62 | # 63 | # The static build is necessary on Windows to ensure the tests 64 | # have access to internal/private symbols e.g. those not exported 65 | # with DLL linkage. The coverage build is only ever run on *nix 66 | # systems and shared objects on those platforms don't have those 67 | # linker issues. 68 | if (UNICORN_BUILD_TESTS OR UNICORN_BUILD_COOKBOOK) 69 | if (UNICORN_CODE_COVERAGE) 70 | add_library(unicorn_internal SHARED ${SOURCES}) 71 | target_compile_options(unicorn_internal PRIVATE -g -fprofile-arcs -ftest-coverage -O0) 72 | target_link_options(unicorn_internal PRIVATE -fprofile-arcs -ftest-coverage -fbranch-probabilities -O0) 73 | target_compile_definitions(unicorn_internal PRIVATE DLL_EXPORT) 74 | else () 75 | add_library(unicorn_internal STATIC ${SOURCES}) 76 | target_compile_definitions(unicorn_internal PUBLIC UNICORN_STATIC) 77 | endif () 78 | set_property(TARGET unicorn_internal PROPERTY C_STANDARD 99) 79 | add_dependencies(unicorn_internal unicorn_generate) 80 | endif () 81 | -------------------------------------------------------------------------------- /man/uni_normcmp.3: -------------------------------------------------------------------------------- 1 | .TH "UNICORN" "3" "Dec 6th 2025" "Unicorn 1.3.1" 2 | .SH NAME 3 | uni_normcmp \- compare strings for canonical equivalence 4 | .SH LIBRARY 5 | Embeddable Unicode Algorithms (libunicorn, -lunicorn) 6 | .SH SYNOPSIS 7 | .nf 8 | .B #include 9 | .PP 10 | .BI "unistat
uni_normcmp(const void *" s1 ", unisize " s1_len ", uniattr " s1_attr ", const void *" s2 ", unisize " s2_len ", uniattr " s2_attr ", bool *" result ");" 11 | .fi 12 | .SH DESCRIPTION 13 | This function checks if \f[I]s1\f[R] and \f[I]s2\f[R] are canonically equivalent. 14 | That is, it checks if the graphemes of both strings are the same. 15 | The behavior of this function is identical to calling \f[B]uni_norm\f[R](3) with \f[B]UNI_NFD\f[R] on both strings followed by a code point comparison. 16 | .PP 17 | The implementation is optimized for normalizing the strings incrementally while simultaneously comparing them. 18 | This is a more efficient approach when it's unknown whether input strings are normalized or not. 19 | If it's known in advance that the strings are both normalized, then they can be compared directly with \f[B]memcmp\f[R](3) or \f[B]strcmp\f[R](3). 20 | .PP 21 | The implementation strives to be highly performant and avoid dynamic memory allocation when possible. 22 | Typically, memory allocation will only be performed for unnaturally long combining character sequences, like 23 | .UR https://en.wikipedia.org/wiki/Zalgo_text 24 | Zalgo text 25 | .UE . 26 | It's rare for real-world text to trigger memory allocation. 27 | .SH RETURN VALUE 28 | .TP 29 | UNI_OK 30 | If the strings were compared successfully. 31 | .TP 32 | UNI_BAD_OPERATION 33 | If \f[I]s1\f[R] or \f[I]s2\f[R] are null, if \f[I]s1_len\f[R] or \f[I]s2_len\f[R] are negative, or if \f[I]result\f[R] is null. 34 | .TP 35 | UNI_BAD_ENCODING 36 | If \f[I]s1\f[R] or \f[I]s2\f[R] is not well-formed (checks are omitted if the corresponding \f[B]uniattr\f[R](3) has \f[B]UNI_TRUST\f[R](3)). 37 | .TP 38 | UNI_NO_MEMORY 39 | If dynamic memory allocation failed. 40 | .SH EXAMPLES 41 | This example compares two strings for canonical equivalence. 42 | Conceptually, the implementation normalizes both strings, performs the comparison, and reports the result.
43 | This approach is recommended when strings are compared for one-off equality. 44 | If strings are compared repeatedly, then it's recommended to normalize them with \f[B]uni_norm\f[R](3) and cache the result for the comparisons. 45 | .PP 46 | .in +4n 47 | .EX 48 | #include 49 | #include 50 | #include 51 | 52 | int main(void) 53 | { 54 | const char *s1 = u8"ma\\u0301scara"; // 'a' + U+0301 = á (decomposed) 55 | const char *s2 = u8"m\\u00E1scara"; // U+00E1 = á (precomposed) 56 | bool is_equal; 57 | 58 | if (uni_normcmp(s1, -1, UNI_UTF8, 59 | s2, -1, UNI_UTF8, &is_equal) != UNI_OK) 60 | { 61 | puts("failed to normalize and compare strings"); 62 | return 1; 63 | } 64 | 65 | printf("%s", is_equal ? "equal" : "not equal"); 66 | return 0; 67 | } 68 | .EE 69 | .in 70 | .SH SEE ALSO 71 | .BR uni_norm (3), 72 | .BR uninormform (3), 73 | .BR uniattr (3), 74 | .BR UNI_TRUST (3), 75 | .BR unisize (3) 76 | .SH AUTHOR 77 | .UR https://railgunlabs.com 78 | Railgun Labs 79 | .UE . 80 | .SH INTERNET RESOURCES 81 | The online documentation is published on the 82 | .UR https://railgunlabs.com/unicorn 83 | Railgun Labs website 84 | .UE . 85 | .SH LICENSING 86 | Unicorn is distributed with its end-user license agreement (EULA). 87 | Please review the agreement for information on terms & conditions for accessing or otherwise using Unicorn and for a DISCLAIMER OF ALL WARRANTIES. 
88 | -------------------------------------------------------------------------------- /man/unibreak.3: -------------------------------------------------------------------------------- 1 | .TH "UNICORN" "3" "Dec 6th 2025" "Unicorn 1.3.1" 2 | .SH NAME 3 | unibreak \- text boundaries 4 | .SH LIBRARY 5 | Embeddable Unicode Algorithms (libunicorn, -lunicorn) 6 | .SH SYNOPSIS 7 | .nf 8 | .B #include <unicorn.h> 9 | .PP 10 | .B enum unibreak { 11 | .RS 12 | .B UNI_GRAPHEME, 13 | .B UNI_WORD, 14 | .B UNI_SENTENCE, 15 | .RE 16 | .B }; 17 | .fi 18 | .SH DESCRIPTION 19 | The elements of this enumeration describe the boundaries that can be detected. 20 | Conceptually, a break represents the space in between two code points. 21 | .SH CONSTANTS 22 | .TP 23 | .BR UNI_GRAPHEME 24 | A grapheme or grapheme cluster is a user-perceived character. 25 | Grapheme clusters are useful for user interfaces and user-facing text manipulation. 26 | For example, graphemes should be used in the implementation of text selection, arrow key movement, backspacing through text, and so forth. 27 | When truncating text for presentation, grapheme cluster boundaries should be used to avoid malforming user-perceived characters. 28 | .IP 29 | A key feature of Unicode grapheme clusters is that they remain unchanged across all canonically equivalent normalization forms. 30 | That means the grapheme boundaries remain unchanged whether the text is normalized as NFC or NFD. 31 | .IP 32 | Support for grapheme cluster break detection can be enabled in the JSON configuration as shown below: 33 | .IP 34 | .in +4n 35 | .EX 36 | { 37 | "algorithms": { 38 | "segmentation": [ 39 | "grapheme" 40 | ] 41 | } 42 | } 43 | .EE 44 | .in 45 | .TP 46 | .BR UNI_WORD 47 | Word boundaries are used in a number of different contexts. 48 | The most familiar ones are selection (double-click mouse selection or "move to next word" control-arrow keys) and "whole word search" for search and replace. 
49 | They are also used in database queries and regular expressions, to determine whether elements are within a certain number of words of one another. 50 | .IP 51 | The default algorithm for detecting word boundaries is primarily intended for languages that use white space to delimit words. 52 | Unfortunately, some scripts, like Thai, Lao, Khmer, and Myanmar, do not use spaces between words. 53 | Ideographic scripts such as Japanese and Chinese are even more complex. 54 | It is therefore not possible to provide an algorithm that correctly detects word boundaries across languages. 55 | These languages require special handling by a more sophisticated word break detection algorithm that understands the rules of the language. 56 | .IP 57 | Support for word break detection can be enabled in the JSON configuration file as shown below: 58 | .IP 59 | .in +4n 60 | .EX 61 | { 62 | "algorithms": { 63 | "segmentation": [ 64 | "word" 65 | ] 66 | } 67 | } 68 | .EE 69 | .in 70 | .TP 71 | .BR UNI_SENTENCE 72 | Sentence boundaries are often used for triple-click or other methods of selecting or iterating through blocks of text that are larger than single words. 73 | They are also used to determine whether words occur within the same sentence in database queries. 74 | .IP 75 | Support for sentence break detection can be enabled in the JSON configuration file as shown below: 76 | .IP 77 | .in +4n 78 | .EX 79 | { 80 | "algorithms": { 81 | "segmentation": [ 82 | "sentence" 83 | ] 84 | } 85 | } 86 | .EE 87 | .in 88 | .SH SEE ALSO 89 | .BR uninormform (3) 90 | .SH AUTHOR 91 | .UR https://railgunlabs.com 92 | Railgun Labs 93 | .UE . 94 | .SH INTERNET RESOURCES 95 | The online documentation is published on the 96 | .UR https://railgunlabs.com/unicorn 97 | Railgun Labs website 98 | .UE . 99 | .SH LICENSING 100 | Unicorn is distributed with its end-user license agreement (EULA). 
101 | Please review the agreement for information on terms & conditions for accessing or otherwise using Unicorn and for a DISCLAIMER OF ALL WARRANTIES. 102 | -------------------------------------------------------------------------------- /src/charvec.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Unicorn - Embeddable Unicode Algorithms 3 | * Copyright (c) 2024-2025 Railgun Labs 4 | * 5 | * This software is dual-licensed: you can redistribute it and/or modify 6 | * it under the terms of the GNU General Public License version 3 as 7 | * published by the Free Software Foundation. For the terms of this 8 | * license, see . 9 | * 10 | * Alternatively, you can license this software under a proprietary 11 | * license, as set out in . 12 | */ 13 | 14 | #include "charvec.h" 15 | 16 | unistat uni_charvec_reserve(struct CharVec *buffer, unisize new_capacity) 17 | { 18 | unistat status = UNI_OK; 19 | 20 | // LCOV_EXCL_START 21 | assert(buffer != NULL); 22 | assert(new_capacity >= 0); 23 | // LCOV_EXCL_STOP 24 | 25 | if (new_capacity > buffer->capacity) 26 | { 27 | unichar *ptr; 28 | if (buffer->chars == buffer->scratch) 29 | { 30 | ptr = uni_malloc(sizeof(buffer->chars[0]) * (size_t)new_capacity); // cppcheck-suppress misra-c2012-11.5 ; MISRA 2012 Rule 11.5 - Pointer conversion required. 31 | if (ptr != NULL) 32 | { 33 | (void)memcpy(ptr, buffer->scratch, sizeof(buffer->chars[0]) * (size_t)buffer->capacity); 34 | } 35 | } 36 | else 37 | { 38 | ptr = uni_realloc(buffer->chars, // cppcheck-suppress misra-c2012-11.5 ; MISRA 2012 Rule 11.5 - Pointer conversion required. 
39 | sizeof(buffer->chars[0]) * (size_t)buffer->capacity, 40 | sizeof(buffer->chars[0]) * (size_t)new_capacity); 41 | } 42 | 43 | if (ptr == NULL) 44 | { 45 | uni_message("memory allocation failed"); 46 | status = UNI_NO_MEMORY; 47 | } 48 | else 49 | { 50 | buffer->chars = ptr; 51 | buffer->capacity = new_capacity; 52 | } 53 | } 54 | 55 | return status; 56 | } 57 | 58 | unistat uni_charvec_append(struct CharVec *buffer, const unichar *chars, unisize chars_count) 59 | { 60 | // LCOV_EXCL_START 61 | assert(buffer); 62 | assert(chars); 63 | // LCOV_EXCL_STOP 64 | 65 | const unistat status = uni_charvec_reserve(buffer, buffer->length + chars_count); 66 | if (status == UNI_OK) 67 | { 68 | (void)memcpy(&buffer->chars[buffer->length], chars, sizeof(buffer->chars[0]) * (size_t)chars_count); 69 | buffer->length += chars_count; 70 | } 71 | 72 | return status; 73 | } 74 | 75 | void uni_charvec_append_unsafe(struct CharVec *cb, const unichar *chars, unisize chars_count) 76 | { 77 | assert((cb->length + chars_count) <= cb->capacity); // LCOV_EXCL_BR_LINE 78 | (void)memcpy(&cb->chars[cb->length], chars, sizeof(chars[0]) * (size_t)chars_count); 79 | cb->length += chars_count; 80 | } 81 | 82 | void uni_charvec_reset(struct CharVec *buffer) 83 | { 84 | buffer->length = 0; 85 | } 86 | 87 | void uni_charvec_init(struct CharVec *buffer) 88 | { 89 | const size_t scratch_buffer_size = COUNT_OF(buffer->scratch); 90 | buffer->length = 0; 91 | buffer->capacity = (unisize)scratch_buffer_size; 92 | buffer->chars = buffer->scratch; 93 | } 94 | 95 | void uni_charvec_free(struct CharVec *buffer) 96 | { 97 | if (buffer->chars != buffer->scratch) 98 | { 99 | uni_free(buffer->chars, sizeof(buffer->chars[0]) * (size_t)buffer->capacity); 100 | } 101 | } 102 | 103 | void uni_charvec_remove(struct CharVec *buf, unisize i) 104 | { 105 | assert(buf->length >= 1); // LCOV_EXCL_BR_LINE 106 | const unisize shift = buf->length - (i + UNISIZE_C(1)); 107 | assert(shift >= 0); // LCOV_EXCL_BR_LINE: The math 
should never work out negative. 108 | (void)memmove(&buf->chars[i], &buf->chars[i + 1], sizeof(buf->chars[0]) * (size_t)shift); 109 | buf->length -= 1; 110 | } 111 | -------------------------------------------------------------------------------- /scripts/segmentation.py: -------------------------------------------------------------------------------- 1 | # Unicorn - Embeddable Unicode Algorithms 2 | # Copyright (c) 2024-2025 Railgun Labs 3 | # 4 | # This software is dual-licensed: you can redistribute it and/or modify 5 | # it under the terms of the GNU General Public License version 3 as 6 | # published by the Free Software Foundation. For the terms of this 7 | # license, see . 8 | # 9 | # Alternatively, you can license this software under a proprietary 10 | # license, as set out in . 11 | 12 | from typing import Set, Type 13 | import zipfile 14 | import json 15 | 16 | from .config import Config 17 | from .tools import Codespace 18 | from .feature import Feature 19 | from .basics import StorageSize, SerializationData 20 | 21 | max_states = 0 22 | 23 | class _Segmentation(Feature): 24 | def post_process(self, archive: zipfile.ZipFile, __: Codespace, ___: Config) -> SerializationData: 25 | gcb = archive.open('segmentation.json') 26 | data = json.loads(gcb.read()) 27 | header = data["header"] 28 | gcb.close() 29 | 30 | header += "#define MAX_BREAK_STATES {0}\n".format(max_states) 31 | 32 | public_header = "#define UNICORN_FEATURE_SEGMENTATION\n" 33 | return SerializationData(header=header, public_header=public_header) 34 | 35 | class GraphemeBreak(Feature): 36 | @staticmethod 37 | def dependencies() -> Set[Type[Feature]]: 38 | return set([_Segmentation]) 39 | 40 | def pre_process(self, codespace: Codespace) -> None: 41 | codespace.register_property("gcb", 0, StorageSize.UINT8) 42 | codespace.register_property("incb", 0, StorageSize.UINT8) 43 | 44 | def process(self, archive: zipfile.ZipFile, codespace: Codespace, _: Config) -> SerializationData: 45 | gcb = 
archive.open('gcb.json') 46 | data = json.loads(gcb.read()) 47 | header = data["header"] 48 | source = data["source"] 49 | size = data["size"] 50 | gcb.close() 51 | 52 | global max_states 53 | max_states = max(int(data["states"]), max_states) 54 | 55 | gcb_property = codespace.get("gcb") 56 | for d in data["gcb"]: 57 | codespace.set(d[0], gcb_property, d[1]) 58 | 59 | incb_property = codespace.get("incb") 60 | for d in data["incb"]: 61 | codespace.set(d[0], incb_property, d[1]) 62 | 63 | public_header = "#define UNICORN_FEATURE_GCB\n" 64 | return SerializationData(source, header, public_header, size) 65 | 66 | class WordBreak(Feature): 67 | @staticmethod 68 | def dependencies() -> Set[Type[Feature]]: 69 | return set([_Segmentation]) 70 | 71 | def pre_process(self, codespace: Codespace) -> None: 72 | codespace.register_property("wb", 0, StorageSize.UINT8) 73 | codespace.register_property("wbx", 0, StorageSize.UINT8) 74 | 75 | def process(self, archive: zipfile.ZipFile, codespace: Codespace, _: Config) -> SerializationData: 76 | wb = archive.open('wb.json') 77 | data = json.loads(wb.read()) 78 | header = data["header"] 79 | source = data["source"] 80 | size = data["size"] 81 | wb.close() 82 | 83 | global max_states 84 | max_states = max(int(data["states"]), max_states) 85 | 86 | wb_property = codespace.get("wb") 87 | for d in data["wb"]: 88 | codespace.set(d[0], wb_property, d[1]) 89 | 90 | wbx_property = codespace.get("wbx") 91 | for d in data["wbx"]: 92 | codespace.set(d[0], wbx_property, d[1]) 93 | 94 | public_header = "#define UNICORN_FEATURE_WB\n" 95 | return SerializationData(source, header, public_header, size) 96 | 97 | class SentenceBreak(Feature): 98 | @staticmethod 99 | def dependencies() -> Set[Type[Feature]]: 100 | return set([_Segmentation]) 101 | 102 | def pre_process(self, codespace: Codespace) -> None: 103 | codespace.register_property("sb", 0, StorageSize.UINT8) 104 | 105 | def process(self, archive: zipfile.ZipFile, codespace: Codespace, _: Config) 
-> SerializationData: 106 | wb = archive.open('sb.json') 107 | data = json.loads(wb.read()) 108 | header = data["header"] 109 | source = data["source"] 110 | size = data["size"] 111 | wb.close() 112 | 113 | global max_states 114 | max_states = max(int(data["states"]), max_states) 115 | 116 | wb_property = codespace.get("sb") 117 | for d in data["sb"]: 118 | codespace.set(d[0], wb_property, d[1]) 119 | 120 | public_header = "#define UNICORN_FEATURE_SB\n" 121 | return SerializationData(source, header, public_header, size) 122 | -------------------------------------------------------------------------------- /src/properties.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Unicorn - Embeddable Unicode Algorithms 3 | * Copyright (c) 2024-2025 Railgun Labs 4 | * 5 | * This software is dual-licensed: you can redistribute it and/or modify 6 | * it under the terms of the GNU General Public License version 3 as 7 | * published by the Free Software Foundation. For the terms of this 8 | * license, see . 9 | * 10 | * Alternatively, you can license this software under a proprietary 11 | * license, as set out in . 12 | */ 13 | 14 | #include "common.h" 15 | #include "unidata.h" 16 | 17 | UNICORN_API unigc uni_gc(unichar c) // cppcheck-suppress misra-c2012-8.7 ; This is supposed to have external linkage. 18 | { 19 | #if defined(UNICORN_FEATURE_GC) 20 | return (unigc)unicorn_get_codepoint_data(c)->general_category; // cppcheck-suppress premium-misra-c-2012-10.5 ; Intentional violation to unpack data from 32-bits to an enum. 21 | #else 22 | (void)c; 23 | return UNI_UNASSIGNED; 24 | #endif 25 | } 26 | 27 | UNICORN_API uint8_t uni_ccc(unichar c) // cppcheck-suppress misra-c2012-8.7 ; This is supposed to have external linkage. 
28 | { 29 | #if defined(UNICORN_FEATURE_CCC) 30 | return unicorn_get_codepoint_data(c)->canonical_combining_class; 31 | #else 32 | (void)c; 33 | return 0; 34 | #endif 35 | } 36 | 37 | UNICORN_API bool uni_is(unichar c, unibp p) // cppcheck-suppress misra-c2012-8.7 ; This is supposed to have external linkage. 38 | { 39 | #if defined(UNICORN_FEATURE_BINARY_PROPERTIES) 40 | const uint32_t binary_props = (uint32_t)unicorn_get_codepoint_data(c)->binary_properties; 41 | const uint32_t prop = (uint32_t)p; 42 | const uint32_t bit_mask = (uint32_t)1 << prop; 43 | return ((binary_props & bit_mask) == bit_mask) ? true : false; 44 | #else 45 | (void)c; 46 | (void)p; 47 | return false; 48 | #endif 49 | } 50 | 51 | UNICORN_API const char *uni_numval(unichar c) // cppcheck-suppress misra-c2012-8.7 ; This is supposed to have external linkage. 52 | { 53 | #if defined(UNICORN_FEATURE_NUMERIC_VALUE) 54 | const char *numval = NULL; 55 | const uint8_t index = unicorn_get_codepoint_data(c)->numeric_value_offset; 56 | if (index > (uint8_t)0) 57 | { 58 | assert((size_t)index < COUNT_OF(unicorn_numeric_values)); // LCOV_EXCL_BR_LINE 59 | numval = unicorn_numeric_values[index]; 60 | } 61 | return numval; 62 | #else 63 | (void)c; 64 | return NULL; 65 | #endif 66 | } 67 | 68 | UNICORN_API unichar uni_tolower(unichar c) 69 | { 70 | #if defined(UNICORN_FEATURE_SIMPLE_LOWERCASE_MAPPINGS) 71 | unichar lower; 72 | const uint16_t mapping = unicorn_get_character_casing(c)->simple_lowercase_mapping; 73 | const uint16_t index = GET_CASED_VALUE(mapping); 74 | if (CASING_IN_TABLE(mapping)) 75 | { 76 | assert((size_t)index < COUNT_OF(unicorn_simple_case_mappings)); // LCOV_EXCL_BR_LINE 77 | lower = unicorn_simple_case_mappings[index]; 78 | } 79 | else 80 | { 81 | const int32_t diff = (int32_t)index; 82 | const int32_t cp = (int32_t)c - (diff - CASING_DIFF); 83 | lower = (unichar)cp; 84 | } 85 | return lower; 86 | #else 87 | return c; 88 | #endif 89 | } 90 | 91 | UNICORN_API unichar uni_toupper(unichar c) 
92 | { 93 | #if defined(UNICORN_FEATURE_SIMPLE_UPPERCASE_MAPPINGS) 94 | unichar upper; 95 | const uint16_t mapping = unicorn_get_character_casing(c)->simple_uppercase_mapping; 96 | const uint16_t index = GET_CASED_VALUE(mapping); 97 | if (CASING_IN_TABLE(mapping)) 98 | { 99 | assert((size_t)index < COUNT_OF(unicorn_simple_case_mappings)); // LCOV_EXCL_BR_LINE 100 | upper = unicorn_simple_case_mappings[index]; 101 | } 102 | else 103 | { 104 | const int32_t diff = (int32_t)index; 105 | const int32_t cp = (int32_t)c - (diff - CASING_DIFF); 106 | upper = (unichar)cp; 107 | } 108 | return upper; 109 | #else 110 | return c; 111 | #endif 112 | } 113 | 114 | UNICORN_API unichar uni_totitle(unichar c) 115 | { 116 | #if defined(UNICORN_FEATURE_SIMPLE_TITLECASE_MAPPINGS) 117 | unichar title; 118 | const uint16_t mapping = unicorn_get_character_casing(c)->simple_titlecase_mapping; 119 | const uint16_t index = GET_CASED_VALUE(mapping); 120 | if (CASING_IN_TABLE(mapping)) 121 | { 122 | assert((size_t)index < COUNT_OF(unicorn_simple_case_mappings)); // LCOV_EXCL_BR_LINE 123 | title = unicorn_simple_case_mappings[index]; 124 | } 125 | else 126 | { 127 | const int32_t diff = (int32_t)index; 128 | const int32_t cp = (int32_t)c - (diff - CASING_DIFF); 129 | title = (unichar)cp; 130 | } 131 | return title; 132 | #else 133 | return c; 134 | #endif 135 | } 136 | -------------------------------------------------------------------------------- /examples/character_properties.c: -------------------------------------------------------------------------------- 1 | /* 2 | * This file is public domain. 3 | * You may use, modify, and distribute it without restriction. 4 | */ 5 | 6 | // Examples are also documented online at: 7 | // . 
8 | 9 | #include 10 | #include 11 | 12 | static const char *const general_categories[] = { 13 | [UNI_UPPERCASE_LETTER] = "Uppercase_Letter", 14 | [UNI_LOWERCASE_LETTER] = "Lowercase_Letter", 15 | [UNI_TITLECASE_LETTER] = "Titlecase_Letter", 16 | [UNI_MODIFIER_LETTER] = "Modifier_Letter", 17 | [UNI_OTHER_LETTER] = "Other_Letter", 18 | [UNI_NONSPACING_MARK] = "Nonspacing_Mark", 19 | [UNI_SPACING_MARK] = "Spacing_Mark", 20 | [UNI_ENCLOSING_MARK] = "Enclosing_Mark", 21 | [UNI_DECIMAL_NUMBER] = "Decimal_Number", 22 | [UNI_LETTER_NUMBER] = "Letter_Number", 23 | [UNI_OTHER_NUMBER] = "Other_Number", 24 | [UNI_CONNECTOR_PUNCTUATION] = "Connector_Punctuation", 25 | [UNI_DASH_PUNCTUATION] = "Dash_Punctuation", 26 | [UNI_OPEN_PUNCTUATION] = "Open_Punctuation", 27 | [UNI_CLOSE_PUNCTUATION] = "Close_Punctuation", 28 | [UNI_INITIAL_PUNCTUATION] = "Initial_Punctuation", 29 | [UNI_FINAL_PUNCTUATION] = "Final_Punctuation", 30 | [UNI_OTHER_PUNCTUATION] = "Other_Punctuation", 31 | [UNI_MATH_SYMBOL] = "Math_Symbol", 32 | [UNI_CURRENCY_SYMBOL] = "Currency_Symbol", 33 | [UNI_MODIFIER_SYMBOL] = "Modifier_Symbol", 34 | [UNI_OTHER_SYMBOL] = "Other_Symbol", 35 | [UNI_SPACE_SEPARATOR] = "Space_Separator", 36 | [UNI_LINE_SEPARATOR] = "Line_Separator", 37 | [UNI_PARAGRAPH_SEPARATOR] = "Paragraph_Separator", 38 | [UNI_CONTROL] = "Control", 39 | [UNI_FORMAT] = "Format", 40 | [UNI_SURROGATE] = "Surrogate", 41 | [UNI_PRIVATE_USE] = "Private_Use", 42 | [UNI_UNASSIGNED] = "Unassigned", 43 | }; 44 | 45 | static const char *const binary_properties[] = { 46 | [UNI_NONCHARACTER_CODE_POINT] = "Noncharacter_Code_Point", 47 | [UNI_ALPHABETIC] = "Alphabetic", 48 | [UNI_LOWERCASE] = "Lowercase", 49 | [UNI_UPPERCASE] = "Uppercase", 50 | [UNI_HEX_DIGIT] = "Hex_Digit", 51 | [UNI_WHITE_SPACE] = "White_Space", 52 | [UNI_MATH] = "Math", 53 | [UNI_DASH] = "Dash", 54 | [UNI_DIACRITIC] = "Diacritic", 55 | [UNI_EXTENDER] = "Extender", 56 | [UNI_IDEOGRAPHIC] = "Ideographic", 57 | [UNI_QUOTATION_MARK] = "Quotation_Mark", 
58 | [UNI_UNIFIED_IDEOGRAPH] = "Unified_Ideograph", 59 | [UNI_TERMINAL_PUNCTUATION] = "Terminal_Punctuation", 60 | }; 61 | 62 | int main(int argc, char *argv[]) 63 | { 64 | // This code example prints character properties associated with 65 | // a Unicode code point. The definition of each character property 66 | // is documented here: 67 | // . 68 | 69 | // The following variable defines the Unicode character whose 70 | // properties will be printed. In this example, it's Unicode 71 | // character U+0300. Change this value to a different character 72 | // and notice how the printed properties change. 73 | unichar cp = 0x300; 74 | 75 | unichar tmp_cp; 76 | uint8_t ccc; 77 | const char *numeric_value; 78 | 79 | printf("Code point: U+%04X\n", cp); 80 | 81 | // Print the General_Category property value. 82 | printf("General category: %s\n", general_categories[uni_gc(cp)]); 83 | 84 | // Print the Canonical_Combining_Class property value (if non-zero). 85 | ccc = uni_ccc(cp); 86 | if (ccc != 0) 87 | { 88 | printf("Canonical combining class: %d\n", ccc); 89 | } 90 | 91 | // Print each binary property assigned to the code point. 92 | for (int i = 0; i < sizeof(binary_properties) / sizeof(binary_properties[0]); i++) 93 | { 94 | if (uni_is(cp, i)) 95 | { 96 | printf("Binary property: %s\n", binary_properties[i]); 97 | } 98 | } 99 | 100 | // Print the Numeric_Value property value (if present). 101 | numeric_value = uni_numval(cp); 102 | if (numeric_value != NULL) 103 | { 104 | printf("Numeric value: %s\n", numeric_value); 105 | } 106 | 107 | // Print the Simple_Lowercase_Mapping (if unique). 108 | tmp_cp = uni_tolower(cp); 109 | if (tmp_cp != cp) 110 | { 111 | printf("Simple lower case mapping: U+%04X\n", tmp_cp); 112 | } 113 | 114 | // Print the Simple_Uppercase_Mapping (if unique). 
115 | tmp_cp = uni_toupper(cp); 116 | if (tmp_cp != cp) 117 | { 118 | printf("Simple upper case mapping: U+%04X\n", tmp_cp); 119 | } 120 | 121 | // Print the Simple_Titlecase_Mapping (if unique). 122 | tmp_cp = uni_totitle(cp); 123 | if (tmp_cp != cp) 124 | { 125 | printf("Simple title case mapping: U+%04X\n", tmp_cp); 126 | } 127 | 128 | return 0; 129 | } 130 | -------------------------------------------------------------------------------- /src/unicorn.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Unicorn - Embeddable Unicode Algorithms 3 | * Copyright (c) 2021-2025 Railgun Labs 4 | * 5 | * THE CONTENTS OF THIS PROJECT ARE PROPRIETARY AND CONFIDENTIAL. UNAUTHORIZED 6 | * COPYING, TRANSFERRING OR REPRODUCTION OF THE CONTENTS OF THIS PROJECT, VIA 7 | * ANY MEDIUM IS STRICTLY PROHIBITED. 8 | * 9 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 10 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 11 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 12 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 13 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 14 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 15 | * SOFTWARE. 
16 | */ 17 | 18 | #include "charbuf.h" 19 | #include "unidata.h" 20 | 21 | #if defined(_MSC_VER) 22 | #include 23 | #endif 24 | 25 | static inline int32_t uni_popcnt(uint32_t n) 26 | { 27 | #if defined(_MSC_VER) 28 | return (int32_t)__popcnt(n); // cppcheck-suppress premium-misra-c-2012-17.3 29 | #elif defined(__GNUC__) || defined(__clang__) 30 | return (int32_t)__builtin_popcount(n); // cppcheck-suppress premium-misra-c-2012-17.3 31 | #else 32 | int32_t popcnt = 0; 33 | for (uint32_t v = (uint32_t)n; v != 0u; v >>= 1u) 34 | { 35 | if ((v & 1u) == 1u) 36 | { 37 | popcnt += 1; 38 | } 39 | } 40 | return popcnt; 41 | #endif 42 | } 43 | 44 | static unistat check_encoding(uniattr *encoding) 45 | { 46 | unistat status = UNI_OK; 47 | const uniattr enc = *encoding; 48 | uint32_t flags; 49 | 50 | // Only one or none of the endian flags can be provided. 51 | uniattr mask = enc & (UNI_LITTLE | UNI_BIG); 52 | flags = (uint32_t)mask; 53 | switch (uni_popcnt(flags)) 54 | { 55 | case 0: // No endian flags was provided; default to native byte order. 56 | if ((GET_ENCODING(enc) & (UNI_UTF16 | UNI_UTF32)) != (uniattr)0) 57 | { 58 | #if defined(UNICORN_BIG_ENDIAN) 59 | (*encoding) |= UNI_BIG; 60 | #else 61 | (*encoding) |= UNI_LITTLE; 62 | #endif 63 | } 64 | break; 65 | case 1: // Exactly one endian flag was provided (good). 66 | break; 67 | default: // Too many endian flags were provided (bad). 68 | uni_message("too many endian flags (must specify exactly one)"); 69 | status = UNI_BAD_OPERATION; 70 | break; 71 | } 72 | 73 | if (status == UNI_OK) 74 | { 75 | // Make sure exactly ONE text encoding flag was provided. 
76 | mask = GET_ENCODING(enc); 77 | flags = (uint32_t)mask; 78 | if (uni_popcnt(flags) != 1) 79 | { 80 | uni_message("expected exactly one encoding form"); 81 | status = UNI_BAD_OPERATION; 82 | } 83 | } 84 | 85 | return status; 86 | } 87 | 88 | unistat uni_check_input_encoding(const void *text, unisize length, uniattr *encoding) 89 | { 90 | unistat status = UNI_OK; 91 | (void)length; 92 | 93 | // Only destination buffers can use the null terminated flag. 94 | if (((*encoding) & UNI_NULIFY) == UNI_NULIFY) 95 | { 96 | uni_message("input buffer incompatible with 'UNI_NULIFY' flag"); 97 | status = UNI_BAD_OPERATION; 98 | } 99 | else if (text == NULL) 100 | { 101 | uni_message("input buffer is null"); 102 | status = UNI_BAD_OPERATION; 103 | } 104 | else 105 | { 106 | status = check_encoding(encoding); 107 | } 108 | 109 | return status; 110 | } 111 | 112 | unistat uni_check_output_encoding(const void *buffer, const unisize *capacity, uniattr *encoding) 113 | { 114 | unistat status = UNI_OK; 115 | 116 | // Only input buffers can use the trusted. 117 | if (((*encoding) & UNI_TRUST) == UNI_TRUST) 118 | { 119 | uni_message("output buffer incompatible with 'UNI_TRUST' flag"); 120 | status = UNI_BAD_OPERATION; 121 | } 122 | else if (capacity == NULL) 123 | { 124 | uni_message("output buffer capacity is null"); 125 | status = UNI_BAD_OPERATION; 126 | } 127 | else if (*capacity < 0) 128 | { 129 | uni_message("output buffer capacity is negative"); 130 | status = UNI_BAD_OPERATION; 131 | } 132 | else if ((buffer == NULL) && (*capacity > 0)) 133 | { 134 | uni_message("output buffer cannot be null if its capacity is non-zero"); 135 | status = UNI_BAD_OPERATION; 136 | } 137 | else 138 | { 139 | status = check_encoding(encoding); 140 | } 141 | 142 | return status; 143 | } 144 | 145 | UNICORN_API void uni_getversion(int32_t *major, int32_t *minor, int32_t *patch) // cppcheck-suppress misra-c2012-8.7 ; This is supposed to have external linkage. 
146 | { 147 | if (major != NULL) 148 | { 149 | *major = 1; 150 | } 151 | 152 | if (minor != NULL) 153 | { 154 | *minor = 0; 155 | } 156 | 157 | if (patch != NULL) 158 | { 159 | *patch = 0; 160 | } 161 | } 162 | 163 | UNICORN_API void uni_getucdversion(int32_t *major, int32_t *minor, int32_t *patch) // cppcheck-suppress misra-c2012-8.7 ; This is supposed to have external linkage. 164 | { 165 | if (major != NULL) 166 | { 167 | *major = UNICODE_VERSION_MAJOR; 168 | } 169 | 170 | if (minor != NULL) 171 | { 172 | *minor = UNICODE_VERSION_MINOR; 173 | } 174 | 175 | if (patch != NULL) 176 | { 177 | *patch = UNICODE_VERSION_PATCH; 178 | } 179 | } 180 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.22) 2 | project(unicorn VERSION 1.3.1 LANGUAGES C) 3 | 4 | include(CheckIncludeFile) 5 | include(GNUInstallDirs) 6 | 7 | option(UNICORN_BUILD_STATIC "Build Unicorn as a static library" ON) 8 | option(UNICORN_BUILD_SHARED "Build Unicorn as a dynamic library" ON) 9 | option(UNICORN_BUILD_EXAMPLES "Build Unicorn code examples (requires the shared library)" OFF) 10 | 11 | option(UNICORN_BUILD_TESTS "Build the tests (commercial licensees only)" OFF) 12 | option(UNICORN_BUILD_COOKBOOK "Build the documentation/cookbook programs (commercial licensees only)" OFF) 13 | 14 | option(UNICORN_CODE_COVERAGE "Toggle code coverage" OFF) 15 | option(UNICORN_UNDEFINED_BEHAVIOR_SANITIZER "Toggle undefined behavior sanitizer" OFF) 16 | option(UNICORN_ADDRESS_SANITIZER "Toggle address sanitizer" OFF) 17 | option(UNICORN_MEMORY_SANITIZER "Toggle memory sanitizer" OFF) 18 | 19 | if (NOT UNICORN_BUILD_STATIC AND NOT UNICORN_BUILD_SHARED) 20 | message(FATAL_ERROR "Please build either the static or shared library.") 21 | endif () 22 | 23 | set(UNICORN_CONFIG "features.json" CACHE STRING "Path to a JSON file that configures the Unicorn 
feature set.") 24 | if (UNICORN_CONFIG STREQUAL "") 25 | message(FATAL_ERROR "please specify a unicorn configuration file (must be a JSON file)") 26 | endif() 27 | 28 | if (MSVC) 29 | # The Microsoft compiler assumes source is ANSI-encoded, which depends on the localized version of Windows in use 30 | # On U.S. and Western European Windows the encoding is assumed to be Windows-1252. Adding the following compiler 31 | # switch forces UTF-8. 32 | add_compile_options(/utf-8) 33 | endif () 34 | 35 | if (UNICORN_CODE_COVERAGE) 36 | if (NOT CMAKE_C_COMPILER_ID STREQUAL "GNU") 37 | message(FATAL_ERROR "Code coverage can only be generated with GCC.") 38 | endif () 39 | # You can't have sanitizers enabled with coverage checks. 40 | elseif (UNICORN_UNDEFINED_BEHAVIOR_SANITIZER) 41 | add_compile_options(-g -fsanitize=undefined) 42 | add_link_options(-fsanitize=undefined) 43 | elseif (UNICORN_ADDRESS_SANITIZER) 44 | add_compile_options(-g -fsanitize=address) 45 | add_link_options(-fsanitize=address) 46 | elseif (UNICORN_MEMORY_SANITIZER) 47 | add_compile_options(-O0 -g -fsanitize=memory -fsanitize-memory-track-origins) 48 | add_link_options(-fsanitize=memory -fsanitize-memory-track-origins) 49 | endif () 50 | 51 | # Check for required C standard library headers. 52 | check_include_file(string.h HAVE_STRING_H) 53 | if (NOT HAVE_STRING_H) 54 | message(FATAL_ERROR "could not find required header") 55 | endif () 56 | 57 | check_include_file(assert.h HAVE_ASSERT_H) 58 | if (NOT HAVE_ASSERT_H) 59 | message(FATAL_ERROR "could not find required header") 60 | endif () 61 | 62 | check_include_file(stdbool.h HAVE_STDBOOL_H) 63 | if (NOT HAVE_STDBOOL_H) 64 | message(FATAL_ERROR "could not find required header") 65 | endif () 66 | 67 | # Python is required to generate the Unicode data tables. 68 | find_package(Python3 3.10.0 REQUIRED COMPONENTS Interpreter) 69 | 70 | # Execute the Python Unicode data generators. 
71 | file(GLOB_RECURSE PYTHON_SOURCES "scripts/${PYTHON_SOURCES}*.py") 72 | set(GENERATED_FILES 73 | "${CMAKE_BINARY_DIR}/unidata.c" 74 | "${CMAKE_BINARY_DIR}/unidata.h" 75 | "${CMAKE_BINARY_DIR}/_unicorn.h" 76 | ) 77 | add_custom_command( 78 | OUTPUT ${GENERATED_FILES} 79 | COMMAND Python3::Interpreter -m scripts --config "${UNICORN_CONFIG}" --output "${CMAKE_BINARY_DIR}" 80 | WORKING_DIRECTORY ${CMAKE_SOURCE_DIR} 81 | DEPENDS unicode.bin ${PYTHON_SOURCES} ${UNICORN_CONFIG}) 82 | add_custom_target(unicorn_generate DEPENDS ${GENERATED_FILES}) 83 | 84 | # Mark the following source files as being generated at build time so CMake doesn't try to find them. 85 | set_source_files_properties(${CMAKE_BINARY_DIR}/unidata.c PROPERTIES GENERATED 1) 86 | set_source_files_properties(${CMAKE_BINARY_DIR}/unidata.h PROPERTIES GENERATED 1) 87 | set_source_files_properties(${CMAKE_BINARY_DIR}/_unicorn.h PROPERTIES GENERATED 1) 88 | 89 | include_directories(${CMAKE_BINARY_DIR}) # For finding the generated files. 90 | include_directories(include) # For finding the public header files. 91 | include_directories(src) # For finding internal header files. 92 | 93 | # Add the Unicorn library. 94 | add_subdirectory(src) 95 | 96 | # Add the examples, if requested. 97 | if (UNICORN_BUILD_EXAMPLES) 98 | add_subdirectory(examples) 99 | endif () 100 | 101 | # Generate version information. 102 | include(CMakePackageConfigHelpers) 103 | write_basic_package_version_file(${CMAKE_BINARY_DIR}/UnicornConfigVersion.cmake COMPATIBILITY SameMajorVersion) 104 | 105 | # Pre-process pkg-config and UnicornConfig.cmake files. 
106 | set(prefix "${CMAKE_INSTALL_PREFIX}") 107 | set(exec_prefix "\${prefix}") 108 | set(libdir "\${exec_prefix}/${CMAKE_INSTALL_LIBDIR}") 109 | set(includedir "\${prefix}/${CMAKE_INSTALL_INCLUDEDIR}") 110 | set(VERSION ${PROJECT_VERSION}) 111 | configure_file("${CMAKE_SOURCE_DIR}/UnicornConfig.cmake.in" "${CMAKE_BINARY_DIR}/UnicornConfig.cmake" @ONLY) 112 | configure_file("${CMAKE_SOURCE_DIR}/unicorn.pc.in" "${CMAKE_BINARY_DIR}/unicorn.pc" @ONLY) 113 | 114 | # Only install man pages when running "make install" locally. 115 | add_subdirectory(man) 116 | 117 | # This is the "make install" action. 118 | install(FILES ${CMAKE_BINARY_DIR}/UnicornConfig.cmake DESTINATION cmake) 119 | install(FILES ${CMAKE_BINARY_DIR}/UnicornConfigVersion.cmake DESTINATION cmake) 120 | install(FILES ${CMAKE_BINARY_DIR}/unicorn.pc DESTINATION ${CMAKE_INSTALL_LIBDIR}/pkgconfig) 121 | 122 | # Add the unit tests if requested (commercial users only). 123 | if (UNICORN_BUILD_TESTS) 124 | enable_testing() 125 | include(CTest) 126 | add_subdirectory(../tests tests) 127 | endif () 128 | 129 | if (UNICORN_BUILD_COOKBOOK) 130 | add_subdirectory(../docs docs) 131 | endif () 132 | --------------------------------------------------------------------------------