├── scripts
    ├── __init__.py
    ├── Makefile.am
    ├── version.py
    ├── compression.py
    ├── basics.py
    ├── ccc.py
    ├── feature.py
    ├── general_category.py
    ├── encoding.py
    ├── numeric_value.py
    ├── collation.py
    ├── decomposition.py
    ├── composition.py
    ├── quickcheck.py
    ├── case_folding.py
    ├── case_conversion.py
    └── segmentation.py
├── .gitattributes
├── include
    └── Makefile.am
├── unicode.bin
├── examples
    ├── .gitignore
    ├── CMakeLists.txt
    ├── compress_text.c
    ├── decode_utf8.c
    ├── decode_utf16.c
    ├── decode_utf32.c
    ├── segment_text.c
    ├── Makefile.am
    ├── case_convert.c
    ├── encode_character.c
    ├── case_fold.c
    ├── compare_text.c
    ├── collate_text.c
    └── character_properties.c
├── Makefile.am
├── unicorn.pc.in
├── tests
    └── README.md
├── autogen.sh
├── .github
    ├── pull_request_template.md
    ├── ISSUE_TEMPLATE
    │   └── config.yml
    └── workflows
    │   └── build.yml
├── man
    ├── unisize.3
    ├── uni_utf8.3
    ├── uni_utf32.3
    ├── uni_utf16.3
    ├── unicasefold.3
    ├── unicaseconv.3
    ├── uni_scalar.3
    ├── unierrfunc.3
    ├── uni_big.3
    ├── uni_little.3
    ├── Makefile.am
    ├── uni_native.3
    ├── uni_is.3
    ├── CMakeLists.txt
    ├── uni_getversion.3
    ├── uni_ccc.3
    ├── uni_tolower.3
    ├── uni_toupper.3
    ├── uni_totitle.3
    ├── uni_getucdversion.3
    ├── uni_trust.3
    ├── unimemfunc.3
    ├── uni_normchk.3
    ├── uninormform.3
    ├── unichar.3
    ├── uni_gc.3
    ├── uni_prevbrk.3
    ├── uni_validate.3
    ├── uni_nulify.3
    ├── uninormchk.3
    ├── uni_caseconvchk.3
    ├── uni_numval.3
    ├── unistrength.3
    ├── uni_casefoldchk.3
    ├── uniweighting.3
    ├── uni_seterrfunc.3
    ├── uni_prev.3
    ├── uni_compress.3
    ├── uni_nextbrk.3
    ├── uni_decompress.3
    ├── uni_casefoldcmp.3
    ├── uni_normqchk.3
    ├── uniattr.3
    ├── uni_sortkeycmp.3
    ├── uni_setmemfunc.3
    ├── uni_next.3
    ├── uni_caseconv.3
    ├── unistat.3
    ├── uni_norm.3
    ├── uni_convert.3
    ├── uni_encode.3
    ├── uni_casefold.3
    ├── uni_sortkeymk.3
    ├── uni_collate.3
    ├── uni_normcmp.3
    └── unibreak.3
├── src
    ├── logger.c
    ├── Makefile.am
    ├── ducet.h
    ├── normalize.h
    ├── charvec.h
    ├── charbuf.h
    ├── byteswap.h
    ├── memory.c
    ├── common.h
    ├── CMakeLists.txt
    ├── charvec.c
    ├── properties.c
    └── unicorn.c
├── .gitignore
├── features.json
├── configure.ac
├── UnicornConfig.cmake.in
└── CMakeLists.txt


/scripts/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/.gitattributes:
--------------------------------------------------------------------------------
1 | * text=auto eol=lf
2 | 


--------------------------------------------------------------------------------
/include/Makefile.am:
--------------------------------------------------------------------------------
1 | include_HEADERS = unicorn.h
2 | 


--------------------------------------------------------------------------------
/scripts/Makefile.am:
--------------------------------------------------------------------------------
1 | # Distribute every Python script in this directory. The glob is anchored at
2 | # $(srcdir) because $(wildcard) is expanded relative to the build directory,
3 | # which contains no .py files in an out-of-tree (VPATH) build.
4 | EXTRA_DIST = $(wildcard $(srcdir)/*.py)
5 | 


--------------------------------------------------------------------------------
/unicode.bin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/railgunlabs/unicorn/HEAD/unicode.bin


--------------------------------------------------------------------------------
/examples/.gitignore:
--------------------------------------------------------------------------------
 1 | caseconv
 2 | casefold
 3 | charprops
 4 | collate
 5 | compare
 6 | compress
 7 | decode-*
 8 | encode-*
 9 | segmentation
10 | 


--------------------------------------------------------------------------------
/Makefile.am:
--------------------------------------------------------------------------------
1 | SUBDIRS = man src include examples scripts
2 | 
3 | EXTRA_DIST = unicode.bin features.json autogen.sh LICENSE CMakeLists.txt UnicornConfig.cmake.in README.md
4 | 
5 | AUTOMAKE_OPTIONS = subdir-objects
6 | 
7 | pkgconfigdir = $(libdir)/pkgconfig
8 | pkgconfig_DATA = unicorn.pc
9 | 


--------------------------------------------------------------------------------
/unicorn.pc.in:
--------------------------------------------------------------------------------
 1 | # pkg-config metadata template for the unicorn library.
 2 | # NOTE(review): the placeholders wrapped in at-signs are presumably substituted at configure time; confirm in configure.ac.
 3 | prefix=@prefix@
 4 | exec_prefix=@exec_prefix@
 5 | libdir=@libdir@
 6 | includedir=@includedir@
 7 | 
 8 | Name: unicorn
 9 | Description: Embeddable Unicode algorithms.
10 | Version: @VERSION@
11 | URL: https://railgunlabs.com/unicorn/
12 | 
13 | Cflags: -I${includedir}
14 | Libs: -L${libdir} -lunicorn
15 | 


--------------------------------------------------------------------------------
/tests/README.md:
--------------------------------------------------------------------------------
1 | # Looking for Tests?
2 | 
3 | Unicorn is extensively tested; however, the tests and the code for generating the Unicode® data tables are **not** open source.
4 | Both are exclusively available to commercial licensees.
5 | 
6 | See [https://railgunlabs.com/unicorn/license/](https://railgunlabs.com/unicorn/license/) for licensing details.
7 | 


--------------------------------------------------------------------------------
/autogen.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/sh
 2 | 
 3 | echo "Generating build information using autoreconf"
 4 | echo "This may take a while ..."
 5 | 
 6 | if hash autoreconf 2>/dev/null; then
 7 |     autoreconf --verbose --install --force
 8 |     echo "Now you are ready to run ./configure"
 9 | else
10 |     echo "Couldn't find autoreconf, aborting"
11 |     echo "Please install autoreconf and try again"
12 | fi
13 | 


--------------------------------------------------------------------------------
/.github/pull_request_template.md:
--------------------------------------------------------------------------------
1 | <!--------------------------------- READ ME FIRST ---------------------------------
2 |   Please do not open a pull request. The pull request tab is enabled because GitHub
3 |   does not provide maintainers a mechanism to disable it.
4 | 
5 |   Please submit your contribution to: https://railgunlabs.com/contribute/
6 |   --------------------------------------------------------------------------------->
7 | 


--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/config.yml:
--------------------------------------------------------------------------------
 1 | blank_issues_enabled: false
 2 | contact_links:
 3 |   - name: 📚 Documentation
 4 |     url: https://railgunlabs.com/unicorn/manual
 5 |     about: Review documentation and code examples.
 6 |   - name: 🔹 Basic Support
 7 |     url: https://railgunlabs.com/contribute
 8 |     about: Submit unsolicited bug reports and code contributions.
 9 |   - name: 💎 Premium Support
10 |     url: https://railgunlabs.com/services
11 |     about: Priority support for bug reports and personal Q&A assistance.
12 | 


--------------------------------------------------------------------------------
/scripts/version.py:
--------------------------------------------------------------------------------
 1 | #  Unicorn - Embeddable Unicode Algorithms
 2 | #  Copyright (c) 2024-2025 Railgun Labs
 3 | #
 4 | #  This software is dual-licensed: you can redistribute it and/or modify
 5 | #  it under the terms of the GNU General Public License version 3 as
 6 | #  published by the Free Software Foundation. For the terms of this
 7 | #  license, see <https://www.gnu.org/licenses/>.
 8 | #
 9 | #  Alternatively, you can license this software under a proprietary
10 | #  license, as set out in <https://railgunlabs.com/unicorn/license/>.
11 | 
12 | # Unicode version string in "major.minor.update" form.
13 | # NOTE(review): presumably the Unicode Character Database release the generator targets; confirm against the callers.
14 | UNICODE_VERSION = "17.0.0"
15 | 


--------------------------------------------------------------------------------
/examples/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | # Example programs: one executable per source file, each linked against the
 2 | # 'unicorn' library.
 3 | #
 4 | # The dependency is attached to every target with target_link_libraries()
 5 | # instead of the directory-scoped link_libraries(), so it cannot leak into
 6 | # targets that are added to this directory later.
 7 | foreach(example IN ITEMS
 8 |     case_convert
 9 |     case_fold
10 |     character_properties
11 |     collate_text
12 |     compare_text
13 |     decode_utf8
14 |     decode_utf16
15 |     decode_utf32
16 |     encode_character
17 |     segment_text
18 |     compress_text
19 | )
20 |     add_executable(example_${example} ${example}.c)
21 |     target_link_libraries(example_${example} PRIVATE unicorn)
22 | endforeach()
23 | 


--------------------------------------------------------------------------------
/man/unisize.3:
--------------------------------------------------------------------------------
 1 | .TH "UNICORN" "3" "Dec 6th 2025" "Unicorn 1.3.1"
 2 | .SH NAME
 3 | unisize \- code unit count
 4 | .SH LIBRARY
 5 | Embeddable Unicode Algorithms (libunicorn, -lunicorn)
 6 | .SH SYNOPSIS
 7 | .nf
 8 | .B #include <unicorn.h>
 9 | .PP
10 | .BI "typedef int32_t unisize;"
11 | .fi
12 | .SH DESCRIPTION
13 | An integer type whose storage size represents the maximum potential length (in code units) of encoded text.
14 | .SH AUTHOR
15 | .UR https://railgunlabs.com
16 | Railgun Labs
17 | .UE .
18 | .SH INTERNET RESOURCES
19 | The online documentation is published on the
20 | .UR https://railgunlabs.com/unicorn
21 | Railgun Labs website
22 | .UE .
23 | .SH LICENSING
24 | Unicorn is distributed with its end-user license agreement (EULA).
25 | Please review the agreement for information on terms & conditions for accessing or otherwise using Unicorn and for a DISCLAIMER OF ALL WARRANTIES.
26 | 


--------------------------------------------------------------------------------
/man/uni_utf8.3:
--------------------------------------------------------------------------------
 1 | .TH "UNICORN" "3" "Dec 6th 2025" "Unicorn 1.3.1"
 2 | .SH NAME
 3 | UNI_UTF8 \- UTF-8 encoding form
 4 | .SH LIBRARY
 5 | Embeddable Unicode Algorithms (libunicorn, -lunicorn)
 6 | .SH SYNOPSIS
 7 | .nf
 8 | .B #include <unicorn.h>
 9 | .PP
10 | .B #define UNI_UTF8 0x2u
11 | .fi
12 | .SH DESCRIPTION
13 | Text attribute bit flag that indicates the text is encoded as UTF-8.
14 | The implementation uses an unsigned 8-bit integer (\f[C]uint8_t\f[R]) to represent UTF-8 code units.
15 | .SH AUTHOR
16 | .UR https://railgunlabs.com
17 | Railgun Labs
18 | .UE .
19 | .SH INTERNET RESOURCES
20 | The online documentation is published on the
21 | .UR https://railgunlabs.com/unicorn
22 | Railgun Labs website
23 | .UE .
24 | .SH LICENSING
25 | Unicorn is distributed with its end-user license agreement (EULA).
26 | Please review the agreement for information on terms & conditions for accessing or otherwise using Unicorn and for a DISCLAIMER OF ALL WARRANTIES.
27 | 


--------------------------------------------------------------------------------
/man/uni_utf32.3:
--------------------------------------------------------------------------------
 1 | .TH "UNICORN" "3" "Dec 6th 2025" "Unicorn 1.3.1"
 2 | .SH NAME
 3 | UNI_UTF32 \- UTF-32 encoding form
 4 | .SH LIBRARY
 5 | Embeddable Unicode Algorithms (libunicorn, -lunicorn)
 6 | .SH SYNOPSIS
 7 | .nf
 8 | .B #include <unicorn.h>
 9 | .PP
10 | .B #define UNI_UTF32 0x8u
11 | .fi
12 | .SH DESCRIPTION
13 | Text attribute bit flag that indicates the text is encoded as UTF-32.
14 | The implementation uses an unsigned 32-bit integer (\f[C]uint32_t\f[R]) to represent UTF-32 code units.
15 | .SH AUTHOR
16 | .UR https://railgunlabs.com
17 | Railgun Labs
18 | .UE .
19 | .SH INTERNET RESOURCES
20 | The online documentation is published on the
21 | .UR https://railgunlabs.com/unicorn
22 | Railgun Labs website
23 | .UE .
24 | .SH LICENSING
25 | Unicorn is distributed with its end-user license agreement (EULA).
26 | Please review the agreement for information on terms & conditions for accessing or otherwise using Unicorn and for a DISCLAIMER OF ALL WARRANTIES.
27 | 


--------------------------------------------------------------------------------
/man/uni_utf16.3:
--------------------------------------------------------------------------------
 1 | .TH "UNICORN" "3" "Dec 6th 2025" "Unicorn 1.3.1"
 2 | .SH NAME
 3 | UNI_UTF16 \- UTF-16 encoding form
 4 | .SH LIBRARY
 5 | Embeddable Unicode Algorithms (libunicorn, -lunicorn)
 6 | .SH SYNOPSIS
 7 | .nf
 8 | .B #include <unicorn.h>
 9 | .PP
10 | .B #define UNI_UTF16 0x4u
11 | .fi
12 | .SH DESCRIPTION
13 | Text attribute bit flag that indicates the text is encoded as UTF-16.
14 | The implementation uses an unsigned 16-bit integer (\f[C]uint16_t\f[R]) to represent UTF-16 code units.
15 | .SH AUTHOR
16 | .UR https://railgunlabs.com
17 | Railgun Labs
18 | .UE .
19 | .SH INTERNET RESOURCES
20 | The online documentation is published on the
21 | .UR https://railgunlabs.com/unicorn
22 | Railgun Labs website
23 | .UE .
24 | .SH LICENSING
25 | Unicorn is distributed with its end-user license agreement (EULA).
26 | Please review the agreement for information on terms & conditions for accessing or otherwise using Unicorn and for a DISCLAIMER OF ALL WARRANTIES.
27 | 


--------------------------------------------------------------------------------
/scripts/compression.py:
--------------------------------------------------------------------------------
 1 | #  Unicorn - Embeddable Unicode Algorithms
 2 | #  Copyright (c) 2024-2025 Railgun Labs
 3 | #
 4 | #  This software is dual-licensed: you can redistribute it and/or modify
 5 | #  it under the terms of the GNU General Public License version 3 as
 6 | #  published by the Free Software Foundation. For the terms of this
 7 | #  license, see <https://www.gnu.org/licenses/>.
 8 | #
 9 | #  Alternatively, you can license this software under a proprietary
10 | #  license, as set out in <https://railgunlabs.com/unicorn/license/>.
11 | 
12 | import zipfile
13 | 
14 | from .config import Config
15 | from .tools import Codespace
16 | from .feature import Feature
17 | from .basics import SerializationData
18 | 
19 | class ShortStringCompression(Feature):
20 |     def process(self, archive: zipfile.ZipFile, codespace: Codespace, config: Config) -> SerializationData:
21 |         public_header = "#define UNICORN_FEATURE_COMPRESSION\n"
22 |         return SerializationData(public_header=public_header)
23 | 


--------------------------------------------------------------------------------
/.github/workflows/build.yml:
--------------------------------------------------------------------------------
 1 | name: Build Status
 2 | on: [push, pull_request, workflow_dispatch]
 3 | jobs:
 4 |   linux:
 5 |     name: "Linux - ${{ matrix.compiler.display }}"
 6 |     runs-on: ubuntu-latest
 7 |     strategy:
 8 |       matrix:
 9 |         compiler:  # one job per entry; CC is exported to the build steps below
10 |           - { CC: gcc, display: GCC }
11 |           - { CC: clang, display: Clang }
12 |     steps:
13 |     - name: Checkout Repository
14 |       uses: actions/checkout@v4
15 |     - uses: actions/setup-python@v5
16 |       with:
17 |         python-version: '3.10'
18 |     - name: CMake Build
19 |       env:
20 |         CC: ${{ matrix.compiler.CC }}  # must use the 'compiler' matrix key defined above
21 |         CFLAGS: ${{ matrix.compiler.CFLAGS }}  # empty unless a matrix entry defines CFLAGS
22 |       run: |
23 |         cmake -B build -DCMAKE_BUILD_TYPE=Release -DUNICORN_BUILD_EXAMPLES=ON
24 |         cmake --build build
25 |     - name: Autotools Build
26 |       env:
27 |         CC: ${{ matrix.compiler.CC }}  # must use the 'compiler' matrix key defined above
28 |         CFLAGS: ${{ matrix.compiler.CFLAGS }}  # empty unless a matrix entry defines CFLAGS
29 |       run: |
30 |         ./autogen.sh
31 |         ./configure --enable-examples
32 |         make
33 |         make distcheck
34 | 


--------------------------------------------------------------------------------
/scripts/basics.py:
--------------------------------------------------------------------------------
 1 | #  Unicorn - Embeddable Unicode Algorithms
 2 | #  Copyright (c) 2024-2025 Railgun Labs
 3 | #
 4 | #  This software is dual-licensed: you can redistribute it and/or modify
 5 | #  it under the terms of the GNU General Public License version 3 as
 6 | #  published by the Free Software Foundation. For the terms of this
 7 | #  license, see <https://www.gnu.org/licenses/>.
 8 | #
 9 | #  Alternatively, you can license this software under a proprietary
10 | #  license, as set out in <https://railgunlabs.com/unicorn/license/>.
11 | 
12 | from enum import IntEnum
13 | from dataclasses import dataclass
14 | 
15 | class StorageSize(IntEnum):
16 |     UINT32 = 32
17 |     UINT16 = 16
18 |     UINT8 = 8
19 | 
20 |     def byte_size(self) -> int:
21 |         return int(self / 8)
22 | 
23 | @dataclass
24 | class SerializationData:
25 |     def __init__(self, source: str = "", header: str = "", public_header: str = "", size: int =0) -> None:
26 |         self.source = source
27 |         self.header = header
28 |         self.public_header = public_header
29 |         self.size = size
30 | 


--------------------------------------------------------------------------------
/src/logger.c:
--------------------------------------------------------------------------------
 1 | /*
 2 |  *  Unicorn - Embeddable Unicode Algorithms
 3 |  *  Copyright (c) 2024-2025 Railgun Labs
 4 |  *
 5 |  *  This software is dual-licensed: you can redistribute it and/or modify
 6 |  *  it under the terms of the GNU General Public License version 3 as
 7 |  *  published by the Free Software Foundation. For the terms of this
 8 |  *  license, see <https://www.gnu.org/licenses/>.
 9 |  *
10 |  *  Alternatively, you can license this software under a proprietary
11 |  *  license, as set out in <https://railgunlabs.com/unicorn/license/>.
12 |  */
13 | 
14 | #include "common.h"
15 | 
16 | // Diagnostic callback installed by uni_seterrfunc(); NULL until one is set.
17 | static unierrfunc unicorn_logger_cb;
18 | // Pointer passed back to the callback as its first argument.
19 | static void *unicorn_logger_ud;
20 | 
21 | // Stores 'callback' and 'user_data' in the file-scope variables above,
22 | // replacing any values stored by an earlier call. Neither argument is
23 | // validated here; uni_message() skips the call when the callback is NULL.
24 | UNICORN_API void uni_seterrfunc(void *user_data, unierrfunc callback) // cppcheck-suppress misra-c2012-8.7 ; This is supposed to have external linkage.
25 | {
26 |     unicorn_logger_ud = user_data;
27 |     unicorn_logger_cb = callback;
28 | }
29 | 
30 | // Forwards 'msg' to the installed callback together with the stored user
31 | // data. Does nothing when no callback is installed.
32 | void uni_message(const char *msg)
33 | {
34 |     if (unicorn_logger_cb != NULL)
35 |     {
36 |         unicorn_logger_cb(unicorn_logger_ud, msg);
37 |     }
38 | }
39 | 


--------------------------------------------------------------------------------
/man/unicasefold.3:
--------------------------------------------------------------------------------
 1 | .TH "UNICORN" "3" "Dec 6th 2025" "Unicorn 1.3.1"
 2 | .SH NAME
 3 | unicasefold \- case folding operations
 4 | .SH LIBRARY
 5 | Embeddable Unicode Algorithms (libunicorn, -lunicorn)
 6 | .SH SYNOPSIS
 7 | .nf
 8 | .B #include <unicorn.h>
 9 | .PP
10 | .B enum unicasefold {
11 | .RS
12 | .B UNI_DEFAULT,
13 | .B UNI_CANONICAL,
14 | .RE
15 | .B };
16 | .fi
17 | .SH DESCRIPTION
18 | This enumeration defines the supported case folding operations.
19 | .SH CONSTANTS
20 | .TP
21 | .BR UNI_DEFAULT
22 | Case folding for binary caseless string comparison.
23 | .TP
24 | .BR UNI_CANONICAL
25 | Case folding for canonical caseless string comparison.
26 | .SH AUTHOR
27 | .UR https://railgunlabs.com
28 | Railgun Labs
29 | .UE .
30 | .SH INTERNET RESOURCES
31 | The online documentation is published on the
32 | .UR https://railgunlabs.com/unicorn
33 | Railgun Labs website
34 | .UE .
35 | .SH LICENSING
36 | Unicorn is distributed with its end-user license agreement (EULA).
37 | Please review the agreement for information on terms & conditions for accessing or otherwise using Unicorn and for a DISCLAIMER OF ALL WARRANTIES.
38 | 


--------------------------------------------------------------------------------
/man/unicaseconv.3:
--------------------------------------------------------------------------------
 1 | .TH "UNICORN" "3" "Dec 6th 2025" "Unicorn 1.3.1"
 2 | .SH NAME
 3 | unicaseconv \- case conversion operations
 4 | .SH LIBRARY
 5 | Embeddable Unicode Algorithms (libunicorn, -lunicorn)
 6 | .SH SYNOPSIS
 7 | .nf
 8 | .B #include <unicorn.h>
 9 | .PP
10 | .B enum unicaseconv {
11 | .RS
12 | .B UNI_LOWER,
13 | .B UNI_TITLE,
14 | .B UNI_UPPER,
15 | .RE
16 | .B };
17 | .fi
18 | .SH DESCRIPTION
19 | This enumeration defines the supported case conversion operations.
20 | .SH CONSTANTS
21 | .TP
22 | .BR UNI_LOWER
23 | Lowercase case conversion.
24 | .TP
25 | .BR UNI_TITLE
26 | Titlecase case conversion.
27 | .TP
28 | .BR UNI_UPPER
29 | Uppercase case conversion.
30 | .SH AUTHOR
31 | .UR https://railgunlabs.com
32 | Railgun Labs
33 | .UE .
34 | .SH INTERNET RESOURCES
35 | The online documentation is published on the
36 | .UR https://railgunlabs.com/unicorn
37 | Railgun Labs website
38 | .UE .
39 | .SH LICENSING
40 | Unicorn is distributed with its end-user license agreement (EULA).
41 | Please review the agreement for information on terms & conditions for accessing or otherwise using Unicorn and for a DISCLAIMER OF ALL WARRANTIES.
42 | 


--------------------------------------------------------------------------------
/src/Makefile.am:
--------------------------------------------------------------------------------
 1 | # Automake input for building libunicorn.
 2 | AUTOMAKE_OPTIONS = subdir-objects
 3 | 
 4 | # Value passed to the generator's --config option in the unidata.c rule below.
 5 | config_json = @CONFIG_JSON@
 6 | 
 7 | include_HEADERS = _unicorn.h
 8 | 
 9 | lib_LTLIBRARIES = libunicorn.la
10 | 
11 | libunicorn_la_SOURCES = \
12 | 	$(top_srcdir)/include/unicorn.h \
13 | 	unicorn.c \
14 | 	logger.c \
15 | 	memory.c \
16 | 	common.h \
17 | 	byteswap.h \
18 | 	encoding.c \
19 | 	segmentation.c \
20 | 	normalize.c \
21 | 	normalize.h \
22 | 	collation.c \
23 | 	ducet.c \
24 | 	ducet.h \
25 | 	compression.c \
26 | 	properties.c \
27 | 	caseconv.c \
28 | 	casefold.c \
29 | 	charbuf.c \
30 | 	charbuf.h \
31 | 	charvec.c \
32 | 	charvec.h \
33 | 	unidata.c \
34 | 	unidata.h \
35 | 	_unicorn.h
36 | 
37 | libunicorn_la_CFLAGS = -I$(top_srcdir)/include
38 | libunicorn_la_LDFLAGS = -no-undefined -version-info 0:0:0
39 | 
40 | # Runs the 'scripts' Python package from the top source directory and directs
41 | # its output to the src directory of the build tree.
42 | unidata.c:
43 | 	cd $(top_srcdir) && $(PYTHON) -m scripts --config $(config_json) --output $(abs_top_builddir)/src
44 | 
45 | # No recipes of their own; presumably written by the same generator run as unidata.c (TODO confirm).
46 | unidata.h: unidata.c
47 |  
48 | _unicorn.h: unidata.c
49 | 
50 | noinst_HEADERS = unidata.h
51 | 
52 | BUILT_SOURCES = \
53 | 	unidata.c \
54 | 	unidata.h \
55 | 	_unicorn.h
56 | 
57 | CLEANFILES = \
58 | 	unidata.c \
59 | 	unidata.h \
60 | 	_unicorn.h
61 | 


--------------------------------------------------------------------------------
/man/uni_scalar.3:
--------------------------------------------------------------------------------
 1 | .TH "UNICORN" "3" "Dec 6th 2025" "Unicorn 1.3.1"
 2 | .SH NAME
 3 | UNI_SCALAR \- Unicode scalar value
 4 | .SH LIBRARY
 5 | Embeddable Unicode Algorithms (libunicorn, -lunicorn)
 6 | .SH SYNOPSIS
 7 | .nf
 8 | .B #include <unicorn.h>
 9 | .PP
10 | .B #define UNI_SCALAR 0x1u
11 | .fi
12 | .SH DESCRIPTION
13 | Text attribute bit flag that indicates the text is represented by an integer type large enough to accommodate Unicode scalar values.
14 | .PP
15 | This is a pseudo-encoding that uses the \f[B]unichar\f[R](3) type for storage.
16 | .PP
17 | Unicode scalars are not the same as code points.
18 | The former excludes surrogate characters whereas the latter does not.
19 | .SH SEE ALSO
20 | .BR unichar (3)
21 | .SH AUTHOR
22 | .UR https://railgunlabs.com
23 | Railgun Labs
24 | .UE .
25 | .SH INTERNET RESOURCES
26 | The online documentation is published on the
27 | .UR https://railgunlabs.com/unicorn
28 | Railgun Labs website
29 | .UE .
30 | .SH LICENSING
31 | Unicorn is distributed with its end-user license agreement (EULA).
32 | Please review the agreement for information on terms & conditions for accessing or otherwise using Unicorn and for a DISCLAIMER OF ALL WARRANTIES.
33 | 


--------------------------------------------------------------------------------
/man/unierrfunc.3:
--------------------------------------------------------------------------------
 1 | .TH "UNICORN" "3" "Dec 6th 2025" "Unicorn 1.3.1"
 2 | .SH NAME
 3 | unierrfunc \- diagnostic function
 4 | .SH LIBRARY
 5 | Embeddable Unicode Algorithms (libunicorn, -lunicorn)
 6 | .SH SYNOPSIS
 7 | .nf
 8 | .B #include <unicorn.h>
 9 | .PP
10 | .BI "typedef void(*unierrfunc)(void *" user_data ", const char *" message ");"
11 | .fi
12 | .SH DESCRIPTION
13 | Defines the function signature for a custom error callback function.
14 | The resulting message, given by \f[I]message\f[R], is UTF-8 encoded and null-terminated.
15 | It is a message intended for programmers, written in US English, for debugging purposes.
16 | It must never be displayed to an end user.
17 | .SH SEE ALSO
18 | .BR uni_seterrfunc (3)
19 | .SH AUTHOR
20 | .UR https://railgunlabs.com
21 | Railgun Labs
22 | .UE .
23 | .SH INTERNET RESOURCES
24 | The online documentation is published on the
25 | .UR https://railgunlabs.com/unicorn
26 | Railgun Labs website
27 | .UE .
28 | .SH LICENSING
29 | Unicorn is distributed with its end-user license agreement (EULA).
30 | Please review the agreement for information on terms & conditions for accessing or otherwise using Unicorn and for a DISCLAIMER OF ALL WARRANTIES.
31 | 


--------------------------------------------------------------------------------
/examples/compress_text.c:
--------------------------------------------------------------------------------
 1 | /*
 2 |  *  This file is public domain.
 3 |  *  You may use, modify, and distribute it without restriction.
 4 |  */
 5 | 
 6 | // Examples are also documented online at:
 7 | // <https://railgunlabs.com/unicorn/manual/code-examples/>.
 8 | 
 9 | #include <unicorn.h>
10 | #include <stdio.h>
11 | 
12 | int main(int argc, char *argv[])
13 | {
14 |     // This code example compresses text using the BOCU-1 compression algorithm.
15 |     // This algorithm is specifically designed for the compression of short
16 |     // Unicode text. Once strings become longer, then a general purpose
17 |     // compressor is preferable.
18 | 
19 |     const char src[] = u8"素早い茶色のキツネが怠け者の犬を飛び越えます";
20 | 
21 |     char buf[64];
22 |     size_t buf_len = sizeof(buf); // capacity of 'buf' on input
23 | 
24 |     // The following function compresses the input text 'src' and writes the
25 |     // result to 'buf'. Its function signature is documented online at:
26 |     // <https://railgunlabs.com/unicorn/manual/api/compression/uni-compress/>.
27 |     if (uni_compress(src, -1, UNI_UTF8, buf, &buf_len) != UNI_OK)
28 |     {
29 |         // Report the failure instead of exiting silently with success.
30 |         fputs("compression failed\n", stderr);
31 |         return 1;
32 |     }
33 | 
34 |     // Both values are of type size_t, which is printed with %zu.
35 |     printf("Uncompressed : %zu bytes\n", sizeof(src));
36 |     printf("Compressed   : %zu bytes\n", buf_len);
37 |     return 0;
38 | }
39 | 


--------------------------------------------------------------------------------
/man/uni_big.3:
--------------------------------------------------------------------------------
 1 | .TH "UNICORN" "3" "Dec 6th 2025" "Unicorn 1.3.1"
 2 | .SH NAME
 3 | UNI_BIG \- big endian byte order
 4 | .SH LIBRARY
 5 | Embeddable Unicode Algorithms (libunicorn, -lunicorn)
 6 | .SH SYNOPSIS
 7 | .nf
 8 | .B #include <unicorn.h>
 9 | .PP
10 | .B #define UNI_BIG 0x10u
11 | .fi
12 | .SH DESCRIPTION
13 | Text attribute bit flag that indicates the text is encoded in big endian byte order.
14 | It's incompatible with the \f[B]UNI_LITTLE\f[R](3) flag.
15 | .PP
16 | The flag is intended for use with \f[B]UNI_UTF16\f[R](3) and \f[B]UNI_UTF32\f[R](3).
17 | It has no effect when used with \f[B]UNI_UTF8\f[R](3) or \f[B]UNI_SCALAR\f[R](3).
18 | .SH SEE ALSO
19 | .BR UNI_LITTLE (3),
20 | .BR UNI_UTF16 (3),
21 | .BR UNI_UTF32 (3),
22 | .BR UNI_UTF8 (3),
23 | .BR UNI_SCALAR (3)
24 | .SH AUTHOR
25 | .UR https://railgunlabs.com
26 | Railgun Labs
27 | .UE .
28 | .SH INTERNET RESOURCES
29 | The online documentation is published on the
30 | .UR https://railgunlabs.com/unicorn
31 | Railgun Labs website
32 | .UE .
33 | .SH LICENSING
34 | Unicorn is distributed with its end-user license agreement (EULA).
35 | Please review the agreement for information on terms & conditions for accessing or otherwise using Unicorn and for a DISCLAIMER OF ALL WARRANTIES.
36 | 


--------------------------------------------------------------------------------
/man/uni_little.3:
--------------------------------------------------------------------------------
 1 | .TH "UNICORN" "3" "Dec 6th 2025" "Unicorn 1.3.1"
 2 | .SH NAME
 3 | UNI_LITTLE \- little endian byte order
 4 | .SH LIBRARY
 5 | Embeddable Unicode Algorithms (libunicorn, -lunicorn)
 6 | .SH SYNOPSIS
 7 | .nf
 8 | .B #include <unicorn.h>
 9 | .PP
10 | .B #define UNI_LITTLE 0x20u
11 | .fi
12 | .SH DESCRIPTION
13 | Text attribute bit flag that indicates the text is encoded in little endian byte order.
14 | It's incompatible with the \f[B]UNI_BIG\f[R](3) flag.
15 | .PP
16 | The flag is intended for use with \f[B]UNI_UTF16\f[R](3) and \f[B]UNI_UTF32\f[R](3).
17 | It has no effect when used with \f[B]UNI_UTF8\f[R](3) or \f[B]UNI_SCALAR\f[R](3).
18 | .SH SEE ALSO
19 | .BR UNI_BIG (3),
20 | .BR UNI_UTF16 (3),
21 | .BR UNI_UTF32 (3),
22 | .BR UNI_UTF8 (3),
23 | .BR UNI_SCALAR (3)
24 | .SH AUTHOR
25 | .UR https://railgunlabs.com
26 | Railgun Labs
27 | .UE .
28 | .SH INTERNET RESOURCES
29 | The online documentation is published on the
30 | .UR https://railgunlabs.com/unicorn
31 | Railgun Labs website
32 | .UE .
33 | .SH LICENSING
34 | Unicorn is distributed with its end-user license agreement (EULA).
35 | Please review the agreement for information on terms & conditions for accessing or otherwise using Unicorn and for a DISCLAIMER OF ALL WARRANTIES.
36 | 


--------------------------------------------------------------------------------
/man/Makefile.am:
--------------------------------------------------------------------------------
 1 | man3_MANS = \
 2 | 	uni_big.3 \
 3 | 	uni_caseconv.3 \
 4 | 	uni_caseconvchk.3 \
 5 | 	uni_casefold.3 \
 6 | 	uni_casefoldchk.3 \
 7 | 	uni_casefoldcmp.3 \
 8 | 	uni_ccc.3 \
 9 | 	uni_collate.3 \
10 | 	uni_compress.3 \
11 | 	uni_convert.3 \
12 | 	uni_decompress.3 \
13 | 	uni_encode.3 \
14 | 	uni_gc.3 \
15 | 	uni_getucdversion.3 \
16 | 	uni_getversion.3 \
17 | 	uni_is.3 \
18 | 	uni_little.3 \
19 | 	uni_native.3 \
20 | 	uni_next.3 \
21 | 	uni_nextbrk.3 \
22 | 	uni_norm.3 \
23 | 	uni_normchk.3 \
24 | 	uni_normcmp.3 \
25 | 	uni_normqchk.3 \
26 | 	uni_nulify.3 \
27 | 	uni_numval.3 \
28 | 	uni_prev.3 \
29 | 	uni_prevbrk.3 \
30 | 	uni_scalar.3 \
31 | 	uni_seterrfunc.3 \
32 | 	uni_setmemfunc.3 \
33 | 	uni_sortkeycmp.3 \
34 | 	uni_sortkeymk.3 \
35 | 	uni_tolower.3 \
36 | 	uni_totitle.3 \
37 | 	uni_toupper.3 \
38 | 	uni_trust.3 \
39 | 	uni_utf16.3 \
40 | 	uni_utf32.3 \
41 | 	uni_utf8.3 \
42 | 	uni_validate.3 \
43 | 	uniattr.3 \
44 | 	unibp.3 \
45 | 	unibreak.3 \
46 | 	unicaseconv.3 \
47 | 	unicasefold.3 \
48 | 	unichar.3 \
49 | 	unicorn.3 \
50 | 	unierrfunc.3 \
51 | 	unigc.3 \
52 | 	unimemfunc.3 \
53 | 	uninormchk.3 \
54 | 	uninormform.3 \
55 | 	unisize.3 \
56 | 	unistat.3 \
57 | 	unistrength.3 \
58 | 	uniweighting.3
59 | EXTRA_DIST = CMakeLists.txt $(man3_MANS)
60 | 


--------------------------------------------------------------------------------
/examples/decode_utf8.c:
--------------------------------------------------------------------------------
 1 | /*
 2 |  *  This file is public domain.
 3 |  *  You may use, modify, and distribute it without restriction.
 4 |  */
 5 | 
 6 | // Examples are also documented online at:
 7 | // <https://railgunlabs.com/unicorn/manual/code-examples/>.
 8 | 
 9 | #include <unicorn.h>
10 | #include <stdio.h>
11 | 
12 | int main(int argc, char *argv[])
13 | {
14 |     // This example demonstrates decoding the code points of a UTF-8 encoded string.
15 |     // It prints each code point on its own line.
16 | 
17 |     const char str[] = u8"I 🕵️."; // I spy
18 |     unisize i = 0; // code unit index; passed by address to uni_next() on every call
19 |     for (;;)
20 |     {
21 |         unichar cp = 0x0;
22 |         unistat s = uni_next(str, -1, UNI_UTF8, &i, &cp);
23 |         if (s == UNI_OK)
24 |         {
25 |             printf("U+%04lX\n", (unsigned long)cp); // cast so the argument matches the format on every platform
26 |         }
27 |         else if (s == UNI_DONE)
28 |         {
29 |             break;
30 |         }
31 |         else if (s == UNI_BAD_ENCODING)
32 |         {
33 |             printf("malformed character at code unit %ld\n", (long)i); // unisize is int32_t, which need not be 'int'
34 |             return 1;
35 |         }
36 |         else
37 |         {
38 |             puts("an internal error occurred");
39 |             return 2; // Something else went wrong (check the status code).
40 |         }
41 |     }
42 |     return 0;
43 | }
44 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | # Object files
 2 | *.o
 3 | *.ko
 4 | *.obj
 5 | *.elf
 6 | 
 7 | # Libraries
 8 | *.lib
 9 | *.a
10 | 
11 | # Shared objects (inc. Windows DLLs)
12 | *.dll
13 | *.so
14 | *.dylib
15 | 
16 | # Executables
17 | *.exe
18 | *.out
19 | *.app
20 | *.i*86
21 | *.x86_64
22 | *.hex
23 | *.msi
24 | 
25 | # Visual Studio temporary files
26 | *.suo
27 | *.user
28 | *.sbr
29 | *.pdb
30 | *.ncb
31 | *.sdf
32 | *.idb
33 | 
34 | # Mac OS X
35 | .DS_Store
36 | 
37 | # CMake out-of-source build files
38 | build
39 | Win32
40 | _CPack_Packages
41 | 
42 | # Automake
43 | Makefile
44 | Makefile.in
45 | /ar-lib
46 | /mdate-sh
47 | /py-compile
48 | /test-driver
49 | /ylwrap
50 | .deps/
51 | .dirstamp
52 | libtool
53 | ltmain.sh
54 | *.lo
55 | *.la
56 | *.lai
57 | *.so*
58 | *.libs
59 | 
60 | # Autoconf
61 | autom4te.cache
62 | configure~
63 | /autoscan.log
64 | /autoscan-*.log
65 | /aclocal.m4
66 | /compile
67 | /config.cache
68 | /config.guess
69 | /config.h.in
70 | /config.log
71 | /config.status
72 | /config.sub
73 | /configure
74 | /configure.scan
75 | /depcomp
76 | /install-sh
77 | /missing
78 | /stamp-h1
79 | *.tar.gz
80 | *.zip
81 | 
82 | # Generated pkg-config file
83 | unicorn.pc
84 | 
85 | # Python files
86 | *.pyc
87 | 
88 | # Generated C source code
89 | unidata.c
90 | unidata.h
91 | _unicorn.h
92 | 


--------------------------------------------------------------------------------
/examples/decode_utf16.c:
--------------------------------------------------------------------------------
 1 | /*
 2 |  *  This file is public domain.
 3 |  *  You may use, modify, and distribute it without restriction.
 4 |  */
 5 | 
 6 | // Examples are also documented online at:
 7 | // <https://railgunlabs.com/unicorn/manual/code-examples/>.
 8 | 
 9 | #include <unicorn.h>
10 | #include <stdio.h>
11 | #include <uchar.h>
12 | 
13 | int main(int argc, char *argv[])
14 | {
15 |     // This example demonstrates decoding the code points of a UTF-16 encoded string.
16 |     // It prints each code point on its own line and returns non-zero on failure.
17 | 
18 |     const char16_t str[] = u"I 🕵️."; // I spy
19 |     unisize i = 0; // Code unit offset; passed by address to uni_next() on every call.
20 |     for (;;)
21 |     {
22 |         unichar cp = 0x0;
23 |         unistat s = uni_next(str, -1, UNI_UTF16, &i, &cp);
24 |         if (s == UNI_OK)
25 |         {
26 |             printf("U+%04X\n", (unsigned int)cp); // Cast so the argument always matches %X.
27 |         }
28 |         else if (s == UNI_DONE)
29 |         {
30 |             break;
31 |         }
32 |         else if (s == UNI_BAD_ENCODING)
33 |         {
34 |             printf("malformed character at code unit %d\n", (int)i); // Cast to match %d.
35 |             return 1;
36 |         }
37 |         else
38 |         {
39 |             puts("an internal error occurred");
40 |             return 2; // Something else went wrong (check the status code).
41 |         }
42 |     }
43 |     return 0;
44 | }
45 | 


--------------------------------------------------------------------------------
/examples/decode_utf32.c:
--------------------------------------------------------------------------------
 1 | /*
 2 |  *  This file is public domain.
 3 |  *  You may use, modify, and distribute it without restriction.
 4 |  */
 5 | 
 6 | // Examples are also documented online at:
 7 | // <https://railgunlabs.com/unicorn/manual/code-examples/>.
 8 | 
 9 | #include <unicorn.h>
10 | #include <stdio.h>
11 | #include <uchar.h>
12 | 
13 | int main(int argc, char *argv[])
14 | {
15 |     // This example demonstrates decoding the code points of a UTF-32 encoded string.
16 |     // It prints each code point on its own line and returns non-zero on failure.
17 | 
18 |     const char32_t str[] = U"I 🕵️."; // I spy
19 |     unisize i = 0; // Code unit offset; passed by address to uni_next() on every call.
20 |     for (;;)
21 |     {
22 |         unichar cp = 0x0;
23 |         unistat s = uni_next(str, -1, UNI_UTF32, &i, &cp);
24 |         if (s == UNI_OK)
25 |         {
26 |             printf("U+%04X\n", (unsigned int)cp); // Cast so the argument always matches %X.
27 |         }
28 |         else if (s == UNI_DONE)
29 |         {
30 |             break;
31 |         }
32 |         else if (s == UNI_BAD_ENCODING)
33 |         {
34 |             printf("malformed character at code unit %d\n", (int)i); // Cast to match %d.
35 |             return 1;
36 |         }
37 |         else
38 |         {
39 |             puts("an internal error occurred");
40 |             return 2; // Something else went wrong (check the status code).
41 |         }
42 |     }
43 |     return 0;
44 | }
45 | 


--------------------------------------------------------------------------------
/man/uni_native.3:
--------------------------------------------------------------------------------
 1 | .TH "UNICORN" "3" "Dec 6th 2025" "Unicorn 1.3.1"
 2 | .SH NAME
 3 | UNI_NATIVE \- native endian byte order
 4 | .SH LIBRARY
 5 | Embeddable Unicode Algorithms (libunicorn, -lunicorn)
 6 | .SH SYNOPSIS
 7 | .nf
 8 | .B #include <unicorn.h>
 9 | .PP
10 | .B #define UNI_NATIVE
11 | .fi
12 | .SH DESCRIPTION
13 | Text attribute bit flag that indicates the text is encoded in native byte order.
14 | The value of this flag will be identical to either \f[B]UNI_BIG\f[R](3) or \f[B]UNI_LITTLE\f[R](3).
15 | .PP
16 | The flag is intended for use with \f[B]UNI_UTF16\f[R](3) and \f[B]UNI_UTF32\f[R](3).
17 | It has no effect when used with \f[B]UNI_UTF8\f[R](3) or \f[B]UNI_SCALAR\f[R](3).
18 | .SH SEE ALSO
19 | .BR UNI_BIG (3),
20 | .BR UNI_LITTLE (3),
21 | .BR UNI_UTF16 (3),
22 | .BR UNI_UTF32 (3),
23 | .BR UNI_UTF8 (3),
24 | .BR UNI_SCALAR (3)
25 | .SH AUTHOR
26 | .UR https://railgunlabs.com
27 | Railgun Labs
28 | .UE .
29 | .SH INTERNET RESOURCES
30 | The online documentation is published on the
31 | .UR https://railgunlabs.com/unicorn
32 | Railgun Labs website
33 | .UE .
34 | .SH LICENSING
35 | Unicorn is distributed with its end-user license agreement (EULA).
36 | Please review the agreement for information on terms & conditions for accessing or otherwise using Unicorn and for a DISCLAIMER OF ALL WARRANTIES.
37 | 


--------------------------------------------------------------------------------
/man/uni_is.3:
--------------------------------------------------------------------------------
 1 | .TH "UNICORN" "3" "Dec 6th 2025" "Unicorn 1.3.1"
 2 | .SH NAME
 3 | uni_is \- binary property value
 4 | .SH LIBRARY
 5 | Embeddable Unicode Algorithms (libunicorn, -lunicorn)
 6 | .SH SYNOPSIS
 7 | .nf
 8 | .B #include <unicorn.h>
 9 | .PP
10 | .BI "bool uni_is(unichar " c ", unibp " p ");"
11 | .fi
12 | .SH DESCRIPTION
13 | This function retrieves the value of the binary property \f[I]p\f[R] for the character \f[I]c\f[R].
14 | .PP
15 | The binary character property \f[I]p\f[R] must be enabled in the JSON configuration file; otherwise, \f[C]false\f[R] is always returned.
16 | Instructions for enabling it are documented alongside property \f[I]p\f[R] in the \f[B]unibp\f[R](3) enumeration.
17 | .SH RETURN VALUE
18 | Returns \f[C]true\f[R] if character \f[I]c\f[R] has property \f[I]p\f[R].
19 | .SH SEE ALSO
20 | .BR unibp (3),
21 | .BR unichar (3)
22 | .SH AUTHOR
23 | .UR https://railgunlabs.com
24 | Railgun Labs
25 | .UE .
26 | .SH INTERNET RESOURCES
27 | The online documentation is published on the
28 | .UR https://railgunlabs.com/unicorn
29 | Railgun Labs website
30 | .UE .
31 | .SH LICENSING
32 | Unicorn is distributed with its end-user license agreement (EULA).
33 | Please review the agreement for information on terms & conditions for accessing or otherwise using Unicorn and for a DISCLAIMER OF ALL WARRANTIES.
34 | 


--------------------------------------------------------------------------------
/man/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | include(GNUInstallDirs) # Provides CMAKE_INSTALL_MANDIR so pages land in the standard man directory.
 2 | set(MAN uni_convert.3
 3 |     uni_decompress.3
 4 |     uninormchk.3
 5 |     uni_normqchk.3
 6 |     uni_norm.3
 7 |     unisize.3
 8 |     uni_normcmp.3
 9 |     uni_caseconv.3
10 |     uni_casefoldcmp.3
11 |     uniattr.3
12 |     unimemfunc.3
13 |     uninormform.3
14 |     uni_nextbrk.3
15 |     uniweighting.3
16 |     uni_setmemfunc.3
17 |     uni_nulify.3
18 |     uni_getucdversion.3
19 |     uni_ccc.3
20 |     uni_utf16.3
21 |     unierrfunc.3
22 |     uni_sortkeymk.3
23 |     uni_encode.3
24 |     uni_compress.3
25 |     uni_utf8.3
26 |     unistat.3
27 |     uni_big.3
28 |     uni_normchk.3
29 |     uni_collate.3
30 |     uni_trust.3
31 |     uni_next.3
32 |     uni_caseconvchk.3
33 |     uni_getversion.3
34 |     uni_validate.3
35 |     uni_native.3
36 |     uni_scalar.3
37 |     unicaseconv.3
38 |     uni_utf32.3
39 |     uni_little.3
40 |     unibp.3
41 |     unistrength.3
42 |     uni_totitle.3
43 |     uni_sortkeycmp.3
44 |     unicorn.3
45 |     uni_toupper.3
46 |     uni_prev.3
47 |     unicasefold.3
48 |     uni_casefold.3
49 |     uni_gc.3
50 |     uni_prevbrk.3
51 |     unichar.3
52 |     uni_numval.3
53 |     uni_tolower.3
54 |     unigc.3
55 |     uni_seterrfunc.3
56 |     uni_casefoldchk.3
57 |     uni_is.3
58 |     unibreak.3)
59 | install(FILES ${MAN} DESTINATION "${CMAKE_INSTALL_MANDIR}/man3" COMPONENT doc)
60 | 


--------------------------------------------------------------------------------
/src/ducet.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  *  Unicorn - Embeddable Unicode Algorithms
 3 |  *  Copyright (c) 2024-2025 Railgun Labs
 4 |  *
 5 |  *  This software is dual-licensed: you can redistribute it and/or modify
 6 |  *  it under the terms of the GNU General Public License version 3 as
 7 |  *  published by the Free Software Foundation. For the terms of this
 8 |  *  license, see <https://www.gnu.org/licenses/>.
 9 |  *
10 |  *  Alternatively, you can license this software under a proprietary
11 |  *  license, as set out in <https://railgunlabs.com/unicorn/license/>.
12 |  */
13 | 
14 | #ifndef DUCET_H
15 | #define DUCET_H
16 | 
17 | #include "normalize.h"
18 | #include "unidata.h"
19 | 
20 | #if defined(UNICORN_FEATURE_COLLATION)
21 | 
22 | typedef struct CE // Three 16-bit weights; NOTE(review): presumably one collation element -- confirm.
23 | {
24 |     uint16_t primary;
25 |     uint16_t secondary;
26 |     uint16_t tertiary;
27 | } CE;
28 | // Decoder state: a fixed-capacity CE array together with normalization and character buffers.
29 | struct CEDecoder
30 | {
31 |     int32_t ces_index; // NOTE(review): presumably the next position to read in 'ces' -- confirm.
32 |     int32_t ces_count; // NOTE(review): presumably the number of valid entries in 'ces' -- confirm.
33 |     struct NormalizeState normbuf;
34 |     struct CharVec charvec;
35 |     CE ces[UNICORN_MAX_COLLATION]; // Capacity is fixed at compile time by UNICORN_MAX_COLLATION.
36 | };
37 | // Operations on a CEDecoder; only the prototypes are declared in this header.
38 | void uni_cebuf_init(struct CEDecoder *state);
39 | void uni_cebuf_free(struct CEDecoder *state);
40 | bool uni_cebuf_is_empty(const struct CEDecoder *state); // Takes the state as const.
41 | CE uni_cebuf_pop(struct CEDecoder *state); // Returns a CE by value.
42 | unistat uni_cebuf_append_run(struct CEDecoder *state, struct unitext *text); // Returns a unistat status.
43 | 
44 | #endif
45 | 
46 | #endif // DUCET_H
47 | 


--------------------------------------------------------------------------------
/man/uni_getversion.3:
--------------------------------------------------------------------------------
 1 | .TH "UNICORN" "3" "Dec 6th 2025" "Unicorn 1.3.1"
 2 | .SH NAME
 3 | uni_getversion \- library version
 4 | .SH LIBRARY
 5 | Embeddable Unicode Algorithms (libunicorn, -lunicorn)
 6 | .SH SYNOPSIS
 7 | .nf
 8 | .B #include <unicorn.h>
 9 | .PP
10 | .BI "void uni_getversion(int32_t *" major ", int32_t *" minor ", int32_t *" patch ");"
11 | .fi
12 | .SH DESCRIPTION
13 | Writes the major, minor, and patch version of the Unicorn library to \f[I]major\f[R], \f[I]minor\f[R], and \f[I]patch\f[R], respectively.
14 | Null arguments are ignored.
15 | .SH EXAMPLES
16 | This example prints the major, minor, and patch version of the library to stdout.
17 | .PP
18 | .in +4n
19 | .EX
20 | #include <unicorn.h>
21 | #include <stdio.h>
22 | 
23 | int main(void)
24 | {
25 |     int32_t major, minor, patch;
26 |     uni_getversion(&major, &minor, &patch);
27 |     printf("%d.%d.%d", major, minor, patch);
28 |     return 0;
29 | }
30 | .EE
31 | .in
32 | .SH AUTHOR
33 | .UR https://railgunlabs.com
34 | Railgun Labs
35 | .UE .
36 | .SH INTERNET RESOURCES
37 | The online documentation is published on the
38 | .UR https://railgunlabs.com/unicorn
39 | Railgun Labs website
40 | .UE .
41 | .SH LICENSING
42 | Unicorn is distributed with its end-user license agreement (EULA).
43 | Please review the agreement for information on terms & conditions for accessing or otherwise using Unicorn and for a DISCLAIMER OF ALL WARRANTIES.
44 | 


--------------------------------------------------------------------------------
/man/uni_ccc.3:
--------------------------------------------------------------------------------
 1 | .TH "UNICORN" "3" "Dec 6th 2025" "Unicorn 1.3.1"
 2 | .SH NAME
 3 | uni_ccc \- canonical combining class
 4 | .SH LIBRARY
 5 | Embeddable Unicode Algorithms (libunicorn, -lunicorn)
 6 | .SH SYNOPSIS
 7 | .nf
 8 | .B #include <unicorn.h>
 9 | .PP
10 | .BI "uint8_t uni_ccc(unichar " c ");"
11 | .fi
12 | .SH DESCRIPTION
13 | This function returns the \f[C]Canonical_Combining_Class\f[R] property for the code point \f[I]c\f[R].
14 | The canonical combining class property is used during the Unicode normalization algorithm to sort combining characters.
15 | .PP
16 | Support for the canonical combining class property must be enabled in the JSON configuration file; otherwise, the function will always return zero.
17 | .PP
18 | .in +4n
19 | .EX
20 | {
21 |     "characterProperties": [
22 |         "Canonical_Combining_Class"
23 |     ]
24 | }
25 | .EE
26 | .in
27 | .SH RETURN VALUE
28 | The canonical combining class of \f[I]c\f[R].
29 | .SH SEE ALSO
30 | .BR unichar (3)
31 | .SH AUTHOR
32 | .UR https://railgunlabs.com
33 | Railgun Labs
34 | .UE .
35 | .SH INTERNET RESOURCES
36 | The online documentation is published on the
37 | .UR https://railgunlabs.com/unicorn
38 | Railgun Labs website
39 | .UE .
40 | .SH LICENSING
41 | Unicorn is distributed with its end-user license agreement (EULA).
42 | Please review the agreement for information on terms & conditions for accessing or otherwise using Unicorn and for a DISCLAIMER OF ALL WARRANTIES.
43 | 


--------------------------------------------------------------------------------
/man/uni_tolower.3:
--------------------------------------------------------------------------------
 1 | .TH "UNICORN" "3" "Dec 6th 2025" "Unicorn 1.3.1"
 2 | .SH NAME
 3 | uni_tolower \- simple lower case mapping
 4 | .SH LIBRARY
 5 | Embeddable Unicode Algorithms (libunicorn, -lunicorn)
 6 | .SH SYNOPSIS
 7 | .nf
 8 | .B #include <unicorn.h>
 9 | .PP
10 | .BI "unichar uni_tolower(unichar " c ");"
11 | .fi
12 | .SH DESCRIPTION
13 | This function returns the simple lower case mapping for the code point \f[I]c\f[R].
14 | For the full lowercase mappings use \f[B]uni_caseconv\f[R](3) with \f[B]UNI_LOWER\f[R].
15 | .PP
16 | Support for simple lower case mappings must be enabled in the JSON configuration file; otherwise, the function will always return \f[I]c\f[R].
17 | .PP
18 | .in +4n
19 | .EX
20 | {
21 |     "characterProperties": [
22 |         "Simple_Lowercase_Mapping"
23 |     ]
24 | }
25 | .EE
26 | .in
27 | .SH RETURN VALUE
28 | The simple lower case mapping for \f[I]c\f[R].
29 | .SH SEE ALSO
30 | .BR uni_caseconv (3),
31 | .BR unicaseconv (3),
32 | .BR unichar (3)
33 | .SH AUTHOR
34 | .UR https://railgunlabs.com
35 | Railgun Labs
36 | .UE .
37 | .SH INTERNET RESOURCES
38 | The online documentation is published on the
39 | .UR https://railgunlabs.com/unicorn
40 | Railgun Labs website
41 | .UE .
42 | .SH LICENSING
43 | Unicorn is distributed with its end-user license agreement (EULA).
44 | Please review the agreement for information on terms & conditions for accessing or otherwise using Unicorn and for a DISCLAIMER OF ALL WARRANTIES.
45 | 


--------------------------------------------------------------------------------
/man/uni_toupper.3:
--------------------------------------------------------------------------------
 1 | .TH "UNICORN" "3" "Dec 6th 2025" "Unicorn 1.3.1"
 2 | .SH NAME
 3 | uni_toupper \- simple upper case mapping
 4 | .SH LIBRARY
 5 | Embeddable Unicode Algorithms (libunicorn, -lunicorn)
 6 | .SH SYNOPSIS
 7 | .nf
 8 | .B #include <unicorn.h>
 9 | .PP
10 | .BI "unichar uni_toupper(unichar " c ");"
11 | .fi
12 | .SH DESCRIPTION
13 | This function returns the simple upper case mapping for the code point \f[I]c\f[R].
14 | For the full uppercase mappings use \f[B]uni_caseconv\f[R](3) with \f[B]UNI_UPPER\f[R].
15 | .PP
16 | Support for simple upper case mappings must be enabled in the JSON configuration file; otherwise, the function will always return \f[I]c\f[R].
17 | .PP
18 | .in +4n
19 | .EX
20 | {
21 |     "characterProperties": [
22 |         "Simple_Uppercase_Mapping"
23 |     ]
24 | }
25 | .EE
26 | .in
27 | .SH RETURN VALUE
28 | The simple upper case mapping for \f[I]c\f[R].
29 | .SH SEE ALSO
30 | .BR uni_caseconv (3),
31 | .BR unicaseconv (3),
32 | .BR unichar (3)
33 | .SH AUTHOR
34 | .UR https://railgunlabs.com
35 | Railgun Labs
36 | .UE .
37 | .SH INTERNET RESOURCES
38 | The online documentation is published on the
39 | .UR https://railgunlabs.com/unicorn
40 | Railgun Labs website
41 | .UE .
42 | .SH LICENSING
43 | Unicorn is distributed with its end-user license agreement (EULA).
44 | Please review the agreement for information on terms & conditions for accessing or otherwise using Unicorn and for a DISCLAIMER OF ALL WARRANTIES.
45 | 


--------------------------------------------------------------------------------
/man/uni_totitle.3:
--------------------------------------------------------------------------------
 1 | .TH "UNICORN" "3" "Dec 6th 2025" "Unicorn 1.3.1"
 2 | .SH NAME
 3 | uni_totitle \- simple title case mapping
 4 | .SH LIBRARY
 5 | Embeddable Unicode Algorithms (libunicorn, -lunicorn)
 6 | .SH SYNOPSIS
 7 | .nf
 8 | .B #include <unicorn.h>
 9 | .PP
10 | .BI "unichar uni_totitle(unichar " c ");"
11 | .fi
12 | .SH DESCRIPTION
13 | This function returns the simple title case mapping for the code point \f[I]c\f[R].
14 | For the full title case mappings use \f[B]uni_caseconv\f[R](3) with \f[B]UNI_TITLE\f[R].
15 | .PP
16 | Support for simple title case mappings must be enabled in the JSON configuration file; otherwise, the function will always return \f[I]c\f[R].
17 | .PP
18 | .in +4n
19 | .EX
20 | {
21 |     "characterProperties": [
22 |         "Simple_Titlecase_Mapping"
23 |     ]
24 | }
25 | .EE
26 | .in
27 | .SH RETURN VALUE
28 | The simple title case mapping for \f[I]c\f[R].
29 | .SH SEE ALSO
30 | .BR uni_caseconv (3),
31 | .BR unicaseconv (3),
32 | .BR unichar (3)
33 | .SH AUTHOR
34 | .UR https://railgunlabs.com
35 | Railgun Labs
36 | .UE .
37 | .SH INTERNET RESOURCES
38 | The online documentation is published on the
39 | .UR https://railgunlabs.com/unicorn
40 | Railgun Labs website
41 | .UE .
42 | .SH LICENSING
43 | Unicorn is distributed with its end-user license agreement (EULA).
44 | Please review the agreement for information on terms & conditions for accessing or otherwise using Unicorn and for a DISCLAIMER OF ALL WARRANTIES.
45 | 


--------------------------------------------------------------------------------
/man/uni_getucdversion.3:
--------------------------------------------------------------------------------
 1 | .TH "UNICORN" "3" "Dec 6th 2025" "Unicorn 1.3.1"
 2 | .SH NAME
 3 | uni_getucdversion \- Unicode Standard version
 4 | .SH LIBRARY
 5 | Embeddable Unicode Algorithms (libunicorn, -lunicorn)
 6 | .SH SYNOPSIS
 7 | .nf
 8 | .B #include <unicorn.h>
 9 | .PP
10 | .BI "void uni_getucdversion(int32_t *" major ", int32_t *" minor ", int32_t *" patch ");"
11 | .fi
12 | .SH DESCRIPTION
13 | Writes the major, minor, and patch version of the Unicode Standard to \f[I]major\f[R], \f[I]minor\f[R], and \f[I]patch\f[R], respectively.
14 | Null arguments are ignored.
15 | .SH EXAMPLES
16 | This example prints the major, minor, and patch version of the Unicode Standard implemented by the library to stdout.
17 | .PP
18 | .in +4n
19 | .EX
20 | #include <unicorn.h>
21 | #include <stdio.h>
22 | 
23 | int main(void)
24 | {
25 |     int32_t major, minor, patch;
26 |     uni_getucdversion(&major, &minor, &patch);
27 |     printf("%d.%d.%d", major, minor, patch);
28 |     return 0;
29 | }
30 | .EE
31 | .in
32 | .SH AUTHOR
33 | .UR https://railgunlabs.com
34 | Railgun Labs
35 | .UE .
36 | .SH INTERNET RESOURCES
37 | The online documentation is published on the
38 | .UR https://railgunlabs.com/unicorn
39 | Railgun Labs website
40 | .UE .
41 | .SH LICENSING
42 | Unicorn is distributed with its end-user license agreement (EULA).
43 | Please review the agreement for information on terms & conditions for accessing or otherwise using Unicorn and for a DISCLAIMER OF ALL WARRANTIES.
44 | 


--------------------------------------------------------------------------------
/src/normalize.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  *  Unicorn - Embeddable Unicode Algorithms
 3 |  *  Copyright (c) 2024-2025 Railgun Labs
 4 |  *
 5 |  *  This software is dual-licensed: you can redistribute it and/or modify
 6 |  *  it under the terms of the GNU General Public License version 3 as
 7 |  *  published by the Free Software Foundation. For the terms of this
 8 |  *  license, see <https://www.gnu.org/licenses/>.
 9 |  *
10 |  *  Alternatively, you can license this software under a proprietary
11 |  *  license, as set out in <https://railgunlabs.com/unicorn/license/>.
12 |  */
13 | 
14 | #ifndef NORMALIZE_H
15 | #define NORMALIZE_H
16 | 
17 | #include "common.h"
18 | #include "charvec.h"
19 | 
20 | typedef bool (*IsStable)(unichar cp); // Callback type: receives a code point and returns a bool.
21 | 
22 | // Incremental normalization state bookkeeping.
23 | struct NormalizeState
24 | {
25 |     int32_t offset;
26 |     int32_t r;
27 |     IsStable is_stable; // NOTE(review): presumably the callback handed to uni_norm_init() -- confirm.
28 | 
29 |     // Tracks an unstable span of characters within unnormalized text.
30 |     // This is the span of text that will be normalized.
31 |     struct CharVec span;
32 | 
33 |     // Buffer with decomposed characters.
34 |     struct CharVec decomp;
35 | };
36 | // Set up and tear down a NormalizeState; init also receives an IsStable callback.
37 | void uni_norm_init(struct NormalizeState *state, IsStable is_stable);
38 | void uni_norm_free(struct NormalizeState *state);
39 | // NOTE(review): presumably returns the state to its initial condition for reuse -- confirm.
40 | void uni_norm_reset(struct NormalizeState *ns);
41 | // Takes the state and a unitext pointer; the result is a unistat status code.
42 | unistat uni_norm_append_run(struct NormalizeState *state, struct unitext *it);
43 | // Matches the IsStable signature, so it can be passed to uni_norm_init().
44 | bool uni_is_stable_nfd(unichar cp);
45 | 
46 | #endif // NORMALIZE_H
47 | 


--------------------------------------------------------------------------------
/scripts/ccc.py:
--------------------------------------------------------------------------------
 1 | #  Unicorn - Embeddable Unicode Algorithms
 2 | #  Copyright (c) 2024-2025 Railgun Labs
 3 | #
 4 | #  This software is dual-licensed: you can redistribute it and/or modify
 5 | #  it under the terms of the GNU General Public License version 3 as
 6 | #  published by the Free Software Foundation. For the terms of this
 7 | #  license, see <https://www.gnu.org/licenses/>.
 8 | #
 9 | #  Alternatively, you can license this software under a proprietary
10 | #  license, as set out in <https://railgunlabs.com/unicorn/license/>.
11 | 
12 | from typing import Dict
13 | import zipfile
14 | import json
15 | 
16 | from .config import Config
17 | from .tools import Codespace
18 | from .feature import Feature
19 | from .basics import StorageSize, SerializationData
20 | 
21 | class CanonicalCombiningClass(Feature):
22 |     def pre_process(self, codespace: Codespace) -> None:
23 |         codespace.register_property("canonical_combining_class", 0, StorageSize.UINT8)
24 | 
25 |     def process(self, archive: zipfile.ZipFile, codespace: Codespace, config: Config) -> SerializationData:
26 |         file = archive.open('ccc.json')
27 |         mappings: Dict[str,int] = json.loads(file.read())["ccc"]
28 |         file.close()
29 | 
30 |         ccc_property = codespace.get("canonical_combining_class")
31 |         for cp, ccc in mappings.items():
32 |             codespace.set(int(cp,16), ccc_property, ccc)
33 | 
34 |         public_header = "#define UNICORN_FEATURE_CCC\n"
35 |         return SerializationData(public_header=public_header)
36 | 


--------------------------------------------------------------------------------
/man/uni_trust.3:
--------------------------------------------------------------------------------
 1 | .TH "UNICORN" "3" "Dec 6th 2025" "Unicorn 1.3.1"
 2 | .SH NAME
 3 | UNI_TRUST \- well-formed text
 4 | .SH LIBRARY
 5 | Embeddable Unicode Algorithms (libunicorn, -lunicorn)
 6 | .SH SYNOPSIS
 7 | .nf
 8 | .B #include <unicorn.h>
 9 | .PP
10 | .B #define UNI_TRUST 0x40u
11 | .fi
12 | .SH DESCRIPTION
13 | Text attribute bit flag that indicates the encoded text is well-formed.
14 | .PP
15 | Under normal usage Unicorn processes text with the assumption that it might be malformed.
16 | This means it performs safety checks which are redundant if the text has already been validated.
17 | The purpose of this flag is to inform Unicorn that it can skip redundant validation checks thereby improving performance.
18 | .PP
19 | Usage of this flag means functions will never return \f[B]UNI_BAD_ENCODING\f[R].
20 | .PP
21 | Some functions, like \f[B]uni_norm\f[R](3), accept an input buffer and an output buffer.
22 | Using this flag with an output buffer will produce \f[B]UNI_BAD_OPERATION\f[R].
23 | .SH SEE ALSO
24 | .BR unistat (3),
25 | .BR uni_norm (3),
26 | .BR uni_validate (3)
27 | .SH AUTHOR
28 | .UR https://railgunlabs.com
29 | Railgun Labs
30 | .UE .
31 | .SH INTERNET RESOURCES
32 | The online documentation is published on the
33 | .UR https://railgunlabs.com/unicorn
34 | Railgun Labs website
35 | .UE .
36 | .SH LICENSING
37 | Unicorn is distributed with its end-user license agreement (EULA).
38 | Please review the agreement for information on terms & conditions for accessing or otherwise using Unicorn and for a DISCLAIMER OF ALL WARRANTIES.
39 | 


--------------------------------------------------------------------------------
/man/unimemfunc.3:
--------------------------------------------------------------------------------
 1 | .TH "UNICORN" "3" "Dec 6th 2025" "Unicorn 1.3.1"
 2 | .SH NAME
 3 | unimemfunc \- memory allocation function
 4 | .SH LIBRARY
 5 | Embeddable Unicode Algorithms (libunicorn, -lunicorn)
 6 | .SH SYNOPSIS
 7 | .nf
 8 | .B #include <unicorn.h>
 9 | .PP
10 | .BI "typedef void *(*unimemfunc)(void *" user_data ", void *" ptr ", size_t " old_size ", size_t " new_size ");"
11 | .fi
12 | .SH DESCRIPTION
13 | Defines the function signature for a custom memory management implementation.
14 | The memory manager is responsible for allocating, reallocating, and freeing memory.
15 | The implementation must behave like \f[B]realloc\f[R](3) or \f[B]free\f[R](3) depending upon its arguments.
16 | .PP
17 | When \f[I]new_size\f[R] is non-zero, the allocator must behave like \f[B]realloc\f[R](3).
18 | If \f[I]ptr\f[R] is NULL and \f[I]old_size\f[R] is zero, the allocator must behave like \f[B]malloc\f[R](3).
19 | .PP
20 | When \f[I]new_size\f[R] is zero, the allocator must behave like \f[B]free\f[R](3).
21 | It must also return NULL to the caller.
22 | .SH SEE ALSO
23 | .BR uni_setmemfunc (3)
24 | .SH AUTHOR
25 | .UR https://railgunlabs.com
26 | Railgun Labs
27 | .UE .
28 | .SH INTERNET RESOURCES
29 | The online documentation is published on the
30 | .UR https://railgunlabs.com/unicorn
31 | Railgun Labs website
32 | .UE .
33 | .SH LICENSING
34 | Unicorn is distributed with its end-user license agreement (EULA).
35 | Please review the agreement for information on terms & conditions for accessing or otherwise using Unicorn and for a DISCLAIMER OF ALL WARRANTIES.
36 | 


--------------------------------------------------------------------------------
/scripts/feature.py:
--------------------------------------------------------------------------------
 1 | #  Unicorn - Embeddable Unicode Algorithms
 2 | #  Copyright (c) 2024-2025 Railgun Labs
 3 | #
 4 | #  This software is dual-licensed: you can redistribute it and/or modify
 5 | #  it under the terms of the GNU General Public License version 3 as
 6 | #  published by the Free Software Foundation. For the terms of this
 7 | #  license, see <https://www.gnu.org/licenses/>.
 8 | #
 9 | #  Alternatively, you can license this software under a proprietary
10 | #  license, as set out in <https://railgunlabs.com/unicorn/license/>.
11 | 
12 | from typing import Set, Tuple, Type
13 | import zipfile
14 | 
15 | from .basics import StorageSize, SerializationData
16 | from .tools import Codespace
17 | from .config import Config
18 | 
19 | class Feature(object):
20 |     def __init__(self) -> None:
21 |         self.total_size = 0
22 |     def pre_process(self, codespace: Codespace) -> None:
23 |         pass
24 |     def process(self, archive: zipfile.ZipFile, codespace: Codespace, config: Config) -> SerializationData:
25 |         return SerializationData()
26 |     def post_process(self, archive: zipfile.ZipFile, codespace: Codespace, config: Config) -> SerializationData:
27 |         return SerializationData()
28 |     @staticmethod
29 |     def dependencies() -> Set[Type['Feature']]:
30 |         return set()
31 | 
32 | FLAGS_PROPERTY: Tuple[str, int, StorageSize] = ("flags", 0, StorageSize.UINT8)
33 | # Single-bit masks. NOTE(review): presumably OR-ed into the "flags" property above -- confirm.
34 | IS_COMPOSABLE = 0x1
35 | IS_CASED = 0x2
36 | IS_CASE_IGNORABLE = 0x4
37 | IS_NORMALIZATION_NEEDED = 0x8
38 | IS_CHANGES_WHEN_CASEFOLDED = 0x10
39 | 


--------------------------------------------------------------------------------
/examples/segment_text.c:
--------------------------------------------------------------------------------
 1 | /*
 2 |  *  This file is public domain.
 3 |  *  You may use, modify, and distribute it without restriction.
 4 |  */
 5 | 
 6 | // Examples are also documented online at:
 7 | // <https://railgunlabs.com/unicorn/manual/code-examples/>.
 8 | 
 9 | #include <unicorn.h>
10 | #include <stdio.h>
11 | 
12 | int main(int argc, char *argv[])
13 | {
14 |     // This example prints the code unit indices of the extended grapheme clusters in the string.
15 |     // A grapheme cluster is a user-perceived character. It corresponds to a single graphical
16 |     // character on the end-user's display. A grapheme can consist of multiple code points.
17 | 
18 |     // Change 'UNI_GRAPHEME' to 'UNI_WORD' or 'UNI_SENTENCE' and observe how it affects the
19 |     // printed break positions.
20 | 
21 |     const char str[] = u8"👨🏼‍🚀👨🏽‍🚀 landed on the 🌕";
22 |     unisize break_at = 0; // Passed by address to uni_nextbrk() on every call.
23 |     for (;;)
24 |     {
25 |         unistat s = uni_nextbrk(UNI_GRAPHEME, str, -1, UNI_UTF8, &break_at);
26 |         if (s == UNI_OK)
27 |         {
28 |             printf("%d\n", (int)break_at); // Cast so the argument always matches %d.
29 |         }
30 |         else if (s == UNI_DONE)
31 |         {
32 |             break;
33 |         }
34 |         else if (s == UNI_BAD_ENCODING)
35 |         {
36 |             printf("malformed character at %d\n", (int)break_at); // Cast to match %d.
37 |             return 1;
38 |         }
39 |         else
40 |         {
41 |             puts("an internal error occurred");
42 |             return 2; // Something else went wrong (check the status code).
43 |         }
44 |     }
45 |     return 0;
46 | }
47 | 


--------------------------------------------------------------------------------
/scripts/general_category.py:
--------------------------------------------------------------------------------
 1 | #  Unicorn - Embeddable Unicode Algorithms
 2 | #  Copyright (c) 2024-2025 Railgun Labs
 3 | #
 4 | #  This software is dual-licensed: you can redistribute it and/or modify
 5 | #  it under the terms of the GNU General Public License version 3 as
 6 | #  published by the Free Software Foundation. For the terms of this
 7 | #  license, see <https://www.gnu.org/licenses/>.
 8 | #
 9 | #  Alternatively, you can license this software under a proprietary
10 | #  license, as set out in <https://railgunlabs.com/unicorn/license/>.
11 | 
12 | from typing import Dict
13 | import zipfile
14 | import json
15 | 
16 | from .config import Config
17 | from .tools import Codespace
18 | from .feature import Feature
19 | from .basics import StorageSize, SerializationData
20 | 
21 | class GeneralCategoryProperty(Feature):
22 |     def pre_process(self, codespace: Codespace) -> None:
23 |         codespace.register_property("general_category", 29, StorageSize.UINT8)
24 | 
25 |     def process(self, archive: zipfile.ZipFile, codespace: Codespace, config: Config) -> SerializationData:
26 |         file = archive.open('gc.json')
27 |         data = json.loads(file.read())
28 |         characters: Dict[str,int] = data["characters"]
29 |         file.close()
30 | 
31 |         gc_property = codespace.get("general_category")
32 |         for cp, value in characters.items():
33 |             codespace.set(int(cp,16), gc_property, value)
34 | 
35 |         public_header = "#define UNICORN_FEATURE_GC\n"
36 |         return SerializationData(public_header=public_header)
37 | 


--------------------------------------------------------------------------------
/features.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "version": "1.0",
 3 |     "endian": "native",
 4 |     "hasStandardAllocators": true,
 5 |     "characterStorage": "uint32_t",
 6 |     "optimizeFor": "speed",
 7 |     "stackBufferSize": 32,
 8 |     "algorithms": {
 9 |         "normalization": [
10 |             "NFC",
11 |             "NFD"
12 |         ],
13 |         "normalizationQuickCheck": true,
14 |         "caseConversion": [
15 |             "lower",
16 |             "title",
17 |             "upper"
18 |         ],
19 |         "caseFolding": [
20 |             "default",
21 |             "canonical"
22 |         ],
23 |         "segmentation": [
24 |             "grapheme",
25 |             "word",
26 |             "sentence"
27 |         ],
28 |         "compression": true,
29 |         "collation": true
30 |     },
31 |     "characterProperties": [
32 |         "Alphabetic",
33 |         "Canonical_Combining_Class",
34 |         "Dash",
35 |         "Diacritic",
36 |         "Extender",
37 |         "General_Category",
38 |         "Hex_Digit",
39 |         "Ideographic",
40 |         "Lowercase",
41 |         "Math",
42 |         "Noncharacter_Code_Point",
43 |         "Numeric_Value",
44 |         "Quotation_Mark",
45 |         "Simple_Lowercase_Mapping",
46 |         "Simple_Titlecase_Mapping",
47 |         "Simple_Uppercase_Mapping",
48 |         "Terminal_Punctuation",
49 |         "Unified_Ideograph",
50 |         "Uppercase",
51 |         "White_Space"
52 |     ],
53 |     "encodingForms": [
54 |         "UTF-8",
55 |         "UTF-16",
56 |         "UTF-32"
57 |     ]
58 | }
59 | 


--------------------------------------------------------------------------------
/man/uni_normchk.3:
--------------------------------------------------------------------------------
 1 | .TH "UNICORN" "3" "Dec 6th 2025" "Unicorn 1.3.1"
 2 | .SH NAME
 3 | uni_normchk \- normalization check
 4 | .SH LIBRARY
 5 | Embeddable Unicode Algorithms (libunicorn, -lunicorn)
 6 | .SH SYNOPSIS
 7 | .nf
 8 | .B #include <unicorn.h>
 9 | .PP
10 | .BI "unistat uni_normchk(uninormform " form ", const void *" text ", unisize " text_len ", uniattr " text_attr ", bool *" result ");"
11 | .fi
12 | .SH DESCRIPTION
13 | This function checks if \f[I]text\f[R] is normalized.
14 | It's more "expensive" than \f[B]uni_normqchk\f[R](3), but returns a definitive yes/no answer.
15 | .SH RETURN VALUE
16 | .TP
17 | UNI_OK
18 | On success.
19 | .TP
20 | UNI_BAD_OPERATION
21 | If \f[I]text\f[R] or \f[I]result\f[R] is null.
22 | .TP
23 | UNI_BAD_ENCODING
24 | If \f[I]text\f[R] is malformed; this is never returned if \f[I]text_attr\f[R] has \f[B]UNI_TRUST\f[R](3).
25 | .TP
26 | UNI_FEATURE_DISABLED
27 | If Unicorn was built without support for \f[I]form\f[R].
28 | .SH SEE ALSO
29 | .BR uni_normqchk (3),
30 | .BR uninormchk (3),
31 | .BR UNI_TRUST (3),
32 | .BR uninormform (3),
33 | .BR unisize (3),
34 | .BR uniattr (3)
35 | .SH AUTHOR
36 | .UR https://railgunlabs.com
37 | Railgun Labs
38 | .UE .
39 | .SH INTERNET RESOURCES
40 | The online documentation is published on the
41 | .UR https://railgunlabs.com/unicorn
42 | Railgun Labs website
43 | .UE .
44 | .SH LICENSING
45 | Unicorn is distributed with its end-user license agreement (EULA).
46 | Please review the agreement for information on terms & conditions for accessing or otherwise using Unicorn and for a DISCLAIMER OF ALL WARRANTIES.
47 | 


--------------------------------------------------------------------------------
/scripts/encoding.py:
--------------------------------------------------------------------------------
 1 | #  Unicorn - Embeddable Unicode Algorithms
 2 | #  Copyright (c) 2024-2025 Railgun Labs
 3 | #
 4 | #  This software is dual-licensed: you can redistribute it and/or modify
 5 | #  it under the terms of the GNU General Public License version 3 as
 6 | #  published by the Free Software Foundation. For the terms of this
 7 | #  license, see <https://www.gnu.org/licenses/>.
 8 | #
 9 | #  Alternatively, you can license this software under a proprietary
10 | #  license, as set out in <https://railgunlabs.com/unicorn/license/>.
11 | 
12 | import zipfile
13 | 
14 | from .config import Config
15 | from .tools import Codespace
16 | from .feature import Feature
17 | from .basics import SerializationData
18 | 
19 | class EncodingUTF8(Feature):
20 |     def process(self, archive: zipfile.ZipFile, codespace: Codespace, config: Config) -> SerializationData:
21 |         public_header = "#define UNICORN_FEATURE_ENCODING_UTF8\n"
22 |         return SerializationData(public_header=public_header)
23 |     
24 | class EncodingUTF16(Feature):
25 |     def process(self, archive: zipfile.ZipFile, codespace: Codespace, config: Config) -> SerializationData:
26 |         public_header = "#define UNICORN_FEATURE_ENCODING_UTF16\n"
27 |         return SerializationData(public_header=public_header)
28 |     
29 | class EncodingUTF32(Feature):
30 |     def process(self, archive: zipfile.ZipFile, codespace: Codespace, config: Config) -> SerializationData:
31 |         public_header = "#define UNICORN_FEATURE_ENCODING_UTF32\n"
32 |         return SerializationData(public_header=public_header)
33 | 


--------------------------------------------------------------------------------
/man/uninormform.3:
--------------------------------------------------------------------------------
 1 | .TH "UNICORN" "3" "Dec 6th 2025" "Unicorn 1.3.1"
 2 | .SH NAME
 3 | uninormform \- unicode normalization forms
 4 | .SH LIBRARY
 5 | Embeddable Unicode Algorithms (libunicorn, -lunicorn)
 6 | .SH SYNOPSIS
 7 | .nf
 8 | .B #include <unicorn.h>
 9 | .PP
10 | .B enum uninormform {
11 | .RS
12 | .B UNI_NFC,
13 | .B UNI_NFD,
14 | .RE
15 | .B };
16 | .fi
17 | .SH DESCRIPTION
18 | Normalization forms define how strings should be transformed for the purposes of testing equivalence.
19 | .SH CONSTANTS
20 | .TP
21 | .BR UNI_NFC
22 | Performs Canonical Decomposition followed by Canonical Composition.
23 | .IP
24 | Support for canonical composition can be enabled in the JSON configuration file.
25 | .IP
26 | .in +4n
27 | .EX
28 | {
29 |     "algorithms": {
30 |         "normalization": ["NFC"]
31 |     }
32 | }
33 | .EE
34 | .in
35 | .TP
36 | .BR UNI_NFD
37 | Performs Canonical Decomposition.
38 | .IP
39 | Support for canonical decomposition can be enabled in the JSON configuration file.
40 | .IP
41 | .in +4n
42 | .EX
43 | {
44 |     "algorithms": {
45 |         "normalization": ["NFD"]
46 |     }
47 | }
48 | .EE
49 | .in
50 | .SH AUTHOR
51 | .UR https://railgunlabs.com
52 | Railgun Labs
53 | .UE .
54 | .SH INTERNET RESOURCES
55 | The online documentation is published on the
56 | .UR https://railgunlabs.com/unicorn
57 | Railgun Labs website
58 | .UE .
59 | .SH LICENSING
60 | Unicorn is distributed with its end-user license agreement (EULA).
61 | Please review the agreement for information on terms & conditions for accessing or otherwise using Unicorn and for a DISCLAIMER OF ALL WARRANTIES.
62 | 


--------------------------------------------------------------------------------
/man/unichar.3:
--------------------------------------------------------------------------------
 1 | .TH "UNICORN" "3" "Dec 6th 2025" "Unicorn 1.3.1"
 2 | .SH NAME
 3 | unichar \- code point
 4 | .SH LIBRARY
 5 | Embeddable Unicode Algorithms (libunicorn, -lunicorn)
 6 | .SH SYNOPSIS
 7 | .nf
 8 | .B #include <unicorn.h>
 9 | .PP
10 | .BI "typedef uint32_t unichar;"
11 | .fi
12 | .SH DESCRIPTION
13 | This integer represents a Unicode code point or Unicode scalar value depending on how it's used.
14 | .PP
15 | The integer type used as the underlying storage for this type is configurable.
16 | This helps simplify integration with existing applications which may already have defined their own Unicode character type.
17 | The type chosen can be signed or unsigned, but it must be large enough to accommodate the entire Unicode character repertoire.
18 | As of Unicode 17.0, that means an integer with at least 21 bits of storage.
19 | .PP
20 | The snippet below demonstrates how to redefine \f[B]unichar\f[R] as a signed 64-bit integer.
21 | Using a 64-bit integer is wasteful, as you only need 21 bits; it was chosen for example purposes only.
22 | .PP
23 | .in +4n
24 | .EX
25 | {
26 |     "characterStorage": "int64_t",
27 | }
28 | .EE
29 | .in
30 | .SH AUTHOR
31 | .UR https://railgunlabs.com
32 | Railgun Labs
33 | .UE .
34 | .SH INTERNET RESOURCES
35 | The online documentation is published on the
36 | .UR https://railgunlabs.com/unicorn
37 | Railgun Labs website
38 | .UE .
39 | .SH LICENSING
40 | Unicorn is distributed with its end-user license agreement (EULA).
41 | Please review the agreement for information on terms & conditions for accessing or otherwise using Unicorn and for a DISCLAIMER OF ALL WARRANTIES.
42 | 


--------------------------------------------------------------------------------
/src/charvec.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  *  Unicorn - Embeddable Unicode Algorithms
 3 |  *  Copyright (c) 2024-2025 Railgun Labs
 4 |  *
 5 |  *  This software is dual-licensed: you can redistribute it and/or modify
 6 |  *  it under the terms of the GNU General Public License version 3 as
 7 |  *  published by the Free Software Foundation. For the terms of this
 8 |  *  license, see <https://www.gnu.org/licenses/>.
 9 |  *
10 |  *  Alternatively, you can license this software under a proprietary
11 |  *  license, as set out in <https://railgunlabs.com/unicorn/license/>.
12 |  */
13 | 
14 | // Implements a heap-allocated growable character buffer.
15 | 
16 | #ifndef CHARVEC_H
17 | #define CHARVEC_H
18 | 
19 | #include "common.h"
20 | 
21 | struct CharVec // Growable buffer of 'unichar' values.
22 | {
23 |     unisize length; // NOTE(review): presumably the number of characters currently stored -- confirm
24 |     unisize capacity; // NOTE(review): presumably the number of character slots available -- confirm
25 |     unichar *chars; // Pointer to the character storage.
26 |     unichar scratch[UNICORN_STACK_BUFFER_SIZE]; // Inline array; NOTE(review): presumably used before any heap allocation -- confirm
27 | };
28 | 
29 | void uni_charvec_init(struct CharVec *buffer);
30 | void uni_charvec_free(struct CharVec *buffer);
31 | void uni_charvec_reset(struct CharVec *buffer);
32 | unistat uni_charvec_append(struct CharVec *buffer, const unichar *chars, unisize chars_count);
33 | unistat uni_charvec_reserve(struct CharVec *buffer, unisize new_capacity);
34 | void uni_charvec_remove(struct CharVec *buf, unisize i);
35 | void uni_charvec_append_unsafe(struct CharVec *cb, const unichar *chars, unisize chars_count);
36 | 
37 | static inline unisize uni_charvec_length(const struct CharVec *buffer)
38 | {
39 |     return buffer->length; // Plain read of the 'length' field; 'buffer' is dereferenced, so it must not be null.
40 | }
41 | 
42 | static inline unisize uni_charvec_capacity(const struct CharVec *buffer)
43 | {
44 |     return buffer->capacity; // Plain read of the 'capacity' field; 'buffer' is dereferenced, so it must not be null.
45 | }
46 | 
47 | #endif // CHARVEC_H
48 | 


--------------------------------------------------------------------------------
/man/uni_gc.3:
--------------------------------------------------------------------------------
 1 | .TH "UNICORN" "3" "Dec 6th 2025" "Unicorn 1.3.1"
 2 | .SH NAME
 3 | uni_gc \- general category
 4 | .SH LIBRARY
 5 | Embeddable Unicode Algorithms (libunicorn, -lunicorn)
 6 | .SH SYNOPSIS
 7 | .nf
 8 | .B #include <unicorn.h>
 9 | .PP
10 | .BI "unigc uni_gc(unichar " c ");"
11 | .fi
12 | .SH DESCRIPTION
13 | This function returns the \f[C]General_Category\f[R] character property for the code point \f[I]c\f[R].
14 | This property describes a character's general classification.
15 | For example, the Latin Capital Letter A (\f[C]U+0041\f[R]) has a general category value of \f[B]UNI_UPPERCASE_LETTER\f[R] which indicates it's an uppercase letter.
16 | .PP
17 | For a complete list of all possible general category values, see \f[B]unigc\f[R](3).
18 | .PP
19 | Support for the \f[C]General_Category\f[R] character property must be enabled in the JSON configuration file otherwise the function will always return \f[B]UNI_UNASSIGNED\f[R].
20 | .PP
21 | .in +4n
22 | .EX
23 | {
24 |     "characterProperties": [
25 |         "General_Category"
26 |     ]
27 | }
28 | .EE
29 | .in
30 | .SH RETURN VALUE
31 | The general category of \f[I]c\f[R].
32 | .SH SEE ALSO
33 | .BR unigc (3),
34 | .BR unichar (3)
35 | .SH AUTHOR
36 | .UR https://railgunlabs.com
37 | Railgun Labs
38 | .UE .
39 | .SH INTERNET RESOURCES
40 | The online documentation is published on the
41 | .UR https://railgunlabs.com/unicorn
42 | Railgun Labs website
43 | .UE .
44 | .SH LICENSING
45 | Unicorn is distributed with its end-user license agreement (EULA).
46 | Please review the agreement for information on terms & conditions for accessing or otherwise using Unicorn and for a DISCLAIMER OF ALL WARRANTIES.
47 | 


--------------------------------------------------------------------------------
/man/uni_prevbrk.3:
--------------------------------------------------------------------------------
 1 | .TH "UNICORN" "3" "Dec 6th 2025" "Unicorn 1.3.1"
 2 | .SH NAME
 3 | uni_prevbrk \- compute preceding boundary
 4 | .SH LIBRARY
 5 | Embeddable Unicode Algorithms (libunicorn, -lunicorn)
 6 | .SH SYNOPSIS
 7 | .nf
 8 | .B #include <unicorn.h>
 9 | .PP
10 | .BI "unistat uni_prevbrk(unibreak " boundary ", const void *" text ", unisize " text_len ", uniattr " text_attr ", unisize *" index ");"
11 | .fi
12 | .SH DESCRIPTION
13 | This function computes the preceding \f[I]boundary\f[R] for \f[I]text\f[R] starting from a known code point specified by code unit \f[I]index\f[R].
14 | The implementation sets \f[I]index\f[R] to the code unit offset of the preceding \f[I]boundary\f[R].
15 | .SH RETURN VALUE
16 | .TP
17 | UNI_OK
18 | If the break iterator was successfully repositioned.
19 | .TP
20 | UNI_DONE
21 | If \f[I]index\f[R] is at the beginning of \f[I]text\f[R].
22 | .TP
23 | UNI_BAD_OPERATION
24 | If \f[I]text\f[R] or \f[I]index\f[R] is null.
25 | .TP
26 | UNI_BAD_ENCODING
27 | If \f[I]text\f[R] is malformed; this is never returned if \f[I]text_attr\f[R] has \f[B]UNI_TRUST\f[R](3).
28 | .SH SEE ALSO
29 | .BR UNI_TRUST (3),
30 | .BR unibreak (3),
31 | .BR unisize (3),
32 | .BR uniattr (3)
33 | .SH AUTHOR
34 | .UR https://railgunlabs.com
35 | Railgun Labs
36 | .UE .
37 | .SH INTERNET RESOURCES
38 | The online documentation is published on the
39 | .UR https://railgunlabs.com/unicorn
40 | Railgun Labs website
41 | .UE .
42 | .SH LICENSING
43 | Unicorn is distributed with its end-user license agreement (EULA).
44 | Please review the agreement for information on terms & conditions for accessing or otherwise using Unicorn and for a DISCLAIMER OF ALL WARRANTIES.
45 | 


--------------------------------------------------------------------------------
/man/uni_validate.3:
--------------------------------------------------------------------------------
 1 | .TH "UNICORN" "3" "Dec 6th 2025" "Unicorn 1.3.1"
 2 | .SH NAME
 3 | uni_validate \- validate text
 4 | .SH LIBRARY
 5 | Embeddable Unicode Algorithms (libunicorn, -lunicorn)
 6 | .SH SYNOPSIS
 7 | .nf
 8 | .B #include <unicorn.h>
 9 | .PP
10 | .BI "unistat uni_validate(const void *" text ", unisize " text_len ", uniattr " text_attr ");"
11 | .fi
12 | .SH DESCRIPTION
13 | This function checks if \f[I]text\f[R] is well-formed.
14 | The length of \f[I]text\f[R] is given in code units by \f[I]text_len\f[R] and its encoding form is specified by \f[I]text_attr\f[R].
15 | .PP
16 | The \f[B]UNI_TRUST\f[R](3) flag has no effect when used with \f[I]text_attr\f[R] as the entire purpose of this function is to verify \f[I]text\f[R] is well-formed and assuming it is well-formed would defeat the purpose.
17 | .SH RETURN VALUE
18 | .TP
19 | UNI_OK
20 | If \f[I]text\f[R] is well-formed.
21 | .TP
22 | UNI_BAD_OPERATION
23 | If \f[I]text\f[R] is null.
24 | .TP
25 | UNI_BAD_ENCODING
26 | If the encoding of \f[I]text\f[R] is malformed.
27 | .TP
28 | UNI_FEATURE_DISABLED
29 | If the encoding form flagged in \f[I]text_attr\f[R] is disabled.
30 | .SH SEE ALSO
31 | .BR UNI_TRUST (3),
32 | .BR unisize (3),
33 | .BR uniattr (3)
34 | .SH AUTHOR
35 | .UR https://railgunlabs.com
36 | Railgun Labs
37 | .UE .
38 | .SH INTERNET RESOURCES
39 | The online documentation is published on the
40 | .UR https://railgunlabs.com/unicorn
41 | Railgun Labs website
42 | .UE .
43 | .SH LICENSING
44 | Unicorn is distributed with its end-user license agreement (EULA).
45 | Please review the agreement for information on terms & conditions for accessing or otherwise using Unicorn and for a DISCLAIMER OF ALL WARRANTIES.
46 | 


--------------------------------------------------------------------------------
/man/uni_nulify.3:
--------------------------------------------------------------------------------
 1 | .TH "UNICORN" "3" "Dec 6th 2025" "Unicorn 1.3.1"
 2 | .SH NAME
 3 | UNI_NULIFY \- null-terminated output
 4 | .SH LIBRARY
 5 | Embeddable Unicode Algorithms (libunicorn, -lunicorn)
 6 | .SH SYNOPSIS
 7 | .nf
 8 | .B #include <unicorn.h>
 9 | .PP
10 | .B #define UNI_NULIFY 0x80u
11 | .fi
12 | .SH DESCRIPTION
13 | Text attribute bit flag that indicates that the output buffer must be null-terminated by the implementation.
14 | .PP
15 | Some functions, like \f[B]uni_norm\f[R](3), accept an input buffer and an output buffer.
16 | The output buffer is the destination buffer that the implementation writes encoded characters to.
17 | When this flag is applied to the output buffer the implementation guarantees a NUL character (U+0000) is appended, even if it means truncating the last non-null character.
18 | Conceptually, this behavior is like the BSD function \f[C]strlcpy\f[R] which also guarantees the destination buffer is null-terminated.
19 | .PP
20 | The implementation will never append an extraneous null character if the output buffer is already null-terminated.
21 | .PP
22 | Using this flag with an input buffer will produce \f[B]UNI_BAD_OPERATION\f[R].
23 | .SH SEE ALSO
24 | .BR uni_norm (3),
25 | .BR unistat (3)
26 | .SH AUTHOR
27 | .UR https://railgunlabs.com
28 | Railgun Labs
29 | .UE .
30 | .SH INTERNET RESOURCES
31 | The online documentation is published on the
32 | .UR https://railgunlabs.com/unicorn
33 | Railgun Labs website
34 | .UE .
35 | .SH LICENSING
36 | Unicorn is distributed with its end-user license agreement (EULA).
37 | Please review the agreement for information on terms & conditions for accessing or otherwise using Unicorn and for a DISCLAIMER OF ALL WARRANTIES.
38 | 


--------------------------------------------------------------------------------
/scripts/numeric_value.py:
--------------------------------------------------------------------------------
 1 | #  Unicorn - Embeddable Unicode Algorithms
 2 | #  Copyright (c) 2024-2025 Railgun Labs
 3 | #
 4 | #  This software is dual-licensed: you can redistribute it and/or modify
 5 | #  it under the terms of the GNU General Public License version 3 as
 6 | #  published by the Free Software Foundation. For the terms of this
 7 | #  license, see <https://www.gnu.org/licenses/>.
 8 | #
 9 | #  Alternatively, you can license this software under a proprietary
10 | #  license, as set out in <https://railgunlabs.com/unicorn/license/>.
11 | 
12 | from typing import Dict
13 | import zipfile
14 | import json
15 | 
16 | from .config import Config
17 | from .tools import Codespace
18 | from .feature import Feature
19 | from .basics import StorageSize, SerializationData
20 | 
21 | class NumericValueProperty(Feature):
22 |     def pre_process(self, codespace: Codespace) -> None:
23 |         codespace.register_property("numeric_value_offset", 0, StorageSize.UINT8)
24 | 
25 |     def process(self, archive: zipfile.ZipFile, codespace: Codespace, config: Config) -> SerializationData:
26 |         file = archive.open('numeric_values.json')
27 |         data = json.loads(file.read())
28 |         mappings: Dict[str,int] = data["mappings"]
29 |         source: str = data["source"]
30 |         header: str = data["header"]
31 |         size: int = data["size"]
32 |         file.close()
33 | 
34 |         numval_property = codespace.get("numeric_value_offset")
35 |         for cp, index in mappings.items():
36 |             codespace.set(int(cp,16), numval_property, index)
37 | 
38 |         public_header = "#define UNICORN_FEATURE_NUMERIC_VALUE\n"
39 |         return SerializationData(source, header, public_header, size)
40 | 


--------------------------------------------------------------------------------
/examples/Makefile.am:
--------------------------------------------------------------------------------
 1 | AUTOMAKE_OPTIONS = subdir-objects
 2 | 
 3 | EXTRA_DIST = CMakeLists.txt
 4 | 
 5 | if BUILD_EXAMPLES
 6 | # Link and include settings reused by every example program declared below.
 7 | PROG_LDADD = ../src/libunicorn.la
 8 | PROG_CFLAGS = -I$(top_srcdir)/include -I$(abs_top_builddir)/src
 9 | # One program per example source; a hyphen in a program name becomes an underscore in its variables.
10 | bin_PROGRAMS = caseconv casefold charprops collate compare compress decode-utf8 decode-utf16 decode-utf32 encode-char segmentation
11 | 
12 | caseconv_SOURCES = case_convert.c
13 | caseconv_LDADD = $(PROG_LDADD)
14 | caseconv_CFLAGS = $(PROG_CFLAGS)
15 | 
16 | casefold_SOURCES = case_fold.c
17 | casefold_LDADD = $(PROG_LDADD)
18 | casefold_CFLAGS = $(PROG_CFLAGS)
19 | 
20 | charprops_SOURCES = character_properties.c
21 | charprops_LDADD = $(PROG_LDADD)
22 | charprops_CFLAGS = $(PROG_CFLAGS)
23 | 
24 | collate_SOURCES = collate_text.c
25 | collate_LDADD = $(PROG_LDADD)
26 | collate_CFLAGS = $(PROG_CFLAGS)
27 | 
28 | compare_SOURCES = compare_text.c
29 | compare_LDADD = $(PROG_LDADD)
30 | compare_CFLAGS = $(PROG_CFLAGS)
31 | 
32 | compress_SOURCES = compress_text.c
33 | compress_LDADD = $(PROG_LDADD)
34 | compress_CFLAGS = $(PROG_CFLAGS)
35 | 
36 | decode_utf8_SOURCES = decode_utf8.c
37 | decode_utf8_LDADD = $(PROG_LDADD)
38 | decode_utf8_CFLAGS = $(PROG_CFLAGS)
39 | 
40 | decode_utf16_SOURCES = decode_utf16.c
41 | decode_utf16_LDADD = $(PROG_LDADD)
42 | decode_utf16_CFLAGS = $(PROG_CFLAGS)
43 | 
44 | decode_utf32_SOURCES = decode_utf32.c
45 | decode_utf32_LDADD = $(PROG_LDADD)
46 | decode_utf32_CFLAGS = $(PROG_CFLAGS)
47 | 
48 | encode_char_SOURCES = encode_character.c
49 | encode_char_LDADD = $(PROG_LDADD)
50 | encode_char_CFLAGS = $(PROG_CFLAGS)
51 | 
52 | segmentation_SOURCES = segment_text.c
53 | segmentation_LDADD = $(PROG_LDADD)
54 | segmentation_CFLAGS = $(PROG_CFLAGS)
55 | 
56 | endif
57 | 


--------------------------------------------------------------------------------
/man/uninormchk.3:
--------------------------------------------------------------------------------
 1 | .TH "UNICORN" "3" "Dec 6th 2025" "Unicorn 1.3.1"
 2 | .SH NAME
 3 | uninormchk \- quick check constants
 4 | .SH LIBRARY
 5 | Embeddable Unicode Algorithms (libunicorn, -lunicorn)
 6 | .SH SYNOPSIS
 7 | .nf
 8 | .B #include <unicorn.h>
 9 | .PP
10 | .B enum uninormchk {
11 | .RS
12 | .B UNI_YES,
13 | .B UNI_MAYBE,
14 | .B UNI_NO,
15 | .RE
16 | .B };
17 | .fi
18 | .SH DESCRIPTION
19 | These properties indicate whether a character can or cannot appear in a given normalization form.
20 | .PP
21 | When used with \f[B]uni_normqchk\f[R](3) these properties indicate whether some input string is possibly in the desired normalization form.
22 | This may make it possible to bypass the more time-consuming call to run the complete Unicode Normalization Algorithm.
23 | .PP
24 | Users can alternatively use \f[B]uni_normchk\f[R](3) which will definitively check if text is normalized.
25 | .SH CONSTANTS
26 | .TP
27 | .BR UNI_YES
28 | Characters that can occur in the respective normalization form.
29 | .TP
30 | .BR UNI_MAYBE
31 | Characters that may occur in the respective normalization form, depending on the context.
32 | .TP
33 | .BR UNI_NO
34 | Characters that cannot occur in the respective normalization form.
35 | .SH SEE ALSO
36 | .BR uni_normqchk (3),
37 | .BR uni_normchk (3)
38 | .SH AUTHOR
39 | .UR https://railgunlabs.com
40 | Railgun Labs
41 | .UE .
42 | .SH INTERNET RESOURCES
43 | The online documentation is published on the
44 | .UR https://railgunlabs.com/unicorn
45 | Railgun Labs website
46 | .UE .
47 | .SH LICENSING
48 | Unicorn is distributed with its end-user license agreement (EULA).
49 | Please review the agreement for information on terms & conditions for accessing or otherwise using Unicorn and for a DISCLAIMER OF ALL WARRANTIES.
50 | 


--------------------------------------------------------------------------------
/src/charbuf.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  *  Unicorn - Embeddable Unicode Algorithms
 3 |  *  Copyright (c) 2024-2025 Railgun Labs
 4 |  *
 5 |  *  This software is dual-licensed: you can redistribute it and/or modify
 6 |  *  it under the terms of the GNU General Public License version 3 as
 7 |  *  published by the Free Software Foundation. For the terms of this
 8 |  *  license, see <https://www.gnu.org/licenses/>.
 9 |  *
10 |  *  Alternatively, you can license this software under a proprietary
11 |  *  license, as set out in <https://railgunlabs.com/unicorn/license/>.
12 |  */
13 | 
14 | #ifndef CHARBUF_H
15 | #define CHARBUF_H
16 | 
17 | #include "common.h"
18 | 
19 | struct CharBufImpl;
20 | 
21 | struct CharBuf // Output-buffer state; its operations are reached through 'impl'.
22 | {
23 |     void *storage; // Untyped destination memory; NOTE(review): presumably the caller's output buffer -- confirm
24 | 
25 |     // This is where the next character will be written.
26 |     // When finished this is the number of characters that needed to be
27 |     // written, but not necessarily how many were actually written.
28 |     unisize length;
29 | 
30 |     // Number of characters actually written.
31 |     unisize written;
32 | 
33 |     unisize capacity; // NOTE(review): presumably the size of 'storage' in code units -- confirm
34 |     unisize *result; // NOTE(review): presumably where the final length is reported to the caller -- confirm
35 |     bool null_terminate; // NOTE(review): presumably set when UNI_NULIFY was requested -- confirm
36 |     bool is_null_terminated; // NOTE(review): presumably records that a NUL has already been written -- confirm
37 |     const struct CharBufImpl *impl; // Table of operations (see struct CharBufImpl).
38 | };
39 | 
40 | struct CharBufImpl // Function-pointer table referenced by the 'impl' field of struct CharBuf.
41 | {
42 |     void (*append)(struct CharBuf *buf, const unichar *chars, unisize chars_count); // Receives 'chars_count' characters starting at 'chars'.
43 |     void (*nullterminate)(struct CharBuf *buf); // NOTE(review): presumably writes the terminating NUL -- confirm
44 | };
45 | 
46 | unistat uni_charbuf_init(struct CharBuf *buf, void *text, unisize *capacity, uniattr attributes);
47 | unistat uni_charbuf_finalize(struct CharBuf *buf);
48 | 
49 | void uni_charbuf_append(struct CharBuf *buf, const unichar *chars, unisize chars_count);
50 | void uni_charbuf_appendchar(struct CharBuf *buf, unichar ch);
51 | 
52 | #endif // CHARBUF_H
53 | 


--------------------------------------------------------------------------------
/man/uni_caseconvchk.3:
--------------------------------------------------------------------------------
 1 | .TH "UNICORN" "3" "Dec 6th 2025" "Unicorn 1.3.1"
 2 | .SH NAME
 3 | uni_caseconvchk \- check case status
 4 | .SH LIBRARY
 5 | Embeddable Unicode Algorithms (libunicorn, -lunicorn)
 6 | .SH SYNOPSIS
 7 | .nf
 8 | .B #include <unicorn.h>
 9 | .PP
10 | .BI "unistat uni_caseconvchk(unicaseconv " casing ", const void *" text ", unisize " text_len ", uniattr " text_attr ", bool *" result ");"
11 | .fi
12 | .SH DESCRIPTION
13 | This function checks if \f[I]text\f[R] is in casing form \f[I]casing\f[R].
14 | If it is, then the implementation writes \f[C]true\f[R] to \f[I]result\f[R] else it writes \f[C]false\f[R].
15 | .PP
16 | The text attributes of \f[I]text\f[R] are specified by \f[I]text_attr\f[R].
17 | If \f[I]text_len\f[R] is -1, then the implementation assumes \f[I]text\f[R] is null-terminated.
18 | .SH RETURN VALUE
19 | .TP
20 | UNI_OK
21 | On success.
22 | .TP
23 | UNI_BAD_OPERATION
24 | If \f[I]text\f[R] or \f[I]result\f[R] is null.
25 | .TP
26 | UNI_BAD_ENCODING
27 | If \f[I]text\f[R] is not well-formed (checks are omitted if \f[I]text_attr\f[R] has \f[B]UNI_TRUST\f[R](3)).
28 | .TP
29 | UNI_FEATURE_DISABLED
30 | If Unicorn was built without support for case conversion.
31 | .SH SEE ALSO
32 | .BR UNI_TRUST (3),
33 | .BR unicaseconv (3),
34 | .BR unisize (3),
35 | .BR uniattr (3)
36 | .SH AUTHOR
37 | .UR https://railgunlabs.com
38 | Railgun Labs
39 | .UE .
40 | .SH INTERNET RESOURCES
41 | The online documentation is published on the
42 | .UR https://railgunlabs.com/unicorn
43 | Railgun Labs website
44 | .UE .
45 | .SH LICENSING
46 | Unicorn is distributed with its end-user license agreement (EULA).
47 | Please review the agreement for information on terms & conditions for accessing or otherwise using Unicorn and for a DISCLAIMER OF ALL WARRANTIES.
48 | 


--------------------------------------------------------------------------------
/examples/case_convert.c:
--------------------------------------------------------------------------------
 1 | /*
 2 |  *  This file is public domain.
 3 |  *  You may use, modify, and distribute it without restriction.
 4 |  */
 5 | 
 6 | // Examples are also documented online at:
 7 | // <https://railgunlabs.com/unicorn/manual/code-examples/>.
 8 | 
 9 | #include <unicorn.h>
10 | #include <stdio.h>
11 | 
12 | int main(int argc, char *argv[])
13 | {
14 |     // Upper-cases a piece of UTF-8 text with Unicorn; for instance, the
15 |     // text "hello world" would come out as "HELLO WORLD". Lower and
16 |     // title case conversion are supported by Unicorn as well.
17 | 
18 |     const char *input = u8"Straße";
19 |     char output[64];
20 |     unisize output_len = 64;
21 | 
22 |     // Convert into the fixed-size buffer. The result can be presented to
23 |     // an end-user. The function's signature is documented online at:
24 |     // <https://railgunlabs.com/unicorn/manual/api/case-mapping/uni-caseconv/>.
25 |     unistat status = uni_caseconv(UNI_UPPER, input, -1, UNI_UTF8, output, &output_len, UNI_UTF8 | UNI_NULIFY);
26 |     if (status == UNI_OK)
27 |     {
28 |         printf("Case converted: %s\n", output);
29 |     }
30 | 
31 |     // Size query: when the required destination size is unknown, call
32 |     // uni_caseconv() with a null and zero-sized destination buffer. The
33 |     // implementation then writes to the length argument the number of
34 |     // code units needed to accommodate the output, so a buffer of the
35 |     // appropriate size can be allocated before calling the function again.
36 |     output_len = 0;
37 |     status = uni_caseconv(UNI_UPPER, input, -1, UNI_UTF8, NULL, &output_len, UNI_UTF8 | UNI_NULIFY);
38 |     if (status == UNI_OK)
39 |     {
40 |         printf("Code units needed: %d\n", output_len);
41 |     }
42 |     return 0;
43 | }
44 | 


--------------------------------------------------------------------------------
/man/uni_numval.3:
--------------------------------------------------------------------------------
 1 | .TH "UNICORN" "3" "Dec 6th 2025" "Unicorn 1.3.1"
 2 | .SH NAME
 3 | uni_numval \- numeric property
 4 | .SH LIBRARY
 5 | Embeddable Unicode Algorithms (libunicorn, -lunicorn)
 6 | .SH SYNOPSIS
 7 | .nf
 8 | .B #include <unicorn.h>
 9 | .PP
10 | .BI "const char *uni_numval(unichar " c ");"
11 | .fi
12 | .SH DESCRIPTION
13 | This function returns the \f[C]Numeric_Value\f[R] character property for the code point \f[I]c\f[R].
14 | If the input character does not have a numeric value, then null is returned.
15 | .PP
16 | The numeric value is returned as a null-terminated C string in decimal notation.
17 | That means if the character has a numeric value of -1/2, which is the case for TIBETAN DIGIT HALF ZERO (U+0F33), it would be returned as the string "-0.5".
18 | The caller can parse the string into an integer or floating-point storage format as needed.
19 | .PP
20 | Support for this character property must be enabled in the JSON configuration file otherwise the function will always return null.
21 | .PP
22 | .in +4n
23 | .EX
24 | {
25 |     "characterProperties": [
26 |         "Numeric_Value"
27 |     ]
28 | }
29 | .EE
30 | .in
31 | .SH RETURN VALUE
32 | The value of the \f[C]Numeric_Value\f[R] character property in decimal notation or null if the character does not have a numeric value.
33 | .SH SEE ALSO
34 | .BR unichar (3)
35 | .SH AUTHOR
36 | .UR https://railgunlabs.com
37 | Railgun Labs
38 | .UE .
39 | .SH INTERNET RESOURCES
40 | The online documentation is published on the
41 | .UR https://railgunlabs.com/unicorn
42 | Railgun Labs website
43 | .UE .
44 | .SH LICENSING
45 | Unicorn is distributed with its end-user license agreement (EULA).
46 | Please review the agreement for information on terms & conditions for accessing or otherwise using Unicorn and for a DISCLAIMER OF ALL WARRANTIES.
47 | 


--------------------------------------------------------------------------------
/man/unistrength.3:
--------------------------------------------------------------------------------
 1 | .TH "UNICORN" "3" "Dec 6th 2025" "Unicorn 1.3.1"
 2 | .SH NAME
 3 | unistrength \- collation comparison levels
 4 | .SH LIBRARY
 5 | Embeddable Unicode Algorithms (libunicorn, -lunicorn)
 6 | .SH SYNOPSIS
 7 | .nf
 8 | .B #include <unicorn.h>
 9 | .PP
10 | .B enum unistrength {
11 | .RS
12 | .B UNI_PRIMARY,
13 | .B UNI_SECONDARY,
14 | .B UNI_TERTIARY,
15 | .B UNI_QUATERNARY,
16 | .RE
17 | .B };
18 | .fi
19 | .SH DESCRIPTION
20 | The Unicode collation algorithm is a multilevel comparison algorithm.
21 | The number of levels that are considered in comparison is known as the collation strength.
22 | This enumeration defines constants for each level.
23 | .PP
24 | In comparing two strings, the most important feature is the identity of the base letters.
25 | For example, the difference between an A and a B. If the base letters differ, accent differences are typically ignored.
26 | If the base letters or their accents differ, case differences (uppercase versus lowercase) are typically ignored.
27 | .SH CONSTANTS
28 | .TP
29 | .BR UNI_PRIMARY
30 | Represents differences in the base letter or symbol.
31 | .TP
32 | .BR UNI_SECONDARY
33 | Represents differences in accents.
34 | .TP
35 | .BR UNI_TERTIARY
36 | Represents differences in case or variants of symbols.
37 | .TP
38 | .BR UNI_QUATERNARY
39 | Represents differences in punctuation.
40 | .SH AUTHOR
41 | .UR https://railgunlabs.com
42 | Railgun Labs
43 | .UE .
44 | .SH INTERNET RESOURCES
45 | The online documentation is published on the
46 | .UR https://railgunlabs.com/unicorn
47 | Railgun Labs website
48 | .UE .
49 | .SH LICENSING
50 | Unicorn is distributed with its end-user license agreement (EULA).
51 | Please review the agreement for information on terms & conditions for accessing or otherwise using Unicorn and for a DISCLAIMER OF ALL WARRANTIES.
52 | 


--------------------------------------------------------------------------------
/configure.ac:
--------------------------------------------------------------------------------
 1 | # Require a minimum Autoconf version, then declare the project name and version.
 2 | AC_PREREQ([2.66])
 3 | AC_INIT([unicorn], [1.3.1])
 4 | 
 5 | # Sanity check to verify this is indeed the source directory.
 6 | AC_CONFIG_SRCDIR([src/unicorn.c])
 7 | 
 8 | # Initialize automake and use "foreign" so it doesn't attempt to look for or
 9 | # create a GNU project structure: COPYING, NEWS, etc...
10 | AM_INIT_AUTOMAKE([foreign])
11 | 
12 | # Check for the --with-unicorn-config option.
13 | AC_ARG_WITH([unicorn-config],
14 |     [AS_HELP_STRING([--with-unicorn-config=PATH],
15 |                     [Path to a JSON file that configures the Unicorn feature set. (default: features.json)])],
16 |     [CONFIG_JSON="$withval"],
17 |     [CONFIG_JSON="features.json"])
18 | AC_SUBST([CONFIG_JSON])
19 | 
20 | # Check for the --enable-examples option.
21 | AC_ARG_ENABLE([examples],
22 |     [AS_HELP_STRING([--enable-examples], [Build the example programs (disabled by default)])],
23 |     [enable_examples=$enableval],
24 |     [enable_examples=no]) # Do not build the examples by default.
25 | AM_CONDITIONAL([BUILD_EXAMPLES], [test "x$enable_examples" = "xyes"])
26 | 
27 | # Checks for toolchain programs.
28 | AC_PROG_CC
29 | AM_PROG_AR
30 | AC_PROG_CPP
31 | AC_PROG_INSTALL
32 | LT_INIT([win32-dll])
33 | 
34 | # Verify the expected C headers are available.
35 | AC_CHECK_HEADERS([string.h assert.h stdbool.h], [], [AC_MSG_ERROR([could not find required header])])
36 | 
37 | # Check for Python (required to build the project).
38 | AM_PATH_PYTHON([3.10])
39 | AC_SUBST([PYTHON],[$PYTHON])
40 | 
41 | AC_CONFIG_FILES([
42 |     Makefile
43 |     src/Makefile
44 |     include/Makefile
45 |     examples/Makefile
46 |     scripts/Makefile
47 |     man/Makefile
48 |     unicorn.pc
49 | ])
50 | AC_OUTPUT
51 | 


--------------------------------------------------------------------------------
/man/uni_casefoldchk.3:
--------------------------------------------------------------------------------
 1 | .TH "UNICORN" "3" "Dec 6th 2025" "Unicorn 1.3.1"
 2 | .SH NAME
 3 | uni_casefoldchk \- check case fold status
 4 | .SH LIBRARY
 5 | Embeddable Unicode Algorithms (libunicorn, -lunicorn)
 6 | .SH SYNOPSIS
 7 | .nf
 8 | .B #include <unicorn.h>
 9 | .PP
10 | .BI "unistat uni_casefoldchk(unicasefold " casing ", const void *" text ", unisize " text_len ", uniattr " text_attr ", bool *" result ");"
11 | .fi
12 | .SH DESCRIPTION
13 | This function checks if \f[I]text\f[R] is in casing form \f[I]casing\f[R].
14 | If it is, then the implementation writes \f[C]true\f[R] to \f[I]result\f[R] else it writes \f[C]false\f[R].
15 | .PP
16 | The text attributes of \f[I]text\f[R] are specified by \f[I]text_attr\f[R].
17 | If \f[I]text_len\f[R] is -1, then the implementation assumes \f[I]text\f[R] is null-terminated.
18 | .SH RETURN VALUE
19 | .TP
20 | UNI_OK
21 | On success.
22 | .TP
23 | UNI_BAD_OPERATION
24 | If \f[I]text\f[R] or \f[I]result\f[R] is null.
25 | .TP
26 | UNI_BAD_ENCODING
27 | If \f[I]text\f[R] is not well-formed (checks are omitted if \f[I]text_attr\f[R] has \f[B]UNI_TRUST\f[R](3)).
28 | .TP
29 | UNI_FEATURE_DISABLED
30 | If Unicorn was built without support for case folding.
31 | .TP
32 | UNI_NO_MEMORY
33 | If dynamic memory allocation failed.
34 | .SH SEE ALSO
35 | .BR UNI_TRUST (3),
36 | .BR unicasefold (3),
37 | .BR unisize (3),
38 | .BR uniattr (3)
39 | .SH AUTHOR
40 | .UR https://railgunlabs.com
41 | Railgun Labs
42 | .UE .
43 | .SH INTERNET RESOURCES
44 | The online documentation is published on the
45 | .UR https://railgunlabs.com/unicorn
46 | Railgun Labs website
47 | .UE .
48 | .SH LICENSING
49 | Unicorn is distributed with its end-user license agreement (EULA).
50 | Please review the agreement for information on terms & conditions for accessing or otherwise using Unicorn and for a DISCLAIMER OF ALL WARRANTIES.
51 | 


--------------------------------------------------------------------------------
/src/byteswap.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  *  Unicorn - Embeddable Unicode Algorithms
 3 |  *  Copyright (c) 2024-2025 Railgun Labs
 4 |  *
 5 |  *  This software is dual-licensed: you can redistribute it and/or modify
 6 |  *  it under the terms of the GNU General Public License version 3 as
 7 |  *  published by the Free Software Foundation. For the terms of this
 8 |  *  license, see <https://www.gnu.org/licenses/>.
 9 |  *
10 |  *  Alternatively, you can license this software under a proprietary
11 |  *  license, as set out in <https://railgunlabs.com/unicorn/license/>.
12 |  */
13 | 
14 | #ifndef BYTESWAP_H
15 | #define BYTESWAP_H
16 | 
17 | #include "common.h"
18 | 
19 | static inline uint32_t uni_swap32(uint32_t val)
20 | {
21 |     // Exchange the bytes within each 16-bit half, then exchange the halves.
22 |     const uint32_t pairs = ((val & 0x00FF00FFu) << 8u) | ((val >> 8u) & 0x00FF00FFu);
23 |     return (pairs >> 16u) | (pairs << 16u);
24 | }
25 | 
26 | static inline uint16_t uni_swap16(uint16_t val)
27 | {
28 |     // Widen to 32 bits for the shifts; the narrowing cast drops the overflow.
29 |     const uint32_t wide = (uint32_t)val;
30 |     return (uint16_t)((wide >> 8u) | (wide << 8u));
31 | }
32 | 
33 | static inline uint32_t uni_swap32_le(uint32_t val)
34 | {
35 | #if !defined(UNICORN_BIG_ENDIAN)
36 |     return val; // UNICORN_BIG_ENDIAN not defined: pass the value through.
37 | #else
38 |     return uni_swap32(val); // UNICORN_BIG_ENDIAN defined: reverse the byte order.
39 | #endif
40 | }
41 | 
42 | static inline uint16_t uni_swap16_le(uint16_t val)
43 | {
44 | #if !defined(UNICORN_BIG_ENDIAN)
45 |     return val; // UNICORN_BIG_ENDIAN not defined: pass the value through.
46 | #else
47 |     return uni_swap16(val); // UNICORN_BIG_ENDIAN defined: reverse the byte order.
48 | #endif
49 | }
50 | 
51 | static inline uint32_t uni_swap32_be(uint32_t val)
52 | {
53 | #if !defined(UNICORN_LITTLE_ENDIAN)
54 |     return val; // UNICORN_LITTLE_ENDIAN not defined: pass the value through.
55 | #else
56 |     return uni_swap32(val); // UNICORN_LITTLE_ENDIAN defined: reverse the byte order.
57 | #endif
58 | }
59 | 
60 | static inline uint16_t uni_swap16_be(uint16_t val)
61 | {
62 | #if !defined(UNICORN_LITTLE_ENDIAN)
63 |     return val; // UNICORN_LITTLE_ENDIAN not defined: pass the value through.
64 | #else
65 |     return uni_swap16(val); // UNICORN_LITTLE_ENDIAN defined: reverse the byte order.
66 | #endif
67 | }
68 | 
69 | #endif // BYTESWAP_H
70 | 


--------------------------------------------------------------------------------
/man/uniweighting.3:
--------------------------------------------------------------------------------
 1 | .TH "UNICORN" "3" "Dec 6th 2025" "Unicorn 1.3.1"
 2 | .SH NAME
 3 | uniweighting \- collation weighting algorithm
 4 | .SH LIBRARY
 5 | Embeddable Unicode Algorithms (libunicorn, -lunicorn)
 6 | .SH SYNOPSIS
 7 | .nf
 8 | .B #include <unicorn.h>
 9 | .PP
10 | .B enum uniweighting {
11 | .RS
12 | .B UNI_NON_IGNORABLE,
13 | .B UNI_SHIFTED,
14 | .RE
15 | .B };
16 | .fi
17 | .SH DESCRIPTION
18 | This enumeration defines constants representing the possible options for variable weighted characters.
19 | .PP
20 | The Unicode Collation Algorithm defines the collation for some characters as variable.
21 | This typically includes punctuation characters and symbols.
22 | Based on the variable-weighting setting, these characters can be interpreted as having a quaternary level.
23 | .SH CONSTANTS
24 | .TP
25 | .BR UNI_NON_IGNORABLE
26 | The words with hyphen-minus or hyphen are grouped together, but before all letters in the third position.
27 | This is because they are not ignorable, and have primary values that differ from the letters.
28 | The symbols ☠ and ♡ have primary differences.
29 | .TP
30 | .BR UNI_SHIFTED
31 | The hyphen-minus and hyphen are grouped together, and their differences are less significant than the casing differences in the letter "l".
32 | This grouping results from the fact that they are ignorable, but their fourth level differences are according to the original primary order, which is more intuitive than Unicode order.
33 | The symbols ☠ and ♡ are ignored on levels 1-3.
34 | .SH SEE ALSO
35 | .BR unistrength (3)
36 | .SH AUTHOR
37 | .UR https://railgunlabs.com
38 | Railgun Labs
39 | .UE .
40 | .SH INTERNET RESOURCES
41 | The online documentation is published on the
42 | .UR https://railgunlabs.com/unicorn
43 | Railgun Labs website
44 | .UE .
45 | .SH LICENSING
46 | Unicorn is distributed with its end-user license agreement (EULA).
47 | Please review the agreement for information on terms & conditions for accessing or otherwise using Unicorn and for a DISCLAIMER OF ALL WARRANTIES.
48 | 


--------------------------------------------------------------------------------
/scripts/collation.py:
--------------------------------------------------------------------------------
 1 | #  Unicorn - Embeddable Unicode Algorithms
 2 | #  Copyright (c) 2024-2025 Railgun Labs
 3 | #
 4 | #  This software is dual-licensed: you can redistribute it and/or modify
 5 | #  it under the terms of the GNU General Public License version 3 as
 6 | #  published by the Free Software Foundation. For the terms of this
 7 | #  license, see <https://www.gnu.org/licenses/>.
 8 | #
 9 | #  Alternatively, you can license this software under a proprietary
10 | #  license, as set out in <https://railgunlabs.com/unicorn/license/>.
11 | 
12 | from typing import Dict, Set, Type
13 | import zipfile
14 | import json
15 | 
16 | from .config import Config
17 | from .tools import Codespace
18 | from .feature import Feature
19 | from .basics import StorageSize, SerializationData
20 | from .decomposition import CanonicalDecompositionFeature
21 | 
22 | class CollationFeature(Feature):
23 |     @staticmethod
24 |     def dependencies() -> Set[Type[Feature]]:
25 |         return set([CanonicalDecompositionFeature])
26 | 
27 |     def pre_process(self, codespace: Codespace) -> None:
28 |         codespace.register_property("collation_subtype", 0, StorageSize.UINT8)
29 | 
30 |     def process(self, archive: zipfile.ZipFile, codespace: Codespace, config: Config) -> SerializationData:
31 |         file = archive.open('collation.json')
32 |         data = json.loads(file.read())
33 |         subtypes: Dict[str,int] = data["subtypes"]
34 |         size: int = data["size"]
35 |         header: str = data["header"]
36 |         source: str = data["source"]
37 |         file.close()
38 | 
39 |         # Associate collation subtypes with code points.
40 |         subtype_property = codespace.get("collation_subtype")
41 |         for key,subtype in subtypes.items():
42 |             codepoint = int(key, 16)
43 |             codespace.set(codepoint, subtype_property, subtype)
44 | 
45 |         public_header = "#define UNICORN_FEATURE_COLLATION\n"
46 |         return SerializationData(source, header, public_header, size)
47 | 


--------------------------------------------------------------------------------
/man/uni_seterrfunc.3:
--------------------------------------------------------------------------------
 1 | .TH "UNICORN" "3" "Dec 6th 2025" "Unicorn 1.3.1"
 2 | .SH NAME
 3 | uni_seterrfunc \- receive diagnostic events
 4 | .SH LIBRARY
 5 | Embeddable Unicode Algorithms (libunicorn, -lunicorn)
 6 | .SH SYNOPSIS
 7 | .nf
 8 | .B #include <unicorn.h>
 9 | .PP
10 | .BI "void uni_seterrfunc(void *" user_data ", unierrfunc " callback ");"
11 | .fi
12 | .SH DESCRIPTION
13 | Associate the function \f[I]callback\f[R] with the library's internal error logger.
14 | Anytime a function returns an error \f[I]callback\f[R] will be invoked with a description of the error.
15 | It is up to the implementation of \f[I]callback\f[R] to be thread safe.
16 | .PP
17 | The implementation generally invokes \f[I]callback\f[R] at the moment the error occurs.
18 | Callers are encouraged to set breakpoints in their implementation of \f[I]callback\f[R] and view the stack trace to discover the exact cause of the error.
19 | .SH EXAMPLES
20 | This example registers a custom error callback with Unicorn.
21 | This callback will be invoked when an error occurs in Unicorn.
22 | In practice, you can set a breakpoint within the function for your debugger to pause on and then troubleshoot the error by inspecting the error message.
23 | .PP
24 | .in +4n
25 | .EX
26 | #include <unicorn.h>
27 | #include <stdio.h>
28 | 
29 | static void on_error(void *user_data, const char *message)
30 | {
31 |     puts(message);
32 | }
33 | 
34 | int main(void)
35 | {
36 |     uni_seterrfunc(NULL, on_error);
37 |     return 0;
38 | }
39 | .EE
40 | .in
41 | .SH SEE ALSO
42 | .BR unimemfunc (3),
43 | .BR unierrfunc (3)
44 | .SH AUTHOR
45 | .UR https://railgunlabs.com
46 | Railgun Labs
47 | .UE .
48 | .SH INTERNET RESOURCES
49 | The online documentation is published on the
50 | .UR https://railgunlabs.com/unicorn
51 | Railgun Labs website
52 | .UE .
53 | .SH LICENSING
54 | Unicorn is distributed with its end-user license agreement (EULA).
55 | Please review the agreement for information on terms & conditions for accessing or otherwise using Unicorn and for a DISCLAIMER OF ALL WARRANTIES.
56 | 


--------------------------------------------------------------------------------
/man/uni_prev.3:
--------------------------------------------------------------------------------
 1 | .TH "UNICORN" "3" "Dec 6th 2025" "Unicorn 1.3.1"
 2 | .SH NAME
 3 | uni_prev \- decode the previous scalar value
 4 | .SH LIBRARY
 5 | Embeddable Unicode Algorithms (libunicorn, -lunicorn)
 6 | .SH SYNOPSIS
 7 | .nf
 8 | .B #include <unicorn.h>
 9 | .PP
10 | .BI "unistat uni_prev(const void *" text ", unisize " text_len ", uniattr " text_attr ", unisize *" index ", unichar *" cp ");"
11 | .fi
12 | .SH DESCRIPTION
13 | This function decodes the preceding Unicode scalar value from \f[I]text\f[R] at code unit \f[I]index\f[R] and writes the result to \f[I]cp\f[R].
14 | The \f[I]index\f[R] parameter is updated by the implementation to refer to the code unit beginning the preceding scalar.
15 | .PP
16 | This function returns \f[B]UNI_DONE\f[R] when iteration has reached the end of \f[I]text\f[R] indicating there are no more characters.
17 | If the implementation detects \f[I]text\f[R] is malformed, then it returns \f[B]UNI_BAD_ENCODING\f[R].
18 | .PP
19 | The \f[I]index\f[R] parameter must refer to a code point boundary otherwise the behavior is undefined.
20 | .SH RETURN VALUE
21 | .TP
22 | UNI_OK
23 | If the character was successfully decoded.
24 | .TP
25 | UNI_DONE
26 | If the end of \f[I]text\f[R] was reached.
27 | .TP
28 | UNI_BAD_ENCODING
29 | If \f[I]text\f[R] is malformed; this is never returned if \f[I]text_attr\f[R] has \f[B]UNI_TRUST\f[R](3).
30 | .TP
31 | UNI_BAD_OPERATION
32 | If \f[I]text\f[R], \f[I]index\f[R], or \f[I]cp\f[R] are NULL.
33 | .SH SEE ALSO
34 | .BR unistat (3),
35 | .BR UNI_TRUST (3),
36 | .BR unisize (3),
37 | .BR uniattr (3),
38 | .BR unichar (3)
39 | .SH AUTHOR
40 | .UR https://railgunlabs.com
41 | Railgun Labs
42 | .UE .
43 | .SH INTERNET RESOURCES
44 | The online documentation is published on the
45 | .UR https://railgunlabs.com/unicorn
46 | Railgun Labs website
47 | .UE .
48 | .SH LICENSING
49 | Unicorn is distributed with its end-user license agreement (EULA).
50 | Please review the agreement for information on terms & conditions for accessing or otherwise using Unicorn and for a DISCLAIMER OF ALL WARRANTIES.
51 | 


--------------------------------------------------------------------------------
/man/uni_compress.3:
--------------------------------------------------------------------------------
 1 | .TH "UNICORN" "3" "Dec 6th 2025" "Unicorn 1.3.1"
 2 | .SH NAME
 3 | uni_compress \- compress text
 4 | .SH LIBRARY
 5 | Embeddable Unicode Algorithms (libunicorn, -lunicorn)
 6 | .SH SYNOPSIS
 7 | .nf
 8 | .B #include <unicorn.h>
 9 | .PP
10 | .BI "unistat uni_compress(const void *" text ", unisize " text_len ", uniattr " text_attr ", uint8_t *" buffer ", size_t *" buffer_length ");"
11 | .fi
12 | .SH DESCRIPTION
13 | This function compresses \f[I]text\f[R] using Binary Ordered Compression for Unicode (BOCU-1) and writes the results to \f[I]buffer\f[R].
14 | Text is compressed based on its code points, i.e. UTF-8, 16, and 32 will compress to the same byte sequence.
15 | .PP
16 | The capacity of \f[I]buffer\f[R] is specified by \f[I]buffer_length\f[R].
17 | The implementation always writes to \f[I]buffer_length\f[R] the total number of bytes in the fully compressed text.
18 | If the capacity of \f[I]buffer\f[R] is insufficient, then \f[B]UNI_NO_SPACE\f[R] is returned.
19 | If \f[I]buffer\f[R] is null, then \f[I]buffer_length\f[R] must be zero.
20 | .SH RETURN VALUE
21 | .TP
22 | UNI_OK
23 | On success.
24 | .TP
25 | UNI_NO_SPACE
26 | If \f[I]buffer\f[R] is not large enough.
27 | .TP
28 | UNI_BAD_OPERATION
29 | If \f[I]text\f[R] or \f[I]buffer_length\f[R] is null.
30 | .TP
31 | UNI_BAD_ENCODING
32 | If \f[I]text\f[R] is not well-formed (checks are omitted if \f[I]text_attr\f[R] has \f[B]UNI_TRUST\f[R](3)).
33 | .TP
34 | UNI_FEATURE_DISABLED
35 | If Unicorn was configured without support for the compression algorithm.
36 | .SH SEE ALSO
37 | .BR unistat (3),
38 | .BR UNI_TRUST (3),
39 | .BR unisize (3),
40 | .BR uniattr (3)
41 | .SH AUTHOR
42 | .UR https://railgunlabs.com
43 | Railgun Labs
44 | .UE .
45 | .SH INTERNET RESOURCES
46 | The online documentation is published on the
47 | .UR https://railgunlabs.com/unicorn
48 | Railgun Labs website
49 | .UE .
50 | .SH LICENSING
51 | Unicorn is distributed with its end-user license agreement (EULA).
52 | Please review the agreement for information on terms & conditions for accessing or otherwise using Unicorn and for a DISCLAIMER OF ALL WARRANTIES.
53 | 


--------------------------------------------------------------------------------
/man/uni_nextbrk.3:
--------------------------------------------------------------------------------
 1 | .TH "UNICORN" "3" "Dec 6th 2025" "Unicorn 1.3.1"
 2 | .SH NAME
 3 | uni_nextbrk \- compute next boundary
 4 | .SH LIBRARY
 5 | Embeddable Unicode Algorithms (libunicorn, -lunicorn)
 6 | .SH SYNOPSIS
 7 | .nf
 8 | .B #include <unicorn.h>
 9 | .PP
10 | .BI "unistat uni_nextbrk(unibreak " boundary ", const void *" text ", unisize " text_len ", uniattr " text_attr ", unisize *" index ");"
11 | .fi
12 | .SH DESCRIPTION
13 | This function computes the next \f[I]boundary\f[R] for \f[I]text\f[R] starting from a known code point specified by code unit \f[I]index\f[R].
14 | The implementation sets \f[I]index\f[R] to the code unit offset of the next \f[I]boundary\f[R].
15 | .SH RETURN VALUE
16 | .TP
17 | UNI_OK
18 | If the break iterator was successfully repositioned.
19 | .TP
20 | UNI_DONE
21 | If \f[I]index\f[R] is beyond the last character of \f[I]text\f[R].
22 | .TP
23 | UNI_BAD_OPERATION
24 | If \f[I]text\f[R] or \f[I]index\f[R] is null.
25 | .TP
26 | UNI_BAD_ENCODING
27 | If \f[I]text\f[R] is malformed; this is never returned if \f[I]text_attr\f[R] has \f[B]UNI_TRUST\f[R](3).
28 | .SH EXAMPLES
29 | This example iterates the extended grapheme clusters of a UTF-8 encoded string and prints the indices where each grapheme begins.
30 | .PP
31 | .in +4n
32 | .EX
33 | #include <unicorn.h>
34 | #include <stdio.h>
35 | 
36 | int main(void)
37 | {
38 |     const char *string = u8"Hi, 世界";
39 |     unisize index = 0;
40 |     while (uni_nextbrk(UNI_GRAPHEME, string, -1, UNI_UTF8, &index) == UNI_OK)
41 |     {
42 |         printf("%d\\n", index); // prints '1', '2', '3', '4', '7', '10'
43 |     }
44 |     return 0;
45 | }
46 | .EE
47 | .in
48 | .SH SEE ALSO
49 | .BR UNI_TRUST (3),
50 | .BR unibreak (3),
51 | .BR unisize (3),
52 | .BR uniattr (3)
53 | .SH AUTHOR
54 | .UR https://railgunlabs.com
55 | Railgun Labs
56 | .UE .
57 | .SH INTERNET RESOURCES
58 | The online documentation is published on the
59 | .UR https://railgunlabs.com/unicorn
60 | Railgun Labs website
61 | .UE .
62 | .SH LICENSING
63 | Unicorn is distributed with its end-user license agreement (EULA).
64 | Please review the agreement for information on terms & conditions for accessing or otherwise using Unicorn and for a DISCLAIMER OF ALL WARRANTIES.
65 | 


--------------------------------------------------------------------------------
/man/uni_decompress.3:
--------------------------------------------------------------------------------
 1 | .TH "UNICORN" "3" "Dec 6th 2025" "Unicorn 1.3.1"
 2 | .SH NAME
 3 | uni_decompress \- decompress text
 4 | .SH LIBRARY
 5 | Embeddable Unicode Algorithms (libunicorn, -lunicorn)
 6 | .SH SYNOPSIS
 7 | .nf
 8 | .B #include <unicorn.h>
 9 | .PP
10 | .BI "unistat uni_decompress(const uint8_t *" buffer ", size_t " buffer_length ", void *" text ", unisize *" text_len ", uniattr " text_attr ");"
11 | .fi
12 | .SH DESCRIPTION
13 | This function decompresses \f[I]buffer\f[R] and writes the result to \f[I]text\f[R].
14 | The encoding of \f[I]text_attr\f[R] does not need to match the encoding of the original uncompressed text.
15 | .PP
16 | The capacity of \f[I]text\f[R] is specified by \f[I]text_len\f[R].
17 | If \f[I]text\f[R] is not null, then the implementation writes to \f[I]text_len\f[R] the total number of code units written to \f[I]text\f[R].
18 | If the capacity of \f[I]text\f[R] is insufficient, then \f[B]UNI_NO_SPACE\f[R] is returned.
19 | .PP
20 | If \f[I]text\f[R] is null and \f[I]text_len\f[R] is zero, then the implementation writes to \f[I]text_len\f[R] the number of code units in the fully decompressed text and returns \f[B]UNI_OK\f[R].
21 | Call the function this way to first compute the total length of the uncompressed text before calling it again with a sufficiently sized buffer.
22 | .SH RETURN VALUE
23 | .TP
24 | UNI_OK
25 | On success.
26 | .TP
27 | UNI_NO_SPACE
28 | If \f[I]text\f[R] is not large enough.
29 | .TP
30 | UNI_BAD_OPERATION
31 | If \f[I]buffer\f[R] is null, or if \f[I]text\f[R] is null and \f[I]text_len\f[R] is non-zero.
32 | .TP
33 | UNI_FEATURE_DISABLED
34 | If Unicorn was configured without support for the compression algorithm.
35 | .SH SEE ALSO
36 | .BR unistat (3),
37 | .BR uni_compress (3),
38 | .BR unisize (3),
39 | .BR uniattr (3)
40 | .SH AUTHOR
41 | .UR https://railgunlabs.com
42 | Railgun Labs
43 | .UE .
44 | .SH INTERNET RESOURCES
45 | The online documentation is published on the
46 | .UR https://railgunlabs.com/unicorn
47 | Railgun Labs website
48 | .UE .
49 | .SH LICENSING
50 | Unicorn is distributed with its end-user license agreement (EULA).
51 | Please review the agreement for information on terms & conditions for accessing or otherwise using Unicorn and for a DISCLAIMER OF ALL WARRANTIES.
52 | 


--------------------------------------------------------------------------------
/man/uni_casefoldcmp.3:
--------------------------------------------------------------------------------
 1 | .TH "UNICORN" "3" "Dec 6th 2025" "Unicorn 1.3.1"
 2 | .SH NAME
 3 | uni_casefoldcmp \- case-insensitive string comparison
 4 | .SH LIBRARY
 5 | Embeddable Unicode Algorithms (libunicorn, -lunicorn)
 6 | .SH SYNOPSIS
 7 | .nf
 8 | .B #include <unicorn.h>
 9 | .PP
10 | .BI "unistat uni_casefoldcmp(unicasefold " casing ", const void *" s1 ", unisize " s1_len ", uniattr " s1_attr ", const void *" s2 ", unisize " s2_len ", uniattr " s2_attr ", bool *" result ");"
11 | .fi
12 | .SH DESCRIPTION
13 | This function checks if \f[I]s1\f[R] and \f[I]s2\f[R] are a caseless match in casing form \f[I]casing\f[R].
14 | If they match, then the implementation writes \f[C]true\f[R] to \f[I]result\f[R] else it writes \f[C]false\f[R].
15 | .PP
16 | The length and text attributes for \f[I]s1\f[R] are specified by \f[I]s1_len\f[R] and \f[I]s1_attr\f[R].
17 | The length and text attributes for \f[I]s2\f[R] are specified by \f[I]s2_len\f[R] and \f[I]s2_attr\f[R].
18 | If \f[I]s1_len\f[R] is -1, then the implementation assumes \f[I]s1\f[R] is null-terminated.
19 | If \f[I]s2_len\f[R] is -1, then the implementation assumes \f[I]s2\f[R] is null-terminated.
20 | .SH RETURN VALUE
21 | .TP
22 | UNI_OK
23 | If \f[I]s1\f[R] and \f[I]s2\f[R] were compared successfully.
24 | .TP
25 | UNI_BAD_OPERATION
26 | If \f[I]s1\f[R] or \f[I]s2\f[R] are null, or if \f[I]result\f[R] is null.
27 | .TP
28 | UNI_BAD_ENCODING
29 | If \f[I]s1\f[R] or \f[I]s2\f[R] is not well-formed (checks are omitted if the corresponding \f[B]uniattr\f[R](3) has \f[B]UNI_TRUST\f[R](3)).
30 | .TP
31 | UNI_FEATURE_DISABLED
32 | If the library was built without support for case folding.
33 | .TP
34 | UNI_NO_MEMORY
35 | If dynamic memory allocation failed.
36 | .SH SEE ALSO
37 | .BR uniattr (3),
38 | .BR UNI_TRUST (3),
39 | .BR unicasefold (3),
40 | .BR unisize (3)
41 | .SH AUTHOR
42 | .UR https://railgunlabs.com
43 | Railgun Labs
44 | .UE .
45 | .SH INTERNET RESOURCES
46 | The online documentation is published on the
47 | .UR https://railgunlabs.com/unicorn
48 | Railgun Labs website
49 | .UE .
50 | .SH LICENSING
51 | Unicorn is distributed with its end-user license agreement (EULA).
52 | Please review the agreement for information on terms & conditions for accessing or otherwise using Unicorn and for a DISCLAIMER OF ALL WARRANTIES.
53 | 


--------------------------------------------------------------------------------
/man/uni_normqchk.3:
--------------------------------------------------------------------------------
 1 | .TH "UNICORN" "3" "Dec 6th 2025" "Unicorn 1.3.1"
 2 | .SH NAME
 3 | uni_normqchk \- normalization quick check
 4 | .SH LIBRARY
 5 | Embeddable Unicode Algorithms (libunicorn, -lunicorn)
 6 | .SH SYNOPSIS
 7 | .nf
 8 | .B #include <unicorn.h>
 9 | .PP
10 | .BI "unistat uni_normqchk(uninormform " form ", const void *" text ", unisize " text_len ", uniattr " text_attr ", uninormchk *" result ");"
11 | .fi
12 | .SH DESCRIPTION
13 | This function performs a quick check to see if \f[I]text\f[R] is, is not, or might be normalized.
14 | For a definitive yes/no answer, use \f[B]uni_normchk\f[R](3).
15 | .PP
16 | The implementation behaves as follows:
17 | .RS
18 | .IP \[bu] 2
19 | \f[I]result\f[R] is set to \f[B]UNI_YES\f[R] if all characters in \f[I]text\f[R] quick check to yes.
20 | .IP \[bu] 2
21 | \f[I]result\f[R] is set to \f[B]UNI_NO\f[R] if at least one character in \f[I]text\f[R] quick checks to no.
22 | .IP \[bu] 2
23 | \f[I]result\f[R] is set to \f[B]UNI_MAYBE\f[R] if at least one character in \f[I]text\f[R] quick checks to maybe and no characters quick check to no.
24 | .RE
25 | .PP
26 | Support for normalization quick check must be enabled in the JSON configuration file.
27 | .PP
28 | .in +4n
29 | .EX
30 | {
31 |     "algorithms": {
32 |         "normalizationQuickCheck": true
33 |     }
34 | }
35 | .EE
36 | .in
37 | .SH RETURN VALUE
38 | .TP
39 | UNI_OK
40 | On success.
41 | .TP
42 | UNI_BAD_OPERATION
43 | If \f[I]text\f[R] or \f[I]result\f[R] is null.
44 | .TP
45 | UNI_BAD_ENCODING
46 | If \f[I]text\f[R] is malformed; this is never returned if \f[I]text_attr\f[R] has \f[B]UNI_TRUST\f[R](3).
47 | .TP
48 | UNI_FEATURE_DISABLED
49 | If Unicorn was built without support for \f[I]form\f[R].
50 | .SH SEE ALSO
51 | .BR uni_normchk (3),
52 | .BR uninormchk (3),
53 | .BR UNI_TRUST (3),
54 | .BR uninormform (3),
55 | .BR unisize (3),
56 | .BR uniattr (3)
57 | .SH AUTHOR
58 | .UR https://railgunlabs.com
59 | Railgun Labs
60 | .UE .
61 | .SH INTERNET RESOURCES
62 | The online documentation is published on the
63 | .UR https://railgunlabs.com/unicorn
64 | Railgun Labs website
65 | .UE .
66 | .SH LICENSING
67 | Unicorn is distributed with its end-user license agreement (EULA).
68 | Please review the agreement for information on terms & conditions for accessing or otherwise using Unicorn and for a DISCLAIMER OF ALL WARRANTIES.
69 | 


--------------------------------------------------------------------------------
/examples/encode_character.c:
--------------------------------------------------------------------------------
 1 | /*
 2 |  *  This file is public domain.
 3 |  *  You may use, modify, and distribute it without restriction.
 4 |  */
 5 | 
 6 | // Examples are also documented online at:
 7 | // <https://railgunlabs.com/unicorn/manual/code-examples/>.
 8 | 
 9 | #include <unicorn.h>
10 | #include <stdio.h>
11 | 
12 | int main(int argc, char *argv[])
13 | {
14 |     // This example demonstrates how to encode a code point as UTF-8, UTF-16, and UTF-32.
15 |     // After encoding, the code units are printed as hexadecimal.
16 | 
17 |     // The following variable defines the Unicode code point that will be encoded.
18 |     // In this example, it's Unicode character U+1F600. Change this value to a
19 |     // different character and notice how the printed code units change.
20 |     unichar cp = 0x1F600;
21 | 
22 |     printf("Code point: U+%04lX\n", (unsigned long)cp);
23 | 
24 |     //
25 |     // Encode the code point as UTF-8. UTF-8 is a variable length encoding and will encode
26 |     // to either 1, 2, 3, or 4 code units.
27 |     //
28 | 
29 |     uint8_t u8[4];
30 |     unisize u8_len = 4;
31 | 
32 |     if (uni_encode(cp, u8, &u8_len, UNI_UTF8) == UNI_OK)
33 |     {
34 |         printf("UTF-8  :");
35 |         for (unisize i = 0; i < u8_len; i++)
36 |         {
37 |             printf(" 0x%02X", u8[i]);
38 |         }
39 |         putchar('\n');
40 |     }
41 | 
42 |     //
43 |     // Encode the code point as UTF-16. UTF-16 is a variable length encoding and will encode
44 |     // to either 1 or 2 code units.
45 |     //
46 | 
47 |     uint16_t u16[2];
48 |     unisize u16_len = 2;
49 | 
50 |     if (uni_encode(cp, u16, &u16_len, UNI_UTF16) == UNI_OK)
51 |     {
52 |         printf("UTF-16 :");
53 |         for (unisize i = 0; i < u16_len; i++)
54 |         {
55 |             printf(" 0x%04X", u16[i]);
56 |         }
57 |         putchar('\n');
58 |     }
59 | 
60 |     //
61 |     // Encode the code point as UTF-32. Code points are 21-bit integers and UTF-32 stores
62 |     // them in a 32-bit integer. UTF-32 is not variable length, unlike UTF-8 and UTF-16.
63 |     //
64 | 
65 |     uint32_t u32[1];
66 |     unisize u32_len = 1;
67 | 
68 |     if (uni_encode(cp, u32, &u32_len, UNI_UTF32) == UNI_OK)
69 |     {
70 |         printf("UTF-32 :");
71 |         for (unisize i = 0; i < u32_len; i++)
72 |         {
73 |             printf(" 0x%08lX", (unsigned long)u32[i]);
74 |         }
75 |         putchar('\n');
76 |     }
77 | 
78 |     return 0;
79 | }
80 | 


--------------------------------------------------------------------------------
/examples/case_fold.c:
--------------------------------------------------------------------------------
 1 | /*
 2 |  *  This file is public domain.
 3 |  *  You may use, modify, and distribute it without restriction.
 4 |  */
 5 | 
 6 | // Examples are also documented online at:
 7 | // <https://railgunlabs.com/unicorn/manual/code-examples/>.
 8 | 
 9 | #include <unicorn.h>
10 | #include <stdio.h>
11 | #include <string.h>
12 | #include <stdbool.h>
13 | 
14 | int main(int argc, char *argv[])
15 | {
16 |     // This code example demonstrates case folding strings. Case folding
17 |     // transforms strings for the purposes of caseless string comparison.
18 |     // 
19 |     // You are not supposed to display case folded strings to the end-user.
20 |     // They are for internal caseless string comparisons only.
21 | 
22 |     const char *src1 = u8"Straße";
23 |     const char *src2 = u8"STrasse";
24 | 
25 |     char dst1[16];
26 |     char dst2[16];
27 | 
28 |     unisize dst1_len = 16;
29 |     unisize dst2_len = 16;
30 | 
31 |     bool is_equal = false;
32 |     
33 |     // The following function case folds two strings, performs the comparison, and 
34 |     // reports the result. This approach is recommended when two strings are being
35 |     // compared in a one-off comparison check. If you'll be repeatedly comparing
36 |     // the same strings over and over again, then the approach in the next code
37 |     // fragment is preferred.
38 |     if (uni_casefoldcmp(UNI_CANONICAL, src1, -1, UNI_UTF8, src2, -1, UNI_UTF8, &is_equal) == UNI_OK)
39 |     {
40 |         if (is_equal)
41 |         {
42 |             puts("equal");
43 |         }
44 |         else
45 |         {
46 |             puts("not equal");
47 |         }
48 |     }
49 | 
50 |     // The following code fragment case folds two strings and compares the results.
51 |     // This more manual approach is recommended when you'll be comparing the same
52 |     // strings over and over again because you can case fold them _once_ and then
53 |     // re-use the results for each subsequent comparison.
54 |     if (uni_casefold(UNI_CANONICAL, src1, -1, UNI_UTF8, dst1, &dst1_len, UNI_UTF8 | UNI_NULIFY) == UNI_OK)
55 |     {
56 |         if (uni_casefold(UNI_CANONICAL, src2, -1, UNI_UTF8, dst2, &dst2_len, UNI_UTF8 | UNI_NULIFY) == UNI_OK)
57 |         {
58 |             if (strcmp(dst1, dst2) == 0)
59 |             {
60 |                 puts("equal");
61 |             }
62 |             else
63 |             {
64 |                 puts("not equal");
65 |             }
66 |         }
67 |     }
68 |     return 0;
69 | }
70 | 


--------------------------------------------------------------------------------
/examples/compare_text.c:
--------------------------------------------------------------------------------
 1 | /*
 2 |  *  This file is public domain.
 3 |  *  You may use, modify, and distribute it without restriction.
 4 |  */
 5 | 
 6 | // Examples are also documented online at:
 7 | // <https://railgunlabs.com/unicorn/manual/code-examples/>.
 8 | 
 9 | #include <unicorn.h>
10 | #include <stdio.h>
11 | #include <string.h>
12 | #include <stdbool.h>
13 | 
14 | int main(int argc, char *argv[])
15 | {
16 |     // This code example demonstrates comparing strings for canonical equivalence.
17 |     // Canonical equivalence means the graphemes are compared, not the code points
18 |     // or code units. It interprets precomposed and decomposed characters that
19 |     // represent the same grapheme as equivalent.
20 | 
21 |     const char *src1 = u8"ma\u0301scara"; // 'a' + U+0301 = á  (decomposed)
22 |     const char *src2 = u8"m\u00E1scara";  //       U+00E1 = á  (precomposed)
23 | 
24 |     char dst1[16];
25 |     char dst2[16];
26 | 
27 |     unisize dst1_len = 16;
28 |     unisize dst2_len = 16;
29 | 
30 |     bool is_equal = false;
31 |     
32 |     // The following function normalizes two strings, performs the comparison, and 
33 |     // reports the result. This approach is recommended when two strings are compared
34 |     // in a one-off comparison check. If you'll be repeatedly comparing the same
35 |     // strings over and over again, then the approach in the next fragment is preferred.
36 |     if (uni_normcmp(src1, -1, UNI_UTF8, src2, -1, UNI_UTF8, &is_equal) == UNI_OK)
37 |     {
38 |         if (is_equal)
39 |         {
40 |             puts("equal");
41 |         }
42 |         else
43 |         {
44 |             puts("not equal");
45 |         }
46 |     }
47 | 
48 |     // The following lines normalize two strings and compare the results.
49 |     // This more manual approach is recommended when you'll be comparing the same
50 |     // strings over and over again because you can normalize them _once_ and then
51 |     // re-use the results for each subsequent comparison.
52 |     if (uni_norm(UNI_NFD, src1, -1, UNI_UTF8, dst1, &dst1_len, UNI_UTF8 | UNI_NULIFY) == UNI_OK)
53 |     {
54 |         if (uni_norm(UNI_NFD, src2, -1, UNI_UTF8, dst2, &dst2_len, UNI_UTF8 | UNI_NULIFY) == UNI_OK)
55 |         {
56 |             if (strcmp(dst1, dst2) == 0)
57 |             {
58 |                 puts("equal");
59 |             }
60 |             else
61 |             {
62 |                 puts("not equal");
63 |             }
64 |         }
65 |     }
66 |     return 0;
67 | }
68 | 


--------------------------------------------------------------------------------
/man/uniattr.3:
--------------------------------------------------------------------------------
 1 | .TH "UNICORN" "3" "Dec 6th 2025" "Unicorn 1.3.1"
 2 | .SH NAME
 3 | uniattr \- text attributes
 4 | .SH LIBRARY
 5 | Embeddable Unicode Algorithms (libunicorn, -lunicorn)
 6 | .SH SYNOPSIS
 7 | .nf
 8 | .B #include <unicorn.h>
 9 | .PP
10 | .BI "typedef uint32_t uniattr;"
11 | .fi
12 | .SH DESCRIPTION
13 | This type represents a bit mask of text attributes.
14 | Unless documented otherwise, all functions that accept a \f[B]uniattr\f[R] must observe the following rules:
15 | .PP
16 | There must be exactly one of the following character encoding flags present.
17 | .PP
18 | .RS
19 | .IP \[bu] 2
20 | \f[B]UNI_SCALAR\f[R](3)
21 | .IP \[bu] 2
22 | \f[B]UNI_UTF8\f[R](3)
23 | .IP \[bu] 2
24 | \f[B]UNI_UTF16\f[R](3)
25 | .IP \[bu] 2
26 | \f[B]UNI_UTF32\f[R](3)
27 | .RE
28 | .PP
29 | There can optionally be one of the following endian flags.
30 | If none are present, then native byte order is assumed.
31 | These flags are incompatible with \f[B]UNI_SCALAR\f[R](3) which is always in native byte order.
32 | These flags have no effect with \f[B]UNI_UTF8\f[R](3) because UTF-8 is endian independent.
33 | .PP
34 | .RS
35 | .IP \[bu] 2
36 | \f[B]UNI_NATIVE\f[R](3)
37 | .IP \[bu] 2
38 | \f[B]UNI_LITTLE\f[R](3)
39 | .IP \[bu] 2
40 | \f[B]UNI_BIG\f[R](3)
41 | .RE
42 | .PP
43 | There can optionally be any combination of the following flags.
44 | .PP
45 | .RS
46 | .IP \[bu] 2
47 | \f[B]UNI_TRUST\f[R](3)
48 | .IP \[bu] 2
49 | \f[B]UNI_NULIFY\f[R](3)
50 | .RE
51 | .PP
52 | The following example demonstrates using these flags with \f[B]uni_next\f[R](3) to parse the first code point of a UTF-16 big endian string.
53 | .PP
54 | .in +4n
55 | .EX
56 | const char16_t text[] = u"Hello, World!";
57 | unisize index = 0;
58 | unichar cp;
59 | uni_next(text, -1, UNI_UTF16 | UNI_BIG, &index, &cp);
60 | .EE
61 | .in
62 | .SH SEE ALSO
63 | .BR UNI_SCALAR (3),
64 | .BR UNI_UTF8 (3),
65 | .BR UNI_UTF16 (3),
66 | .BR UNI_UTF32 (3),
67 | .BR UNI_NATIVE (3),
68 | .BR UNI_LITTLE (3),
69 | .BR UNI_BIG (3),
70 | .BR UNI_TRUST (3),
71 | .BR UNI_NULIFY (3),
72 | .BR uni_next (3)
73 | .SH AUTHOR
74 | .UR https://railgunlabs.com
75 | Railgun Labs
76 | .UE .
77 | .SH INTERNET RESOURCES
78 | The online documentation is published on the
79 | .UR https://railgunlabs.com/unicorn
80 | Railgun Labs website
81 | .UE .
82 | .SH LICENSING
83 | Unicorn is distributed with its end-user license agreement (EULA).
84 | Please review the agreement for information on terms & conditions for accessing or otherwise using Unicorn and for a DISCLAIMER OF ALL WARRANTIES.
85 | 


--------------------------------------------------------------------------------
/scripts/decomposition.py:
--------------------------------------------------------------------------------
 1 | #  Unicorn - Embeddable Unicode Algorithms
 2 | #  Copyright (c) 2024-2025 Railgun Labs
 3 | #
 4 | #  This software is dual-licensed: you can redistribute it and/or modify
 5 | #  it under the terms of the GNU General Public License version 3 as
 6 | #  published by the Free Software Foundation. For the terms of this
 7 | #  license, see <https://www.gnu.org/licenses/>.
 8 | #
 9 | #  Alternatively, you can license this software under a proprietary
10 | #  license, as set out in <https://railgunlabs.com/unicorn/license/>.
11 | 
12 | from typing import List, Set, Type, Tuple
13 | import zipfile
14 | import json
15 | 
16 | from .config import Config, OptimizeFor
17 | from .tools import Codepoint, Codespace
18 | from .feature import Feature
19 | from .basics import StorageSize, SerializationData
20 | from .ccc import CanonicalCombiningClass
21 | from .encoding import EncodingUTF8
22 | 
23 | class CanonicalDecompositionFeature(Feature):
24 |     @staticmethod
25 |     def dependencies() -> Set[Type[Feature]]:
26 |         return set([CanonicalCombiningClass, EncodingUTF8])
27 | 
28 |     def pre_process(self, codespace: Codespace) -> None:
29 |         codespace.register_property("canonical_decomposition_mapping_offset", 0, StorageSize.UINT16)
30 | 
31 |     def process(self, archive: zipfile.ZipFile, codespace: Codespace, config: Config) -> SerializationData:
32 |         decomp_property = codespace.get("canonical_decomposition_mapping_offset")
33 |         if config.optimize == OptimizeFor.SPEED:
34 |             file = archive.open('normalize_for_speed.json')
35 |         else:
36 |             file = archive.open('normalize_for_size.json')
37 |         data = json.loads(file.read())
38 |         mappings: List[Tuple[Codepoint,int]] = data["mappings"]
39 |         source: str = data["source"]
40 |         header: str = data["header"]
41 |         size: int = data["size"]
42 |         file.close()
43 | 
44 |         # Compute the size (in bytes) occupied by the decomposition mapping data.
45 |         if config.optimize == OptimizeFor.SPEED:
46 |             size *= config.character_storage_bytes()
47 | 
48 |         # Associate code points with their canonical decomposition mapping.
49 |         for mapping in mappings:
50 |             cp = mapping[0]
51 |             offset = mapping[1]
52 |             codespace.set(cp, decomp_property, offset)
53 | 
54 |         public_header = "#define UNICORN_FEATURE_NFD\n"
55 |         return SerializationData(source, header, public_header, size)
56 | 


--------------------------------------------------------------------------------
/scripts/composition.py:
--------------------------------------------------------------------------------
 1 | #  Unicorn - Embeddable Unicode Algorithms
 2 | #  Copyright (c) 2024-2025 Railgun Labs
 3 | #
 4 | #  This software is dual-licensed: you can redistribute it and/or modify
 5 | #  it under the terms of the GNU General Public License version 3 as
 6 | #  published by the Free Software Foundation. For the terms of this
 7 | #  license, see <https://www.gnu.org/licenses/>.
 8 | #
 9 | #  Alternatively, you can license this software under a proprietary
10 | #  license, as set out in <https://railgunlabs.com/unicorn/license/>.
11 | 
12 | from typing import List, Set, Tuple, Type
13 | import zipfile
14 | import json
15 | 
16 | from .config import Config
17 | from .tools import Codepoint, Codespace
18 | from .feature import Feature, IS_COMPOSABLE, FLAGS_PROPERTY
19 | from .basics import StorageSize, SerializationData
20 | from .decomposition import CanonicalDecompositionFeature
21 | 
22 | class CanonicalCompositionFeature(Feature):
23 |     @staticmethod
24 |     def dependencies() -> Set[Type[Feature]]:
25 |         return set([CanonicalDecompositionFeature])
26 | 
27 |     def pre_process(self, codespace: Codespace) -> None:
28 |         codespace.register_property("canonical_composition_mapping_offset", 0, StorageSize.UINT16)
29 |         codespace.register_property("canonical_composition_mapping_count", 0, StorageSize.UINT8)
30 |         codespace.register_property(*FLAGS_PROPERTY)
31 | 
32 |     def process(self, archive: zipfile.ZipFile, codespace: Codespace, config: Config) -> SerializationData:
33 |         file = archive.open('composition.json')
34 |         data = json.loads(file.read())
35 |         mappings: List[Tuple[Codepoint,int,int]] = data["compositionMappings"]
36 |         size: int = data["compositionSize"]
37 |         source: str = data["compositionSource"]
38 |         header: str = data["compositionHeader"]
39 |         file.close()
40 | 
41 |         offset_property = codespace.get("canonical_composition_mapping_offset")
42 |         count_property = codespace.get("canonical_composition_mapping_count")
43 |         flags_property = codespace.get(FLAGS_PROPERTY[0])
44 | 
45 |         for value_triplet in mappings:
46 |             first = value_triplet[0]
47 |             offset = value_triplet[1]
48 |             count = value_triplet[2]
49 |             codespace.set(first, offset_property, offset)
50 |             codespace.set(first, count_property, count)
51 |             codespace.set_bitwise(first, flags_property, IS_COMPOSABLE)
52 | 
53 |         public_header = "#define UNICORN_FEATURE_NFC\n"
54 |         return SerializationData(source, header, public_header, size)
55 | 


--------------------------------------------------------------------------------
/man/uni_sortkeycmp.3:
--------------------------------------------------------------------------------
 1 | .TH "UNICORN" "3" "Dec 6th 2025" "Unicorn 1.3.1"
 2 | .SH NAME
 3 | uni_sortkeycmp \- compare sort keys
 4 | .SH LIBRARY
 5 | Embeddable Unicode Algorithms (libunicorn, -lunicorn)
 6 | .SH SYNOPSIS
 7 | .nf
 8 | .B #include <unicorn.h>
 9 | .PP
10 | .BI "unistat uni_sortkeycmp(const uint16_t *" sk1 ", size_t " sk1_len ", const uint16_t *" sk2 ", size_t " sk2_len ", int32_t *" result ");"
11 | .fi
12 | .SH DESCRIPTION
13 | This function compares sort keys \f[I]sk1\f[R] and \f[I]sk2\f[R] and writes either -1, 0, or 1 to \f[I]result\f[R] depending on if \f[C]sk1 < sk2\f[R], \f[C]sk1 = sk2\f[R], or \f[C]sk1 > sk2\f[R].
14 | The sort keys must be generated with \f[B]uni_sortkeymk\f[R](3).
15 | .SH RETURN VALUE
16 | .TP
17 | UNI_OK
18 | If the collation algorithm executed successfully.
19 | .TP
20 | UNI_BAD_OPERATION
21 | If \f[I]sk1\f[R], \f[I]sk2\f[R], or \f[I]result\f[R] are null.
22 | .SH EXAMPLES
23 | This example constructs two sort keys and compares them.
24 | Sort keys must be generated with the same settings for their order to make sense.
25 | In this example, the sort keys are constructed with \f[B]UNI_SHIFTED\f[R] weighting and \f[B]UNI_PRIMARY\f[R] strength.
26 | In a real application you would cache the sort keys for future comparisons.
27 | For one-off comparisons, use \f[B]uni_collate\f[R](3).
28 | .PP
29 | .in +4n
30 | .EX
31 | #include <unicorn.h>
32 | #include <stdio.h>
33 | 
34 | int main(void)
35 | {
36 |     const char *s1 = "hello", *s2 = "Hi";
37 |     uint16_t sk1[32] = {0}, sk2[16] = {0};
38 |     size_t sk1_len = 32, sk2_len = 16;
39 | 
40 |     if (uni_sortkeymk(s1, -1, UNI_UTF8, UNI_SHIFTED, UNI_PRIMARY, sk1, &sk1_len) != UNI_OK ||
41 |         uni_sortkeymk(s2, -1, UNI_UTF8, UNI_SHIFTED, UNI_PRIMARY, sk2, &sk2_len) != UNI_OK)
42 |     {
43 |         puts("failed to construct sort keys");
44 |         return 1;
45 |     }
46 | 
47 |     int32_t result;
48 |     if (uni_sortkeycmp(sk1, sk1_len, sk2, sk2_len, &result) != UNI_OK)
49 |     {
50 |         puts("failed to compare sort keys");
51 |         return 1;
52 |     }
53 | 
54 |     printf("%d", result);
55 |     return 0;
56 | }
57 | .EE
58 | .in
59 | .SH SEE ALSO
60 | .BR uni_sortkeymk (3)
61 | .SH AUTHOR
62 | .UR https://railgunlabs.com
63 | Railgun Labs
64 | .UE .
65 | .SH INTERNET RESOURCES
66 | The online documentation is published on the
67 | .UR https://railgunlabs.com/unicorn
68 | Railgun Labs website
69 | .UE .
70 | .SH LICENSING
71 | Unicorn is distributed with its end-user license agreement (EULA).
72 | Please review the agreement for information on terms & conditions for accessing or otherwise using Unicorn and for a DISCLAIMER OF ALL WARRANTIES.
73 | 


--------------------------------------------------------------------------------
/man/uni_setmemfunc.3:
--------------------------------------------------------------------------------
 1 | .TH "UNICORN" "3" "Dec 6th 2025" "Unicorn 1.3.1"
 2 | .SH NAME
 3 | uni_setmemfunc \- register a memory allocator
 4 | .SH LIBRARY
 5 | Embeddable Unicode Algorithms (libunicorn, -lunicorn)
 6 | .SH SYNOPSIS
 7 | .nf
 8 | .B #include <unicorn.h>
 9 | .PP
10 | .BI "unistat uni_setmemfunc(void *" user_data ", unimemfunc " allocf ");"
11 | .fi
12 | .SH DESCRIPTION
13 | Sets \f[I]allocf\f[R] as the implementation for dynamic memory allocations.
14 | If \f[I]allocf\f[R] is null then the implementation reverts to its default allocator which may be the C standard allocator or a dummy allocator that always fails to allocate memory.
15 | The latter is only present when the C standard library allocators are disabled.
16 | .PP
17 | Support for C standard library allocators must be enabled in the JSON configuration file.
18 | If the standard allocators are not enabled, then dummy allocators that always return NULL are used.
19 | In this case, a custom allocator must be supplied.
20 | .PP
21 | .in +4n
22 | .EX
23 | {
24 |     "hasStandardAllocators": true
25 | }
26 | .EE
27 | .in
28 | .SH RETURN VALUE
29 | .TP
30 | UNI_OK
31 | If the memory allocator was modified.
32 | .TP
33 | UNI_FEATURE_DISABLED
34 | If \f[I]allocf\f[R] is null and Unicorn was built without support for the C standard library memory allocator.
35 | .SH EXAMPLES
36 | This example registers a custom memory allocator with Unicorn.
37 | In the example, the custom allocator wraps the C standard memory allocators, however, this is redundant because Unicorn already initializes with a custom allocator that wraps the C standard allocators.
38 | They are wrapped here for example purposes only.
39 | An actual implementation would implement and wrap its own custom memory allocator.
40 | .PP
41 | .in +4n
42 | .EX
43 | #include <unicorn.h>
44 | #include <stdlib.h>
45 | 
46 | static void *allocfree(void *ud, void *ptr, size_t osize, size_t nsize)
47 | {
48 |     if (nsize == 0)
49 |     {
50 |         free(ptr);
51 |         return NULL;
52 |     }
53 |     else
54 |     {
55 |         return realloc(ptr, nsize);
56 |     }
57 | }
58 | 
59 | int main(void)
60 | {
61 |     uni_setmemfunc(NULL, allocfree);
62 |     return 0;
63 | }
64 | .EE
65 | .in
66 | .SH SEE ALSO
67 | .BR unimemfunc (3)
68 | .SH AUTHOR
69 | .UR https://railgunlabs.com
70 | Railgun Labs
71 | .UE .
72 | .SH INTERNET RESOURCES
73 | The online documentation is published on the
74 | .UR https://railgunlabs.com/unicorn
75 | Railgun Labs website
76 | .UE .
77 | .SH LICENSING
78 | Unicorn is distributed with its end-user license agreement (EULA).
79 | Please review the agreement for information on terms & conditions for accessing or otherwise using Unicorn and for a DISCLAIMER OF ALL WARRANTIES.
80 | 


--------------------------------------------------------------------------------
/scripts/quickcheck.py:
--------------------------------------------------------------------------------
 1 | #  Unicorn - Embeddable Unicode Algorithms
 2 | #  Copyright (c) 2024-2025 Railgun Labs
 3 | #
 4 | #  This software is dual-licensed: you can redistribute it and/or modify
 5 | #  it under the terms of the GNU General Public License version 3 as
 6 | #  published by the Free Software Foundation. For the terms of this
 7 | #  license, see <https://www.gnu.org/licenses/>.
 8 | #
 9 | #  Alternatively, you can license this software under a proprietary
10 | #  license, as set out in <https://railgunlabs.com/unicorn/license/>.
11 | 
12 | from typing import Dict, Set, Type
13 | import zipfile
14 | import json
15 | 
16 | from .config import Config
17 | from .tools import Codespace
18 | from .feature import Feature
19 | from .basics import StorageSize, SerializationData
20 | from .ccc import CanonicalCombiningClass
21 | 
22 | class NFCQuickCheck(Feature):
23 |     @staticmethod
24 |     def dependencies() -> Set[Type[Feature]]:
25 |         return set([CanonicalCombiningClass])
26 | 
27 |     def pre_process(self, codespace: Codespace) -> None:
28 |         codespace.register_property("quick_check_flags", 0, StorageSize.UINT8)
29 | 
30 |     def process(self, archive: zipfile.ZipFile, codespace: Codespace, _: Config) -> SerializationData:
31 |         file = archive.open('quickcheck.json')
32 |         data = json.loads(file.read())
33 |         nfc_quick_check: Dict[str,int] = data["quickCheckNFC"]
34 |         file.close()
35 | 
36 |         qc_flags_property = codespace.get("quick_check_flags")
37 |         for cp, qc_value in nfc_quick_check.items():
38 |             codespace.set_bitwise(int(cp,16), qc_flags_property, qc_value << 4)
39 | 
40 |         public_header = "#define UNICORN_FEATURE_NFC_QUICK_CHECK\n"
41 |         return SerializationData(public_header=public_header)
42 | 
43 | class NFDQuickCheck(Feature):
44 |     @staticmethod
45 |     def dependencies() -> Set[Type[Feature]]:
46 |         return set([CanonicalCombiningClass])
47 | 
48 |     def pre_process(self, codespace: Codespace) -> None:
49 |         codespace.register_property("quick_check_flags", 0, StorageSize.UINT8)
50 | 
51 |     def process(self, archive: zipfile.ZipFile, codespace: Codespace, _: Config) -> SerializationData:
52 |         file = archive.open('quickcheck.json')
53 |         data = json.loads(file.read())
54 |         nfd_quick_check: Dict[str,int] = data["quickCheckNFD"]
55 |         file.close()
56 | 
57 |         qc_flags_property = codespace.get("quick_check_flags")
58 |         for cp, qc_value in nfd_quick_check.items():
59 |             codespace.set_bitwise(int(cp,16), qc_flags_property, qc_value)
60 | 
61 |         public_header = "#define UNICORN_FEATURE_NFD_QUICK_CHECK\n"
62 |         return SerializationData(public_header=public_header)
63 | 


--------------------------------------------------------------------------------
/man/uni_next.3:
--------------------------------------------------------------------------------
 1 | .TH "UNICORN" "3" "Dec 6th 2025" "Unicorn 1.3.1"
 2 | .SH NAME
 3 | uni_next \- decode a scalar value
 4 | .SH LIBRARY
 5 | Embeddable Unicode Algorithms (libunicorn, -lunicorn)
 6 | .SH SYNOPSIS
 7 | .nf
 8 | .B #include <unicorn.h>
 9 | .PP
10 | .BI "unistat uni_next(const void *" text ", unisize " text_len ", uniattr " text_attr ", unisize *" index ", unichar *" cp ");"
11 | .fi
12 | .SH DESCRIPTION
13 | This function decodes one Unicode scalar value from \f[I]text\f[R] at code unit \f[I]index\f[R] and writes the result to \f[I]cp\f[R].
14 | The \f[I]index\f[R] parameter is updated by the implementation to refer to the code unit beginning the next scalar.
15 | .PP
16 | It returns \f[B]UNI_DONE\f[R] when iteration has reached the end of \f[I]text\f[R], indicating there are no more characters.
17 | If the implementation detects \f[I]text\f[R] is malformed, then it returns \f[B]UNI_BAD_ENCODING\f[R].
18 | .PP
19 | The \f[I]index\f[R] parameter must refer to a code point boundary otherwise the behavior is undefined.
20 | .SH RETURN VALUE
21 | .TP
22 | UNI_OK
23 | If the scalar was successfully decoded.
24 | .TP
25 | UNI_DONE
26 | If the end of \f[I]text\f[R] was reached.
27 | .TP
28 | UNI_BAD_ENCODING
29 | If \f[I]text\f[R] is malformed; this is never returned if \f[I]text_attr\f[R] has \f[B]UNI_TRUST\f[R](3).
30 | .TP
31 | UNI_BAD_OPERATION
32 | If \f[I]text\f[R], \f[I]index\f[R], or \f[I]cp\f[R] are NULL.
33 | .SH EXAMPLES
34 | This example prints each Unicode scalar value from a text string encoded as UTF-8.
35 | .PP
36 | .in +4n
37 | .EX
38 | #include <unicorn.h>
39 | #include <stdio.h>
40 | 
41 | int main(void)
42 | {
43 |     const char str[] = u8"I 🕵️."; // I spy
44 |     unisize i = 0;
45 |     for (;;)
46 |     {
47 |         unichar cp;
48 |         unistat r = uni_next(str, -1, UNI_UTF8, &i, &cp);
49 |         if (r == UNI_DONE)
50 |         {
51 |             break;
52 |         }
53 |         else if (r == UNI_BAD_ENCODING)
54 |         {
55 |             // malformed character
56 |         }
57 |         else
58 |         {
59 |             printf("U+%04X\\n", cp); // print scalar
60 |         }
61 |     }
62 |     return 0;
63 | }
64 | .EE
65 | .in
66 | .SH SEE ALSO
67 | .BR unistat (3),
68 | .BR UNI_TRUST (3),
69 | .BR unisize (3),
70 | .BR uniattr (3),
71 | .BR unichar (3)
72 | .SH AUTHOR
73 | .UR https://railgunlabs.com
74 | Railgun Labs
75 | .UE .
76 | .SH INTERNET RESOURCES
77 | The online documentation is published on the
78 | .UR https://railgunlabs.com/unicorn
79 | Railgun Labs website
80 | .UE .
81 | .SH LICENSING
82 | Unicorn is distributed with its end-user license agreement (EULA).
83 | Please review the agreement for information on terms & conditions for accessing or otherwise using Unicorn and for a DISCLAIMER OF ALL WARRANTIES.
84 | 


--------------------------------------------------------------------------------
/examples/collate_text.c:
--------------------------------------------------------------------------------
 1 | /*
 2 |  *  This file is public domain.
 3 |  *  You may use, modify, and distribute it without restriction.
 4 |  */
 5 | 
 6 | // Examples are also documented online at:
 7 | // <https://railgunlabs.com/unicorn/manual/code-examples/>.
 8 | 
 9 | #include <unicorn.h>
10 | #include <stdio.h>
11 | 
12 | int main(int argc, char *argv[])
13 | {
14 |     // This code example demonstrates how to collate strings (compare strings for sorting).
15 |     // The online documentation is available here:
16 |     // <https://railgunlabs.com/unicorn/manual/api/collation/>.
17 | 
18 |     const char *s1 = u8"Hello World";
19 |     const char *s2 = u8"こんにちは世界";
20 | 
21 |     uint16_t sk1[64] = {0};
22 |     uint16_t sk2[64] = {0};
23 | 
24 |     size_t sk1_len = 64;
25 |     size_t sk2_len = 64;
26 | 
27 |     int32_t result = 0;
28 |     
29 |     // The following snippet collates two strings and prints the result of the comparison.
30 |     // This function, conceptually, builds sort keys for the input strings and compares them.
31 |     // Building a sort key is expensive. If strings are collated multiple times, it is recommended
32 |     // to build the sort keys once and use them for subsequent comparisons.
33 |     if (uni_collate(s1, -1, UNI_UTF8, s2, -1, UNI_UTF8, UNI_NON_IGNORABLE, UNI_PRIMARY, &result) == UNI_OK)
34 |     {
35 |         if (result < 0)
36 |         {
37 |             puts("s1 < s2");
38 |         }
39 |         else if (result > 0)
40 |         {
41 |             puts("s1 > s2");
42 |         }
43 |         else
44 |         {
45 |             puts("s1 = s2");
46 |         }
47 |     }
48 | 
49 |     // The following snippet collates strings the manual way. The first step is to
50 |     // generate a "sort key" for both strings. The sort key encodes the sorting
51 |     // weights of the string.
52 |     // 
53 |     // Constructing a sort key is the computationally expensive part of the process.
54 |     // Once the sort keys are constructed, they are compared which is inexpensive.
55 |     if (uni_sortkeymk(s1, -1, UNI_UTF8, UNI_NON_IGNORABLE, UNI_PRIMARY, sk1, &sk1_len) == UNI_OK)
56 |     {
57 |         if (uni_sortkeymk(s2, -1, UNI_UTF8, UNI_NON_IGNORABLE, UNI_PRIMARY, sk2, &sk2_len) == UNI_OK)
58 |         {
59 |             if (uni_sortkeycmp(sk1, sk1_len, sk2, sk2_len, &result) == UNI_OK)
60 |             {
61 |                 if (result < 0)
62 |                 {
63 |                     puts("s1 < s2");
64 |                 }
65 |                 else if (result > 0)
66 |                 {
67 |                     puts("s1 > s2");
68 |                 }
69 |                 else
70 |                 {
71 |                     puts("s1 = s2");
72 |                 }
73 |             }
74 |         }
75 |     }
76 |     return 0;
77 | }
78 | 


--------------------------------------------------------------------------------
/src/memory.c:
--------------------------------------------------------------------------------
 1 | /*
 2 |  *  Unicorn - Embeddable Unicode Algorithms
 3 |  *  Copyright (c) 2024-2025 Railgun Labs
 4 |  *
 5 |  *  This software is dual-licensed: you can redistribute it and/or modify
 6 |  *  it under the terms of the GNU General Public License version 3 as
 7 |  *  published by the Free Software Foundation. For the terms of this
 8 |  *  license, see <https://www.gnu.org/licenses/>.
 9 |  *
10 |  *  Alternatively, you can license this software under a proprietary
11 |  *  license, as set out in <https://railgunlabs.com/unicorn/license/>.
12 |  */
13 | 
14 | #include "common.h"
15 | #include "unidata.h"
16 | 
17 | #if defined(UNICORN_HAVE_C_MEMORY_ROUTINES)
18 | #include <stdlib.h>
19 | static void *default_allocf(void *ud, void *ptr, size_t osize, size_t nsize) // Default allocator built on free() and realloc().
20 | {
21 |     (void)ud; // Unused by the default allocator.
22 |     (void)osize; // Unused by the default allocator.
23 |     void *mem = NULL; // Returned unchanged (NULL) on the free path.
24 |     if (nsize == (size_t)0) // A new size of zero releases the block.
25 |     {
26 |         free(ptr); // cppcheck-suppress misra-c2012-21.3 ; This function is optional.
27 |     }
28 |     else
29 |     {
30 |         mem = realloc(ptr, nsize); // cppcheck-suppress misra-c2012-21.3 ; This function is optional.
31 |     }
32 |     return mem;
33 | }
34 | static unimemfunc unicorn_allocf = &default_allocf; // Active allocator; starts as the default.
35 | #else
36 | static unimemfunc unicorn_allocf; // Active allocator; zero-initialized (NULL) until uni_setmemfunc() installs one.
37 | #endif
38 | 
39 | static void *unicorn_ud; // User data passed as the first argument of every allocator call.
40 | 
41 | UNICORN_API unistat uni_setmemfunc(void *user_data, unimemfunc allocf) // cppcheck-suppress misra-c2012-8.7 ; This is supposed to have external linkage.
42 | {
43 |     unistat status = UNI_OK;
44 |     if (allocf == NULL) // A NULL allocator requests a reset to the built-in default.
45 |     {
46 |         unicorn_ud = NULL; // The caller's user_data is not kept on reset.
47 | #if defined(UNICORN_HAVE_C_MEMORY_ROUTINES)
48 |         unicorn_allocf = &default_allocf;
49 | #else
50 |         unicorn_allocf = NULL; // No default exists in this configuration; the uni_* wrappers then return NULL.
51 |         status = UNI_FEATURE_DISABLED;
52 | #endif
53 |     }
54 |     else
55 |     {
56 |         unicorn_ud = user_data; // Passed back to allocf on every later call.
57 |         unicorn_allocf = allocf;
58 |     }
59 |     return status;
60 | }
61 | 
62 | void *uni_malloc(size_t size)
63 | {
64 |     void *block = NULL; // returned as-is when no allocator is installed
65 |     if (unicorn_allocf != NULL) // LCOV_EXCL_BR_LINE: Branch taken only by the features.json combinations tests.
66 |     {
67 |         block = unicorn_allocf(unicorn_ud, NULL, 0, size); // no old block is passed, only the requested size
68 |     }
69 |     return block;
70 | }
71 | 
72 | void *uni_realloc(void *old_ptr, size_t old_size, size_t new_size)
73 | {
74 |     void *resized = NULL; // returned as-is when no allocator is installed
75 |     if (unicorn_allocf != NULL) // LCOV_EXCL_BR_LINE: Branch taken only by the features.json combinations tests.
76 |     {
77 |         resized = unicorn_allocf(unicorn_ud, old_ptr, old_size, new_size); // old block and both sizes are forwarded untouched
78 |     }
79 |     return resized;
80 | }
81 | 
82 | void uni_free(void *ptr, size_t size) // does nothing when no allocator is installed
83 | {
84 |     if (unicorn_allocf != NULL) // LCOV_EXCL_BR_LINE: Branch taken only by the features.json combinations tests.
85 |     {
86 |         (void)unicorn_allocf(unicorn_ud, ptr, size, 0); // a new size of 0 is the release request (default_allocf frees on it); the result is discarded
87 |     }
88 | }
89 | 


--------------------------------------------------------------------------------
/UnicornConfig.cmake.in:
--------------------------------------------------------------------------------
 1 | #[=======================================================================[.rst:
 2 | FindUnicorn
 3 | ------------
 4 | 
 5 | Finds the Unicorn library.
 6 | 
 7 | Imported Targets
 8 | ^^^^^^^^^^^^^^^^
 9 | 
10 | This module provides the following imported targets, if found:
11 | 
12 | ``Railgun::Unicorn``
13 |   The Unicorn library
14 | 
15 | Result Variables
16 | ^^^^^^^^^^^^^^^^
17 | 
18 | This will define the following variables:
19 | 
20 | ``UNICORN_FOUND``
21 |   True if the system has the Unicorn library.
22 | ``UNICORN_VERSION``
23 |   The version of the Unicorn library which was found.
24 | ``UNICORN_INCLUDE_DIRS``
25 |   Include directories needed to use Unicorn.
26 | ``UNICORN_LIBRARIES``
27 |   Libraries needed to link to Unicorn.
28 | 
29 | Cache Variables
30 | ^^^^^^^^^^^^^^^
31 | 
32 | The following cache variables may also be set:
33 | 
34 | ``UNICORN_INCLUDE_DIR``
35 |   The directory containing ``unicorn.h``.
36 | ``UNICORN_LIBRARY``
37 |   The path to the Unicorn library.
38 | 
39 | #]=======================================================================]
40 | 
41 | # On Windows the header and library are installed in the same directory as this
42 | # script, so that directory is recorded here and offered as a search hint below.
43 | set(UNICORN_CONFIG_DIR "${CMAKE_CURRENT_LIST_DIR}")
44 | 
45 | # Find header.
46 | find_path(UNICORN_INCLUDE_DIR unicorn.h
47 |           HINTS /usr/local/include /usr/include "${UNICORN_CONFIG_DIR}")
48 | 
49 | # Find library.
50 | find_library(UNICORN_LIBRARY
51 |              NAMES Unicorn unicorn
52 |              HINTS /usr/local/lib /usr/lib "${UNICORN_CONFIG_DIR}")
53 | 
54 | # The helper variable is no longer needed; do not leak it into the caller's scope.
55 | unset(UNICORN_CONFIG_DIR)
56 | 
57 | # Use version specified in top-level CMakeLists.txt
58 | set(UNICORN_VERSION "@PROJECT_VERSION@")
59 | 
60 | # Handle the QUIETLY and REQUIRED arguments and set UNICORN_FOUND to TRUE if all listed variables are TRUE.
61 | include(FindPackageHandleStandardArgs)
62 | find_package_handle_standard_args(Unicorn
63 |                                   FOUND_VAR UNICORN_FOUND
64 |                                   REQUIRED_VARS UNICORN_LIBRARY UNICORN_INCLUDE_DIR
65 |                                   VERSION_VAR UNICORN_VERSION)
66 | 
67 | # Set result variables.
68 | if(UNICORN_FOUND)
69 |     set(UNICORN_LIBRARIES "${UNICORN_LIBRARY}")
70 |     set(UNICORN_INCLUDE_DIRS "${UNICORN_INCLUDE_DIR}")
71 | endif()
72 | 
73 | # Define the imported target, unless an earlier find_package() call already did.
74 | if(UNICORN_FOUND AND NOT TARGET Railgun::Unicorn)
75 |     add_library(Railgun::Unicorn UNKNOWN IMPORTED)
76 |     set_target_properties(Railgun::Unicorn PROPERTIES
77 |                           IMPORTED_LOCATION "${UNICORN_LIBRARY}"
78 |                           INTERFACE_INCLUDE_DIRECTORIES "${UNICORN_INCLUDE_DIR}")
79 | endif()
80 | 
81 | # Cached variables should be hidden in the CMake interface unless the user explicitly asks to edit them.
82 | mark_as_advanced(UNICORN_INCLUDE_DIR UNICORN_LIBRARY)
83 | 


--------------------------------------------------------------------------------
/man/uni_caseconv.3:
--------------------------------------------------------------------------------
 1 | .TH "UNICORN" "3" "Dec 6th 2025" "Unicorn 1.3.1"
 2 | .SH NAME
 3 | uni_caseconv \- case convert a string
 4 | .SH LIBRARY
 5 | Embeddable Unicode Algorithms (libunicorn, -lunicorn)
 6 | .SH SYNOPSIS
 7 | .nf
 8 | .B #include <unicorn.h>
 9 | .PP
10 | .BI "unistat uni_caseconv(unicaseconv " casing ", const void *" src ", unisize " src_len ", uniattr " src_attr ", void *" dst ", unisize *" dst_len ", uniattr " dst_attr ");"
11 | .fi
12 | .SH DESCRIPTION
13 | This function case converts \f[I]src\f[R] to casing form \f[I]casing\f[R] and writes the result to \f[I]dst\f[R].
14 | The capacity of \f[I]dst\f[R] is specified by \f[I]dst_len\f[R].
15 | .PP
16 | If \f[I]dst\f[R] is not null, then the implementation writes to \f[I]dst_len\f[R] the total number of code units written to \f[I]dst\f[R].
17 | If the capacity of \f[I]dst\f[R] is insufficient, then \f[B]UNI_NO_SPACE\f[R] is returned.
18 | .PP
19 | If \f[I]dst\f[R] is null and \f[I]dst_len\f[R] is zero, then the implementation writes to \f[I]dst_len\f[R] the number of code units in the fully case converted text and returns \f[B]UNI_OK\f[R].
20 | Call the function this way to first compute the total length of the destination buffer before calling it again with a sufficiently sized buffer.
21 | .SH RETURN VALUE
22 | .TP
23 | UNI_OK
24 | If \f[I]src\f[R] was case converted successfully.
25 | .TP
26 | UNI_BAD_OPERATION
27 | If \f[I]src\f[R] is null, if \f[I]dst_len\f[R] is null, or if \f[I]dst\f[R] is null and \f[I]dst_len\f[R] is non-zero.
28 | .TP
29 | UNI_BAD_ENCODING
30 | If \f[I]src\f[R] is not well-formed (checks are omitted if \f[I]src_attr\f[R] has \f[B]UNI_TRUST\f[R](3)).
31 | .TP
32 | UNI_NO_SPACE
33 | If \f[I]dst\f[R] was not large enough to accommodate the case converted text.
34 | .TP
35 | UNI_FEATURE_DISABLED
36 | If the library was built without support for case conversion.
37 | .SH EXAMPLES
38 | This example case converts a string to uppercase.
39 | .PP
40 | .in +4n
41 | .EX
42 | #include <unicorn.h>
43 | #include <stdio.h>
44 | 
45 | int main(void)
46 | {
47 |     const char *in = u8"hello, 世界";
48 |     char out[32];
49 |     unisize outlen = sizeof(out);
50 | 
51 |     if (uni_caseconv(UNI_UPPER, in, -1, UNI_UTF8, out, &outlen, UNI_UTF8) != UNI_OK)
52 |     {
53 |         // something went wrong
54 |         return 1;
55 |     }
56 | 
57 |     printf("%.*s", outlen, out); // prints "HELLO, 世界"
58 |     return 0;
59 | }
60 | .EE
61 | .in
62 | .SH SEE ALSO
63 | .BR unistat (3),
64 | .BR UNI_TRUST (3),
65 | .BR unicaseconv (3),
66 | .BR unisize (3),
67 | .BR uniattr (3)
68 | .SH AUTHOR
69 | .UR https://railgunlabs.com
70 | Railgun Labs
71 | .UE .
72 | .SH INTERNET RESOURCES
73 | The online documentation is published on the
74 | .UR https://railgunlabs.com/unicorn
75 | Railgun Labs website
76 | .UE .
77 | .SH LICENSING
78 | Unicorn is distributed with its end-user license agreement (EULA).
79 | Please review the agreement for information on terms & conditions for accessing or otherwise using Unicorn and for a DISCLAIMER OF ALL WARRANTIES.
80 | 


--------------------------------------------------------------------------------
/scripts/case_folding.py:
--------------------------------------------------------------------------------
 1 | #  Unicorn - Embeddable Unicode Algorithms
 2 | #  Copyright (c) 2024-2025 Railgun Labs
 3 | #
 4 | #  This software is dual-licensed: you can redistribute it and/or modify
 5 | #  it under the terms of the GNU General Public License version 3 as
 6 | #  published by the Free Software Foundation. For the terms of this
 7 | #  license, see <https://www.gnu.org/licenses/>.
 8 | #
 9 | #  Alternatively, you can license this software under a proprietary
10 | #  license, as set out in <https://railgunlabs.com/unicorn/license/>.
11 | 
12 | from typing import List, Set, Type, Tuple
13 | import zipfile
14 | import json
15 | 
16 | from .config import Config
17 | from .tools import Codespace, Codepoint
18 | from .feature import Feature, FLAGS_PROPERTY, IS_NORMALIZATION_NEEDED, IS_CHANGES_WHEN_CASEFOLDED
19 | from .basics import StorageSize, SerializationData
20 | from .decomposition import CanonicalDecompositionFeature
21 | 
22 | class DefaultCaseFolding(Feature):
23 |     def pre_process(self, codespace: Codespace) -> None:
24 |         codespace.register_property("full_casefold_mapping_offset", 0, StorageSize.UINT16)
25 |         codespace.register_property(*FLAGS_PROPERTY)
26 | 
27 |     def process(self, archive: zipfile.ZipFile, codespace: Codespace, config: Config) -> SerializationData:
28 |         file = archive.open('casefold.json')
29 |         data = json.loads(file.read())
30 |         source: str = data["source"]
31 |         header: str = data["header"]
32 |         size: int = data["size"]
33 |         has_greek: List[Codepoint] = data["hasGreek"]
34 |         case_foldings: List[Tuple[Codepoint,int]] = data["caseFoldings"]
35 |         changes_when_casefolded: List[Codepoint] = data["changesWhenCasefolded"]
36 |         file.close()
37 | 
38 |         flags_property = codespace.get(FLAGS_PROPERTY[0])
39 |         casefold_property = codespace.get("full_casefold_mapping_offset")
40 | 
41 |         for cp in has_greek:
42 |             codespace.set_bitwise(cp, flags_property, IS_NORMALIZATION_NEEDED)
43 | 
44 |         for cp in changes_when_casefolded:
45 |             codespace.set_bitwise(cp, flags_property, IS_CHANGES_WHEN_CASEFOLDED)
46 | 
47 |         for cp,offset in case_foldings:
48 |             codespace.set(cp, casefold_property, offset)
49 | 
50 |         header += "#define UNICORN_CHAR_NEEDS_NORMALIZATION {0}u\n".format(IS_NORMALIZATION_NEEDED)
51 |         header += "#define UNICORN_CHAR_CHANGES_WHEN_CASEFOLDED {0}u\n".format(IS_CHANGES_WHEN_CASEFOLDED)
52 | 
53 |         public_header = "#define UNICORN_FEATURE_CASEFOLD_DEFAULT\n"
54 |         return SerializationData(source, header, public_header, size)
55 | 
56 | class CanonicalCaseFolding(Feature):
57 |     @staticmethod
58 |     def dependencies() -> Set[Type[Feature]]:
59 |         return set([DefaultCaseFolding, CanonicalDecompositionFeature])
60 | 
61 |     def process(self, archive: zipfile.ZipFile, codespace: Codespace, config: Config) -> SerializationData:
62 |         public_header = "#define UNICORN_FEATURE_CASEFOLD_CANONICAL\n"
63 |         return SerializationData(public_header=public_header)
64 | 


--------------------------------------------------------------------------------
/man/unistat.3:
--------------------------------------------------------------------------------
 1 | .TH "UNICORN" "3" "Dec 6th 2025" "Unicorn 1.3.1"
 2 | .SH NAME
 3 | unistat \- status code
 4 | .SH LIBRARY
 5 | Embeddable Unicode Algorithms (libunicorn, -lunicorn)
 6 | .SH SYNOPSIS
 7 | .nf
 8 | .B #include <unicorn.h>
 9 | .PP
10 | .B enum unistat {
11 | .RS
12 | .B UNI_OK,
13 | .B UNI_DONE,
14 | .B UNI_NO_MEMORY,
15 | .B UNI_NO_SPACE,
16 | .B UNI_BAD_ENCODING,
17 | .B UNI_BAD_OPERATION,
18 | .B UNI_FEATURE_DISABLED,
19 | .B UNI_MALFUNCTION,
20 | .RE
21 | .B };
22 | .fi
23 | .SH DESCRIPTION
24 | Most functions in the Unicorn library return a constant from this enumeration.
25 | Of the constants, only \f[B]UNI_OK\f[R] and \f[B]UNI_DONE\f[R] represent a success status whereas the others represent failures.
26 | .SH CONSTANTS
27 | .TP
28 | .BR UNI_OK
29 | Represents the successful execution of an operation.
30 | .TP
31 | .BR UNI_DONE
32 | Represents the successful completion of an operation.
33 | An example of a function that returns this status code is \f[B]uni_next\f[R](3) which returns it when the end of the input string has been reached.
34 | .TP
35 | .BR UNI_NO_MEMORY
36 | This failure code indicates that dynamic memory allocation failed.
37 | Functions that can dynamically allocate memory list this constant as one of their potential return values, whereas functions that omit it do not perform dynamic memory allocation.
38 | .TP
39 | .BR UNI_NO_SPACE
40 | This failure code indicates the destination buffer is too small to receive the content and therefore a larger buffer must be used.
41 | .TP
42 | .BR UNI_BAD_ENCODING
43 | This failure code indicates a malformed character sequence was encountered when decoding text.
44 | Examples of malformed character sequences would be overlong sequences in UTF-8 or unpaired surrogate characters in UTF-16.
45 | .TP
46 | .BR UNI_BAD_OPERATION
47 | This failure code is returned by functions when they are called with invalid arguments.
48 | For example, a function might return this code when given a null pointer when a non-null pointer was expected.
49 | .TP
50 | .BR UNI_FEATURE_DISABLED
51 | This failure code indicates the requested operation cannot be performed because the feature is disabled.
52 | Unicorn allows consumers to customize the Unicode features it's built with.
53 | This is useful for reducing the footprint of the library.
54 | If a user attempts to call a function that uses a disabled feature of the library, then this status code will be returned.
55 | .TP
56 | .BR UNI_MALFUNCTION
57 | This failure code indicates a defect with the implementation.
58 | In a working version of Unicorn, an application will never see this status code.
59 | If an application encounters this code, it means there is a bug in Unicorn.
60 | .SH SEE ALSO
61 | .BR uni_next (3)
62 | .SH AUTHOR
63 | .UR https://railgunlabs.com
64 | Railgun Labs
65 | .UE .
66 | .SH INTERNET RESOURCES
67 | The online documentation is published on the
68 | .UR https://railgunlabs.com/unicorn
69 | Railgun Labs website
70 | .UE .
71 | .SH LICENSING
72 | Unicorn is distributed with its end-user license agreement (EULA).
73 | Please review the agreement for information on terms & conditions for accessing or otherwise using Unicorn and for a DISCLAIMER OF ALL WARRANTIES.
74 | 


--------------------------------------------------------------------------------
/man/uni_norm.3:
--------------------------------------------------------------------------------
 1 | .TH "UNICORN" "3" "Dec 6th 2025" "Unicorn 1.3.1"
 2 | .SH NAME
 3 | uni_norm \- normalize text
 4 | .SH LIBRARY
 5 | Embeddable Unicode Algorithms (libunicorn, -lunicorn)
 6 | .SH SYNOPSIS
 7 | .nf
 8 | .B #include <unicorn.h>
 9 | .PP
10 | .BI "unistat uni_norm(uninormform " form ", const void *" src ", unisize " src_len ", uniattr " src_attr ", void *" dst ", unisize *" dst_len ", uniattr " dst_attr ");"
11 | .fi
12 | .SH DESCRIPTION
13 | This function normalizes \f[I]src\f[R] and writes the result to the \f[I]dst\f[R] buffer.
14 | .PP
15 | The implementation writes to \f[I]dst_len\f[R] the total number of code units written to \f[I]dst\f[R].
16 | If the capacity of \f[I]dst\f[R] is insufficient, then \f[B]UNI_NO_SPACE\f[R] is returned.
17 | .PP
18 | If \f[I]dst\f[R] is null and \f[I]dst_len\f[R] is zero, then the function writes to \f[I]dst_len\f[R] the number of code units in the fully normalized text and returns \f[B]UNI_OK\f[R].
19 | The function can be called this way to first compute the total size of the destination buffer, then called again with a sufficiently sized buffer.
20 | .PP
21 | If \f[I]src_len\f[R] is -1, then \f[I]src\f[R] is assumed to be null-terminated.
22 | .SH RETURN VALUE
23 | .TP
24 | UNI_OK
25 | On success.
26 | .TP
27 | UNI_BAD_OPERATION
28 | If \f[I]src\f[R] is null, if \f[I]dst_len\f[R] is negative, or if \f[I]dst\f[R] is NULL and \f[I]dst_len\f[R] is greater than zero.
29 | .TP
30 | UNI_BAD_ENCODING
31 | If \f[I]src\f[R] is malformed; this is never returned if \f[I]src_attr\f[R] has \f[B]UNI_TRUST\f[R](3).
32 | .TP
33 | UNI_NO_SPACE
34 | If \f[I]dst\f[R] lacks the capacity to store the normalization of \f[I]src\f[R].
35 | .TP
36 | UNI_NO_MEMORY
37 | If dynamic memory allocation failed.
38 | .TP
39 | UNI_FEATURE_DISABLED
40 | If Unicorn was built without support for normalizing to \f[I]form\f[R].
41 | .SH EXAMPLES
42 | This example normalizes the input string to Normalization Form D. This form is ideal for in-memory string comparison because it is quick to compute.
43 | For persistent storage or transmission, Normalization Form C is preferred.
44 | .PP
45 | .in +4n
46 | .EX
47 | #include <unicorn.h>
48 | #include <stdio.h>
49 | 
50 | int main(void)
51 | {
52 |     const char *in = u8"Åström";
53 |     char out[32];
54 |     unisize outlen = sizeof(out);
55 | 
56 |     if (uni_norm(UNI_NFD, in, -1, UNI_UTF8,  out, &outlen, UNI_UTF8) != UNI_OK)
57 |     {
58 |         // something went wrong
59 |         return 1;
60 |     }
61 | 
62 |     printf("%.*s", outlen, out);
63 |     return 0;
64 | }
65 | .EE
66 | .in
67 | .SH SEE ALSO
68 | .BR unistat (3),
69 | .BR UNI_TRUST (3),
70 | .BR uninormform (3),
71 | .BR unisize (3),
72 | .BR uniattr (3)
73 | .SH AUTHOR
74 | .UR https://railgunlabs.com
75 | Railgun Labs
76 | .UE .
77 | .SH INTERNET RESOURCES
78 | The online documentation is published on the
79 | .UR https://railgunlabs.com/unicorn
80 | Railgun Labs website
81 | .UE .
82 | .SH LICENSING
83 | Unicorn is distributed with its end-user license agreement (EULA).
84 | Please review the agreement for information on terms & conditions for accessing or otherwise using Unicorn and for a DISCLAIMER OF ALL WARRANTIES.
85 | 


--------------------------------------------------------------------------------
/scripts/case_conversion.py:
--------------------------------------------------------------------------------
 1 | #  Unicorn - Embeddable Unicode Algorithms
 2 | #  Copyright (c) 2024-2025 Railgun Labs
 3 | #
 4 | #  This software is dual-licensed: you can redistribute it and/or modify
 5 | #  it under the terms of the GNU General Public License version 3 as
 6 | #  published by the Free Software Foundation. For the terms of this
 7 | #  license, see <https://www.gnu.org/licenses/>.
 8 | #
 9 | #  Alternatively, you can license this software under a proprietary
10 | #  license, as set out in <https://railgunlabs.com/unicorn/license/>.
11 | 
12 | from typing import Dict, Set, Type
13 | import zipfile
14 | import json
15 | 
16 | from .config import Config
17 | from .tools import Codespace
18 | from .feature import Feature, FLAGS_PROPERTY
19 | from .simple_case_mappings import SimpleLowercaseMapping, SimpleTitlecaseMapping, SimpleUppercaseMapping
20 | from .segmentation import WordBreak
21 | from .ccc import CanonicalCombiningClass
22 | from .basics import SerializationData
23 | 
24 | class _CaseConversion(Feature):
25 |     def pre_process(self, codespace: Codespace) -> None:
26 |         codespace.register_property(*FLAGS_PROPERTY)
27 | 
28 |     def process(self, archive: zipfile.ZipFile, codespace: Codespace, config: Config) -> SerializationData:
29 |         file = archive.open('special_case_mappings.json')
30 |         data = json.loads(file.read())
31 |         source: str = data["source"]
32 |         header: str = data["header"]
33 |         size: int = data["size"]
34 |         mappings: Dict[str,int] = data["characterFlags"]
35 |         file.close()
36 | 
37 |         flags_property = codespace.get(FLAGS_PROPERTY[0])
38 | 
39 |         for cp, flags in mappings.items():
40 |             codespace.set_bitwise(int(cp,16), flags_property, flags)
41 | 
42 |         return SerializationData(source=source, header=header, size=size)
43 | 
44 | class LowercaseConversion(Feature):
45 |     @staticmethod
46 |     def dependencies() -> Set[Type[Feature]]:
47 |         return set([SimpleLowercaseMapping, _CaseConversion])
48 | 
49 |     def process(self, _: zipfile.ZipFile, __: Codespace, ___: Config) -> SerializationData:
50 |         public_header = "#define UNICORN_FEATURE_LOWERCASE_CONVERT\n"
51 |         return SerializationData(public_header=public_header)
52 | 
53 | class UppercaseConversion(Feature):
54 |     @staticmethod
55 |     def dependencies() -> Set[Type[Feature]]:
56 |         return set([SimpleUppercaseMapping, _CaseConversion])
57 | 
58 |     def process(self, _: zipfile.ZipFile, __: Codespace, ___: Config) -> SerializationData:
59 |         public_header = "#define UNICORN_FEATURE_UPPERCASE_CONVERT\n"
60 |         return SerializationData(public_header=public_header)
61 | 
62 | class TitlecaseConversion(Feature):
63 |     @staticmethod
64 |     def dependencies() -> Set[Type[Feature]]:
65 |         return set([SimpleTitlecaseMapping, WordBreak, CanonicalCombiningClass, LowercaseConversion])
66 | 
67 |     def process(self, _: zipfile.ZipFile, __: Codespace, ___: Config) -> SerializationData:
68 |         public_header = "#define UNICORN_FEATURE_TITLECASE_CONVERT\n"
69 |         return SerializationData(public_header=public_header)
70 | 


--------------------------------------------------------------------------------
/man/uni_convert.3:
--------------------------------------------------------------------------------
 1 | .TH "UNICORN" "3" "Dec 6th 2025" "Unicorn 1.3.1"
 2 | .SH NAME
 3 | uni_convert \- convert encoding forms
 4 | .SH LIBRARY
 5 | Embeddable Unicode Algorithms (libunicorn, -lunicorn)
 6 | .SH SYNOPSIS
 7 | .nf
 8 | .B #include <unicorn.h>
 9 | .PP
10 | .BI "unistat uni_convert(const void *" src ", unisize " src_len ", uniattr " src_attr ", void *" dst ", unisize *" dst_len ", uniattr " dst_attr ");"
11 | .fi
12 | .SH DESCRIPTION
13 | This function converts the text pointed to by \f[I]src\f[R], which is in the encoding form specified by \f[I]src_attr\f[R], to the encoding specified by \f[I]dst_attr\f[R] and writes the result to \f[I]dst\f[R].
14 | If \f[I]src_len\f[R] is -1, then \f[I]src\f[R] is assumed to be null-terminated.
15 | .PP
16 | If \f[I]dst\f[R] isn't large enough to contain the re-encoded \f[I]src\f[R] text, then it will be truncated to the nearest code point boundary and \f[B]UNI_NO_SPACE\f[R] will be returned.
17 | If \f[I]dst\f[R] is large enough, then \f[B]UNI_OK\f[R] is returned.
18 | .PP
19 | If \f[I]dst\f[R] is null and \f[I]dst_len\f[R] is zero, then the implementation writes to \f[I]dst_len\f[R] the number of code units in the fully re-encoded text and returns \f[B]UNI_OK\f[R].
20 | Call the function this way to first compute the total length of the destination buffer before calling it again with a sufficiently sized buffer.
21 | .PP
22 | If \f[I]src_attr\f[R] and \f[I]dst_attr\f[R] are identical, then this function behaves like \f[B]memcpy\f[R](3).
23 | .SH RETURN VALUE
24 | .TP
25 | UNI_OK
26 | On success.
27 | .TP
28 | UNI_BAD_OPERATION
29 | If \f[I]src\f[R] or \f[I]dst_len\f[R] is null.
30 | .TP
31 | UNI_BAD_ENCODING
32 | If \f[I]src\f[R] is not well-formed (checks are omitted if \f[I]src_attr\f[R] has \f[B]UNI_TRUST\f[R](3)).
33 | .TP
34 | UNI_NO_SPACE
35 | If \f[I]dst\f[R] is too small.
36 | .TP
37 | UNI_FEATURE_DISABLED
38 | If the encoding form specified by \f[I]src_attr\f[R] or \f[I]dst_attr\f[R] is disabled.
39 | .SH EXAMPLES
40 | This example re-encodes text from UTF-32 to UTF-8.
41 | It uses \f[C]char32_t\f[R] and 'U' literal strings from C11.
42 | .PP
43 | .in +4n
44 | .EX
45 | #include <unicorn.h>
46 | #include <uchar.h>
47 | #include <stdio.h>
48 | 
49 | int main(void)
50 | {
51 |     const char32_t in[] = U"👨🏻‍🚀🧑🏼‍🚀 landed on the 🌕 in 1969.";
52 |     char out[64];
53 |     unisize outlen = sizeof(out);
54 | 
55 |     if (uni_convert(in, -1, UNI_UTF32, out, &outlen, UNI_UTF8) != UNI_OK)
56 |     {
57 |         // something went wrong
58 |         return 1;
59 |     }
60 | 
61 |     printf("%.*s", outlen, out);
62 |     return 0;
63 | }
64 | .EE
65 | .in
66 | .SH SEE ALSO
67 | .BR unistat (3),
68 | .BR UNI_TRUST (3),
69 | .BR unisize (3),
70 | .BR uniattr (3)
71 | .SH AUTHOR
72 | .UR https://railgunlabs.com
73 | Railgun Labs
74 | .UE .
75 | .SH INTERNET RESOURCES
76 | The online documentation is published on the
77 | .UR https://railgunlabs.com/unicorn
78 | Railgun Labs website
79 | .UE .
80 | .SH LICENSING
81 | Unicorn is distributed with its end-user license agreement (EULA).
82 | Please review the agreement for information on terms & conditions for accessing or otherwise using Unicorn and for a DISCLAIMER OF ALL WARRANTIES.
83 | 


--------------------------------------------------------------------------------
/man/uni_encode.3:
--------------------------------------------------------------------------------
  1 | .TH "UNICORN" "3" "Dec 6th 2025" "Unicorn 1.3.1"
  2 | .SH NAME
  3 | uni_encode \- encode a scalar value
  4 | .SH LIBRARY
  5 | Embeddable Unicode Algorithms (libunicorn, -lunicorn)
  6 | .SH SYNOPSIS
  7 | .nf
  8 | .B #include <unicorn.h>
  9 | .PP
 10 | .BI "unistat uni_encode(unichar " cp ", void *" dst ", unisize *" dst_len ", uniattr " dst_attr ");"
 11 | .fi
 12 | .SH DESCRIPTION
 13 | This function encodes the Unicode scalar value \f[I]cp\f[R] into the encoding form \f[I]dst_attr\f[R] and writes the resulting code units to \f[I]dst\f[R].
 14 | The code unit capacity of \f[I]dst\f[R] is specified by \f[I]dst_len\f[R].
 15 | The implementation will write the number of code units needed to encode \f[I]cp\f[R] to \f[I]dst_len\f[R] on output.
 16 | .PP
 17 | If the capacity of \f[I]dst\f[R] is insufficient, then \f[B]UNI_NO_SPACE\f[R] is returned.
 18 | .PP
 19 | To ensure \f[I]dst\f[R] is sufficiently sized, the following table lists the maximum number of code units needed to encode any character in each encoding form.
 20 | .PP
 21 | .TS
 22 | allbox tab(|);
 23 | ll.
 24 | \fBEncoding Form\fR|\fBLongest Code Unit Sequence\fR
 25 | T{
 26 | UTF-8
 27 | T}|T{
 28 | 4
 29 | T}
 30 | T{
 31 | UTF-16
 32 | T}|T{
 33 | 2
 34 | T}
 35 | T{
 36 | UTF-32
 37 | T}|T{
 38 | 1
 39 | T}
 40 | .TE
 41 | .PP
 42 | If \f[I]dst\f[R] is null and \f[I]dst_len\f[R] is zero, then the number of code units needed will be computed by the implementation and written to \f[I]dst_len\f[R] and \f[B]UNI_OK\f[R] is returned.
 43 | .SH RETURN VALUE
 44 | .TP
 45 | UNI_OK
 46 | If the character was encoded successfully.
 47 | .TP
 48 | UNI_BAD_OPERATION
 49 | If \f[I]cp\f[R] is not a Unicode scalar value, if \f[I]dst_len\f[R] is null, or if \f[I]dst\f[R] is null and \f[I]dst_len\f[R] is non-zero.
 50 | .TP
 51 | UNI_NO_SPACE
 52 | If \f[I]dst\f[R] is too small.
 53 | .TP
 54 | UNI_FEATURE_DISABLED
 55 | If Unicorn was built without support for the encoding form specified by \f[I]dst_attr\f[R].
 56 | .SH EXAMPLES
 57 | This example encodes the Unicode scalar value for GRINNING FACE (U+1F600) as UTF-8 encoded code units.
 58 | The code units are printed to stdout as hexadecimal numbers.
 59 | .PP
 60 | .in +4n
 61 | .EX
 62 | #include <unicorn.h>
 63 | #include <stdio.h>
 64 | 
 65 | int main(void)
 66 | {
 67 |     unichar cp = 0x1F600;
 68 | 
 69 |     uint8_t u8[4];
 70 |     unisize u8_len = 4;
 71 | 
 72 |     if (uni_encode(cp, u8, &u8_len, UNI_UTF8) != UNI_OK)
 73 |     {
 74 |         puts("failed to encode character");
 75 |     }
 76 | 
 77 |     for (unisize i = 0; i < u8_len; i++)
 78 |     {
 79 |         printf("0x%02X ", u8[i]);
 80 |     }
 81 |     return 0;
 82 | }
 83 | .EE
 84 | .in
 85 | .SH SEE ALSO
 86 | .BR unistat (3),
 87 | .BR unichar (3),
 88 | .BR unisize (3),
 89 | .BR uniattr (3)
 90 | .SH AUTHOR
 91 | .UR https://railgunlabs.com
 92 | Railgun Labs
 93 | .UE .
 94 | .SH INTERNET RESOURCES
 95 | The online documentation is published on the
 96 | .UR https://railgunlabs.com/unicorn
 97 | Railgun Labs website
 98 | .UE .
 99 | .SH LICENSING
100 | Unicorn is distributed with its end-user license agreement (EULA).
101 | Please review the agreement for information on terms & conditions for accessing or otherwise using Unicorn and for a DISCLAIMER OF ALL WARRANTIES.
102 | 


--------------------------------------------------------------------------------
/src/common.h:
--------------------------------------------------------------------------------
  1 | /*
  2 |  *  Unicorn - Embeddable Unicode Algorithms
  3 |  *  Copyright (c) 2024-2025 Railgun Labs
  4 |  *
  5 |  *  This software is dual-licensed: you can redistribute it and/or modify
  6 |  *  it under the terms of the GNU General Public License version 3 as
  7 |  *  published by the Free Software Foundation. For the terms of this
  8 |  *  license, see <https://www.gnu.org/licenses/>.
  9 |  *
 10 |  *  Alternatively, you can license this software under a proprietary
 11 |  *  license, as set out in <https://railgunlabs.com/unicorn/license/>.
 12 |  */
 13 | 
 14 | #ifndef COMMON_H
 15 | #define COMMON_H
 16 | 
 17 | #include "unicorn.h"
 18 | #include <string.h>
 19 | #include <assert.h>
 20 | 
 21 | #define GET_ENCODING(FLAGS) ((FLAGS) & (UNI_SCALAR | UNI_UTF8 | UNI_UTF16 | UNI_UTF32))
 22 | 
 23 | #define UNICHAR_C(X) ((unichar)(X))
 24 | #define UNISIZE_C(X) ((unisize)(X))
 25 | #define UNIHASH_C(X) ((unihash)(X))
 26 | 
 27 | #define UNICORN_LARGEST_CODE_POINT UNICHAR_C(0x10FFFFL)
 28 | #define FIRST_ILLEGAL_CODE_POINT UNICHAR_C(0x110000L)
 29 | 
 30 | #define COUNT_OF(array) (sizeof(array) / sizeof((array)[0]))
 31 | 
 32 | typedef uint8_t UChar8;
 33 | typedef uint16_t UChar16;
 34 | typedef uint32_t UChar32;
 35 | 
 36 | typedef uint32_t unihash;
 37 | 
 38 | typedef uint16_t(*ByteSwap16)(uint16_t val);
 39 | typedef uint32_t(*ByteSwap32)(uint32_t val);
 40 | 
 41 | struct unitext
 42 | {
 43 |     const void *data;
 44 |     unisize index;
 45 |     unisize length;
 46 |     uniattr encoding;
 47 | };
 48 | 
 49 | void *uni_malloc(size_t size);
 50 | void *uni_realloc(void *old_ptr, size_t old_size, size_t new_size);
 51 | void uni_free(void *ptr, size_t size);
 52 | 
 53 | unisize unichar_to_u8(unichar codepoint, UChar8 bytes[4]);
 54 | unisize unichar_to_u16(unichar codepoint, UChar16 words[2], ByteSwap16 swap); // cppcheck-suppress premium-misra-c-2012-17.3 ; This is a false positive.
 55 | 
 56 | unisize uni_prev_UTF8_seqlen(const UChar8 *start, unisize offset);
 57 | 
 58 | unistat uni_check_input_encoding(const void *text, unisize length, uniattr *encoding);
 59 | unistat uni_check_output_encoding(const void *buffer, const unisize *capacity, uniattr *encoding);
 60 | 
 61 | void uni_message(const char *msg);
 62 | 
 63 | static inline bool is_low_surrogate(unichar c)
 64 | {
 65 |     // Reports whether c lies in the low surrogate range, U+DC00 through
 66 |     // U+DFFF inclusive. The answer starts out false and is only switched
 67 |     // to true after both bounds have been checked. The bounds are tested
 68 |     // in two nested conditions rather than one combined expression, and
 69 |     // the function returns from a single place.
 70 |     bool in_range = false;
 71 |     if (c >= UNICHAR_C(0xDC00))
 72 |     {
 73 |         if (c <= UNICHAR_C(0xDFFF))
 74 |         {
 75 |             in_range = true;
 76 |         }
 77 |     }
 78 |     return in_range;
 79 | }
 80 | 
 81 | static inline bool is_high_surrogate(unichar c)
 82 | {
 83 |     // Reports whether c lies in the high surrogate range, U+D800 through
 84 |     // U+DBFF inclusive. The answer starts out false and is only switched
 85 |     // to true after both bounds have been checked. The bounds are tested
 86 |     // in two nested conditions rather than one combined expression, and
 87 |     // the function returns from a single place.
 88 |     bool in_range = false;
 89 |     if (c >= UNICHAR_C(0xD800))
 90 |     {
 91 |         if (c <= UNICHAR_C(0xDBFF))
 92 |         {
 93 |             in_range = true;
 94 |         }
 95 |     }
 96 |     return in_range;
 97 | }
 98 | 
 99 | #if defined(_MSC_VER)
100 | #define UNREACHABLE __assume(false)
101 | #elif defined(__GNUC__) || defined(__clang__)
102 | #define UNREACHABLE __builtin_unreachable()
103 | #else
104 | #define UNREACHABLE assert(0 && "unreachable code") // LCOV_EXCL_BR_LINE
105 | #endif
106 | 
107 | #endif // COMMON_H
108 | 


--------------------------------------------------------------------------------
/man/uni_casefold.3:
--------------------------------------------------------------------------------
 1 | .TH "UNICORN" "3" "Dec 6th 2025" "Unicorn 1.3.1"
 2 | .SH NAME
 3 | uni_casefold \- case fold a string
 4 | .SH LIBRARY
 5 | Embeddable Unicode Algorithms (libunicorn, -lunicorn)
 6 | .SH SYNOPSIS
 7 | .nf
 8 | .B #include <unicorn.h>
 9 | .PP
10 | .BI "unistat uni_casefold(unicasefold " casing ", const void *" src ", unisize " src_len ", uniattr " src_attr ", void *" dst ", unisize *" dst_len ", uniattr " dst_attr ");"
11 | .fi
12 | .SH DESCRIPTION
13 | This function case folds \f[I]src\f[R] to casing form \f[I]casing\f[R] and writes the result to \f[I]dst\f[R].
14 | The capacity of \f[I]dst\f[R] is specified by \f[I]dst_len\f[R].
15 | .PP
16 | If \f[I]dst\f[R] is not null, then the implementation writes to \f[I]dst_len\f[R] the total number of code units written to \f[I]dst\f[R].
17 | If the capacity of \f[I]dst\f[R] is insufficient, then \f[B]UNI_NO_SPACE\f[R] is returned.
18 | .PP
19 | If \f[I]dst\f[R] is null and \f[I]dst_len\f[R] is zero, then the implementation writes to \f[I]dst_len\f[R] the number of code units in the fully case folded text and returns \f[B]UNI_OK\f[R].
20 | Call the function this way to first compute the total length of the destination buffer before calling it again with a sufficiently sized buffer.
21 | .SH RETURN VALUE
22 | .TP
23 | UNI_OK
24 | If \f[I]src\f[R] was case folded successfully.
25 | .TP
26 | UNI_BAD_OPERATION
27 | If \f[I]src\f[R] is null, \f[I]src_len\f[R] is negative, or \f[I]dst_len\f[R] is null.
28 | .TP
29 | UNI_BAD_ENCODING
30 | If \f[I]src\f[R] is not well-formed (checks are omitted if \f[I]src_attr\f[R] has \f[B]UNI_TRUST\f[R](3)).
31 | .TP
32 | UNI_NO_SPACE
33 | If \f[I]dst\f[R] was not large enough.
34 | .TP
35 | UNI_FEATURE_DISABLED
36 | If the library was built without support for case folding.
37 | .TP
38 | UNI_NO_MEMORY
39 | If dynamic memory allocation failed.
40 | .SH EXAMPLES
41 | This example casefolds a string for canonical caseless comparison.
42 | .PP
43 | The example will produce an output string that is longer than the input string.
44 | This is because the German 'ß' (U+00DF) case folds to 'ss' (U+0073 U+0073) and the Latin small letter 'ö' with diaeresis (U+00F6) canonically normalizes to an 'o' (U+006F) followed by a combining diaeresis character (U+0308).
45 | .PP
46 | .in +4n
47 | .EX
48 | #include <unicorn.h>
49 | #include <stdio.h>
50 | 
51 | int main(void)
52 | {
53 |     const char *in = u8"Stößen";
54 |     char out[32];
55 |     unisize outlen = sizeof(out);
56 | 
57 |     if (uni_casefold(UNI_CANONICAL, in, -1, UNI_UTF8, out, &outlen, UNI_UTF8) != UNI_OK)
58 |     {
59 |         // something went wrong
60 |         return 1;
61 |     }
62 | 
63 |     printf("%.*s", outlen, out); // prints "stössen"
64 |     return 0;
65 | }
66 | .EE
67 | .in
68 | .SH SEE ALSO
69 | .BR unistat (3),
70 | .BR UNI_TRUST (3),
71 | .BR unicasefold (3),
72 | .BR unisize (3),
73 | .BR uniattr (3)
74 | .SH AUTHOR
75 | .UR https://railgunlabs.com
76 | Railgun Labs
77 | .UE .
78 | .SH INTERNET RESOURCES
79 | The online documentation is published on the
80 | .UR https://railgunlabs.com/unicorn
81 | Railgun Labs website
82 | .UE .
83 | .SH LICENSING
84 | Unicorn is distributed with its end-user license agreement (EULA).
85 | Please review the agreement for information on terms & conditions for accessing or otherwise using Unicorn and for a DISCLAIMER OF ALL WARRANTIES.
86 | 


--------------------------------------------------------------------------------
/man/uni_sortkeymk.3:
--------------------------------------------------------------------------------
 1 | .TH "UNICORN" "3" "Dec 6th 2025" "Unicorn 1.3.1"
 2 | .SH NAME
 3 | uni_sortkeymk \- make a sort key
 4 | .SH LIBRARY
 5 | Embeddable Unicode Algorithms (libunicorn, -lunicorn)
 6 | .SH SYNOPSIS
 7 | .nf
 8 | .B #include <unicorn.h>
 9 | .PP
10 | .BI "unistat uni_sortkeymk(const void *" text ", unisize " text_len ", uniattr " text_attr ", uniweighting " weighting ", unistrength " strength ", uint16_t *" sortkey ", size_t *" sortkey_cap ");"
11 | .fi
12 | .SH DESCRIPTION
13 | This function constructs a sort key for \f[I]text\f[R] and writes the result to \f[I]sortkey\f[R].
14 | .PP
15 | The number of code units in \f[I]text\f[R] is specified by \f[I]text_len\f[R].
16 | If \f[I]text_len\f[R] is -1, then the implementation assumes \f[I]text\f[R] is null-terminated.
17 | .PP
18 | The capacity of \f[I]sortkey\f[R] is specified by \f[I]sortkey_cap\f[R].
19 | The implementation always writes to \f[I]sortkey_cap\f[R] the total number of weights needed for \f[I]text\f[R].
20 | If the capacity of \f[I]sortkey\f[R] is insufficient, then \f[B]UNI_NO_SPACE\f[R] is returned.
21 | .SH RETURN VALUE
22 | .TP
23 | UNI_OK
24 | If the sort key was successfully constructed.
25 | .TP
26 | UNI_NO_SPACE
27 | If \f[I]sortkey\f[R] lacks capacity for the collation elements.
28 | .TP
29 | UNI_BAD_OPERATION
30 | If \f[I]text\f[R] or \f[I]sortkey_cap\f[R] are NULL.
31 | .TP
32 | UNI_BAD_ENCODING
33 | If \f[I]text\f[R] is not well-formed (checks are omitted if \f[I]text_attr\f[R] has \f[B]UNI_TRUST\f[R](3)).
34 | .TP
35 | UNI_NO_MEMORY
36 | If dynamic memory allocation failed.
37 | .SH EXAMPLES
38 | This example constructs two sort keys and compares them.
39 | Sort keys must be generated with the same settings for their order to make sense.
40 | In this example, the sort keys are constructed with \f[B]UNI_SHIFTED\f[R] weighting and \f[B]UNI_PRIMARY\f[R] strength.
41 | In a real application you would cache the sort keys for future comparisons.
42 | For one-off comparisons, use \f[B]uni_collate\f[R](3).
43 | .PP
44 | .in +4n
45 | .EX
46 | #include <unicorn.h>
47 | #include <stdio.h>
48 | 
49 | int main(void)
50 | {
51 |     const char *s1 = "hello", *s2 = "Hi";
52 |     uint16_t sk1[32] = {0}, sk2[16] = {0};
53 |     size_t sk1_len = 32, sk2_len = 16;
54 | 
55 |     if (uni_sortkeymk(s1, -1, UNI_UTF8, UNI_SHIFTED, UNI_PRIMARY, sk1, &sk1_len) != UNI_OK ||
56 |         uni_sortkeymk(s2, -1, UNI_UTF8, UNI_SHIFTED, UNI_PRIMARY, sk2, &sk2_len) != UNI_OK)
57 |     {
58 |         puts("failed to construct sort keys");
59 |         return 1;
60 |     }
61 | 
62 |     int result;
63 |     if (uni_sortkeycmp(sk1, sk1_len, sk2, sk2_len, &result) != UNI_OK)
64 |     {
65 |         puts("failed to compare sort keys");
66 |         return 1;
67 |     }
68 | 
69 |     printf("%d", result);
70 |     return 0;
71 | }
72 | .EE
73 | .in
74 | .SH SEE ALSO
75 | .BR unistat (3),
76 | .BR uniweighting (3),
77 | .BR unistrength (3),
78 | .BR UNI_TRUST (3),
79 | .BR unisize (3),
80 | .BR uniattr (3)
81 | .SH AUTHOR
82 | .UR https://railgunlabs.com
83 | Railgun Labs
84 | .UE .
85 | .SH INTERNET RESOURCES
86 | The online documentation is published on the
87 | .UR https://railgunlabs.com/unicorn
88 | Railgun Labs website
89 | .UE .
90 | .SH LICENSING
91 | Unicorn is distributed with its end-user license agreement (EULA).
92 | Please review the agreement for information on terms & conditions for accessing or otherwise using Unicorn and for a DISCLAIMER OF ALL WARRANTIES.
93 | 


--------------------------------------------------------------------------------
/man/uni_collate.3:
--------------------------------------------------------------------------------
 1 | .TH "UNICORN" "3" "Dec 6th 2025" "Unicorn 1.3.1"
 2 | .SH NAME
 3 | uni_collate \- compare strings for sorting
 4 | .SH LIBRARY
 5 | Embeddable Unicode Algorithms (libunicorn, -lunicorn)
 6 | .SH SYNOPSIS
 7 | .nf
 8 | .B #include <unicorn.h>
 9 | .PP
10 | .BI "unistat uni_collate(const void *" s1 ", unisize " s1_len ", uniattr " s1_attr ", const void *" s2 ", unisize " s2_len ", uniattr " s2_attr ", uniweighting " weighting ", unistrength " strength ", int32_t *" result ");"
11 | .fi
12 | .SH DESCRIPTION
13 | This function compares \f[I]s1\f[R] and \f[I]s2\f[R] for sorting and writes -1, 0, or 1 to \f[I]result\f[R] depending on if \f[C]s1 < s2\f[R], \f[C]s1 = s2\f[R], or \f[C]s1 > s2\f[R].
14 | .PP
15 | This function is intended for one-off string comparisons.
16 | If a string will be compared multiple times, then it's recommended to construct a sort key once with \f[B]uni_sortkeymk\f[R](3) and perform comparisons with \f[B]uni_sortkeycmp\f[R](3).
17 | .PP
18 | The length and text attributes for \f[I]s1\f[R] are specified by \f[I]s1_len\f[R] and \f[I]s1_attr\f[R].
19 | If \f[I]s1_len\f[R] is -1, then the implementation assumes \f[I]s1\f[R] is null-terminated.
20 | .PP
21 | The length and text attributes for \f[I]s2\f[R] are specified by \f[I]s2_len\f[R] and \f[I]s2_attr\f[R].
22 | If \f[I]s2_len\f[R] is -1, then the implementation assumes \f[I]s2\f[R] is null-terminated.
23 | .SH RETURN VALUE
24 | .TP
25 | UNI_OK
26 | On success.
27 | .TP
28 | UNI_BAD_OPERATION
29 | If \f[I]s1\f[R] or \f[I]s2\f[R] are null, or if \f[I]result\f[R] is null.
30 | .TP
31 | UNI_BAD_ENCODING
32 | If \f[I]s1\f[R] or \f[I]s2\f[R] is not well-formed (checks are omitted if the corresponding \f[B]uniattr\f[R](3) has \f[B]UNI_TRUST\f[R](3)).
33 | .TP
34 | UNI_FEATURE_DISABLED
35 | If Unicorn was built without support for collation.
36 | .TP
37 | UNI_NO_MEMORY
38 | If dynamic memory allocation failed.
39 | .SH EXAMPLES
40 | This example collates two strings.
41 | The function conceptually constructs a sort key for the input strings and then compares them.
42 | Constructing a sort key is expensive.
43 | If a string will be collated multiple times, then it is recommended to construct a sort key once and cache it for future calculations.
44 | See \f[B]uni_sortkeymk\f[R](3) for details.
45 | .PP
46 | .in +4n
47 | .EX
48 | #include <unicorn.h>
49 | #include <stdio.h>
50 | 
51 | int main(void)
52 | {
53 |     const char *s1 = u8"Hello World";
54 |     const char *s2 = u8"こんにちは世界";
55 |     int result;
56 | 
57 |     if (uni_collate(s1, -1, UNI_UTF8,
58 |                     s2, -1, UNI_UTF8,
59 |                     UNI_NON_IGNORABLE, UNI_PRIMARY, &result) != UNI_OK)
60 |     {
61 |         puts("failed to collate strings");
62 |         return 1;
63 |     }
64 | 
65 |     printf("%d", result);
66 |     return 0;
67 | }
68 | .EE
69 | .in
70 | .SH SEE ALSO
71 | .BR uni_sortkeymk (3),
72 | .BR uni_sortkeycmp (3),
73 | .BR uniweighting (3),
74 | .BR unistrength (3),
75 | .BR uniattr (3),
76 | .BR UNI_TRUST (3),
77 | .BR unisize (3)
78 | .SH AUTHOR
79 | .UR https://railgunlabs.com
80 | Railgun Labs
81 | .UE .
82 | .SH INTERNET RESOURCES
83 | The online documentation is published on the
84 | .UR https://railgunlabs.com/unicorn
85 | Railgun Labs website
86 | .UE .
87 | .SH LICENSING
88 | Unicorn is distributed with its end-user license agreement (EULA).
89 | Please review the agreement for information on terms & conditions for accessing or otherwise using Unicorn and for a DISCLAIMER OF ALL WARRANTIES.
90 | 


--------------------------------------------------------------------------------
/src/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | set(SOURCES # Inputs shared by every library target defined in this file.
 2 |     ../include/unicorn.h
 3 |     unicorn.c
 4 |     logger.c
 5 |     memory.c
 6 |     common.h
 7 |     byteswap.h
 8 |     encoding.c
 9 |     segmentation.c
10 |     normalize.c
11 |     normalize.h
12 |     collation.c
13 |     ducet.c
14 |     ducet.h
15 |     compression.c
16 |     properties.c
17 |     caseconv.c
18 |     casefold.c
19 |     charbuf.c
20 |     charbuf.h
21 |     charvec.c
22 |     charvec.h
23 |     ${CMAKE_BINARY_DIR}/unidata.c # Resolved in the build tree; presumably emitted by the unicorn_generate target (confirm).
24 |     ${CMAKE_BINARY_DIR}/unidata.h
25 | )
26 | 
27 | if (UNICORN_BUILD_STATIC)
28 |     # Static archive; UNICORN_STATIC is PUBLIC so it also reaches targets that link against this one.
29 |     add_library(unicorn_static STATIC ${SOURCES})
30 |     target_compile_definitions(unicorn_static PUBLIC UNICORN_STATIC)
31 |     add_dependencies(unicorn_static unicorn_generate)
32 |     # NOTE(review): only _unicorn.h is registered for installation here; confirm ../include/unicorn.h is installed elsewhere.
33 |     set_target_properties(unicorn_static PROPERTIES C_STANDARD 99 PUBLIC_HEADER ${CMAKE_BINARY_DIR}/_unicorn.h)
34 |     install(TARGETS unicorn_static ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} PUBLIC_HEADER DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
35 | endif ()
36 | 
37 | if (UNICORN_BUILD_SHARED OR UNICORN_BUILD_EXAMPLES)
38 |     # Shared library; DLL_EXPORT is PRIVATE, so it is defined only while compiling the library itself.
39 |     add_library(unicorn SHARED ${SOURCES})
40 |     target_compile_definitions(unicorn PRIVATE DLL_EXPORT)
41 |     add_dependencies(unicorn unicorn_generate)
42 |     # NOTE(review): only _unicorn.h is registered for installation here; confirm ../include/unicorn.h is installed elsewhere.
43 |     set_target_properties(unicorn PROPERTIES C_STANDARD 99 PUBLIC_HEADER ${CMAKE_BINARY_DIR}/_unicorn.h)
44 | 
45 |     if (CMAKE_C_COMPILER_ID MATCHES "Clang" OR
46 |         CMAKE_C_COMPILER_ID MATCHES "GNU")
47 |         target_compile_options(unicorn PRIVATE -pedantic)
48 |     endif ()
49 | 
50 |     if (CMAKE_C_COMPILER_ID MATCHES "Clang")
51 |         target_compile_options(unicorn PRIVATE -Wall -Wextra -Wconversion -Wno-missing-field-initializers
52 |             -Wdouble-promotion -Wno-unused-parameter -Wno-unused-function -Wno-sign-conversion)
53 |     endif ()
54 | 
55 |     install(TARGETS unicorn LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} PUBLIC_HEADER DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
56 | endif ()
57 | 
58 | # Define a special build of Unicorn specifically for unit testing.
59 | # This build is either the same as the static library build OR it's
60 | # a special dynamic library build that optionally adds code
61 | # coverage instrumentation.
62 | #
63 | # The static build is necessary on Windows to ensure the tests
64 | # have access to internal/private symbols e.g. those not exported
65 | # with DLL linkage. The coverage build is only ever run on *nix
66 | # systems and shared objects on those platforms don't have those
67 | # linker issues.
68 | if (UNICORN_BUILD_TESTS OR UNICORN_BUILD_COOKBOOK)
69 |     if (NOT UNICORN_CODE_COVERAGE)
70 |         add_library(unicorn_internal STATIC ${SOURCES}) # plain static build, same definition as unicorn_static
71 |         target_compile_definitions(unicorn_internal PUBLIC UNICORN_STATIC)
72 |     else ()
73 |         add_library(unicorn_internal SHARED ${SOURCES}) # instrumented shared build for coverage runs
74 |         target_compile_definitions(unicorn_internal PRIVATE DLL_EXPORT)
75 |         target_compile_options(unicorn_internal PRIVATE -g -fprofile-arcs -ftest-coverage -O0)
76 |         target_link_options(unicorn_internal PRIVATE -fprofile-arcs -ftest-coverage -fbranch-probabilities -O0)
77 |     endif ()
78 |     add_dependencies(unicorn_internal unicorn_generate)
79 |     set_target_properties(unicorn_internal PROPERTIES C_STANDARD 99)
80 | endif ()
81 | 


--------------------------------------------------------------------------------
/man/uni_normcmp.3:
--------------------------------------------------------------------------------
 1 | .TH "UNICORN" "3" "Dec 6th 2025" "Unicorn 1.3.1"
 2 | .SH NAME
 3 | uni_normcmp \- compare strings for canonical equivalence
 4 | .SH LIBRARY
 5 | Embeddable Unicode Algorithms (libunicorn, -lunicorn)
 6 | .SH SYNOPSIS
 7 | .nf
 8 | .B #include <unicorn.h>
 9 | .PP
10 | .BI "unistat uni_normcmp(const void *" s1 ", unisize " s1_len ", uniattr " s1_attr ", const void *" s2 ", unisize " s2_len ", uniattr " s2_attr ", bool *" result ");"
11 | .fi
12 | .SH DESCRIPTION
13 | This function checks if \f[I]s1\f[R] and \f[I]s2\f[R] are canonically equivalent.
14 | That is, it checks if the graphemes of both strings are the same.
15 | The behavior of this function is identical to calling \f[B]uni_norm\f[R](3) with \f[B]UNI_NFD\f[R] on both strings followed by a code point comparison.
16 | .PP
17 | The implementation is optimized for normalizing the strings incrementally while simultaneously comparing them.
18 | This is a more optimal approach when it's unknown whether input strings are normalized or not.
19 | If it's known in advance that the strings are both normalized, then they can be compared directly with \f[B]memcmp\f[R](3) or \f[B]strcmp\f[R](3).
20 | .PP
21 | The implementation strives to be highly performant and avoid dynamic memory allocation when possible.
22 | Typically, memory allocation will only be performed for unnaturally long combining character sequences, like
23 | .UR https://en.wikipedia.org/wiki/Zalgo_text
24 | Zalgo text
25 | .UE .
26 | It's rare for real-world text to trigger memory allocation.
27 | .SH RETURN VALUE
28 | .TP
29 | UNI_OK
30 | If the strings were compared successfully.
31 | .TP
32 | UNI_BAD_OPERATION
33 | If \f[I]s1\f[R] or \f[I]s2\f[R] are null, if \f[I]s1_len\f[R] or \f[I]s2_len\f[R] are negative, or if \f[I]result\f[R] is null.
34 | .TP
35 | UNI_BAD_ENCODING
36 | If \f[I]s1\f[R] or \f[I]s2\f[R] is not well-formed (checks are omitted if the corresponding \f[B]uniattr\f[R](3) has \f[B]UNI_TRUST\f[R](3)).
37 | .TP
38 | UNI_NO_MEMORY
39 | If dynamic memory allocation failed.
40 | .SH EXAMPLES
41 | This example compares two strings for canonical equivalence.
42 | Conceptually, the implementation normalizes both strings, performs the comparison, and reports the result.
43 | This approach is recommended when strings are compared for one-off equality.
44 | If strings are compared repeatedly, then it's recommended to normalize them with \f[B]uni_norm\f[R](3) and cache the result for the comparisons.
45 | .PP
46 | .in +4n
47 | .EX
48 | #include <unicorn.h>
49 | #include <stdio.h>
50 | #include <stdbool.h>
51 | 
52 | int main(void)
53 | {
54 |     const char *s1 = u8"ma\\u0301scara"; // 'a' + U+0301 = á (decomposed)
55 |     const char *s2 = u8"m\\u00E1scara";  //       U+00E1 = á (precomposed)
56 |     bool is_equal;
57 | 
58 |     if (uni_normcmp(s1, -1, UNI_UTF8,
59 |                     s2, -1, UNI_UTF8, &is_equal) != UNI_OK)
60 |     {
61 |         puts("failed to normalize and compare strings");
62 |         return 1;
63 |     }
64 | 
65 |     printf("%s", is_equal ? "equal" : "not equal");
66 |     return 0;
67 | }
68 | .EE
69 | .in
70 | .SH SEE ALSO
71 | .BR uni_norm (3),
72 | .BR uninormform (3),
73 | .BR uniattr (3),
74 | .BR UNI_TRUST (3),
75 | .BR unisize (3)
76 | .SH AUTHOR
77 | .UR https://railgunlabs.com
78 | Railgun Labs
79 | .UE .
80 | .SH INTERNET RESOURCES
81 | The online documentation is published on the
82 | .UR https://railgunlabs.com/unicorn
83 | Railgun Labs website
84 | .UE .
85 | .SH LICENSING
86 | Unicorn is distributed with its end-user license agreement (EULA).
87 | Please review the agreement for information on terms & conditions for accessing or otherwise using Unicorn and for a DISCLAIMER OF ALL WARRANTIES.
88 | 


--------------------------------------------------------------------------------
/man/unibreak.3:
--------------------------------------------------------------------------------
  1 | .TH "UNICORN" "3" "Dec 6th 2025" "Unicorn 1.3.1"
  2 | .SH NAME
  3 | unibreak \- text boundaries
  4 | .SH LIBRARY
  5 | Embeddable Unicode Algorithms (libunicorn, -lunicorn)
  6 | .SH SYNOPSIS
  7 | .nf
  8 | .B #include <unicorn.h>
  9 | .PP
 10 | .B enum unibreak {
 11 | .RS
 12 | .B UNI_GRAPHEME,
 13 | .B UNI_WORD,
 14 | .B UNI_SENTENCE,
 15 | .RE
 16 | .B };
 17 | .fi
 18 | .SH DESCRIPTION
 19 | The elements of this enumeration describe the boundaries that can be detected.
 20 | Conceptually, a break represents the space in between two code points.
 21 | .SH CONSTANTS
 22 | .TP
 23 | .BR UNI_GRAPHEME
 24 | A grapheme or grapheme cluster is a user-perceived character.
 25 | Grapheme clusters are useful for user interfaces and user-facing text manipulation.
 26 | For example, graphemes should be used in the implementation of text selection, arrow key movement, backspacing through text, and so forth.
 27 | When truncating text for presentation, grapheme cluster boundaries should be used to avoid malforming user-perceived characters.
 28 | .IP
 29 | A key feature of Unicode grapheme clusters is that they remain unchanged across all canonically equivalent normalization forms.
 30 | That means the grapheme boundaries remain unchanged whether the text is normalized as NFC or NFD.
 31 | .IP
 32 | Support for grapheme cluster break detection can be enabled in the JSON configuration as shown below:
 33 | .IP
 34 | .in +4n
 35 | .EX
 36 | {
 37 |     "algorithms": {
 38 |         "segmentation": [
 39 |             "grapheme"
 40 |         ]
 41 |     }
 42 | }
 43 | .EE
 44 | .in
 45 | .TP
 46 | .BR UNI_WORD
 47 | Word boundaries are used in a number of different contexts.
 48 | The most familiar ones are selection (double-click mouse selection or "move to next word" control-arrow keys) and "whole word search" for search and replace.
 49 | They are also used in database queries and regular expressions, to determine whether elements are within a certain number of words of one another.
 50 | .IP
 51 | The default algorithm for detecting word boundaries is primarily intended for languages that use white space to delimit words.
 52 | Unfortunately, some scripts, like Thai, Lao, Khmer, and Myanmar, do not use spaces between words.
 53 | Ideographic scripts such as Japanese and Chinese are even more complex.
 54 | It is therefore not possible to provide an algorithm that correctly detects word boundaries across languages.
 55 | These languages require special handling by a more sophisticated word break detection algorithm that understands the rules of the language.
 56 | .IP
 57 | Support for word break detection can be enabled in the JSON configuration file as shown below:
 58 | .IP
 59 | .in +4n
 60 | .EX
 61 | {
 62 |     "algorithms": {
 63 |         "segmentation": [
 64 |             "word"
 65 |         ]
 66 |     }
 67 | }
 68 | .EE
 69 | .in
 70 | .TP
 71 | .BR UNI_SENTENCE
 72 | Sentence boundaries are often used for triple-click or other methods of selecting or iterating through blocks of text that are larger than single words.
 73 | They are also used to determine whether words occur within the same sentence in database queries.
 74 | .IP
 75 | Support for sentence break detection can be enabled in the JSON configuration file as shown below:
 76 | .IP
 77 | .in +4n
 78 | .EX
 79 | {
 80 |     "algorithms": {
 81 |         "segmentation": [
 82 |             "sentence"
 83 |         ]
 84 |     }
 85 | }
 86 | .EE
 87 | .in
 88 | .SH SEE ALSO
 89 | .BR uninormform (3)
 90 | .SH AUTHOR
 91 | .UR https://railgunlabs.com
 92 | Railgun Labs
 93 | .UE .
 94 | .SH INTERNET RESOURCES
 95 | The online documentation is published on the
 96 | .UR https://railgunlabs.com/unicorn
 97 | Railgun Labs website
 98 | .UE .
 99 | .SH LICENSING
100 | Unicorn is distributed with its end-user license agreement (EULA).
101 | Please review the agreement for information on terms & conditions for accessing or otherwise using Unicorn and for a DISCLAIMER OF ALL WARRANTIES.
102 | 


--------------------------------------------------------------------------------
/src/charvec.c:
--------------------------------------------------------------------------------
  1 | /*
  2 |  *  Unicorn - Embeddable Unicode Algorithms
  3 |  *  Copyright (c) 2024-2025 Railgun Labs
  4 |  *
  5 |  *  This software is dual-licensed: you can redistribute it and/or modify
  6 |  *  it under the terms of the GNU General Public License version 3 as
  7 |  *  published by the Free Software Foundation. For the terms of this
  8 |  *  license, see <https://www.gnu.org/licenses/>.
  9 |  *
 10 |  *  Alternatively, you can license this software under a proprietary
 11 |  *  license, as set out in <https://railgunlabs.com/unicorn/license/>.
 12 |  */
 13 | 
 14 | #include "charvec.h"
 15 | 
 16 | // Ensures `buffer` has room for at least `new_capacity` characters. Storage that still
 17 | // points at the embedded scratch array is copied into a uni_malloc block; any other
 18 | // storage is resized with uni_realloc. Returns UNI_OK, or UNI_NO_MEMORY (reported via
 19 | // uni_message) with the vector left unchanged when the allocator returns NULL.
 20 | unistat uni_charvec_reserve(struct CharVec *buffer, unisize new_capacity)
 21 | {
 22 |     unistat status = UNI_OK;
 23 |     // LCOV_EXCL_START
 24 |     assert(buffer != NULL);
 25 |     assert(new_capacity >= 0);
 26 |     // LCOV_EXCL_STOP
 27 |     if (new_capacity > buffer->capacity)
 28 |     {
 29 |         const size_t old_bytes = sizeof(buffer->chars[0]) * (size_t)buffer->capacity;
 30 |         const size_t new_bytes = sizeof(buffer->chars[0]) * (size_t)new_capacity;
 31 |         unichar *grown;
 32 |         if (buffer->chars != buffer->scratch)
 33 |         {
 34 |             grown = uni_realloc(buffer->chars, old_bytes, new_bytes); // cppcheck-suppress misra-c2012-11.5 ; MISRA 2012 Rule 11.5 - Pointer conversion required.
 35 |         }
 36 |         else
 37 |         {
 38 |             grown = uni_malloc(new_bytes); // cppcheck-suppress misra-c2012-11.5 ; MISRA 2012 Rule 11.5 - Pointer conversion required.
 39 |             if (grown != NULL)
 40 |             {
 41 |                 (void)memcpy(grown, buffer->scratch, old_bytes);
 42 |             }
 43 |         }
 44 |         if (grown != NULL)
 45 |         {
 46 |             buffer->chars = grown;
 47 |             buffer->capacity = new_capacity;
 48 |         }
 49 |         else
 50 |         {
 51 |             uni_message("memory allocation failed");
 52 |             status = UNI_NO_MEMORY;
 53 |         }
 54 |     }
 55 |     return status;
 56 | }
 57 | 
 58 | // Appends `chars_count` characters from `chars` after reserving room for them with
 59 | // uni_charvec_reserve. If reserving fails, its status is returned and nothing is copied.
 60 | unistat uni_charvec_append(struct CharVec *buffer, const unichar *chars, unisize chars_count)
 61 | {
 62 |     // LCOV_EXCL_START
 63 |     assert(buffer);
 64 |     assert(chars);
 65 |     // LCOV_EXCL_STOP
 66 |     const unistat status = uni_charvec_reserve(buffer, buffer->length + chars_count);
 67 |     if (status == UNI_OK)
 68 |     {
 69 |         (void)memcpy(buffer->chars + buffer->length, chars, sizeof(buffer->chars[0]) * (size_t)chars_count);
 70 |         buffer->length = buffer->length + chars_count;
 71 |     }
 72 |     return status;
 73 | }
 74 | 
 75 | void uni_charvec_append_unsafe(struct CharVec *cb, const unichar *chars, unisize chars_count) // Appends without growing the storage.
 76 | {
 77 |     assert((cb->length + chars_count) <= cb->capacity);  // LCOV_EXCL_BR_LINE
 78 |     (void)memcpy(&cb->chars[cb->length], chars, sizeof(chars[0]) * (size_t)chars_count); // room is only asserted above, never reserved here
 79 |     cb->length += chars_count;
 80 | }
 81 | 
 82 | void uni_charvec_reset(struct CharVec *buffer) // Empties the vector by zeroing its length.
 83 | {
 84 |     buffer->length = 0; // capacity and the chars pointer are not touched
 85 | }
 86 | 
 87 | // Points the vector at its embedded scratch array, sized to that array, with zero length.
 88 | void uni_charvec_init(struct CharVec *buffer)
 89 | {
 90 |     buffer->chars = buffer->scratch;
 91 |     buffer->capacity = (unisize)(COUNT_OF(buffer->scratch));
 92 |     buffer->length = 0;
 93 | }
 94 | 
 95 | void uni_charvec_free(struct CharVec *buffer) // Releases storage that replaced the embedded scratch array.
 96 | {
 97 |     if (buffer->chars != buffer->scratch) // the scratch array itself is never passed to uni_free
 98 |     {
 99 |         uni_free(buffer->chars, sizeof(buffer->chars[0]) * (size_t)buffer->capacity); // size argument is the current capacity in bytes
100 |     }
101 | }
102 | 
103 | void uni_charvec_remove(struct CharVec *buf, unisize i) // Deletes the element at index i and closes the gap.
104 | {
105 |     assert(buf->length >= 1); // LCOV_EXCL_BR_LINE
106 |     const unisize shift = buf->length - (i + UNISIZE_C(1)); // number of elements located after index i
107 |     assert(shift >= 0); // LCOV_EXCL_BR_LINE: The math should never work out negative.
108 |     (void)memmove(&buf->chars[i], &buf->chars[i + 1], sizeof(buf->chars[0]) * (size_t)shift); // slide the tail down one slot
109 |     buf->length -= 1;
110 | }
111 | 


--------------------------------------------------------------------------------
/scripts/segmentation.py:
--------------------------------------------------------------------------------
  1 | #  Unicorn - Embeddable Unicode Algorithms
  2 | #  Copyright (c) 2024-2025 Railgun Labs
  3 | #
  4 | #  This software is dual-licensed: you can redistribute it and/or modify
  5 | #  it under the terms of the GNU General Public License version 3 as
  6 | #  published by the Free Software Foundation. For the terms of this
  7 | #  license, see <https://www.gnu.org/licenses/>.
  8 | #
  9 | #  Alternatively, you can license this software under a proprietary
 10 | #  license, as set out in <https://railgunlabs.com/unicorn/license/>.
 11 | 
 12 | from typing import Set, Type
 13 | import zipfile
 14 | import json
 15 | 
 16 | from .config import Config
 17 | from .tools import Codespace
 18 | from .feature import Feature
 19 | from .basics import StorageSize, SerializationData
 20 | 
 21 | max_states = 0  # Largest "states" value seen so far; raised by each break feature's process() below.
 22 | 
 23 | class _Segmentation(Feature):
 24 |     def post_process(self, archive: zipfile.ZipFile, __: Codespace, ___: Config) -> SerializationData:
 25 |         gcb = archive.open('segmentation.json')
 26 |         data = json.loads(gcb.read())
 27 |         header = data["header"]
 28 |         gcb.close()
 29 | 
 30 |         header += "#define MAX_BREAK_STATES {0}\n".format(max_states)
 31 | 
 32 |         public_header = "#define UNICORN_FEATURE_SEGMENTATION\n"
 33 |         return SerializationData(header=header, public_header=public_header)
 34 | 
 35 | class GraphemeBreak(Feature):
 36 |     @staticmethod
 37 |     def dependencies() -> Set[Type[Feature]]:
 38 |         return set([_Segmentation])
 39 | 
 40 |     def pre_process(self, codespace: Codespace) -> None:
 41 |         codespace.register_property("gcb", 0, StorageSize.UINT8)
 42 |         codespace.register_property("incb", 0, StorageSize.UINT8)
 43 | 
 44 |     def process(self, archive: zipfile.ZipFile, codespace: Codespace, _: Config) -> SerializationData:
 45 |         gcb = archive.open('gcb.json')
 46 |         data = json.loads(gcb.read())
 47 |         header = data["header"]
 48 |         source = data["source"]
 49 |         size = data["size"]
 50 |         gcb.close()
 51 | 
 52 |         global max_states
 53 |         max_states = max(int(data["states"]), max_states)
 54 | 
 55 |         gcb_property = codespace.get("gcb")
 56 |         for d in data["gcb"]:
 57 |             codespace.set(d[0], gcb_property, d[1])
 58 | 
 59 |         incb_property = codespace.get("incb")
 60 |         for d in data["incb"]:
 61 |             codespace.set(d[0], incb_property, d[1])
 62 | 
 63 |         public_header = "#define UNICORN_FEATURE_GCB\n"
 64 |         return SerializationData(source, header, public_header, size)
 65 | 
 66 | class WordBreak(Feature):
 67 |     @staticmethod
 68 |     def dependencies() -> Set[Type[Feature]]:
 69 |         return set([_Segmentation])
 70 | 
 71 |     def pre_process(self, codespace: Codespace) -> None:
 72 |         codespace.register_property("wb", 0, StorageSize.UINT8)
 73 |         codespace.register_property("wbx", 0, StorageSize.UINT8)
 74 | 
 75 |     def process(self, archive: zipfile.ZipFile, codespace: Codespace, _: Config) -> SerializationData:
 76 |         wb = archive.open('wb.json')
 77 |         data = json.loads(wb.read())
 78 |         header = data["header"]
 79 |         source = data["source"]
 80 |         size = data["size"]
 81 |         wb.close()
 82 | 
 83 |         global max_states
 84 |         max_states = max(int(data["states"]), max_states)
 85 | 
 86 |         wb_property = codespace.get("wb")
 87 |         for d in data["wb"]:
 88 |             codespace.set(d[0], wb_property, d[1])
 89 | 
 90 |         wbx_property = codespace.get("wbx")
 91 |         for d in data["wbx"]:
 92 |             codespace.set(d[0], wbx_property, d[1])
 93 | 
 94 |         public_header = "#define UNICORN_FEATURE_WB\n"
 95 |         return SerializationData(source, header, public_header, size)
 96 | 
 97 | class SentenceBreak(Feature):
 98 |     @staticmethod
 99 |     def dependencies() -> Set[Type[Feature]]:
100 |         return set([_Segmentation])
101 | 
102 |     def pre_process(self, codespace: Codespace) -> None:
103 |         codespace.register_property("sb", 0, StorageSize.UINT8)
104 | 
105 |     def process(self, archive: zipfile.ZipFile, codespace: Codespace, _: Config) -> SerializationData:
106 |         wb = archive.open('sb.json')
107 |         data = json.loads(wb.read())
108 |         header = data["header"]
109 |         source = data["source"]
110 |         size = data["size"]
111 |         wb.close()
112 | 
113 |         global max_states
114 |         max_states = max(int(data["states"]), max_states)
115 | 
116 |         wb_property = codespace.get("sb")
117 |         for d in data["sb"]:
118 |             codespace.set(d[0], wb_property, d[1])
119 | 
120 |         public_header = "#define UNICORN_FEATURE_SB\n"
121 |         return SerializationData(source, header, public_header, size)
122 | 


--------------------------------------------------------------------------------
/src/properties.c:
--------------------------------------------------------------------------------
  1 | /*
  2 |  *  Unicorn - Embeddable Unicode Algorithms
  3 |  *  Copyright (c) 2024-2025 Railgun Labs
  4 |  *
  5 |  *  This software is dual-licensed: you can redistribute it and/or modify
  6 |  *  it under the terms of the GNU General Public License version 3 as
  7 |  *  published by the Free Software Foundation. For the terms of this
  8 |  *  license, see <https://www.gnu.org/licenses/>.
  9 |  *
 10 |  *  Alternatively, you can license this software under a proprietary
 11 |  *  license, as set out in <https://railgunlabs.com/unicorn/license/>.
 12 |  */
 13 | 
 14 | #include "common.h"
 15 | #include "unidata.h"
 16 | 
 17 | UNICORN_API unigc uni_gc(unichar c) // cppcheck-suppress misra-c2012-8.7 ; This is supposed to have external linkage.
 18 | {
 19 | #if !defined(UNICORN_FEATURE_GC)
 20 |     (void)c; // Without general category data every code point is reported as unassigned.
 21 |     return UNI_UNASSIGNED;
 22 | #else
 23 |     return (unigc)unicorn_get_codepoint_data(c)->general_category; // cppcheck-suppress premium-misra-c-2012-10.5 ; Intentional violation to unpack data from 32-bits to an enum.
 24 | #endif
 25 | }
 26 | 
 27 | UNICORN_API uint8_t uni_ccc(unichar c) // cppcheck-suppress misra-c2012-8.7 ; This is supposed to have external linkage.
 28 | {
 29 | #if !defined(UNICORN_FEATURE_CCC)
 30 |     (void)c; // Without combining class data every code point is reported as class zero.
 31 |     return 0;
 32 | #else
 33 |     return unicorn_get_codepoint_data(c)->canonical_combining_class;
 34 | #endif
 35 | }
 36 | 
 37 | UNICORN_API bool uni_is(unichar c, unibp p) // cppcheck-suppress misra-c2012-8.7 ; This is supposed to have external linkage.
 38 | {
 39 | #if defined(UNICORN_FEATURE_BINARY_PROPERTIES)
 40 |     const uint32_t binary_props = (uint32_t)unicorn_get_codepoint_data(c)->binary_properties;
 41 |     const uint32_t prop = (uint32_t)p; // Bit position of the property within 'binary_props'.
 42 |     const uint32_t bit_mask = (prop < (uint32_t)32) ? ((uint32_t)1 << prop) : (uint32_t)0; // Shifting a 32-bit value by 32 or more is undefined, so such a 'p' gets an empty mask.
 43 |     return ((bit_mask != (uint32_t)0) && ((binary_props & bit_mask) == bit_mask)) ? true : false;
 44 | #else
 45 |     (void)c;
 46 |     (void)p;
 47 |     return false;
 48 | #endif
 49 | }
 50 | 
 51 | UNICORN_API const char *uni_numval(unichar c) // cppcheck-suppress misra-c2012-8.7 ; This is supposed to have external linkage.
 52 | {
 53 | #if defined(UNICORN_FEATURE_NUMERIC_VALUE)
 54 |     const uint8_t offset = unicorn_get_codepoint_data(c)->numeric_value_offset;
 55 |     const char *value = NULL; // Stays NULL when the stored offset is zero.
 56 |     if (offset != (uint8_t)0)
 57 |     {
 58 |         assert((size_t)offset < COUNT_OF(unicorn_numeric_values)); // LCOV_EXCL_BR_LINE
 59 |         value = unicorn_numeric_values[offset];
 60 |     }
 61 |     return value;
 62 | #else
 63 |     (void)c;
 64 |     return NULL;
 65 | #endif
 66 | }
 67 | 
 68 | UNICORN_API unichar uni_tolower(unichar c)
 69 | {
 70 | #if defined(UNICORN_FEATURE_SIMPLE_LOWERCASE_MAPPINGS)
 71 |     const uint16_t packed = unicorn_get_character_casing(c)->simple_lowercase_mapping;
 72 |     const uint16_t value = GET_CASED_VALUE(packed);
 73 |     unichar result;
 74 |     if (CASING_IN_TABLE(packed))
 75 |     {
 76 |         assert((size_t)value < COUNT_OF(unicorn_simple_case_mappings)); // LCOV_EXCL_BR_LINE
 77 |         result = unicorn_simple_case_mappings[value]; // The value indexes the mapping table.
 78 |     }
 79 |     else
 80 |     {
 81 |         const int32_t biased = (int32_t)value;
 82 |         const int32_t cp = (int32_t)c - (biased - CASING_DIFF); // The value is an offset from 'c' biased by CASING_DIFF.
 83 |         result = (unichar)cp;
 84 |     }
 85 |     return result;
 86 | #else
 87 |     return c;
 88 | #endif
 89 | }
 90 | 
 91 | UNICORN_API unichar uni_toupper(unichar c)
 92 | {
 93 | #if defined(UNICORN_FEATURE_SIMPLE_UPPERCASE_MAPPINGS)
 94 |     const uint16_t packed = unicorn_get_character_casing(c)->simple_uppercase_mapping;
 95 |     const uint16_t value = GET_CASED_VALUE(packed);
 96 |     unichar result;
 97 |     if (CASING_IN_TABLE(packed))
 98 |     {
 99 |         assert((size_t)value < COUNT_OF(unicorn_simple_case_mappings)); // LCOV_EXCL_BR_LINE
100 |         result = unicorn_simple_case_mappings[value]; // The value indexes the mapping table.
101 |     }
102 |     else
103 |     {
104 |         const int32_t biased = (int32_t)value;
105 |         const int32_t cp = (int32_t)c - (biased - CASING_DIFF); // The value is an offset from 'c' biased by CASING_DIFF.
106 |         result = (unichar)cp;
107 |     }
108 |     return result;
109 | #else
110 |     return c;
111 | #endif
112 | }
113 | 
114 | UNICORN_API unichar uni_totitle(unichar c)
115 | {
116 | #if defined(UNICORN_FEATURE_SIMPLE_TITLECASE_MAPPINGS)
117 |     const uint16_t packed = unicorn_get_character_casing(c)->simple_titlecase_mapping;
118 |     const uint16_t value = GET_CASED_VALUE(packed);
119 |     unichar result;
120 |     if (CASING_IN_TABLE(packed))
121 |     {
122 |         assert((size_t)value < COUNT_OF(unicorn_simple_case_mappings)); // LCOV_EXCL_BR_LINE
123 |         result = unicorn_simple_case_mappings[value]; // The value indexes the mapping table.
124 |     }
125 |     else
126 |     {
127 |         const int32_t biased = (int32_t)value;
128 |         const int32_t cp = (int32_t)c - (biased - CASING_DIFF); // The value is an offset from 'c' biased by CASING_DIFF.
129 |         result = (unichar)cp;
130 |     }
131 |     return result;
132 | #else
133 |     return c;
134 | #endif
135 | }
136 | 


--------------------------------------------------------------------------------
/examples/character_properties.c:
--------------------------------------------------------------------------------
  1 | /*
  2 |  *  This file is public domain.
  3 |  *  You may use, modify, and distribute it without restriction.
  4 |  */
  5 | 
  6 | // Examples are also documented online at:
  7 | // <https://railgunlabs.com/unicorn/manual/code-examples/>.
  8 | 
  9 | #include <unicorn.h>
 10 | #include <stdio.h>
 11 | 
 12 | static const char *const general_categories[] = { // Printable names, indexed by the general category value returned from uni_gc().
 13 |     [UNI_UPPERCASE_LETTER] = "Uppercase_Letter",
 14 |     [UNI_LOWERCASE_LETTER] = "Lowercase_Letter",
 15 |     [UNI_TITLECASE_LETTER] = "Titlecase_Letter",
 16 |     [UNI_MODIFIER_LETTER] = "Modifier_Letter",
 17 |     [UNI_OTHER_LETTER] = "Other_Letter",
 18 |     [UNI_NONSPACING_MARK] = "Nonspacing_Mark",
 19 |     [UNI_SPACING_MARK] = "Spacing_Mark",
 20 |     [UNI_ENCLOSING_MARK] = "Enclosing_Mark",
 21 |     [UNI_DECIMAL_NUMBER] = "Decimal_Number",
 22 |     [UNI_LETTER_NUMBER] = "Letter_Number",
 23 |     [UNI_OTHER_NUMBER] = "Other_Number",
 24 |     [UNI_CONNECTOR_PUNCTUATION] = "Connector_Punctuation",
 25 |     [UNI_DASH_PUNCTUATION] = "Dash_Punctuation",
 26 |     [UNI_OPEN_PUNCTUATION] = "Open_Punctuation",
 27 |     [UNI_CLOSE_PUNCTUATION] = "Close_Punctuation",
 28 |     [UNI_INITIAL_PUNCTUATION] = "Initial_Punctuation",
 29 |     [UNI_FINAL_PUNCTUATION] = "Final_Punctuation",
 30 |     [UNI_OTHER_PUNCTUATION] = "Other_Punctuation",
 31 |     [UNI_MATH_SYMBOL] = "Math_Symbol",
 32 |     [UNI_CURRENCY_SYMBOL] = "Currency_Symbol",
 33 |     [UNI_MODIFIER_SYMBOL] = "Modifier_Symbol",
 34 |     [UNI_OTHER_SYMBOL] = "Other_Symbol",
 35 |     [UNI_SPACE_SEPARATOR] = "Space_Separator",
 36 |     [UNI_LINE_SEPARATOR] = "Line_Separator",
 37 |     [UNI_PARAGRAPH_SEPARATOR] = "Paragraph_Separator",
 38 |     [UNI_CONTROL] = "Control",
 39 |     [UNI_FORMAT] = "Format",
 40 |     [UNI_SURROGATE] = "Surrogate",
 41 |     [UNI_PRIVATE_USE] = "Private_Use",
 42 |     [UNI_UNASSIGNED] = "Unassigned",
 43 | };
 44 | 
 45 | static const char *const binary_properties[] = { // Printable names, indexed by the binary property value passed to uni_is().
 46 |     [UNI_NONCHARACTER_CODE_POINT] = "Noncharacter_Code_Point",
 47 |     [UNI_ALPHABETIC] = "Alphabetic",
 48 |     [UNI_LOWERCASE] = "Lowercase",
 49 |     [UNI_UPPERCASE] = "Uppercase",
 50 |     [UNI_HEX_DIGIT] = "Hex_Digit",
 51 |     [UNI_WHITE_SPACE] = "White_Space",
 52 |     [UNI_MATH] = "Math",
 53 |     [UNI_DASH] = "Dash",
 54 |     [UNI_DIACRITIC] = "Diacritic",
 55 |     [UNI_EXTENDER] = "Extender",
 56 |     [UNI_IDEOGRAPHIC] = "Ideographic",
 57 |     [UNI_QUOTATION_MARK] = "Quotation_Mark",
 58 |     [UNI_UNIFIED_IDEOGRAPH] = "Unified_Ideograph",
 59 |     [UNI_TERMINAL_PUNCTUATION] = "Terminal_Punctuation",
 60 | };
 61 | 
 62 | int main(int argc, char *argv[])
 63 | {
 64 |     // This code example prints character properties associated with
 65 |     // a Unicode code point. The definition of each character property
 66 |     // is documented here:
 67 |     // <https://railgunlabs.com/unicorn/manual/api/character-properties/>.
 68 | 
 69 |     // The following variable defines the Unicode character whose
 70 |     // properties will be printed. In this example, it's Unicode
 71 |     // character U+0300. Change this value to a different character
 72 |     // and notice how the printed properties change.
 73 |     const unichar cp = 0x300;
 74 | 
 75 |     // The command-line arguments are not used by this example.
 76 |     (void)argc;
 77 |     (void)argv;
 78 | 
 79 |     printf("Code point: U+%04X\n", (unsigned int)cp);
 80 | 
 81 |     // Print the General_Category property value.
 82 |     printf("General category: %s\n", general_categories[uni_gc(cp)]);
 83 | 
 84 |     // Print the Canonical_Combining_Class property value (if non-zero).
 85 |     const uint8_t ccc = uni_ccc(cp);
 86 |     if (ccc != 0)
 87 |     {
 88 |         printf("Canonical combining class: %d\n", ccc);
 89 |     }
 90 | 
 91 |     // Print each binary property assigned to the code point.
 92 |     for (size_t i = 0; i < sizeof(binary_properties) / sizeof(binary_properties[0]); i++)
 93 |     {
 94 |         if ((binary_properties[i] != NULL) && uni_is(cp, (unibp)i)) // Skip table slots that have no name.
 95 |         {
 96 |             printf("Binary property: %s\n", binary_properties[i]);
 97 |         }
 98 |     }
 99 | 
100 |     // Print the Numeric_Value property value (if present).
101 |     const char *const numeric_value = uni_numval(cp);
102 |     if (numeric_value != NULL)
103 |     {
104 |         printf("Numeric value: %s\n", numeric_value);
105 |     }
106 | 
107 |     // Print the Simple_Lowercase_Mapping (if it differs from the code point).
108 |     unichar tmp_cp = uni_tolower(cp);
109 |     if (tmp_cp != cp)
110 |     {
111 |         printf("Simple lower case mapping: U+%04X\n", (unsigned int)tmp_cp);
112 |     }
113 | 
114 |     // Print the Simple_Uppercase_Mapping (if it differs from the code point).
115 |     tmp_cp = uni_toupper(cp);
116 |     if (tmp_cp != cp)
117 |     {
118 |         printf("Simple upper case mapping: U+%04X\n", (unsigned int)tmp_cp);
119 |     }
120 | 
121 |     // Print the Simple_Titlecase_Mapping (if it differs from the code point).
122 |     tmp_cp = uni_totitle(cp);
123 |     if (tmp_cp != cp)
124 |     {
125 |         printf("Simple title case mapping: U+%04X\n", (unsigned int)tmp_cp);
126 |     }
127 | 
128 |     return 0;
129 | }
130 | 


--------------------------------------------------------------------------------
/src/unicorn.c:
--------------------------------------------------------------------------------
  1 | /*
  2 |  *  Unicorn - Embeddable Unicode Algorithms
  3 |  *  Copyright (c) 2021-2025 Railgun Labs
  4 |  *  
  5 |  *  THE CONTENTS OF THIS PROJECT ARE PROPRIETARY AND CONFIDENTIAL. UNAUTHORIZED
  6 |  *  COPYING, TRANSFERRING OR REPRODUCTION OF THE CONTENTS OF THIS PROJECT, VIA
  7 |  *  ANY MEDIUM IS STRICTLY PROHIBITED.
  8 |  *  
  9 |  *  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 10 |  *  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 11 |  *  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 12 |  *  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 13 |  *  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 14 |  *  OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 15 |  *  SOFTWARE.
 16 |  */
 17 | 
 18 | #include "charbuf.h"
 19 | #include "unidata.h"
 20 | 
 21 | #if defined(_MSC_VER)
 22 | #include <intrin.h>
 23 | #endif
 24 | 
 25 | static inline int32_t uni_popcnt(uint32_t n)
 26 | {
 27 |     // Counts the set bits in 'n', using a compiler intrinsic when one is available.
 28 | #if defined(_MSC_VER)
 29 |     return (int32_t)__popcnt(n); // cppcheck-suppress premium-misra-c-2012-17.3
 30 | #elif defined(__GNUC__) || defined(__clang__)
 31 |     return (int32_t)__builtin_popcount(n); // cppcheck-suppress premium-misra-c-2012-17.3
 32 | #else
 33 |     int32_t count = 0;
 34 |     uint32_t bits = n;
 35 |     while (bits != 0u)
 36 |     {
 37 |         count += (int32_t)(bits & 1u); // Add the lowest bit, then shift it out.
 38 |         bits >>= 1u;
 39 |     }
 40 |     return count;
 41 | #endif
 42 | }
 43 | 
 44 | static unistat check_encoding(uniattr *encoding)
 45 | {
 46 |     // Validates the byte order and encoding form flags in '*encoding'. When no byte
 47 |     // order is given for UTF-16 or UTF-32, the native byte order flag is added.
 48 |     const uniattr attrs = *encoding;
 49 |     unistat status = UNI_OK;
 50 | 
 51 |     // Count how many of the two byte order flags were provided.
 52 |     const uniattr endian_bits = attrs & (UNI_LITTLE | UNI_BIG);
 53 |     const int32_t endian_count = uni_popcnt((uint32_t)endian_bits);
 54 |     if (endian_count == 0)
 55 |     {
 56 |         // No byte order was provided; UTF-16 and UTF-32 default to the native one.
 57 |         if ((GET_ENCODING(attrs) & (UNI_UTF16 | UNI_UTF32)) != (uniattr)0)
 58 |         {
 59 | #if defined(UNICORN_BIG_ENDIAN)
 60 |             (*encoding) |= UNI_BIG;
 61 | #else
 62 |             (*encoding) |= UNI_LITTLE;
 63 | #endif
 64 |         }
 65 |     }
 66 |     else if (endian_count != 1)
 67 |     {
 68 |         // More than one byte order flag was provided.
 69 |         uni_message("too many endian flags (must specify exactly one)");
 70 |         status = UNI_BAD_OPERATION;
 71 |     }
 72 |     else
 73 |     {
 74 |         // Exactly one byte order flag was provided; nothing to adjust.
 75 |     }
 76 | 
 77 |     // The encoding form is only checked when the byte order flags were acceptable.
 78 |     if ((status == UNI_OK) && (uni_popcnt((uint32_t)GET_ENCODING(attrs)) != 1))
 79 |     {
 80 |         // Exactly one encoding form flag is required.
 81 |         uni_message("expected exactly one encoding form");
 82 |         status = UNI_BAD_OPERATION;
 83 |     }
 84 | 
 85 |     return status;
 86 | }
 87 | 
 88 | unistat uni_check_input_encoding(const void *text, unisize length, uniattr *encoding)
 89 | {
 90 |     // Validates the attributes of an input buffer; 'length' is not inspected.
 91 |     unistat status = UNI_BAD_OPERATION; // Replaced by the encoding check when the guards pass.
 92 |     (void)length;
 93 | 
 94 |     if (((*encoding) & UNI_NULIFY) == UNI_NULIFY)
 95 |     {
 96 |         // Only destination buffers can use the null terminated flag.
 97 |         uni_message("input buffer incompatible with 'UNI_NULIFY' flag");
 98 |     }
 99 |     else if (text == NULL)
100 |     {
101 |         uni_message("input buffer is null");
102 |     }
103 |     else
104 |     {
105 |         // The buffer arguments are usable; continue with the encoding flags.
106 |         status = check_encoding(encoding);
107 |     }
108 | 
109 |     return status;
110 | }
111 | 
112 | unistat uni_check_output_encoding(const void *buffer, const unisize *capacity, uniattr *encoding)
113 | {
114 |     // Validates the attributes, the capacity, and the pointer of an output
115 |     // buffer before handing the encoding flags to check_encoding().
116 |     unistat status = UNI_BAD_OPERATION; // Replaced by the encoding check when the guards pass.
117 | 
118 |     if (((*encoding) & UNI_TRUST) == UNI_TRUST)
119 |     {
120 |         // Only input buffers can use the trust flag.
121 |         uni_message("output buffer incompatible with 'UNI_TRUST' flag");
122 |     }
123 |     else if (capacity == NULL)
124 |     {
125 |         uni_message("output buffer capacity is null");
126 |     }
127 |     else if (*capacity < 0)
128 |     {
129 |         uni_message("output buffer capacity is negative");
130 |     }
131 |     else if ((*capacity > 0) && (buffer == NULL))
132 |     {
133 |         // A null buffer is only acceptable together with a zero capacity.
134 |         uni_message("output buffer cannot be null if its capacity is non-zero");
135 |     }
136 |     else
137 |     {
138 |         // The buffer arguments are usable; continue with the encoding flags.
139 |         status = check_encoding(encoding);
140 |     }
141 | 
142 |     return status;
143 | }
144 | 
145 | UNICORN_API void uni_getversion(int32_t *major, int32_t *minor, int32_t *patch) // cppcheck-suppress misra-c2012-8.7 ; This is supposed to have external linkage.
146 | {
147 |     // Each output pointer is optional; a NULL pointer is simply skipped.
148 |     const int32_t version[3] = { 1, 0, 0 }; // NOTE(review): the CMake project declares version 1.3.1 - confirm which value is current.
149 |     if (major != NULL)
150 |     {
151 |         *major = version[0];
152 |     }
153 |     if (minor != NULL)
154 |     {
155 |         *minor = version[1];
156 |     }
157 |     if (patch != NULL)
158 |     {
159 |         *patch = version[2];
160 |     }
161 | }
162 | 
163 | UNICORN_API void uni_getucdversion(int32_t *major, int32_t *minor, int32_t *patch) // cppcheck-suppress misra-c2012-8.7 ; This is supposed to have external linkage.
164 | {
165 |     // Reports the UNICODE_VERSION_* values; each output pointer is optional and skipped when NULL.
166 |     const int32_t version[3] = { UNICODE_VERSION_MAJOR, UNICODE_VERSION_MINOR, UNICODE_VERSION_PATCH };
167 |     if (major != NULL)
168 |     {
169 |         *major = version[0];
170 |     }
171 |     if (minor != NULL)
172 |     {
173 |         *minor = version[1];
174 |     }
175 |     if (patch != NULL)
176 |     {
177 |         *patch = version[2];
178 |     }
179 | }
180 | 


--------------------------------------------------------------------------------
/CMakeLists.txt:
--------------------------------------------------------------------------------
  1 | cmake_minimum_required(VERSION 3.22)
  2 | project(unicorn VERSION 1.3.1 LANGUAGES C)
  3 | 
  4 | include(CheckIncludeFile)
  5 | include(GNUInstallDirs)
  6 | 
  7 | option(UNICORN_BUILD_STATIC "Build Unicorn as a static library" ON)
  8 | option(UNICORN_BUILD_SHARED "Build Unicorn as a dynamic library" ON)
  9 | option(UNICORN_BUILD_EXAMPLES "Build Unicorn code examples (requires the shared library)" OFF)
 10 | # Components that are only available to commercial licensees.
 11 | option(UNICORN_BUILD_TESTS "Build the tests (commercial licensees only)" OFF)
 12 | option(UNICORN_BUILD_COOKBOOK "Build the documentation/cookbook programs (commercial licensees only)" OFF)
 13 | # Instrumentation toggles.
 14 | option(UNICORN_CODE_COVERAGE "Toggle code coverage" OFF)
 15 | option(UNICORN_UNDEFINED_BEHAVIOR_SANITIZER "Toggle undefined behavior sanitizer" OFF)
 16 | option(UNICORN_ADDRESS_SANITIZER "Toggle address sanitizer" OFF)
 17 | option(UNICORN_MEMORY_SANITIZER "Toggle memory sanitizer" OFF)
 18 | 
 19 | if (NOT UNICORN_BUILD_STATIC AND NOT UNICORN_BUILD_SHARED)
 20 |     message(FATAL_ERROR "Please build either the static or shared library.")
 21 | endif ()
 22 | 
 23 | set(UNICORN_CONFIG "features.json" CACHE STRING "Path to a JSON file that configures the Unicorn feature set.")
 24 | if ("${UNICORN_CONFIG}" STREQUAL "") # Quoted so that an empty value still forms a valid comparison.
 25 |     message(FATAL_ERROR "please specify a unicorn configuration file (must be a JSON file)")
 26 | endif()
 27 | 
 28 | if (MSVC)
 29 |     # The Microsoft compiler assumes source files are ANSI-encoded, and the exact code page depends on the
 30 |     # localized version of Windows in use (Windows-1252 on U.S. and Western European Windows). Adding the
 31 |     # following compiler switch forces UTF-8 instead.
 32 |     add_compile_options(/utf-8)
 33 | endif ()
 34 | 
 35 | if (UNICORN_CODE_COVERAGE)
 36 |     if (NOT CMAKE_C_COMPILER_ID STREQUAL "GNU")
 37 |         message(FATAL_ERROR "Code coverage can only be generated with GCC.")
 38 |     endif ()
 39 |     # Sanitizers cannot be combined with coverage, so the branches below are skipped when coverage is on.
 40 | elseif (UNICORN_UNDEFINED_BEHAVIOR_SANITIZER)
 41 |     add_compile_options(-g -fsanitize=undefined)
 42 |     add_link_options(-fsanitize=undefined)
 43 | elseif (UNICORN_ADDRESS_SANITIZER)
 44 |     add_compile_options(-g -fsanitize=address)
 45 |     add_link_options(-fsanitize=address)
 46 | elseif (UNICORN_MEMORY_SANITIZER)
 47 |     add_compile_options(-O0 -g -fsanitize=memory -fsanitize-memory-track-origins)
 48 |     add_link_options(-fsanitize=memory -fsanitize-memory-track-origins)
 49 | endif ()
 50 | 
 51 | # Check for required C standard library headers; each failure names the header that is missing.
 52 | check_include_file(string.h HAVE_STRING_H)
 53 | if (NOT HAVE_STRING_H)
 54 |     message(FATAL_ERROR "could not find required header: string.h")
 55 | endif ()
 56 | 
 57 | check_include_file(assert.h HAVE_ASSERT_H)
 58 | if (NOT HAVE_ASSERT_H)
 59 |     message(FATAL_ERROR "could not find required header: assert.h")
 60 | endif ()
 61 | 
 62 | check_include_file(stdbool.h HAVE_STDBOOL_H)
 63 | if (NOT HAVE_STDBOOL_H)
 64 |     message(FATAL_ERROR "could not find required header: stdbool.h")
 65 | endif ()
 66 | 
 67 | # Python is required to generate the Unicode data tables.
 68 | find_package(Python3 3.10.0 REQUIRED COMPONENTS Interpreter)
 69 | 
 70 | # Collect the generator scripts so that a change to any of them re-runs the generation step.
 71 | file(GLOB_RECURSE PYTHON_SOURCES CONFIGURE_DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/scripts/*.py")
 72 | set(GENERATED_FILES
 73 |     "${CMAKE_BINARY_DIR}/unidata.c"
 74 |     "${CMAKE_BINARY_DIR}/unidata.h"
 75 |     "${CMAKE_BINARY_DIR}/_unicorn.h"
 76 | )
 77 | add_custom_command(
 78 |     OUTPUT ${GENERATED_FILES}
 79 |     COMMAND Python3::Interpreter -m scripts --config "${UNICORN_CONFIG}" --output "${CMAKE_BINARY_DIR}"
 80 |     WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}"
 81 |     DEPENDS unicode.bin ${PYTHON_SOURCES} "${UNICORN_CONFIG}"
 82 |     VERBATIM) # VERBATIM keeps argument escaping identical on every platform.
 83 | add_custom_target(unicorn_generate DEPENDS ${GENERATED_FILES})
 84 | 
 85 | # Mark the generated source files as being produced at build time so CMake
 86 | # doesn't try to find them while configuring.
 87 | set_source_files_properties(${GENERATED_FILES} PROPERTIES GENERATED 1)
 88 | 
 89 | include_directories(
 90 |     ${CMAKE_BINARY_DIR} # For finding the generated files.
 91 |     include             # For finding the public header files.
 92 |     src                 # For finding internal header files.
 93 | )
 94 | 
 95 | add_subdirectory(src) # The Unicorn library.
 96 | 
 97 | if (UNICORN_BUILD_EXAMPLES)
 98 |     add_subdirectory(examples) # The examples are only added when requested.
 99 | endif ()
100 | 
101 | # Generate the package version file from the project version.
102 | include(CMakePackageConfigHelpers)
103 | write_basic_package_version_file("${CMAKE_BINARY_DIR}/UnicornConfigVersion.cmake" VERSION "${PROJECT_VERSION}" COMPATIBILITY SameMajorVersion)
104 | 
105 | # Variables substituted into the pkg-config and UnicornConfig.cmake templates.
106 | set(VERSION "${PROJECT_VERSION}")
107 | set(prefix "${CMAKE_INSTALL_PREFIX}")
108 | set(exec_prefix "\${prefix}")
109 | set(libdir "\${exec_prefix}/${CMAKE_INSTALL_LIBDIR}")
110 | set(includedir "\${prefix}/${CMAKE_INSTALL_INCLUDEDIR}")
111 | configure_file("${CMAKE_SOURCE_DIR}/unicorn.pc.in" "${CMAKE_BINARY_DIR}/unicorn.pc" @ONLY)
112 | configure_file("${CMAKE_SOURCE_DIR}/UnicornConfig.cmake.in" "${CMAKE_BINARY_DIR}/UnicornConfig.cmake" @ONLY)
113 | 
114 | # Only install man pages when running "make install" locally.
115 | add_subdirectory(man)
116 | 
117 | # The "make install" action: both package files share one destination.
118 | install(FILES "${CMAKE_BINARY_DIR}/UnicornConfig.cmake" "${CMAKE_BINARY_DIR}/UnicornConfigVersion.cmake"
119 |     DESTINATION cmake)
120 | install(FILES "${CMAKE_BINARY_DIR}/unicorn.pc" DESTINATION "${CMAKE_INSTALL_LIBDIR}/pkgconfig")
121 | 
122 | # Add the unit tests if requested (commercial users only).
123 | if (UNICORN_BUILD_TESTS)
124 |     enable_testing()
125 |     include(CTest)
126 |     add_subdirectory(../tests tests)
127 | endif ()
128 | 
129 | if (UNICORN_BUILD_COOKBOOK) # Like the tests, the cookbook sources are read from outside this source directory.
130 |     add_subdirectory(../docs docs)
131 | endif ()
132 | 


--------------------------------------------------------------------------------