├── scripts
├── __init__.py
├── Makefile.am
├── version.py
├── compression.py
├── basics.py
├── ccc.py
├── feature.py
├── general_category.py
├── encoding.py
├── numeric_value.py
├── collation.py
├── decomposition.py
├── composition.py
├── quickcheck.py
├── case_folding.py
├── case_conversion.py
└── segmentation.py
├── .gitattributes
├── include
└── Makefile.am
├── unicode.bin
├── examples
├── .gitignore
├── CMakeLists.txt
├── compress_text.c
├── decode_utf8.c
├── decode_utf16.c
├── decode_utf32.c
├── segment_text.c
├── Makefile.am
├── case_convert.c
├── encode_character.c
├── case_fold.c
├── compare_text.c
├── collate_text.c
└── character_properties.c
├── Makefile.am
├── unicorn.pc.in
├── tests
└── README.md
├── autogen.sh
├── .github
├── pull_request_template.md
├── ISSUE_TEMPLATE
│ └── config.yml
└── workflows
│ └── build.yml
├── man
├── unisize.3
├── uni_utf8.3
├── uni_utf32.3
├── uni_utf16.3
├── unicasefold.3
├── unicaseconv.3
├── uni_scalar.3
├── unierrfunc.3
├── uni_big.3
├── uni_little.3
├── Makefile.am
├── uni_native.3
├── uni_is.3
├── CMakeLists.txt
├── uni_getversion.3
├── uni_ccc.3
├── uni_tolower.3
├── uni_toupper.3
├── uni_totitle.3
├── uni_getucdversion.3
├── uni_trust.3
├── unimemfunc.3
├── uni_normchk.3
├── uninormform.3
├── unichar.3
├── uni_gc.3
├── uni_prevbrk.3
├── uni_validate.3
├── uni_nulify.3
├── uninormchk.3
├── uni_caseconvchk.3
├── uni_numval.3
├── unistrength.3
├── uni_casefoldchk.3
├── uniweighting.3
├── uni_seterrfunc.3
├── uni_prev.3
├── uni_compress.3
├── uni_nextbrk.3
├── uni_decompress.3
├── uni_casefoldcmp.3
├── uni_normqchk.3
├── uniattr.3
├── uni_sortkeycmp.3
├── uni_setmemfunc.3
├── uni_next.3
├── uni_caseconv.3
├── unistat.3
├── uni_norm.3
├── uni_convert.3
├── uni_encode.3
├── uni_casefold.3
├── uni_sortkeymk.3
├── uni_collate.3
├── uni_normcmp.3
└── unibreak.3
├── src
├── logger.c
├── Makefile.am
├── ducet.h
├── normalize.h
├── charvec.h
├── charbuf.h
├── byteswap.h
├── memory.c
├── common.h
├── CMakeLists.txt
├── charvec.c
├── properties.c
└── unicorn.c
├── .gitignore
├── features.json
├── configure.ac
├── UnicornConfig.cmake.in
└── CMakeLists.txt
/scripts/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/.gitattributes:
--------------------------------------------------------------------------------
1 | * text=auto eol=lf
2 |
--------------------------------------------------------------------------------
/include/Makefile.am:
--------------------------------------------------------------------------------
1 | include_HEADERS = unicorn.h
2 |
--------------------------------------------------------------------------------
/scripts/Makefile.am:
--------------------------------------------------------------------------------
1 | EXTRA_DIST = $(wildcard *.py)
2 |
--------------------------------------------------------------------------------
/unicode.bin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/railgunlabs/unicorn/HEAD/unicode.bin
--------------------------------------------------------------------------------
/examples/.gitignore:
--------------------------------------------------------------------------------
1 | caseconv
2 | casefold
3 | charprops
4 | collate
5 | compare
6 | compress
7 | decode-*
8 | encode-*
9 | segmentation
10 |
--------------------------------------------------------------------------------
/Makefile.am:
--------------------------------------------------------------------------------
1 | SUBDIRS = man src include examples scripts
2 |
3 | EXTRA_DIST = unicode.bin features.json autogen.sh LICENSE CMakeLists.txt UnicornConfig.cmake.in README.md
4 |
5 | AUTOMAKE_OPTIONS = subdir-objects
6 |
7 | pkgconfigdir = $(libdir)/pkgconfig
8 | pkgconfig_DATA = unicorn.pc
9 |
--------------------------------------------------------------------------------
/unicorn.pc.in:
--------------------------------------------------------------------------------
1 | prefix=@prefix@
2 | exec_prefix=@exec_prefix@
3 | libdir=@libdir@
4 | includedir=@includedir@
5 |
6 | Name: unicorn
7 | Description: Embeddable Unicode algorithms.
8 | Version: @VERSION@
9 | URL: https://railgunlabs.com/unicorn/
10 |
11 | Cflags: -I${includedir}
12 | Libs: -L${libdir} -lunicorn
13 |
--------------------------------------------------------------------------------
/tests/README.md:
--------------------------------------------------------------------------------
1 | # Looking for Tests?
2 |
3 | Unicorn is extensively tested; however, the tests and the code for generating the Unicode® data tables are **not** open source.
4 | Both are exclusively available to commercial licensees.
5 |
6 | See [https://railgunlabs.com/unicorn/license/](https://railgunlabs.com/unicorn/license/) for licensing details.
7 |
--------------------------------------------------------------------------------
/autogen.sh:
--------------------------------------------------------------------------------
#!/bin/sh
#
# Regenerates the Autotools build files by running autoreconf.
# Exits with a non-zero status when autoreconf is missing or fails, so that
# callers (for example a CI job) do not continue with a stale or absent
# ./configure script.

echo "Generating build information using autoreconf"
echo "This may take a while ..."

# 'command -v' is the POSIX-specified way to test whether a program exists.
if ! command -v autoreconf >/dev/null 2>&1; then
    echo "Couldn't find autoreconf, aborting" >&2
    echo "Please install autoreconf and try again" >&2
    exit 1
fi

# Propagate autoreconf's own exit status instead of claiming success.
autoreconf --verbose --install --force || exit $?
echo "Now you are ready to run ./configure"
13 |
--------------------------------------------------------------------------------
/.github/pull_request_template.md:
--------------------------------------------------------------------------------
1 |
7 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/config.yml:
--------------------------------------------------------------------------------
1 | blank_issues_enabled: false
2 | contact_links:
3 | - name: 📚 Documentation
4 | url: https://railgunlabs.com/unicorn/manual
5 | about: Review documentation and code examples.
6 | - name: 🔹 Basic Support
7 | url: https://railgunlabs.com/contribute
8 | about: Submit unsolicited bug reports and code contributions.
9 | - name: 💎 Premium Support
10 | url: https://railgunlabs.com/services
11 | about: Priority support for bug reports and personal Q&A assistance.
12 |
--------------------------------------------------------------------------------
/scripts/version.py:
--------------------------------------------------------------------------------
1 | # Unicorn - Embeddable Unicode Algorithms
2 | # Copyright (c) 2024-2025 Railgun Labs
3 | #
4 | # This software is dual-licensed: you can redistribute it and/or modify
5 | # it under the terms of the GNU General Public License version 3 as
6 | # published by the Free Software Foundation. For the terms of this
7 | # license, see <https://www.gnu.org/licenses/>.
8 | #
9 | # Alternatively, you can license this software under a proprietary
10 | # license, as set out in <https://railgunlabs.com/unicorn/license/>.
11 |
# Unicode version string in "major.minor.update" form.
# NOTE(review): presumably the Unicode release the generated data tables
# target -- confirm against the scripts that read this constant.
UNICODE_VERSION = "17.0.0"
13 |
--------------------------------------------------------------------------------
/examples/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | link_libraries(unicorn)
2 | add_executable(example_case_convert case_convert.c)
3 | add_executable(example_case_fold case_fold.c)
4 | add_executable(example_character_properties character_properties.c)
5 | add_executable(example_collate_text collate_text.c)
6 | add_executable(example_compare_text compare_text.c)
7 | add_executable(example_decode_utf8 decode_utf8.c)
8 | add_executable(example_decode_utf16 decode_utf16.c)
9 | add_executable(example_decode_utf32 decode_utf32.c)
10 | add_executable(example_encode_character encode_character.c)
11 | add_executable(example_segment_text segment_text.c)
12 | add_executable(example_compress_text compress_text.c)
13 |
--------------------------------------------------------------------------------
/man/unisize.3:
--------------------------------------------------------------------------------
1 | .TH "UNICORN" "3" "Dec 6th 2025" "Unicorn 1.3.1"
2 | .SH NAME
3 | unisize \- code unit count
4 | .SH LIBRARY
5 | Embeddable Unicode Algorithms (libunicorn, -lunicorn)
6 | .SH SYNOPSIS
7 | .nf
8 | .B #include
9 | .PP
10 | .BI "typedef int32_t unisize;"
11 | .fi
12 | .SH DESCRIPTION
13 | An integer type whose storage size represents the maximum potential length (in code units) of encoded text.
14 | .SH AUTHOR
15 | .UR https://railgunlabs.com
16 | Railgun Labs
17 | .UE .
18 | .SH INTERNET RESOURCES
19 | The online documentation is published on the
20 | .UR https://railgunlabs.com/unicorn
21 | Railgun Labs website
22 | .UE .
23 | .SH LICENSING
24 | Unicorn is distributed with its end-user license agreement (EULA).
25 | Please review the agreement for information on terms & conditions for accessing or otherwise using Unicorn and for a DISCLAIMER OF ALL WARRANTIES.
26 |
--------------------------------------------------------------------------------
/man/uni_utf8.3:
--------------------------------------------------------------------------------
1 | .TH "UNICORN" "3" "Dec 6th 2025" "Unicorn 1.3.1"
2 | .SH NAME
3 | UNI_UTF8 \- UTF-8 encoding form
4 | .SH LIBRARY
5 | Embeddable Unicode Algorithms (libunicorn, -lunicorn)
6 | .SH SYNOPSIS
7 | .nf
8 | .B #include
9 | .PP
10 | .B #define UNI_UTF8 0x2u
11 | .fi
12 | .SH DESCRIPTION
13 | Text attribute bit flag that indicates the text is encoded as UTF-8.
14 | The implementation uses an unsigned 8-bit integer (\f[C]uint8_t\f[R]) to represent UTF-8 code units.
15 | .SH AUTHOR
16 | .UR https://railgunlabs.com
17 | Railgun Labs
18 | .UE .
19 | .SH INTERNET RESOURCES
20 | The online documentation is published on the
21 | .UR https://railgunlabs.com/unicorn
22 | Railgun Labs website
23 | .UE .
24 | .SH LICENSING
25 | Unicorn is distributed with its end-user license agreement (EULA).
26 | Please review the agreement for information on terms & conditions for accessing or otherwise using Unicorn and for a DISCLAIMER OF ALL WARRANTIES.
27 |
--------------------------------------------------------------------------------
/man/uni_utf32.3:
--------------------------------------------------------------------------------
1 | .TH "UNICORN" "3" "Dec 6th 2025" "Unicorn 1.3.1"
2 | .SH NAME
3 | UNI_UTF32 \- UTF-32 encoding form
4 | .SH LIBRARY
5 | Embeddable Unicode Algorithms (libunicorn, -lunicorn)
6 | .SH SYNOPSIS
7 | .nf
8 | .B #include
9 | .PP
10 | .B #define UNI_UTF32 0x8u
11 | .fi
12 | .SH DESCRIPTION
13 | Text attribute bit flag that indicates the text is encoded as UTF-32.
14 | The implementation uses an unsigned 32-bit integer (\f[C]uint32_t\f[R]) to represent UTF-32 code units.
15 | .SH AUTHOR
16 | .UR https://railgunlabs.com
17 | Railgun Labs
18 | .UE .
19 | .SH INTERNET RESOURCES
20 | The online documentation is published on the
21 | .UR https://railgunlabs.com/unicorn
22 | Railgun Labs website
23 | .UE .
24 | .SH LICENSING
25 | Unicorn is distributed with its end-user license agreement (EULA).
26 | Please review the agreement for information on terms & conditions for accessing or otherwise using Unicorn and for a DISCLAIMER OF ALL WARRANTIES.
27 |
--------------------------------------------------------------------------------
/man/uni_utf16.3:
--------------------------------------------------------------------------------
1 | .TH "UNICORN" "3" "Dec 6th 2025" "Unicorn 1.3.1"
2 | .SH NAME
3 | UNI_UTF16 \- UTF-16 encoding form
4 | .SH LIBRARY
5 | Embeddable Unicode Algorithms (libunicorn, -lunicorn)
6 | .SH SYNOPSIS
7 | .nf
8 | .B #include
9 | .PP
10 | .B #define UNI_UTF16 0x4u
11 | .fi
12 | .SH DESCRIPTION
13 | Text attribute bit flag that indicates the text is encoded as UTF-16.
14 | The implementation uses an unsigned 16-bit integer (\f[C]uint16_t\f[R]) to represent UTF-16 code units.
15 | .SH AUTHOR
16 | .UR https://railgunlabs.com
17 | Railgun Labs
18 | .UE .
19 | .SH INTERNET RESOURCES
20 | The online documentation is published on the
21 | .UR https://railgunlabs.com/unicorn
22 | Railgun Labs website
23 | .UE .
24 | .SH LICENSING
25 | Unicorn is distributed with its end-user license agreement (EULA).
26 | Please review the agreement for information on terms & conditions for accessing or otherwise using Unicorn and for a DISCLAIMER OF ALL WARRANTIES.
27 |
--------------------------------------------------------------------------------
/scripts/compression.py:
--------------------------------------------------------------------------------
1 | # Unicorn - Embeddable Unicode Algorithms
2 | # Copyright (c) 2024-2025 Railgun Labs
3 | #
4 | # This software is dual-licensed: you can redistribute it and/or modify
5 | # it under the terms of the GNU General Public License version 3 as
6 | # published by the Free Software Foundation. For the terms of this
7 | # license, see <https://www.gnu.org/licenses/>.
8 | #
9 | # Alternatively, you can license this software under a proprietary
10 | # license, as set out in <https://railgunlabs.com/unicorn/license/>.
11 |
12 | import zipfile
13 |
14 | from .config import Config
15 | from .tools import Codespace
16 | from .feature import Feature
17 | from .basics import SerializationData
18 |
class ShortStringCompression(Feature):
    """Feature whose only output is the compression feature-test macro."""

    def process(self, archive: zipfile.ZipFile, codespace: Codespace, config: Config) -> SerializationData:
        """Return serialization data that defines UNICORN_FEATURE_COMPRESSION.

        None of the arguments are read; the result carries only a
        public-header fragment and leaves every other field at its default.
        """
        return SerializationData(
            public_header="#define UNICORN_FEATURE_COMPRESSION\n",
        )
23 |
--------------------------------------------------------------------------------
/.github/workflows/build.yml:
--------------------------------------------------------------------------------
# Builds the library and examples with both CMake and Autotools, once per
# compiler listed in the matrix.
name: Build Status
on: [push, pull_request, workflow_dispatch]
jobs:
  linux:
    name: "Linux - ${{ matrix.compiler.display }}"
    runs-on: ubuntu-latest
    strategy:
      matrix:
        compiler:
          - { CC: gcc, display: GCC }
          - { CC: clang, display: Clang }
    steps:
      - name: Checkout Repository
        uses: actions/checkout@v4
      - uses: actions/setup-python@v5
        with:
          python-version: '3.10'
      - name: CMake Build
        env:
          # The matrix key is 'compiler'; referencing any other key yields an
          # empty string and silently falls back to the default compiler.
          CC: ${{ matrix.compiler.CC }}
          # No matrix entry defines CFLAGS yet, so this expands to empty.
          CFLAGS: ${{ matrix.compiler.CFLAGS }}
        run: |
          cmake -B build -DCMAKE_BUILD_TYPE=Release -DUNICORN_BUILD_EXAMPLES=ON
          cmake --build build
      - name: Autotools Build
        env:
          CC: ${{ matrix.compiler.CC }}
          CFLAGS: ${{ matrix.compiler.CFLAGS }}
        run: |
          ./autogen.sh
          ./configure --enable-examples
          make
          make distcheck
34 |
--------------------------------------------------------------------------------
/scripts/basics.py:
--------------------------------------------------------------------------------
1 | # Unicorn - Embeddable Unicode Algorithms
2 | # Copyright (c) 2024-2025 Railgun Labs
3 | #
4 | # This software is dual-licensed: you can redistribute it and/or modify
5 | # it under the terms of the GNU General Public License version 3 as
6 | # published by the Free Software Foundation. For the terms of this
7 | # license, see <https://www.gnu.org/licenses/>.
8 | #
9 | # Alternatively, you can license this software under a proprietary
10 | # license, as set out in <https://railgunlabs.com/unicorn/license/>.
11 |
12 | from enum import IntEnum
13 | from dataclasses import dataclass
14 |
class StorageSize(IntEnum):
    """Storage widths, where each member's value is its size in bits."""

    UINT32 = 32
    UINT16 = 16
    UINT8 = 8

    def byte_size(self) -> int:
        """Return this width converted from bits to whole bytes."""
        bits = self.value
        return bits // 8
22 |
@dataclass
class SerializationData:
    """Text fragments and a size value produced by one processing step.

    The fields are declared as real dataclass fields so that the generated
    ``__eq__`` and ``__repr__`` take them into account. With a hand-written
    ``__init__`` and no annotated fields, ``@dataclass`` would generate a
    zero-field ``__eq__`` under which every instance compares equal.

    The generated constructor accepts the same parameters, in the same
    order and with the same defaults, as the previous explicit ``__init__``.
    """

    # Text for the generated source file (empty when nothing is emitted).
    source: str = ""
    # Text for the generated header.
    header: str = ""
    # Text for the generated public header.
    public_header: str = ""
    # Size figure associated with the emitted data.
    # NOTE(review): units are not visible here -- presumably bytes; confirm.
    size: int = 0
30 |
--------------------------------------------------------------------------------
/src/logger.c:
--------------------------------------------------------------------------------
1 | /*
2 | * Unicorn - Embeddable Unicode Algorithms
3 | * Copyright (c) 2024-2025 Railgun Labs
4 | *
5 | * This software is dual-licensed: you can redistribute it and/or modify
6 | * it under the terms of the GNU General Public License version 3 as
7 | * published by the Free Software Foundation. For the terms of this
8 | * license, see <https://www.gnu.org/licenses/>.
9 | *
10 | * Alternatively, you can license this software under a proprietary
11 | * license, as set out in <https://railgunlabs.com/unicorn/license/>.
12 | */
13 |
14 | #include "common.h"
15 |
/* Currently registered diagnostic callback; NULL until one is installed. */
static unierrfunc unicorn_logger_cb;
/* Opaque pointer handed back to the callback on every invocation. */
static void *unicorn_logger_ud;

/*
 * Registers (or replaces) the diagnostic callback and its user data.
 * Passing a NULL callback disables message delivery, because uni_message()
 * checks the callback against NULL before calling it.
 */
UNICORN_API void uni_seterrfunc(void *user_data, unierrfunc callback) // cppcheck-suppress misra-c2012-8.7 ; This is supposed to have external linkage.
{
    unicorn_logger_ud = user_data;
    unicorn_logger_cb = callback;
}

/*
 * Delivers a diagnostic message to the registered callback, if there is one.
 * The message pointer is forwarded unchanged together with the stored
 * user data; nothing happens when no callback is registered.
 */
void uni_message(const char *msg)
{
    if (unicorn_logger_cb != NULL)
    {
        unicorn_logger_cb(unicorn_logger_ud, msg);
    }
}
32 |
--------------------------------------------------------------------------------
/man/unicasefold.3:
--------------------------------------------------------------------------------
1 | .TH "UNICORN" "3" "Dec 6th 2025" "Unicorn 1.3.1"
2 | .SH NAME
3 | unicasefold \- case folding operations
4 | .SH LIBRARY
5 | Embeddable Unicode Algorithms (libunicorn, -lunicorn)
6 | .SH SYNOPSIS
7 | .nf
8 | .B #include
9 | .PP
10 | .B enum unicasefold {
11 | .RS
12 | .B UNI_DEFAULT,
13 | .B UNI_CANONICAL,
14 | .RE
15 | .B };
16 | .fi
17 | .SH DESCRIPTION
18 | This enumeration defines the supported case folding operations.
19 | .SH CONSTANTS
20 | .TP
21 | .BR UNI_DEFAULT
22 | Case folding for binary caseless string comparison.
23 | .TP
24 | .BR UNI_CANONICAL
25 | Case folding for canonical caseless string comparison.
26 | .SH AUTHOR
27 | .UR https://railgunlabs.com
28 | Railgun Labs
29 | .UE .
30 | .SH INTERNET RESOURCES
31 | The online documentation is published on the
32 | .UR https://railgunlabs.com/unicorn
33 | Railgun Labs website
34 | .UE .
35 | .SH LICENSING
36 | Unicorn is distributed with its end-user license agreement (EULA).
37 | Please review the agreement for information on terms & conditions for accessing or otherwise using Unicorn and for a DISCLAIMER OF ALL WARRANTIES.
38 |
--------------------------------------------------------------------------------
/man/unicaseconv.3:
--------------------------------------------------------------------------------
1 | .TH "UNICORN" "3" "Dec 6th 2025" "Unicorn 1.3.1"
2 | .SH NAME
3 | unicaseconv \- case conversion operations
4 | .SH LIBRARY
5 | Embeddable Unicode Algorithms (libunicorn, -lunicorn)
6 | .SH SYNOPSIS
7 | .nf
8 | .B #include
9 | .PP
10 | .B enum unicaseconv {
11 | .RS
12 | .B UNI_LOWER,
13 | .B UNI_TITLE,
14 | .B UNI_UPPER,
15 | .RE
16 | .B };
17 | .fi
18 | .SH DESCRIPTION
19 | This enumeration defines the supported case conversion operations.
20 | .SH CONSTANTS
21 | .TP
22 | .BR UNI_LOWER
23 | Lowercase case conversion.
24 | .TP
25 | .BR UNI_TITLE
26 | Titlecase case conversion.
27 | .TP
28 | .BR UNI_UPPER
29 | Uppercase case conversion.
30 | .SH AUTHOR
31 | .UR https://railgunlabs.com
32 | Railgun Labs
33 | .UE .
34 | .SH INTERNET RESOURCES
35 | The online documentation is published on the
36 | .UR https://railgunlabs.com/unicorn
37 | Railgun Labs website
38 | .UE .
39 | .SH LICENSING
40 | Unicorn is distributed with its end-user license agreement (EULA).
41 | Please review the agreement for information on terms & conditions for accessing or otherwise using Unicorn and for a DISCLAIMER OF ALL WARRANTIES.
42 |
--------------------------------------------------------------------------------
/src/Makefile.am:
--------------------------------------------------------------------------------
1 | AUTOMAKE_OPTIONS = subdir-objects
2 |
3 | config_json = @CONFIG_JSON@
4 |
5 | include_HEADERS = _unicorn.h
6 |
7 | lib_LTLIBRARIES = libunicorn.la
8 |
9 | libunicorn_la_SOURCES = \
10 | $(top_srcdir)/include/unicorn.h \
11 | unicorn.c \
12 | logger.c \
13 | memory.c \
14 | common.h \
15 | byteswap.h \
16 | encoding.c \
17 | segmentation.c \
18 | normalize.c \
19 | normalize.h \
20 | collation.c \
21 | ducet.c \
22 | ducet.h \
23 | compression.c \
24 | properties.c \
25 | caseconv.c \
26 | casefold.c \
27 | charbuf.c \
28 | charbuf.h \
29 | charvec.c \
30 | charvec.h \
31 | unidata.c \
32 | unidata.h \
33 | _unicorn.h
34 |
35 | libunicorn_la_CFLAGS = -I$(top_srcdir)/include
36 | libunicorn_la_LDFLAGS = -no-undefined -version-info 0:0:0
37 |
38 | unidata.c:
39 | cd $(top_srcdir) && $(PYTHON) -m scripts --config $(config_json) --output $(abs_top_builddir)/src
40 |
41 | unidata.h: unidata.c
42 |
43 | _unicorn.h: unidata.c
44 |
45 | noinst_HEADERS = unidata.h
46 |
47 | BUILT_SOURCES = \
48 | unidata.c \
49 | unidata.h \
50 | _unicorn.h
51 |
52 | CLEANFILES = \
53 | unidata.c \
54 | unidata.h \
55 | _unicorn.h
56 |
--------------------------------------------------------------------------------
/man/uni_scalar.3:
--------------------------------------------------------------------------------
1 | .TH "UNICORN" "3" "Dec 6th 2025" "Unicorn 1.3.1"
2 | .SH NAME
3 | UNI_SCALAR \- Unicode scalar value
4 | .SH LIBRARY
5 | Embeddable Unicode Algorithms (libunicorn, -lunicorn)
6 | .SH SYNOPSIS
7 | .nf
8 | .B #include
9 | .PP
10 | .B #define UNI_SCALAR 0x1u
11 | .fi
12 | .SH DESCRIPTION
13 | Text attribute bit flag that indicates the text is represented by an integer type large enough to accommodate Unicode scalar values.
14 | .PP
15 | This is a pseudo-encoding that uses the \f[B]unichar\f[R](3) type for storage.
16 | .PP
17 | Unicode scalars are not the same as code points.
18 | The former excludes surrogate characters whereas the latter does not.
19 | .SH SEE ALSO
20 | .BR unichar (3)
21 | .SH AUTHOR
22 | .UR https://railgunlabs.com
23 | Railgun Labs
24 | .UE .
25 | .SH INTERNET RESOURCES
26 | The online documentation is published on the
27 | .UR https://railgunlabs.com/unicorn
28 | Railgun Labs website
29 | .UE .
30 | .SH LICENSING
31 | Unicorn is distributed with its end-user license agreement (EULA).
32 | Please review the agreement for information on terms & conditions for accessing or otherwise using Unicorn and for a DISCLAIMER OF ALL WARRANTIES.
33 |
--------------------------------------------------------------------------------
/man/unierrfunc.3:
--------------------------------------------------------------------------------
1 | .TH "UNICORN" "3" "Dec 6th 2025" "Unicorn 1.3.1"
2 | .SH NAME
3 | unierrfunc \- diagnostic function
4 | .SH LIBRARY
5 | Embeddable Unicode Algorithms (libunicorn, -lunicorn)
6 | .SH SYNOPSIS
7 | .nf
8 | .B #include
9 | .PP
10 | .BI "typedef void(*unierrfunc)(void *" user_data ", const char *" message ");"
11 | .fi
12 | .SH DESCRIPTION
13 | Defines the function signature for a custom error callback function.
14 | The resulting message, given by \f[I]message\f[R], is UTF-8 encoded and null-terminated.
15 | It is a message intended for programmers, written in US English, for debugging purposes.
16 | It must never be displayed to an end user.
17 | .SH SEE ALSO
18 | .BR uni_seterrfunc (3)
19 | .SH AUTHOR
20 | .UR https://railgunlabs.com
21 | Railgun Labs
22 | .UE .
23 | .SH INTERNET RESOURCES
24 | The online documentation is published on the
25 | .UR https://railgunlabs.com/unicorn
26 | Railgun Labs website
27 | .UE .
28 | .SH LICENSING
29 | Unicorn is distributed with its end-user license agreement (EULA).
30 | Please review the agreement for information on terms & conditions for accessing or otherwise using Unicorn and for a DISCLAIMER OF ALL WARRANTIES.
31 |
--------------------------------------------------------------------------------
/examples/compress_text.c:
--------------------------------------------------------------------------------
1 | /*
2 | * This file is public domain.
3 | * You may use, modify, and distribute it without restriction.
4 | */
5 |
6 | // Examples are also documented online at:
7 | // .
8 |
9 | #include
10 | #include
11 |
12 | int main(int argc, char *argv[])
13 | {
14 | // This code example compresses text using the BOCU-1 compression algorithm.
15 | // This algorithm is specifically designed for the compression of short
16 | // Unicode text. Once strings become longer, then a general purpose
17 | // compressor is preferable.
18 |
19 | const char src[] = u8"素早い茶色のキツネが怠け者の犬を飛び越えます";
20 |
21 | char buf[64];
22 | size_t buf_len = 64;
23 |
24 | // The following function compresses the input text 'src' and writes the
25 | // result to 'buf'. It's function signature is documented online at:
26 | // .
27 | if (uni_compress(src, -1, UNI_UTF8, buf, &buf_len) == UNI_OK)
28 | {
29 | printf("Uncompressed : %ld bytes\n", sizeof(src));
30 | printf("Compressed : %ld bytes\n", buf_len);
31 | }
32 | return 0;
33 | }
34 |
--------------------------------------------------------------------------------
/man/uni_big.3:
--------------------------------------------------------------------------------
1 | .TH "UNICORN" "3" "Dec 6th 2025" "Unicorn 1.3.1"
2 | .SH NAME
3 | UNI_BIG \- big endian byte order
4 | .SH LIBRARY
5 | Embeddable Unicode Algorithms (libunicorn, -lunicorn)
6 | .SH SYNOPSIS
7 | .nf
8 | .B #include
9 | .PP
10 | .B #define UNI_BIG 0x10u
11 | .fi
12 | .SH DESCRIPTION
13 | Text attribute bit flag that indicates the text is encoded in big endian byte order.
14 | It's incompatible with the \f[B]UNI_LITTLE\f[R](3) flag.
15 | .PP
16 | The flag is intended for use with \f[B]UNI_UTF16\f[R](3) and \f[B]UNI_UTF32\f[R](3).
17 | It has no effect when used with \f[B]UNI_UTF8\f[R](3) or \f[B]UNI_SCALAR\f[R](3).
18 | .SH SEE ALSO
19 | .BR UNI_LITTLE (3),
20 | .BR UNI_UTF16 (3),
21 | .BR UNI_UTF32 (3),
22 | .BR UNI_UTF8 (3),
23 | .BR UNI_SCALAR (3)
24 | .SH AUTHOR
25 | .UR https://railgunlabs.com
26 | Railgun Labs
27 | .UE .
28 | .SH INTERNET RESOURCES
29 | The online documentation is published on the
30 | .UR https://railgunlabs.com/unicorn
31 | Railgun Labs website
32 | .UE .
33 | .SH LICENSING
34 | Unicorn is distributed with its end-user license agreement (EULA).
35 | Please review the agreement for information on terms & conditions for accessing or otherwise using Unicorn and for a DISCLAIMER OF ALL WARRANTIES.
36 |
--------------------------------------------------------------------------------
/man/uni_little.3:
--------------------------------------------------------------------------------
1 | .TH "UNICORN" "3" "Dec 6th 2025" "Unicorn 1.3.1"
2 | .SH NAME
3 | UNI_LITTLE \- little endian byte order
4 | .SH LIBRARY
5 | Embeddable Unicode Algorithms (libunicorn, -lunicorn)
6 | .SH SYNOPSIS
7 | .nf
8 | .B #include
9 | .PP
10 | .B #define UNI_LITTLE 0x20u
11 | .fi
12 | .SH DESCRIPTION
13 | Text attribute bit flag that indicates the text is encoded in little endian byte order.
14 | It's incompatible with the \f[B]UNI_BIG\f[R](3) flag.
15 | .PP
16 | The flag is intended for use with \f[B]UNI_UTF16\f[R](3) and \f[B]UNI_UTF32\f[R](3).
17 | It has no effect when used with \f[B]UNI_UTF8\f[R](3) or \f[B]UNI_SCALAR\f[R](3).
18 | .SH SEE ALSO
19 | .BR UNI_BIG (3),
20 | .BR UNI_UTF16 (3),
21 | .BR UNI_UTF32 (3),
22 | .BR UNI_UTF8 (3),
23 | .BR UNI_SCALAR (3)
24 | .SH AUTHOR
25 | .UR https://railgunlabs.com
26 | Railgun Labs
27 | .UE .
28 | .SH INTERNET RESOURCES
29 | The online documentation is published on the
30 | .UR https://railgunlabs.com/unicorn
31 | Railgun Labs website
32 | .UE .
33 | .SH LICENSING
34 | Unicorn is distributed with its end-user license agreement (EULA).
35 | Please review the agreement for information on terms & conditions for accessing or otherwise using Unicorn and for a DISCLAIMER OF ALL WARRANTIES.
36 |
--------------------------------------------------------------------------------
/man/Makefile.am:
--------------------------------------------------------------------------------
1 | man3_MANS = \
2 | uni_convert.3 \
3 | uni_decompress.3 \
4 | uninormchk.3 \
5 | uni_normqchk.3 \
6 | uni_norm.3 \
7 | unisize.3 \
8 | uni_normcmp.3 \
9 | uni_caseconv.3 \
10 | uni_casefoldcmp.3 \
11 | uniattr.3 \
12 | unimemfunc.3 \
13 | uninormform.3 \
14 | uni_nextbrk.3 \
15 | uniweighting.3 \
16 | uni_setmemfunc.3 \
17 | uni_nulify.3 \
18 | uni_getucdversion.3 \
19 | uni_ccc.3 \
20 | uni_utf16.3 \
21 | unierrfunc.3 \
22 | uni_sortkeymk.3 \
23 | uni_encode.3 \
24 | uni_compress.3 \
25 | uni_utf8.3 \
26 | unistat.3 \
27 | uni_big.3 \
28 | uni_normchk.3 \
29 | uni_collate.3 \
30 | uni_trust.3 \
31 | uni_next.3 \
32 | uni_caseconvchk.3 \
33 | uni_getversion.3 \
34 | uni_validate.3 \
35 | uni_native.3 \
36 | uni_scalar.3 \
37 | unicaseconv.3 \
38 | uni_utf32.3 \
39 | uni_little.3 \
40 | unibp.3 \
41 | unistrength.3 \
42 | uni_totitle.3 \
43 | uni_sortkeycmp.3 \
44 | unicorn.3 \
45 | uni_toupper.3 \
46 | uni_prev.3 \
47 | unicasefold.3 \
48 | uni_casefold.3 \
49 | uni_gc.3 \
50 | uni_prevbrk.3 \
51 | unichar.3 \
52 | uni_numval.3 \
53 | uni_tolower.3 \
54 | unigc.3 \
55 | uni_seterrfunc.3 \
56 | uni_casefoldchk.3 \
57 | uni_is.3 \
58 | unibreak.3
59 | EXTRA_DIST = CMakeLists.txt $(man3_MANS)
60 |
--------------------------------------------------------------------------------
/examples/decode_utf8.c:
--------------------------------------------------------------------------------
1 | /*
2 | * This file is public domain.
3 | * You may use, modify, and distribute it without restriction.
4 | */
5 |
6 | // Examples are also documented online at:
7 | // .
8 |
9 | #include
10 | #include
11 |
12 | int main(int argc, char *argv[])
13 | {
14 | // This example demonstrates decoding the code points of a UTF-8 encoded string.
15 | // It prints each code point on its own line.
16 |
17 | const char str[] = u8"I 🕵️."; // I spy
18 | unisize i = 0;
19 | for (;;)
20 | {
21 | unichar cp = 0x0;
22 | unistat s = uni_next(str, -1, UNI_UTF8, &i, &cp);
23 | if (s == UNI_OK)
24 | {
25 | printf("U+%04X\n", cp);
26 | }
27 | else if (s == UNI_DONE)
28 | {
29 | break;
30 | }
31 | else if (s == UNI_BAD_ENCODING)
32 | {
33 | printf("malformed character at code unit %d\n", i);
34 | return 1;
35 | }
36 | else
37 | {
38 | puts("an internal error occured");
39 | return 2; // Something else went wrong (check the status code).
40 | }
41 | }
42 | return 0;
43 | }
44 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Object files
2 | *.o
3 | *.ko
4 | *.obj
5 | *.elf
6 |
7 | # Libraries
8 | *.lib
9 | *.a
10 |
11 | # Shared objects (inc. Windows DLLs)
12 | *.dll
13 | *.so
14 | *.dylib
15 |
16 | # Executables
17 | *.exe
18 | *.out
19 | *.app
20 | *.i*86
21 | *.x86_64
22 | *.hex
23 | *.msi
24 |
25 | # Visual Studio temporary files
26 | *.suo
27 | *.user
28 | *.sbr
29 | *.pdb
30 | *.ncb
31 | *.sdf
32 | *.idb
33 |
34 | # Mac OS X
35 | .DS_Store
36 |
37 | # CMake out-of-source build files
38 | build
39 | Win32
40 | _CPack_Packages
41 |
42 | # Automake
43 | Makefile
44 | Makefile.in
45 | /ar-lib
46 | /mdate-sh
47 | /py-compile
48 | /test-driver
49 | /ylwrap
50 | .deps/
51 | .dirstamp
52 | libtool
53 | ltmain.sh
54 | *.lo
55 | *.la
56 | *.lai
57 | *.so*
58 | *.libs
59 |
60 | # Autoconf
61 | autom4te.cache
62 | configure~
63 | /autoscan.log
64 | /autoscan-*.log
65 | /aclocal.m4
66 | /compile
67 | /config.cache
68 | /config.guess
69 | /config.h.in
70 | /config.log
71 | /config.status
72 | /config.sub
73 | /configure
74 | /configure.scan
75 | /depcomp
76 | /install-sh
77 | /missing
78 | /stamp-h1
79 | *.tar.gz
80 | *.zip
81 |
82 | # Generated pkg-config file
83 | unicorn.pc
84 |
85 | # Python files
86 | *.pyc
87 |
88 | # Generated C source code
89 | unidata.c
90 | unidata.h
91 | _unicorn.h
92 |
--------------------------------------------------------------------------------
/examples/decode_utf16.c:
--------------------------------------------------------------------------------
1 | /*
2 | * This file is public domain.
3 | * You may use, modify, and distribute it without restriction.
4 | */
5 |
6 | // Examples are also documented online at:
7 | // .
8 |
9 | #include
10 | #include
11 | #include
12 |
13 | int main(int argc, char *argv[])
14 | {
15 | // This example demonstrates decoding the code points of a UTF-16 encoded string.
16 | // It prints each code point on its own line.
17 |
18 | const char16_t str[] = u"I 🕵️."; // I spy
19 | unisize i = 0;
20 | for (;;)
21 | {
22 | unichar cp = 0x0;
23 | unistat s = uni_next(str, -1, UNI_UTF16, &i, &cp);
24 | if (s == UNI_OK)
25 | {
26 | printf("U+%04X\n", cp);
27 | }
28 | else if (s == UNI_DONE)
29 | {
30 | break;
31 | }
32 | else if (s == UNI_BAD_ENCODING)
33 | {
34 | printf("malformed character at code unit %d\n", i);
35 | return 1;
36 | }
37 | else
38 | {
39 | puts("an internal error occured");
40 | return 2; // Something else went wrong (check the status code).
41 | }
42 | }
43 | return 0;
44 | }
45 |
--------------------------------------------------------------------------------
/examples/decode_utf32.c:
--------------------------------------------------------------------------------
1 | /*
2 | * This file is public domain.
3 | * You may use, modify, and distribute it without restriction.
4 | */
5 |
6 | // Examples are also documented online at:
7 | // .
8 |
9 | #include <unicorn.h>
10 | #include <stdio.h>
11 | #include <uchar.h>
12 |
13 | int main(int argc, char *argv[])
14 | {
15 | // This example demonstrates decoding the code points of a UTF-32 encoded string.
16 | // It prints each code point on its own line.
17 |
18 | const char32_t str[] = U"I 🕵️."; // I spy
19 | unisize i = 0;
20 | for (;;)
21 | {
22 | unichar cp = 0x0;
23 | unistat s = uni_next(str, -1, UNI_UTF32, &i, &cp);
24 | if (s == UNI_OK)
25 | {
26 | printf("U+%04X\n", cp);
27 | }
28 | else if (s == UNI_DONE)
29 | {
30 | break;
31 | }
32 | else if (s == UNI_BAD_ENCODING)
33 | {
34 | printf("malformed character at code unit %d\n", i);
35 | return 1;
36 | }
37 | else
38 | {
39 | puts("an internal error occured");
40 | return 2; // Something else went wrong (check the status code).
41 | }
42 | }
43 | return 0;
44 | }
45 |
--------------------------------------------------------------------------------
/man/uni_native.3:
--------------------------------------------------------------------------------
1 | .TH "UNICORN" "3" "Dec 6th 2025" "Unicorn 1.3.1"
2 | .SH NAME
3 | UNI_NATIVE \- native endian byte order
4 | .SH LIBRARY
5 | Embeddable Unicode Algorithms (libunicorn, -lunicorn)
6 | .SH SYNOPSIS
7 | .nf
8 | .B #include <unicorn.h>
9 | .PP
10 | .B #define UNI_NATIVE
11 | .fi
12 | .SH DESCRIPTION
13 | Text attribute bit flag that indicates the text is encoded in native byte order.
14 | The value of this flag will be identical to either \f[B]UNI_BIG\f[R](3) or \f[B]UNI_LITTLE\f[R](3).
15 | .PP
16 | The flag is intended for use with \f[B]UNI_UTF16\f[R](3) and \f[B]UNI_UTF32\f[R](3).
17 | It has no effect when used with \f[B]UNI_UTF8\f[R](3) or \f[B]UNI_SCALAR\f[R](3).
18 | .SH SEE ALSO
19 | .BR UNI_BIG (3),
20 | .BR UNI_LITTLE (3),
21 | .BR UNI_UTF16 (3),
22 | .BR UNI_UTF32 (3),
23 | .BR UNI_UTF8 (3),
24 | .BR UNI_SCALAR (3)
25 | .SH AUTHOR
26 | .UR https://railgunlabs.com
27 | Railgun Labs
28 | .UE .
29 | .SH INTERNET RESOURCES
30 | The online documentation is published on the
31 | .UR https://railgunlabs.com/unicorn
32 | Railgun Labs website
33 | .UE .
34 | .SH LICENSING
35 | Unicorn is distributed with its end-user license agreement (EULA).
36 | Please review the agreement for information on terms & conditions for accessing or otherwise using Unicorn and for a DISCLAIMER OF ALL WARRANTIES.
37 |
--------------------------------------------------------------------------------
/man/uni_is.3:
--------------------------------------------------------------------------------
1 | .TH "UNICORN" "3" "Dec 6th 2025" "Unicorn 1.3.1"
2 | .SH NAME
3 | uni_is \- binary property value
4 | .SH LIBRARY
5 | Embeddable Unicode Algorithms (libunicorn, -lunicorn)
6 | .SH SYNOPSIS
7 | .nf
8 | .B #include <unicorn.h>
9 | .PP
10 | .BI "bool uni_is(unichar " c ", unibp " p ");"
11 | .fi
12 | .SH DESCRIPTION
13 | This function retrieves the value of the binary property \f[I]p\f[R] for the character \f[I]c\f[R].
14 | .PP
15 | The binary character property \f[I]p\f[R] must be enabled in the JSON configuration file otherwise \f[C]false\f[R] is always returned.
16 | Instructions for enabling it are documented alongside property \f[I]p\f[R] in the \f[B]unibp\f[R](3) enumeration.
17 | .SH RETURN VALUE
18 | Returns \f[C]true\f[R] if character \f[I]c\f[R] has property \f[I]p\f[R].
19 | .SH SEE ALSO
20 | .BR unibp (3),
21 | .BR unichar (3)
22 | .SH AUTHOR
23 | .UR https://railgunlabs.com
24 | Railgun Labs
25 | .UE .
26 | .SH INTERNET RESOURCES
27 | The online documentation is published on the
28 | .UR https://railgunlabs.com/unicorn
29 | Railgun Labs website
30 | .UE .
31 | .SH LICENSING
32 | Unicorn is distributed with its end-user license agreement (EULA).
33 | Please review the agreement for information on terms & conditions for accessing or otherwise using Unicorn and for a DISCLAIMER OF ALL WARRANTIES.
34 |
--------------------------------------------------------------------------------
/man/CMakeLists.txt:
--------------------------------------------------------------------------------
# Section 3 manual pages shipped with the library (kept in alphabetical order).
set(MAN
    uni_big.3
    uni_caseconv.3
    uni_caseconvchk.3
    uni_casefold.3
    uni_casefoldchk.3
    uni_casefoldcmp.3
    uni_ccc.3
    uni_collate.3
    uni_compress.3
    uni_convert.3
    uni_decompress.3
    uni_encode.3
    uni_gc.3
    uni_getucdversion.3
    uni_getversion.3
    uni_is.3
    uni_little.3
    uni_native.3
    uni_next.3
    uni_nextbrk.3
    uni_norm.3
    uni_normchk.3
    uni_normcmp.3
    uni_normqchk.3
    uni_nulify.3
    uni_numval.3
    uni_prev.3
    uni_prevbrk.3
    uni_scalar.3
    uni_seterrfunc.3
    uni_setmemfunc.3
    uni_sortkeycmp.3
    uni_sortkeymk.3
    uni_tolower.3
    uni_totitle.3
    uni_toupper.3
    uni_trust.3
    uni_utf16.3
    uni_utf32.3
    uni_utf8.3
    uni_validate.3
    uniattr.3
    unibp.3
    unibreak.3
    unicaseconv.3
    unicasefold.3
    unichar.3
    unicorn.3
    unierrfunc.3
    unigc.3
    unimemfunc.3
    uninormchk.3
    uninormform.3
    unisize.3
    unistat.3
    unistrength.3
    uniweighting.3
)

# Install every page into man/man3 as part of the "doc" component.
install(FILES ${MAN} DESTINATION man/man3 COMPONENT doc)
60 |
--------------------------------------------------------------------------------
/src/ducet.h:
--------------------------------------------------------------------------------
1 | /*
2 | * Unicorn - Embeddable Unicode Algorithms
3 | * Copyright (c) 2024-2025 Railgun Labs
4 | *
5 | * This software is dual-licensed: you can redistribute it and/or modify
6 | * it under the terms of the GNU General Public License version 3 as
7 | * published by the Free Software Foundation. For the terms of this
8 | * license, see .
9 | *
10 | * Alternatively, you can license this software under a proprietary
11 | * license, as set out in .
12 | */
13 |
14 | #ifndef DUCET_H
15 | #define DUCET_H
16 |
17 | #include "normalize.h"
18 | #include "unidata.h"
19 |
20 | #if defined(UNICORN_FEATURE_COLLATION)
21 |
// Weights of one collation element, one 16-bit field per level.
typedef struct CE
{
    uint16_t primary;
    uint16_t secondary;
    uint16_t tertiary;
} CE;
28 |
29 | struct CEDecoder
30 | {
31 | int32_t ces_index;
32 | int32_t ces_count;
33 | struct NormalizeState normbuf;
34 | struct CharVec charvec;
35 | CE ces[UNICORN_MAX_COLLATION];
36 | };
37 |
38 | void uni_cebuf_init(struct CEDecoder *state);
39 | void uni_cebuf_free(struct CEDecoder *state);
40 | bool uni_cebuf_is_empty(const struct CEDecoder *state);
41 | CE uni_cebuf_pop(struct CEDecoder *state);
42 | unistat uni_cebuf_append_run(struct CEDecoder *state, struct unitext *text);
43 |
44 | #endif
45 |
46 | #endif // DUCET_H
47 |
--------------------------------------------------------------------------------
/man/uni_getversion.3:
--------------------------------------------------------------------------------
1 | .TH "UNICORN" "3" "Dec 6th 2025" "Unicorn 1.3.1"
2 | .SH NAME
3 | uni_getversion \- library version
4 | .SH LIBRARY
5 | Embeddable Unicode Algorithms (libunicorn, -lunicorn)
6 | .SH SYNOPSIS
7 | .nf
8 | .B #include <unicorn.h>
9 | .PP
10 | .BI "void uni_getversion(int32_t *" major ", int32_t *" minor ", int32_t *" patch ");"
11 | .fi
12 | .SH DESCRIPTION
13 | Writes the major, minor, and patch version of the Unicorn library to \f[I]major\f[R], \f[I]minor\f[R], and \f[I]patch\f[R], respectively.
14 | Null arguments are ignored.
15 | .SH EXAMPLES
16 | This example prints the major, minor, and patch version of the library to stdout.
17 | .PP
18 | .in +4n
19 | .EX
20 | #include <unicorn.h>
21 | #include <stdio.h>
22 |
23 | int main(void)
24 | {
25 | int32_t major, minor, patch;
26 | uni_getversion(&major, &minor, &patch);
27 | printf("%d.%d.%d", major, minor, patch);
28 | return 0;
29 | }
30 | .EE
31 | .in
32 | .SH AUTHOR
33 | .UR https://railgunlabs.com
34 | Railgun Labs
35 | .UE .
36 | .SH INTERNET RESOURCES
37 | The online documentation is published on the
38 | .UR https://railgunlabs.com/unicorn
39 | Railgun Labs website
40 | .UE .
41 | .SH LICENSING
42 | Unicorn is distributed with its end-user license agreement (EULA).
43 | Please review the agreement for information on terms & conditions for accessing or otherwise using Unicorn and for a DISCLAIMER OF ALL WARRANTIES.
44 |
--------------------------------------------------------------------------------
/man/uni_ccc.3:
--------------------------------------------------------------------------------
1 | .TH "UNICORN" "3" "Dec 6th 2025" "Unicorn 1.3.1"
2 | .SH NAME
3 | uni_ccc \- canonical combining class
4 | .SH LIBRARY
5 | Embeddable Unicode Algorithms (libunicorn, -lunicorn)
6 | .SH SYNOPSIS
7 | .nf
8 | .B #include <unicorn.h>
9 | .PP
10 | .BI "uint8_t uni_ccc(unichar " c ");"
11 | .fi
12 | .SH DESCRIPTION
13 | This function returns the \f[C]Canonical_Combining_Class\f[R] property for the code point \f[I]c\f[R].
14 | The canonical combining class property is used during the Unicode normalization algorithm to sort combining characters.
15 | .PP
16 | Support for the canonical combining class property must be enabled in the JSON configuration file otherwise the function will always return zero.
17 | .PP
18 | .in +4n
19 | .EX
20 | {
21 | "characterProperties": [
22 | "Canonical_Combining_Class"
23 | ]
24 | }
25 | .EE
26 | .in
27 | .SH RETURN VALUE
28 | The canonical combining class of \f[I]c\f[R].
29 | .SH SEE ALSO
30 | .BR unichar (3)
31 | .SH AUTHOR
32 | .UR https://railgunlabs.com
33 | Railgun Labs
34 | .UE .
35 | .SH INTERNET RESOURCES
36 | The online documentation is published on the
37 | .UR https://railgunlabs.com/unicorn
38 | Railgun Labs website
39 | .UE .
40 | .SH LICENSING
41 | Unicorn is distributed with its end-user license agreement (EULA).
42 | Please review the agreement for information on terms & conditions for accessing or otherwise using Unicorn and for a DISCLAIMER OF ALL WARRANTIES.
43 |
--------------------------------------------------------------------------------
/man/uni_tolower.3:
--------------------------------------------------------------------------------
1 | .TH "UNICORN" "3" "Dec 6th 2025" "Unicorn 1.3.1"
2 | .SH NAME
3 | uni_tolower \- simple lower case mapping
4 | .SH LIBRARY
5 | Embeddable Unicode Algorithms (libunicorn, -lunicorn)
6 | .SH SYNOPSIS
7 | .nf
8 | .B #include <unicorn.h>
9 | .PP
10 | .BI "unichar uni_tolower(unichar " c ");"
11 | .fi
12 | .SH DESCRIPTION
13 | This function returns the simple lower case mapping for the code point \f[I]c\f[R].
14 | For the full lowercase mappings use \f[B]uni_caseconv\f[R](3) with \f[B]UNI_LOWER\f[R].
15 | .PP
16 | Support for simple lower case mappings must be enabled in the JSON configuration file otherwise the function will always return \f[I]c\f[R].
17 | .PP
18 | .in +4n
19 | .EX
20 | {
21 | "characterProperties": [
22 | "Simple_Lowercase_Mapping"
23 | ]
24 | }
25 | .EE
26 | .in
27 | .SH RETURN VALUE
28 | The simple lower case mapping for \f[I]c\f[R].
29 | .SH SEE ALSO
30 | .BR uni_caseconv (3),
31 | .BR unicaseconv (3),
32 | .BR unichar (3)
33 | .SH AUTHOR
34 | .UR https://railgunlabs.com
35 | Railgun Labs
36 | .UE .
37 | .SH INTERNET RESOURCES
38 | The online documentation is published on the
39 | .UR https://railgunlabs.com/unicorn
40 | Railgun Labs website
41 | .UE .
42 | .SH LICENSING
43 | Unicorn is distributed with its end-user license agreement (EULA).
44 | Please review the agreement for information on terms & conditions for accessing or otherwise using Unicorn and for a DISCLAIMER OF ALL WARRANTIES.
45 |
--------------------------------------------------------------------------------
/man/uni_toupper.3:
--------------------------------------------------------------------------------
1 | .TH "UNICORN" "3" "Dec 6th 2025" "Unicorn 1.3.1"
2 | .SH NAME
3 | uni_toupper \- simple upper case mapping
4 | .SH LIBRARY
5 | Embeddable Unicode Algorithms (libunicorn, -lunicorn)
6 | .SH SYNOPSIS
7 | .nf
8 | .B #include <unicorn.h>
9 | .PP
10 | .BI "unichar uni_toupper(unichar " c ");"
11 | .fi
12 | .SH DESCRIPTION
13 | This function returns the simple upper case mapping for the code point \f[I]c\f[R].
14 | For the full uppercase mappings use \f[B]uni_caseconv\f[R](3) with \f[B]UNI_UPPER\f[R].
15 | .PP
16 | Support for simple upper case mappings must be enabled in the JSON configuration file otherwise the function will always return \f[I]c\f[R].
17 | .PP
18 | .in +4n
19 | .EX
20 | {
21 | "characterProperties": [
22 | "Simple_Uppercase_Mapping"
23 | ]
24 | }
25 | .EE
26 | .in
27 | .SH RETURN VALUE
28 | The simple upper case mapping for \f[I]c\f[R].
29 | .SH SEE ALSO
30 | .BR uni_caseconv (3),
31 | .BR unicaseconv (3),
32 | .BR unichar (3)
33 | .SH AUTHOR
34 | .UR https://railgunlabs.com
35 | Railgun Labs
36 | .UE .
37 | .SH INTERNET RESOURCES
38 | The online documentation is published on the
39 | .UR https://railgunlabs.com/unicorn
40 | Railgun Labs website
41 | .UE .
42 | .SH LICENSING
43 | Unicorn is distributed with its end-user license agreement (EULA).
44 | Please review the agreement for information on terms & conditions for accessing or otherwise using Unicorn and for a DISCLAIMER OF ALL WARRANTIES.
45 |
--------------------------------------------------------------------------------
/man/uni_totitle.3:
--------------------------------------------------------------------------------
1 | .TH "UNICORN" "3" "Dec 6th 2025" "Unicorn 1.3.1"
2 | .SH NAME
3 | uni_totitle \- simple title case mapping
4 | .SH LIBRARY
5 | Embeddable Unicode Algorithms (libunicorn, -lunicorn)
6 | .SH SYNOPSIS
7 | .nf
8 | .B #include <unicorn.h>
9 | .PP
10 | .BI "unichar uni_totitle(unichar " c ");"
11 | .fi
12 | .SH DESCRIPTION
13 | This function returns the simple title case mapping for the code point \f[I]c\f[R].
14 | For the full title case mappings use \f[B]uni_caseconv\f[R](3) with \f[B]UNI_TITLE\f[R].
15 | .PP
16 | Support for simple title case mappings must be enabled in the JSON configuration file otherwise the function will always return \f[I]c\f[R].
17 | .PP
18 | .in +4n
19 | .EX
20 | {
21 | "characterProperties": [
22 | "Simple_Titlecase_Mapping"
23 | ]
24 | }
25 | .EE
26 | .in
27 | .SH RETURN VALUE
28 | The simple title case mapping for \f[I]c\f[R].
29 | .SH SEE ALSO
30 | .BR uni_caseconv (3),
31 | .BR unicaseconv (3),
32 | .BR unichar (3)
33 | .SH AUTHOR
34 | .UR https://railgunlabs.com
35 | Railgun Labs
36 | .UE .
37 | .SH INTERNET RESOURCES
38 | The online documentation is published on the
39 | .UR https://railgunlabs.com/unicorn
40 | Railgun Labs website
41 | .UE .
42 | .SH LICENSING
43 | Unicorn is distributed with its end-user license agreement (EULA).
44 | Please review the agreement for information on terms & conditions for accessing or otherwise using Unicorn and for a DISCLAIMER OF ALL WARRANTIES.
45 |
--------------------------------------------------------------------------------
/man/uni_getucdversion.3:
--------------------------------------------------------------------------------
1 | .TH "UNICORN" "3" "Dec 6th 2025" "Unicorn 1.3.1"
2 | .SH NAME
3 | uni_getucdversion \- unicode standard version
4 | .SH LIBRARY
5 | Embeddable Unicode Algorithms (libunicorn, -lunicorn)
6 | .SH SYNOPSIS
7 | .nf
8 | .B #include <unicorn.h>
9 | .PP
10 | .BI "void uni_getucdversion(int32_t *" major ", int32_t *" minor ", int32_t *" patch ");"
11 | .fi
12 | .SH DESCRIPTION
13 | Writes the major, minor, and patch version of the Unicode Standard to \f[I]major\f[R], \f[I]minor\f[R], and \f[I]patch\f[R], respectively.
14 | Null arguments are ignored.
15 | .SH EXAMPLES
16 | This example prints the major, minor, and patch version of the Unicode Standard implemented by the library to stdout.
17 | .PP
18 | .in +4n
19 | .EX
20 | #include <unicorn.h>
21 | #include <stdio.h>
22 |
23 | int main(void)
24 | {
25 | int32_t major, minor, patch;
26 | uni_getucdversion(&major, &minor, &patch);
27 | printf("%d.%d.%d", major, minor, patch);
28 | return 0;
29 | }
30 | .EE
31 | .in
32 | .SH AUTHOR
33 | .UR https://railgunlabs.com
34 | Railgun Labs
35 | .UE .
36 | .SH INTERNET RESOURCES
37 | The online documentation is published on the
38 | .UR https://railgunlabs.com/unicorn
39 | Railgun Labs website
40 | .UE .
41 | .SH LICENSING
42 | Unicorn is distributed with its end-user license agreement (EULA).
43 | Please review the agreement for information on terms & conditions for accessing or otherwise using Unicorn and for a DISCLAIMER OF ALL WARRANTIES.
44 |
--------------------------------------------------------------------------------
/src/normalize.h:
--------------------------------------------------------------------------------
1 | /*
2 | * Unicorn - Embeddable Unicode Algorithms
3 | * Copyright (c) 2024-2025 Railgun Labs
4 | *
5 | * This software is dual-licensed: you can redistribute it and/or modify
6 | * it under the terms of the GNU General Public License version 3 as
7 | * published by the Free Software Foundation. For the terms of this
8 | * license, see .
9 | *
10 | * Alternatively, you can license this software under a proprietary
11 | * license, as set out in .
12 | */
13 |
14 | #ifndef NORMALIZE_H
15 | #define NORMALIZE_H
16 |
17 | #include "common.h"
18 | #include "charvec.h"
19 |
20 | typedef bool (*IsStable)(unichar cp);
21 |
22 | // Incremental normalization state book keeping.
23 | struct NormalizeState
24 | {
25 | int32_t offset;
26 | int32_t r;
27 | IsStable is_stable;
28 |
29 | // Tracks an unstable span of characters within unnormalized text.
30 | // This is the span of text that will be normalized.
31 | struct CharVec span;
32 |
33 | // Buffer with decomposed characters.
34 | struct CharVec decomp;
35 | };
36 |
37 | void uni_norm_init(struct NormalizeState *state, IsStable is_stable);
38 | void uni_norm_free(struct NormalizeState *state);
39 |
40 | void uni_norm_reset(struct NormalizeState *ns);
41 |
42 | unistat uni_norm_append_run(struct NormalizeState *state, struct unitext *it);
43 |
44 | bool uni_is_stable_nfd(unichar cp);
45 |
46 | #endif // NORMALIZE_H
47 |
--------------------------------------------------------------------------------
/scripts/ccc.py:
--------------------------------------------------------------------------------
1 | # Unicorn - Embeddable Unicode Algorithms
2 | # Copyright (c) 2024-2025 Railgun Labs
3 | #
4 | # This software is dual-licensed: you can redistribute it and/or modify
5 | # it under the terms of the GNU General Public License version 3 as
6 | # published by the Free Software Foundation. For the terms of this
7 | # license, see .
8 | #
9 | # Alternatively, you can license this software under a proprietary
10 | # license, as set out in .
11 |
12 | from typing import Dict
13 | import zipfile
14 | import json
15 |
16 | from .config import Config
17 | from .tools import Codespace
18 | from .feature import Feature
19 | from .basics import StorageSize, SerializationData
20 |
class CanonicalCombiningClass(Feature):
    """Feature that loads the ``canonical_combining_class`` property into the codespace."""

    def pre_process(self, codespace: Codespace) -> None:
        """Register ``canonical_combining_class`` with default 0, stored as UINT8."""
        codespace.register_property("canonical_combining_class", 0, StorageSize.UINT8)

    def process(self, archive: zipfile.ZipFile, codespace: Codespace, config: Config) -> SerializationData:
        """Read ``ccc.json`` from *archive* and set the property for every listed code point.

        Returns serialization data whose public header defines ``UNICORN_FEATURE_CCC``.
        """
        # The context manager closes the archive member even if reading or
        # JSON parsing raises.
        with archive.open('ccc.json') as file:
            mappings: Dict[str, int] = json.loads(file.read())["ccc"]

        ccc_property = codespace.get("canonical_combining_class")
        for cp, ccc in mappings.items():
            # Keys are parsed as hexadecimal code point values.
            codespace.set(int(cp, 16), ccc_property, ccc)

        public_header = "#define UNICORN_FEATURE_CCC\n"
        return SerializationData(public_header=public_header)
36 |
--------------------------------------------------------------------------------
/man/uni_trust.3:
--------------------------------------------------------------------------------
1 | .TH "UNICORN" "3" "Dec 6th 2025" "Unicorn 1.3.1"
2 | .SH NAME
3 | UNI_TRUST \- well-formed text
4 | .SH LIBRARY
5 | Embeddable Unicode Algorithms (libunicorn, -lunicorn)
6 | .SH SYNOPSIS
7 | .nf
8 | .B #include <unicorn.h>
9 | .PP
10 | .B #define UNI_TRUST 0x40u
11 | .fi
12 | .SH DESCRIPTION
13 | Text attribute bit flag that indicates the encoded text is well-formed.
14 | .PP
15 | Under normal usage Unicorn processes text with the assumption that it might be malformed.
16 | This means it performs safety checks which are redundant if the text has already been validated.
17 | The purpose of this flag is to inform Unicorn that it can skip redundant validation checks thereby improving performance.
18 | .PP
19 | Usage of this flag means functions will never return \f[B]UNI_BAD_ENCODING\f[R].
20 | .PP
21 | Some functions, like \f[B]uni_norm\f[R](3), accept an input buffer and an output buffer.
22 | Using this flag with an output buffer will produce \f[B]UNI_BAD_OPERATION\f[R].
23 | .SH SEE ALSO
24 | .BR unistat (3),
25 | .BR uni_norm (3),
26 | .BR uni_validate (3)
27 | .SH AUTHOR
28 | .UR https://railgunlabs.com
29 | Railgun Labs
30 | .UE .
31 | .SH INTERNET RESOURCES
32 | The online documentation is published on the
33 | .UR https://railgunlabs.com/unicorn
34 | Railgun Labs website
35 | .UE .
36 | .SH LICENSING
37 | Unicorn is distributed with its end-user license agreement (EULA).
38 | Please review the agreement for information on terms & conditions for accessing or otherwise using Unicorn and for a DISCLAIMER OF ALL WARRANTIES.
39 |
--------------------------------------------------------------------------------
/man/unimemfunc.3:
--------------------------------------------------------------------------------
1 | .TH "UNICORN" "3" "Dec 6th 2025" "Unicorn 1.3.1"
2 | .SH NAME
3 | unimemfunc \- memory allocation function
4 | .SH LIBRARY
5 | Embeddable Unicode Algorithms (libunicorn, -lunicorn)
6 | .SH SYNOPSIS
7 | .nf
8 | .B #include <unicorn.h>
9 | .PP
10 | .BI "typedef void *(*unimemfunc)(void *" user_data ", void *" ptr ", size_t " old_size ", size_t " new_size ");"
11 | .fi
12 | .SH DESCRIPTION
13 | Defines the function signature for a custom memory management implementation.
14 | The memory manager is responsible for allocating, reallocating, and freeing memory.
15 | The implementation must behave like \f[B]realloc\f[R](3) or \f[B]free\f[R](3) depending upon its arguments.
16 | .PP
17 | When \f[I]new_size\f[R] is non-zero, the allocator must behave like \f[B]realloc\f[R](3).
18 | If \f[I]ptr\f[R] is NULL and \f[I]old_size\f[R] is zero, the allocator must behave like \f[B]malloc\f[R](3).
19 | .PP
20 | When \f[I]new_size\f[R] is zero, the allocator must behave like \f[B]free\f[R](3).
21 | It must also return NULL to the caller.
22 | .SH SEE ALSO
23 | .BR uni_setmemfunc (3)
24 | .SH AUTHOR
25 | .UR https://railgunlabs.com
26 | Railgun Labs
27 | .UE .
28 | .SH INTERNET RESOURCES
29 | The online documentation is published on the
30 | .UR https://railgunlabs.com/unicorn
31 | Railgun Labs website
32 | .UE .
33 | .SH LICENSING
34 | Unicorn is distributed with its end-user license agreement (EULA).
35 | Please review the agreement for information on terms & conditions for accessing or otherwise using Unicorn and for a DISCLAIMER OF ALL WARRANTIES.
36 |
--------------------------------------------------------------------------------
/scripts/feature.py:
--------------------------------------------------------------------------------
1 | # Unicorn - Embeddable Unicode Algorithms
2 | # Copyright (c) 2024-2025 Railgun Labs
3 | #
4 | # This software is dual-licensed: you can redistribute it and/or modify
5 | # it under the terms of the GNU General Public License version 3 as
6 | # published by the Free Software Foundation. For the terms of this
7 | # license, see .
8 | #
9 | # Alternatively, you can license this software under a proprietary
10 | # license, as set out in .
11 |
12 | from typing import Set, Tuple, Type
13 | import zipfile
14 |
15 | from .basics import StorageSize, SerializationData
16 | from .tools import Codespace
17 | from .config import Config
18 |
class Feature:
    """Base class for generator features.

    Every hook has a do-nothing default, so a subclass overrides only the
    stages it needs.
    """

    def __init__(self) -> None:
        self.total_size = 0

    def pre_process(self, codespace: Codespace) -> None:
        """Hook that receives the codespace; the default does nothing."""
        return None

    def process(self, archive: zipfile.ZipFile, codespace: Codespace, config: Config) -> SerializationData:
        """Hook that returns this feature's serialization data; empty by default."""
        empty = SerializationData()
        return empty

    def post_process(self, archive: zipfile.ZipFile, codespace: Codespace, config: Config) -> SerializationData:
        """Hook with the same signature as ``process``; empty by default."""
        empty = SerializationData()
        return empty

    @staticmethod
    def dependencies() -> Set[Type['Feature']]:
        """Return the feature classes this feature depends on; none by default."""
        no_dependencies: Set[Type['Feature']] = set()
        return no_dependencies
31 |
# (name, default value, storage size) triple for the "flags" property.
FLAGS_PROPERTY: Tuple[str, int, StorageSize] = (
    "flags",
    0,
    StorageSize.UINT8,
)
33 |
# Single-bit flag masks; each constant occupies its own bit.
IS_COMPOSABLE = 1 << 0
IS_CASED = 1 << 1
IS_CASE_IGNORABLE = 1 << 2
IS_NORMALIZATION_NEEDED = 1 << 3
IS_CHANGES_WHEN_CASEFOLDED = 1 << 4
39 |
--------------------------------------------------------------------------------
/examples/segment_text.c:
--------------------------------------------------------------------------------
1 | /*
2 | * This file is public domain.
3 | * You may use, modify, and distribute it without restriction.
4 | */
5 |
6 | // Examples are also documented online at:
7 | // .
8 |
9 | #include <unicorn.h>
10 | #include <stdio.h>
11 |
12 | int main(int argc, char *argv[])
13 | {
14 | // This example prints the code unit indices of the extended grapheme clusters in the string.
15 | // A grapheme cluster is a user-perceived character. It correspond to a single graphical
16 | // character on the end-users display. A grapheme can consist of multiple code points.
17 |
18 | // Change 'UNI_GRAPHEME' to 'UNI_WORD' or 'UNI_SENTENCE' and observe how it affects the
19 | // printed break positions.
20 |
21 | const char str[] = u8"👨🏼🚀👨🏽🚀 landed on the 🌕";
22 | unisize break_at = 0;
23 | for (;;)
24 | {
25 | unistat s = uni_nextbrk(UNI_GRAPHEME, str, -1, UNI_UTF8, &break_at);
26 | if (s == UNI_OK)
27 | {
28 | printf("%d\n", break_at);
29 | }
30 | else if (s == UNI_DONE)
31 | {
32 | break;
33 | }
34 | else if (s == UNI_BAD_ENCODING)
35 | {
36 | printf("malformed character at %d\n", break_at);
37 | return 1;
38 | }
39 | else
40 | {
41 | puts("an internal error occured");
42 | return 2; // Something else went wrong (check the status code).
43 | }
44 | }
45 | return 0;
46 | }
47 |
--------------------------------------------------------------------------------
/scripts/general_category.py:
--------------------------------------------------------------------------------
1 | # Unicorn - Embeddable Unicode Algorithms
2 | # Copyright (c) 2024-2025 Railgun Labs
3 | #
4 | # This software is dual-licensed: you can redistribute it and/or modify
5 | # it under the terms of the GNU General Public License version 3 as
6 | # published by the Free Software Foundation. For the terms of this
7 | # license, see .
8 | #
9 | # Alternatively, you can license this software under a proprietary
10 | # license, as set out in .
11 |
12 | from typing import Dict
13 | import zipfile
14 | import json
15 |
16 | from .config import Config
17 | from .tools import Codespace
18 | from .feature import Feature
19 | from .basics import StorageSize, SerializationData
20 |
class GeneralCategoryProperty(Feature):
    """Feature that loads the ``general_category`` property into the codespace."""

    def pre_process(self, codespace: Codespace) -> None:
        """Register ``general_category`` with default 29, stored as UINT8."""
        # NOTE(review): 29 is the value used for code points absent from
        # gc.json -- presumably the "unassigned" category; confirm against
        # the generated enumeration.
        codespace.register_property("general_category", 29, StorageSize.UINT8)

    def process(self, archive: zipfile.ZipFile, codespace: Codespace, config: Config) -> SerializationData:
        """Read ``gc.json`` from *archive* and set the property for every listed code point.

        Returns serialization data whose public header defines ``UNICORN_FEATURE_GC``.
        """
        # The context manager closes the archive member even if reading or
        # JSON parsing raises.
        with archive.open('gc.json') as file:
            data = json.loads(file.read())
        characters: Dict[str, int] = data["characters"]

        gc_property = codespace.get("general_category")
        for cp, value in characters.items():
            # Keys are parsed as hexadecimal code point values.
            codespace.set(int(cp, 16), gc_property, value)

        public_header = "#define UNICORN_FEATURE_GC\n"
        return SerializationData(public_header=public_header)
37 |
--------------------------------------------------------------------------------
/features.json:
--------------------------------------------------------------------------------
1 | {
2 | "version": "1.0",
3 | "endian": "native",
4 | "hasStandardAllocators": true,
5 | "characterStorage": "uint32_t",
6 | "optimizeFor": "speed",
7 | "stackBufferSize": 32,
8 | "algorithms": {
9 | "normalization": [
10 | "NFC",
11 | "NFD"
12 | ],
13 | "normalizationQuickCheck": true,
14 | "caseConversion": [
15 | "lower",
16 | "title",
17 | "upper"
18 | ],
19 | "caseFolding": [
20 | "default",
21 | "canonical"
22 | ],
23 | "segmentation": [
24 | "grapheme",
25 | "word",
26 | "sentence"
27 | ],
28 | "compression": true,
29 | "collation": true
30 | },
31 | "characterProperties": [
32 | "Alphabetic",
33 | "Canonical_Combining_Class",
34 | "Dash",
35 | "Diacritic",
36 | "Extender",
37 | "General_Category",
38 | "Hex_Digit",
39 | "Ideographic",
40 | "Lowercase",
41 | "Math",
42 | "Noncharacter_Code_Point",
43 | "Numeric_Value",
44 | "Quotation_Mark",
45 | "Simple_Lowercase_Mapping",
46 | "Simple_Titlecase_Mapping",
47 | "Simple_Uppercase_Mapping",
48 | "Terminal_Punctuation",
49 | "Unified_Ideograph",
50 | "Uppercase",
51 | "White_Space"
52 | ],
53 | "encodingForms": [
54 | "UTF-8",
55 | "UTF-16",
56 | "UTF-32"
57 | ]
58 | }
59 |
--------------------------------------------------------------------------------
/man/uni_normchk.3:
--------------------------------------------------------------------------------
1 | .TH "UNICORN" "3" "Dec 6th 2025" "Unicorn 1.3.1"
2 | .SH NAME
3 | uni_normchk \- normalization check
4 | .SH LIBRARY
5 | Embeddable Unicode Algorithms (libunicorn, -lunicorn)
6 | .SH SYNOPSIS
7 | .nf
8 | .B #include <unicorn.h>
9 | .PP
10 | .BI "unistat uni_normchk(uninormform " form ", const void *" text ", unisize " text_len ", uniattr " text_attr ", bool *" result ");"
11 | .fi
12 | .SH DESCRIPTION
13 | This function checks if \f[I]text\f[R] is normalized.
14 | It's more "expensive" than \f[B]uni_normqchk\f[R](3), but returns a definitive yes/no answer.
15 | .SH RETURN VALUE
16 | .TP
17 | UNI_OK
18 | On success.
19 | .TP
20 | UNI_BAD_OPERATION
21 | If \f[I]text\f[R] or \f[I]result\f[R] is null.
22 | .TP
23 | UNI_BAD_ENCODING
24 | If \f[I]text\f[R] is malformed; this is never returned if \f[I]text_attr\f[R] has \f[B]UNI_TRUST\f[R](3).
25 | .TP
26 | UNI_FEATURE_DISABLED
27 | If Unicorn was built without support for \f[I]form\f[R].
28 | .SH SEE ALSO
29 | .BR uni_normqchk (3),
30 | .BR uninormchk (3),
31 | .BR UNI_TRUST (3),
32 | .BR uninormform (3),
33 | .BR unisize (3),
34 | .BR uniattr (3)
35 | .SH AUTHOR
36 | .UR https://railgunlabs.com
37 | Railgun Labs
38 | .UE .
39 | .SH INTERNET RESOURCES
40 | The online documentation is published on the
41 | .UR https://railgunlabs.com/unicorn
42 | Railgun Labs website
43 | .UE .
44 | .SH LICENSING
45 | Unicorn is distributed with its end-user license agreement (EULA).
46 | Please review the agreement for information on terms & conditions for accessing or otherwise using Unicorn and for a DISCLAIMER OF ALL WARRANTIES.
47 |
--------------------------------------------------------------------------------
/scripts/encoding.py:
--------------------------------------------------------------------------------
1 | # Unicorn - Embeddable Unicode Algorithms
2 | # Copyright (c) 2024-2025 Railgun Labs
3 | #
4 | # This software is dual-licensed: you can redistribute it and/or modify
5 | # it under the terms of the GNU General Public License version 3 as
6 | # published by the Free Software Foundation. For the terms of this
7 | # license, see .
8 | #
9 | # Alternatively, you can license this software under a proprietary
10 | # license, as set out in .
11 |
12 | import zipfile
13 |
14 | from .config import Config
15 | from .tools import Codespace
16 | from .feature import Feature
17 | from .basics import SerializationData
18 |
class EncodingUTF8(Feature):
    """Feature that emits the UNICORN_FEATURE_ENCODING_UTF8 macro definition."""

    def process(self, archive: zipfile.ZipFile, codespace: Codespace, config: Config) -> SerializationData:
        """Return serialization data carrying only the UTF-8 feature macro.

        None of the arguments are read; the result has just its
        ``public_header`` populated.
        """
        return SerializationData(public_header="#define UNICORN_FEATURE_ENCODING_UTF8\n")
23 |
class EncodingUTF16(Feature):
    """Feature that emits the UNICORN_FEATURE_ENCODING_UTF16 macro definition."""

    def process(self, archive: zipfile.ZipFile, codespace: Codespace, config: Config) -> SerializationData:
        """Return serialization data carrying only the UTF-16 feature macro.

        None of the arguments are read; the result has just its
        ``public_header`` populated.
        """
        return SerializationData(public_header="#define UNICORN_FEATURE_ENCODING_UTF16\n")
28 |
class EncodingUTF32(Feature):
    """Feature that emits the UNICORN_FEATURE_ENCODING_UTF32 macro definition."""

    def process(self, archive: zipfile.ZipFile, codespace: Codespace, config: Config) -> SerializationData:
        """Return serialization data carrying only the UTF-32 feature macro.

        None of the arguments are read; the result has just its
        ``public_header`` populated.
        """
        return SerializationData(public_header="#define UNICORN_FEATURE_ENCODING_UTF32\n")
33 |
--------------------------------------------------------------------------------
/man/uninormform.3:
--------------------------------------------------------------------------------
1 | .TH "UNICORN" "3" "Dec 6th 2025" "Unicorn 1.3.1"
2 | .SH NAME
3 | uninormform \- unicode normalization forms
4 | .SH LIBRARY
5 | Embeddable Unicode Algorithms (libunicorn, -lunicorn)
6 | .SH SYNOPSIS
7 | .nf
8 | .B #include <unicorn.h>
9 | .PP
10 | .B enum uninormform {
11 | .RS
12 | .B UNI_NFC,
13 | .B UNI_NFD,
14 | .RE
15 | .B };
16 | .fi
17 | .SH DESCRIPTION
18 | Normalization forms define how strings should be transformed for the purposes of testing equivalence.
19 | .SH CONSTANTS
20 | .TP
21 | .BR UNI_NFC
22 | Performs Canonical Decomposition followed by Canonical Composition.
23 | .IP
24 | Support for canonical composition can be enabled in the JSON configuration file.
25 | .IP
26 | .in +4n
27 | .EX
28 | {
29 | "algorithms": {
30 | "normalization": ["NFC"]
31 | }
32 | }
33 | .EE
34 | .in
35 | .TP
36 | .BR UNI_NFD
37 | Performs Canonical Decomposition.
38 | .IP
39 | Support for canonical decomposition can be enabled in the JSON configuration file.
40 | .IP
41 | .in +4n
42 | .EX
43 | {
44 | "algorithms": {
45 | "normalization": ["NFD"]
46 | }
47 | }
48 | .EE
49 | .in
50 | .SH AUTHOR
51 | .UR https://railgunlabs.com
52 | Railgun Labs
53 | .UE .
54 | .SH INTERNET RESOURCES
55 | The online documentation is published on the
56 | .UR https://railgunlabs.com/unicorn
57 | Railgun Labs website
58 | .UE .
59 | .SH LICENSING
60 | Unicorn is distributed with its end-user license agreement (EULA).
61 | Please review the agreement for information on terms & conditions for accessing or otherwise using Unicorn and for a DISCLAIMER OF ALL WARRANTIES.
62 |
--------------------------------------------------------------------------------
/man/unichar.3:
--------------------------------------------------------------------------------
1 | .TH "UNICORN" "3" "Dec 6th 2025" "Unicorn 1.3.1"
2 | .SH NAME
3 | unichar \- code point
4 | .SH LIBRARY
5 | Embeddable Unicode Algorithms (libunicorn, -lunicorn)
6 | .SH SYNOPSIS
7 | .nf
8 | .B #include <unicorn.h>
9 | .PP
10 | .BI "typedef uint32_t unichar;"
11 | .fi
12 | .SH DESCRIPTION
13 | This integer represents a Unicode code point or Unicode scalar value depending on how it's used.
14 | .PP
15 | The integer type used as the underlying storage for this type is configurable.
16 | This helps simplify integration with existing applications which may already have defined their own Unicode character type.
17 | The type chosen can be signed or unsigned, but it must be large enough to accommodate the entire Unicode character repertoire.
18 | As of Unicode 17.0, that means an integer with at least 21 bits of storage.
19 | .PP
20 | The snippet below demonstrates how to redefine \f[B]unichar\f[R] as a signed 64-bit integer.
21 | Using a 64-bit integer is wasteful, since only 21 bits are needed; it was chosen for example purposes only.
22 | .PP
23 | .in +4n
24 | .EX
25 | {
26 | "characterStorage": "int64_t"
27 | }
28 | .EE
29 | .in
30 | .SH AUTHOR
31 | .UR https://railgunlabs.com
32 | Railgun Labs
33 | .UE .
34 | .SH INTERNET RESOURCES
35 | The online documentation is published on the
36 | .UR https://railgunlabs.com/unicorn
37 | Railgun Labs website
38 | .UE .
39 | .SH LICENSING
40 | Unicorn is distributed with its end-user license agreement (EULA).
41 | Please review the agreement for information on terms & conditions for accessing or otherwise using Unicorn and for a DISCLAIMER OF ALL WARRANTIES.
42 |
--------------------------------------------------------------------------------
/src/charvec.h:
--------------------------------------------------------------------------------
1 | /*
2 | * Unicorn - Embeddable Unicode Algorithms
3 | * Copyright (c) 2024-2025 Railgun Labs
4 | *
5 | * This software is dual-licensed: you can redistribute it and/or modify
6 | * it under the terms of the GNU General Public License version 3 as
7 | * published by the Free Software Foundation. For the terms of this
8 | * license, see .
9 | *
10 | * Alternatively, you can license this software under a proprietary
11 | * license, as set out in .
12 | */
13 |
14 | // Implements a heap-allocated growable character buffer.
15 |
16 | #ifndef CHARVEC_H
17 | #define CHARVEC_H
18 |
19 | #include "common.h"
20 |
// Growable buffer of 'unichar' values. The file-level comment describes it as
// heap-allocated; the embedded 'scratch' array suggests small contents may be
// kept inside the struct itself -- NOTE(review): confirm against charvec.c.
struct CharVec
{
    unisize length;   // Value reported by uni_charvec_length().
    unisize capacity; // Value reported by uni_charvec_capacity().
    unichar *chars;   // Pointer to the character storage; presumably addresses either 'scratch' or a heap block -- confirm.
    unichar scratch[UNICORN_STACK_BUFFER_SIZE]; // Fixed-size array embedded directly in the struct.
};
28 |
// Buffer management routines; their definitions are not part of this header.
// NOTE(review): the one-line summaries below are inferred from the names and
// signatures only -- confirm each against the implementation.

// Presumably prepares 'buffer' for first use.
void uni_charvec_init(struct CharVec *buffer);
// Presumably releases any storage owned by 'buffer'.
void uni_charvec_free(struct CharVec *buffer);
// Presumably empties 'buffer' so it can be reused.
void uni_charvec_reset(struct CharVec *buffer);
// Presumably appends 'chars_count' characters; returns a status code, so it can report failure.
unistat uni_charvec_append(struct CharVec *buffer, const unichar *chars, unisize chars_count);
// Presumably ensures room for 'new_capacity' characters; returns a status code, so it can report failure.
unistat uni_charvec_reserve(struct CharVec *buffer, unisize new_capacity);
// Presumably removes the character at index 'i'.
void uni_charvec_remove(struct CharVec *buf, unisize i);
// Returns no status, unlike uni_charvec_append(); presumably the caller must have reserved space first -- confirm.
void uni_charvec_append_unsafe(struct CharVec *cb, const unichar *chars, unisize chars_count);
36 |
37 | static inline unisize uni_charvec_length(const struct CharVec *buffer)
38 | {
39 | return buffer->length;
40 | }
41 |
42 | static inline unisize uni_charvec_capacity(const struct CharVec *buffer)
43 | {
44 | return buffer->capacity;
45 | }
46 |
47 | #endif // CHARVEC_H
48 |
--------------------------------------------------------------------------------
/man/uni_gc.3:
--------------------------------------------------------------------------------
1 | .TH "UNICORN" "3" "Dec 6th 2025" "Unicorn 1.3.1"
2 | .SH NAME
3 | uni_gc \- general category
4 | .SH LIBRARY
5 | Embeddable Unicode Algorithms (libunicorn, -lunicorn)
6 | .SH SYNOPSIS
7 | .nf
8 | .B #include <unicorn.h>
9 | .PP
10 | .BI "unigc uni_gc(unichar " c ");"
11 | .fi
12 | .SH DESCRIPTION
13 | This function returns the \f[C]General_Category\f[R] character property for the code point \f[I]c\f[R].
14 | This property describes a character's general classification.
15 | For example, the Latin Capital Letter A (\f[C]U+0041\f[R]) has a general category value of \f[B]UNI_UPPERCASE_LETTER\f[R] which indicates it's an uppercase letter.
16 | .PP
17 | For a complete list of all possible general category values, see \f[B]unigc\f[R](3).
18 | .PP
19 | Support for the \f[C]General_Category\f[R] character property must be enabled in the JSON configuration file otherwise the function will always return \f[B]UNI_UNASSIGNED\f[R].
20 | .PP
21 | .in +4n
22 | .EX
23 | {
24 | "characterProperties": [
25 | "General_Category"
26 | ]
27 | }
28 | .EE
29 | .in
30 | .SH RETURN VALUE
31 | The general category of \f[I]c\f[R].
32 | .SH SEE ALSO
33 | .BR unigc (3),
34 | .BR unichar (3)
35 | .SH AUTHOR
36 | .UR https://railgunlabs.com
37 | Railgun Labs
38 | .UE .
39 | .SH INTERNET RESOURCES
40 | The online documentation is published on the
41 | .UR https://railgunlabs.com/unicorn
42 | Railgun Labs website
43 | .UE .
44 | .SH LICENSING
45 | Unicorn is distributed with its end-user license agreement (EULA).
46 | Please review the agreement for information on terms & conditions for accessing or otherwise using Unicorn and for a DISCLAIMER OF ALL WARRANTIES.
47 |
--------------------------------------------------------------------------------
/man/uni_prevbrk.3:
--------------------------------------------------------------------------------
1 | .TH "UNICORN" "3" "Dec 6th 2025" "Unicorn 1.3.1"
2 | .SH NAME
3 | uni_prevbrk \- compute preceding boundary
4 | .SH LIBRARY
5 | Embeddable Unicode Algorithms (libunicorn, -lunicorn)
6 | .SH SYNOPSIS
7 | .nf
8 | .B #include <unicorn.h>
9 | .PP
10 | .BI "unistat uni_prevbrk(unibreak " boundary ", const void *" text ", unisize " text_len ", uniattr " text_attr ", unisize *" index ");"
11 | .fi
12 | .SH DESCRIPTION
13 | This function computes the preceding \f[I]boundary\f[R] for \f[I]text\f[R] starting from a known code point specified by code unit \f[I]index\f[R].
14 | The implementation sets \f[I]index\f[R] to the code unit offset of the preceding \f[I]boundary\f[R].
15 | .SH RETURN VALUE
16 | .TP
17 | UNI_OK
18 | If the break iterator was successfully repositioned.
19 | .TP
20 | UNI_DONE
21 | If \f[I]index\f[R] is at the beginning of \f[I]text\f[R].
22 | .TP
23 | UNI_BAD_OPERATION
24 | If \f[I]text\f[R] or \f[I]index\f[R] is null.
25 | .TP
26 | UNI_BAD_ENCODING
27 | If \f[I]text\f[R] is malformed; this is never returned if \f[I]text_attr\f[R] has \f[B]UNI_TRUST\f[R](3).
28 | .SH SEE ALSO
29 | .BR UNI_TRUST (3),
30 | .BR unibreak (3),
31 | .BR unisize (3),
32 | .BR uniattr (3)
33 | .SH AUTHOR
34 | .UR https://railgunlabs.com
35 | Railgun Labs
36 | .UE .
37 | .SH INTERNET RESOURCES
38 | The online documentation is published on the
39 | .UR https://railgunlabs.com/unicorn
40 | Railgun Labs website
41 | .UE .
42 | .SH LICENSING
43 | Unicorn is distributed with its end-user license agreement (EULA).
44 | Please review the agreement for information on terms & conditions for accessing or otherwise using Unicorn and for a DISCLAIMER OF ALL WARRANTIES.
45 |
--------------------------------------------------------------------------------
/man/uni_validate.3:
--------------------------------------------------------------------------------
1 | .TH "UNICORN" "3" "Dec 6th 2025" "Unicorn 1.3.1"
2 | .SH NAME
3 | uni_validate \- validate text
4 | .SH LIBRARY
5 | Embeddable Unicode Algorithms (libunicorn, -lunicorn)
6 | .SH SYNOPSIS
7 | .nf
8 | .B #include <unicorn.h>
9 | .PP
10 | .BI "unistat uni_validate(const void *" text ", unisize " text_len ", uniattr " text_attr ");"
11 | .fi
12 | .SH DESCRIPTION
13 | This function checks if \f[I]text\f[R] is well-formed.
14 | The length of \f[I]text\f[R] is given in code units by \f[I]text_len\f[R] and its encoding form is specified by \f[I]text_attr\f[R].
15 | .PP
16 | The \f[B]UNI_TRUST\f[R](3) flag has no effect when used with \f[I]text_attr\f[R] as the entire purpose of this function is to verify \f[I]text\f[R] is well-formed and assuming it is well-formed would defeat the purpose.
17 | .SH RETURN VALUE
18 | .TP
19 | UNI_OK
20 | If \f[I]text\f[R] is well-formed.
21 | .TP
22 | UNI_BAD_OPERATION
23 | If \f[I]text\f[R] is null.
24 | .TP
25 | UNI_BAD_ENCODING
26 | If the encoding of \f[I]text\f[R] is malformed.
27 | .TP
28 | UNI_FEATURE_DISABLED
29 | If the encoding form flagged in \f[I]text_attr\f[R] is disabled.
30 | .SH SEE ALSO
31 | .BR UNI_TRUST (3),
32 | .BR unisize (3),
33 | .BR uniattr (3)
34 | .SH AUTHOR
35 | .UR https://railgunlabs.com
36 | Railgun Labs
37 | .UE .
38 | .SH INTERNET RESOURCES
39 | The online documentation is published on the
40 | .UR https://railgunlabs.com/unicorn
41 | Railgun Labs website
42 | .UE .
43 | .SH LICENSING
44 | Unicorn is distributed with its end-user license agreement (EULA).
45 | Please review the agreement for information on terms & conditions for accessing or otherwise using Unicorn and for a DISCLAIMER OF ALL WARRANTIES.
46 |
--------------------------------------------------------------------------------
/man/uni_nulify.3:
--------------------------------------------------------------------------------
1 | .TH "UNICORN" "3" "Dec 6th 2025" "Unicorn 1.3.1"
2 | .SH NAME
3 | UNI_NULIFY \- null-terminated output
4 | .SH LIBRARY
5 | Embeddable Unicode Algorithms (libunicorn, -lunicorn)
6 | .SH SYNOPSIS
7 | .nf
8 | .B #include <unicorn.h>
9 | .PP
10 | .B #define UNI_NULIFY 0x80u
11 | .fi
12 | .SH DESCRIPTION
13 | Text attribute bit flag that indicates that the output buffer must be null-terminated by the implementation.
14 | .PP
15 | Some functions, like \f[B]uni_norm\f[R](3), accept an input buffer and an output buffer.
16 | The output buffer is the destination buffer that the implementation writes encoded characters to.
17 | When this flag is applied to the output buffer the implementation guarantees a NUL character (U+0000) is appended, even if it means truncating the last non-null character.
18 | Conceptually, this behavior is like the BSD function \f[C]strlcpy\f[R] which also guarantees the destination buffer is null-terminated.
19 | .PP
20 | The implementation will never append an extraneous null character if the output buffer is already null-terminated.
21 | .PP
22 | Using this flag with an input buffer will produce \f[B]UNI_BAD_OPERATION\f[R].
23 | .SH SEE ALSO
24 | .BR uni_norm (3),
25 | .BR unistat (3)
26 | .SH AUTHOR
27 | .UR https://railgunlabs.com
28 | Railgun Labs
29 | .UE .
30 | .SH INTERNET RESOURCES
31 | The online documentation is published on the
32 | .UR https://railgunlabs.com/unicorn
33 | Railgun Labs website
34 | .UE .
35 | .SH LICENSING
36 | Unicorn is distributed with its end-user license agreement (EULA).
37 | Please review the agreement for information on terms & conditions for accessing or otherwise using Unicorn and for a DISCLAIMER OF ALL WARRANTIES.
38 |
--------------------------------------------------------------------------------
/scripts/numeric_value.py:
--------------------------------------------------------------------------------
1 | # Unicorn - Embeddable Unicode Algorithms
2 | # Copyright (c) 2024-2025 Railgun Labs
3 | #
4 | # This software is dual-licensed: you can redistribute it and/or modify
5 | # it under the terms of the GNU General Public License version 3 as
6 | # published by the Free Software Foundation. For the terms of this
7 | # license, see .
8 | #
9 | # Alternatively, you can license this software under a proprietary
10 | # license, as set out in .
11 |
12 | from typing import Dict
13 | import zipfile
14 | import json
15 |
16 | from .config import Config
17 | from .tools import Codespace
18 | from .feature import Feature
19 | from .basics import StorageSize, SerializationData
20 |
class NumericValueProperty(Feature):
    """Feature that wires the Numeric_Value data into the code space.

    The data comes from ``numeric_values.json`` inside the archive handed to
    :meth:`process`.
    """

    def pre_process(self, codespace: Codespace) -> None:
        """Register the ``numeric_value_offset`` property on *codespace*."""
        # NOTE(review): the second argument (0) is presumably the property's
        # default value -- confirm against Codespace.register_property.
        codespace.register_property("numeric_value_offset", 0, StorageSize.UINT8)

    def process(self, archive: zipfile.ZipFile, codespace: Codespace, config: Config) -> SerializationData:
        """Load the numeric value tables and assign per-code-point indices.

        Reads ``numeric_values.json`` from *archive*, stores each mapped index
        in the ``numeric_value_offset`` property of *codespace*, and returns
        the source, header, public header and size for serialization.
        *config* is not read.

        Raises:
            KeyError: if the JSON document lacks one of the expected keys
                (or, per ``zipfile``, if the archive lacks the member).
        """
        # The context manager closes the archive member even when reading or
        # JSON parsing raises; previously the handle leaked on any error that
        # occurred before the explicit close().
        with archive.open('numeric_values.json') as file:
            data = json.loads(file.read())

        mappings: Dict[str, int] = data["mappings"]
        source: str = data["source"]
        header: str = data["header"]
        size: int = data["size"]

        numval_property = codespace.get("numeric_value_offset")
        for cp, index in mappings.items():
            # Mapping keys are code points written as hexadecimal strings.
            codespace.set(int(cp, 16), numval_property, index)

        public_header = "#define UNICORN_FEATURE_NUMERIC_VALUE\n"
        return SerializationData(source, header, public_header, size)
40 |
--------------------------------------------------------------------------------
/examples/Makefile.am:
--------------------------------------------------------------------------------
# Automake input for the example programs.
AUTOMAKE_OPTIONS = subdir-objects

# Ship the CMake build description for the examples in the distribution tarball.
EXTRA_DIST = CMakeLists.txt

# The examples are only built when the BUILD_EXAMPLES conditional is true
# (set by configure's --enable-examples option).
if BUILD_EXAMPLES

# Link line and compiler flags shared by every example program.
PROG_LDADD = ../src/libunicorn.la
PROG_CFLAGS = -I$(top_srcdir)/include -I$(abs_top_builddir)/src

# Note: Automake canonicalizes program names when forming variable names, so
# the variables for "decode-utf8" are spelled "decode_utf8_*" below.
bin_PROGRAMS = caseconv casefold charprops collate compare compress decode-utf8 decode-utf16 decode-utf32 encode-char segmentation

caseconv_SOURCES = case_convert.c
caseconv_LDADD = $(PROG_LDADD)
caseconv_CFLAGS = $(PROG_CFLAGS)

casefold_SOURCES = case_fold.c
casefold_LDADD = $(PROG_LDADD)
casefold_CFLAGS = $(PROG_CFLAGS)

charprops_SOURCES = character_properties.c
charprops_LDADD = $(PROG_LDADD)
charprops_CFLAGS = $(PROG_CFLAGS)

collate_SOURCES = collate_text.c
collate_LDADD = $(PROG_LDADD)
collate_CFLAGS = $(PROG_CFLAGS)

compare_SOURCES = compare_text.c
compare_LDADD = $(PROG_LDADD)
compare_CFLAGS = $(PROG_CFLAGS)

compress_SOURCES = compress_text.c
compress_LDADD = $(PROG_LDADD)
compress_CFLAGS = $(PROG_CFLAGS)

decode_utf8_SOURCES = decode_utf8.c
decode_utf8_LDADD = $(PROG_LDADD)
decode_utf8_CFLAGS = $(PROG_CFLAGS)

decode_utf16_SOURCES = decode_utf16.c
decode_utf16_LDADD = $(PROG_LDADD)
decode_utf16_CFLAGS = $(PROG_CFLAGS)

decode_utf32_SOURCES = decode_utf32.c
decode_utf32_LDADD = $(PROG_LDADD)
decode_utf32_CFLAGS = $(PROG_CFLAGS)

encode_char_SOURCES = encode_character.c
encode_char_LDADD = $(PROG_LDADD)
encode_char_CFLAGS = $(PROG_CFLAGS)

segmentation_SOURCES = segment_text.c
segmentation_LDADD = $(PROG_LDADD)
segmentation_CFLAGS = $(PROG_CFLAGS)

endif
57 |
--------------------------------------------------------------------------------
/man/uninormchk.3:
--------------------------------------------------------------------------------
1 | .TH "UNICORN" "3" "Dec 6th 2025" "Unicorn 1.3.1"
2 | .SH NAME
3 | uninormchk \- quick check constants
4 | .SH LIBRARY
5 | Embeddable Unicode Algorithms (libunicorn, -lunicorn)
6 | .SH SYNOPSIS
7 | .nf
8 | .B #include <unicorn.h>
9 | .PP
10 | .B enum uninormchk {
11 | .RS
12 | .B UNI_YES,
13 | .B UNI_MAYBE,
14 | .B UNI_NO,
15 | .RE
16 | .B };
17 | .fi
18 | .SH DESCRIPTION
19 | These properties indicate whether a character can or cannot appear in a given normalization form.
20 | .PP
21 | When used with \f[B]uni_normqchk\f[R](3) these properties indicate whether some input string is possibly in the desired normalization form.
22 | This may make it possible to bypass the more time-consuming call to run the complete Unicode Normalization Algorithm.
23 | .PP
24 | Users can alternatively use \f[B]uni_normchk\f[R](3) which will definitively check if text is normalized.
25 | .SH CONSTANTS
26 | .TP
27 | .BR UNI_YES
28 | Characters that can occur in the respective normalization form.
29 | .TP
30 | .BR UNI_MAYBE
31 | Characters that may occur in the respective normalization form, depending on the context.
32 | .TP
33 | .BR UNI_NO
34 | Characters that cannot occur in the respective normalization form.
35 | .SH SEE ALSO
36 | .BR uni_normqchk (3),
37 | .BR uni_normchk (3)
38 | .SH AUTHOR
39 | .UR https://railgunlabs.com
40 | Railgun Labs
41 | .UE .
42 | .SH INTERNET RESOURCES
43 | The online documentation is published on the
44 | .UR https://railgunlabs.com/unicorn
45 | Railgun Labs website
46 | .UE .
47 | .SH LICENSING
48 | Unicorn is distributed with its end-user license agreement (EULA).
49 | Please review the agreement for information on terms & conditions for accessing or otherwise using Unicorn and for a DISCLAIMER OF ALL WARRANTIES.
50 |
--------------------------------------------------------------------------------
/src/charbuf.h:
--------------------------------------------------------------------------------
1 | /*
2 | * Unicorn - Embeddable Unicode Algorithms
3 | * Copyright (c) 2024-2025 Railgun Labs
4 | *
5 | * This software is dual-licensed: you can redistribute it and/or modify
6 | * it under the terms of the GNU General Public License version 3 as
7 | * published by the Free Software Foundation. For the terms of this
8 | * license, see .
9 | *
10 | * Alternatively, you can license this software under a proprietary
11 | * license, as set out in .
12 | */
13 |
14 | #ifndef CHARBUF_H
15 | #define CHARBUF_H
16 |
17 | #include "common.h"
18 |
// Forward declaration so that struct CharBuf can point at its function table.
struct CharBufImpl;

// State for writing characters into an output buffer.
// NOTE(review): presumably wraps the caller-supplied destination passed to
// uni_charbuf_init() -- confirm against the implementation.
struct CharBuf
{
    // Untyped pointer to the destination memory.
    void *storage;

    // This is where the next character will be written.
    // When finished this is the number of characters that needed to be
    // written, but not necessarily how many were actually written.
    unisize length;

    // Number of characters actually written.
    unisize written;

    // NOTE(review): presumably the size of 'storage' in code units -- confirm.
    unisize capacity;
    // NOTE(review): presumably where the final length is reported back to the caller -- confirm.
    unisize *result;
    // NOTE(review): presumably set when null-terminated output was requested -- confirm.
    bool null_terminate;
    // NOTE(review): presumably records whether the output already ends with a null -- confirm.
    bool is_null_terminated;
    // Function table providing the append/nullterminate operations for this buffer.
    const struct CharBufImpl *impl;
};
39 |
// Table of function pointers referenced by struct CharBuf through its 'impl' field.
// NOTE(review): presumably one table exists per output encoding form -- confirm.
struct CharBufImpl
{
    // Appends 'chars_count' characters from 'chars' to 'buf'.
    void (*append)(struct CharBuf *buf, const unichar *chars, unisize chars_count);
    // Null-terminates the contents of 'buf'.
    void (*nullterminate)(struct CharBuf *buf);
};
45 |
// Setup and teardown of a CharBuf; definitions are not part of this header.
// NOTE(review): the summaries below are inferred from names and signatures
// only -- confirm against the implementation.

// Presumably binds 'buf' to the destination 'text' with the given capacity and attributes.
unistat uni_charbuf_init(struct CharBuf *buf, void *text, unisize *capacity, uniattr attributes);
// Presumably completes the write and reports the final status.
unistat uni_charbuf_finalize(struct CharBuf *buf);

// Append operations: a run of 'chars_count' characters, or a single character.
void uni_charbuf_append(struct CharBuf *buf, const unichar *chars, unisize chars_count);
void uni_charbuf_appendchar(struct CharBuf *buf, unichar ch);
51 |
52 | #endif // CHARBUF_H
53 |
--------------------------------------------------------------------------------
/man/uni_caseconvchk.3:
--------------------------------------------------------------------------------
1 | .TH "UNICORN" "3" "Dec 6th 2025" "Unicorn 1.3.1"
2 | .SH NAME
3 | uni_caseconvchk \- check case status
4 | .SH LIBRARY
5 | Embeddable Unicode Algorithms (libunicorn, -lunicorn)
6 | .SH SYNOPSIS
7 | .nf
8 | .B #include <unicorn.h>
9 | .PP
10 | .BI "unistat uni_caseconvchk(unicaseconv " casing ", const void *" text ", unisize " text_len ", uniattr " text_attr ", bool *" result ");"
11 | .fi
12 | .SH DESCRIPTION
13 | This function checks if \f[I]text\f[R] is in casing form \f[I]casing\f[R].
14 | If it is, then the implementation writes \f[C]true\f[R] to \f[I]result\f[R] else it writes \f[C]false\f[R].
15 | .PP
16 | The text attributes of \f[I]text\f[R] are specified by \f[I]text_attr\f[R].
17 | If \f[I]text_len\f[R] is -1, then the implementation assumes \f[I]text\f[R] is null-terminated.
18 | .SH RETURN VALUE
19 | .TP
20 | UNI_OK
21 | On success.
22 | .TP
23 | UNI_BAD_OPERATION
24 | If \f[I]text\f[R] or \f[I]result\f[R] is null.
25 | .TP
26 | UNI_BAD_ENCODING
27 | If \f[I]text\f[R] is not well-formed (checks are omitted if \f[I]text_attr\f[R] has \f[B]UNI_TRUST\f[R](3)).
28 | .TP
29 | UNI_FEATURE_DISABLED
30 | If Unicorn was built without support for case conversion.
31 | .SH SEE ALSO
32 | .BR UNI_TRUST (3),
33 | .BR unicaseconv (3),
34 | .BR unisize (3),
35 | .BR uniattr (3)
36 | .SH AUTHOR
37 | .UR https://railgunlabs.com
38 | Railgun Labs
39 | .UE .
40 | .SH INTERNET RESOURCES
41 | The online documentation is published on the
42 | .UR https://railgunlabs.com/unicorn
43 | Railgun Labs website
44 | .UE .
45 | .SH LICENSING
46 | Unicorn is distributed with its end-user license agreement (EULA).
47 | Please review the agreement for information on terms & conditions for accessing or otherwise using Unicorn and for a DISCLAIMER OF ALL WARRANTIES.
48 |
--------------------------------------------------------------------------------
/examples/case_convert.c:
--------------------------------------------------------------------------------
1 | /*
2 | * This file is public domain.
3 | * You may use, modify, and distribute it without restriction.
4 | */
5 |
6 | // Examples are also documented online at:
7 | // .
8 |
9 | #include <unicorn.h>
10 | #include <stdio.h>
11 |
12 | int main(int argc, char *argv[])
13 | {
14 | // This code example case converts text to upper case, e.g. it would
15 | // convert the text "hello world" to "HELLO WORLD". Unicorn also
16 | // supports lower and title case conversion.
17 |
18 | const char *src = u8"Straße";
19 |
20 | char dest[64];
21 | unisize dest_len = 64;
22 |
23 | // The following function case converts a string to uppercase.
24 | // The resulting string can be presented to an end-user.
25 | // The signature for the function is documented online at:
26 | // .
27 | if (uni_caseconv(UNI_UPPER, src, -1, UNI_UTF8, dest, &dest_len, UNI_UTF8 | UNI_NULIFY) == UNI_OK)
28 | {
29 | printf("Case converted: %s\n", dest);
30 | }
31 |
32 | // If you don't know how big the destination buffer will be, you can first
33 | // call uni_caseconv() with a null and zero-sized destination buffer.
34 | // When called this way, the implementation will write to 'dest_len' the
35 | // number of code units needed to accommodate the output. You can then
36 | // allocate a buffer with the appropriate size and call the function again.
37 | dest_len = 0;
38 | if (uni_caseconv(UNI_UPPER, src, -1, UNI_UTF8, NULL, &dest_len, UNI_UTF8 | UNI_NULIFY) == UNI_OK)
39 | {
40 | printf("Code units needed: %d\n", dest_len);
41 | }
42 | return 0;
43 | }
44 |
--------------------------------------------------------------------------------
/man/uni_numval.3:
--------------------------------------------------------------------------------
1 | .TH "UNICORN" "3" "Dec 6th 2025" "Unicorn 1.3.1"
2 | .SH NAME
3 | uni_numval \- numeric property
4 | .SH LIBRARY
5 | Embeddable Unicode Algorithms (libunicorn, -lunicorn)
6 | .SH SYNOPSIS
7 | .nf
8 | .B #include <unicorn.h>
9 | .PP
10 | .BI "const char *uni_numval(unichar " c ");"
11 | .fi
12 | .SH DESCRIPTION
13 | This function returns the \f[C]Numeric_Value\f[R] character property for the code point \f[I]c\f[R].
14 | If the input character does not have a numeric value, then null is returned.
15 | .PP
16 | The numeric value is returned as a null-terminated C string in decimal notation.
17 | That means if the character has a numeric value of -1/2, which is the case for TIBETAN DIGIT HALF ZERO (U+0F33), it would be returned as the string "-0.5".
18 | The caller can parse the string into an integer or floating-point storage format as needed.
19 | .PP
20 | Support for this character property must be enabled in the JSON configuration file otherwise the function will always return null.
21 | .PP
22 | .in +4n
23 | .EX
24 | {
25 | "characterProperties": [
26 | "Numeric_Value"
27 | ]
28 | }
29 | .EE
30 | .in
31 | .SH RETURN VALUE
32 | The value of the \f[C]Numeric_Value\f[R] character property in decimal notation or null if the character does not have a numeric value.
33 | .SH SEE ALSO
34 | .BR unichar (3)
35 | .SH AUTHOR
36 | .UR https://railgunlabs.com
37 | Railgun Labs
38 | .UE .
39 | .SH INTERNET RESOURCES
40 | The online documentation is published on the
41 | .UR https://railgunlabs.com/unicorn
42 | Railgun Labs website
43 | .UE .
44 | .SH LICENSING
45 | Unicorn is distributed with its end-user license agreement (EULA).
46 | Please review the agreement for information on terms & conditions for accessing or otherwise using Unicorn and for a DISCLAIMER OF ALL WARRANTIES.
47 |
--------------------------------------------------------------------------------
/man/unistrength.3:
--------------------------------------------------------------------------------
1 | .TH "UNICORN" "3" "Dec 6th 2025" "Unicorn 1.3.1"
2 | .SH NAME
3 | unistrength \- collation comparison levels
4 | .SH LIBRARY
5 | Embeddable Unicode Algorithms (libunicorn, -lunicorn)
6 | .SH SYNOPSIS
7 | .nf
8 | .B #include <unicorn.h>
9 | .PP
10 | .B enum unistrength {
11 | .RS
12 | .B UNI_PRIMARY,
13 | .B UNI_SECONDARY,
14 | .B UNI_TERTIARY,
15 | .B UNI_QUATERNARY,
16 | .RE
17 | .B };
18 | .fi
19 | .SH DESCRIPTION
20 | The Unicode collation algorithm is a multilevel comparison algorithm.
21 | The number of levels that are considered in comparison is known as the collation strength.
22 | This enumeration defines constants for each level.
23 | .PP
24 | In comparing two strings, the most important feature is the identity of the base letters.
25 | For example, the difference between an A and a B. If the base letters differ, accent differences are typically ignored.
26 | If the base letters or their accents differ, case differences (uppercase versus lowercase) are typically ignored.
27 | .SH CONSTANTS
28 | .TP
29 | .BR UNI_PRIMARY
30 | Represents differences in the base letter or symbol.
31 | .TP
32 | .BR UNI_SECONDARY
33 | Represents differences in accents.
34 | .TP
35 | .BR UNI_TERTIARY
36 | Represents differences in case or variants of symbols.
37 | .TP
38 | .BR UNI_QUATERNARY
39 | Represents differences in punctuation.
40 | .SH AUTHOR
41 | .UR https://railgunlabs.com
42 | Railgun Labs
43 | .UE .
44 | .SH INTERNET RESOURCES
45 | The online documentation is published on the
46 | .UR https://railgunlabs.com/unicorn
47 | Railgun Labs website
48 | .UE .
49 | .SH LICENSING
50 | Unicorn is distributed with its end-user license agreement (EULA).
51 | Please review the agreement for information on terms & conditions for accessing or otherwise using Unicorn and for a DISCLAIMER OF ALL WARRANTIES.
52 |
--------------------------------------------------------------------------------
/configure.ac:
--------------------------------------------------------------------------------
# Specify the project and version + the minimum Autoconf version required.
AC_PREREQ([2.66])
AC_INIT([unicorn], [1.3.1])

# Sanity check to verify this is indeed the source directory.
AC_CONFIG_SRCDIR([src/unicorn.c])

# Initialize automake and use "foreign" so it doesn't attempt to look for or
# create a GNU project structure: COPYING, NEWS, etc...
AM_INIT_AUTOMAKE([foreign])

# Check for the --with-unicorn-config option.
AC_ARG_WITH([unicorn-config],
    [AS_HELP_STRING([--with-unicorn-config=PATH],
        [Path to a JSON file that configures the Unicorn feature set. (default: features.json)])],
    [CONFIG_JSON="$withval"],
    [CONFIG_JSON="features.json"])
AC_SUBST([CONFIG_JSON])

# Check for the --enable-examples option.
AC_ARG_ENABLE([examples],
    [AS_HELP_STRING([--enable-examples], [Build the example programs (disabled by default)])],
    [enable_examples=$enableval],
    [enable_examples=no]) # Do not build the examples by default.
AM_CONDITIONAL([BUILD_EXAMPLES], [test "x$enable_examples" = "xyes"])

# Checks for toolchain programs.
AC_PROG_CC
AM_PROG_AR
AC_PROG_CPP
AC_PROG_INSTALL
LT_INIT([win32-dll])

# Verify the expected C headers are available.
AC_CHECK_HEADERS([string.h assert.h stdbool.h], [], [AC_MSG_ERROR([could not find required header])])

# Check for Python (required to build the project).
# PYTHON is substituted only after AM_PATH_PYTHON has located the interpreter.
AM_PATH_PYTHON([3.10])
AC_SUBST([PYTHON],[$PYTHON])

AC_CONFIG_FILES([
    Makefile
    src/Makefile
    include/Makefile
    examples/Makefile
    scripts/Makefile
    man/Makefile
    unicorn.pc
])
AC_OUTPUT
52 |
--------------------------------------------------------------------------------
/man/uni_casefoldchk.3:
--------------------------------------------------------------------------------
1 | .TH "UNICORN" "3" "Dec 6th 2025" "Unicorn 1.3.1"
2 | .SH NAME
3 | uni_casefoldchk \- check case fold status
4 | .SH LIBRARY
5 | Embeddable Unicode Algorithms (libunicorn, -lunicorn)
6 | .SH SYNOPSIS
7 | .nf
8 | .B #include <unicorn.h>
9 | .PP
10 | .BI "unistat uni_casefoldchk(unicasefold " casing ", const void *" text ", unisize " text_len ", uniattr " text_attr ", bool *" result ");"
11 | .fi
12 | .SH DESCRIPTION
13 | This function checks if \f[I]text\f[R] is in casing form \f[I]casing\f[R].
14 | If it is, then the implementation writes \f[C]true\f[R] to \f[I]result\f[R] else it writes \f[C]false\f[R].
15 | .PP
16 | The text attributes of \f[I]text\f[R] are specified by \f[I]text_attr\f[R].
17 | If \f[I]text_len\f[R] is -1, then the implementation assumes \f[I]text\f[R] is null-terminated.
18 | .SH RETURN VALUE
19 | .TP
20 | UNI_OK
21 | On success.
22 | .TP
23 | UNI_BAD_OPERATION
24 | If \f[I]text\f[R] or \f[I]result\f[R] is null.
25 | .TP
26 | UNI_BAD_ENCODING
27 | If \f[I]text\f[R] is not well-formed (checks are omitted if \f[I]text_attr\f[R] has \f[B]UNI_TRUST\f[R](3)).
28 | .TP
29 | UNI_FEATURE_DISABLED
30 | If Unicorn was built without support for case folding.
31 | .TP
32 | UNI_NO_MEMORY
33 | If dynamic memory allocation failed.
34 | .SH SEE ALSO
35 | .BR UNI_TRUST (3),
36 | .BR unicasefold (3),
37 | .BR unisize (3),
38 | .BR uniattr (3)
39 | .SH AUTHOR
40 | .UR https://railgunlabs.com
41 | Railgun Labs
42 | .UE .
43 | .SH INTERNET RESOURCES
44 | The online documentation is published on the
45 | .UR https://railgunlabs.com/unicorn
46 | Railgun Labs website
47 | .UE .
48 | .SH LICENSING
49 | Unicorn is distributed with its end-user license agreement (EULA).
50 | Please review the agreement for information on terms & conditions for accessing or otherwise using Unicorn and for a DISCLAIMER OF ALL WARRANTIES.
51 |
--------------------------------------------------------------------------------
/src/byteswap.h:
--------------------------------------------------------------------------------
1 | /*
2 | * Unicorn - Embeddable Unicode Algorithms
3 | * Copyright (c) 2024-2025 Railgun Labs
4 | *
5 | * This software is dual-licensed: you can redistribute it and/or modify
6 | * it under the terms of the GNU General Public License version 3 as
7 | * published by the Free Software Foundation. For the terms of this
8 | * license, see .
9 | *
10 | * Alternatively, you can license this software under a proprietary
11 | * license, as set out in .
12 | */
13 |
14 | #ifndef BYTESWAP_H
15 | #define BYTESWAP_H
16 |
17 | #include "common.h"
18 |
19 | static inline uint32_t uni_swap32(uint32_t val) // Reverse the byte order of a 32-bit value (0xAABBCCDD -> 0xDDCCBBAA).
20 | {
21 | uint32_t valm = (val << 8u) & 0xFF00FF00u; // bytes 0 and 2 moved up into byte positions 1 and 3
22 | valm |= (val >> 8u) & 0xFF00FFu; // bytes 1 and 3 moved down into positions 0 and 2; each adjacent byte pair is now swapped
23 | return (valm << 16u) | (valm >> 16u); // exchange the two 16-bit halves to complete the reversal
24 | }
25 |
26 | static inline uint16_t uni_swap16(uint16_t val) // Exchange the two bytes of a 16-bit value (0xAABB -> 0xBBAA).
27 | {
28 | const uint32_t byte1 = (uint32_t)val << 8u; // low byte moved up; widened to 32 bits before shifting
29 | const uint32_t byte2 = (uint32_t)val >> 8u; // high byte moved down into the low position
30 | return (uint16_t)(byte1 | byte2); // the narrowing cast discards everything above bit 15
31 | }
32 |
33 | static inline uint32_t uni_swap32_le(uint32_t val) // Byte-swaps val only in builds that define UNICORN_BIG_ENDIAN.
34 | {
35 | #if defined(UNICORN_BIG_ENDIAN)
36 | return uni_swap32(val); // presumably a big-endian host: reversing converts to/from little-endian order
37 | #else
38 | return val; // macro not defined: the value is passed through unchanged
39 | #endif
40 | }
41 |
42 | static inline uint16_t uni_swap16_le(uint16_t val) // Byte-swaps val only in builds that define UNICORN_BIG_ENDIAN.
43 | {
44 | #if defined(UNICORN_BIG_ENDIAN)
45 | return uni_swap16(val); // presumably a big-endian host: swapping converts to/from little-endian order
46 | #else
47 | return val; // macro not defined: the value is passed through unchanged
48 | #endif
49 | }
50 |
51 | static inline uint32_t uni_swap32_be(uint32_t val) // Byte-swaps val only in builds that define UNICORN_LITTLE_ENDIAN.
52 | {
53 | #if defined(UNICORN_LITTLE_ENDIAN)
54 | return uni_swap32(val); // presumably a little-endian host: reversing converts to/from big-endian order
55 | #else
56 | return val; // macro not defined: the value is passed through unchanged
57 | #endif
58 | }
59 |
60 | static inline uint16_t uni_swap16_be(uint16_t val) // Byte-swaps val only in builds that define UNICORN_LITTLE_ENDIAN.
61 | {
62 | #if defined(UNICORN_LITTLE_ENDIAN)
63 | return uni_swap16(val); // presumably a little-endian host: swapping converts to/from big-endian order
64 | #else
65 | return val; // macro not defined: the value is passed through unchanged
66 | #endif
67 | }
68 |
69 | #endif // BYTESWAP_H
70 |
--------------------------------------------------------------------------------
/man/uniweighting.3:
--------------------------------------------------------------------------------
1 | .TH "UNICORN" "3" "Dec 6th 2025" "Unicorn 1.3.1"
2 | .SH NAME
3 | uniweighting \- collation weighting algorithm
4 | .SH LIBRARY
5 | Embeddable Unicode Algorithms (libunicorn, -lunicorn)
6 | .SH SYNOPSIS
7 | .nf
8 | .B #include <unicorn.h>
9 | .PP
10 | .B enum uniweighting {
11 | .RS
12 | .B UNI_NON_IGNORABLE,
13 | .B UNI_SHIFTED,
14 | .RE
15 | .B };
16 | .fi
17 | .SH DESCRIPTION
18 | This enumeration defines constants representing the possible options for variable weighted characters.
19 | .PP
20 | The Unicode Collation Algorithm defines the collation for some characters as variable.
21 | This typically includes punctuation characters and symbols.
22 | Based on the variable-weighting setting, these characters can be interpreted as having a quaternary level.
23 | .SH CONSTANTS
24 | .TP
25 | .BR UNI_NON_IGNORABLE
26 | The words with hyphen-minus or hyphen are grouped together, but before all letters in the third position.
27 | This is because they are not ignorable, and have primary values that differ from the letters.
28 | The symbols ☠ and ♡ have primary differences.
29 | .TP
30 | .BR UNI_SHIFTED
31 | The hyphen-minus and hyphen are grouped together, and their differences are less significant than the casing differences in the letter "l".
32 | This grouping results from the fact that they are ignorable, but their fourth level differences are according to the original primary order, which is more intuitive than Unicode order.
33 | The symbols ☠ and ♡ are ignored on levels 1-3.
34 | .SH SEE ALSO
35 | .BR unistrength (3)
36 | .SH AUTHOR
37 | .UR https://railgunlabs.com
38 | Railgun Labs
39 | .UE .
40 | .SH INTERNET RESOURCES
41 | The online documentation is published on the
42 | .UR https://railgunlabs.com/unicorn
43 | Railgun Labs website
44 | .UE .
45 | .SH LICENSING
46 | Unicorn is distributed with its end-user license agreement (EULA).
47 | Please review the agreement for information on terms & conditions for accessing or otherwise using Unicorn and for a DISCLAIMER OF ALL WARRANTIES.
48 |
--------------------------------------------------------------------------------
/scripts/collation.py:
--------------------------------------------------------------------------------
1 | # Unicorn - Embeddable Unicode Algorithms
2 | # Copyright (c) 2024-2025 Railgun Labs
3 | #
4 | # This software is dual-licensed: you can redistribute it and/or modify
5 | # it under the terms of the GNU General Public License version 3 as
6 | # published by the Free Software Foundation. For the terms of this
7 | # license, see .
8 | #
9 | # Alternatively, you can license this software under a proprietary
10 | # license, as set out in .
11 |
12 | from typing import Dict, Set, Type
13 | import zipfile
14 | import json
15 |
16 | from .config import Config
17 | from .tools import Codespace
18 | from .feature import Feature
19 | from .basics import StorageSize, SerializationData
20 | from .decomposition import CanonicalDecompositionFeature
21 |
22 | class CollationFeature(Feature):  # Feature that loads collation data from 'collation.json' and tags code points with a collation subtype.
23 | @staticmethod
24 | def dependencies() -> Set[Type[Feature]]:  # Features this one depends on: canonical decomposition.
25 | return set([CanonicalDecompositionFeature])
26 |
27 | def pre_process(self, codespace: Codespace) -> None:  # Register the per-code-point property that process() fills in.
28 | codespace.register_property("collation_subtype", 0, StorageSize.UINT8)  # presumably 0 is the default value -- TODO confirm against Codespace.register_property
29 |
30 | def process(self, archive: zipfile.ZipFile, codespace: Codespace, config: Config) -> SerializationData:  # Read 'collation.json' from archive, record subtypes in codespace, return the serialization data; config is not used here.
31 | file = archive.open('collation.json')  # NOTE(review): not closed if json.loads or a key lookup below raises; a `with` block would guarantee cleanup
32 | data = json.loads(file.read())
33 | subtypes: Dict[str,int] = data["subtypes"]  # keys are parsed as hexadecimal code points in the loop below
34 | size: int = data["size"]
35 | header: str = data["header"]
36 | source: str = data["source"]
37 | file.close()
38 |
39 | # Associate collation subtypes with code points.
40 | subtype_property = codespace.get("collation_subtype")  # the property registered in pre_process()
41 | for key,subtype in subtypes.items():
42 | codepoint = int(key, 16)  # JSON object keys are strings; interpret each one as base-16
43 | codespace.set(codepoint, subtype_property, subtype)
44 |
45 | public_header = "#define UNICORN_FEATURE_COLLATION\n"  # C preprocessor define passed as the third SerializationData argument
46 | return SerializationData(source, header, public_header, size)
47 |
--------------------------------------------------------------------------------
/man/uni_seterrfunc.3:
--------------------------------------------------------------------------------
1 | .TH "UNICORN" "3" "Dec 6th 2025" "Unicorn 1.3.1"
2 | .SH NAME
3 | uni_seterrfunc \- receive diagnostic events
4 | .SH LIBRARY
5 | Embeddable Unicode Algorithms (libunicorn, -lunicorn)
6 | .SH SYNOPSIS
7 | .nf
8 | .B #include <unicorn.h>
9 | .PP
10 | .BI "void uni_seterrfunc(void *" user_data ", unierrfunc " callback ");"
11 | .fi
12 | .SH DESCRIPTION
13 | Associate the function \f[I]callback\f[R] with the library's internal error logger.
14 | Anytime a function returns an error, \f[I]callback\f[R] will be invoked with a description of the error.
15 | It is up to the implementation of \f[I]callback\f[R] to be thread safe.
16 | .PP
17 | The implementation generally invokes \f[I]callback\f[R] at the moment the error occurs.
18 | Callers are encouraged to set breakpoints in their implementation of \f[I]callback\f[R] and view the stack trace to discover the exact cause of the error.
19 | .SH EXAMPLES
20 | This example registers a custom error callback with Unicorn.
21 | This callback will be invoked when an error occurs in Unicorn.
22 | In practice, you can set a breakpoint within the function for your debugger to pause on and then troubleshoot the error by inspecting the error message.
23 | .PP
24 | .in +4n
25 | .EX
26 | #include <unicorn.h>
27 | #include <stdio.h>
28 |
29 | static void on_error(void *user_data, const char *message)
30 | {
31 | puts(message);
32 | }
33 |
34 | int main(void)
35 | {
36 | uni_seterrfunc(NULL, on_error);
37 | return 0;
38 | }
39 | .EE
40 | .in
41 | .SH SEE ALSO
42 | .BR unimemfunc (3),
43 | .BR unierrfunc (3)
44 | .SH AUTHOR
45 | .UR https://railgunlabs.com
46 | Railgun Labs
47 | .UE .
48 | .SH INTERNET RESOURCES
49 | The online documentation is published on the
50 | .UR https://railgunlabs.com/unicorn
51 | Railgun Labs website
52 | .UE .
53 | .SH LICENSING
54 | Unicorn is distributed with its end-user license agreement (EULA).
55 | Please review the agreement for information on terms & conditions for accessing or otherwise using Unicorn and for a DISCLAIMER OF ALL WARRANTIES.
56 |
--------------------------------------------------------------------------------
/man/uni_prev.3:
--------------------------------------------------------------------------------
1 | .TH "UNICORN" "3" "Dec 6th 2025" "Unicorn 1.3.1"
2 | .SH NAME
3 | uni_prev \- decode the previous scalar value
4 | .SH LIBRARY
5 | Embeddable Unicode Algorithms (libunicorn, -lunicorn)
6 | .SH SYNOPSIS
7 | .nf
8 | .B #include <unicorn.h>
9 | .PP
10 | .BI "unistat uni_prev(const void *" text ", unisize " text_len ", uniattr " text_attr ", unisize *" index ", unichar *" cp ");"
11 | .fi
12 | .SH DESCRIPTION
13 | This function decodes the preceding Unicode scalar value from \f[I]text\f[R] at code unit \f[I]index\f[R] and writes the result to \f[I]cp\f[R].
14 | The \f[I]index\f[R] parameter is updated by the implementation to refer to the code unit beginning the preceding scalar.
15 | .PP
16 | This function returns \f[B]UNI_DONE\f[R] when iteration has reached the end of \f[I]text\f[R] indicating there are no more characters.
17 | If the implementation detects \f[I]text\f[R] is malformed, then it returns \f[B]UNI_BAD_ENCODING\f[R].
18 | .PP
19 | The \f[I]index\f[R] parameter must refer to a code point boundary otherwise the behavior is undefined.
20 | .SH RETURN VALUE
21 | .TP
22 | UNI_OK
23 | If the character was successfully decoded.
24 | .TP
25 | UNI_DONE
26 | If the end of \f[I]text\f[R] was reached.
27 | .TP
28 | UNI_BAD_ENCODING
29 | If \f[I]text\f[R] is malformed; this is never returned if \f[I]text_attr\f[R] has \f[B]UNI_TRUST\f[R](3).
30 | .TP
31 | UNI_BAD_OPERATION
32 | If \f[I]text\f[R], \f[I]index\f[R], or \f[I]cp\f[R] are NULL.
33 | .SH SEE ALSO
34 | .BR unistat (3),
35 | .BR UNI_TRUST (3),
36 | .BR unisize (3),
37 | .BR uniattr (3),
38 | .BR unichar (3)
39 | .SH AUTHOR
40 | .UR https://railgunlabs.com
41 | Railgun Labs
42 | .UE .
43 | .SH INTERNET RESOURCES
44 | The online documentation is published on the
45 | .UR https://railgunlabs.com/unicorn
46 | Railgun Labs website
47 | .UE .
48 | .SH LICENSING
49 | Unicorn is distributed with its end-user license agreement (EULA).
50 | Please review the agreement for information on terms & conditions for accessing or otherwise using Unicorn and for a DISCLAIMER OF ALL WARRANTIES.
51 |
--------------------------------------------------------------------------------
/man/uni_compress.3:
--------------------------------------------------------------------------------
1 | .TH "UNICORN" "3" "Dec 6th 2025" "Unicorn 1.3.1"
2 | .SH NAME
3 | uni_compress \- compress text
4 | .SH LIBRARY
5 | Embeddable Unicode Algorithms (libunicorn, -lunicorn)
6 | .SH SYNOPSIS
7 | .nf
8 | .B #include <unicorn.h>
9 | .PP
10 | .BI "unistat uni_compress(const void *" text ", unisize " text_len ", uniattr " text_attr ", uint8_t *" buffer ", size_t *" buffer_length ");"
11 | .fi
12 | .SH DESCRIPTION
13 | This function compresses \f[I]text\f[R] using Binary Ordered Compression for Unicode (BOCU-1) and writes the results to \f[I]buffer\f[R].
14 | Text is compressed based on its code points, i.e. UTF-8, 16, and 32 will compress to the same byte sequence.
15 | .PP
16 | The capacity of \f[I]buffer\f[R] is specified by \f[I]buffer_length\f[R].
17 | The implementation always writes to \f[I]buffer_length\f[R] the total number of bytes in the fully compressed text.
18 | If the capacity of \f[I]buffer\f[R] is insufficient, then \f[B]UNI_NO_SPACE\f[R] is returned.
19 | If \f[I]buffer\f[R] is null, then \f[I]buffer_length\f[R] must be zero.
20 | .SH RETURN VALUE
21 | .TP
22 | UNI_OK
23 | On success.
24 | .TP
25 | UNI_NO_SPACE
26 | If \f[I]buffer\f[R] is not large enough.
27 | .TP
28 | UNI_BAD_OPERATION
29 | If \f[I]text\f[R] or \f[I]buffer_length\f[R] is null.
30 | .TP
31 | UNI_BAD_ENCODING
32 | If \f[I]text\f[R] is not well-formed (checks are omitted if \f[I]text_attr\f[R] has \f[B]UNI_TRUST\f[R](3)).
33 | .TP
34 | UNI_FEATURE_DISABLED
35 | If Unicorn was configured without support for the compression algorithm.
36 | .SH SEE ALSO
37 | .BR unistat (3),
38 | .BR UNI_TRUST (3),
39 | .BR unisize (3),
40 | .BR uniattr (3)
41 | .SH AUTHOR
42 | .UR https://railgunlabs.com
43 | Railgun Labs
44 | .UE .
45 | .SH INTERNET RESOURCES
46 | The online documentation is published on the
47 | .UR https://railgunlabs.com/unicorn
48 | Railgun Labs website
49 | .UE .
50 | .SH LICENSING
51 | Unicorn is distributed with its end-user license agreement (EULA).
52 | Please review the agreement for information on terms & conditions for accessing or otherwise using Unicorn and for a DISCLAIMER OF ALL WARRANTIES.
53 |
--------------------------------------------------------------------------------
/man/uni_nextbrk.3:
--------------------------------------------------------------------------------
1 | .TH "UNICORN" "3" "Dec 6th 2025" "Unicorn 1.3.1"
2 | .SH NAME
3 | uni_nextbrk \- compute next boundary
4 | .SH LIBRARY
5 | Embeddable Unicode Algorithms (libunicorn, -lunicorn)
6 | .SH SYNOPSIS
7 | .nf
8 | .B #include <unicorn.h>
9 | .PP
10 | .BI "unistat uni_nextbrk(unibreak " boundary ", const void *" text ", unisize " text_len ", uniattr " text_attr ", unisize *" index ");"
11 | .fi
12 | .SH DESCRIPTION
13 | This function computes the next \f[I]boundary\f[R] for \f[I]text\f[R] starting from a known code point specified by code unit \f[I]index\f[R].
14 | The implementation sets \f[I]index\f[R] to the code unit offset of the next \f[I]boundary\f[R].
15 | .SH RETURN VALUE
16 | .TP
17 | UNI_OK
18 | If the break iterator was successfully repositioned.
19 | .TP
20 | UNI_DONE
21 | If \f[I]index\f[R] is beyond the last character of \f[I]text\f[R].
22 | .TP
23 | UNI_BAD_OPERATION
24 | If \f[I]text\f[R] or \f[I]index\f[R] is null.
25 | .TP
26 | UNI_BAD_ENCODING
27 | If \f[I]text\f[R] is malformed; this is never returned if \f[I]text_attr\f[R] has \f[B]UNI_TRUST\f[R](3).
28 | .SH EXAMPLES
29 | This example iterates the extended grapheme clusters of a UTF-8 encoded string and prints the indices where each grapheme begins.
30 | .PP
31 | .in +4n
32 | .EX
33 | #include <unicorn.h>
34 | #include <stdio.h>
35 |
36 | int main(void)
37 | {
38 | const char *string = u8"Hi, 世界";
39 | unisize index = 0;
40 | while (uni_nextbrk(UNI_GRAPHEME, string, -1, UNI_UTF8, &index) == UNI_OK)
41 | {
42 | printf("%d\\n", index); // prints '1', '2', '3', '4', '7', '10'
43 | }
44 | return 0;
45 | }
46 | .EE
47 | .in
48 | .SH SEE ALSO
49 | .BR UNI_TRUST (3),
50 | .BR unibreak (3),
51 | .BR unisize (3),
52 | .BR uniattr (3)
53 | .SH AUTHOR
54 | .UR https://railgunlabs.com
55 | Railgun Labs
56 | .UE .
57 | .SH INTERNET RESOURCES
58 | The online documentation is published on the
59 | .UR https://railgunlabs.com/unicorn
60 | Railgun Labs website
61 | .UE .
62 | .SH LICENSING
63 | Unicorn is distributed with its end-user license agreement (EULA).
64 | Please review the agreement for information on terms & conditions for accessing or otherwise using Unicorn and for a DISCLAIMER OF ALL WARRANTIES.
65 |
--------------------------------------------------------------------------------
/man/uni_decompress.3:
--------------------------------------------------------------------------------
1 | .TH "UNICORN" "3" "Dec 6th 2025" "Unicorn 1.3.1"
2 | .SH NAME
3 | uni_decompress \- decompress text
4 | .SH LIBRARY
5 | Embeddable Unicode Algorithms (libunicorn, -lunicorn)
6 | .SH SYNOPSIS
7 | .nf
8 | .B #include <unicorn.h>
9 | .PP
10 | .BI "unistat uni_decompress(const uint8_t *" buffer ", size_t " buffer_length ", void *" text ", unisize *" text_len ", uniattr " text_attr ");"
11 | .fi
12 | .SH DESCRIPTION
13 | This function decompresses \f[I]buffer\f[R] and writes the result to \f[I]text\f[R].
14 | The encoding of \f[I]text_attr\f[R] does not need to match the encoding of the original uncompressed text.
15 | .PP
16 | The capacity of \f[I]text\f[R] is specified by \f[I]text_len\f[R].
17 | If \f[I]text\f[R] is not null, then the implementation writes to \f[I]text_len\f[R] the total number of code units written to \f[I]text\f[R].
18 | If the capacity of \f[I]text\f[R] is insufficient, then \f[B]UNI_NO_SPACE\f[R] is returned.
19 | .PP
20 | If \f[I]text\f[R] is null and \f[I]text_len\f[R] is zero, then the implementation writes to \f[I]text_len\f[R] the number of code units in the fully decompressed text and returns \f[B]UNI_OK\f[R].
21 | Call the function this way to first compute the total length of the uncompressed text before calling it again with a sufficiently sized buffer.
22 | .SH RETURN VALUE
23 | .TP
24 | UNI_OK
25 | On success.
26 | .TP
27 | UNI_NO_SPACE
28 | If \f[I]text\f[R] is not large enough.
29 | .TP
30 | UNI_BAD_OPERATION
31 | If \f[I]buffer\f[R] is null, or if \f[I]text\f[R] is null and \f[I]text_len\f[R] is non-zero.
32 | .TP
33 | UNI_FEATURE_DISABLED
34 | If Unicorn was configured without support for the compression algorithm.
35 | .SH SEE ALSO
36 | .BR unistat (3),
37 | .BR uni_compress (3),
38 | .BR unisize (3),
39 | .BR uniattr (3)
40 | .SH AUTHOR
41 | .UR https://railgunlabs.com
42 | Railgun Labs
43 | .UE .
44 | .SH INTERNET RESOURCES
45 | The online documentation is published on the
46 | .UR https://railgunlabs.com/unicorn
47 | Railgun Labs website
48 | .UE .
49 | .SH LICENSING
50 | Unicorn is distributed with its end-user license agreement (EULA).
51 | Please review the agreement for information on terms & conditions for accessing or otherwise using Unicorn and for a DISCLAIMER OF ALL WARRANTIES.
52 |
--------------------------------------------------------------------------------
/man/uni_casefoldcmp.3:
--------------------------------------------------------------------------------
1 | .TH "UNICORN" "3" "Dec 6th 2025" "Unicorn 1.3.1"
2 | .SH NAME
3 | uni_casefoldcmp \- case-insensitive string comparison
4 | .SH LIBRARY
5 | Embeddable Unicode Algorithms (libunicorn, -lunicorn)
6 | .SH SYNOPSIS
7 | .nf
8 | .B #include <unicorn.h>
9 | .PP
10 | .BI "unistat uni_casefoldcmp(unicasefold " casing ", const void *" s1 ", unisize " s1_len ", uniattr " s1_attr ", const void *" s2 ", unisize " s2_len ", uniattr " s2_attr ", bool *" result ");"
11 | .fi
12 | .SH DESCRIPTION
13 | This function checks if \f[I]s1\f[R] and \f[I]s2\f[R] are a caseless match in casing form \f[I]casing\f[R].
14 | If they match, then the implementation writes \f[C]true\f[R] to \f[I]result\f[R] else it writes \f[C]false\f[R].
15 | .PP
16 | The length and text attributes for \f[I]s1\f[R] are specified by \f[I]s1_len\f[R] and \f[I]s1_attr\f[R].
17 | The length and text attributes for \f[I]s2\f[R] are specified by \f[I]s2_len\f[R] and \f[I]s2_attr\f[R].
18 | If \f[I]s1_len\f[R] is -1, then the implementation assumes \f[I]s1\f[R] is null-terminated.
19 | If \f[I]s2_len\f[R] is -1, then the implementation assumes \f[I]s2\f[R] is null-terminated.
20 | .SH RETURN VALUE
21 | .TP
22 | UNI_OK
23 | If \f[I]s1\f[R] and \f[I]s2\f[R] were compared successfully.
24 | .TP
25 | UNI_BAD_OPERATION
26 | If \f[I]s1\f[R] or \f[I]s2\f[R] are null, or if \f[I]result\f[R] is null.
27 | .TP
28 | UNI_BAD_ENCODING
29 | If \f[I]s1\f[R] or \f[I]s2\f[R] is not well-formed (checks are omitted if the corresponding \f[B]uniattr\f[R](3) has \f[B]UNI_TRUST\f[R](3)).
30 | .TP
31 | UNI_FEATURE_DISABLED
32 | If the library was built without support for case folding.
33 | .TP
34 | UNI_NO_MEMORY
35 | If dynamic memory allocation failed.
36 | .SH SEE ALSO
37 | .BR uniattr (3),
38 | .BR UNI_TRUST (3),
39 | .BR unicasefold (3),
40 | .BR unisize (3)
41 | .SH AUTHOR
42 | .UR https://railgunlabs.com
43 | Railgun Labs
44 | .UE .
45 | .SH INTERNET RESOURCES
46 | The online documentation is published on the
47 | .UR https://railgunlabs.com/unicorn
48 | Railgun Labs website
49 | .UE .
50 | .SH LICENSING
51 | Unicorn is distributed with its end-user license agreement (EULA).
52 | Please review the agreement for information on terms & conditions for accessing or otherwise using Unicorn and for a DISCLAIMER OF ALL WARRANTIES.
53 |
--------------------------------------------------------------------------------
/man/uni_normqchk.3:
--------------------------------------------------------------------------------
1 | .TH "UNICORN" "3" "Dec 6th 2025" "Unicorn 1.3.1"
2 | .SH NAME
3 | uni_normqchk \- normalization quick check
4 | .SH LIBRARY
5 | Embeddable Unicode Algorithms (libunicorn, -lunicorn)
6 | .SH SYNOPSIS
7 | .nf
8 | .B #include <unicorn.h>
9 | .PP
10 | .BI "unistat uni_normqchk(uninormform " form ", const void *" text ", unisize " text_len ", uniattr " text_attr ", uninormchk *" result ");"
11 | .fi
12 | .SH DESCRIPTION
13 | This function performs a quick check to see if \f[I]text\f[R] is, is not, or might be normalized.
14 | For a definitive yes/no answer, use \f[B]uni_normchk\f[R](3).
15 | .PP
16 | The implementation behaves as follows:
17 | .RS
18 | .IP \[bu] 2
19 | \f[I]result\f[R] is set to \f[B]UNI_YES\f[R] if all characters in \f[I]text\f[R] quick check to yes.
20 | .IP \[bu] 2
21 | \f[I]result\f[R] is set to \f[B]UNI_NO\f[R] if at least one character in \f[I]text\f[R] quick checks to no.
22 | .IP \[bu] 2
23 | \f[I]result\f[R] is set to \f[B]UNI_MAYBE\f[R] if at least one character in \f[I]text\f[R] quick checks to maybe and no characters quick check to no.
24 | .RE
25 | .PP
26 | Support for normalization quick check must be enabled in the JSON configuration file.
27 | .PP
28 | .in +4n
29 | .EX
30 | {
31 | "algorithms": {
32 | "normalizationQuickCheck": true
33 | }
34 | }
35 | .EE
36 | .in
37 | .SH RETURN VALUE
38 | .TP
39 | UNI_OK
40 | On success.
41 | .TP
42 | UNI_BAD_OPERATION
43 | If \f[I]text\f[R] or \f[I]result\f[R] is null.
44 | .TP
45 | UNI_BAD_ENCODING
46 | If \f[I]text\f[R] is malformed; this is never returned if \f[I]text_attr\f[R] has \f[B]UNI_TRUST\f[R](3).
47 | .TP
48 | UNI_FEATURE_DISABLED
49 | If Unicorn was built without support for \f[I]form\f[R].
50 | .SH SEE ALSO
51 | .BR uni_normchk (3),
52 | .BR uninormchk (3),
53 | .BR UNI_TRUST (3),
54 | .BR uninormform (3),
55 | .BR unisize (3),
56 | .BR uniattr (3)
57 | .SH AUTHOR
58 | .UR https://railgunlabs.com
59 | Railgun Labs
60 | .UE .
61 | .SH INTERNET RESOURCES
62 | The online documentation is published on the
63 | .UR https://railgunlabs.com/unicorn
64 | Railgun Labs website
65 | .UE .
66 | .SH LICENSING
67 | Unicorn is distributed with its end-user license agreement (EULA).
68 | Please review the agreement for information on terms & conditions for accessing or otherwise using Unicorn and for a DISCLAIMER OF ALL WARRANTIES.
69 |
--------------------------------------------------------------------------------
/examples/encode_character.c:
--------------------------------------------------------------------------------
1 | /*
2 | * This file is public domain.
3 | * You may use, modify, and distribute it without restriction.
4 | */
5 |
6 | // Examples are also documented online at:
7 | // .
8 |
9 | #include <unicorn.h>
10 | #include <stdio.h>
11 |
12 | int main(int argc, char *argv[])
13 | {
14 | // This example demonstrates how to encode a code point as UTF-8, UTF-16, and UTF-32.
15 | // After encoding, the code units are printed as hexadecimal.
16 |
17 | // The following variable defines the Unicode code point that will be encoded.
18 | // In this example, it's Unicode character U+1F600. Change this value to a
19 | // different character and notice how the printed code units change.
20 | unichar cp = 0x1F600;
21 |
22 | printf("Code point: U+%04X\n", cp);
23 |
24 | //
25 | // Encode the code point as UTF-8. UTF-8 is a variable length encoding and will encode
26 | // to either 1, 2, 3, or 4 code units.
27 | //
28 |
29 | uint8_t u8[4];
30 | unisize u8_len = 4; // capacity of u8 going in; used as the print-loop bound after uni_encode (presumably updated to the count written)
31 |
32 | if (uni_encode(cp, u8, &u8_len, UNI_UTF8) == UNI_OK)
33 | {
34 | printf("UTF-8 :");
35 | for (unisize i = 0; i < u8_len; i++)
36 | {
37 | printf(" 0x%02X", u8[i]);
38 | }
39 | putchar('\n');
40 | }
41 |
42 | //
43 | // Encode the code point as UTF-16. UTF-16 is a variable length encoding and will encode
44 | // to either 1 or 2 code units.
45 | //
46 |
47 | uint16_t u16[2];
48 | unisize u16_len = 2; // capacity of u16 going in; used as the print-loop bound after uni_encode
49 |
50 | if (uni_encode(cp, u16, &u16_len, UNI_UTF16) == UNI_OK)
51 | {
52 | printf("UTF-16 :");
53 | for (unisize i = 0; i < u16_len; i++)
54 | {
55 | printf(" 0x%04X", u16[i]);
56 | }
57 | putchar('\n');
58 | }
59 |
60 | //
61 | // Encode the code point as UTF-32. Code points are 21-bit integers and UTF-32 stores
62 | // them in a 32-bit integer. UTF-32 is not variable length, unlike UTF-8 and UTF-16.
63 | //
64 |
65 | uint32_t u32[1];
66 | unisize u32_len = 1; // capacity of u32 going in; used as the print-loop bound after uni_encode
67 |
68 | if (uni_encode(cp, u32, &u32_len, UNI_UTF32) == UNI_OK)
69 | {
70 | printf("UTF-32 :");
71 | for (unisize i = 0; i < u32_len; i++)
72 | {
73 | printf(" 0x%08X", u32[i]);
74 | }
75 | putchar('\n');
76 | }
77 |
78 | return 0;
79 | }
80 |
--------------------------------------------------------------------------------
/examples/case_fold.c:
--------------------------------------------------------------------------------
1 | /*
2 | * This file is public domain.
3 | * You may use, modify, and distribute it without restriction.
4 | */
5 |
6 | // Examples are also documented online at:
7 | // .
8 |
9 | #include <unicorn.h>
10 | #include <stdio.h>
11 | #include <string.h>
12 | #include <stdbool.h>
13 |
14 | int main(int argc, char *argv[])
15 | {
16 | // This code example demonstrates case folding strings. Case folding
17 | // transforms strings for the purposes of caseless string comparison.
18 | //
19 | // You are not supposed to display case folded strings to the end-user.
20 | // They are for internal caseless string comparisons only.
21 |
22 | const char *src1 = u8"Straße";
23 | const char *src2 = u8"STrasse";
24 |
25 | char dst1[16];
26 | char dst2[16];
27 |
28 | unisize dst1_len = 16; // capacity of dst1, passed by pointer to uni_casefold below
29 | unisize dst2_len = 16; // capacity of dst2, passed by pointer to uni_casefold below
30 |
31 | bool is_equal = false;
32 |
33 | // The following function case folds two strings, performs the comparison, and
34 | // reports the result. This approach is recommended when two strings are being
35 | // compared in a one-off comparison check. If you'll be repeatedly comparing
36 | // the same strings over and over again, then the approach in the next code
37 | // fragment is preferred.
38 | if (uni_casefoldcmp(UNI_CANONICAL, src1, -1, UNI_UTF8, src2, -1, UNI_UTF8, &is_equal) == UNI_OK)
39 | {
40 | if (is_equal)
41 | {
42 | puts("equal");
43 | }
44 | else
45 | {
46 | puts("not equal");
47 | }
48 | }
49 |
50 | // The following code fragment case folds two strings and compares the results.
51 | // This more manual approach is recommended when you'll be comparing the same
52 | // strings over and over again because you can case fold them _once_ and then
53 | // re-use the results for each subsequent comparison.
54 | if (uni_casefold(UNI_CANONICAL, src1, -1, UNI_UTF8, dst1, &dst1_len, UNI_UTF8 | UNI_NULIFY) == UNI_OK)
55 | {
56 | if (uni_casefold(UNI_CANONICAL, src2, -1, UNI_UTF8, dst2, &dst2_len, UNI_UTF8 | UNI_NULIFY) == UNI_OK)
57 | {
58 | if (strcmp(dst1, dst2) == 0) // strcmp needs terminated strings; presumably UNI_NULIFY makes uni_casefold null-terminate its output
59 | {
60 | puts("equal");
61 | }
62 | else
63 | {
64 | puts("not equal");
65 | }
66 | }
67 | }
68 | return 0;
69 | }
70 |
--------------------------------------------------------------------------------
/examples/compare_text.c:
--------------------------------------------------------------------------------
1 | /*
2 | * This file is public domain.
3 | * You may use, modify, and distribute it without restriction.
4 | */
5 |
6 | // Examples are also documented online at:
7 | // .
8 |
9 | #include <unicorn.h>
10 | #include <stdio.h>
11 | #include <string.h>
12 | #include <stdbool.h>
13 |
14 | int main(int argc, char *argv[])
15 | {
16 | // This code example demonstrates comparing strings for canonical equivalence.
17 | // Canonical equivalence means the graphemes are compared, not the code points
18 | // or code units. It interprets precomposed and decomposed characters that
19 | // represent the same grapheme as equivalent.
20 |
21 | const char *src1 = u8"ma\u0301scara"; // 'a' + U+0301 = á (decomposed)
22 | const char *src2 = u8"m\u00E1scara"; // U+00E1 = á (precomposed)
23 |
24 | char dst1[16];
25 | char dst2[16];
26 |
27 | unisize dst1_len = 16; // capacity of dst1, passed by pointer to uni_norm below
28 | unisize dst2_len = 16; // capacity of dst2, passed by pointer to uni_norm below
29 |
30 | bool is_equal = false;
31 |
32 | // The following function normalizes two strings, performs the comparison, and
33 | // prints the result. This approach is recommended when two strings are compared
34 | // in a one-off comparison check. If you'll be repeatedly comparing the same
35 | // strings over and over again, then the approach in the next fragment is preferred.
36 | if (uni_normcmp(src1, -1, UNI_UTF8, src2, -1, UNI_UTF8, &is_equal) == UNI_OK)
37 | {
38 | if (is_equal)
39 | {
40 | puts("equal");
41 | }
42 | else
43 | {
44 | puts("not equal");
45 | }
46 | }
47 |
48 | // The following lines normalize two strings and compare the results.
49 | // This more manual approach is recommended when you'll be comparing the same
50 | // strings over and over again because you can normalize them _once_ and then
51 | // re-use the results for each subsequent comparison.
52 | if (uni_norm(UNI_NFD, src1, -1, UNI_UTF8, dst1, &dst1_len, UNI_UTF8 | UNI_NULIFY) == UNI_OK)
53 | {
54 | if (uni_norm(UNI_NFD, src2, -1, UNI_UTF8, dst2, &dst2_len, UNI_UTF8 | UNI_NULIFY) == UNI_OK)
55 | {
56 | if (strcmp(dst1, dst2) == 0) // strcmp needs terminated strings; presumably UNI_NULIFY makes uni_norm null-terminate its output
57 | {
58 | puts("equal");
59 | }
60 | else
61 | {
62 | puts("not equal");
63 | }
64 | }
65 | }
66 | return 0;
67 | }
68 |
--------------------------------------------------------------------------------
/man/uniattr.3:
--------------------------------------------------------------------------------
1 | .TH "UNICORN" "3" "Dec 6th 2025" "Unicorn 1.3.1"
2 | .SH NAME
3 | uniattr \- text attributes
4 | .SH LIBRARY
5 | Embeddable Unicode Algorithms (libunicorn, -lunicorn)
6 | .SH SYNOPSIS
7 | .nf
8 | .B #include <unicorn.h>
9 | .PP
10 | .BI "typedef uint32_t uniattr;"
11 | .fi
12 | .SH DESCRIPTION
13 | This type represents a bit mask of text attributes.
14 | Unless documented otherwise, all functions that accept a \f[B]uniattr\f[R] must observe the following rules:
15 | .PP
16 | There must be exactly one of the following character encoding flags present.
17 | .PP
18 | .RS
19 | .IP \[bu] 2
20 | \f[B]UNI_SCALAR\f[R](3)
21 | .IP \[bu] 2
22 | \f[B]UNI_UTF8\f[R](3)
23 | .IP \[bu] 2
24 | \f[B]UNI_UTF16\f[R](3)
25 | .IP \[bu] 2
26 | \f[B]UNI_UTF32\f[R](3)
27 | .RE
28 | .PP
29 | There can optionally be one of the following endian flags.
30 | If none are present, then native byte order is assumed.
31 | These flags are incompatible with \f[B]UNI_SCALAR\f[R](3) which is always in native byte order.
32 | These flags have no effect with \f[B]UNI_UTF8\f[R](3) because UTF-8 is endian independent.
33 | .PP
34 | .RS
35 | .IP \[bu] 2
36 | \f[B]UNI_NATIVE\f[R](3)
37 | .IP \[bu] 2
38 | \f[B]UNI_LITTLE\f[R](3)
39 | .IP \[bu] 2
40 | \f[B]UNI_BIG\f[R](3)
41 | .RE
42 | .PP
43 | There can optionally be any combination of the following flags.
44 | .PP
45 | .RS
46 | .IP \[bu] 2
47 | \f[B]UNI_TRUST\f[R](3)
48 | .IP \[bu] 2
49 | \f[B]UNI_NULIFY\f[R](3)
50 | .RE
51 | .PP
52 | The following example demonstrates using these flags with \f[B]uni_next\f[R](3) to parse the first code point of a UTF-16 big endian string.
53 | .PP
54 | .in +4n
55 | .EX
56 | const char16_t text[] = u"Hello, World!";
57 | int index = 0;
58 | unichar cp;
59 | uni_next(text, -1, UNI_UTF16 | UNI_BIG, &index, &cp);
60 | .EE
61 | .in
62 | .SH SEE ALSO
63 | .BR UNI_SCALAR (3),
64 | .BR UNI_UTF8 (3),
65 | .BR UNI_UTF16 (3),
66 | .BR UNI_UTF32 (3),
67 | .BR UNI_NATIVE (3),
68 | .BR UNI_LITTLE (3),
69 | .BR UNI_BIG (3),
70 | .BR UNI_TRUST (3),
71 | .BR UNI_NULIFY (3),
72 | .BR uni_next (3)
73 | .SH AUTHOR
74 | .UR https://railgunlabs.com
75 | Railgun Labs
76 | .UE .
77 | .SH INTERNET RESOURCES
78 | The online documentation is published on the
79 | .UR https://railgunlabs.com/unicorn
80 | Railgun Labs website
81 | .UE .
82 | .SH LICENSING
83 | Unicorn is distributed with its end-user license agreement (EULA).
84 | Please review the agreement for information on terms & conditions for accessing or otherwise using Unicorn and for a DISCLAIMER OF ALL WARRANTIES.
85 |
--------------------------------------------------------------------------------
/scripts/decomposition.py:
--------------------------------------------------------------------------------
1 | # Unicorn - Embeddable Unicode Algorithms
2 | # Copyright (c) 2024-2025 Railgun Labs
3 | #
4 | # This software is dual-licensed: you can redistribute it and/or modify
5 | # it under the terms of the GNU General Public License version 3 as
6 | # published by the Free Software Foundation. For the terms of this
7 | # license, see .
8 | #
9 | # Alternatively, you can license this software under a proprietary
10 | # license, as set out in .
11 |
12 | from typing import List, Set, Type, Tuple
13 | import zipfile
14 | import json
15 |
16 | from .config import Config, OptimizeFor
17 | from .tools import Codepoint, Codespace
18 | from .feature import Feature
19 | from .basics import StorageSize, SerializationData
20 | from .ccc import CanonicalCombiningClass
21 | from .encoding import EncodingUTF8
22 |
class CanonicalDecompositionFeature(Feature):
    """Feature that records a canonical decomposition mapping offset per code point.

    The mapping data is read from a JSON member of the data archive; which
    member is used depends on ``config.optimize``.
    """

    @staticmethod
    def dependencies() -> Set[Type[Feature]]:
        """Return the features that must be enabled together with this one."""
        return set([CanonicalCombiningClass, EncodingUTF8])

    def pre_process(self, codespace: Codespace) -> None:
        """Register the property (default 0, 16-bit storage) holding the mapping offset."""
        codespace.register_property("canonical_decomposition_mapping_offset", 0, StorageSize.UINT16)

    def process(self, archive: zipfile.ZipFile, codespace: Codespace, config: Config) -> SerializationData:
        """Load the decomposition mappings and assign an offset to each listed code point.

        Args:
            archive: Zip archive containing ``normalize_for_speed.json`` and
                ``normalize_for_size.json``.
            codespace: Code space whose registered property receives the offsets.
            config: Build configuration; ``config.optimize`` selects the JSON member.

        Returns:
            The source text, header text, public feature macro and data size
            wrapped in a ``SerializationData``.
        """
        decomp_property = codespace.get("canonical_decomposition_mapping_offset")
        if config.optimize == OptimizeFor.SPEED:
            member = 'normalize_for_speed.json'
        else:
            member = 'normalize_for_size.json'

        # The context manager closes the archive member even when decoding fails.
        with archive.open(member) as file:
            data = json.loads(file.read())
        mappings: List[Tuple[Codepoint,int]] = data["mappings"]
        source: str = data["source"]
        header: str = data["header"]
        size: int = data["size"]

        # Compute the size (in bytes) occupied by the decomposition mapping data.
        # NOTE(review): for the speed layout "size" is presumably a character
        # count that must be scaled by the character width - confirm.
        if config.optimize == OptimizeFor.SPEED:
            size *= config.character_storage_bytes()

        # Associate code points with their canonical decomposition mapping.
        for cp, offset in ((mapping[0], mapping[1]) for mapping in mappings):
            codespace.set(cp, decomp_property, offset)

        public_header = "#define UNICORN_FEATURE_NFD\n"
        return SerializationData(source, header, public_header, size)
56 |
--------------------------------------------------------------------------------
/scripts/composition.py:
--------------------------------------------------------------------------------
1 | # Unicorn - Embeddable Unicode Algorithms
2 | # Copyright (c) 2024-2025 Railgun Labs
3 | #
4 | # This software is dual-licensed: you can redistribute it and/or modify
5 | # it under the terms of the GNU General Public License version 3 as
6 | # published by the Free Software Foundation. For the terms of this
7 | # license, see .
8 | #
9 | # Alternatively, you can license this software under a proprietary
10 | # license, as set out in .
11 |
12 | from typing import List, Set, Tuple, Type
13 | import zipfile
14 | import json
15 |
16 | from .config import Config
17 | from .tools import Codepoint, Codespace
18 | from .feature import Feature, IS_COMPOSABLE, FLAGS_PROPERTY
19 | from .basics import StorageSize, SerializationData
20 | from .decomposition import CanonicalDecompositionFeature
21 |
class CanonicalCompositionFeature(Feature):
    """Feature that records canonical composition mapping data per code point.

    For every code point listed in ``composition.json`` it stores an offset,
    a count, and sets the ``IS_COMPOSABLE`` bit in the shared flags property.
    """

    @staticmethod
    def dependencies() -> Set[Type[Feature]]:
        """Return the features that must be enabled together with this one."""
        return set([CanonicalDecompositionFeature])

    def pre_process(self, codespace: Codespace) -> None:
        """Register the offset, count and flags properties used by ``process``."""
        codespace.register_property("canonical_composition_mapping_offset", 0, StorageSize.UINT16)
        codespace.register_property("canonical_composition_mapping_count", 0, StorageSize.UINT8)
        codespace.register_property(*FLAGS_PROPERTY)

    def process(self, archive: zipfile.ZipFile, codespace: Codespace, config: Config) -> SerializationData:
        """Load the composition mappings and attach them to their code points.

        Args:
            archive: Zip archive containing ``composition.json``.
            codespace: Code space whose registered properties are populated.
            config: Build configuration (not consulted by this feature).

        Returns:
            The source text, header text, public feature macro and data size
            wrapped in a ``SerializationData``.
        """
        # The context manager closes the archive member even when decoding fails.
        with archive.open('composition.json') as file:
            data = json.loads(file.read())
        mappings: List[Tuple[Codepoint,int,int]] = data["compositionMappings"]
        size: int = data["compositionSize"]
        source: str = data["compositionSource"]
        header: str = data["compositionHeader"]

        offset_property = codespace.get("canonical_composition_mapping_offset")
        count_property = codespace.get("canonical_composition_mapping_count")
        flags_property = codespace.get(FLAGS_PROPERTY[0])

        # Each entry is a (code point, offset, count) triplet.
        for value_triplet in mappings:
            first = value_triplet[0]
            offset = value_triplet[1]
            count = value_triplet[2]
            codespace.set(first, offset_property, offset)
            codespace.set(first, count_property, count)
            codespace.set_bitwise(first, flags_property, IS_COMPOSABLE)

        public_header = "#define UNICORN_FEATURE_NFC\n"
        return SerializationData(source, header, public_header, size)
55 |
--------------------------------------------------------------------------------
/man/uni_sortkeycmp.3:
--------------------------------------------------------------------------------
1 | .TH "UNICORN" "3" "Dec 6th 2025" "Unicorn 1.3.1"
2 | .SH NAME
3 | uni_sortkeycmp \- compare sort keys
4 | .SH LIBRARY
5 | Embeddable Unicode Algorithms (libunicorn, -lunicorn)
6 | .SH SYNOPSIS
7 | .nf
8 | .B #include <unicorn.h>
9 | .PP
10 | .BI "unistat uni_sortkeycmp(const uint16_t *" sk1 ", size_t " sk1_len ", const uint16_t *" sk2 ", size_t " sk2_len ", int32_t *" result ");"
11 | .fi
12 | .SH DESCRIPTION
13 | This function compares sort keys \f[I]sk1\f[R] and \f[I]sk2\f[R] and writes either -1, 0, or 1 to \f[I]result\f[R] depending on if \f[C]sk1 < sk2\f[R], \f[C]sk1 = sk2\f[R], or \f[C]sk1 > sk2\f[R].
14 | The sort keys must be generated with \f[B]uni_sortkeymk\f[R](3).
15 | .SH RETURN VALUE
16 | .TP
17 | UNI_OK
18 | If the collation algorithm executed successfully.
19 | .TP
20 | UNI_BAD_OPERATION
21 | If \f[I]sk1\f[R], \f[I]sk2\f[R], or \f[I]result\f[R] are null.
22 | .SH EXAMPLES
23 | This example constructs two sort keys and compares them.
24 | Sort keys must be generated with the same settings for their order to make sense.
25 | In this example, the sort keys are constructed with \f[B]UNI_SHIFTED\f[R] weighting and \f[B]UNI_PRIMARY\f[R] strength.
26 | In a real application you would cache the sort keys for future comparisons.
27 | For one-off comparisons, use \f[B]uni_collate\f[R](3).
28 | .PP
29 | .in +4n
30 | .EX
31 | #include
32 | #include
33 |
34 | int main(void)
35 | {
36 | const char *s1 = "hello", *s2 = "Hi";
37 | uint16_t sk1[32] = {0}, sk2[16] = {0};
38 | size_t sk1_len = 32, sk2_len = 16;
39 |
40 | if (uni_sortkeymk(s1, -1, UNI_UTF8, UNI_SHIFTED, UNI_PRIMARY, sk1, &sk1_len) != UNI_OK ||
41 | uni_sortkeymk(s2, -1, UNI_UTF8, UNI_SHIFTED, UNI_PRIMARY, sk2, &sk2_len) != UNI_OK)
42 | {
43 | puts("failed to construct sort keys");
44 | return 1;
45 | }
46 |
47 | int32_t result;
48 | if (uni_sortkeycmp(sk1, sk1_len, sk2, sk2_len, &result) != UNI_OK)
49 | {
50 | puts("failed to compare sort keys");
51 | return 1;
52 | }
53 |
54 | printf("%d", result);
55 | return 0;
56 | }
57 | .EE
58 | .in
59 | .SH SEE ALSO
60 | .BR uni_sortkeymk (3)
61 | .SH AUTHOR
62 | .UR https://railgunlabs.com
63 | Railgun Labs
64 | .UE .
65 | .SH INTERNET RESOURCES
66 | The online documentation is published on the
67 | .UR https://railgunlabs.com/unicorn
68 | Railgun Labs website
69 | .UE .
70 | .SH LICENSING
71 | Unicorn is distributed with its end-user license agreement (EULA).
72 | Please review the agreement for information on terms & conditions for accessing or otherwise using Unicorn and for a DISCLAIMER OF ALL WARRANTIES.
73 |
--------------------------------------------------------------------------------
/man/uni_setmemfunc.3:
--------------------------------------------------------------------------------
1 | .TH "UNICORN" "3" "Dec 6th 2025" "Unicorn 1.3.1"
2 | .SH NAME
3 | uni_setmemfunc \- register a memory allocator
4 | .SH LIBRARY
5 | Embeddable Unicode Algorithms (libunicorn, -lunicorn)
6 | .SH SYNOPSIS
7 | .nf
8 | .B #include <unicorn.h>
9 | .PP
10 | .BI "unistat uni_setmemfunc(void *" user_data ", unimemfunc " allocf ");"
11 | .fi
12 | .SH DESCRIPTION
13 | Sets \f[I]allocf\f[R] as the implementation for dynamic memory allocations.
14 | If \f[I]allocf\f[R] is null then the implementation reverts to its default allocator which may be the C standard allocator or a dummy allocator that always fails to allocate memory.
15 | The latter is only present when the C standard library allocators are disabled.
16 | .PP
17 | Support for C standard library allocators must be enabled in the JSON configuration file.
18 | If the standard allocators are not enabled, then dummy allocators that always return NULL are used.
19 | In this case, a custom allocator must be supplied.
20 | .PP
21 | .in +4n
22 | .EX
23 | {
24 | "hasStandardAllocators": true
25 | }
26 | .EE
27 | .in
28 | .SH RETURN VALUE
29 | .TP
30 | UNI_OK
31 | If the memory allocator was modified.
32 | .TP
33 | UNI_FEATURE_DISABLED
34 | If \f[I]allocf\f[R] is null and Unicorn was built without support for the C standard library memory allocator.
35 | .SH EXAMPLES
36 | This example registers a custom memory allocator with Unicorn.
37 | In the example, the custom allocator wraps the C standard memory allocators; however, this is redundant because Unicorn already initializes with a custom allocator that wraps the C standard allocators.
38 | They are wrapped here for example purposes only.
39 | An actual implementation would implement and wrap its own custom memory allocator.
40 | .PP
41 | .in +4n
42 | .EX
43 | #include
44 | #include
45 |
46 | static void *allocfree(void *ud, void *ptr, size_t osize, size_t nsize)
47 | {
48 | if (nsize == 0)
49 | {
50 | free(ptr);
51 | return NULL;
52 | }
53 | else
54 | {
55 | return realloc(ptr, nsize);
56 | }
57 | }
58 |
59 | int main(void)
60 | {
61 | uni_setmemfunc(NULL, allocfree);
62 | return 0;
63 | }
64 | .EE
65 | .in
66 | .SH SEE ALSO
67 | .BR unimemfunc (3)
68 | .SH AUTHOR
69 | .UR https://railgunlabs.com
70 | Railgun Labs
71 | .UE .
72 | .SH INTERNET RESOURCES
73 | The online documentation is published on the
74 | .UR https://railgunlabs.com/unicorn
75 | Railgun Labs website
76 | .UE .
77 | .SH LICENSING
78 | Unicorn is distributed with its end-user license agreement (EULA).
79 | Please review the agreement for information on terms & conditions for accessing or otherwise using Unicorn and for a DISCLAIMER OF ALL WARRANTIES.
80 |
--------------------------------------------------------------------------------
/scripts/quickcheck.py:
--------------------------------------------------------------------------------
1 | # Unicorn - Embeddable Unicode Algorithms
2 | # Copyright (c) 2024-2025 Railgun Labs
3 | #
4 | # This software is dual-licensed: you can redistribute it and/or modify
5 | # it under the terms of the GNU General Public License version 3 as
6 | # published by the Free Software Foundation. For the terms of this
7 | # license, see .
8 | #
9 | # Alternatively, you can license this software under a proprietary
10 | # license, as set out in .
11 |
12 | from typing import Dict, Set, Type
13 | import zipfile
14 | import json
15 |
16 | from .config import Config
17 | from .tools import Codespace
18 | from .feature import Feature
19 | from .basics import StorageSize, SerializationData
20 | from .ccc import CanonicalCombiningClass
21 |
class NFCQuickCheck(Feature):
    """Feature that stores the NFC quick check value of each listed code point."""

    @staticmethod
    def dependencies() -> Set[Type[Feature]]:
        """Return the features that must be enabled together with this one."""
        return set([CanonicalCombiningClass])

    def pre_process(self, codespace: Codespace) -> None:
        """Register the 8-bit ``quick_check_flags`` property (default 0)."""
        codespace.register_property("quick_check_flags", 0, StorageSize.UINT8)

    def process(self, archive: zipfile.ZipFile, codespace: Codespace, _: Config) -> SerializationData:
        """Load the NFC quick check table and merge it into ``quick_check_flags``.

        Args:
            archive: Zip archive containing ``quickcheck.json``.
            codespace: Code space whose ``quick_check_flags`` property is updated.
            _: Build configuration (unused).

        Returns:
            A ``SerializationData`` carrying only the public feature macro.
        """
        # The context manager closes the archive member even when decoding fails.
        with archive.open('quickcheck.json') as file:
            data = json.loads(file.read())
        nfc_quick_check: Dict[str,int] = data["quickCheckNFC"]

        qc_flags_property = codespace.get("quick_check_flags")
        for cp, qc_value in nfc_quick_check.items():
            # Keys are hexadecimal code point strings. The value is shifted left
            # by four bits; NFDQuickCheck stores its value unshifted in the same
            # property, so presumably the two occupy separate nibbles.
            codespace.set_bitwise(int(cp,16), qc_flags_property, qc_value << 4)

        public_header = "#define UNICORN_FEATURE_NFC_QUICK_CHECK\n"
        return SerializationData(public_header=public_header)
42 |
class NFDQuickCheck(Feature):
    """Feature that stores the NFD quick check value of each listed code point."""

    @staticmethod
    def dependencies() -> Set[Type[Feature]]:
        """Return the features that must be enabled together with this one."""
        return set([CanonicalCombiningClass])

    def pre_process(self, codespace: Codespace) -> None:
        """Register the 8-bit ``quick_check_flags`` property (default 0)."""
        codespace.register_property("quick_check_flags", 0, StorageSize.UINT8)

    def process(self, archive: zipfile.ZipFile, codespace: Codespace, _: Config) -> SerializationData:
        """Load the NFD quick check table and merge it into ``quick_check_flags``.

        Args:
            archive: Zip archive containing ``quickcheck.json``.
            codespace: Code space whose ``quick_check_flags`` property is updated.
            _: Build configuration (unused).

        Returns:
            A ``SerializationData`` carrying only the public feature macro.
        """
        # The context manager closes the archive member even when decoding fails.
        with archive.open('quickcheck.json') as file:
            data = json.loads(file.read())
        nfd_quick_check: Dict[str,int] = data["quickCheckNFD"]

        qc_flags_property = codespace.get("quick_check_flags")
        for cp, qc_value in nfd_quick_check.items():
            # Keys are hexadecimal code point strings; the value is merged
            # unshifted into the flags property.
            codespace.set_bitwise(int(cp,16), qc_flags_property, qc_value)

        public_header = "#define UNICORN_FEATURE_NFD_QUICK_CHECK\n"
        return SerializationData(public_header=public_header)
63 |
--------------------------------------------------------------------------------
/man/uni_next.3:
--------------------------------------------------------------------------------
1 | .TH "UNICORN" "3" "Dec 6th 2025" "Unicorn 1.3.1"
2 | .SH NAME
3 | uni_next \- decode a scalar value
4 | .SH LIBRARY
5 | Embeddable Unicode Algorithms (libunicorn, -lunicorn)
6 | .SH SYNOPSIS
7 | .nf
8 | .B #include <unicorn.h>
9 | .PP
10 | .BI "unistat uni_next(const void *" text ", unisize " text_len ", uniattr " text_attr ", unisize *" index ", unichar *" cp ");"
11 | .fi
12 | .SH DESCRIPTION
13 | This function decodes one Unicode scalar value from \f[I]text\f[R] at code unit \f[I]index\f[R] and writes the result to \f[I]cp\f[R].
14 | The \f[I]index\f[R] parameter is updated by the implementation to refer to the code unit beginning the next scalar.
15 | .PP
16 | It returns \f[B]UNI_DONE\f[R] when iteration has reached the end of \f[I]text\f[R], indicating there are no more characters.
17 | If the implementation detects \f[I]text\f[R] is malformed, then it returns \f[B]UNI_BAD_ENCODING\f[R].
18 | .PP
19 | The \f[I]index\f[R] parameter must refer to a code point boundary otherwise the behavior is undefined.
20 | .SH RETURN VALUE
21 | .TP
22 | UNI_OK
23 | If the scalar was successfully decoded.
24 | .TP
25 | UNI_DONE
26 | If the end of \f[I]text\f[R] was reached.
27 | .TP
28 | UNI_BAD_ENCODING
29 | If \f[I]text\f[R] is malformed; this is never returned if \f[I]text_attr\f[R] has \f[B]UNI_TRUST\f[R](3).
30 | .TP
31 | UNI_BAD_OPERATION
32 | If \f[I]text\f[R], \f[I]index\f[R], or \f[I]cp\f[R] are NULL.
33 | .SH EXAMPLES
34 | This example prints each Unicode scalar value from a text string encoded as UTF-8.
35 | .PP
36 | .in +4n
37 | .EX
38 | #include
39 | #include
40 |
41 | int main(void)
42 | {
43 | const char str[] = u8"I 🕵️."; // I spy
44 | unisize i = 0;
45 | for (;;)
46 | {
47 | unichar cp;
48 | unistat r = uni_next(str, -1, UNI_UTF8, &i, &cp);
49 | if (r == UNI_DONE)
50 | {
51 | break;
52 | }
53 | else if (r == UNI_BAD_ENCODING)
54 | {
55 | // malformed character
56 | }
57 | else
58 | {
59 | printf("U+%04X\\n", cp); // print scalar
60 | }
61 | }
62 | return 0;
63 | }
64 | .EE
65 | .in
66 | .SH SEE ALSO
67 | .BR unistat (3),
68 | .BR UNI_TRUST (3),
69 | .BR unisize (3),
70 | .BR uniattr (3),
71 | .BR unichar (3)
72 | .SH AUTHOR
73 | .UR https://railgunlabs.com
74 | Railgun Labs
75 | .UE .
76 | .SH INTERNET RESOURCES
77 | The online documentation is published on the
78 | .UR https://railgunlabs.com/unicorn
79 | Railgun Labs website
80 | .UE .
81 | .SH LICENSING
82 | Unicorn is distributed with its end-user license agreement (EULA).
83 | Please review the agreement for information on terms & conditions for accessing or otherwise using Unicorn and for a DISCLAIMER OF ALL WARRANTIES.
84 |
--------------------------------------------------------------------------------
/examples/collate_text.c:
--------------------------------------------------------------------------------
1 | /*
2 | * This file is public domain.
3 | * You may use, modify, and distribute it without restriction.
4 | */
5 |
6 | // Examples are also documented online at:
7 | // .
8 |
9 | #include
10 | #include
11 |
12 | int main(int argc, char *argv[])
13 | {
14 | // This code example demonstrates how to collate strings (compare strings for sorting).
15 | // The online documentation is available here:
16 | // .
17 |
18 | const char *s1 = u8"Hello World";
19 | const char *s2 = u8"こんにちは世界";
20 |
21 | uint16_t sk1[64] = {0};
22 | uint16_t sk2[64] = {0};
23 |
24 | size_t sk1_len = 64;
25 | size_t sk2_len = 64;
26 |
27 | int32_t result = 0;
28 |
29 | // The following snippet collates two strings and prints the result of the comparison.
30 | // This function, conceptually, builds sort keys for the input strings and compares them.
31 | // Building a sort key is expensive. If strings are collated multiple times, it is recommend
32 | // to build the sort keys once and use them for subsequent comparisons.
33 | if (uni_collate(s1, -1, UNI_UTF8, s2, -1, UNI_UTF8, UNI_NON_IGNORABLE, UNI_PRIMARY, &result) == UNI_OK)
34 | {
35 | if (result < 0)
36 | {
37 | puts("s1 < s2");
38 | }
39 | else if (result > 0)
40 | {
41 | puts("s1 > s2");
42 | }
43 | else
44 | {
45 | puts("s1 = s2");
46 | }
47 | }
48 |
49 | // The following snippet collates strings the manual way. The first step is to
50 | // generate a "sort key" for both strings. The sort key encodes the sorting
51 | // weights of the string.
52 | //
53 | // Constructing a sort key is the computationally expensive part of the process.
54 | // Once the sort keys are constructed, they are compared which is inexpensive.
55 | if (uni_sortkeymk(s1, -1, UNI_UTF8, UNI_NON_IGNORABLE, UNI_PRIMARY, sk1, &sk1_len) == UNI_OK)
56 | {
57 | if (uni_sortkeymk(s2, -1, UNI_UTF8, UNI_NON_IGNORABLE, UNI_PRIMARY, sk2, &sk2_len) == UNI_OK)
58 | {
59 | if (uni_sortkeycmp(sk1, sk1_len, sk2, sk2_len, &result) == UNI_OK)
60 | {
61 | if (result < 0)
62 | {
63 | puts("s1 < s2");
64 | }
65 | else if (result > 0)
66 | {
67 | puts("s1 > s2");
68 | }
69 | else
70 | {
71 | puts("s1 = s2");
72 | }
73 | }
74 | }
75 | }
76 | return 0;
77 | }
78 |
--------------------------------------------------------------------------------
/src/memory.c:
--------------------------------------------------------------------------------
1 | /*
2 | * Unicorn - Embeddable Unicode Algorithms
3 | * Copyright (c) 2024-2025 Railgun Labs
4 | *
5 | * This software is dual-licensed: you can redistribute it and/or modify
6 | * it under the terms of the GNU General Public License version 3 as
7 | * published by the Free Software Foundation. For the terms of this
8 | * license, see .
9 | *
10 | * Alternatively, you can license this software under a proprietary
11 | * license, as set out in .
12 | */
13 |
14 | #include "common.h"
15 | #include "unidata.h"
16 |
17 | #if defined(UNICORN_HAVE_C_MEMORY_ROUTINES)
18 | #include
/*
 * Default allocator callback backed by the C standard library.
 *
 * A request with nsize == 0 frees ptr and yields NULL. Any other request is
 * forwarded to realloc(ptr, nsize) and its result is returned unchanged.
 * The user data pointer (ud) and the old size (osize) are ignored.
 */
static void *default_allocf(void *ud, void *ptr, size_t osize, size_t nsize)
{
    (void)ud;
    (void)osize;
    void *mem = NULL;
    if (nsize == (size_t)0)
    {
        free(ptr); // cppcheck-suppress misra-c2012-21.3 ; This function is optional.
    }
    else
    {
        mem = realloc(ptr, nsize); // cppcheck-suppress misra-c2012-21.3 ; This function is optional.
    }
    return mem;
}
34 | static unimemfunc unicorn_allocf = &default_allocf;
35 | #else
36 | static unimemfunc unicorn_allocf;
37 | #endif
38 |
39 | static void *unicorn_ud;
40 |
/*
 * Registers allocf (with its user_data) as the allocator used by uni_malloc(),
 * uni_realloc() and uni_free().
 *
 * Passing a null allocf clears the user data and restores the default: the C
 * standard library wrapper when UNICORN_HAVE_C_MEMORY_ROUTINES is defined,
 * otherwise no allocator at all, in which case UNI_FEATURE_DISABLED is
 * returned. In every other case the function returns UNI_OK.
 */
UNICORN_API unistat uni_setmemfunc(void *user_data, unimemfunc allocf) // cppcheck-suppress misra-c2012-8.7 ; This is supposed to have external linkage.
{
    unistat status = UNI_OK;
    if (allocf == NULL)
    {
        // Reverting to the default: any previously registered user data is dropped.
        unicorn_ud = NULL;
#if defined(UNICORN_HAVE_C_MEMORY_ROUTINES)
        unicorn_allocf = &default_allocf;
#else
        // No built-in allocator is available; allocation requests will yield NULL.
        unicorn_allocf = NULL;
        status = UNI_FEATURE_DISABLED;
#endif
    }
    else
    {
        unicorn_ud = user_data;
        unicorn_allocf = allocf;
    }
    return status;
}
61 |
/*
 * Requests size bytes from the registered allocator by calling it with a null
 * old pointer and an old size of zero. Returns the allocator's result, or NULL
 * when no allocator is registered.
 */
void *uni_malloc(size_t size)
{
    void *ptr = NULL;
    if (unicorn_allocf != NULL) // LCOV_EXCL_BR_LINE: Branch taken only by the features.json combinations tests.
    {
        ptr = unicorn_allocf(unicorn_ud, NULL, 0, size);
    }
    return ptr;
}
71 |
/*
 * Asks the registered allocator to resize old_ptr from old_size to new_size
 * bytes. Returns the allocator's result, or NULL when no allocator is
 * registered.
 */
void *uni_realloc(void *old_ptr, size_t old_size, size_t new_size)
{
    void *new_ptr = NULL;
    if (unicorn_allocf != NULL) // LCOV_EXCL_BR_LINE: Branch taken only by the features.json combinations tests.
    {
        new_ptr = unicorn_allocf(unicorn_ud, old_ptr, old_size, new_size);
    }
    return new_ptr;
}
81 |
/*
 * Releases ptr (of the given size) by calling the registered allocator with a
 * new size of zero; the allocator's return value is discarded. Does nothing
 * when no allocator is registered.
 */
void uni_free(void *ptr, size_t size)
{
    if (unicorn_allocf != NULL) // LCOV_EXCL_BR_LINE: Branch taken only by the features.json combinations tests.
    {
        (void)unicorn_allocf(unicorn_ud, ptr, size, 0);
    }
}
89 |
--------------------------------------------------------------------------------
/UnicornConfig.cmake.in:
--------------------------------------------------------------------------------
1 | #[=======================================================================[.rst:
2 | FindUnicorn
3 | ------------
4 |
5 | Finds the Unicorn library.
6 |
7 | Imported Targets
8 | ^^^^^^^^^^^^^^^^
9 |
10 | This module provides the following imported targets, if found:
11 |
12 | ``Railgun::Unicorn``
13 | The Unicorn library
14 |
15 | Result Variables
16 | ^^^^^^^^^^^^^^^^
17 |
18 | This will define the following variables:
19 |
20 | ``UNICORN_FOUND``
21 | True if the system has the Unicorn library.
22 | ``UNICORN_VERSION``
23 | The version of the Unicorn library which was found.
24 | ``UNICORN_INCLUDE_DIRS``
25 | Include directories needed to use Unicorn.
26 | ``UNICORN_LIBRARIES``
27 | Libraries needed to link to Unicorn.
28 |
29 | Cache Variables
30 | ^^^^^^^^^^^^^^^
31 |
32 | The following cache variables may also be set:
33 |
34 | ``UNICORN_INCLUDE_DIR``
35 | The directory containing ``unicorn.h``.
36 | ``UNICORN_LIBRARY``
37 | The path to the Unicorn library.
38 |
39 | #]=======================================================================]
40 |
# On Windows the header and library are installed in the same directory as this
# script; therefore, the directory containing this script is obtained here so
# it can be used as a search hint below.
get_filename_component(UNICORN_CONFIG_DIR "${CMAKE_CURRENT_LIST_FILE}" DIRECTORY)

# Find the directory containing the public header (unicorn.h).
find_path(UNICORN_INCLUDE_DIR unicorn.h
    HINTS /usr/local/include /usr/include ${UNICORN_CONFIG_DIR})

# Find the library; both "Unicorn" and "unicorn" spellings are accepted.
find_library(UNICORN_LIBRARY
    NAMES Unicorn unicorn
    HINTS /usr/local/lib /usr/lib ${UNICORN_CONFIG_DIR})

# The script directory is not needed anymore.
unset(UNICORN_CONFIG_DIR)

# Use the version specified in the top-level CMakeLists.txt.
# NOTE(review): the @PROJECT_VERSION@ placeholder is presumably substituted
# when this .in template is configured - confirm.
set(UNICORN_VERSION "@PROJECT_VERSION@")

# Handle the QUIETLY and REQUIRED arguments and set UNICORN_FOUND to TRUE if all listed variables are TRUE.
include(FindPackageHandleStandardArgs)
FIND_PACKAGE_HANDLE_STANDARD_ARGS(Unicorn
    FOUND_VAR UNICORN_FOUND
    REQUIRED_VARS UNICORN_LIBRARY UNICORN_INCLUDE_DIR
    VERSION_VAR UNICORN_VERSION)

# Set the result variables documented at the top of this file.
if(UNICORN_FOUND)
    set(UNICORN_LIBRARIES ${UNICORN_LIBRARY})
    set(UNICORN_INCLUDE_DIRS ${UNICORN_INCLUDE_DIR})
endif()

# Export the Railgun::Unicorn imported target, unless it was already defined.
if(UNICORN_FOUND AND NOT TARGET Railgun::Unicorn)
    add_library(Railgun::Unicorn UNKNOWN IMPORTED)
    set_target_properties(Railgun::Unicorn PROPERTIES
        IMPORTED_LOCATION "${UNICORN_LIBRARY}"
        INTERFACE_INCLUDE_DIRECTORIES "${UNICORN_INCLUDE_DIR}")
endif()

# Cached variables should be hidden in the CMake interface unless the user explicitly asks to edit them.
mark_as_advanced(UNICORN_INCLUDE_DIR UNICORN_LIBRARY)
83 |
--------------------------------------------------------------------------------
/man/uni_caseconv.3:
--------------------------------------------------------------------------------
1 | .TH "UNICORN" "3" "Dec 6th 2025" "Unicorn 1.3.1"
2 | .SH NAME
3 | uni_caseconv \- case convert a string
4 | .SH LIBRARY
5 | Embeddable Unicode Algorithms (libunicorn, -lunicorn)
6 | .SH SYNOPSIS
7 | .nf
8 | .B #include <unicorn.h>
9 | .PP
10 | .BI "unistat uni_caseconv(unicaseconv " casing ", const void *" src ", unisize " src_len ", uniattr " src_attr ", void *" dst ", unisize *" dst_len ", uniattr " dst_attr ");"
11 | .fi
12 | .SH DESCRIPTION
13 | This function case converts \f[I]src\f[R] to casing form \f[I]casing\f[R] and writes the result to \f[I]dst\f[R].
14 | The capacity of \f[I]dst\f[R] is specified by \f[I]dst_len\f[R].
15 | .PP
16 | If \f[I]dst\f[R] is not null, then the implementation writes to \f[I]dst_len\f[R] the total number of code units written to \f[I]dst\f[R].
17 | If the capacity of \f[I]dst\f[R] is insufficient, then \f[B]UNI_NO_SPACE\f[R] is returned.
18 | .PP
19 | If \f[I]dst\f[R] is null and \f[I]dst_len\f[R] is zero, then the implementation writes to \f[I]dst_len\f[R] the number of code units in the fully case converted text and returns \f[B]UNI_OK\f[R].
20 | Call the function this way to first compute the total length of the destination buffer before calling it again with a sufficiently sized buffer.
21 | .SH RETURN VALUE
22 | .TP
23 | UNI_OK
24 | If \f[I]src\f[R] was case converted successfully.
25 | .TP
26 | UNI_BAD_OPERATION
27 | If \f[I]src\f[R] is null, if \f[I]dst_len\f[R] is null, or if \f[I]dst\f[R] is null and \f[I]dst_len\f[R] is non-zero.
28 | .TP
29 | UNI_BAD_ENCODING
30 | If \f[I]src\f[R] is not well-formed (checks are omitted if \f[I]src_attr\f[R] has \f[B]UNI_TRUST\f[R](3)).
31 | .TP
32 | UNI_NO_SPACE
33 | If \f[I]dst\f[R] was not large enough to accommodate the case converted text.
34 | .TP
35 | UNI_FEATURE_DISABLED
36 | If the library was built without support for case conversion.
37 | .SH EXAMPLES
38 | This example case converts a string to uppercase.
39 | .PP
40 | .in +4n
41 | .EX
42 | #include
43 | #include
44 |
45 | int main(void)
46 | {
47 | const char *in = u8"hello, 世界";
48 | char out[32];
49 | unisize outlen = sizeof(out);
50 |
51 | if (uni_caseconv(UNI_UPPER, in, -1, UNI_UTF8, out, &outlen, UNI_UTF8) != UNI_OK)
52 | {
53 | // something went wrong
54 | return 1;
55 | }
56 |
57 | printf("%.*s", outlen, out); // prints "HELLO, 世界"
58 | return 0;
59 | }
60 | .EE
61 | .in
62 | .SH SEE ALSO
63 | .BR unistat (3),
64 | .BR UNI_TRUST (3),
65 | .BR unicaseconv (3),
66 | .BR unisize (3),
67 | .BR uniattr (3)
68 | .SH AUTHOR
69 | .UR https://railgunlabs.com
70 | Railgun Labs
71 | .UE .
72 | .SH INTERNET RESOURCES
73 | The online documentation is published on the
74 | .UR https://railgunlabs.com/unicorn
75 | Railgun Labs website
76 | .UE .
77 | .SH LICENSING
78 | Unicorn is distributed with its end-user license agreement (EULA).
79 | Please review the agreement for information on terms & conditions for accessing or otherwise using Unicorn and for a DISCLAIMER OF ALL WARRANTIES.
80 |
--------------------------------------------------------------------------------
/scripts/case_folding.py:
--------------------------------------------------------------------------------
1 | # Unicorn - Embeddable Unicode Algorithms
2 | # Copyright (c) 2024-2025 Railgun Labs
3 | #
4 | # This software is dual-licensed: you can redistribute it and/or modify
5 | # it under the terms of the GNU General Public License version 3 as
6 | # published by the Free Software Foundation. For the terms of this
7 | # license, see .
8 | #
9 | # Alternatively, you can license this software under a proprietary
10 | # license, as set out in .
11 |
12 | from typing import List, Set, Type, Tuple
13 | import zipfile
14 | import json
15 |
16 | from .config import Config
17 | from .tools import Codespace, Codepoint
18 | from .feature import Feature, FLAGS_PROPERTY, IS_NORMALIZATION_NEEDED, IS_CHANGES_WHEN_CASEFOLDED
19 | from .basics import StorageSize, SerializationData
20 | from .decomposition import CanonicalDecompositionFeature
21 |
class DefaultCaseFolding(Feature):
    """Feature that records full case folding offsets and case-related flags."""

    def pre_process(self, codespace: Codespace) -> None:
        """Register the case fold offset property and the shared flags property."""
        codespace.register_property("full_casefold_mapping_offset", 0, StorageSize.UINT16)
        codespace.register_property(*FLAGS_PROPERTY)

    def process(self, archive: zipfile.ZipFile, codespace: Codespace, config: Config) -> SerializationData:
        """Load the case folding data and attach it to the listed code points.

        Args:
            archive: Zip archive containing ``casefold.json``.
            codespace: Code space whose registered properties are populated.
            config: Build configuration (not consulted by this feature).

        Returns:
            The source text, the header text extended with two flag macros,
            the public feature macro and the data size, wrapped in a
            ``SerializationData``.
        """
        # The context manager closes the archive member even when decoding fails.
        with archive.open('casefold.json') as file:
            data = json.loads(file.read())
        source: str = data["source"]
        header: str = data["header"]
        size: int = data["size"]
        has_greek: List[Codepoint] = data["hasGreek"]
        case_foldings: List[Tuple[Codepoint,int]] = data["caseFoldings"]
        changes_when_casefolded: List[Codepoint] = data["changesWhenCasefolded"]

        flags_property = codespace.get(FLAGS_PROPERTY[0])
        casefold_property = codespace.get("full_casefold_mapping_offset")

        # Code points from the "hasGreek" list are flagged as needing normalization.
        for cp in has_greek:
            codespace.set_bitwise(cp, flags_property, IS_NORMALIZATION_NEEDED)

        for cp in changes_when_casefolded:
            codespace.set_bitwise(cp, flags_property, IS_CHANGES_WHEN_CASEFOLDED)

        for cp,offset in case_foldings:
            codespace.set(cp, casefold_property, offset)

        # Expose the flag bit values to the generated C header.
        header += "#define UNICORN_CHAR_NEEDS_NORMALIZATION {0}u\n".format(IS_NORMALIZATION_NEEDED)
        header += "#define UNICORN_CHAR_CHANGES_WHEN_CASEFOLDED {0}u\n".format(IS_CHANGES_WHEN_CASEFOLDED)

        public_header = "#define UNICORN_FEATURE_CASEFOLD_DEFAULT\n"
        return SerializationData(source, header, public_header, size)
55 |
class CanonicalCaseFolding(Feature):
    """Feature switch for canonical case folding; it contributes no data of its own."""

    @staticmethod
    def dependencies() -> Set[Type[Feature]]:
        """Return the features that must be enabled together with this one."""
        return {DefaultCaseFolding, CanonicalDecompositionFeature}

    def process(self, archive: zipfile.ZipFile, codespace: Codespace, config: Config) -> SerializationData:
        """Return serialization data that carries only the public feature macro."""
        return SerializationData(public_header="#define UNICORN_FEATURE_CASEFOLD_CANONICAL\n")
64 |
--------------------------------------------------------------------------------
/man/unistat.3:
--------------------------------------------------------------------------------
1 | .TH "UNICORN" "3" "Dec 6th 2025" "Unicorn 1.3.1"
2 | .SH NAME
3 | unistat \- status code
4 | .SH LIBRARY
5 | Embeddable Unicode Algorithms (libunicorn, -lunicorn)
6 | .SH SYNOPSIS
7 | .nf
8 | .B #include <unicorn.h>
9 | .PP
10 | .B enum unistat {
11 | .RS
12 | .B UNI_OK,
13 | .B UNI_DONE,
14 | .B UNI_NO_MEMORY,
15 | .B UNI_NO_SPACE,
16 | .B UNI_BAD_ENCODING,
17 | .B UNI_BAD_OPERATION,
18 | .B UNI_FEATURE_DISABLED,
19 | .B UNI_MALFUNCTION,
20 | .RE
21 | .B };
22 | .fi
23 | .SH DESCRIPTION
24 | Most functions in the Unicorn library return a constant from this enumeration.
25 | Of the constants, only \f[B]UNI_OK\f[R] and \f[B]UNI_DONE\f[R] represent a success status whereas the others represent failures.
26 | .SH CONSTANTS
27 | .TP
28 | .BR UNI_OK
29 | Represents the successful execution of an operation.
30 | .TP
31 | .BR UNI_DONE
32 | Represents the successful completion of an operation.
33 | An example of a function that returns this status code is \f[B]uni_next\f[R](3) which returns it when the end of the input string has been reached.
34 | .TP
35 | .BR UNI_NO_MEMORY
36 | Functions that can dynamically allocate memory will list this constant as one of their potential return values.
37 | Functions that omit this constant as a return value do not perform dynamic memory allocation.
38 | .TP
39 | .BR UNI_NO_SPACE
40 | This failure code indicates the destination buffer is too small to receive the content and therefore a larger buffer must be used.
41 | .TP
42 | .BR UNI_BAD_ENCODING
43 | This failure code indicates a malformed character sequence was encountered when decoding text.
44 | Examples of malformed character sequences would be overlong sequences in UTF-8 or unpaired surrogate characters in UTF-16.
45 | .TP
46 | .BR UNI_BAD_OPERATION
47 | This failure code is returned by functions when they are called with invalid arguments.
48 | For example, a function might return this code when given a null pointer when a non-null pointer was expected.
49 | .TP
50 | .BR UNI_FEATURE_DISABLED
51 | This failure code indicates the requested operation cannot be performed because the feature is disabled.
52 | Unicorn allows consumers to customize the Unicode features it's built with.
53 | This is useful for reducing the footprint of the library.
54 | If a user attempts to call a function that uses a disabled feature of the library, then this status code will be returned.
55 | .TP
56 | .BR UNI_MALFUNCTION
57 | This failure code indicates a defect with the implementation.
58 | In a working version of Unicorn, an application will never see this status code.
59 | If an application encounters this code, it means there is a bug in Unicorn.
60 | .SH SEE ALSO
61 | .BR uni_next (3)
62 | .SH AUTHOR
63 | .UR https://railgunlabs.com
64 | Railgun Labs
65 | .UE .
66 | .SH INTERNET RESOURCES
67 | The online documentation is published on the
68 | .UR https://railgunlabs.com/unicorn
69 | Railgun Labs website
70 | .UE .
71 | .SH LICENSING
72 | Unicorn is distributed with its end-user license agreement (EULA).
73 | Please review the agreement for information on terms & conditions for accessing or otherwise using Unicorn and for a DISCLAIMER OF ALL WARRANTIES.
74 |
--------------------------------------------------------------------------------
/man/uni_norm.3:
--------------------------------------------------------------------------------
1 | .TH "UNICORN" "3" "Dec 6th 2025" "Unicorn 1.3.1"
2 | .SH NAME
3 | uni_norm \- normalize text
4 | .SH LIBRARY
5 | Embeddable Unicode Algorithms (libunicorn, -lunicorn)
6 | .SH SYNOPSIS
7 | .nf
8 | .B #include <unicorn.h>
9 | .PP
10 | .BI "unistat uni_norm(uninormform " form ", const void *" src ", unisize " src_len ", uniattr " src_attr ", void *" dst ", unisize *" dst_len ", uniattr " dst_attr ");"
11 | .fi
12 | .SH DESCRIPTION
13 | This function normalizes \f[I]src\f[R] and writes the result to the \f[I]dst\f[R] buffer.
14 | .PP
15 | The implementation writes to \f[I]dst_len\f[R] the total number of code units written to \f[I]dst\f[R].
16 | If the capacity of \f[I]dst\f[R] is insufficient, then \f[B]UNI_NO_SPACE\f[R] is returned.
17 | .PP
18 | If \f[I]dst\f[R] is null and \f[I]dst_len\f[R] is zero, then the function writes to \f[I]dst_len\f[R] the number of code units in the fully normalized text and returns \f[B]UNI_OK\f[R].
19 | The function can be called this way to first compute the total size of the destination buffer, then called again with a sufficiently sized buffer.
20 | .PP
21 | If \f[I]src_len\f[R] is -1, then \f[I]src\f[R] is assumed to be null-terminated.
22 | .SH RETURN VALUE
23 | .TP
24 | UNI_OK
25 | On success.
26 | .TP
27 | UNI_BAD_OPERATION
28 | If \f[I]src\f[R] is null, if \f[I]dst_len\f[R] is negative, or if \f[I]dst\f[R] is NULL and \f[I]dst_len\f[R] is greater than zero.
29 | .TP
30 | UNI_BAD_ENCODING
31 | If \f[I]src\f[R] is malformed; this is never returned if \f[I]src_attr\f[R] has \f[B]UNI_TRUST\f[R](3).
32 | .TP
33 | UNI_NO_SPACE
34 | If \f[I]dst\f[R] lacks the capacity to store the normalization of \f[I]src\f[R].
35 | .TP
36 | UNI_NO_MEMORY
37 | If dynamic memory allocation failed.
38 | .TP
39 | UNI_FEATURE_DISABLED
40 | If Unicorn was built without support for normalizing to \f[I]form\f[R].
41 | .SH EXAMPLES
42 | This example normalizes the input string to Normalization Form D. This form is ideal for in-memory string comparison because it is quick to compute.
43 | For persistent storage or transmission, Normalization Form C is preferred.
44 | .PP
45 | .in +4n
46 | .EX
47 | #include <unicorn.h>
48 | #include <stdio.h>
49 |
50 | int main(void)
51 | {
52 | const char *in = u8"Åström";
53 | char out[32];
54 | unisize outlen = sizeof(out);
55 |
56 | if (uni_norm(UNI_NFD, in, -1, UNI_UTF8, out, &outlen, UNI_UTF8) != UNI_OK)
57 | {
58 | // something went wrong
59 | return 1;
60 | }
61 |
62 | printf("%.*s", outlen, out);
63 | return 0;
64 | }
65 | .EE
66 | .in
67 | .SH SEE ALSO
68 | .BR unistat (3),
69 | .BR UNI_TRUST (3),
70 | .BR uninormform (3),
71 | .BR unisize (3),
72 | .BR uniattr (3)
73 | .SH AUTHOR
74 | .UR https://railgunlabs.com
75 | Railgun Labs
76 | .UE .
77 | .SH INTERNET RESOURCES
78 | The online documentation is published on the
79 | .UR https://railgunlabs.com/unicorn
80 | Railgun Labs website
81 | .UE .
82 | .SH LICENSING
83 | Unicorn is distributed with its end-user license agreement (EULA).
84 | Please review the agreement for information on terms & conditions for accessing or otherwise using Unicorn and for a DISCLAIMER OF ALL WARRANTIES.
85 |
--------------------------------------------------------------------------------
/scripts/case_conversion.py:
--------------------------------------------------------------------------------
1 | # Unicorn - Embeddable Unicode Algorithms
2 | # Copyright (c) 2024-2025 Railgun Labs
3 | #
4 | # This software is dual-licensed: you can redistribute it and/or modify
5 | # it under the terms of the GNU General Public License version 3 as
6 | # published by the Free Software Foundation. For the terms of this
7 | # license, see .
8 | #
9 | # Alternatively, you can license this software under a proprietary
10 | # license, as set out in .
11 |
12 | from typing import Dict, Set, Type
13 | import zipfile
14 | import json
15 |
16 | from .config import Config
17 | from .tools import Codespace
18 | from .feature import Feature, FLAGS_PROPERTY
19 | from .simple_case_mappings import SimpleLowercaseMapping, SimpleTitlecaseMapping, SimpleUppercaseMapping
20 | from .segmentation import WordBreak
21 | from .ccc import CanonicalCombiningClass
22 | from .basics import SerializationData
23 |
class _CaseConversion(Feature):
    """Shared helper feature that loads the special case mapping data.

    The lowercase and uppercase conversion features list it as a dependency;
    it contributes the serialized mapping tables and the per-character flags.
    """

    def pre_process(self, codespace: Codespace) -> None:
        """Register the flags property so ``process`` can look it up later."""
        codespace.register_property(*FLAGS_PROPERTY)

    def process(self, archive: zipfile.ZipFile, codespace: Codespace, config: Config) -> SerializationData:
        """Load ``special_case_mappings.json`` and apply its character flags.

        Args:
            archive: Zip archive containing ``special_case_mappings.json``.
            codespace: Codespace whose flags property receives the flags.
            config: Unused by this feature.

        Returns:
            Serialization data carrying the ``source``, ``header`` and ``size``
            values read from the JSON document.

        Raises:
            KeyError: If the archive member or an expected JSON key is missing.
            json.JSONDecodeError: If the archive member is not valid JSON.
        """
        # The context manager guarantees the archive member is closed even
        # when decoding fails; the previous explicit close() was skipped on
        # any exception raised while reading or unpacking the document.
        with archive.open('special_case_mappings.json') as file:
            data = json.loads(file.read())

        source: str = data["source"]
        header: str = data["header"]
        size: int = data["size"]
        mappings: Dict[str, int] = data["characterFlags"]

        flags_property = codespace.get(FLAGS_PROPERTY[0])

        # Keys are code points written as hexadecimal strings.
        for cp, flags in mappings.items():
            codespace.set_bitwise(int(cp, 16), flags_property, flags)

        return SerializationData(source=source, header=header, size=size)
43 |
class LowercaseConversion(Feature):
    """Feature switch for full lowercase conversion."""

    @staticmethod
    def dependencies() -> Set[Type[Feature]]:
        """Return the features this one requires."""
        return {SimpleLowercaseMapping, _CaseConversion}

    def process(self, _: zipfile.ZipFile, __: Codespace, ___: Config) -> SerializationData:
        """Emit the public define for lowercase conversion; arguments are ignored."""
        return SerializationData(public_header="#define UNICORN_FEATURE_LOWERCASE_CONVERT\n")
52 |
class UppercaseConversion(Feature):
    """Feature switch for full uppercase conversion."""

    @staticmethod
    def dependencies() -> Set[Type[Feature]]:
        """Return the features this one requires."""
        return {SimpleUppercaseMapping, _CaseConversion}

    def process(self, _: zipfile.ZipFile, __: Codespace, ___: Config) -> SerializationData:
        """Emit the public define for uppercase conversion; arguments are ignored."""
        return SerializationData(public_header="#define UNICORN_FEATURE_UPPERCASE_CONVERT\n")
61 |
class TitlecaseConversion(Feature):
    """Feature switch for full titlecase conversion."""

    @staticmethod
    def dependencies() -> Set[Type[Feature]]:
        """Return the features this one requires."""
        return {
            SimpleTitlecaseMapping,
            WordBreak,
            CanonicalCombiningClass,
            LowercaseConversion,
        }

    def process(self, _: zipfile.ZipFile, __: Codespace, ___: Config) -> SerializationData:
        """Emit the public define for titlecase conversion; arguments are ignored."""
        return SerializationData(public_header="#define UNICORN_FEATURE_TITLECASE_CONVERT\n")
70 |
--------------------------------------------------------------------------------
/man/uni_convert.3:
--------------------------------------------------------------------------------
1 | .TH "UNICORN" "3" "Dec 6th 2025" "Unicorn 1.3.1"
2 | .SH NAME
3 | uni_convert \- convert encoding forms
4 | .SH LIBRARY
5 | Embeddable Unicode Algorithms (libunicorn, -lunicorn)
6 | .SH SYNOPSIS
7 | .nf
8 | .B #include <unicorn.h>
9 | .PP
10 | .BI "unistat uni_convert(const void *" src ", unisize " src_len ", uniattr " src_attr ", void *" dst ", unisize *" dst_len ", uniattr " dst_attr ");"
11 | .fi
12 | .SH DESCRIPTION
13 | This function converts the text pointed to by \f[I]src\f[R], which is in the encoding form specified by \f[I]src_attr\f[R], to the encoding specified by \f[I]dst_attr\f[R] and writes the result to \f[I]dst\f[R].
14 | If \f[I]src_len\f[R] is -1, then \f[I]src\f[R] is assumed to be null-terminated.
15 | .PP
16 | If \f[I]dst\f[R] isn't large enough to contain the re-encoded \f[I]src\f[R] text, then it will be truncated to the nearest code point boundary and \f[B]UNI_NO_SPACE\f[R] will be returned.
17 | If \f[I]dst\f[R] is large enough, then \f[B]UNI_OK\f[R] is returned.
18 | .PP
19 | If \f[I]dst\f[R] is null and \f[I]dst_len\f[R] is zero, then the implementation writes to \f[I]dst_len\f[R] the number of code units in the fully re-encoded text and returns \f[B]UNI_OK\f[R].
20 | Call the function this way to first compute the total length of the destination buffer before calling it again with a sufficiently sized buffer.
21 | .PP
22 | If \f[I]src_attr\f[R] and \f[I]dst_attr\f[R] are identical, then this function behaves like \f[B]memcpy\f[R](3).
23 | .SH RETURN VALUE
24 | .TP
25 | UNI_OK
26 | On success.
27 | .TP
28 | UNI_BAD_OPERATION
29 | If \f[I]src\f[R] or \f[I]dst_len\f[R] is null.
30 | .TP
31 | UNI_BAD_ENCODING
32 | If \f[I]src\f[R] is not well-formed (checks are omitted if \f[I]src_attr\f[R] has \f[B]UNI_TRUST\f[R](3)).
33 | .TP
34 | UNI_NO_SPACE
35 | If \f[I]dst\f[R] is too small.
36 | .TP
37 | UNI_FEATURE_DISABLED
38 | If the \f[I]src_attr\f[R] or \f[I]dst_attr\f[R] encoding form is disabled.
39 | .SH EXAMPLES
40 | This example re-encodes text from UTF-32 to UTF-8.
41 | It uses \f[C]char32_t\f[R] and 'U' literal strings from C11.
42 | .PP
43 | .in +4n
44 | .EX
45 | #include <unicorn.h>
46 | #include <uchar.h>
47 | #include <stdio.h>
48 |
49 | int main(void)
50 | {
51 | const char32_t in[] = U"👨🏻🚀🧑🏼🚀 landed on the 🌕 in 1969.";
52 | char out[64];
53 | unisize outlen = sizeof(out);
54 |
55 | if (uni_convert(in, -1, UNI_UTF32, out, &outlen, UNI_UTF8) != UNI_OK)
56 | {
57 | // something went wrong
58 | return 1;
59 | }
60 |
61 | printf("%.*s", outlen, out);
62 | return 0;
63 | }
64 | .EE
65 | .in
66 | .SH SEE ALSO
67 | .BR unistat (3),
68 | .BR UNI_TRUST (3),
69 | .BR unisize (3),
70 | .BR uniattr (3)
71 | .SH AUTHOR
72 | .UR https://railgunlabs.com
73 | Railgun Labs
74 | .UE .
75 | .SH INTERNET RESOURCES
76 | The online documentation is published on the
77 | .UR https://railgunlabs.com/unicorn
78 | Railgun Labs website
79 | .UE .
80 | .SH LICENSING
81 | Unicorn is distributed with its end-user license agreement (EULA).
82 | Please review the agreement for information on terms & conditions for accessing or otherwise using Unicorn and for a DISCLAIMER OF ALL WARRANTIES.
83 |
--------------------------------------------------------------------------------
/man/uni_encode.3:
--------------------------------------------------------------------------------
1 | .TH "UNICORN" "3" "Dec 6th 2025" "Unicorn 1.3.1"
2 | .SH NAME
3 | uni_encode \- encode a scalar value
4 | .SH LIBRARY
5 | Embeddable Unicode Algorithms (libunicorn, -lunicorn)
6 | .SH SYNOPSIS
7 | .nf
8 | .B #include <unicorn.h>
9 | .PP
10 | .BI "unistat uni_encode(unichar " cp ", void *" dst ", unisize *" dst_len ", uniattr " dst_attr ");"
11 | .fi
12 | .SH DESCRIPTION
13 | This function encodes the Unicode scalar value \f[I]cp\f[R] into the encoding form \f[I]dst_attr\f[R] and writes the resulting code units to \f[I]dst\f[R].
14 | The code unit capacity of \f[I]dst\f[R] is specified by \f[I]dst_len\f[R].
15 | The implementation will write the number of code units needed to encode \f[I]cp\f[R] to \f[I]dst_len\f[R] on output.
16 | .PP
17 | If the capacity of \f[I]dst\f[R] is insufficient, then \f[B]UNI_NO_SPACE\f[R] is returned.
18 | .PP
19 | To ensure the \f[I]dst\f[R] is sufficiently sized, the following table lists the number of code units needed to encode any character within its respective encoding form.
20 | .PP
21 | .TS
22 | allbox tab(|);
23 | ll.
24 | \fBEncoding Form\fR|\fBLongest Code Unit Sequence\fR
25 | T{
26 | UTF-8
27 | T}|T{
28 | 4
29 | T}
30 | T{
31 | UTF-16
32 | T}|T{
33 | 2
34 | T}
35 | T{
36 | UTF-32
37 | T}|T{
38 | 1
39 | T}
40 | .TE
41 | .PP
42 | If \f[I]dst\f[R] is null and \f[I]dst_len\f[R] is zero, then the number of code units needed will be computed by the implementation and written to \f[I]dst_len\f[R] and \f[B]UNI_OK\f[R] is returned.
43 | .SH RETURN VALUE
44 | .TP
45 | UNI_OK
46 | If the character was encoded successfully.
47 | .TP
48 | UNI_BAD_OPERATION
49 | If \f[I]cp\f[R] is not a Unicode scalar value, if \f[I]dst_len\f[R] is null, or if \f[I]dst\f[R] is null and \f[I]dst_len\f[R] is non-zero.
50 | .TP
51 | UNI_NO_SPACE
52 | If \f[I]dst\f[R] is too small.
53 | .TP
54 | UNI_FEATURE_DISABLED
55 | If Unicorn was built without support for the encoding form specified by \f[I]dst_attr\f[R].
56 | .SH EXAMPLES
57 | This example encodes the Unicode scalar value for GRINNING FACE (U+1F600) as UTF-8 encoded code units.
58 | The code units are printed to stdout as hexadecimal numbers.
59 | .PP
60 | .in +4n
61 | .EX
62 | #include <unicorn.h>
63 | #include <stdio.h>
64 |
65 | int main(void)
66 | {
67 | unichar cp = 0x1F600;
68 |
69 | uint8_t u8[4];
70 | unisize u8_len = 4;
71 |
72 | if (uni_encode(cp, u8, &u8_len, UNI_UTF8) != UNI_OK)
73 | {
74 | puts("failed to encode character");
75 | }
76 |
77 | for (unisize i = 0; i < u8_len; i++)
78 | {
79 | printf("0x%02X ", u8[i]);
80 | }
81 | return 0;
82 | }
83 | .EE
84 | .in
85 | .SH SEE ALSO
86 | .BR unistat (3),
87 | .BR unichar (3),
88 | .BR unisize (3),
89 | .BR uniattr (3)
90 | .SH AUTHOR
91 | .UR https://railgunlabs.com
92 | Railgun Labs
93 | .UE .
94 | .SH INTERNET RESOURCES
95 | The online documentation is published on the
96 | .UR https://railgunlabs.com/unicorn
97 | Railgun Labs website
98 | .UE .
99 | .SH LICENSING
100 | Unicorn is distributed with its end-user license agreement (EULA).
101 | Please review the agreement for information on terms & conditions for accessing or otherwise using Unicorn and for a DISCLAIMER OF ALL WARRANTIES.
102 |
--------------------------------------------------------------------------------
/src/common.h:
--------------------------------------------------------------------------------
1 | /*
2 | * Unicorn - Embeddable Unicode Algorithms
3 | * Copyright (c) 2024-2025 Railgun Labs
4 | *
5 | * This software is dual-licensed: you can redistribute it and/or modify
6 | * it under the terms of the GNU General Public License version 3 as
7 | * published by the Free Software Foundation. For the terms of this
8 | * license, see .
9 | *
10 | * Alternatively, you can license this software under a proprietary
11 | * license, as set out in .
12 | */
13 |
14 | #ifndef COMMON_H
15 | #define COMMON_H
16 |
17 | #include "unicorn.h"
18 | #include
19 | #include
20 |
// Keeps only the encoding-form bits (scalar, UTF-8, UTF-16, UTF-32) of an attribute bit set.
#define GET_ENCODING(FLAGS) ((FLAGS) & (UNI_SCALAR | UNI_UTF8 | UNI_UTF16 | UNI_UTF32))

// Cast helpers for writing constants of the library's integer types.
#define UNICHAR_C(X) ((unichar)(X))
#define UNISIZE_C(X) ((unisize)(X))
#define UNIHASH_C(X) ((unihash)(X))

// 0x10FFFF and the value immediately following it (0x110000).
#define UNICORN_LARGEST_CODE_POINT UNICHAR_C(0x10FFFFL)
#define FIRST_ILLEGAL_CODE_POINT UNICHAR_C(0x110000L)

// Number of elements in an array; the argument must be a true array, not a pointer.
#define COUNT_OF(array) (sizeof(array) / sizeof((array)[0]))

// Fixed-width unsigned types; presumably the code units of UTF-8, UTF-16 and UTF-32.
typedef uint8_t UChar8;
typedef uint16_t UChar16;
typedef uint32_t UChar32;

// Unsigned 32-bit type used for hash values (see UNIHASH_C).
typedef uint32_t unihash;

// Function pointer types mapping a 16-bit / 32-bit value to a value of the same width.
typedef uint16_t(*ByteSwap16)(uint16_t val);
typedef uint32_t(*ByteSwap32)(uint32_t val);

// NOTE(review): looks like a cursor over encoded text (buffer, position,
// length, encoding attributes) - confirm against the code that uses it.
struct unitext
{
    const void *data;
    unisize index;
    unisize length;
    uniattr encoding;
};

// Memory helpers. Only declared here; realloc and free also receive the block sizes.
void *uni_malloc(size_t size);
void *uni_realloc(void *old_ptr, size_t old_size, size_t new_size);
void uni_free(void *ptr, size_t size);

// Encoders writing one code point into a caller-provided array of up to
// 4 bytes (UTF-8) or 2 words (UTF-16). Presumably the return value is the
// number of code units written - TODO confirm.
unisize unichar_to_u8(unichar codepoint, UChar8 bytes[4]);
unisize unichar_to_u16(unichar codepoint, UChar16 words[2], ByteSwap16 swap); // cppcheck-suppress premium-misra-c-2012-17.3 ; This is a false positive.

unisize uni_prev_UTF8_seqlen(const UChar8 *start, unisize offset);

// Argument checks for input text and output buffers; both may rewrite *encoding.
unistat uni_check_input_encoding(const void *text, unisize length, uniattr *encoding);
unistat uni_check_output_encoding(const void *buffer, const unisize *capacity, uniattr *encoding);

void uni_message(const char *msg);
62 |
63 | static inline bool is_low_surrogate(unichar c)
64 | {
65 | bool is_low;
66 | if (c < UNICHAR_C(0xDC00))
67 | {
68 | is_low = false;
69 | }
70 | else if (c > UNICHAR_C(0xDFFF))
71 | {
72 | is_low = false;
73 | }
74 | else
75 | {
76 | is_low = true;
77 | }
78 | return is_low;
79 | }
80 |
81 | static inline bool is_high_surrogate(unichar c)
82 | {
83 | bool is_high;
84 | if (c < UNICHAR_C(0xD800))
85 | {
86 | is_high = false;
87 | }
88 | else if (c > UNICHAR_C(0xDBFF))
89 | {
90 | is_high = false;
91 | }
92 | else
93 | {
94 | is_high = true;
95 | }
96 | return is_high;
97 | }
98 |
// Marks a code path that must never execute. MSVC and GCC/Clang receive an
// optimizer hint; any other compiler falls back to a failing assertion.
// NOTE(review): the fallback relies on <assert.h> being included - confirm.
#if defined(_MSC_VER)
#define UNREACHABLE __assume(false)
#elif defined(__GNUC__) || defined(__clang__)
#define UNREACHABLE __builtin_unreachable()
#else
#define UNREACHABLE assert(0 && "unreachable code") // LCOV_EXCL_BR_LINE
#endif
106 |
107 | #endif // COMMON_H
108 |
--------------------------------------------------------------------------------
/man/uni_casefold.3:
--------------------------------------------------------------------------------
1 | .TH "UNICORN" "3" "Dec 6th 2025" "Unicorn 1.3.1"
2 | .SH NAME
3 | uni_casefold \- case fold a string
4 | .SH LIBRARY
5 | Embeddable Unicode Algorithms (libunicorn, -lunicorn)
6 | .SH SYNOPSIS
7 | .nf
8 | .B #include <unicorn.h>
9 | .PP
10 | .BI "unistat uni_casefold(unicasefold " casing ", const void *" src ", unisize " src_len ", uniattr " src_attr ", void *" dst ", unisize *" dst_len ", uniattr " dst_attr ");"
11 | .fi
12 | .SH DESCRIPTION
13 | This function case folds \f[I]src\f[R] to casing form \f[I]casing\f[R] and writes the result to \f[I]dst\f[R].
14 | The capacity of \f[I]dst\f[R] is specified by \f[I]dst_len\f[R].
15 | .PP
16 | If \f[I]dst\f[R] is not null, then the implementation writes to \f[I]dst_len\f[R] the total number of code units written to \f[I]dst\f[R].
17 | If the capacity of \f[I]dst\f[R] is insufficient, then \f[B]UNI_NO_SPACE\f[R] is returned.
18 | .PP
19 | If \f[I]dst\f[R] is null and \f[I]dst_len\f[R] is zero, then the implementation writes to \f[I]dst_len\f[R] the number of code units in the fully case folded text and returns \f[B]UNI_OK\f[R].
20 | Call the function this way to first compute the total length of the destination buffer before calling it again with a sufficiently sized buffer.
21 | .SH RETURN VALUE
22 | .TP
23 | UNI_OK
24 | If \f[I]src\f[R] was case folded successfully.
25 | .TP
26 | UNI_BAD_OPERATION
27 | If \f[I]src\f[R] is null, \f[I]src_len\f[R] is negative (other than -1), or \f[I]dst_len\f[R] is null.
28 | .TP
29 | UNI_BAD_ENCODING
30 | If \f[I]src\f[R] is not well-formed (checks are omitted if \f[I]src_attr\f[R] has \f[B]UNI_TRUST\f[R](3)).
31 | .TP
32 | UNI_NO_SPACE
33 | If \f[I]dst\f[R] was not large enough.
34 | .TP
35 | UNI_FEATURE_DISABLED
36 | If the library was built without support for case folding.
37 | .TP
38 | UNI_NO_MEMORY
39 | If dynamic memory allocation failed.
40 | .SH EXAMPLES
41 | This example casefolds a string for canonical caseless comparison.
42 | .PP
43 | The example will produce an output string that is longer than the input string.
44 | This is because the German 'ß' (U+00DF) case folds to 'ss' (U+0073 U+0073) and the Latin small letter 'ö' with diaeresis (U+00F6) canonically normalizes to an 'o' (U+006F) followed by a combining diaeresis character (U+0308).
45 | .PP
46 | .in +4n
47 | .EX
48 | #include <unicorn.h>
49 | #include <stdio.h>
50 |
51 | int main(void)
52 | {
53 | const char *in = u8"Stößen";
54 | char out[32];
55 | unisize outlen = sizeof(out);
56 |
57 | if (uni_casefold(UNI_CANONICAL, in, -1, UNI_UTF8, out, &outlen, UNI_UTF8) != UNI_OK)
58 | {
59 | // something went wrong
60 | return 1;
61 | }
62 |
63 | printf("%.*s", outlen, out); // prints "stössen"
64 | return 0;
65 | }
66 | .EE
67 | .in
68 | .SH SEE ALSO
69 | .BR unistat (3),
70 | .BR UNI_TRUST (3),
71 | .BR unicasefold (3),
72 | .BR unisize (3),
73 | .BR uniattr (3)
74 | .SH AUTHOR
75 | .UR https://railgunlabs.com
76 | Railgun Labs
77 | .UE .
78 | .SH INTERNET RESOURCES
79 | The online documentation is published on the
80 | .UR https://railgunlabs.com/unicorn
81 | Railgun Labs website
82 | .UE .
83 | .SH LICENSING
84 | Unicorn is distributed with its end-user license agreement (EULA).
85 | Please review the agreement for information on terms & conditions for accessing or otherwise using Unicorn and for a DISCLAIMER OF ALL WARRANTIES.
86 |
--------------------------------------------------------------------------------
/man/uni_sortkeymk.3:
--------------------------------------------------------------------------------
1 | .TH "UNICORN" "3" "Dec 6th 2025" "Unicorn 1.3.1"
2 | .SH NAME
3 | uni_sortkeymk \- make a sort key
4 | .SH LIBRARY
5 | Embeddable Unicode Algorithms (libunicorn, -lunicorn)
6 | .SH SYNOPSIS
7 | .nf
8 | .B #include <unicorn.h>
9 | .PP
10 | .BI "unistat uni_sortkeymk(const void *" text ", unisize " text_len ", uniattr " text_attr ", uniweighting " weighting ", unistrength " strength ", uint16_t *" sortkey ", size_t *" sortkey_cap ");"
11 | .fi
12 | .SH DESCRIPTION
13 | This function constructs a sort key for \f[I]text\f[R] and writes the result to \f[I]sortkey\f[R].
14 | .PP
15 | The number of code units in \f[I]text\f[R] is specified by \f[I]text_len\f[R].
16 | If \f[I]text_len\f[R] is -1, then the implementation assumes \f[I]text\f[R] is null-terminated.
17 | .PP
18 | The capacity of \f[I]sortkey\f[R] is specified by \f[I]sortkey_cap\f[R].
19 | The implementation always writes to \f[I]sortkey_cap\f[R] the total number of weights needed for \f[I]text\f[R].
20 | If the capacity of \f[I]sortkey\f[R] is insufficient, then \f[B]UNI_NO_SPACE\f[R] is returned.
21 | .SH RETURN VALUE
22 | .TP
23 | UNI_OK
24 | If the sort key was successfully constructed.
25 | .TP
26 | UNI_NO_SPACE
27 | If \f[I]sortkey\f[R] lacks capacity for the collation elements.
28 | .TP
29 | UNI_BAD_OPERATION
30 | If \f[I]text\f[R] or \f[I]sortkey_cap\f[R] are NULL.
31 | .TP
32 | UNI_BAD_ENCODING
33 | If \f[I]text\f[R] is not well-formed (checks are omitted if \f[I]text_attr\f[R] has \f[B]UNI_TRUST\f[R](3)).
34 | .TP
35 | UNI_NO_MEMORY
36 | If dynamic memory allocation failed.
37 | .SH EXAMPLES
38 | This example constructs two sort keys and compares them.
39 | Sort keys must be generated with the same settings for their order to make sense.
40 | In this example, the sort keys are constructed with \f[B]UNI_SHIFTED\f[R] weighting and \f[B]UNI_PRIMARY\f[R] strength.
41 | In a real application you would cache the sort keys for future comparisons.
42 | For one-off comparisons, use \f[B]uni_collate\f[R](3).
43 | .PP
44 | .in +4n
45 | .EX
46 | #include <unicorn.h>
47 | #include <stdio.h>
48 |
49 | int main(void)
50 | {
51 | const char *s1 = "hello", *s2 = "Hi";
52 | uint16_t sk1[32] = {0}, sk2[16] = {0};
53 | size_t sk1_len = 32, sk2_len = 16;
54 |
55 | if (uni_sortkeymk(s1, -1, UNI_UTF8, UNI_SHIFTED, UNI_PRIMARY, sk1, &sk1_len) != UNI_OK ||
56 | uni_sortkeymk(s2, -1, UNI_UTF8, UNI_SHIFTED, UNI_PRIMARY, sk2, &sk2_len) != UNI_OK)
57 | {
58 | puts("failed to construct sort keys");
59 | return 1;
60 | }
61 |
62 | int result;
63 | if (uni_sortkeycmp(sk1, sk1_len, sk2, sk2_len, &result) != UNI_OK)
64 | {
65 | puts("failed to compare sort keys");
66 | return 1;
67 | }
68 |
69 | printf("%d", result);
70 | return 0;
71 | }
72 | .EE
73 | .in
74 | .SH SEE ALSO
75 | .BR unistat (3),
76 | .BR uniweighting (3),
77 | .BR unistrength (3),
78 | .BR UNI_TRUST (3),
79 | .BR unisize (3),
80 | .BR uniattr (3)
81 | .SH AUTHOR
82 | .UR https://railgunlabs.com
83 | Railgun Labs
84 | .UE .
85 | .SH INTERNET RESOURCES
86 | The online documentation is published on the
87 | .UR https://railgunlabs.com/unicorn
88 | Railgun Labs website
89 | .UE .
90 | .SH LICENSING
91 | Unicorn is distributed with its end-user license agreement (EULA).
92 | Please review the agreement for information on terms & conditions for accessing or otherwise using Unicorn and for a DISCLAIMER OF ALL WARRANTIES.
93 |
--------------------------------------------------------------------------------
/man/uni_collate.3:
--------------------------------------------------------------------------------
1 | .TH "UNICORN" "3" "Dec 6th 2025" "Unicorn 1.3.1"
2 | .SH NAME
3 | uni_collate \- compare strings for sorting
4 | .SH LIBRARY
5 | Embeddable Unicode Algorithms (libunicorn, -lunicorn)
6 | .SH SYNOPSIS
7 | .nf
8 | .B #include <unicorn.h>
9 | .PP
10 | .BI "unistat uni_collate(const void *" s1 ", unisize " s1_len ", uniattr " s1_attr ", const void *" s2 ", unisize " s2_len ", uniattr " s2_attr ", uniweighting " weighting ", unistrength " strength ", int32_t *" result ");"
11 | .fi
12 | .SH DESCRIPTION
13 | This function compares \f[I]s1\f[R] and \f[I]s2\f[R] for sorting and writes -1, 0, or 1 to \f[I]result\f[R] depending on if \f[C]s1 < s2\f[R], \f[C]s1 = s2\f[R], or \f[C]s1 > s2\f[R].
14 | .PP
15 | This function is intended for one-off string comparisons.
16 | If a string will be compared multiple times, then it's recommended to construct a sort key once with \f[B]uni_sortkeymk\f[R](3) and perform comparisons with \f[B]uni_sortkeycmp\f[R](3).
17 | .PP
18 | The length and text attributes for \f[I]s1\f[R] are specified by \f[I]s1_len\f[R] and \f[I]s1_attr\f[R].
19 | If \f[I]s1_len\f[R] is -1, then the implementation assumes \f[I]s1\f[R] is null-terminated.
20 | .PP
21 | The length and text attributes for \f[I]s2\f[R] are specified by \f[I]s2_len\f[R] and \f[I]s2_attr\f[R].
22 | If \f[I]s2_len\f[R] is -1, then the implementation assumes \f[I]s2\f[R] is null-terminated.
23 | .SH RETURN VALUE
24 | .TP
25 | UNI_OK
26 | On success.
27 | .TP
28 | UNI_BAD_OPERATION
29 | If \f[I]s1\f[R] or \f[I]s2\f[R] are null, or if \f[I]result\f[R] is null.
30 | .TP
31 | UNI_BAD_ENCODING
32 | If \f[I]s1\f[R] or \f[I]s2\f[R] is not well-formed (checks are omitted if the corresponding \f[B]uniattr\f[R](3) has \f[B]UNI_TRUST\f[R](3)).
33 | .TP
34 | UNI_FEATURE_DISABLED
35 | If Unicorn was built without support for collation.
36 | .TP
37 | UNI_NO_MEMORY
38 | If dynamic memory allocation failed.
39 | .SH EXAMPLES
40 | This example collates two strings.
41 | The function conceptually constructs a sort key for the input strings and then compares them.
42 | Constructing a sort key is expensive.
43 | If a string will be collated multiple times, then it is recommended to construct a sort key once and cache it for future calculations.
44 | See \f[B]uni_sortkeymk\f[R](3) for details.
45 | .PP
46 | .in +4n
47 | .EX
48 | #include <unicorn.h>
49 | #include <stdio.h>
50 |
51 | int main(void)
52 | {
53 | const char *s1 = u8"Hello World";
54 | const char *s2 = u8"こんにちは世界";
55 | int result;
56 |
57 | if (uni_collate(s1, -1, UNI_UTF8,
58 | s2, -1, UNI_UTF8,
59 | UNI_NON_IGNORABLE, UNI_PRIMARY, &result) != UNI_OK)
60 | {
61 | puts("failed to collate strings");
62 | return 1;
63 | }
64 |
65 | printf("%d", result);
66 | return 0;
67 | }
68 | .EE
69 | .in
70 | .SH SEE ALSO
71 | .BR uni_sortkeymk (3),
72 | .BR uni_sortkeycmp (3),
73 | .BR uniweighting (3),
74 | .BR unistrength (3),
75 | .BR uniattr (3),
76 | .BR UNI_TRUST (3),
77 | .BR unisize (3)
78 | .SH AUTHOR
79 | .UR https://railgunlabs.com
80 | Railgun Labs
81 | .UE .
82 | .SH INTERNET RESOURCES
83 | The online documentation is published on the
84 | .UR https://railgunlabs.com/unicorn
85 | Railgun Labs website
86 | .UE .
87 | .SH LICENSING
88 | Unicorn is distributed with its end-user license agreement (EULA).
89 | Please review the agreement for information on terms & conditions for accessing or otherwise using Unicorn and for a DISCLAIMER OF ALL WARRANTIES.
90 |
--------------------------------------------------------------------------------
/src/CMakeLists.txt:
--------------------------------------------------------------------------------
# Sources shared by every library target below. unidata.c and unidata.h
# are taken from the build directory rather than the source tree.
set(SOURCES
    ../include/unicorn.h
    unicorn.c
    logger.c
    memory.c
    common.h
    byteswap.h
    encoding.c
    segmentation.c
    normalize.c
    normalize.h
    collation.c
    ducet.c
    ducet.h
    compression.c
    properties.c
    caseconv.c
    casefold.c
    charbuf.c
    charbuf.h
    charvec.c
    charvec.h
    ${CMAKE_BINARY_DIR}/unidata.c
    ${CMAKE_BINARY_DIR}/unidata.h
)

# Static library target.
if (UNICORN_BUILD_STATIC)
    add_library(unicorn_static STATIC ${SOURCES})
    target_compile_definitions(unicorn_static PUBLIC UNICORN_STATIC)
    set_property(TARGET unicorn_static PROPERTY C_STANDARD 99)
    # NOTE(review): PUBLIC_HEADER is assigned twice in a row. Setting a
    # property replaces its previous value, so only the second path appears
    # to take effect - confirm whether both headers were meant to be installed.
    set_target_properties(unicorn_static PROPERTIES PUBLIC_HEADER ${CMAKE_SOURCE_DIR}/unicorn.h)
    set_target_properties(unicorn_static PROPERTIES PUBLIC_HEADER ${CMAKE_BINARY_DIR}/_unicorn.h)
    add_dependencies(unicorn_static unicorn_generate)
    install(TARGETS unicorn_static ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} PUBLIC_HEADER DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
endif ()

# Shared library target; also built whenever the examples are requested.
if (UNICORN_BUILD_SHARED OR UNICORN_BUILD_EXAMPLES)
    add_library(unicorn SHARED ${SOURCES})
    target_compile_definitions(unicorn PRIVATE DLL_EXPORT)
    set_property(TARGET unicorn PROPERTY C_STANDARD 99)
    # NOTE(review): same double assignment of PUBLIC_HEADER as on the
    # static target - confirm the intended header list.
    set_target_properties(unicorn PROPERTIES PUBLIC_HEADER ${CMAKE_SOURCE_DIR}/unicorn.h)
    set_target_properties(unicorn PROPERTIES PUBLIC_HEADER ${CMAKE_BINARY_DIR}/_unicorn.h)
    add_dependencies(unicorn unicorn_generate)

    if (CMAKE_C_COMPILER_ID MATCHES "Clang" OR
        CMAKE_C_COMPILER_ID MATCHES "GNU")
        target_compile_options(unicorn PRIVATE -pedantic)
    endif ()

    if (CMAKE_C_COMPILER_ID MATCHES "Clang")
        target_compile_options(unicorn PRIVATE -Wall -Wextra -Wconversion -Wno-missing-field-initializers
            -Wdouble-promotion -Wno-unused-parameter -Wno-unused-function -Wno-sign-conversion)
    endif ()

    install(TARGETS unicorn LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} PUBLIC_HEADER DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
endif ()

# Define a special build of Unicorn specifically for unit testing.
# This build is either the same as the static library build OR it's
# a special dynamic library build that optionally adds code
# coverage instrumentation.
#
# The static build is necessary on Windows to ensure the tests
# have access to internal/private symbols e.g. those not exported
# with DLL linkage. The coverage build is only ever run on *nix
# systems and shared objects on those platforms don't have those
# linker issues.
if (UNICORN_BUILD_TESTS OR UNICORN_BUILD_COOKBOOK)
    if (UNICORN_CODE_COVERAGE)
        add_library(unicorn_internal SHARED ${SOURCES})
        target_compile_options(unicorn_internal PRIVATE -g -fprofile-arcs -ftest-coverage -O0)
        target_link_options(unicorn_internal PRIVATE -fprofile-arcs -ftest-coverage -fbranch-probabilities -O0)
        target_compile_definitions(unicorn_internal PRIVATE DLL_EXPORT)
    else ()
        add_library(unicorn_internal STATIC ${SOURCES})
        target_compile_definitions(unicorn_internal PUBLIC UNICORN_STATIC)
    endif ()
    set_property(TARGET unicorn_internal PROPERTY C_STANDARD 99)
    add_dependencies(unicorn_internal unicorn_generate)
endif ()
81 |
--------------------------------------------------------------------------------
/man/uni_normcmp.3:
--------------------------------------------------------------------------------
1 | .TH "UNICORN" "3" "Dec 6th 2025" "Unicorn 1.3.1"
2 | .SH NAME
3 | uni_normcmp \- compare strings for canonical equivalence
4 | .SH LIBRARY
5 | Embeddable Unicode Algorithms (libunicorn, -lunicorn)
6 | .SH SYNOPSIS
7 | .nf
8 | .B #include <unicorn.h>
9 | .PP
10 | .BI "unistat uni_normcmp(const void *" s1 ", unisize " s1_len ", uniattr " s1_attr ", const void *" s2 ", unisize " s2_len ", uniattr " s2_attr ", bool *" result ");"
11 | .fi
12 | .SH DESCRIPTION
13 | This function checks if \f[I]s1\f[R] and \f[I]s2\f[R] are canonically equivalent.
14 | That is, it checks if the graphemes of both strings are the same.
15 | The behavior of this function is identical to calling \f[B]uni_norm\f[R](3) with \f[B]UNI_NFD\f[R] on both strings followed by a code point comparison.
16 | .PP
17 | The implementation is optimized for normalizing the strings incrementally while simultaneously comparing them.
18 | This is a more optimal approach when it's unknown whether input strings are normalized or not.
19 | If it's known in advance that the strings are both normalized, then they can be compared directly with \f[B]memcmp\f[R](3) or \f[B]strcmp\f[R](3).
20 | .PP
21 | The implementation strives to be highly performant and avoid dynamic memory allocation when possible.
22 | Typically, memory allocation will only be performed for unnaturally long combining character sequences, like
23 | .UR https://en.wikipedia.org/wiki/Zalgo_text
24 | Zalgo text
25 | .UE .
26 | It's rare for real-world text to trigger memory allocation.
27 | .SH RETURN VALUE
28 | .TP
29 | UNI_OK
30 | If the strings were compared successfully.
31 | .TP
32 | UNI_BAD_OPERATION
33 | If \f[I]s1\f[R] or \f[I]s2\f[R] are null, if \f[I]s1_len\f[R] or \f[I]s2_len\f[R] are negative, or if \f[I]result\f[R] is null.
34 | .TP
35 | UNI_BAD_ENCODING
36 | If \f[I]s1\f[R] or \f[I]s2\f[R] is not well-formed (checks are omitted if the corresponding \f[B]uniattr\f[R](3) has \f[B]UNI_TRUST\f[R](3)).
37 | .TP
38 | UNI_NO_MEMORY
39 | If dynamic memory allocation failed.
40 | .SH EXAMPLES
41 | This example compares two strings for canonical equivalence.
42 | Conceptually, the implementation normalizes both strings, performs the comparison, and reports the result.
43 | This approach is recommended when strings are compared for one-off equality.
44 | If strings are compared repeatedly, then it's recommended to normalize them with \f[B]uni_norm\f[R](3) and cache the result for the comparisons.
45 | .PP
46 | .in +4n
47 | .EX
48 | #include <stdio.h>
49 | #include <stdbool.h>
50 | #include <unicorn.h>
51 |
52 | int main(void)
53 | {
54 | const char *s1 = u8"ma\\u0301scara"; // 'a' + U+0301 = á (decomposed)
55 | const char *s2 = u8"m\\u00E1scara"; // U+00E1 = á (precomposed)
56 | bool is_equal;
57 |
58 | if (uni_normcmp(s1, -1, UNI_UTF8,
59 | s2, -1, UNI_UTF8, &is_equal) != UNI_OK)
60 | {
61 | puts("failed to normalize and compare strings");
62 | return 1;
63 | }
64 |
65 | printf("%s", is_equal ? "equal" : "not equal");
66 | return 0;
67 | }
68 | .EE
69 | .in
70 | .SH SEE ALSO
71 | .BR uni_norm (3),
72 | .BR uninormform (3),
73 | .BR uniattr (3),
74 | .BR UNI_TRUST (3),
75 | .BR unisize (3)
76 | .SH AUTHOR
77 | .UR https://railgunlabs.com
78 | Railgun Labs
79 | .UE .
80 | .SH INTERNET RESOURCES
81 | The online documentation is published on the
82 | .UR https://railgunlabs.com/unicorn
83 | Railgun Labs website
84 | .UE .
85 | .SH LICENSING
86 | Unicorn is distributed with its end-user license agreement (EULA).
87 | Please review the agreement for information on terms & conditions for accessing or otherwise using Unicorn and for a DISCLAIMER OF ALL WARRANTIES.
88 |
--------------------------------------------------------------------------------
/man/unibreak.3:
--------------------------------------------------------------------------------
1 | .TH "UNICORN" "3" "Dec 6th 2025" "Unicorn 1.3.1"
2 | .SH NAME
3 | unibreak \- text boundaries
4 | .SH LIBRARY
5 | Embeddable Unicode Algorithms (libunicorn, -lunicorn)
6 | .SH SYNOPSIS
7 | .nf
8 | .B #include <unicorn.h>
9 | .PP
10 | .B enum unibreak {
11 | .RS
12 | .B UNI_GRAPHEME,
13 | .B UNI_WORD,
14 | .B UNI_SENTENCE,
15 | .RE
16 | .B };
17 | .fi
18 | .SH DESCRIPTION
19 | The elements of this enumeration describe the boundaries that can be detected.
20 | Conceptually, a break represents the space in between two code points.
21 | .SH CONSTANTS
22 | .TP
23 | .BR UNI_GRAPHEME
24 | A grapheme or grapheme cluster is a user-perceived character.
25 | Grapheme clusters are useful for user interfaces and user-facing text manipulation.
26 | For example, graphemes should be used in the implementation of text selection, arrow key movement, backspacing through text, and so forth.
27 | When truncating text for presentation, grapheme cluster boundaries should be used to avoid malforming user-perceived characters.
28 | .IP
29 | A key feature of Unicode grapheme clusters is that they remain unchanged across all canonically equivalent normalization forms.
30 | That means the grapheme boundaries remain unchanged whether the text is normalized as NFC or NFD.
31 | .IP
32 | Support for grapheme cluster break detection can be enabled in the JSON configuration as shown below:
33 | .IP
34 | .in +4n
35 | .EX
36 | {
37 | "algorithms": {
38 | "segmentation": [
39 | "grapheme"
40 | ]
41 | }
42 | }
43 | .EE
44 | .in
45 | .TP
46 | .BR UNI_WORD
47 | Word boundaries are used in a number of different contexts.
48 | The most familiar ones are selection (double-click mouse selection or "move to next word" control-arrow keys) and "whole word search" for search and replace.
49 | They are also used in database queries and regular expressions, to determine whether elements are within a certain number of words of one another.
50 | .IP
51 | The default algorithm for detecting word boundaries is primarily intended for languages that use white space to delimit words.
52 | Unfortunately, some scripts, like Thai, Lao, Khmer, and Myanmar, do not use spaces between words.
53 | Ideographic scripts such as Japanese and Chinese are even more complex.
54 | It is therefore not possible to provide an algorithm that correctly detects word boundaries across languages.
55 | These languages require special handling by a more sophisticated word break detection algorithm that understands the rules of the language.
56 | .IP
57 | Support for word break detection can be enabled in the JSON configuration file as shown below:
58 | .IP
59 | .in +4n
60 | .EX
61 | {
62 | "algorithms": {
63 | "segmentation": [
64 | "word"
65 | ]
66 | }
67 | }
68 | .EE
69 | .in
70 | .TP
71 | .BR UNI_SENTENCE
72 | Sentence boundaries are often used for triple-click or other methods of selecting or iterating through blocks of text that are larger than single words.
73 | They are also used to determine whether words occur within the same sentence in database queries.
74 | .IP
75 | Support for sentence break detection can be enabled in the JSON configuration file as shown below:
76 | .IP
77 | .in +4n
78 | .EX
79 | {
80 | "algorithms": {
81 | "segmentation": [
82 | "sentence"
83 | ]
84 | }
85 | }
86 | .EE
87 | .in
88 | .SH SEE ALSO
89 | .BR uninormform (3)
90 | .SH AUTHOR
91 | .UR https://railgunlabs.com
92 | Railgun Labs
93 | .UE .
94 | .SH INTERNET RESOURCES
95 | The online documentation is published on the
96 | .UR https://railgunlabs.com/unicorn
97 | Railgun Labs website
98 | .UE .
99 | .SH LICENSING
100 | Unicorn is distributed with its end-user license agreement (EULA).
101 | Please review the agreement for information on terms & conditions for accessing or otherwise using Unicorn and for a DISCLAIMER OF ALL WARRANTIES.
102 |
--------------------------------------------------------------------------------
/src/charvec.c:
--------------------------------------------------------------------------------
1 | /*
2 | * Unicorn - Embeddable Unicode Algorithms
3 | * Copyright (c) 2024-2025 Railgun Labs
4 | *
5 | * This software is dual-licensed: you can redistribute it and/or modify
6 | * it under the terms of the GNU General Public License version 3 as
7 | * published by the Free Software Foundation. For the terms of this
8 | * license, see .
9 | *
10 | * Alternatively, you can license this software under a proprietary
11 | * license, as set out in .
12 | */
13 |
14 | #include "charvec.h"
15 | // Ensures the vector can hold at least new_capacity characters. Returns UNI_OK, or UNI_NO_MEMORY if allocation fails (the vector's fields are then left unchanged).
16 | unistat uni_charvec_reserve(struct CharVec *buffer, unisize new_capacity)
17 | {
18 | unistat status = UNI_OK;
19 |
20 | // LCOV_EXCL_START
21 | assert(buffer != NULL);
22 | assert(new_capacity >= 0);
23 | // LCOV_EXCL_STOP
24 |
25 | if (new_capacity > buffer->capacity) // Requests that fit the current capacity are a no-op; storage never shrinks here.
26 | {
27 | unichar *ptr;
28 | if (buffer->chars == buffer->scratch) // Still using the embedded scratch array: allocate fresh storage and copy into it.
29 | {
30 | ptr = uni_malloc(sizeof(buffer->chars[0]) * (size_t)new_capacity); // cppcheck-suppress misra-c2012-11.5 ; MISRA 2012 Rule 11.5 - Pointer conversion required.
31 | if (ptr != NULL)
32 | {
33 | (void)memcpy(ptr, buffer->scratch, sizeof(buffer->chars[0]) * (size_t)buffer->capacity); // Copies the full old capacity, not just the used length.
34 | }
35 | }
36 | else
37 | {
38 | ptr = uni_realloc(buffer->chars, // cppcheck-suppress misra-c2012-11.5 ; MISRA 2012 Rule 11.5 - Pointer conversion required.
39 | sizeof(buffer->chars[0]) * (size_t)buffer->capacity,
40 | sizeof(buffer->chars[0]) * (size_t)new_capacity);
41 | }
42 |
43 | if (ptr == NULL)
44 | {
45 | uni_message("memory allocation failed");
46 | status = UNI_NO_MEMORY;
47 | }
48 | else
49 | {
50 | buffer->chars = ptr;
51 | buffer->capacity = new_capacity;
52 | }
53 | }
54 |
55 | return status;
56 | }
57 | // Appends chars_count characters from chars, growing the storage first. If the reserve step fails its status is returned and nothing is appended.
58 | unistat uni_charvec_append(struct CharVec *buffer, const unichar *chars, unisize chars_count)
59 | {
60 | // LCOV_EXCL_START
61 | assert(buffer);
62 | assert(chars);
63 | // LCOV_EXCL_STOP
64 |
65 | const unistat status = uni_charvec_reserve(buffer, buffer->length + chars_count);
66 | if (status == UNI_OK)
67 | {
68 | (void)memcpy(&buffer->chars[buffer->length], chars, sizeof(buffer->chars[0]) * (size_t)chars_count);
69 | buffer->length += chars_count;
70 | }
71 |
72 | return status;
73 | }
74 | // Appends chars_count characters without growing the storage; sufficient capacity is only checked by the assert.
75 | void uni_charvec_append_unsafe(struct CharVec *cb, const unichar *chars, unisize chars_count)
76 | {
77 | assert((cb->length + chars_count) <= cb->capacity); // LCOV_EXCL_BR_LINE
78 | (void)memcpy(&cb->chars[cb->length], chars, sizeof(chars[0]) * (size_t)chars_count);
79 | cb->length += chars_count;
80 | }
81 | // Sets the length to zero; the storage pointer and capacity are not touched.
82 | void uni_charvec_reset(struct CharVec *buffer)
83 | {
84 | buffer->length = 0;
85 | }
86 | // Initializes an empty vector whose storage is its own embedded scratch array.
87 | void uni_charvec_init(struct CharVec *buffer)
88 | {
89 | const size_t scratch_buffer_size = COUNT_OF(buffer->scratch);
90 | buffer->length = 0;
91 | buffer->capacity = (unisize)scratch_buffer_size; // Capacity is counted in characters, not bytes.
92 | buffer->chars = buffer->scratch;
93 | }
94 | // Releases the storage if it is no longer the embedded scratch array; the fields themselves are not reset.
95 | void uni_charvec_free(struct CharVec *buffer)
96 | {
97 | if (buffer->chars != buffer->scratch)
98 | {
99 | uni_free(buffer->chars, sizeof(buffer->chars[0]) * (size_t)buffer->capacity); // The size of the allocation in bytes is passed along with the pointer.
100 | }
101 | }
102 | // Removes the character at index i by moving every later character down one slot.
103 | void uni_charvec_remove(struct CharVec *buf, unisize i)
104 | {
105 | assert(buf->length >= 1); // LCOV_EXCL_BR_LINE
106 | const unisize shift = buf->length - (i + UNISIZE_C(1)); // Number of characters that follow index i.
107 | assert(shift >= 0); // LCOV_EXCL_BR_LINE: The math should never work out negative.
108 | (void)memmove(&buf->chars[i], &buf->chars[i + 1], sizeof(buf->chars[0]) * (size_t)shift);
109 | buf->length -= 1;
110 | }
111 |
--------------------------------------------------------------------------------
/scripts/segmentation.py:
--------------------------------------------------------------------------------
1 | # Unicorn - Embeddable Unicode Algorithms
2 | # Copyright (c) 2024-2025 Railgun Labs
3 | #
4 | # This software is dual-licensed: you can redistribute it and/or modify
5 | # it under the terms of the GNU General Public License version 3 as
6 | # published by the Free Software Foundation. For the terms of this
7 | # license, see .
8 | #
9 | # Alternatively, you can license this software under a proprietary
10 | # license, as set out in .
11 |
12 | from typing import Set, Type
13 | import zipfile
14 | import json
15 |
16 | from .config import Config
17 | from .tools import Codespace
18 | from .feature import Feature
19 | from .basics import StorageSize, SerializationData
20 |
21 | max_states = 0
22 |
23 | class _Segmentation(Feature):
24 | def post_process(self, archive: zipfile.ZipFile, __: Codespace, ___: Config) -> SerializationData:
25 | gcb = archive.open('segmentation.json')
26 | data = json.loads(gcb.read())
27 | header = data["header"]
28 | gcb.close()
29 |
30 | header += "#define MAX_BREAK_STATES {0}\n".format(max_states)
31 |
32 | public_header = "#define UNICORN_FEATURE_SEGMENTATION\n"
33 | return SerializationData(header=header, public_header=public_header)
34 |
35 | class GraphemeBreak(Feature):
36 | @staticmethod
37 | def dependencies() -> Set[Type[Feature]]:
38 | return set([_Segmentation])
39 |
40 | def pre_process(self, codespace: Codespace) -> None:
41 | codespace.register_property("gcb", 0, StorageSize.UINT8)
42 | codespace.register_property("incb", 0, StorageSize.UINT8)
43 |
44 | def process(self, archive: zipfile.ZipFile, codespace: Codespace, _: Config) -> SerializationData:
45 | gcb = archive.open('gcb.json')
46 | data = json.loads(gcb.read())
47 | header = data["header"]
48 | source = data["source"]
49 | size = data["size"]
50 | gcb.close()
51 |
52 | global max_states
53 | max_states = max(int(data["states"]), max_states)
54 |
55 | gcb_property = codespace.get("gcb")
56 | for d in data["gcb"]:
57 | codespace.set(d[0], gcb_property, d[1])
58 |
59 | incb_property = codespace.get("incb")
60 | for d in data["incb"]:
61 | codespace.set(d[0], incb_property, d[1])
62 |
63 | public_header = "#define UNICORN_FEATURE_GCB\n"
64 | return SerializationData(source, header, public_header, size)
65 |
66 | class WordBreak(Feature):
67 | @staticmethod
68 | def dependencies() -> Set[Type[Feature]]:
69 | return set([_Segmentation])
70 |
71 | def pre_process(self, codespace: Codespace) -> None:
72 | codespace.register_property("wb", 0, StorageSize.UINT8)
73 | codespace.register_property("wbx", 0, StorageSize.UINT8)
74 |
75 | def process(self, archive: zipfile.ZipFile, codespace: Codespace, _: Config) -> SerializationData:
76 | wb = archive.open('wb.json')
77 | data = json.loads(wb.read())
78 | header = data["header"]
79 | source = data["source"]
80 | size = data["size"]
81 | wb.close()
82 |
83 | global max_states
84 | max_states = max(int(data["states"]), max_states)
85 |
86 | wb_property = codespace.get("wb")
87 | for d in data["wb"]:
88 | codespace.set(d[0], wb_property, d[1])
89 |
90 | wbx_property = codespace.get("wbx")
91 | for d in data["wbx"]:
92 | codespace.set(d[0], wbx_property, d[1])
93 |
94 | public_header = "#define UNICORN_FEATURE_WB\n"
95 | return SerializationData(source, header, public_header, size)
96 |
97 | class SentenceBreak(Feature):
98 | @staticmethod
99 | def dependencies() -> Set[Type[Feature]]:
100 | return set([_Segmentation])
101 |
102 | def pre_process(self, codespace: Codespace) -> None:
103 | codespace.register_property("sb", 0, StorageSize.UINT8)
104 |
105 | def process(self, archive: zipfile.ZipFile, codespace: Codespace, _: Config) -> SerializationData:
106 | wb = archive.open('sb.json')
107 | data = json.loads(wb.read())
108 | header = data["header"]
109 | source = data["source"]
110 | size = data["size"]
111 | wb.close()
112 |
113 | global max_states
114 | max_states = max(int(data["states"]), max_states)
115 |
116 | wb_property = codespace.get("sb")
117 | for d in data["sb"]:
118 | codespace.set(d[0], wb_property, d[1])
119 |
120 | public_header = "#define UNICORN_FEATURE_SB\n"
121 | return SerializationData(source, header, public_header, size)
122 |
--------------------------------------------------------------------------------
/src/properties.c:
--------------------------------------------------------------------------------
1 | /*
2 | * Unicorn - Embeddable Unicode Algorithms
3 | * Copyright (c) 2024-2025 Railgun Labs
4 | *
5 | * This software is dual-licensed: you can redistribute it and/or modify
6 | * it under the terms of the GNU General Public License version 3 as
7 | * published by the Free Software Foundation. For the terms of this
8 | * license, see .
9 | *
10 | * Alternatively, you can license this software under a proprietary
11 | * license, as set out in .
12 | */
13 |
14 | #include "common.h"
15 | #include "unidata.h"
16 | // Returns the general category stored for c, or UNI_UNASSIGNED when built without UNICORN_FEATURE_GC.
17 | UNICORN_API unigc uni_gc(unichar c) // cppcheck-suppress misra-c2012-8.7 ; This is supposed to have external linkage.
18 | {
19 | #if defined(UNICORN_FEATURE_GC)
20 | return (unigc)unicorn_get_codepoint_data(c)->general_category; // cppcheck-suppress premium-misra-c-2012-10.5 ; Intentional violation to unpack data from 32-bits to an enum.
21 | #else
22 | (void)c;
23 | return UNI_UNASSIGNED;
24 | #endif
25 | }
26 | // Returns the canonical combining class stored for c, or 0 when built without UNICORN_FEATURE_CCC.
27 | UNICORN_API uint8_t uni_ccc(unichar c) // cppcheck-suppress misra-c2012-8.7 ; This is supposed to have external linkage.
28 | {
29 | #if defined(UNICORN_FEATURE_CCC)
30 | return unicorn_get_codepoint_data(c)->canonical_combining_class;
31 | #else
32 | (void)c;
33 | return 0;
34 | #endif
35 | }
36 | // Reports whether bit p is set in the binary property bits stored for c; always false when built without UNICORN_FEATURE_BINARY_PROPERTIES.
37 | UNICORN_API bool uni_is(unichar c, unibp p) // cppcheck-suppress misra-c2012-8.7 ; This is supposed to have external linkage.
38 | {
39 | #if defined(UNICORN_FEATURE_BINARY_PROPERTIES)
40 | const uint32_t binary_props = (uint32_t)unicorn_get_codepoint_data(c)->binary_properties;
41 | const uint32_t prop = (uint32_t)p;
42 | const uint32_t bit_mask = (uint32_t)1 << prop; // NOTE(review): p is used as a shift count; a value of 32 or more would be undefined - confirm unibp never reaches that.
43 | return ((binary_props & bit_mask) == bit_mask) ? true : false;
44 | #else
45 | (void)c;
46 | (void)p;
47 | return false;
48 | #endif
49 | }
50 | // Returns the entry of unicorn_numeric_values selected for c, or NULL when c's offset is 0 or the library is built without UNICORN_FEATURE_NUMERIC_VALUE.
51 | UNICORN_API const char *uni_numval(unichar c) // cppcheck-suppress misra-c2012-8.7 ; This is supposed to have external linkage.
52 | {
53 | #if defined(UNICORN_FEATURE_NUMERIC_VALUE)
54 | const char *numval = NULL;
55 | const uint8_t index = unicorn_get_codepoint_data(c)->numeric_value_offset;
56 | if (index > (uint8_t)0) // Offset 0 is treated as "no numeric value".
57 | {
58 | assert((size_t)index < COUNT_OF(unicorn_numeric_values)); // LCOV_EXCL_BR_LINE
59 | numval = unicorn_numeric_values[index];
60 | }
61 | return numval;
62 | #else
63 | (void)c;
64 | return NULL;
65 | #endif
66 | }
67 | // Returns the simple lowercase mapping of c; returns c unchanged when built without UNICORN_FEATURE_SIMPLE_LOWERCASE_MAPPINGS.
68 | UNICORN_API unichar uni_tolower(unichar c)
69 | {
70 | #if defined(UNICORN_FEATURE_SIMPLE_LOWERCASE_MAPPINGS)
71 | unichar lower;
72 | const uint16_t mapping = unicorn_get_character_casing(c)->simple_lowercase_mapping;
73 | const uint16_t index = GET_CASED_VALUE(mapping);
74 | if (CASING_IN_TABLE(mapping)) // The value is an index into the shared mapping table.
75 | {
76 | assert((size_t)index < COUNT_OF(unicorn_simple_case_mappings)); // LCOV_EXCL_BR_LINE
77 | lower = unicorn_simple_case_mappings[index];
78 | }
79 | else
80 | {
81 | const int32_t diff = (int32_t)index; // Otherwise the value is a code point delta biased by CASING_DIFF.
82 | const int32_t cp = (int32_t)c - (diff - CASING_DIFF);
83 | lower = (unichar)cp;
84 | }
85 | return lower;
86 | #else
87 | return c;
88 | #endif
89 | }
90 | // Returns the simple uppercase mapping of c; returns c unchanged when built without UNICORN_FEATURE_SIMPLE_UPPERCASE_MAPPINGS.
91 | UNICORN_API unichar uni_toupper(unichar c)
92 | {
93 | #if defined(UNICORN_FEATURE_SIMPLE_UPPERCASE_MAPPINGS)
94 | unichar upper;
95 | const uint16_t mapping = unicorn_get_character_casing(c)->simple_uppercase_mapping;
96 | const uint16_t index = GET_CASED_VALUE(mapping);
97 | if (CASING_IN_TABLE(mapping)) // The value is an index into the shared mapping table.
98 | {
99 | assert((size_t)index < COUNT_OF(unicorn_simple_case_mappings)); // LCOV_EXCL_BR_LINE
100 | upper = unicorn_simple_case_mappings[index];
101 | }
102 | else
103 | {
104 | const int32_t diff = (int32_t)index; // Otherwise the value is a code point delta biased by CASING_DIFF.
105 | const int32_t cp = (int32_t)c - (diff - CASING_DIFF);
106 | upper = (unichar)cp;
107 | }
108 | return upper;
109 | #else
110 | return c;
111 | #endif
112 | }
113 | // Returns the simple titlecase mapping of c; returns c unchanged when built without UNICORN_FEATURE_SIMPLE_TITLECASE_MAPPINGS.
114 | UNICORN_API unichar uni_totitle(unichar c)
115 | {
116 | #if defined(UNICORN_FEATURE_SIMPLE_TITLECASE_MAPPINGS)
117 | unichar title;
118 | const uint16_t mapping = unicorn_get_character_casing(c)->simple_titlecase_mapping;
119 | const uint16_t index = GET_CASED_VALUE(mapping);
120 | if (CASING_IN_TABLE(mapping)) // The value is an index into the shared mapping table.
121 | {
122 | assert((size_t)index < COUNT_OF(unicorn_simple_case_mappings)); // LCOV_EXCL_BR_LINE
123 | title = unicorn_simple_case_mappings[index];
124 | }
125 | else
126 | {
127 | const int32_t diff = (int32_t)index; // Otherwise the value is a code point delta biased by CASING_DIFF.
128 | const int32_t cp = (int32_t)c - (diff - CASING_DIFF);
129 | title = (unichar)cp;
130 | }
131 | return title;
132 | #else
133 | return c;
134 | #endif
135 | }
136 |
--------------------------------------------------------------------------------
/examples/character_properties.c:
--------------------------------------------------------------------------------
1 | /*
2 | * This file is public domain.
3 | * You may use, modify, and distribute it without restriction.
4 | */
5 |
6 | // Examples are also documented online at:
7 | // .
8 |
9 | #include
10 | #include
11 | // Display names for the general categories, indexed by the value uni_gc() returns.
12 | static const char *const general_categories[] = {
13 | [UNI_UPPERCASE_LETTER] = "Uppercase_Letter",
14 | [UNI_LOWERCASE_LETTER] = "Lowercase_Letter",
15 | [UNI_TITLECASE_LETTER] = "Titlecase_Letter",
16 | [UNI_MODIFIER_LETTER] = "Modifier_Letter",
17 | [UNI_OTHER_LETTER] = "Other_Letter",
18 | [UNI_NONSPACING_MARK] = "Nonspacing_Mark",
19 | [UNI_SPACING_MARK] = "Spacing_Mark",
20 | [UNI_ENCLOSING_MARK] = "Enclosing_Mark",
21 | [UNI_DECIMAL_NUMBER] = "Decimal_Number",
22 | [UNI_LETTER_NUMBER] = "Letter_Number",
23 | [UNI_OTHER_NUMBER] = "Other_Number",
24 | [UNI_CONNECTOR_PUNCTUATION] = "Connector_Punctuation",
25 | [UNI_DASH_PUNCTUATION] = "Dash_Punctuation",
26 | [UNI_OPEN_PUNCTUATION] = "Open_Punctuation",
27 | [UNI_CLOSE_PUNCTUATION] = "Close_Punctuation",
28 | [UNI_INITIAL_PUNCTUATION] = "Initial_Punctuation",
29 | [UNI_FINAL_PUNCTUATION] = "Final_Punctuation",
30 | [UNI_OTHER_PUNCTUATION] = "Other_Punctuation",
31 | [UNI_MATH_SYMBOL] = "Math_Symbol",
32 | [UNI_CURRENCY_SYMBOL] = "Currency_Symbol",
33 | [UNI_MODIFIER_SYMBOL] = "Modifier_Symbol",
34 | [UNI_OTHER_SYMBOL] = "Other_Symbol",
35 | [UNI_SPACE_SEPARATOR] = "Space_Separator",
36 | [UNI_LINE_SEPARATOR] = "Line_Separator",
37 | [UNI_PARAGRAPH_SEPARATOR] = "Paragraph_Separator",
38 | [UNI_CONTROL] = "Control",
39 | [UNI_FORMAT] = "Format",
40 | [UNI_SURROGATE] = "Surrogate",
41 | [UNI_PRIVATE_USE] = "Private_Use",
42 | [UNI_UNASSIGNED] = "Unassigned",
43 | };
44 | // Display names for the binary properties, indexed by the property constant passed to uni_is().
45 | static const char *const binary_properties[] = {
46 | [UNI_NONCHARACTER_CODE_POINT] = "Noncharacter_Code_Point",
47 | [UNI_ALPHABETIC] = "Alphabetic",
48 | [UNI_LOWERCASE] = "Lowercase",
49 | [UNI_UPPERCASE] = "Uppercase",
50 | [UNI_HEX_DIGIT] = "Hex_Digit",
51 | [UNI_WHITE_SPACE] = "White_Space",
52 | [UNI_MATH] = "Math",
53 | [UNI_DASH] = "Dash",
54 | [UNI_DIACRITIC] = "Diacritic",
55 | [UNI_EXTENDER] = "Extender",
56 | [UNI_IDEOGRAPHIC] = "Ideographic",
57 | [UNI_QUOTATION_MARK] = "Quotation_Mark",
58 | [UNI_UNIFIED_IDEOGRAPH] = "Unified_Ideograph",
59 | [UNI_TERMINAL_PUNCTUATION] = "Terminal_Punctuation",
60 | };
61 | // Prints the character properties of one hard-coded code point; always returns 0.
62 | int main(int argc, char *argv[])
63 | {
64 | // This code example prints character properties associated with
65 | // a Unicode code point. The definition of each character property
66 | // is documented here:
67 | // .
68 |
69 | // The following variable defines the Unicode character whose
70 | // properties will be printed. In this example, it's Unicode
71 | // character U+0300. Change this value to a different character
72 | // and notice how the printed properties change.
73 | unichar cp = 0x300;
74 |
75 | unichar tmp_cp;
76 | uint8_t ccc;
77 | const char *numeric_value;
78 |
79 | printf("Code point: U+%04X\n", (unsigned int)cp); // %X expects unsigned int, so the code point is cast explicitly.
80 |
81 | // Print the General_Category property value.
82 | printf("General category: %s\n", general_categories[uni_gc(cp)]);
83 |
84 | // Print the Canonical_Combining_Class property value (if non-zero).
85 | ccc = uni_ccc(cp);
86 | if (ccc != 0)
87 | {
88 | printf("Canonical combining class: %d\n", ccc);
89 | }
90 |
91 | // Print each binary property assigned to the code point (table slots without a name are skipped).
92 | for (size_t i = 0; i < sizeof(binary_properties) / sizeof(binary_properties[0]); i++)
93 | {
94 | if ((binary_properties[i] != NULL) && uni_is(cp, (unibp)i))
95 | {
96 | printf("Binary property: %s\n", binary_properties[i]);
97 | }
98 | }
99 |
100 | // Print the Numeric_Value property value (if present).
101 | numeric_value = uni_numval(cp);
102 | if (numeric_value != NULL)
103 | {
104 | printf("Numeric value: %s\n", numeric_value);
105 | }
106 |
107 | // Print the Simple_Lowercase_Mapping (if unique).
108 | tmp_cp = uni_tolower(cp);
109 | if (tmp_cp != cp)
110 | {
111 | printf("Simple lower case mapping: U+%04X\n", (unsigned int)tmp_cp);
112 | }
113 |
114 | // Print the Simple_Uppercase_Mapping (if unique).
115 | tmp_cp = uni_toupper(cp);
116 | if (tmp_cp != cp)
117 | {
118 | printf("Simple upper case mapping: U+%04X\n", (unsigned int)tmp_cp);
119 | }
120 |
121 | // Print the Simple_Titlecase_Mapping (if unique).
122 | tmp_cp = uni_totitle(cp);
123 | if (tmp_cp != cp)
124 | {
125 | printf("Simple title case mapping: U+%04X\n", (unsigned int)tmp_cp);
126 | }
127 |
128 | return 0;
129 | }
130 |
--------------------------------------------------------------------------------
/src/unicorn.c:
--------------------------------------------------------------------------------
1 | /*
2 | * Unicorn - Embeddable Unicode Algorithms
3 | * Copyright (c) 2021-2025 Railgun Labs
4 | *
5 | * THE CONTENTS OF THIS PROJECT ARE PROPRIETARY AND CONFIDENTIAL. UNAUTHORIZED
6 | * COPYING, TRANSFERRING OR REPRODUCTION OF THE CONTENTS OF THIS PROJECT, VIA
7 | * ANY MEDIUM IS STRICTLY PROHIBITED.
8 | *
9 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
10 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
11 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
12 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
13 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
14 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
15 | * SOFTWARE.
16 | */
17 |
18 | #include "charbuf.h"
19 | #include "unidata.h"
20 |
21 | #if defined(_MSC_VER)
22 | #include
23 | #endif
24 | // Returns the number of set bits in n, using a compiler intrinsic when one is available and a bit-by-bit loop otherwise.
25 | static inline int32_t uni_popcnt(uint32_t n)
26 | {
27 | #if defined(_MSC_VER)
28 | return (int32_t)__popcnt(n); // cppcheck-suppress premium-misra-c-2012-17.3
29 | #elif defined(__GNUC__) || defined(__clang__)
30 | return (int32_t)__builtin_popcount(n); // cppcheck-suppress premium-misra-c-2012-17.3
31 | #else
32 | int32_t popcnt = 0;
33 | for (uint32_t v = (uint32_t)n; v != 0u; v >>= 1u)
34 | {
35 | if ((v & 1u) == 1u)
36 | {
37 | popcnt += 1;
38 | }
39 | }
40 | return popcnt;
41 | #endif
42 | }
43 | // Validates the endian and encoding-form flags in *encoding. When no endian flag is given for UTF-16/UTF-32, the native byte order flag is added to *encoding.
44 | static unistat check_encoding(uniattr *encoding)
45 | {
46 | unistat status = UNI_OK;
47 | const uniattr enc = *encoding;
48 | uint32_t flags;
49 |
50 | // Only one or none of the endian flags can be provided.
51 | uniattr mask = enc & (UNI_LITTLE | UNI_BIG);
52 | flags = (uint32_t)mask;
53 | switch (uni_popcnt(flags))
54 | {
55 | case 0: // No endian flag was provided; default to native byte order.
56 | if ((GET_ENCODING(enc) & (UNI_UTF16 | UNI_UTF32)) != (uniattr)0)
57 | {
58 | #if defined(UNICORN_BIG_ENDIAN)
59 | (*encoding) |= UNI_BIG;
60 | #else
61 | (*encoding) |= UNI_LITTLE;
62 | #endif
63 | }
64 | break;
65 | case 1: // Exactly one endian flag was provided (good).
66 | break;
67 | default: // Too many endian flags were provided (bad).
68 | uni_message("too many endian flags (must specify exactly one)");
69 | status = UNI_BAD_OPERATION;
70 | break;
71 | }
72 |
73 | if (status == UNI_OK)
74 | {
75 | // Make sure exactly ONE text encoding flag was provided.
76 | mask = GET_ENCODING(enc);
77 | flags = (uint32_t)mask;
78 | if (uni_popcnt(flags) != 1)
79 | {
80 | uni_message("expected exactly one encoding form");
81 | status = UNI_BAD_OPERATION;
82 | }
83 | }
84 |
85 | return status;
86 | }
87 | // Validates an input buffer's attributes: rejects UNI_NULIFY and a null text pointer, then defers to check_encoding().
88 | unistat uni_check_input_encoding(const void *text, unisize length, uniattr *encoding)
89 | {
90 | unistat status = UNI_OK;
91 | (void)length; // The length is not validated by this function.
92 |
93 | // Only destination buffers can use the null terminated flag.
94 | if (((*encoding) & UNI_NULIFY) == UNI_NULIFY)
95 | {
96 | uni_message("input buffer incompatible with 'UNI_NULIFY' flag");
97 | status = UNI_BAD_OPERATION;
98 | }
99 | else if (text == NULL)
100 | {
101 | uni_message("input buffer is null");
102 | status = UNI_BAD_OPERATION;
103 | }
104 | else
105 | {
106 | status = check_encoding(encoding);
107 | }
108 |
109 | return status;
110 | }
111 | // Validates an output buffer's attributes: rejects UNI_TRUST, a null or negative capacity, and a null buffer with non-zero capacity, then defers to check_encoding().
112 | unistat uni_check_output_encoding(const void *buffer, const unisize *capacity, uniattr *encoding)
113 | {
114 | unistat status = UNI_OK;
115 |
116 | // Only input buffers can use the trusted flag.
117 | if (((*encoding) & UNI_TRUST) == UNI_TRUST)
118 | {
119 | uni_message("output buffer incompatible with 'UNI_TRUST' flag");
120 | status = UNI_BAD_OPERATION;
121 | }
122 | else if (capacity == NULL)
123 | {
124 | uni_message("output buffer capacity is null");
125 | status = UNI_BAD_OPERATION;
126 | }
127 | else if (*capacity < 0)
128 | {
129 | uni_message("output buffer capacity is negative");
130 | status = UNI_BAD_OPERATION;
131 | }
132 | else if ((buffer == NULL) && (*capacity > 0)) // A null buffer is accepted only together with a zero capacity.
133 | {
134 | uni_message("output buffer cannot be null if its capacity is non-zero");
135 | status = UNI_BAD_OPERATION;
136 | }
137 | else
138 | {
139 | status = check_encoding(encoding);
140 | }
141 |
142 | return status;
143 | }
144 | // Writes the library version (1.3.1, matching the project version) to each non-null output pointer.
145 | UNICORN_API void uni_getversion(int32_t *major, int32_t *minor, int32_t *patch) // cppcheck-suppress misra-c2012-8.7 ; This is supposed to have external linkage.
146 | {
147 | if (major != NULL)
148 | {
149 | *major = 1;
150 | }
151 |
152 | if (minor != NULL)
153 | {
154 | *minor = 3;
155 | }
156 |
157 | if (patch != NULL)
158 | {
159 | *patch = 1;
160 | }
161 | }
162 | // Writes the UNICODE_VERSION_MAJOR/MINOR/PATCH constants to each non-null output pointer.
163 | UNICORN_API void uni_getucdversion(int32_t *major, int32_t *minor, int32_t *patch) // cppcheck-suppress misra-c2012-8.7 ; This is supposed to have external linkage.
164 | {
165 | if (major != NULL)
166 | {
167 | *major = UNICODE_VERSION_MAJOR;
168 | }
169 |
170 | if (minor != NULL)
171 | {
172 | *minor = UNICODE_VERSION_MINOR;
173 | }
174 |
175 | if (patch != NULL)
176 | {
177 | *patch = UNICODE_VERSION_PATCH;
178 | }
179 | }
180 |
--------------------------------------------------------------------------------
/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | cmake_minimum_required(VERSION 3.22)
2 | project(unicorn VERSION 1.3.1 LANGUAGES C)
3 |
4 | include(CheckIncludeFile)
5 | include(GNUInstallDirs)
6 |
7 | option(UNICORN_BUILD_STATIC "Build Unicorn as a static library" ON)
8 | option(UNICORN_BUILD_SHARED "Build Unicorn as a dynamic library" ON)
9 | option(UNICORN_BUILD_EXAMPLES "Build Unicorn code examples (requires the shared library)" OFF)
10 |
11 | option(UNICORN_BUILD_TESTS "Build the tests (commercial licensees only)" OFF)
12 | option(UNICORN_BUILD_COOKBOOK "Build the documentation/cookbook programs (commercial licensees only)" OFF)
13 | # Instrumentation toggles.
14 | option(UNICORN_CODE_COVERAGE "Toggle code coverage" OFF)
15 | option(UNICORN_UNDEFINED_BEHAVIOR_SANITIZER "Toggle undefined behavior sanitizer" OFF)
16 | option(UNICORN_ADDRESS_SANITIZER "Toggle address sanitizer" OFF)
17 | option(UNICORN_MEMORY_SANITIZER "Toggle memory sanitizer" OFF)
18 |
19 | if (NOT UNICORN_BUILD_STATIC AND NOT UNICORN_BUILD_SHARED) # At least one library flavor must be enabled.
20 | message(FATAL_ERROR "Please build either the static or shared library.")
21 | endif ()
22 |
23 | set(UNICORN_CONFIG "features.json" CACHE STRING "Path to a JSON file that configures the Unicorn feature set.")
24 | if (UNICORN_CONFIG STREQUAL "") # Only an empty value is rejected here; the path itself is not checked.
25 | message(FATAL_ERROR "please specify a unicorn configuration file (must be a JSON file)")
26 | endif()
27 |
28 | # MSVC assumes source files use the ANSI code page, which depends on the localized version of
29 | # Windows in use (Windows-1252 on U.S. and Western European Windows). Passing /utf-8 forces the
30 | # compiler to read the sources as UTF-8 instead.
31 | if (MSVC)
32 |     add_compile_options(/utf-8)
33 | endif ()
34 |
35 | # Instrumentation options are mutually exclusive: the first enabled branch below wins.
36 | if (UNICORN_CODE_COVERAGE AND NOT CMAKE_C_COMPILER_ID STREQUAL "GNU")
37 |     message(FATAL_ERROR "Code coverage can only be generated with GCC.")
38 | elseif (UNICORN_CODE_COVERAGE)
39 |     # Sanitizers can't be enabled together with coverage checks, so nothing is added here.
40 | elseif (UNICORN_UNDEFINED_BEHAVIOR_SANITIZER)
41 |     add_compile_options(-g -fsanitize=undefined)
42 |     add_link_options(-fsanitize=undefined)
43 | elseif (UNICORN_ADDRESS_SANITIZER)
44 |     add_compile_options(-g -fsanitize=address)
45 |     add_link_options(-fsanitize=address)
46 | elseif (UNICORN_MEMORY_SANITIZER)
47 |     add_compile_options(-O0 -g -fsanitize=memory -fsanitize-memory-track-origins)
48 |     add_link_options(-fsanitize=memory -fsanitize-memory-track-origins)
49 | endif ()
50 |
51 | # Check for required C standard library headers.
52 | #
53 | # Each header is probed with check_include_file(), which records its result in a
54 | # HAVE_<HEADER>_H variable (string.h -> HAVE_STRING_H, and so on). Configuration stops
55 | # at the first missing header, and the error names it so the failure is actionable.
56 | foreach (required_header string.h assert.h stdbool.h)
57 |     # Derive the result variable name: "string.h" -> "string_h" -> "HAVE_STRING_H".
58 |     string(MAKE_C_IDENTIFIER "${required_header}" header_id)
59 |     string(TOUPPER "HAVE_${header_id}" header_found)
60 |
61 |     check_include_file(${required_header} ${header_found})
62 |     if (NOT ${header_found})
63 |         message(FATAL_ERROR "could not find required header: ${required_header}")
64 |     endif ()
65 | endforeach ()
66 |
67 | # Python is required to generate the Unicode data tables.
68 | find_package(Python3 3.10.0 REQUIRED COMPONENTS Interpreter)
69 |
70 | # Run the Python generators; they re-run when unicode.bin, a generator script, or the config changes.
71 | file(GLOB_RECURSE PYTHON_SOURCES "scripts/*.py")
72 | set(GENERATED_FILES
73 |     "${CMAKE_BINARY_DIR}/unidata.c"
74 |     "${CMAKE_BINARY_DIR}/unidata.h"
75 |     "${CMAKE_BINARY_DIR}/_unicorn.h"
76 | )
77 | add_custom_command(
78 |     OUTPUT ${GENERATED_FILES}
79 |     COMMAND Python3::Interpreter -m scripts --config "${UNICORN_CONFIG}" --output "${CMAKE_BINARY_DIR}"
80 |     WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}
81 |     DEPENDS unicode.bin ${PYTHON_SOURCES} ${UNICORN_CONFIG})
82 | add_custom_target(unicorn_generate DEPENDS ${GENERATED_FILES})
83 |
84 | # These files are produced at build time, so flag them as GENERATED; otherwise CMake
85 | # would try to find them on disk.
86 | set_source_files_properties(
87 |     ${CMAKE_BINARY_DIR}/unidata.c ${CMAKE_BINARY_DIR}/unidata.h ${CMAKE_BINARY_DIR}/_unicorn.h
88 |     PROPERTIES GENERATED 1)
89 |
90 | # Header search path, in order: generated files, public headers, internal headers.
91 | include_directories(${CMAKE_BINARY_DIR} include src)
92 |
93 | # Build the Unicorn library itself.
94 | add_subdirectory(src)
95 |
96 | # The example programs are only built on request.
97 | if (UNICORN_BUILD_EXAMPLES)
98 |     add_subdirectory(examples)
99 | endif ()
100 |
101 | # Write UnicornConfigVersion.cmake, which carries the package version information.
102 | include(CMakePackageConfigHelpers)
103 | write_basic_package_version_file(${CMAKE_BINARY_DIR}/UnicornConfigVersion.cmake
104 |     COMPATIBILITY SameMajorVersion)
105 |
106 | # Values substituted (@ONLY) into the pkg-config and UnicornConfig.cmake templates.
107 | set(VERSION ${PROJECT_VERSION})
108 | set(prefix "${CMAKE_INSTALL_PREFIX}")
109 | set(exec_prefix "\${prefix}")
110 | set(libdir "\${exec_prefix}/${CMAKE_INSTALL_LIBDIR}")
111 | set(includedir "\${prefix}/${CMAKE_INSTALL_INCLUDEDIR}")
112 | configure_file("${CMAKE_SOURCE_DIR}/UnicornConfig.cmake.in" "${CMAKE_BINARY_DIR}/UnicornConfig.cmake" @ONLY)
113 | configure_file("${CMAKE_SOURCE_DIR}/unicorn.pc.in" "${CMAKE_BINARY_DIR}/unicorn.pc" @ONLY)
114 |
115 | # Man pages are only installed when running "make install" locally.
116 | add_subdirectory(man)
117 |
118 | # The "make install" action: CMake package files first, then the pkg-config file.
119 | install(FILES ${CMAKE_BINARY_DIR}/UnicornConfig.cmake ${CMAKE_BINARY_DIR}/UnicornConfigVersion.cmake DESTINATION cmake)
120 | install(FILES ${CMAKE_BINARY_DIR}/unicorn.pc DESTINATION ${CMAKE_INSTALL_LIBDIR}/pkgconfig)
121 |
122 | # Optional extras kept beside this source tree (commercial licensees only): unit tests, then the cookbook.
123 | if (UNICORN_BUILD_TESTS)
124 |     enable_testing()
125 |     include(CTest)
126 |     add_subdirectory(../tests tests)
127 | endif ()
128 |
129 | if (UNICORN_BUILD_COOKBOOK)
130 |     add_subdirectory(../docs docs)
131 | endif ()
132 |
--------------------------------------------------------------------------------