├── .gitignore
├── mise.toml
├── resources
│   ├── terminal-support-23107.pdf
│   └── wcwidth
│       ├── LICENSE_uniseg.txt
│       ├── LICENSE_go_runewidth.txt
│       ├── LICENSE_ziglyph.txt
│       ├── LICENSE_unicode_width.txt
│       ├── LICENSE_zg.txt
│       ├── LICENSE_wcwidth.txt
│       ├── go_runewidth.go
│       ├── utf8proc.jl
│       ├── uniseg.go
│       ├── ziglyph.zig
│       ├── LICENSE_utf8proc.md
│       ├── zg.zig
│       ├── wcwidth.py
│       └── wcwidth.c
├── src
│   ├── x
│   │   ├── types.x.zig
│   │   ├── config.x.zig
│   │   ├── types_x
│   │   │   └── grapheme.zig
│   │   ├── config_x
│   │   │   ├── grapheme_break.zig
│   │   │   └── wcwidth.zig
│   │   └── root.zig
│   ├── quirks.zig
│   ├── ascii.zig
│   ├── utf8.zig
│   ├── root.zig
│   ├── code_point.zig
│   ├── get.zig
│   ├── build
│   │   └── test_build_config.zig
│   └── config.zig
├── RESOURCES.md
├── licenses
│   ├── LICENSE_Bjoern_Hoehrmann
│   └── LICENSE_unicode
├── AGENTS.md
├── bin
│   └── fetch-ucd.sh
├── LICENSE.md
├── ucd
│   ├── .gitignore
│   ├── BidiBrackets.txt
│   ├── Blocks.txt
│   └── SpecialCasing.txt
└── README.md
/.gitignore:
--------------------------------------------------------------------------------
1 | .zig-cache/
2 | zig-out/
3 |
--------------------------------------------------------------------------------
/mise.toml:
--------------------------------------------------------------------------------
1 | [tools]
2 | zig = "0.15.2"
3 | hyperfine = "1.19.0"
4 |
--------------------------------------------------------------------------------
/resources/terminal-support-23107.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jacobsandlund/uucode/HEAD/resources/terminal-support-23107.pdf
--------------------------------------------------------------------------------
/src/x/types.x.zig:
--------------------------------------------------------------------------------
1 | pub const grapheme = @import("types_x/grapheme.zig");
2 |
3 | pub const GraphemeBreakNoControl = grapheme.GraphemeBreakNoControl;
4 |
--------------------------------------------------------------------------------
/src/x/config.x.zig:
--------------------------------------------------------------------------------
1 | pub const grapheme_break_no_control = @import("config_x/grapheme_break.zig").grapheme_break_no_control;
2 | pub const wcwidth = @import("config_x/wcwidth.zig").wcwidth;
3 |
--------------------------------------------------------------------------------
/RESOURCES.md:
--------------------------------------------------------------------------------
1 | # Resources
2 |
3 | This is an index of the resources used in `uucode`, mostly residing in [./resources](./resources).
4 |
5 | ## Unicode Character Database
6 |
7 | See [./ucd](./ucd) and [./bin/fetch-ucd.sh](./bin/fetch-ucd.sh), which fetches it.
8 |
9 | ## wcwidth
10 |
11 | * `terminal-support-23107.pdf` -
12 |
--------------------------------------------------------------------------------
/src/x/types_x/grapheme.zig:
--------------------------------------------------------------------------------
1 | pub const GraphemeBreakNoControl = enum(u5) {
2 | other,
3 | prepend,
4 | regional_indicator,
5 | spacing_mark,
6 | l,
7 | v,
8 | t,
9 | lv,
10 | lvt,
11 | zwj,
12 | zwnj,
13 | extended_pictographic,
14 | emoji_modifier_base,
15 | emoji_modifier,
16 | // extend, ==
17 | // zwnj +
18 | // indic_conjunct_break_extend +
19 | // indic_conjunct_break_linker
20 | indic_conjunct_break_extend,
21 | indic_conjunct_break_linker,
22 | indic_conjunct_break_consonant,
23 | };
24 |
--------------------------------------------------------------------------------
/src/x/config_x/grapheme_break.zig:
--------------------------------------------------------------------------------
1 | const std = @import("std");
2 | const config = @import("config.zig");
3 | const types_x = @import("types.x.zig");
4 |
5 | fn compute(
6 | allocator: std.mem.Allocator,
7 | cp: u21,
8 | data: anytype,
9 | backing: anytype,
10 | tracking: anytype,
11 | ) std.mem.Allocator.Error!void {
12 | _ = allocator;
13 | _ = cp;
14 | _ = backing;
15 | _ = tracking;
16 |
17 | data.grapheme_break_no_control = switch (data.grapheme_break) {
18 | .control, .cr, .lf => .other,
19 | inline else => |tag| comptime std.meta.stringToEnum(
20 | types_x.GraphemeBreakNoControl,
21 | @tagName(tag),
22 | ) orelse unreachable,
23 | };
24 | }
25 |
26 | pub const grapheme_break_no_control = config.Extension{
27 | .inputs = &.{
28 | "grapheme_break",
29 | },
30 | .compute = &compute,
31 | .fields = &.{
32 | .{ .name = "grapheme_break_no_control", .type = types_x.GraphemeBreakNoControl },
33 | },
34 | };
35 |
--------------------------------------------------------------------------------
/licenses/LICENSE_Bjoern_Hoehrmann:
--------------------------------------------------------------------------------
1 | From https://bjoern.hoehrmann.de/utf-8/decoder/dfa/
2 |
3 | Copyright (c) 2008-2009 Bjoern Hoehrmann
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
6 |
7 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
8 |
9 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
10 |
--------------------------------------------------------------------------------
/AGENTS.md:
--------------------------------------------------------------------------------
1 | # uucode (Micro/µ Unicode)
2 |
3 | ## Project Overview
4 |
5 | This library intends to provide a minimal set of Unicode functionality to enable Ghostty and similar projects.
6 |
7 | The architecture works in a few layers:
8 |
9 | * Layer 1 (@src/build/Ucd.zig): Parses the Unicode Character Database (UCD).
10 | * Layer 2 (@src/build/tables.zig): Generates table data written to a zig file.
11 | * Layer 3 (@src/root.zig): Exposes methods to fetch information from the built tables (see the usage sketch below).
12 |
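As a rough illustration of the Layer 3 API (a minimal sketch, not a spec: it assumes the package is consumed as a Zig module named `uucode`, and the lookups mirror the tests in `src/root.zig`):

```zig
const std = @import("std");
const uucode = @import("uucode"); // module name is an assumption

test "layer 3 lookup sketch" {
    // Properties are read from the generated tables via `get`.
    try std.testing.expect(uucode.get(.is_alphabetic, 'A'));
    try std.testing.expect(uucode.get(.general_category, 'A') == .letter_uppercase);
}
```
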
13 | ## Build & Commands
14 |
15 | * Build and test with: `zig build test`
16 | * Run a single test: `zig build test -Dtest-filter="test name"`
17 | * Format code with: `zig fmt`
18 |
19 | Always run `zig build test` to check that changes still pass.
20 |
21 | ## Code Style
22 |
23 | Follow standard Zig conventions, keeping imports at the top of the file.
24 |
25 | Prefer self-documenting code to comments, but add detailed comments for anything that needs explanation.
26 |
27 | Never leave trailing whitespace in lines of source code.
28 |
29 | ## Testing
30 |
31 | Add `test ""` blocks directly below the code they are testing, with additional blocks at the bottom of the module for testing the module as a whole.
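
For example (a minimal sketch of this layout, reusing `isAscii` from `src/ascii.zig`):

```zig
const std = @import("std");

/// Returns whether the code point is a 7-bit ASCII character.
pub fn isAscii(c: u21) bool {
    return c < 128;
}

// Test block directly below the code it covers.
test "isAscii" {
    try std.testing.expect(isAscii('A'));
    try std.testing.expect(!isAscii(0x1F600));
}
```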
32 |
--------------------------------------------------------------------------------
/resources/wcwidth/LICENSE_uniseg.txt:
--------------------------------------------------------------------------------
1 | # cpv: track https://github.com/rivo/uniseg/blob/087b3e4194c1feb0856b68d0e7c425c0994829cf/LICENSE.txt#L1-L21
2 | MIT License
3 |
4 | Copyright (c) 2019 Oliver Kuederle
5 |
6 | Permission is hereby granted, free of charge, to any person obtaining a copy
7 | of this software and associated documentation files (the "Software"), to deal
8 | in the Software without restriction, including without limitation the rights
9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 | copies of the Software, and to permit persons to whom the Software is
11 | furnished to do so, subject to the following conditions:
12 |
13 | The above copyright notice and this permission notice shall be included in all
14 | copies or substantial portions of the Software.
15 |
16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 | SOFTWARE.
23 | # cpv: end
24 |
--------------------------------------------------------------------------------
/bin/fetch-ucd.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | version="16.0.0"
4 |
5 | base_url="https://www.unicode.org/Public/zipped/${version}"
6 | emoji_url="https://www.unicode.org/Public/emoji/latest"
7 |
8 | mv ucd/.gitignore ucd-gitignore
9 | rm -rf ucd
10 | mkdir -p ucd/Unihan
11 | mv ucd-gitignore ucd/.gitignore
12 |
13 | cd ucd
14 | curl -o ucd.zip "${base_url}/UCD.zip"
15 | unzip ucd.zip
16 | rm ucd.zip
17 |
18 | cd emoji
19 | curl -o emoji-sequences.txt "${emoji_url}/emoji-sequences.txt"
20 | curl -o emoji-test.txt "${emoji_url}/emoji-test.txt"
21 | curl -o emoji-zwj-sequences.txt "${emoji_url}/emoji-zwj-sequences.txt"
22 | cd ..
23 |
24 | cd Unihan
25 | curl -o unihan.zip "${base_url}/Unihan.zip"
26 | unzip unihan.zip
27 | rm unihan.zip
28 | cd ..
29 |
30 | cd ..
31 |
32 | echo
33 | echo "########################################################################"
34 | echo
35 | echo "Done fetching UCD files"
36 | echo
37 | echo "To start parsing any new files, explicitly add them to the list of"
38 | echo ".gitignore exceptions: comment them out with a '#' and append '(used)' at the end."
39 | echo
40 | echo "Next, flip the 'is_updating_ucd' flag in 'src/config.zig' to true, and"
41 | echo "'zig build test' once, updating the 'default' config if it needs"
42 | echo "changing, before flipping 'is_updating_ucd' back to false."
43 | echo
44 |
--------------------------------------------------------------------------------
/resources/wcwidth/LICENSE_go_runewidth.txt:
--------------------------------------------------------------------------------
1 | # cpv: track https://github.com/mattn/go-runewidth/blob/7770d045cdc691f0fcb87b0364a83f0de2d1a421/LICENSE#L1-L21
2 | The MIT License (MIT)
3 |
4 | Copyright (c) 2016 Yasuhiro Matsumoto
5 |
6 | Permission is hereby granted, free of charge, to any person obtaining a copy
7 | of this software and associated documentation files (the "Software"), to deal
8 | in the Software without restriction, including without limitation the rights
9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 | copies of the Software, and to permit persons to whom the Software is
11 | furnished to do so, subject to the following conditions:
12 |
13 | The above copyright notice and this permission notice shall be included in all
14 | copies or substantial portions of the Software.
15 |
16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 | SOFTWARE.
23 | # cpv: end
24 |
--------------------------------------------------------------------------------
/resources/wcwidth/LICENSE_ziglyph.txt:
--------------------------------------------------------------------------------
1 | # cpv: track https://codeberg.org/dude_the_builder/ziglyph/src/commit/29760d237219cc4d486f5cd654262d7b0d62d511/LICENSE#L1-L21
2 | MIT License
3 |
4 | Copyright (c) 2021 Jose Colon Rodriguez
5 |
6 | Permission is hereby granted, free of charge, to any person obtaining a copy
7 | of this software and associated documentation files (the "Software"), to deal
8 | in the Software without restriction, including without limitation the rights
9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 | copies of the Software, and to permit persons to whom the Software is
11 | furnished to do so, subject to the following conditions:
12 |
13 | The above copyright notice and this permission notice shall be included in all
14 | copies or substantial portions of the Software.
15 |
16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 | SOFTWARE.
23 | # cpv: end
24 |
--------------------------------------------------------------------------------
/resources/wcwidth/LICENSE_unicode_width.txt:
--------------------------------------------------------------------------------
1 | # cpv: track https://github.com/unicode-rs/unicode-width/blob/9d98411769fe13c7c18cab0b3fbbab29ba8350ea/LICENSE-MIT#L1-L25
2 | Copyright (c) 2015 The Rust Project Developers
3 |
4 | Permission is hereby granted, free of charge, to any
5 | person obtaining a copy of this software and associated
6 | documentation files (the "Software"), to deal in the
7 | Software without restriction, including without
8 | limitation the rights to use, copy, modify, merge,
9 | publish, distribute, sublicense, and/or sell copies of
10 | the Software, and to permit persons to whom the Software
11 | is furnished to do so, subject to the following
12 | conditions:
13 |
14 | The above copyright notice and this permission notice
15 | shall be included in all copies or substantial portions
16 | of the Software.
17 |
18 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF
19 | ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED
20 | TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
21 | PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT
22 | SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
23 | CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
24 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR
25 | IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
26 | DEALINGS IN THE SOFTWARE.
27 | # cpv: end
28 |
--------------------------------------------------------------------------------
/resources/wcwidth/LICENSE_zg.txt:
--------------------------------------------------------------------------------
1 | # cpv: track https://codeberg.org/atman/zg/src/commit/9427a9e53aaa29ee071f4dcb35b809a699d75aa9/LICENSE#L1-L22
2 | MIT License
3 |
4 | Copyright (c) 2021 Jose Colon Rodriguez
5 | Copyright (c) 2025 Sam Atman and contributors
6 |
7 | Permission is hereby granted, free of charge, to any person obtaining a copy
8 | of this software and associated documentation files (the "Software"), to deal
9 | in the Software without restriction, including without limitation the rights
10 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11 | copies of the Software, and to permit persons to whom the Software is
12 | furnished to do so, subject to the following conditions:
13 |
14 | The above copyright notice and this permission notice shall be included in all
15 | copies or substantial portions of the Software.
16 |
17 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
23 | SOFTWARE.
24 | # cpv: end
25 |
--------------------------------------------------------------------------------
/LICENSE.md:
--------------------------------------------------------------------------------
1 | # uucode license
2 |
3 | MIT License
4 |
5 | Copyright (c) 2025 Jacob Sandlund
6 |
7 | Permission is hereby granted, free of charge, to any person obtaining a copy of
8 | this software and associated documentation files (the "Software"), to deal in
9 | the Software without restriction, including without limitation the rights to
10 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
11 | of the Software, and to permit persons to whom the Software is furnished to do
12 | so, subject to the following conditions:
13 |
14 | The above copyright notice and this permission notice shall be included in all
15 | copies or substantial portions of the Software.
16 |
17 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
23 | SOFTWARE.
24 |
25 | ## Other licenses
26 |
27 | See [./licenses](./licenses) for licenses of code being used in the repo:
28 |
29 | * [LICENSE_Bjoern_Hoehrmann](./licenses/LICENSE_Bjoern_Hoehrmann)
30 | * [LICENSE_unicode](./licenses/LICENSE_unicode)
31 |
--------------------------------------------------------------------------------
/resources/wcwidth/LICENSE_wcwidth.txt:
--------------------------------------------------------------------------------
1 | # cpv: track https://github.com/jquast/wcwidth/blob/915166f9453098a56e87a7fb69e697696cefe206/LICENSE#L1-L27
2 | The MIT License (MIT)
3 |
4 | Copyright (c) 2014 Jeff Quast
5 |
6 | Permission is hereby granted, free of charge, to any person obtaining a copy
7 | of this software and associated documentation files (the "Software"), to deal
8 | in the Software without restriction, including without limitation the rights
9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 | copies of the Software, and to permit persons to whom the Software is
11 | furnished to do so, subject to the following conditions:
12 |
13 | The above copyright notice and this permission notice shall be included in all
14 | copies or substantial portions of the Software.
15 |
16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 | SOFTWARE.
23 |
24 | Markus Kuhn -- 2007-05-26 (Unicode 5.0)
25 |
26 | Permission to use, copy, modify, and distribute this software
27 | for any purpose and without fee is hereby granted. The author
28 | disclaims all warranties with regard to this software.
29 | # cpv: end
30 |
--------------------------------------------------------------------------------
/resources/wcwidth/go_runewidth.go:
--------------------------------------------------------------------------------
1 | // cpv: track https://github.com/mattn/go-runewidth/blob/7770d045cdc691f0fcb87b0364a83f0de2d1a421/runewidth.go#L115-L156
2 | // RuneWidth returns the number of cells in r.
3 | // See http://www.unicode.org/reports/tr11/
4 | func (c *Condition) RuneWidth(r rune) int {
5 | if r < 0 || r > 0x10FFFF {
6 | return 0
7 | }
8 | if len(c.combinedLut) > 0 {
9 | return int(c.combinedLut[r>>1]>>(uint(r&1)*4)) & 3
10 | }
11 | // optimized version, verified by TestRuneWidthChecksums()
12 | if !c.EastAsianWidth {
13 | switch {
14 | case r < 0x20:
15 | return 0
16 | case (r >= 0x7F && r <= 0x9F) || r == 0xAD: // nonprint
17 | return 0
18 | case r < 0x300:
19 | return 1
20 | case inTable(r, narrow):
21 | return 1
22 | case inTables(r, nonprint, combining):
23 | return 0
24 | case inTable(r, doublewidth):
25 | return 2
26 | default:
27 | return 1
28 | }
29 | } else {
30 | switch {
31 | case inTables(r, nonprint, combining):
32 | return 0
33 | case inTable(r, narrow):
34 | return 1
35 | case inTables(r, ambiguous, doublewidth):
36 | return 2
37 | case !c.StrictEmojiNeutral && inTables(r, ambiguous, emoji, narrow):
38 | return 2
39 | default:
40 | return 1
41 | }
42 | }
43 | }
44 | // cpv: end
45 |
46 | // cpv: track https://github.com/mattn/go-runewidth/blob/7770d045cdc691f0fcb87b0364a83f0de2d1a421/runewidth.go#L179-L193
47 | // StringWidth return width as you can see
48 | func (c *Condition) StringWidth(s string) (width int) {
49 | g := graphemes.FromString(s)
50 | for g.Next() {
51 | var chWidth int
52 | for _, r := range g.Value() {
53 | chWidth = c.RuneWidth(r)
54 | if chWidth > 0 {
55 | break // Our best guess at this point is to use the width of the first non-zero-width rune.
56 | }
57 | }
58 | width += chWidth
59 | }
60 | return
61 | }
62 | // cpv: end
63 |
--------------------------------------------------------------------------------
/src/quirks.zig:
--------------------------------------------------------------------------------
1 | //! cpv: track https://github.com/ghostty-org/ghostty/blob/cb45410dccc381b0dab54110b841dd216eb86d66/src/quirks.zig#L1-L10
2 | //! Inspired by WebKit's quirks.cpp[1], this file centralizes all our
3 | //! sad environment-specific hacks that we have to do to make things work.
4 | //! This is a last resort; if we can find a general solution to a problem,
5 | //! we of course prefer that, but sometimes other software, fonts, etc. are
6 | //! just broken or weird and we have to work around it.
7 | //!
8 | //! [1]: https://github.com/WebKit/WebKit/blob/main/Source/WebCore/page/Quirks.cpp
9 |
10 | const std = @import("std");
11 | const builtin = @import("builtin");
12 | // cpv: end
13 |
14 | /// cpv: track https://github.com/ghostty-org/ghostty/blob/cb45410dccc381b0dab54110b841dd216eb86d66/src/quirks.zig#L32-L57
15 | /// We use our own assert function instead of `std.debug.assert`.
16 | ///
17 | /// The only difference between this and the one in
18 | /// the stdlib is that this version is marked inline.
19 | ///
20 | /// The reason for this is that, despite the promises of the doc comment
21 | /// on the stdlib function, the function call to `std.debug.assert` isn't
22 | /// always optimized away in `ReleaseFast` mode, at least in Zig 0.15.2.
23 | ///
24 | /// In the majority of places, the overhead from calling an empty function
25 | /// is negligible, but we have some asserts inside tight loops and hotpaths
26 | /// that cause significant overhead (as much as 15-20%) when they don't get
27 | /// optimized out.
28 | pub const inlineAssert = switch (builtin.mode) {
29 | // In debug builds we just use std.debug.assert because this
30 | // fixes up stack traces. `inline` causes broken stack traces. This
31 | // is probably a Zig compiler bug but until it is fixed we have to
32 | // do this for development sanity.
33 | .Debug => std.debug.assert,
34 |
35 | .ReleaseSmall, .ReleaseSafe, .ReleaseFast => (struct {
36 | inline fn assert(ok: bool) void {
37 | if (!ok) unreachable;
38 | }
39 | }).assert,
40 | };
41 | // cpv: end
42 |
--------------------------------------------------------------------------------
/resources/wcwidth/utf8proc.jl:
--------------------------------------------------------------------------------
1 | # cpv: track https://github.com/JuliaStrings/utf8proc/blob/90daf9f396cfec91668758eb9cc54bd5248a6b89/data/data_generator.jl#L202-L249
2 | let ea_widths = read_east_asian_widths("EastAsianWidth.txt")
3 | # Following work by @jiahao, we compute character widths using a combination of
4 | # * character category
5 | # * UAX 11: East Asian Width
6 | # * a few exceptions as needed
7 | # Adapted from http://nbviewer.ipython.org/gist/jiahao/07e8b08bf6d8671e9734
8 | global function derive_char_width(code, category)
9 | # Use a default width of 1 for all character categories that are
10 | # letter/symbol/number-like, as well as for unassigned/private-use chars.
11 | # This provides a useful nonzero fallback for new codepoints when a new
12 | # Unicode version has been released.
13 | width = 1
14 |
15 | # Various zero-width categories
16 | #
17 | # "Sk" not included in zero width - see issue #167
18 | if category in ("Mn", "Mc", "Me", "Zl", "Zp", "Cc", "Cf", "Cs")
19 | width = 0
20 | end
21 |
22 | # Widths from UAX #11: East Asian Width
23 | eaw = get(ea_widths, code, nothing)
24 | if !isnothing(eaw)
25 | width = eaw < 0 ? 1 : eaw
26 | end
27 |
28 | # A few exceptional cases, found by manual comparison to other wcwidth
29 | # functions and similar checks.
30 | if category == "Mn"
31 | width = 0
32 | end
33 |
34 | if code == 0x00ad
35 | # Soft hyphen is typically printed as a hyphen (-) in terminals.
36 | width = 1
37 | elseif code == 0x2028 || code == 0x2029
38 | #By definition, should have zero width (on the same line)
39 | #0x002028 '\u2028' category: Zl name: LINE SEPARATOR/
40 | #0x002029 '\u2029' category: Zp name: PARAGRAPH SEPARATOR/
41 | width = 0
42 | end
43 |
44 | return width
45 | end
46 | global function is_ambiguous_width(code)
47 | return get(ea_widths, code, 0) < 0
48 | end
49 | end
50 | # cpv: end
51 |
--------------------------------------------------------------------------------
/licenses/LICENSE_unicode:
--------------------------------------------------------------------------------
1 | https://www.unicode.org/license.txt
2 |
3 | UNICODE LICENSE V3
4 |
5 | COPYRIGHT AND PERMISSION NOTICE
6 |
7 | Copyright © 1991-2025 Unicode, Inc.
8 |
9 | NOTICE TO USER: Carefully read the following legal agreement. BY
10 | DOWNLOADING, INSTALLING, COPYING OR OTHERWISE USING DATA FILES, AND/OR
11 | SOFTWARE, YOU UNEQUIVOCALLY ACCEPT, AND AGREE TO BE BOUND BY, ALL OF THE
12 | TERMS AND CONDITIONS OF THIS AGREEMENT. IF YOU DO NOT AGREE, DO NOT
13 | DOWNLOAD, INSTALL, COPY, DISTRIBUTE OR USE THE DATA FILES OR SOFTWARE.
14 |
15 | Permission is hereby granted, free of charge, to any person obtaining a
16 | copy of data files and any associated documentation (the "Data Files") or
17 | software and any associated documentation (the "Software") to deal in the
18 | Data Files or Software without restriction, including without limitation
19 | the rights to use, copy, modify, merge, publish, distribute, and/or sell
20 | copies of the Data Files or Software, and to permit persons to whom the
21 | Data Files or Software are furnished to do so, provided that either (a)
22 | this copyright and permission notice appear with all copies of the Data
23 | Files or Software, or (b) this copyright and permission notice appear in
24 | associated Documentation.
25 |
26 | THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY
27 | KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
28 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF
29 | THIRD PARTY RIGHTS.
30 |
31 | IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS NOTICE
32 | BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES,
33 | OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
34 | WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
35 | ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THE DATA
36 | FILES OR SOFTWARE.
37 |
38 | Except as contained in this notice, the name of a copyright holder shall
39 | not be used in advertising or otherwise to promote the sale, use or other
40 | dealings in these Data Files or Software without prior written
41 | authorization of the copyright holder.
42 |
--------------------------------------------------------------------------------
/src/ascii.zig:
--------------------------------------------------------------------------------
1 | const std = @import("std");
2 |
3 | /// Returns whether the code point is alphanumeric: A-Z, a-z, or 0-9.
4 | pub fn isAlphanumeric(c: u21) bool {
5 | return switch (c) {
6 | '0'...'9', 'A'...'Z', 'a'...'z' => true,
7 | else => false,
8 | };
9 | }
10 |
11 | /// Returns whether the code point is alphabetic: A-Z or a-z.
12 | pub fn isAlphabetic(c: u21) bool {
13 | return switch (c) {
14 | 'A'...'Z', 'a'...'z' => true,
15 | else => false,
16 | };
17 | }
18 |
19 | /// Returns whether the code point is a control character.
20 | ///
21 | /// See also: `control_code`
22 | pub fn isControl(c: u21) bool {
23 | return c <= std.ascii.control_code.us or c == std.ascii.control_code.del;
24 | }
25 |
26 | /// Returns whether the code point is a digit.
27 | pub fn isDigit(c: u21) bool {
28 | return switch (c) {
29 | '0'...'9' => true,
30 | else => false,
31 | };
32 | }
33 |
34 | /// Returns whether the code point is a lowercase letter.
35 | pub fn isLower(c: u21) bool {
36 | return switch (c) {
37 | 'a'...'z' => true,
38 | else => false,
39 | };
40 | }
41 |
42 | /// Returns whether the code point is printable and has some graphical representation,
43 | /// including the space code point.
44 | pub fn isPrint(c: u21) bool {
45 | return isAscii(c) and !isControl(c);
46 | }
47 |
48 | /// Returns whether this code point is included in `whitespace`.
49 | pub fn isWhitespace(c: u21) bool {
50 | return switch (c) {
51 | ' ', '\t'...'\r' => true,
52 | else => false,
53 | };
54 | }
55 |
56 | /// Returns whether the code point is an uppercase letter.
57 | pub fn isUpper(c: u21) bool {
58 | return switch (c) {
59 | 'A'...'Z' => true,
60 | else => false,
61 | };
62 | }
63 |
64 | /// Returns whether the code point is a hexadecimal digit: A-F, a-f, or 0-9.
65 | pub fn isHex(c: u21) bool {
66 | return switch (c) {
67 | '0'...'9', 'A'...'F', 'a'...'f' => true,
68 | else => false,
69 | };
70 | }
71 |
72 | /// Returns whether the code point is a 7-bit ASCII character.
73 | pub fn isAscii(c: u21) bool {
74 | return c < 128;
75 | }
76 |
77 | /// Uppercases the code point and returns it as-is if already uppercase or not a letter.
78 | pub fn toUpper(c: u21) u21 {
79 | const mask = @as(u21, @intFromBool(isLower(c))) << 5;
80 | return c ^ mask;
81 | }
82 |
83 | /// Lowercases the code point and returns it as-is if already lowercase or not a letter.
84 | pub fn toLower(c: u21) u21 {
85 | const mask = @as(u21, @intFromBool(isUpper(c))) << 5;
86 | return c | mask;
87 | }
88 |
--------------------------------------------------------------------------------
/ucd/.gitignore:
--------------------------------------------------------------------------------
1 | # We use explicit ignore instead of a blanket ignore rule
2 | # plus un-ignoring needed files because `fd` doesn't seem
3 | # to list un-ignored files correctly
4 |
5 | ArabicShaping.txt
6 | # BidiBrackets.txt (used)
7 | BidiCharacterTest.txt
8 | BidiMirroring.txt
9 | BidiTest.txt
10 | # Blocks.txt (used)
11 | CJKRadicals.txt
12 | # CaseFolding.txt (used)
13 | CompositionExclusions.txt
14 | DerivedAge.txt
15 | # DerivedCoreProperties.txt (used)
16 | DerivedNormalizationProps.txt
17 | DoNotEmit.txt
18 | EastAsianWidth.txt
19 | EmojiSources.txt
20 | EquivalentUnifiedIdeograph.txt
21 | HangulSyllableType.txt
22 | Index.txt
23 | IndicPositionalCategory.txt
24 | IndicSyllabicCategory.txt
25 | Jamo.txt
26 | LineBreak.txt
27 | NameAliases.txt
28 | NamedSequences.txt
29 | NamedSequencesProv.txt
30 | NamesList.html
31 | NamesList.txt
32 | NormalizationCorrections.txt
33 | NormalizationTest.txt
34 | NushuSources.txt
35 | PropList.txt
36 | PropertyAliases.txt
37 | PropertyValueAliases.txt
38 | ReadMe.txt
39 | ScriptExtensions.txt
40 | Scripts.txt
41 | # SpecialCasing.txt (used)
42 | StandardizedVariants.txt
43 | TangutSources.txt
44 | USourceData.txt
45 | USourceGlyphs.pdf
46 | USourceRSChart.pdf
47 | # UnicodeData.txt (used)
48 | Unihan/Unihan_DictionaryIndices.txt
49 | Unihan/Unihan_DictionaryLikeData.txt
50 | Unihan/Unihan_IRGSources.txt
51 | Unihan/Unihan_NumericValues.txt
52 | Unihan/Unihan_OtherMappings.txt
53 | Unihan/Unihan_RadicalStrokeCounts.txt
54 | Unihan/Unihan_Readings.txt
55 | Unihan/Unihan_Variants.txt
56 | Unikemet.txt
57 | VerticalOrientation.txt
58 | # auxiliary/GraphemeBreakProperty.txt (used)
59 | auxiliary/GraphemeBreakTest.html
60 | # auxiliary/GraphemeBreakTest.txt
61 | auxiliary/LineBreakTest.html
62 | auxiliary/LineBreakTest.txt
63 | auxiliary/SentenceBreakProperty.txt
64 | auxiliary/SentenceBreakTest.html
65 | auxiliary/SentenceBreakTest.txt
66 | auxiliary/WordBreakProperty.txt
67 | auxiliary/WordBreakTest.html
68 | auxiliary/WordBreakTest.txt
69 | emoji/ReadMe.txt
70 | # emoji/emoji-data.txt (used)
71 | emoji/emoji-sequences.txt
72 | emoji/emoji-test.txt
73 | # emoji/emoji-variation-sequences.txt (used)
74 | # emoji/emoji-zwj-sequences.txt (for reference)
75 | # extracted/DerivedBidiClass.txt (used)
76 | extracted/DerivedBinaryProperties.txt
77 | extracted/DerivedCombiningClass.txt
78 | extracted/DerivedDecompositionType.txt
79 | # extracted/DerivedEastAsianWidth.txt (used)
80 | # extracted/DerivedGeneralCategory.txt (unused, but useful for reference)
81 | extracted/DerivedJoiningGroup.txt
82 | extracted/DerivedJoiningType.txt
83 | extracted/DerivedLineBreak.txt
84 | extracted/DerivedName.txt
85 | extracted/DerivedNumericType.txt
86 | extracted/DerivedNumericValues.txt
87 |
--------------------------------------------------------------------------------
/resources/wcwidth/uniseg.go:
--------------------------------------------------------------------------------
1 | // cpv: track https://github.com/rivo/uniseg/blob/087b3e4194c1feb0856b68d0e7c425c0994829cf/width.go#L3-L61
2 | // EastAsianAmbiguousWidth specifies the monospace width for East Asian
3 | // characters classified as Ambiguous. The default is 1 but some rare fonts
4 | // render them with a width of 2.
5 | var EastAsianAmbiguousWidth = 1
6 |
7 | // runeWidth returns the monospace width for the given rune. The provided
8 | // grapheme property is a value mapped by the [graphemeCodePoints] table.
9 | //
10 | // Every rune has a width of 1, except for runes with the following properties
11 | // (evaluated in this order):
12 | //
13 | // - Control, CR, LF, Extend, ZWJ: Width of 0
14 | // - \u2e3a, TWO-EM DASH: Width of 3
15 | // - \u2e3b, THREE-EM DASH: Width of 4
16 | // - East-Asian width Fullwidth and Wide: Width of 2 (Ambiguous and Neutral
17 | // have a width of 1)
18 | // - Regional Indicator: Width of 2
19 | // - Extended Pictographic: Width of 2, unless Emoji Presentation is "No".
20 | func runeWidth(r rune, graphemeProperty int) int {
21 | switch graphemeProperty {
22 | case prControl, prCR, prLF, prExtend, prZWJ:
23 | return 0
24 | case prRegionalIndicator:
25 | return 2
26 | case prExtendedPictographic:
27 | if property(emojiPresentation, r) == prEmojiPresentation {
28 | return 2
29 | }
30 | return 1
31 | }
32 |
33 | switch r {
34 | case 0x2e3a:
35 | return 3
36 | case 0x2e3b:
37 | return 4
38 | }
39 |
40 | switch propertyEastAsianWidth(r) {
41 | case prW, prF:
42 | return 2
43 | case prA:
44 | return EastAsianAmbiguousWidth
45 | }
46 |
47 | return 1
48 | }
49 |
50 | // StringWidth returns the monospace width for the given string, that is, the
51 | // number of same-size cells to be occupied by the string.
52 | func StringWidth(s string) (width int) {
53 | state := -1
54 | for len(s) > 0 {
55 | var w int
56 | _, s, w, state = FirstGraphemeClusterInString(s, state)
57 | width += w
58 | }
59 | return
60 | }
61 | // cpv: end
62 |
63 | // cpv: track https://github.com/rivo/uniseg/blob/087b3e4194c1feb0856b68d0e7c425c0994829cf/grapheme.go#L287-L345
64 | // FirstGraphemeClusterInString is like [FirstGraphemeCluster] but its input and
65 | // outputs are strings.
66 | func FirstGraphemeClusterInString(str string, state int) (cluster, rest string, width, newState int) {
67 | // An empty string returns nothing.
68 | if len(str) == 0 {
69 | return
70 | }
71 |
72 | // Extract the first rune.
73 | r, length := utf8.DecodeRuneInString(str)
74 | if len(str) <= length { // If we're already past the end, there is nothing else to parse.
75 | var prop int
76 | if state < 0 {
77 | prop = propertyGraphemes(r)
78 | } else {
79 | prop = state >> shiftGraphemePropState
80 | }
81 | return str, "", runeWidth(r, prop), grAny | (prop << shiftGraphemePropState)
82 | }
83 |
84 | // If we don't know the state, determine it now.
85 | var firstProp int
86 | if state < 0 {
87 | state, firstProp, _ = transitionGraphemeState(state, r)
88 | } else {
89 | firstProp = state >> shiftGraphemePropState
90 | }
91 | width += runeWidth(r, firstProp)
92 |
93 | // Transition until we find a boundary.
94 | for {
95 | var (
96 | prop int
97 | boundary bool
98 | )
99 |
100 | r, l := utf8.DecodeRuneInString(str[length:])
101 | state, prop, boundary = transitionGraphemeState(state&maskGraphemeState, r)
102 |
103 | if boundary {
104 | return str[:length], str[length:], width, state | (prop << shiftGraphemePropState)
105 | }
106 |
107 | if firstProp == prExtendedPictographic {
108 | if r == vs15 {
109 | width = 1
110 | } else if r == vs16 {
111 | width = 2
112 | }
113 | } else if firstProp != prRegionalIndicator && firstProp != prL {
114 | width += runeWidth(r, prop)
115 | }
116 |
117 | length += l
118 | if len(str) <= length {
119 | return str, "", width, grAny | (prop << shiftGraphemePropState)
120 | }
121 | }
122 | }
123 | // cpv: end
124 |
--------------------------------------------------------------------------------
/resources/wcwidth/ziglyph.zig:
--------------------------------------------------------------------------------
1 | //! cpv: track https://codeberg.org/dude_the_builder/ziglyph/src/commit/29760d237219cc4d486f5cd654262d7b0d62d511/src/display_width.zig#L15-L124
2 | fn isAsciiStr(str: []const u8) bool {
3 | return for (str) |b| {
4 | if (b > 127) break false;
5 | } else true;
6 | }
7 |
8 | /// AmbiguousWidth determines the width of ambiguous characters according to the context. In an
9 | /// East Asian context, the width of ambiguous code points should be 2 (full), and 1 (half)
10 | /// in non-East Asian contexts. The most common use case is `half`.
11 | pub const AmbiguousWidth = enum(u2) {
12 | half = 1,
13 | full = 2,
14 | };
15 |
16 | /// codePointWidth returns how many cells (or columns) wide `cp` should be when rendered in a
17 | /// fixed-width font.
18 | pub fn codePointWidth(cp: u21, am_width: AmbiguousWidth) i3 {
19 | if (cp == 0x000 or cp == 0x0005 or cp == 0x0007 or (cp >= 0x000A and cp <= 0x000F)) {
20 | // Control.
21 | return 0;
22 | } else if (cp == 0x0008 or cp == 0x007F) {
23 | // backspace and DEL
24 | return -1;
25 | } else if (cp == 0x00AD) {
26 | // soft-hyphen
27 | return 1;
28 | } else if (cp == 0x2E3A) {
29 | // two-em dash
30 | return 2;
31 | } else if (cp == 0x2E3B) {
32 | // three-em dash
33 | return 3;
34 | } else if (cats.isEnclosingMark(cp) or cats.isNonspacingMark(cp)) {
35 | // Combining Marks.
36 | return 0;
37 | } else if (cats.isFormat(cp) and (!(cp >= 0x0600 and cp <= 0x0605) and cp != 0x061C and
38 | cp != 0x06DD and cp != 0x08E2))
39 | {
40 | // Format except Arabic.
41 | return 0;
42 | } else if ((cp >= 0x1160 and cp <= 0x11FF) or (cp >= 0x2060 and cp <= 0x206F) or
43 | (cp >= 0xFFF0 and cp <= 0xFFF8) or (cp >= 0xE0000 and cp <= 0xE0FFF))
44 | {
45 | // Hangul syllable and ignorable.
46 | return 0;
47 | } else if ((cp >= 0x3400 and cp <= 0x4DBF) or (cp >= 0x4E00 and cp <= 0x9FFF) or
48 | (cp >= 0xF900 and cp <= 0xFAFF) or (cp >= 0x20000 and cp <= 0x2FFFD) or
49 | (cp >= 0x30000 and cp <= 0x3FFFD))
50 | {
51 | return 2;
52 | } else if (eaw.isWide(cp) or eaw.isFullwidth(cp)) {
53 | return 2;
54 | } else if (gbp.isRegionalIndicator(cp)) {
55 | return 2;
56 | } else if (eaw.isAmbiguous(cp)) {
57 | return @intFromEnum(am_width);
58 | } else {
59 | return 1;
60 | }
61 | }
62 |
63 | /// strWidth returns how many cells (or columns) wide `str` should be when rendered in a
64 | /// fixed-width font.
65 | pub fn strWidth(str: []const u8, am_width: AmbiguousWidth) !usize {
66 | var total: isize = 0;
67 |
68 | // ASCII bytes are all width == 1.
69 | if (isAsciiStr(str)) {
70 | for (str) |b| {
71 | // Backspace and DEL
72 | if (b == 8 or b == 127) {
73 | total -= 1;
74 | continue;
75 | }
76 |
77 | // Control
78 | if (b < 32) continue;
79 |
80 | // All other ASCII.
81 | total += 1;
82 | }
83 |
84 | return if (total > 0) @intCast(total) else 0;
85 | }
86 |
87 | var giter = GraphemeIterator.init(str);
88 |
89 | while (giter.next()) |gc| {
90 | var cp_iter = (try unicode.Utf8View.init(str[gc.offset .. gc.offset + gc.len])).iterator();
91 |
92 | while (cp_iter.nextCodepoint()) |cp| {
93 | var w = codePointWidth(cp, am_width);
94 |
95 | if (w != 0) {
96 | // Only adding width of first non-zero-width code point.
97 | if (emoji.isExtendedPictographic(cp)) {
98 | if (cp_iter.nextCodepoint()) |ncp| {
99 | // emoji text sequence.
100 | if (ncp == 0xFE0E) w = 1;
101 | if (ncp == 0xFE0F) w = 2;
102 | }
103 | }
104 | total += w;
105 | break;
106 | }
107 | }
108 | }
109 |
110 | return if (total > 0) @intCast(total) else 0;
111 | }
112 | // cpv: end
113 |
--------------------------------------------------------------------------------
/src/utf8.zig:
--------------------------------------------------------------------------------
1 | const std = @import("std");
2 |
3 | // See https://bjoern.hoehrmann.de/utf-8/decoder/dfa/
4 | // and licenses/LICENSE_Bjoern_Hoehrmann
5 |
6 | const UTF8_ACCEPT = 0;
7 | const UTF8_REJECT = 12;
8 |
9 | // The first part of the table maps bytes to character classes to reduce the
10 | // size of the transition table and create bitmasks.
11 | // zig fmt: off
12 | const utf8d = [_]u8{
13 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
14 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
15 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
16 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
17 | 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
18 | 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
19 | 8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
20 | 10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8,
21 | };
22 |
23 | // The second part is a transition table that maps a combination of a state of
24 | // the automaton and a character class to a state.
25 | const state_utf8d = [_]u8{
26 | 0,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12,
27 | 12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12,
28 | 12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12,
29 | 12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12,
30 | 12,36,12,12,12,12,12,12,12,12,12,12,
31 | };
32 | // zig fmt: on
33 |
34 | fn decodeByte(state: *usize, cp: *u21, byte: u8) void {
35 | const class: std.math.IntFittingRange(0, 11) = @intCast(utf8d[byte]);
36 | const mask: u21 = 0xff;
37 |
38 | cp.* = if (state.* != UTF8_ACCEPT)
39 | (byte & 0x3f) | (cp.* << 6)
40 | else
41 | (mask >> class) & byte;
42 |
43 | state.* = state_utf8d[state.* + class];
44 | }
45 |
46 | fn isDoneDecoding(state: usize) bool {
47 | return state == UTF8_ACCEPT or state == UTF8_REJECT;
48 | }
49 |
50 | pub const Iterator = struct {
51 | // This "i" is part of the documented API of this iterator, pointing to the
52 | // current location of the iterator in `bytes`.
53 | i: usize = 0,
54 | bytes: []const u8,
55 |
56 | const Self = @This();
57 |
58 | pub fn init(bytes: []const u8) Self {
59 | return .{
60 | .bytes = bytes,
61 | };
62 | }
63 |
64 | pub fn next(self: *Self) ?u21 {
65 | if (self.i >= self.bytes.len) return null;
66 |
67 | var cp: u21 = 0;
68 | var state: usize = UTF8_ACCEPT;
69 |
70 | while (true) {
71 | decodeByte(&state, &cp, self.bytes[self.i]);
72 | self.i += 1;
73 | if (isDoneDecoding(state) or self.i >= self.bytes.len) break;
74 | }
75 |
76 | if (state == UTF8_ACCEPT) return cp;
77 | return 0xFFFD; // Replacement character
78 | }
79 |
80 | pub fn peek(self: Self) ?u21 {
81 | var it = self;
82 | return it.next();
83 | }
84 | };
85 |
86 | test "Iterator for ascii" {
87 | var it = Iterator.init("abc");
88 | try std.testing.expectEqual('a', it.next());
89 | try std.testing.expectEqual(1, it.i);
90 | try std.testing.expectEqual('b', it.peek());
91 | try std.testing.expectEqual('b', it.next());
92 | try std.testing.expectEqual('c', it.next());
93 | try std.testing.expectEqual(null, it.peek());
94 | try std.testing.expectEqual(null, it.next());
95 | try std.testing.expectEqual(null, it.next());
96 | }
97 |
98 | test "Iterator for emoji" {
99 | var it = Iterator.init("😀😅😻👺");
100 | try std.testing.expectEqual(0x1F600, it.next());
101 | try std.testing.expectEqual(4, it.i);
102 | try std.testing.expectEqual(0x1F605, it.peek());
103 | try std.testing.expectEqual(4, it.i);
104 | try std.testing.expectEqual(0x1F605, it.next());
105 | try std.testing.expectEqual(8, it.i);
106 | try std.testing.expectEqual(0x1F63B, it.next());
107 | try std.testing.expectEqual(12, it.i);
108 | try std.testing.expectEqual(0x1F47A, it.next());
109 | try std.testing.expectEqual(16, it.i);
110 | try std.testing.expectEqual(null, it.next());
111 | try std.testing.expectEqual(16, it.i);
112 | }
113 |
114 | test "Iterator overlong utf8" {
115 | var it = Iterator.init("\xf0\x80\x80\xaf");
116 | try std.testing.expectEqual(0xFFFD, it.next());
117 | try std.testing.expectEqual(0xFFFD, it.next());
118 | try std.testing.expectEqual(0xFFFD, it.next());
119 | try std.testing.expectEqual(null, it.next());
120 | try std.testing.expectEqual(null, it.next());
121 | }
122 |
--------------------------------------------------------------------------------
/resources/wcwidth/LICENSE_utf8proc.md:
--------------------------------------------------------------------------------
1 |
2 | ## utf8proc license ##
3 |
4 | **utf8proc** is a software package originally developed
5 | by Jan Behrens and the rest of the Public Software Group, who
6 | deserve nearly all of the credit for this library, that is now maintained by the Julia-language developers. Like the original utf8proc,
7 | whose copyright and license statements are reproduced below, all new
8 | work on the utf8proc library is licensed under the [MIT "expat"
9 | license](http://opensource.org/licenses/MIT):
10 |
11 | *Copyright © 2014-2021 by Steven G. Johnson, Jiahao Chen, Tony Kelman, Jonas Fonseca, and other contributors listed in the git history.*
12 |
13 | Permission is hereby granted, free of charge, to any person obtaining a
14 | copy of this software and associated documentation files (the "Software"),
15 | to deal in the Software without restriction, including without limitation
16 | the rights to use, copy, modify, merge, publish, distribute, sublicense,
17 | and/or sell copies of the Software, and to permit persons to whom the
18 | Software is furnished to do so, subject to the following conditions:
19 |
20 | The above copyright notice and this permission notice shall be included in
21 | all copies or substantial portions of the Software.
22 |
23 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
24 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
25 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
26 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
27 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
28 | FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
29 | DEALINGS IN THE SOFTWARE.
30 |
31 | ## Original utf8proc license ##
32 |
33 | *Copyright (c) 2009, 2013 Public Software Group e. V., Berlin, Germany*
34 |
35 | Permission is hereby granted, free of charge, to any person obtaining a
36 | copy of this software and associated documentation files (the "Software"),
37 | to deal in the Software without restriction, including without limitation
38 | the rights to use, copy, modify, merge, publish, distribute, sublicense,
39 | and/or sell copies of the Software, and to permit persons to whom the
40 | Software is furnished to do so, subject to the following conditions:
41 |
42 | The above copyright notice and this permission notice shall be included in
43 | all copies or substantial portions of the Software.
44 |
45 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
46 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
47 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
48 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
49 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
50 | FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
51 | DEALINGS IN THE SOFTWARE.
52 |
53 | ## Unicode data license ##
54 |
55 | This software contains data (`utf8proc_data.c`) derived from processing
56 | the Unicode data files. The following license applies to that data:
57 |
58 | **COPYRIGHT AND PERMISSION NOTICE**
59 |
60 | *Copyright (c) 1991-2007 Unicode, Inc. All rights reserved. Distributed
61 | under the Terms of Use in http://www.unicode.org/copyright.html.*
62 |
63 | Permission is hereby granted, free of charge, to any person obtaining a
64 | copy of the Unicode data files and any associated documentation (the "Data
65 | Files") or Unicode software and any associated documentation (the
66 | "Software") to deal in the Data Files or Software without restriction,
67 | including without limitation the rights to use, copy, modify, merge,
68 | publish, distribute, and/or sell copies of the Data Files or Software, and
69 | to permit persons to whom the Data Files or Software are furnished to do
70 | so, provided that (a) the above copyright notice(s) and this permission
71 | notice appear with all copies of the Data Files or Software, (b) both the
72 | above copyright notice(s) and this permission notice appear in associated
73 | documentation, and (c) there is clear notice in each modified Data File or
74 | in the Software as well as in the documentation associated with the Data
75 | File(s) or Software that the data or software has been modified.
76 |
77 | THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY
78 | KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
79 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF
80 | THIRD PARTY RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS
81 | INCLUDED IN THIS NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR
82 | CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF
83 | USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
84 | TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
85 | PERFORMANCE OF THE DATA FILES OR SOFTWARE.
86 |
87 | Except as contained in this notice, the name of a copyright holder shall
88 | not be used in advertising or otherwise to promote the sale, use or other
89 | dealings in these Data Files or Software without prior written
90 | authorization of the copyright holder.
91 |
92 | Unicode and the Unicode logo are trademarks of Unicode, Inc., and may be
93 | registered in some jurisdictions. All other trademarks and registered
94 | trademarks mentioned herein are the property of their respective owners.
95 |
96 |
--------------------------------------------------------------------------------
/src/root.zig:
--------------------------------------------------------------------------------
1 | const std = @import("std");
2 | const getpkg = @import("get.zig");
3 | pub const config = @import("config.zig");
4 | pub const types = @import("types.zig");
5 | pub const ascii = @import("ascii.zig");
6 | pub const grapheme = @import("grapheme.zig");
7 | pub const code_point = @import("code_point.zig");
8 | pub const utf8 = @import("utf8.zig");
9 | pub const x = @import("x/root.zig");
10 | const testing = std.testing;
11 |
12 | pub const FieldEnum = getpkg.FieldEnum;
13 | pub const TypeOf = getpkg.TypeOf;
14 | pub const TypeOfAll = getpkg.TypeOfAll;
15 | pub const get = getpkg.get;
16 | pub const getAll = getpkg.getAll;
17 | pub const hasField = getpkg.hasField;
18 |
19 | test {
20 | std.testing.refAllDeclsRecursive(@This());
21 | }
22 |
23 | test "name" {
24 | try testing.expect(std.mem.eql(u8, get(.name, 65), "LATIN CAPITAL LETTER A"));
25 | }
26 |
27 | test "is_alphabetic" {
28 | try testing.expect(get(.is_alphabetic, 65)); // 'A'
29 | try testing.expect(get(.is_alphabetic, 97)); // 'a'
30 | try testing.expect(!get(.is_alphabetic, 0));
31 | }
32 |
33 | test "case_folding_simple" {
34 | try testing.expectEqual(97, get(.case_folding_simple, 65)); // 'a'
35 | try testing.expectEqual(97, get(.case_folding_simple, 97)); // 'a'
36 | }
37 |
38 | test "simple_uppercase_mapping" {
39 | try testing.expectEqual(65, get(.simple_uppercase_mapping, 97)); // 'a'
40 | try testing.expectEqual(null, get(.simple_uppercase_mapping, 65)); // 'A'
41 | }
42 |
43 | test "generalCategory" {
44 | try testing.expect(get(.general_category, 65) == .letter_uppercase); // 'A'
45 | }
46 |
47 | test "getAll" {
48 | const d1 = getAll("1", 65);
49 | try testing.expect(d1.general_category == .letter_uppercase);
50 | try testing.expect(d1.case_folding_simple.unshift(65) == 97);
51 |
52 | const d_checks = getAll("checks", 65);
53 | // auto should become packed for these checks
54 | try testing.expectEqual(.@"packed", @typeInfo(TypeOfAll("checks")).@"struct".layout);
55 | try testing.expect(d_checks.simple_uppercase_mapping.unshift(65) == null);
56 | try testing.expect(d_checks.is_alphabetic);
57 | try testing.expect(d_checks.is_uppercase);
58 | try testing.expect(!d_checks.is_lowercase);
59 | }
60 |
61 | test "get extension foo" {
62 | try testing.expectEqual(0, get(.foo, 65));
63 | try testing.expectEqual(3, get(.foo, 0));
64 | }
65 |
66 | test "get extension emoji_odd_or_even" {
67 | try testing.expectEqual(.odd_emoji, get(.emoji_odd_or_even, 0x1F34B)); // 🍋
68 | }
69 |
70 | test "get packed optional enum works" {
71 | try testing.expectEqual(.odd_emoji, get(.opt_emoji_odd_or_even, 0x1F34B)); // 🍋
72 | try testing.expectEqual(null, get(.opt_emoji_odd_or_even, 0x01D8)); // ǘ
73 | }
74 |
75 | test "get packed optional bool works" {
76 | try testing.expectEqual(true, get(.maybe_bit, 0x1200));
77 | try testing.expectEqual(false, get(.maybe_bit, 0x1235));
78 | try testing.expectEqual(null, get(.maybe_bit, 0x1236));
79 | }
80 |
81 | test "get union unpacked, shift" {
82 | try testing.expectEqual(0x1234, get(.next_or_prev, 0x1233).next);
83 | try testing.expectEqual(0x1200, get(.next_or_prev, 0x1201).prev);
84 | try testing.expectEqual(.none, get(.next_or_prev, 0x1235));
85 | }
86 |
87 | test "get union unpacked, direct" {
88 | try testing.expectEqual(0x1234, get(.next_or_prev_direct, 0x1233).next);
89 | try testing.expectEqual(0x1200, get(.next_or_prev_direct, 0x1201).prev);
90 | try testing.expectEqual(.none, get(.next_or_prev_direct, 0x1235));
91 | }
92 |
93 | test "get union packed, shift" {
94 | try testing.expectEqual(5, @bitSizeOf(@FieldType(TypeOfAll("pack"), "bidi_paired_bracket")));
95 | try testing.expectEqual(0x0029, get(.bidi_paired_bracket, 0x0028).open);
96 | try testing.expectEqual(0x2997, get(.bidi_paired_bracket, 0x2998).close);
97 | try testing.expectEqual(.none, get(.bidi_paired_bracket, 0x4000));
98 | }
99 |
100 | test "get union packed, direct" {
101 | try testing.expectEqual(0x0029, get(.bidi_paired_bracket_direct, 0x0028).open);
102 | try testing.expectEqual(0x2997, get(.bidi_paired_bracket_direct, 0x2998).close);
103 | try testing.expectEqual(.none, get(.bidi_paired_bracket_direct, 0x4000));
104 | }
105 |
106 | test "get bidi_class" {
107 | try testing.expectEqual(.arabic_number, get(.bidi_class, 0x0600));
108 | }
109 |
110 | test "special_casing_condition" {
111 | const conditions1 = get(.special_casing_condition, 65); // 'A'
112 | try testing.expectEqual(0, conditions1.len);
113 |
114 | // Greek Capital Sigma (U+03A3) which has Final_Sigma condition
115 | const conditions = get(.special_casing_condition, 0x03A3);
116 | try testing.expectEqual(1, conditions.len);
117 | try testing.expectEqual(types.SpecialCasingCondition.final_sigma, conditions[0]);
118 | }
119 |
120 | test "special_lowercase_mapping" {
121 | var buffer: [1]u21 = undefined;
122 |
123 | // Greek Capital Sigma (U+03A3) which has Final_Sigma condition
124 | const mapping = get(.special_lowercase_mapping, 0x03A3).with(&buffer, 0x03A3);
125 | try testing.expectEqual(1, mapping.len);
126 | try testing.expectEqual(0x03C2, mapping[0]); // Should map to Greek Small Letter Final Sigma
127 | }
128 |
129 | test "info extension" {
130 | // ǰ -> J
131 | try testing.expectEqual(0x004A, get(.uppercase_mapping_first_char, 0x01F0));
132 |
133 | try testing.expect(get(.has_simple_lowercase, 0x1FD9)); // Ῑ
134 | try testing.expect(!get(.has_simple_lowercase, 0x1FE0)); // ῠ
135 |
136 | // MALAYALAM FRACTION ONE ONE-HUNDRED-AND-SIXTIETH
137 | try testing.expect(std.mem.eql(u8, "061/1", get(.numeric_value_numeric_reversed, 0x0D58)));
138 | }
139 |
140 | test "is_emoji_vs_base" {
141 | try testing.expect(get(.is_emoji_vs_base, 0x231B)); // ⌛
142 | try testing.expect(get(.is_emoji_vs_base, 0x1F327)); // 🌧
143 | try testing.expect(!get(.is_emoji_vs_base, 0x1F46C)); // 👬
144 | }
145 |
--------------------------------------------------------------------------------
/src/code_point.zig:
--------------------------------------------------------------------------------
1 | const std = @import("std");
2 |
3 | pub const Iterator = struct {
4 | // This "i" is part of the documented API of this iterator, pointing to the
5 | // current location of the iterator in `code_points`.
6 | i: usize = 0,
7 | code_points: []const u21,
8 |
9 | const Self = @This();
10 |
11 | pub fn init(code_points: []const u21) Self {
12 | return .{
13 | .code_points = code_points,
14 | };
15 | }
16 |
17 | pub fn next(self: *Self) ?u21 {
18 | if (self.i >= self.code_points.len) return null;
19 | defer self.i += 1;
20 | return self.code_points[self.i];
21 | }
22 |
23 | pub fn peek(self: Self) ?u21 {
24 | if (self.i >= self.code_points.len) return null;
25 | return self.code_points[self.i];
26 | }
27 | };
28 |
29 | test "Iterator for emoji code points" {
30 | const code_points = &[_]u21{
31 | 0x1F600, // 😀
32 | 0x1F605, // 😅
33 | 0x1F63B, // 😻
34 | 0x1F47A, // 👺
35 | };
36 |
37 | var it = Iterator.init(code_points);
38 | try std.testing.expectEqual(0x1F600, it.next());
39 | try std.testing.expectEqual(1, it.i);
40 | try std.testing.expectEqual(0x1F605, it.peek());
41 | try std.testing.expectEqual(1, it.i);
42 | try std.testing.expectEqual(0x1F605, it.next());
43 | try std.testing.expectEqual(2, it.i);
44 | try std.testing.expectEqual(0x1F63B, it.next());
45 | try std.testing.expectEqual(3, it.i);
46 | try std.testing.expectEqual(0x1F47A, it.next());
47 | try std.testing.expectEqual(4, it.i);
48 | try std.testing.expectEqual(null, it.next());
49 | try std.testing.expectEqual(4, it.i);
50 | }
51 |
52 | /// Returns a custom iterator for a given Context type.
53 | ///
54 | /// The Context must have the following methods:
55 | ///
56 | /// * len(self: *Context) usize
57 | /// * get(self: *Context, i: usize) ?u21 // or u21
58 | ///
59 | /// If `get` returns null, the code continues incrementing `i` until it returns
60 | /// a non-null result or `len` is reached, with `len` being called every
61 | /// iteration to allow for `Context` to end early. If instead `get` has a
62 | /// return type of non-optional `u21`, we don't loop.
63 | pub fn CustomIterator(comptime Context: type) type {
64 | return struct {
65 | // This "i" is part of the documented API of this iterator, pointing to the
66 |         // current location of the iterator in `ctx`.
67 | i: usize = 0,
68 | ctx: Context,
69 |
70 | const Self = @This();
71 |
72 | pub fn init(ctx: Context) Self {
73 | return .{
74 | .ctx = ctx,
75 | };
76 | }
77 |
78 | pub fn next(self: *Self) ?u21 {
79 | const getFn = @typeInfo(@TypeOf(@TypeOf(self.ctx).get)).@"fn";
80 | if (comptime getFn.return_type.? == ?u21) {
81 | while (self.i < self.ctx.len()) : (self.i += 1) {
82 | const value = self.ctx.get(self.i);
83 | if (value) |cp| {
84 | @branchHint(.likely);
85 | self.i += 1;
86 | return cp;
87 | }
88 | }
89 | } else {
90 | if (self.i < self.ctx.len()) {
91 | defer self.i += 1;
92 | return self.ctx.get(self.i);
93 | }
94 | }
95 |
96 | return null;
97 | }
98 |
99 | pub fn peek(self: Self) ?u21 {
100 | var it = self;
101 | return it.next();
102 | }
103 | };
104 | }
105 |
106 | test "CustomIterator for emoji code points" {
107 | const Wrapper = struct {
108 | cp: u21,
109 | };
110 |
111 | const code_points = &[_]Wrapper{
112 | .{ .cp = 0x1F600 }, // 😀
113 | .{ .cp = 0x1F605 }, // 😅
114 | .{ .cp = 0x1F63B }, // 😻
115 | .{ .cp = 0x1F47A }, // 👺
116 | };
117 |
118 | var it = CustomIterator(struct {
119 | points: []const Wrapper,
120 |
121 | pub fn len(self: @This()) usize {
122 | return self.points.len;
123 | }
124 |
125 | pub fn get(self: @This(), i: usize) u21 {
126 | return self.points[i].cp;
127 | }
128 | }).init(.{ .points = code_points });
129 | try std.testing.expectEqual(0x1F600, it.next());
130 | try std.testing.expectEqual(1, it.i);
131 | try std.testing.expectEqual(0x1F605, it.peek());
132 | try std.testing.expectEqual(1, it.i);
133 | try std.testing.expectEqual(0x1F605, it.next());
134 | try std.testing.expectEqual(2, it.i);
135 | try std.testing.expectEqual(0x1F63B, it.next());
136 | try std.testing.expectEqual(3, it.i);
137 | try std.testing.expectEqual(0x1F47A, it.next());
138 | try std.testing.expectEqual(4, it.i);
139 | try std.testing.expectEqual(null, it.next());
140 | try std.testing.expectEqual(4, it.i);
141 | }
142 |
143 | test "CustomIterator for emoji code points with gaps and optional get" {
144 | const Wrapper = struct {
145 | cp: ?u21,
146 | };
147 |
148 | const code_points = &[_]Wrapper{
149 | .{ .cp = 0x1F600 }, // 😀
150 | .{ .cp = null },
151 | .{ .cp = 0x1F605 }, // 😅
152 | .{ .cp = 0x1F63B }, // 😻
153 | .{ .cp = 0x1F47A }, // 👺
154 | .{ .cp = null },
155 | .{ .cp = null },
156 | };
157 |
158 | var it = CustomIterator(struct {
159 | points: []const Wrapper,
160 |
161 | pub fn len(self: @This()) usize {
162 | return self.points.len;
163 | }
164 |
165 | pub fn get(self: @This(), i: usize) ?u21 {
166 | return self.points[i].cp;
167 | }
168 | }).init(.{ .points = code_points });
169 | try std.testing.expectEqual(0x1F600, it.next());
170 | try std.testing.expectEqual(1, it.i);
171 | try std.testing.expectEqual(0x1F605, it.peek());
172 | try std.testing.expectEqual(1, it.i);
173 | try std.testing.expectEqual(0x1F605, it.next());
174 | try std.testing.expectEqual(3, it.i);
175 | try std.testing.expectEqual(0x1F63B, it.next());
176 | try std.testing.expectEqual(4, it.i);
177 | try std.testing.expectEqual(0x1F47A, it.next());
178 | try std.testing.expectEqual(5, it.i);
179 | try std.testing.expectEqual(null, it.next());
180 | try std.testing.expectEqual(7, it.i);
181 | }
182 |
--------------------------------------------------------------------------------
/src/get.zig:
--------------------------------------------------------------------------------
1 | //! This file defines the low(er)-level `get` method, returning `Data`.
2 | //! (It also must be separate from `root.zig` so that `types.zig` can use it to
3 | //! allow for a better API on `Slice` fields.)
4 | const std = @import("std");
5 | const tables = @import("tables").tables;
6 | const types = @import("types.zig");
7 |
8 | fn TableData(comptime Table: anytype) type {
9 | const DataSlice = if (@hasField(Table, "stage3"))
10 | @FieldType(Table, "stage3")
11 | else
12 | @FieldType(Table, "stage2");
13 | return @typeInfo(DataSlice).pointer.child;
14 | }
15 |
16 | fn tableInfoFor(comptime field: []const u8) std.builtin.Type.StructField {
17 | inline for (@typeInfo(@TypeOf(tables)).@"struct".fields) |tableInfo| {
18 | if (@hasField(TableData(tableInfo.type), field)) {
19 | return tableInfo;
20 | }
21 | }
22 |
23 | @compileError("Table not found for field: " ++ field);
24 | }
25 |
26 | pub fn hasField(comptime field: []const u8) bool {
27 | inline for (@typeInfo(@TypeOf(tables)).@"struct".fields) |tableInfo| {
28 | if (@hasField(TableData(tableInfo.type), field)) {
29 | return true;
30 | }
31 | }
32 |
33 | return false;
34 | }
35 |
36 | fn getTableInfo(comptime table_name: []const u8) std.builtin.Type.StructField {
37 | inline for (@typeInfo(@TypeOf(tables)).@"struct".fields) |tableInfo| {
38 | if (std.mem.eql(u8, tableInfo.name, table_name)) {
39 | return tableInfo;
40 | }
41 | }
42 |
43 | @compileError("Table '" ++ table_name ++ "' not found in tables");
44 | }
45 |
46 | fn BackingFor(comptime field: []const u8) type {
47 | const tableInfo = tableInfoFor(field);
48 | const Backing = @FieldType(@FieldType(@TypeOf(tables), tableInfo.name), "backing");
49 | return @FieldType(@typeInfo(Backing).pointer.child, field);
50 | }
51 |
52 | pub fn backingFor(comptime field: []const u8) BackingFor(field) {
53 | const tableInfo = tableInfoFor(field);
54 | return @field(@field(tables, tableInfo.name).backing, field);
55 | }
56 |
57 | fn TableFor(comptime field: []const u8) type {
58 | const tableInfo = tableInfoFor(field);
59 | return @FieldType(@TypeOf(tables), tableInfo.name);
60 | }
61 |
62 | fn tableFor(comptime field: []const u8) TableFor(field) {
63 | return @field(tables, tableInfoFor(field).name);
64 | }
65 |
66 | fn GetTable(comptime table_name: []const u8) type {
67 | const tableInfo = getTableInfo(table_name);
68 | return @FieldType(@TypeOf(tables), tableInfo.name);
69 | }
70 |
71 | fn getTable(comptime table_name: []const u8) GetTable(table_name) {
72 | return @field(tables, getTableInfo(table_name).name);
73 | }
74 |
75 | fn data(comptime table: anytype, cp: u21) TableData(@TypeOf(table)) {
76 | const stage1_idx = cp >> 8;
77 | const stage2_idx = cp & 0xFF;
78 | if (@hasField(@TypeOf(table), "stage3")) {
79 | return table.stage3[table.stage2[table.stage1[stage1_idx] + stage2_idx]];
80 | } else {
81 | return table.stage2[table.stage1[stage1_idx] + stage2_idx];
82 | }
83 | }
84 |
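// Worked example of the lookup above (a sketch of the intent, not normative):
// for cp = 0x1F600, stage1_idx is 0x1F600 >> 8 = 0x1F6 and stage2_idx is
// 0x1F600 & 0xFF = 0x00. stage1 maps each 256-code-point block to a base
// offset into stage2; stage2[base + low byte] is then either the data row
// itself (two-stage tables) or an index into stage3 (three-stage tables),
// which allows identical blocks and rows to be shared across code points.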
85 | pub fn getAll(comptime table_name: []const u8, cp: u21) TypeOfAll(table_name) {
86 | const table = comptime getTable(table_name);
87 | return data(table, cp);
88 | }
89 |
90 | pub fn TypeOfAll(comptime table_name: []const u8) type {
91 | return TableData(getTableInfo(table_name).type);
92 | }
93 |
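// FieldEnum enumerates every field name across every configured table's data
// struct, in declaration order; `get` below takes one of these tags and
// resolves the owning table at comptime via `tableFor`.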
94 | pub const FieldEnum = blk: {
95 | var fields_len: usize = 0;
96 | for (@typeInfo(@TypeOf(tables)).@"struct".fields) |tableInfo| {
97 | fields_len += @typeInfo(TableData(tableInfo.type)).@"struct".fields.len;
98 | }
99 |
100 | var fields: [fields_len]std.builtin.Type.EnumField = undefined;
101 | var i: usize = 0;
102 |
103 | for (@typeInfo(@TypeOf(tables)).@"struct".fields) |tableInfo| {
104 | for (@typeInfo(TableData(tableInfo.type)).@"struct".fields) |f| {
105 | fields[i] = .{
106 | .name = f.name,
107 | .value = i,
108 | };
109 | i += 1;
110 | }
111 | }
112 |
113 | break :blk @Type(.{
114 | .@"enum" = .{
115 | .tag_type = std.math.IntFittingRange(0, fields_len - 1),
116 | .fields = &fields,
117 | .decls = &[_]std.builtin.Type.Declaration{},
118 | .is_exhaustive = true,
119 | },
120 | });
121 | };
122 |
123 | fn DataField(comptime field: []const u8) type {
124 | return @FieldType(TableData(tableInfoFor(field).type), field);
125 | }
126 |
127 | fn FieldValue(comptime field: []const u8) type {
128 | const D = DataField(field);
129 | if (@typeInfo(D) == .@"struct") {
130 | if (@hasDecl(D, "unshift") and @TypeOf(D.unshift) != void) {
131 | return @typeInfo(@TypeOf(D.unshift)).@"fn".return_type.?;
132 | } else if (@hasDecl(D, "unpack")) {
133 | return @typeInfo(@TypeOf(D.unpack)).@"fn".return_type.?;
134 | } else if (@hasDecl(D, "value") and @TypeOf(D.value) != void) {
135 | return @typeInfo(@TypeOf(D.value)).@"fn".return_type.?;
136 | } else {
137 | return D;
138 | }
139 | } else {
140 | return D;
141 | }
142 | }
143 |
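// A field's stored representation may wrap its logical value. By the
// convention used here, `unshift(cp)` appears to recover cp-relative
// (shift-packed) values, `unpack()` packed representations, and `value()`
// other wrappers; FieldValue and `get` unwrap through whichever accessor the
// field type declares, so callers receive the logical type, not the storage
// type.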
144 | // Note: I tried using a union with members that are the known types, and using
145 | // @FieldType(KnownFieldsForLspUnion, field) but the LSP was still unable to
146 | // figure out the type. It seems like the only way to get the LSP to know the
147 | // type would be having dedicated `get` functions for each field, but I don't
148 | // want to go that route.
149 | pub fn get(comptime field: FieldEnum, cp: u21) TypeOf(field) {
150 | const name = @tagName(field);
151 | const D = DataField(name);
152 | const table = comptime tableFor(name);
153 |
154 | if (@typeInfo(D) == .@"struct" and (@hasDecl(D, "unpack") or @hasDecl(D, "unshift") or (@hasDecl(D, "value") and @TypeOf(D.value) != void))) {
155 | const d = @field(data(table, cp), name);
156 | if (@hasDecl(D, "unshift") and @TypeOf(D.unshift) != void) {
157 | return d.unshift(cp);
158 | } else if (@hasDecl(D, "unpack")) {
159 | return d.unpack();
160 | } else {
161 | return d.value();
162 | }
163 | } else {
164 | return @field(data(table, cp), name);
165 | }
166 | }
167 |
168 | pub fn TypeOf(comptime field: FieldEnum) type {
169 | return FieldValue(@tagName(field));
170 | }
171 |
--------------------------------------------------------------------------------
/resources/wcwidth/zg.zig:
--------------------------------------------------------------------------------
1 | // cpv: track https://codeberg.org/atman/zg/src/commit/9427a9e53aaa29ee071f4dcb35b809a699d75aa9/codegen/dwp.zig#L31-L223
2 | var flat_map = std.AutoHashMap(u21, i4).init(allocator);
3 | defer flat_map.deinit();
4 |
5 | var line_buf: [4096]u8 = undefined;
6 |
7 | // Process DerivedEastAsianWidth.txt
8 | var deaw_file = try std.fs.cwd().openFile("data/unicode/extracted/DerivedEastAsianWidth.txt", .{});
9 | defer deaw_file.close();
10 | var deaw_buf = std.io.bufferedReader(deaw_file.reader());
11 | const deaw_reader = deaw_buf.reader();
12 |
13 | while (try deaw_reader.readUntilDelimiterOrEof(&line_buf, '\n')) |line| {
14 | if (line.len == 0) continue;
15 |
16 | // @missing ranges
17 | if (std.mem.startsWith(u8, line, "# @missing: ")) {
18 | const semi = std.mem.indexOfScalar(u8, line, ';').?;
19 | const field = line[12..semi];
20 | const dots = std.mem.indexOf(u8, field, "..").?;
21 | const from = try std.fmt.parseInt(u21, field[0..dots], 16);
22 | const to = try std.fmt.parseInt(u21, field[dots + 2 ..], 16);
23 | if (from == 0 and to == 0x10ffff) continue;
24 | for (from..to + 1) |cp| try flat_map.put(@intCast(cp), 2);
25 | continue;
26 | }
27 |
28 | if (line[0] == '#') continue;
29 |
30 | const no_comment = if (std.mem.indexOfScalar(u8, line, '#')) |octo| line[0..octo] else line;
31 |
32 | var field_iter = std.mem.tokenizeAny(u8, no_comment, "; ");
33 | var current_code: [2]u21 = undefined;
34 |
35 | var i: usize = 0;
36 | while (field_iter.next()) |field| : (i += 1) {
37 | switch (i) {
38 | 0 => {
39 | // Code point(s)
40 | if (std.mem.indexOf(u8, field, "..")) |dots| {
41 | current_code = .{
42 | try std.fmt.parseInt(u21, field[0..dots], 16),
43 | try std.fmt.parseInt(u21, field[dots + 2 ..], 16),
44 | };
45 | } else {
46 | const code = try std.fmt.parseInt(u21, field, 16);
47 | current_code = .{ code, code };
48 | }
49 | },
50 | 1 => {
51 | // Width
52 | if (std.mem.eql(u8, field, "W") or
53 | std.mem.eql(u8, field, "F") or
54 | (options.cjk and std.mem.eql(u8, field, "A")))
55 | {
56 | for (current_code[0]..current_code[1] + 1) |cp| try flat_map.put(@intCast(cp), 2);
57 | }
58 | },
59 | else => {},
60 | }
61 | }
62 | }
63 |
64 | // Process DerivedGeneralCategory.txt
65 | var dgc_file = try std.fs.cwd().openFile("data/unicode/extracted/DerivedGeneralCategory.txt", .{});
66 | defer dgc_file.close();
67 | var dgc_buf = std.io.bufferedReader(dgc_file.reader());
68 | const dgc_reader = dgc_buf.reader();
69 |
70 | while (try dgc_reader.readUntilDelimiterOrEof(&line_buf, '\n')) |line| {
71 | if (line.len == 0 or line[0] == '#') continue;
72 | const no_comment = if (std.mem.indexOfScalar(u8, line, '#')) |octo| line[0..octo] else line;
73 |
74 | var field_iter = std.mem.tokenizeAny(u8, no_comment, "; ");
75 | var current_code: [2]u21 = undefined;
76 |
77 | var i: usize = 0;
78 | while (field_iter.next()) |field| : (i += 1) {
79 | switch (i) {
80 | 0 => {
81 | // Code point(s)
82 | if (std.mem.indexOf(u8, field, "..")) |dots| {
83 | current_code = .{
84 | try std.fmt.parseInt(u21, field[0..dots], 16),
85 | try std.fmt.parseInt(u21, field[dots + 2 ..], 16),
86 | };
87 | } else {
88 | const code = try std.fmt.parseInt(u21, field, 16);
89 | current_code = .{ code, code };
90 | }
91 | },
92 | 1 => {
93 | // General category
94 | if (std.mem.eql(u8, field, "Mn")) {
95 | // Nonspacing_Mark
96 | for (current_code[0]..current_code[1] + 1) |cp| try flat_map.put(@intCast(cp), 0);
97 | } else if (std.mem.eql(u8, field, "Me")) {
98 | // Enclosing_Mark
99 | for (current_code[0]..current_code[1] + 1) |cp| try flat_map.put(@intCast(cp), 0);
100 | } else if (std.mem.eql(u8, field, "Mc")) {
101 | // Spacing_Mark
102 | for (current_code[0]..current_code[1] + 1) |cp| try flat_map.put(@intCast(cp), 0);
103 | } else if (std.mem.eql(u8, field, "Cf")) {
104 | if (std.mem.indexOf(u8, line, "ARABIC") == null) {
105 | // Format except Arabic
106 | for (current_code[0]..current_code[1] + 1) |cp| try flat_map.put(@intCast(cp), 0);
107 | }
108 | }
109 | },
110 | else => {},
111 | }
112 | }
113 | }
114 |
115 | var blocks_map = BlockMap.init(allocator);
116 | defer blocks_map.deinit();
117 |
118 | var stage1 = std.ArrayList(u16).init(allocator);
119 | defer stage1.deinit();
120 |
121 | var stage2 = std.ArrayList(i4).init(allocator);
122 | defer stage2.deinit();
123 |
124 | var block: Block = [_]i4{0} ** block_size;
125 | var block_len: u16 = 0;
126 |
127 | for (0..0x110000) |i| {
128 | const cp: u21 = @intCast(i);
129 | var width = flat_map.get(cp) orelse 1;
130 |
131 | // Specific overrides
132 | switch (cp) {
133 | // Three-em dash
134 | 0x2e3b => width = 3,
135 |
136 | // C0/C1 control codes
137 | 0...0x20 => width = if (options.c0_width) |c0| c0 else 0,
138 | 0x80...0x9f => width = if (options.c1_width) |c1| c1 else 0,
139 |
140 | // Line separator
141 | 0x2028,
142 |
143 | // Paragraph separator
144 | 0x2029,
145 |
146 | // Hangul syllable and ignorable.
147 | 0x1160...0x11ff,
148 | 0xd7b0...0xd7ff,
149 | 0x2060...0x206f,
150 | 0xfff0...0xfff8,
151 | 0xe0000...0xE0fff,
152 | => width = 0,
153 |
154 | // Two-em dash
155 | 0x2e3a,
156 |
157 | // Regional indicators
158 | 0x1f1e6...0x1f200,
159 |
160 | // CJK Blocks
161 | 0x3400...0x4dbf, // CJK Unified Ideographs Extension A
162 | 0x4e00...0x9fff, // CJK Unified Ideographs
163 | 0xf900...0xfaff, // CJK Compatibility Ideographs
164 | 0x20000...0x2fffd, // Plane 2
165 | 0x30000...0x3fffd, // Plane 3
166 | => width = 2,
167 |
168 | else => {},
169 | }
170 |
171 | // ASCII
172 | if (0x20 <= cp and cp < 0x7f) width = 1;
173 |
174 | // Soft hyphen
175 | if (cp == 0xad) width = 1;
176 |
177 | // Backspace and delete
178 | if (cp == 0x8 or cp == 0x7f) width = if (options.c0_width) |c0| c0 else -1;
179 |
180 | // Process block
181 | block[block_len] = width;
182 | block_len += 1;
183 |
184 | if (block_len < block_size and cp != 0x10ffff) continue;
185 |
186 | const gop = try blocks_map.getOrPut(block);
187 | if (!gop.found_existing) {
188 | gop.value_ptr.* = @intCast(stage2.items.len);
189 | try stage2.appendSlice(&block);
190 | }
191 |
192 | try stage1.append(gop.value_ptr.*);
193 | block_len = 0;
194 | }
195 | // cpv: end
196 |
197 | /// cpv: track https://codeberg.org/atman/zg/src/commit/9427a9e53aaa29ee071f4dcb35b809a699d75aa9/src/DisplayWidth.zig#L105-L145
198 | /// strWidth returns the total display width of `str` as the number of cells
199 | /// required in a fixed-pitch font (i.e. a terminal screen).
200 | pub fn strWidth(dw: DisplayWidth, str: []const u8) usize {
201 | var total: isize = 0;
202 |
203 | // ASCII fast path
204 | if (ascii.isAsciiOnly(str)) {
205 | for (str) |b| total += dw.codePointWidth(b);
206 | return @intCast(@max(0, total));
207 | }
208 |
209 | var giter = dw.graphemes.iterator(str);
210 |
211 | while (giter.next()) |gc| {
212 | var cp_iter = CodePointIterator{ .bytes = gc.bytes(str) };
213 | var gc_total: isize = 0;
214 |
215 | while (cp_iter.next()) |cp| {
216 | var w = dw.codePointWidth(cp.code);
217 |
218 | if (w != 0) {
219 | // Handle text emoji sequence.
220 | if (cp_iter.next()) |ncp| {
221 | // emoji text sequence.
222 | if (ncp.code == 0xFE0E) w = 1;
223 | if (ncp.code == 0xFE0F) w = 2;
224 | }
225 |
226 | // Only adding width of first non-zero-width code point.
227 | if (gc_total == 0) {
228 | gc_total = w;
229 | break;
230 | }
231 | }
232 | }
233 |
234 | total += gc_total;
235 | }
236 |
237 | return @intCast(@max(0, total));
238 | }
239 | // cpv: end
240 |
--------------------------------------------------------------------------------
/src/x/root.zig:
--------------------------------------------------------------------------------
1 | const std = @import("std");
2 | pub const types = @import("types.x.zig");
3 | pub const grapheme = @import("grapheme.zig");
4 | const testing = std.testing;
5 |
6 | test {
7 | std.testing.refAllDeclsRecursive(@This());
8 | }
9 |
10 | // wcwidth tests
11 |
12 | test "wcwidth_standalone control characters are width 0" {
13 | const get = @import("get.zig").get;
14 | try testing.expectEqual(0, get(.wcwidth_standalone, 0x0000)); // NULL (C0)
15 | try testing.expectEqual(0, get(.wcwidth_standalone, 0x001F)); // UNIT SEPARATOR (C0)
16 | try testing.expectEqual(0, get(.wcwidth_standalone, 0x007F)); // DELETE (C0)
17 | try testing.expectEqual(0, get(.wcwidth_standalone, 0x0080)); // C1 control
18 | try testing.expectEqual(0, get(.wcwidth_standalone, 0x009F)); // C1 control
19 | }
20 |
21 | test "wcwidth_standalone surrogates are width 0" {
22 | const get = @import("get.zig").get;
23 | try testing.expectEqual(0, get(.wcwidth_standalone, 0xD800)); // High surrogate start
24 | try testing.expectEqual(0, get(.wcwidth_standalone, 0xDBFF)); // High surrogate end
25 | try testing.expectEqual(0, get(.wcwidth_standalone, 0xDC00)); // Low surrogate start
26 | try testing.expectEqual(0, get(.wcwidth_standalone, 0xDFFF)); // Low surrogate end
27 | }
28 |
29 | test "wcwidth_standalone line and paragraph separators are width 0" {
30 | const get = @import("get.zig").get;
31 | try testing.expectEqual(0, get(.wcwidth_standalone, 0x2028)); // LINE SEPARATOR (Zl)
32 | try testing.expectEqual(0, get(.wcwidth_standalone, 0x2029)); // PARAGRAPH SEPARATOR (Zp)
33 | }
34 |
35 | test "wcwidth_standalone default ignorable characters are width 0" {
36 | const get = @import("get.zig").get;
37 | try testing.expectEqual(0, get(.wcwidth_standalone, 0x200B)); // ZERO WIDTH SPACE
38 | try testing.expectEqual(0, get(.wcwidth_standalone, 0x200C)); // ZERO WIDTH NON-JOINER (ZWNJ)
39 | try testing.expectEqual(0, get(.wcwidth_standalone, 0x200D)); // ZERO WIDTH JOINER (ZWJ)
40 | try testing.expectEqual(0, get(.wcwidth_standalone, 0xFE00)); // VARIATION SELECTOR-1
41 | try testing.expectEqual(0, get(.wcwidth_standalone, 0xFE0F)); // VARIATION SELECTOR-16
42 | try testing.expectEqual(0, get(.wcwidth_standalone, 0xFEFF)); // ZERO WIDTH NO-BREAK SPACE
43 | }
44 |
45 | test "wcwidth_standalone soft hyphen exception is width 1" {
46 | const get = @import("get.zig").get;
47 | try testing.expectEqual(1, get(.wcwidth_standalone, 0x00AD)); // SOFT HYPHEN
48 | }
49 |
50 | test "wcwidth_standalone combining marks are width 1" {
51 | const get = @import("get.zig").get;
52 | try testing.expectEqual(1, get(.wcwidth_standalone, 0x0300)); // COMBINING GRAVE ACCENT (Mn)
53 | try testing.expectEqual(1, get(.wcwidth_standalone, 0x0903)); // DEVANAGARI SIGN VISARGA (Mc)
54 | try testing.expectEqual(1, get(.wcwidth_standalone, 0x20DD)); // COMBINING ENCLOSING CIRCLE (Me)
55 | }
56 |
57 | test "wcwidth_zero_in_grapheme combining marks" {
58 | const get = @import("get.zig").get;
59 | // mark_nonspacing (Mn) are true
60 | try testing.expect(get(.wcwidth_zero_in_grapheme, 0x0300)); // COMBINING GRAVE ACCENT (Mn)
61 | try testing.expect(get(.wcwidth_zero_in_grapheme, 0x0341)); // COMBINING GREEK PERISPOMENI (Mn)
62 | // mark_enclosing (Me) are true
63 | try testing.expect(get(.wcwidth_zero_in_grapheme, 0x20DD)); // COMBINING ENCLOSING CIRCLE (Me)
64 | try testing.expect(get(.wcwidth_zero_in_grapheme, 0x20DE)); // COMBINING ENCLOSING SQUARE (Me)
65 | // mark_spacing_combining (Mc) follow EAW - Neutral=1, so false
66 | try testing.expect(!get(.wcwidth_zero_in_grapheme, 0x0903)); // DEVANAGARI SIGN VISARGA (Mc, N)
67 | try testing.expect(!get(.wcwidth_zero_in_grapheme, 0x093E)); // DEVANAGARI VOWEL SIGN AA (Mc, N)
68 | // mark_spacing_combining with EAW=Wide are width 2, so false
69 | try testing.expect(!get(.wcwidth_zero_in_grapheme, 0x302E)); // HANGUL SINGLE DOT TONE MARK (Mc, W)
70 | try testing.expect(!get(.wcwidth_zero_in_grapheme, 0x302F)); // HANGUL DOUBLE DOT TONE MARK (Mc, W)
71 | try testing.expect(!get(.wcwidth_zero_in_grapheme, 0x16FF0)); // VIETNAMESE ALTERNATE READING MARK CA (Mc, W)
72 | try testing.expect(!get(.wcwidth_zero_in_grapheme, 0x16FF1)); // VIETNAMESE ALTERNATE READING MARK NHAY (Mc, W)
73 | }
74 |
75 | test "wcwidth_standalone combining enclosing keycap exception is width 2" {
76 | const get = @import("get.zig").get;
77 | try testing.expectEqual(2, get(.wcwidth_standalone, 0x20E3)); // COMBINING ENCLOSING KEYCAP
78 | }
79 |
80 | test "wcwidth_zero_in_grapheme combining enclosing keycap exception is true" {
81 | const get = @import("get.zig").get;
82 | try testing.expect(get(.wcwidth_zero_in_grapheme, 0x20E3)); // COMBINING ENCLOSING KEYCAP
83 | }
84 |
85 | test "wcwidth_standalone regional indicators are width 2" {
86 | const get = @import("get.zig").get;
87 | try testing.expectEqual(2, get(.wcwidth_standalone, 0x1F1E6)); // Regional Indicator A
88 | try testing.expectEqual(2, get(.wcwidth_standalone, 0x1F1FA)); // Regional Indicator U
89 | try testing.expectEqual(2, get(.wcwidth_standalone, 0x1F1F8)); // Regional Indicator S
90 | try testing.expectEqual(2, get(.wcwidth_standalone, 0x1F1FF)); // Regional Indicator Z
91 | }
92 |
93 | test "wcwidth_standalone em dashes have special widths" {
94 | const get = @import("get.zig").get;
95 | try testing.expectEqual(2, get(.wcwidth_standalone, 0x2E3A)); // TWO-EM DASH
96 | try testing.expectEqual(3, get(.wcwidth_standalone, 0x2E3B)); // THREE-EM DASH
97 | }
98 |
99 | test "wcwidth_standalone ambiguous width characters are width 1" {
100 | const get = @import("get.zig").get;
101 | try testing.expectEqual(1, get(.wcwidth_standalone, 0x00A1)); // INVERTED EXCLAMATION MARK (A)
102 | try testing.expectEqual(1, get(.wcwidth_standalone, 0x00B1)); // PLUS-MINUS SIGN (A)
103 | try testing.expectEqual(1, get(.wcwidth_standalone, 0x2664)); // WHITE SPADE SUIT (A)
104 | }
105 |
106 | test "wcwidth_standalone east asian wide and fullwidth are width 2" {
107 | const get = @import("get.zig").get;
108 | try testing.expectEqual(2, get(.wcwidth_standalone, 0x3000)); // IDEOGRAPHIC SPACE (F)
109 | try testing.expectEqual(2, get(.wcwidth_standalone, 0xFF01)); // FULLWIDTH EXCLAMATION MARK (F)
110 | try testing.expectEqual(2, get(.wcwidth_standalone, 0x4E00)); // CJK UNIFIED IDEOGRAPH (W)
111 | try testing.expectEqual(2, get(.wcwidth_standalone, 0xAC00)); // HANGUL SYLLABLE (W)
112 | }
113 |
114 | test "wcwidth_standalone hangul jamo V and T are width 1" {
115 | const get = @import("get.zig").get;
116 | try testing.expectEqual(1, get(.wcwidth_standalone, 0x1161)); // HANGUL JUNGSEONG A (V)
117 | try testing.expectEqual(1, get(.wcwidth_standalone, 0x11A8)); // HANGUL JONGSEONG KIYEOK (T)
118 | try testing.expectEqual(1, get(.wcwidth_standalone, 0xD7B0)); // HANGUL JUNGSEONG O-YEO (V)
119 | try testing.expectEqual(1, get(.wcwidth_standalone, 0xD7CB)); // HANGUL JONGSEONG NIEUN-RIEUL (T)
120 | }
121 |
122 | test "wcwidth_zero_in_grapheme hangul jamo V and T are true" {
123 | const get = @import("get.zig").get;
124 | try testing.expect(get(.wcwidth_zero_in_grapheme, 0x1161)); // HANGUL JUNGSEONG A (V)
125 | try testing.expect(get(.wcwidth_zero_in_grapheme, 0x11A8)); // HANGUL JONGSEONG KIYEOK (T)
126 | try testing.expect(get(.wcwidth_zero_in_grapheme, 0xD7B0)); // HANGUL JUNGSEONG O-YEO (V)
127 | try testing.expect(get(.wcwidth_zero_in_grapheme, 0xD7CB)); // HANGUL JONGSEONG NIEUN-RIEUL (T)
128 | try testing.expect(get(.wcwidth_zero_in_grapheme, 0x16D63)); // KIRAT RAI VOWEL SIGN AA (V)
129 | }
130 |
131 | test "wcwidth_standalone format characters non-DI are width 1" {
132 | const get = @import("get.zig").get;
133 | try testing.expectEqual(1, get(.wcwidth_standalone, 0x0600)); // ARABIC NUMBER SIGN (Cf, not DI)
134 | }
135 |
136 | test "wcwidth_zero_in_grapheme format characters non-DI is true" {
137 | const get = @import("get.zig").get;
138 | try testing.expect(get(.wcwidth_zero_in_grapheme, 0x0600)); // ARABIC NUMBER SIGN (Cf, not DI)
139 | }
140 |
141 | test "wcwidth_standalone prepend characters are width 1" {
142 | const get = @import("get.zig").get;
143 | // Lo Prepend (0D4E)
144 | try testing.expectEqual(1, get(.wcwidth_standalone, 0x0D4E));
145 | }
146 |
147 | test "wcwidth_zero_in_grapheme prepend characters are true" {
148 | const get = @import("get.zig").get;
149 | // Lo Prepend (0D4E)
150 | try testing.expect(get(.wcwidth_zero_in_grapheme, 0x0D4E));
151 | }
152 |
153 | test "wcwidth_standalone emoji with default text presentation is 1" {
154 | const get = @import("get.zig").get;
155 | // weight lifter
156 | try testing.expectEqual(1, get(.wcwidth_standalone, 0x1F3CB));
157 | }
158 |
159 | test "wcwidth_standalone emoji_modifier is 2" {
160 | const get = @import("get.zig").get;
161 | try testing.expectEqual(2, get(.wcwidth_standalone, 0x1F3FB)); // 🏻 EMOJI MODIFIER FITZPATRICK TYPE-1-2
162 | try testing.expectEqual(2, get(.wcwidth_standalone, 0x1F3FF)); // 🏿 EMOJI MODIFIER FITZPATRICK TYPE-6
163 | }
164 |
165 | test "wcwidth_zero_in_grapheme emoji_modifier is true" {
166 | const get = @import("get.zig").get;
167 | try testing.expect(get(.wcwidth_zero_in_grapheme, 0x1F3FB)); // 🏻 EMOJI MODIFIER FITZPATRICK TYPE-1-2
168 | try testing.expect(get(.wcwidth_zero_in_grapheme, 0x1F3FF)); // 🏿 EMOJI MODIFIER FITZPATRICK TYPE-6
169 | }
170 |
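// A minimal sketch (not part of the API exercised above) of how these two
// properties could combine per grapheme cluster, loosely mirroring the
// vendored zg `strWidth` reference in resources/wcwidth/zg.zig: code points
// flagged `wcwidth_zero_in_grapheme` contribute nothing, and the cluster
// takes the standalone width of its first remaining code point. The policy
// actually implemented by `config_x.wcwidth` may differ (e.g. around
// VS15/VS16), so treat this only as an illustration of the two fields.
fn sketchClusterWidth(cluster: []const u21) i32 {
    const get = @import("get.zig").get;
    for (cluster) |cp| {
        if (get(.wcwidth_zero_in_grapheme, cp)) continue;
        const w: i32 = get(.wcwidth_standalone, cp);
        if (w != 0) return w;
    }
    return 0;
}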
--------------------------------------------------------------------------------
/ucd/BidiBrackets.txt:
--------------------------------------------------------------------------------
1 | # BidiBrackets-16.0.0.txt
2 | # Date: 2024-02-02
3 | # © 2024 Unicode®, Inc.
4 | # Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries.
5 | # For terms of use and license, see https://www.unicode.org/terms_of_use.html
6 | #
7 | # Unicode Character Database
8 | # For documentation, see https://www.unicode.org/reports/tr44/
9 | #
10 | # Bidi_Paired_Bracket and Bidi_Paired_Bracket_Type Properties
11 | #
12 | # This file is a normative contributory data file in the Unicode
13 | # Character Database.
14 | #
15 | # Bidi_Paired_Bracket is a normative property
16 | # which establishes a mapping between characters that are treated as
17 | # bracket pairs by the Unicode Bidirectional Algorithm.
18 | #
19 | # Bidi_Paired_Bracket_Type is a normative property
20 | # which classifies characters into opening and closing paired brackets
21 | # for the purposes of the Unicode Bidirectional Algorithm.
22 | #
23 | # This file lists the set of code points with Bidi_Paired_Bracket_Type
24 | # property values Open and Close. The set is derived from the character
25 | # properties General_Category (gc), Bidi_Class (bc), Bidi_Mirrored (Bidi_M),
26 | # and Bidi_Mirroring_Glyph (bmg), as follows: two characters, A and B,
27 | # form a bracket pair if A has gc=Ps and B has gc=Pe, both have bc=ON and
28 | # Bidi_M=Y, and bmg of A is B. Bidi_Paired_Bracket (bpb) maps A to B and
29 | # vice versa, and their Bidi_Paired_Bracket_Type (bpt) property values are
30 | # Open (o) and Close (c), respectively.
31 | #
32 | # The brackets with ticks U+298D LEFT SQUARE BRACKET WITH TICK IN TOP CORNER
33 | # through U+2990 RIGHT SQUARE BRACKET WITH TICK IN TOP CORNER are paired the
34 | # same way their glyphs form mirror pairs, according to their bmg property
35 | # values. They are not paired on the basis of a diagonal or antidiagonal
36 | # matching of the corner ticks inferred from code point order.
37 | #
38 | # For legacy reasons, the characters U+FD3E ORNATE LEFT PARENTHESIS and
39 | # U+FD3F ORNATE RIGHT PARENTHESIS do not mirror in bidirectional display
40 | # and therefore do not form a bracket pair.
41 | #
42 | # The Unicode property value stability policy guarantees that characters
43 | # which have bpt=o or bpt=c also have bc=ON and Bidi_M=Y. As a result, an
44 | # implementation can optimize the lookup of the Bidi_Paired_Bracket_Type
45 | # property values Open and Close by restricting the processing to characters
46 | # with bc=ON.
47 | #
48 | # The format of the file is three fields separated by a semicolon.
49 | # Field 0: Unicode code point value, represented as a hexadecimal value
50 | # Field 1: Bidi_Paired_Bracket property value, a code point value or
51 | # Field 2: Bidi_Paired_Bracket_Type property value, one of the following:
52 | # o Open
53 | # c Close
54 | # n None
55 | # The names of the characters in field 0 are given in comments at the end
56 | # of each line.
57 | #
58 | # For information on bidirectional paired brackets, see UAX #9: Unicode
59 | # Bidirectional Algorithm, at https://www.unicode.org/reports/tr9/
60 | #
61 | # This file was originally created by Andrew Glass and Laurentiu Iancu
62 | # for Unicode 6.3.
63 |
64 | 0028; 0029; o # LEFT PARENTHESIS
65 | 0029; 0028; c # RIGHT PARENTHESIS
66 | 005B; 005D; o # LEFT SQUARE BRACKET
67 | 005D; 005B; c # RIGHT SQUARE BRACKET
68 | 007B; 007D; o # LEFT CURLY BRACKET
69 | 007D; 007B; c # RIGHT CURLY BRACKET
70 | 0F3A; 0F3B; o # TIBETAN MARK GUG RTAGS GYON
71 | 0F3B; 0F3A; c # TIBETAN MARK GUG RTAGS GYAS
72 | 0F3C; 0F3D; o # TIBETAN MARK ANG KHANG GYON
73 | 0F3D; 0F3C; c # TIBETAN MARK ANG KHANG GYAS
74 | 169B; 169C; o # OGHAM FEATHER MARK
75 | 169C; 169B; c # OGHAM REVERSED FEATHER MARK
76 | 2045; 2046; o # LEFT SQUARE BRACKET WITH QUILL
77 | 2046; 2045; c # RIGHT SQUARE BRACKET WITH QUILL
78 | 207D; 207E; o # SUPERSCRIPT LEFT PARENTHESIS
79 | 207E; 207D; c # SUPERSCRIPT RIGHT PARENTHESIS
80 | 208D; 208E; o # SUBSCRIPT LEFT PARENTHESIS
81 | 208E; 208D; c # SUBSCRIPT RIGHT PARENTHESIS
82 | 2308; 2309; o # LEFT CEILING
83 | 2309; 2308; c # RIGHT CEILING
84 | 230A; 230B; o # LEFT FLOOR
85 | 230B; 230A; c # RIGHT FLOOR
86 | 2329; 232A; o # LEFT-POINTING ANGLE BRACKET
87 | 232A; 2329; c # RIGHT-POINTING ANGLE BRACKET
88 | 2768; 2769; o # MEDIUM LEFT PARENTHESIS ORNAMENT
89 | 2769; 2768; c # MEDIUM RIGHT PARENTHESIS ORNAMENT
90 | 276A; 276B; o # MEDIUM FLATTENED LEFT PARENTHESIS ORNAMENT
91 | 276B; 276A; c # MEDIUM FLATTENED RIGHT PARENTHESIS ORNAMENT
92 | 276C; 276D; o # MEDIUM LEFT-POINTING ANGLE BRACKET ORNAMENT
93 | 276D; 276C; c # MEDIUM RIGHT-POINTING ANGLE BRACKET ORNAMENT
94 | 276E; 276F; o # HEAVY LEFT-POINTING ANGLE QUOTATION MARK ORNAMENT
95 | 276F; 276E; c # HEAVY RIGHT-POINTING ANGLE QUOTATION MARK ORNAMENT
96 | 2770; 2771; o # HEAVY LEFT-POINTING ANGLE BRACKET ORNAMENT
97 | 2771; 2770; c # HEAVY RIGHT-POINTING ANGLE BRACKET ORNAMENT
98 | 2772; 2773; o # LIGHT LEFT TORTOISE SHELL BRACKET ORNAMENT
99 | 2773; 2772; c # LIGHT RIGHT TORTOISE SHELL BRACKET ORNAMENT
100 | 2774; 2775; o # MEDIUM LEFT CURLY BRACKET ORNAMENT
101 | 2775; 2774; c # MEDIUM RIGHT CURLY BRACKET ORNAMENT
102 | 27C5; 27C6; o # LEFT S-SHAPED BAG DELIMITER
103 | 27C6; 27C5; c # RIGHT S-SHAPED BAG DELIMITER
104 | 27E6; 27E7; o # MATHEMATICAL LEFT WHITE SQUARE BRACKET
105 | 27E7; 27E6; c # MATHEMATICAL RIGHT WHITE SQUARE BRACKET
106 | 27E8; 27E9; o # MATHEMATICAL LEFT ANGLE BRACKET
107 | 27E9; 27E8; c # MATHEMATICAL RIGHT ANGLE BRACKET
108 | 27EA; 27EB; o # MATHEMATICAL LEFT DOUBLE ANGLE BRACKET
109 | 27EB; 27EA; c # MATHEMATICAL RIGHT DOUBLE ANGLE BRACKET
110 | 27EC; 27ED; o # MATHEMATICAL LEFT WHITE TORTOISE SHELL BRACKET
111 | 27ED; 27EC; c # MATHEMATICAL RIGHT WHITE TORTOISE SHELL BRACKET
112 | 27EE; 27EF; o # MATHEMATICAL LEFT FLATTENED PARENTHESIS
113 | 27EF; 27EE; c # MATHEMATICAL RIGHT FLATTENED PARENTHESIS
114 | 2983; 2984; o # LEFT WHITE CURLY BRACKET
115 | 2984; 2983; c # RIGHT WHITE CURLY BRACKET
116 | 2985; 2986; o # LEFT WHITE PARENTHESIS
117 | 2986; 2985; c # RIGHT WHITE PARENTHESIS
118 | 2987; 2988; o # Z NOTATION LEFT IMAGE BRACKET
119 | 2988; 2987; c # Z NOTATION RIGHT IMAGE BRACKET
120 | 2989; 298A; o # Z NOTATION LEFT BINDING BRACKET
121 | 298A; 2989; c # Z NOTATION RIGHT BINDING BRACKET
122 | 298B; 298C; o # LEFT SQUARE BRACKET WITH UNDERBAR
123 | 298C; 298B; c # RIGHT SQUARE BRACKET WITH UNDERBAR
124 | 298D; 2990; o # LEFT SQUARE BRACKET WITH TICK IN TOP CORNER
125 | 298E; 298F; c # RIGHT SQUARE BRACKET WITH TICK IN BOTTOM CORNER
126 | 298F; 298E; o # LEFT SQUARE BRACKET WITH TICK IN BOTTOM CORNER
127 | 2990; 298D; c # RIGHT SQUARE BRACKET WITH TICK IN TOP CORNER
128 | 2991; 2992; o # LEFT ANGLE BRACKET WITH DOT
129 | 2992; 2991; c # RIGHT ANGLE BRACKET WITH DOT
130 | 2993; 2994; o # LEFT ARC LESS-THAN BRACKET
131 | 2994; 2993; c # RIGHT ARC GREATER-THAN BRACKET
132 | 2995; 2996; o # DOUBLE LEFT ARC GREATER-THAN BRACKET
133 | 2996; 2995; c # DOUBLE RIGHT ARC LESS-THAN BRACKET
134 | 2997; 2998; o # LEFT BLACK TORTOISE SHELL BRACKET
135 | 2998; 2997; c # RIGHT BLACK TORTOISE SHELL BRACKET
136 | 29D8; 29D9; o # LEFT WIGGLY FENCE
137 | 29D9; 29D8; c # RIGHT WIGGLY FENCE
138 | 29DA; 29DB; o # LEFT DOUBLE WIGGLY FENCE
139 | 29DB; 29DA; c # RIGHT DOUBLE WIGGLY FENCE
140 | 29FC; 29FD; o # LEFT-POINTING CURVED ANGLE BRACKET
141 | 29FD; 29FC; c # RIGHT-POINTING CURVED ANGLE BRACKET
142 | 2E22; 2E23; o # TOP LEFT HALF BRACKET
143 | 2E23; 2E22; c # TOP RIGHT HALF BRACKET
144 | 2E24; 2E25; o # BOTTOM LEFT HALF BRACKET
145 | 2E25; 2E24; c # BOTTOM RIGHT HALF BRACKET
146 | 2E26; 2E27; o # LEFT SIDEWAYS U BRACKET
147 | 2E27; 2E26; c # RIGHT SIDEWAYS U BRACKET
148 | 2E28; 2E29; o # LEFT DOUBLE PARENTHESIS
149 | 2E29; 2E28; c # RIGHT DOUBLE PARENTHESIS
150 | 2E55; 2E56; o # LEFT SQUARE BRACKET WITH STROKE
151 | 2E56; 2E55; c # RIGHT SQUARE BRACKET WITH STROKE
152 | 2E57; 2E58; o # LEFT SQUARE BRACKET WITH DOUBLE STROKE
153 | 2E58; 2E57; c # RIGHT SQUARE BRACKET WITH DOUBLE STROKE
154 | 2E59; 2E5A; o # TOP HALF LEFT PARENTHESIS
155 | 2E5A; 2E59; c # TOP HALF RIGHT PARENTHESIS
156 | 2E5B; 2E5C; o # BOTTOM HALF LEFT PARENTHESIS
157 | 2E5C; 2E5B; c # BOTTOM HALF RIGHT PARENTHESIS
158 | 3008; 3009; o # LEFT ANGLE BRACKET
159 | 3009; 3008; c # RIGHT ANGLE BRACKET
160 | 300A; 300B; o # LEFT DOUBLE ANGLE BRACKET
161 | 300B; 300A; c # RIGHT DOUBLE ANGLE BRACKET
162 | 300C; 300D; o # LEFT CORNER BRACKET
163 | 300D; 300C; c # RIGHT CORNER BRACKET
164 | 300E; 300F; o # LEFT WHITE CORNER BRACKET
165 | 300F; 300E; c # RIGHT WHITE CORNER BRACKET
166 | 3010; 3011; o # LEFT BLACK LENTICULAR BRACKET
167 | 3011; 3010; c # RIGHT BLACK LENTICULAR BRACKET
168 | 3014; 3015; o # LEFT TORTOISE SHELL BRACKET
169 | 3015; 3014; c # RIGHT TORTOISE SHELL BRACKET
170 | 3016; 3017; o # LEFT WHITE LENTICULAR BRACKET
171 | 3017; 3016; c # RIGHT WHITE LENTICULAR BRACKET
172 | 3018; 3019; o # LEFT WHITE TORTOISE SHELL BRACKET
173 | 3019; 3018; c # RIGHT WHITE TORTOISE SHELL BRACKET
174 | 301A; 301B; o # LEFT WHITE SQUARE BRACKET
175 | 301B; 301A; c # RIGHT WHITE SQUARE BRACKET
176 | FE59; FE5A; o # SMALL LEFT PARENTHESIS
177 | FE5A; FE59; c # SMALL RIGHT PARENTHESIS
178 | FE5B; FE5C; o # SMALL LEFT CURLY BRACKET
179 | FE5C; FE5B; c # SMALL RIGHT CURLY BRACKET
180 | FE5D; FE5E; o # SMALL LEFT TORTOISE SHELL BRACKET
181 | FE5E; FE5D; c # SMALL RIGHT TORTOISE SHELL BRACKET
182 | FF08; FF09; o # FULLWIDTH LEFT PARENTHESIS
183 | FF09; FF08; c # FULLWIDTH RIGHT PARENTHESIS
184 | FF3B; FF3D; o # FULLWIDTH LEFT SQUARE BRACKET
185 | FF3D; FF3B; c # FULLWIDTH RIGHT SQUARE BRACKET
186 | FF5B; FF5D; o # FULLWIDTH LEFT CURLY BRACKET
187 | FF5D; FF5B; c # FULLWIDTH RIGHT CURLY BRACKET
188 | FF5F; FF60; o # FULLWIDTH LEFT WHITE PARENTHESIS
189 | FF60; FF5F; c # FULLWIDTH RIGHT WHITE PARENTHESIS
190 | FF62; FF63; o # HALFWIDTH LEFT CORNER BRACKET
191 | FF63; FF62; c # HALFWIDTH RIGHT CORNER BRACKET
192 |
193 | # EOF
194 |
--------------------------------------------------------------------------------
/resources/wcwidth/wcwidth.py:
--------------------------------------------------------------------------------
1 | # cpv: track https://github.com/jquast/wcwidth/blob/915166f9453098a56e87a7fb69e697696cefe206/wcwidth/wcwidth.py#L103-L203
2 | @lru_cache(maxsize=1000)
3 | def wcwidth(wc, unicode_version='auto'):
4 | r"""
5 | Given one Unicode character, return its printable length on a terminal.
6 |
7 | :param str wc: A single Unicode character.
8 | :param str unicode_version: A Unicode version number, such as
9 |         ``'6.0.0'``. A list of version levels supported by wcwidth
10 | is returned by :func:`list_versions`.
11 |
12 | Any version string may be specified without error -- the nearest
13 | matching version is selected. When ``latest`` (default), the
14 | highest Unicode version level is used.
15 | :return: The width, in cells, necessary to display the character of
16 | Unicode string character, ``wc``. Returns 0 if the ``wc`` argument has
17 | no printable effect on a terminal (such as NUL '\0'), -1 if ``wc`` is
18 | not printable, or has an indeterminate effect on the terminal, such as
19 | a control character. Otherwise, the number of column positions the
20 | character occupies on a graphic terminal (1 or 2) is returned.
21 | :rtype: int
22 |
23 | See :ref:`Specification` for details of cell measurement.
24 | """
25 | ucs = ord(wc) if wc else 0
26 |
27 | # small optimization: early return of 1 for printable ASCII, this provides
28 | # approximately 40% performance improvement for mostly-ascii documents, with
29 | # less than 1% impact to others.
30 | if 32 <= ucs < 0x7f:
31 | return 1
32 |
33 | # C0/C1 control characters are -1 for compatibility with POSIX-like calls
34 | if ucs and ucs < 32 or 0x07F <= ucs < 0x0A0:
35 | return -1
36 |
37 | _unicode_version = _wcmatch_version(unicode_version)
38 |
39 | # Zero width
40 | if _bisearch(ucs, ZERO_WIDTH[_unicode_version]):
41 | return 0
42 |
43 | # 1 or 2 width
44 | return 1 + _bisearch(ucs, WIDE_EASTASIAN[_unicode_version])
45 |
46 |
47 | def wcswidth(pwcs, n=None, unicode_version='auto'):
48 | """
49 | Given a unicode string, return its printable length on a terminal.
50 |
51 | :param str pwcs: Measure width of given unicode string.
52 | :param int n: When ``n`` is None (default), return the length of the entire
53 | string, otherwise only the first ``n`` characters are measured. This
54 | argument exists only for compatibility with the C POSIX function
55 | signature. It is suggested instead to use python's string slicing
56 | capability, ``wcswidth(pwcs[:n])``
57 | :param str unicode_version: An explicit definition of the unicode version
58 | level to use for determination, may be ``auto`` (default), which uses
59 | the Environment Variable, ``UNICODE_VERSION`` if defined, or the latest
60 | available unicode version, otherwise.
61 | :rtype: int
62 | :returns: The width, in cells, needed to display the first ``n`` characters
63 | of the unicode string ``pwcs``. Returns ``-1`` for C0 and C1 control
64 | characters!
65 |
66 | See :ref:`Specification` for details of cell measurement.
67 | """
68 | # this 'n' argument is a holdover for POSIX function
69 | _unicode_version = None
70 | end = len(pwcs) if n is None else n
71 | width = 0
72 | idx = 0
73 | last_measured_char = None
74 | while idx < end:
75 | char = pwcs[idx]
76 | if char == '\u200D':
77 | # Zero Width Joiner, do not measure this or next character
78 | idx += 2
79 | continue
80 | if char == '\uFE0F' and last_measured_char:
81 | # on variation selector 16 (VS16) following another character,
82 | # conditionally add '1' to the measured width if that character is
83 | # known to be converted from narrow to wide by the VS16 character.
84 | if _unicode_version is None:
85 | _unicode_version = _wcversion_value(_wcmatch_version(unicode_version))
86 | if _unicode_version >= (9, 0, 0):
87 | width += _bisearch(ord(last_measured_char), VS16_NARROW_TO_WIDE["9.0.0"])
88 | last_measured_char = None
89 | idx += 1
90 | continue
91 | # measure character at current index
92 | wcw = wcwidth(char, unicode_version)
93 | if wcw < 0:
94 | # early return -1 on C0 and C1 control characters
95 | return wcw
96 | if wcw > 0:
97 | # track last character measured to contain a cell, so that
98 | # subsequent VS-16 modifiers may be understood
99 | last_measured_char = char
100 | width += wcw
101 | idx += 1
102 | return width
103 | # cpv: end
104 |
105 | # cpv: track https://github.com/jquast/wcwidth/blob/915166f9453098a56e87a7fb69e697696cefe206/bin/update-tables.py#L122-L160
106 | @dataclass(frozen=True)
107 | class TableEntry:
108 | """An entry of a unicode table."""
109 | code_range: tuple[int, int] | None
110 | properties: tuple[str, ...]
111 | comment: str
112 |
113 | def filter_by_category_width(self, wide: int) -> bool:
114 | """
115 | Return whether entry matches displayed width.
116 |
117 | Parses both DerivedGeneralCategory.txt and EastAsianWidth.txt
118 | """
119 | if self.code_range is None:
120 | return False
121 | elif self.properties[0] == 'Sk':
122 | if 'EMOJI MODIFIER' in self.comment:
123 | # These codepoints are fullwidth when used without emoji, 0-width with.
124 | # Generate code that expects the best case, that is always combined
125 | return wide == 0
126 | elif 'FULLWIDTH' in self.comment:
127 | # Some codepoints in 'Sk' categories are fullwidth(!)
128 | # at this time just 3, FULLWIDTH: CIRCUMFLEX ACCENT, GRAVE ACCENT, and MACRON
129 | return wide == 2
130 | else:
131 | # the rest are narrow
132 | return wide == 1
133 | # Me Enclosing Mark
134 | # Mn Nonspacing Mark
135 | # Cf Format
136 | # Zl Line Separator
137 | # Zp Paragraph Separator
138 | if self.properties[0] in ('Me', 'Mn', 'Mc', 'Cf', 'Zl', 'Zp'):
139 | return wide == 0
140 | # F Fullwidth
141 | # W Wide
142 | if self.properties[0] in ('W', 'F'):
143 | return wide == 2
144 | return wide == 1
145 | # cpv: end
146 |
147 | # cpv: track https://github.com/jquast/wcwidth/blob/915166f9453098a56e87a7fb69e697696cefe206/bin/update-tables.py#L336-L391
148 | def fetch_table_wide_data() -> UnicodeTableRenderCtx:
149 | """Fetch east-asian tables."""
150 | table: dict[UnicodeVersion, TableDef] = {}
151 | for version in fetch_unicode_versions():
152 | # parse typical 'wide' characters by categories 'W' and 'F',
153 | table[version] = parse_category(fname=UnicodeDataFile.EastAsianWidth(version),
154 | wide=2)
155 |
156 | # subtract(!) wide characters that were defined above as 'W' category in EastAsianWidth,
157 | # but also zero-width category 'Mn' or 'Mc' in DerivedGeneralCategory!
158 | table[version].values = table[version].values.difference(parse_category(
159 | fname=UnicodeDataFile.DerivedGeneralCategory(version),
160 | wide=0).values)
161 |
162 | # Also subtract Hangul Jamo Vowels and Hangul Trailing Consonants
163 | table[version].values = table[version].values.difference(HANGUL_JAMO_ZEROWIDTH)
164 |
165 | # finally, join with atypical 'wide' characters defined by category 'Sk',
166 | table[version].values.update(parse_category(fname=UnicodeDataFile.DerivedGeneralCategory(version),
167 | wide=2).values)
168 | return UnicodeTableRenderCtx('WIDE_EASTASIAN', table)
169 |
170 |
171 | def fetch_table_zero_data() -> UnicodeTableRenderCtx:
172 | """
173 | Fetch zero width tables.
174 |
175 | See also: https://unicode.org/L2/L2002/02368-default-ignorable.html
176 | """
177 | table: dict[UnicodeVersion, TableDef] = {}
178 | for version in fetch_unicode_versions():
179 | # Determine values of zero-width character lookup table by the following category codes
180 | table[version] = parse_category(fname=UnicodeDataFile.DerivedGeneralCategory(version),
181 | wide=0)
182 |
183 | # Include NULL
184 | table[version].values.add(0)
185 |
186 | # Add Hangul Jamo Vowels and Hangul Trailing Consonants
187 | table[version].values.update(HANGUL_JAMO_ZEROWIDTH)
188 |
189 | # Remove u+00AD categoryCode=Cf name="SOFT HYPHEN",
190 | # > https://www.unicode.org/faq/casemap_charprop.html
191 | #
192 | # > Q: Unicode now treats the SOFT HYPHEN as format control (Cf)
193 | # > character when formerly it was a punctuation character (Pd).
194 | # > Doesn't this break ISO 8859-1 compatibility?
195 | #
196 | # > [..] In a terminal emulation environment, particularly in
197 | # > ISO-8859-1 contexts, one could display the SOFT HYPHEN as a hyphen
198 | # > in all circumstances.
199 | #
200 | # This value was wrongly measured as a width of '0' in this wcwidth
201 | # versions 0.2.9 - 0.2.13. Fixed in 0.2.14
202 | table[version].values.discard(0x00AD) # SOFT HYPHEN
203 | return UnicodeTableRenderCtx('ZERO_WIDTH', table)
204 | # cpv: end
205 |
--------------------------------------------------------------------------------
/src/build/test_build_config.zig:
--------------------------------------------------------------------------------
1 | const std = @import("std");
2 | const config = @import("config.zig");
3 | const config_x = @import("config.x.zig");
4 | const types = @import("types.zig");
5 | const d = config.default;
6 |
7 | const Allocator = std.mem.Allocator;
8 | pub const log_level = .debug;
9 |
10 | fn computeFoo(
11 | allocator: Allocator,
12 | cp: u21,
13 | data: anytype,
14 | b: anytype,
15 | t: anytype,
16 | ) Allocator.Error!void {
17 | _ = allocator;
18 | _ = cp;
19 | _ = b;
20 | _ = t;
21 | data.foo = switch (data.original_grapheme_break) {
22 | .other => 0,
23 | .control => 3,
24 | else => 10,
25 | };
26 | }
27 |
28 | const foo = config.Extension{
29 | .inputs = &.{"original_grapheme_break"},
30 | .compute = &computeFoo,
31 | .fields = &.{
32 | .{ .name = "foo", .type = u8 },
33 | },
34 | };
35 |
36 | // Or build your own extension:
37 | const emoji_odd_or_even = config.Extension{
38 | .inputs = &.{"is_emoji"},
39 | .compute = &computeEmojiOddOrEven,
40 | .fields = &.{
41 | .{ .name = "emoji_odd_or_even", .type = EmojiOddOrEven },
42 | },
43 | };
44 |
45 | fn computeEmojiOddOrEven(
46 | allocator: Allocator,
47 | cp: u21,
48 | data: anytype,
49 | backing: anytype,
50 | tracking: anytype,
51 | ) Allocator.Error!void {
52 | // allocator is an ArenaAllocator, so don't worry about freeing
53 | _ = allocator;
54 |
55 |     // backing and tracking are only used for slice types (see the `info`
56 |     // extension below for an example).
57 | _ = backing;
58 | _ = tracking;
59 |
60 | if (!data.is_emoji) {
61 | data.emoji_odd_or_even = .not_emoji;
62 | } else if (cp % 2 == 0) {
63 | data.emoji_odd_or_even = .even_emoji;
64 | } else {
65 | data.emoji_odd_or_even = .odd_emoji;
66 | }
67 | }
68 |
69 | // Types must be marked `pub`
70 | pub const EmojiOddOrEven = enum(u2) {
71 | not_emoji,
72 | even_emoji,
73 | odd_emoji,
74 | };
75 |
76 | const info = config.Extension{
77 | .inputs = &.{
78 | "uppercase_mapping",
79 | "numeric_value_numeric",
80 | "numeric_value_decimal",
81 | "simple_lowercase_mapping",
82 | },
83 | .compute = &computeInfo,
84 | .fields = &.{
85 | .{
86 | .name = "uppercase_mapping_first_char",
87 | .type = u21,
88 | .cp_packing = .shift,
89 | .shift_low = -64190,
90 | .shift_high = 42561,
91 | },
92 | .{ .name = "has_simple_lowercase", .type = bool },
93 | .{
94 | .name = "numeric_value_numeric_reversed",
95 | .type = []const u8,
96 | .max_len = 13,
97 | .max_offset = 503,
98 | .embedded_len = 1,
99 | },
100 | },
101 | };
102 |
103 | fn computeInfo(
104 | allocator: Allocator,
105 | cp: u21,
106 | data: anytype,
107 | backing: anytype,
108 | tracking: anytype,
109 | ) Allocator.Error!void {
110 | var single_item_buffer: [1]u21 = undefined;
111 | types.fieldInit(
112 | "uppercase_mapping_first_char",
113 | cp,
114 | data,
115 | tracking,
116 | data.uppercase_mapping.sliceWith(
117 | backing.uppercase_mapping,
118 | &single_item_buffer,
119 | cp,
120 | )[0],
121 | );
122 |
123 | data.has_simple_lowercase = data.simple_lowercase_mapping.unshift(cp) != null;
124 |
125 | var buffer: [13]u8 = undefined;
126 | for (data.numeric_value_numeric.slice(backing.numeric_value_numeric), 0..) |digit, i| {
127 | buffer[data.numeric_value_numeric.len - i - 1] = digit;
128 | }
129 |
130 | try types.sliceFieldInit(
131 | "numeric_value_numeric_reversed",
132 | allocator,
133 | cp,
134 | data,
135 | backing,
136 | tracking,
137 | buffer[0..data.numeric_value_numeric.len],
138 | );
139 | }
140 |
141 | fn computeOptEmojiOddOrEven(
142 | allocator: Allocator,
143 | cp: u21,
144 | data: anytype,
145 | b: anytype,
146 | tracking: anytype,
147 | ) Allocator.Error!void {
148 | _ = allocator;
149 | _ = b;
150 | types.fieldInit(
151 | "opt_emoji_odd_or_even",
152 | cp,
153 | data,
154 | tracking,
155 | @as(?EmojiOddOrEven, switch (data.emoji_odd_or_even) {
156 | .even_emoji => .even_emoji,
157 | .odd_emoji => .odd_emoji,
158 | .not_emoji => null,
159 | }),
160 | );
161 | }
162 |
163 | const opt_emoji_odd_or_even = config.Extension{
164 | .inputs = &.{"emoji_odd_or_even"},
165 | .compute = &computeOptEmojiOddOrEven,
166 | .fields = &.{
167 | .{
168 | .name = "opt_emoji_odd_or_even",
169 | .type = ?EmojiOddOrEven,
170 | .min_value = 0,
171 | .max_value = 2,
172 | },
173 | },
174 | };
175 |
176 | pub const NextOrPrev = union(enum) {
177 | none: void,
178 | next: u21,
179 | prev: u21,
180 | };
181 |
182 | fn computeNextOrPrev(
183 | allocator: Allocator,
184 | cp: u21,
185 | data: anytype,
186 | b: anytype,
187 | tracking: anytype,
188 | ) Allocator.Error!void {
189 | _ = allocator;
190 | _ = b;
191 | var nop: NextOrPrev = .none;
192 | if (0x1200 <= cp and cp <= 0x1235) {
193 | nop = switch (cp % 3) {
194 | 0 => .{ .next = cp + 1 },
195 | 1 => .{ .prev = cp - 1 },
196 | 2 => .none,
197 | else => unreachable,
198 | };
199 | }
200 |
201 | types.fieldInit(
202 | "next_or_prev",
203 | cp,
204 | data,
205 | tracking,
206 | nop,
207 | );
208 | }
209 |
210 | const next_or_prev = config.Extension{
211 | .inputs = &.{},
212 | .compute = &computeNextOrPrev,
213 | .fields = &.{
214 | .{
215 | .name = "next_or_prev",
216 | .type = NextOrPrev,
217 | .cp_packing = .shift,
218 | .shift_low = -1,
219 | .shift_high = 1,
220 | },
221 | },
222 | };
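// Note on `.cp_packing = .shift`: the stored payload appears to be the offset
// of the code-point value from `cp` itself, which is why the table data is
// read back with `unshift(cp)` (see computeNextOrPrevDirect below). For
// `next_or_prev` the payloads are cp-1/cp+1, so shift_low/shift_high of -1/1
// bound the stored range; `uppercase_mapping_first_char` above needs the much
// wider -64190..42561.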
223 |
224 | fn computeNextOrPrevDirect(
225 | allocator: Allocator,
226 | cp: u21,
227 | data: anytype,
228 | b: anytype,
229 | tracking: anytype,
230 | ) Allocator.Error!void {
231 | _ = allocator;
232 | _ = b;
233 | types.fieldInit(
234 | "next_or_prev_direct",
235 | cp,
236 | data,
237 | tracking,
238 | data.next_or_prev.unshift(cp),
239 | );
240 | }
241 |
242 | const next_or_prev_direct = config.Extension{
243 | .inputs = &.{"next_or_prev"},
244 | .compute = &computeNextOrPrevDirect,
245 | .fields = &.{
246 | .{
247 | .name = "next_or_prev_direct",
248 | .type = NextOrPrev,
249 | },
250 | },
251 | };
252 |
253 | fn computeBidiPairedBracketDirect(
254 | allocator: Allocator,
255 | cp: u21,
256 | data: anytype,
257 | b: anytype,
258 | tracking: anytype,
259 | ) Allocator.Error!void {
260 | _ = allocator;
261 | _ = b;
262 | types.fieldInit(
263 | "bidi_paired_bracket_direct",
264 | cp,
265 | data,
266 | tracking,
267 | data.bidi_paired_bracket.unshift(cp),
268 | );
269 | }
270 |
271 | const bidi_paired_bracket_direct = config.Extension{
272 | .inputs = &.{"bidi_paired_bracket"},
273 | .compute = &computeBidiPairedBracketDirect,
274 | .fields = &.{
275 | .{
276 | .name = "bidi_paired_bracket_direct",
277 | .type = types.BidiPairedBracket,
278 | },
279 | },
280 | };
281 |
282 | fn computeMaybeBit(
283 | allocator: Allocator,
284 | cp: u21,
285 | data: anytype,
286 | b: anytype,
287 | tracking: anytype,
288 | ) Allocator.Error!void {
289 | _ = allocator;
290 | _ = b;
291 | var maybe: ?bool = null;
292 | if (0x1200 <= cp and cp <= 0x1235) {
293 | maybe = cp % 2 == 0;
294 | }
295 |
296 | types.fieldInit(
297 | "maybe_bit",
298 | cp,
299 | data,
300 | tracking,
301 | maybe,
302 | );
303 | }
304 |
305 | const maybe_bit = config.Extension{
306 | .inputs = &.{},
307 | .compute = &computeMaybeBit,
308 | .fields = &.{
309 | .{
310 | .name = "maybe_bit",
311 | .type = ?bool,
312 | .min_value = 0,
313 | .max_value = 1,
314 | },
315 | },
316 | };
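// `.min_value`/`.max_value` on `opt_emoji_odd_or_even` and `maybe_bit` above
// presumably give the generator tight bounds for the non-null payload (0..2
// for the enum, 0..1 for the bool) so that each packed optional can still be
// stored in a minimal number of bits; the "get packed optional ..." tests
// exercise both the present and null cases.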
317 |
318 | pub const tables = [_]config.Table{
319 | .{
320 | .extensions = &.{
321 | foo,
322 | emoji_odd_or_even,
323 | info,
324 | next_or_prev,
325 | next_or_prev_direct,
326 | bidi_paired_bracket_direct,
327 | },
328 | .fields = &.{
329 | foo.field("foo"),
330 | emoji_odd_or_even.field("emoji_odd_or_even"),
331 | info.field("uppercase_mapping_first_char"),
332 | info.field("has_simple_lowercase"),
333 | info.field("numeric_value_numeric_reversed"),
334 | next_or_prev.field("next_or_prev"),
335 | next_or_prev_direct.field("next_or_prev_direct"),
336 | bidi_paired_bracket_direct.field("bidi_paired_bracket_direct"),
337 | d.field("name").override(.{
338 | .embedded_len = 15,
339 | .max_offset = 986096,
340 | }),
341 | d.field("grapheme_break"),
342 | d.field("special_lowercase_mapping"),
343 | },
344 | },
345 | .{
346 | .stages = .two,
347 | .fields = &.{
348 | d.field("general_category"),
349 | d.field("case_folding_simple"),
350 | },
351 | },
352 | .{
353 | .name = "pack",
354 | .packing = .@"packed",
355 | .extensions = &.{
356 | emoji_odd_or_even,
357 | opt_emoji_odd_or_even,
358 | maybe_bit,
359 | },
360 | .fields = &.{
361 | opt_emoji_odd_or_even.field("opt_emoji_odd_or_even"),
362 | maybe_bit.field("maybe_bit"),
363 | d.field("bidi_paired_bracket"),
364 | },
365 | },
366 | .{
367 | .name = "checks",
368 | .extensions = &.{},
369 | .fields = &.{
370 | d.field("simple_uppercase_mapping"),
371 | d.field("is_alphabetic"),
372 | d.field("is_lowercase"),
373 | d.field("is_uppercase"),
374 | d.field("is_emoji_vs_base"),
375 | d.field("is_emoji_modifier_base"),
376 | },
377 | },
378 | .{
379 | .name = "needed_for_tests_and_build_build_config",
380 | .extensions = &.{
381 | config_x.wcwidth,
382 | config_x.grapheme_break_no_control,
383 | },
384 | .fields = &config._resolveFields(
385 | config_x,
386 | &.{
387 | "wcwidth_standalone",
388 | "wcwidth_zero_in_grapheme",
389 | "grapheme_break_no_control",
390 | "special_casing_condition",
391 | "bidi_class",
392 | },
393 | &.{ "wcwidth", "grapheme_break_no_control" },
394 | ),
395 | },
396 | };
397 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # uucode (Micro/µ Unicode)
2 |
3 | A fast and flexible Unicode library, fully configurable at build time.
4 |
5 | ## Basic usage
6 |
7 | ``` zig
8 | const uucode = @import("uucode");
9 |
10 | var cp: u21 = undefined;
11 |
12 | //////////////////////
13 | // `get` properties
14 |
15 | cp = 0x2200; // ∀
16 | uucode.get(.general_category, cp) // .symbol_math
17 |
18 | cp = 0x03C2; // ς
19 | uucode.get(.simple_uppercase_mapping, cp) // U+03A3 == Σ
20 |
21 | cp = 0x21C1; // ⇁
22 | uucode.get(.name, cp) // "RIGHTWARDS HARPOON WITH BARB DOWNWARDS"
23 |
24 | // Many of the []const u21 fields need a single item buffer passed to `with`:
25 | var buffer: [1]u21 = undefined;
26 | cp = 0x00DF; // ß
27 | uucode.get(.uppercase_mapping, cp).with(&buffer, cp) // "SS"
28 |
29 | //////////////////////
30 | // `getAll` to get a group of properties for a code point together.
31 |
32 | cp = 0x03C2; // ς
33 |
34 | // The first argument is the name/index of the table.
35 | const data = uucode.getAll("0", cp);
36 |
37 | data.simple_uppercase_mapping // U+03A3 == Σ
38 | data.general_category // .letter_lowercase
39 |
40 | //////////////////////
41 | // utf8.Iterator
42 |
43 | var it = uucode.utf8.Iterator.init("😀😅😻👺");
44 | it.next(); // 0x1F600
45 | it.i; // 4 (bytes into the utf8 string)
46 | it.peek(); // 0x1F605
47 | it.next(); // 0x1F605
48 | it.next(); // 0x1F63B
49 | it.next(); // 0x1F47A
50 |
51 | //////////////////////
52 | // grapheme.Iterator / grapheme.utf8Iterator
53 |
54 | const str = "👩🏽🚀🇨🇭👨🏻🍼";
55 | var it = uucode.grapheme.utf8Iterator(str);
56 | // (which is equivalent to:)
57 | var it = uucode.grapheme.Iterator(uucode.utf8.Iterator).init(.init(str));
58 |
59 | // `nextCodePoint` advances one code point at a time, indicating a new grapheme
60 | // with `is_break = true`.
61 | it.nextCodePoint(); // { .code_point = 0x1F469, .is_break = false } // 👩
62 | it.i; // 4 (bytes into the utf8 string)
63 |
64 | it.peekCodePoint(); // { .code_point = 0x1F3FD, .is_break = false } // 🏽
65 | it.nextCodePoint(); // { .code_point = 0x1F3FD, .is_break = false } // 🏽
66 | it.nextCodePoint(); // { .code_point = 0x200D, .is_break = false } // Zero width joiner
67 | it.nextCodePoint(); // { .code_point = 0x1F680, .is_break = true } // 🚀
68 |
69 | // `nextGrapheme` advances until the start of the next grapheme cluster
70 | const result = it.nextGrapheme(); // { .start = 15, .end = 23 }
71 | it.i; // "👩🏽🚀🇨🇭".len
72 | str[result.?.start..result.?.end]; // "🇨🇭"
73 |
74 | const result = it.peekGrapheme();
75 | str[result.?.start..result.?.end]; // "👨🏻🍼"
76 |
77 | //////////////////////
78 | // grapheme.isBreak
79 |
80 | var break_state: uucode.grapheme.BreakState = .default;
81 |
82 | var cp1: u21 = 0x1F469; // 👩
83 | var cp2: u21 = 0x1F3FD; // 🏽
84 | uucode.grapheme.isBreak(cp1, cp2, &break_state); // false
85 |
86 | cp1 = cp2;
87 | cp2 = 0x200D; // Zero width joiner
88 | uucode.grapheme.isBreak(cp1, cp2, &break_state); // false
89 |
90 | cp1 = cp2;
91 | cp2 = 0x1F680; // 🚀
92 | // The combined grapheme cluster is 👩🏽🚀 (woman astronaut)
93 | uucode.grapheme.isBreak(cp1, cp2, &break_state); // false
94 |
95 | cp1 = cp2;
96 | cp2 = 0x1F468; // 👨
97 | uucode.grapheme.isBreak(cp1, cp2, &break_state); // true
98 |
99 | //////////////////////
100 | // x.grapheme.wcwidth{,Next,Remaining} / x.grapheme.utf8Wcwidth
101 |
102 | const str = "ò👨🏻❤️👨🏿_";
103 | var it = uucode.grapheme.utf8Iterator(str);
104 |
105 | // Requires the `wcwidth` builtin extension (see below)
106 | uucode.x.grapheme.wcwidth(it); // 1 for 'ò'
107 |
108 | uucode.x.grapheme.wcwidthNext(&it); // 1 for 'ò'
109 | const result = it.peekGrapheme();
110 | str[result.?.start..result.?.end]; // "👨🏻❤️👨🏿"
111 |
112 | uucode.x.grapheme.wcwidthRemaining(&it); // 3 for "👨🏻❤️👨🏿_"
113 |
114 | uucode.x.grapheme.utf8Wcwidth(str); // 4 for the whole string
115 |
116 | //////////////////////
117 | // TypeOf / TypeOfAll / hasField
118 |
119 | uucode.TypeOf(.general_category) // uucode.types.GeneralCategory
120 | uucode.TypeOfAll("0") // @TypeOf(uucode.getAll("0"))
121 | uucode.hasField("is_emoji") // true if `is_emoji` is in one of your tables
122 | ```
123 |
124 | See [src/config.zig](./src/config.zig) for the names of all fields.
125 |
126 | ## Configuration
127 |
128 | Only include the Unicode fields you actually use:
129 |
130 | ``` zig
131 | // In `build.zig`:
132 | if (b.lazyDependency("uucode", .{
133 | .target = target,
134 | .optimize = optimize,
135 | .fields = @as([]const []const u8, &.{
136 | "name",
137 | "general_category",
138 | "case_folding_simple",
139 | "is_alphabetic",
140 | // ...
141 | }),
142 | })) |dep| {
143 | step.root_module.addImport("uucode", dep.module("uucode"));
144 | }
145 | ```
146 |
147 | ### Multiple tables
148 |
149 | Fields can be split into multiple tables using `fields_0` through `fields_9`, to optimize how fields are stored and accessed (with no code changes needed).
150 |
151 |
152 | ``` zig
153 | // In `build.zig`:
154 | if (b.lazyDependency("uucode", .{
155 | .target = target,
156 | .optimize = optimize,
157 | .fields_0 = @as([]const []const u8, &.{
158 | "general_category",
159 | "case_folding_simple",
160 | "is_alphabetic",
161 | }),
162 | .fields_1 = @as([]const []const u8, &.{
163 | // ...
164 | }),
165 | .fields_2 = @as([]const []const u8, &.{
166 | // ...
167 | }),
168 | // ... `fields_3` to `fields_9`
169 | })) |dep| {
170 | step.root_module.addImport("uucode", dep.module("uucode"));
171 | }
172 | ```
173 |
174 | ### Builtin extensions
175 |
176 | `uucode` includes builtin extensions that add derived properties. Use `extensions` or `extensions_0` through `extensions_9` to include them:
177 |
178 | ``` zig
179 | // In `build.zig`:
180 | if (b.lazyDependency("uucode", .{
181 | .target = target,
182 | .optimize = optimize,
183 | .extensions = @as([]const []const u8, &.{
184 | "wcwidth",
185 | }),
186 | .fields = @as([]const []const u8, &.{
187 | // Make sure to also include the extension's fields here:
188 | "wcwidth_standalone",
189 | "wcwidth_zero_in_grapheme",
190 | ...
191 | "general_category",
192 | }),
193 | })) |dep| {
194 | step.root_module.addImport("uucode", dep.module("uucode"));
195 | }
196 |
197 | // In your code:
198 | uucode.get(.wcwidth_standalone, 0x26F5) // ⛵ == 2
199 | ```
200 |
201 | See [src/x/config.x.zig](src/x/config.x.zig) for the full list of builtin extensions.
202 |
203 | ### Advanced configuration
204 |
205 | ``` zig
206 | ///////////////////////////////////////////////////////////
207 | // In `build.zig`:
208 |
209 | b.dependency("uucode", .{
210 | .target = target,
211 | .optimize = optimize,
212 | .build_config_path = b.path("src/build/uucode_config.zig"),
213 |
214 | // Alternatively, use a string literal:
215 | //.@"build_config.zig" = "..."
216 | })
217 |
218 | ///////////////////////////////////////////////////////////
219 | // In `src/build/uucode_config.zig`:
220 |
221 | const std = @import("std");
222 | const config = @import("config.zig");
223 |
224 | // Use `config.x.zig` for builtin extensions:
225 | const config_x = @import("config.x.zig");
226 |
227 | const d = config.default;
228 | const wcwidth = config_x.wcwidth;
229 |
230 | // Or build your own extension:
231 | const emoji_odd_or_even = config.Extension{
232 | .inputs = &.{"is_emoji"},
233 | .compute = &computeEmojiOddOrEven,
234 | .fields = &.{
235 | .{ .name = "emoji_odd_or_even", .type = EmojiOddOrEven },
236 | },
237 | };
238 |
239 | fn computeEmojiOddOrEven(
240 | allocator: std.mem.Allocator,
241 | cp: u21,
242 | data: anytype,
243 | backing: anytype,
244 | tracking: anytype,
245 | ) std.mem.Allocator.Error!void {
246 | // allocator is an ArenaAllocator, so don't worry about freeing
247 | _ = allocator;
248 |
249 | // backing and tracking are only used for slice types (see
250 | // src/build/test_build_config.zig for examples).
251 | _ = backing;
252 | _ = tracking;
253 |
254 | if (!data.is_emoji) {
255 | data.emoji_odd_or_even = .not_emoji;
256 | } else if (cp % 2 == 0) {
257 | data.emoji_odd_or_even = .even_emoji;
258 | } else {
259 | data.emoji_odd_or_even = .odd_emoji;
260 | }
261 | }
262 |
263 | // Types must be marked `pub`
264 | pub const EmojiOddOrEven = enum(u2) {
265 | not_emoji,
266 | even_emoji,
267 | odd_emoji,
268 | };
269 |
270 | // Configure tables with the `tables` declaration.
271 | // The only required field is `fields`, and the rest have reasonable defaults.
272 | pub const tables = [_]config.Table{
273 | .{
274 | // Optional name, to be able to `getAll("foo")` rather than e.g.
275 | // `getAll("0")`
276 | .name = "foo",
277 |
278 | // A two stage table can be slightly faster if the data is small. The
279 | // default `.auto` will pick a reasonable value, but to get the
280 | // absolute best performance run benchmarks with `.two` or `.three`
281 | // on realistic data.
282 | .stages = .three,
283 |
284 |         // The default `.auto` value decides whether the final data stage struct
285 | // should be a `packed struct` (.@"packed") or a regular Zig `struct`.
286 | .packing = .unpacked,
287 |
288 | .extensions = &.{
289 | emoji_odd_or_even,
290 | wcwidth,
291 | },
292 |
293 | .fields = &.{
294 | // Don't forget to include the extension's fields here.
295 | emoji_odd_or_even.field("emoji_odd_or_even"),
296 | wcwidth.field("wcwidth_standalone"),
297 | wcwidth.field("wcwidth_zero_in_grapheme"),
298 |
299 |             // See `src/config.zig` for everything that can be overridden.
300 | // In this example, we're embedding 15 bytes into the `stage3` data,
301 | // and only names longer than that need to use the `backing` slice.
302 | d.field("name").override(.{
303 | .embedded_len = 15,
304 | .max_offset = 986096, // run once to get the correct number
305 | }),
306 |
307 | d.field("general_category"),
308 | d.field("block"),
309 | // ...
310 | },
311 | },
312 | };
313 |
314 | // Turn on debug logging:
315 | pub const log_level = .debug;
316 |
317 | ///////////////////////////////////////////////////////////
318 | // In your code:
319 |
320 | const uucode = @import("uucode");
321 |
322 | uucode.get(.wcwidth_standalone, 0x26F5) // ⛵ == 2
323 |
324 | uucode.get(.emoji_odd_or_even, 0x1F34B) // 🍋 == .odd_emoji
325 |
326 | ```
327 |
328 | ## Code architecture
329 |
330 | The architecture works in a few layers:
331 |
332 | * Layer 1 (`src/build/Ucd.zig`): Parses the Unicode Character Database (UCD).
333 | * Layer 2 (`src/build/tables.zig`): Generates table data written to a zig file.
334 | * Layer 3 (`src/root.zig`): Exposes methods to fetch information from the built tables (see the sketch below).
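
As a rough sketch of what the layer-3 lookup boils down to (a hypothetical, miniature table; the real generated arrays, names, block sizes, and fields come from your table configuration, not from this example):

``` zig
const std = @import("std");

// stage1 maps a block of code points to a row in stage2, and stage2 maps each
// slot in that row to a deduplicated record in stage3.
const Data = struct { width: u2 };

const stage3 = [_]Data{ .{ .width = 1 }, .{ .width = 2 } };
const stage2 = [_]u8{ 0, 0, 1, 1 }; // two blocks of two slots each
const stage1 = [_]u8{ 0, 1 }; // block index per chunk of code points

fn lookup(cp: u21) Data {
    const block_size = 2; // illustrative only; real tables use larger blocks
    const block: usize = stage1[cp / block_size];
    return stage3[stage2[block * block_size + cp % block_size]];
}

test "staged lookup" {
    try std.testing.expectEqual(@as(u2, 1), lookup(0).width);
    try std.testing.expectEqual(@as(u2, 2), lookup(3).width);
}
```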
335 |
336 | ## History and acknowledgments
337 |
338 | `uucode` began out of work on the [Ghostty terminal](https://ghostty.org/) on [an issue to upgrade dependencies](https://github.com/ghostty-org/ghostty/issues/5694), where the experience of modifying [zg](https://codeberg.org/atman/zg/) gave the confidence to build a new library from scratch.
339 |
340 | `uucode` builds upon the Unicode performance work done in Ghostty, [as outlined in this excellent Devlog](https://mitchellh.com/writing/ghostty-devlog-006). The 3-stage lookup tables, as mentioned in that Devlog, come from [this article](https://here-be-braces.com/fast-lookup-of-unicode-properties/).
341 |
342 | ## License
343 |
344 | `uucode` is available under an MIT License. See [./LICENSE.md](./LICENSE.md) for the license text and an index of licenses for code used in the repo.
345 |
346 | ## Resources
347 |
348 | See [./RESOURCES.md](./RESOURCES.md) for a list of resources used to build `uucode`.
349 |
--------------------------------------------------------------------------------
/src/x/config_x/wcwidth.zig:
--------------------------------------------------------------------------------
1 | //! The `wcwidth` is a calculation of the expected width of a code point in
2 | //! cells of a monospaced font. It is not part of the Unicode standard.
3 | //!
4 | //! IMPORTANT: in general, calculate the width of a grapheme cluster with
5 | //! `uucode.x.grapheme.wcwidth(it)` instead of using this `wcwidth`
6 | //! directly. If it's already known that a code point is standing alone and not
7 | //! part of a multiple-code-point grapheme cluster, it's acceptable to use
8 | //! `wcwidth_standalone` directly.
9 | //!
10 | //! This `wcwidth` calculates two related values:
11 | //!
12 | //! * `wcwidth_standalone`: The width for a code point as it would display
13 | //!   **standing alone** without being combined with other code points in a
14 | //! grapheme cluster. Put another way, this is the width of a grapheme
15 | //! cluster consisting of only this code point. For some code points, it is
16 | //! rare or even technically "invalid" to be alone in a grapheme cluster but
17 | //! despite that, we provide a width for them. See `wcwidth` in
18 | //! `src/x/grapheme.zig` for the code and documentation for determining the
19 | //! width of a grapheme cluster that may contain multiple code points, and
20 | //!   note how it uses this `wcwidth_standalone` when there is only one code
21 | //! point.
22 | //!
23 | //! * `wcwidth_zero_in_grapheme`: This indicates whether a code point does not
24 | //! contribute to width within a grapheme cluster, even if the code point
25 | //! might have width when standing alone (`wcwidth_standalone`). Emoji
26 | //! modifiers, nonspacing and enclosing marks, and Hangul/Kirat V/T are all
27 | //! in this category.
28 | //!
29 | //! See resources/wcwidth for other implementations that help inform the
30 | //! implementation here.
31 | //!
32 | //! This implementation makes the following choices:
33 | //!
34 | //! * The returned width is never negative. C0 and C1 control characters are
35 | //! treated as zero width, diverging from some implementations that return
36 | //! -1.
37 | //!
38 | //! * When a combining mark (Mn, Mc, Me) stands alone (not preceded by a base
39 | //! character), it forms a "defective combining character sequence" (Core Spec
40 | //! 3.6,
41 | //! https://www.unicode.org/versions/Unicode17.0.0/core-spec/chapter-3/#G20665).
42 | //! Per Core Spec 5.13: "Defective combining character sequences should be
43 | //! rendered as if they had a no-break space as a base character"
44 | //! (https://www.unicode.org/versions/Unicode17.0.0/core-spec/chapter-5/#G1099).
45 | //! Therefore, `wcwidth_standalone` is given a width of 1.
46 | //!
47 | //! Note: Per UAX #44, nonspacing marks (Mn) have "zero advance width" while
48 | //! spacing marks (Mc) have "positive advance width"
49 | //! (https://www.unicode.org/reports/tr44/#General_Category_Values).
50 | //! Enclosing marks (Me) are not explicitly specified, but in terminal
51 | //! rendering contexts they behave similarly to nonspacing marks. See also
52 | //! Core Spec 2.11, "Nonspacing combining characters do not occupy a spacing
53 | //! position by themselves"
54 | //! (https://www.unicode.org/versions/Unicode17.0.0/core-spec/chapter-2/#G1789).
55 | //! Therefore, `wcwidth_zero_in_grapheme` is true for nonspacing marks (Mn)
56 | //! and enclosing marks (Me).
57 | //!
58 | //! * East Asian Width (UAX #11, https://www.unicode.org/reports/tr11/) is used
59 | //! to determine width, but only as a starting point. UAX #11 warns that
60 | //! East_Asian_Width "is not intended for use by modern terminal emulators
61 | //! without appropriate tailoring" (UAX #11 §2,
62 | //! https://www.unicode.org/reports/tr11/#Scope). This implementation applies
63 | //! tailoring for specific cases such as regional indicators.
64 | //!
65 | //! Ambiguous width (A) characters are treated as width 1. Per UAX #11 §5
66 | //! Recommendations: "If the context cannot be established reliably, they
67 | //! should be treated as narrow characters by default"
68 | //! (https://www.unicode.org/reports/tr11/#Recommendations), and per UAX #11
69 | //! §4.2 Ambiguous Characters: "Modern practice is evolving toward rendering
70 | //! ever more of the ambiguous characters with proportionally spaced, narrow
71 | //! forms that rotate with the direction of writing, independent of their
72 | //! treatment in one or more legacy character sets."
73 | //!
74 | //! * U+20E3 COMBINING ENCLOSING KEYCAP is commonly used in emoji keycap
75 | //! sequences like 1️⃣ (digit + VS16 + U+20E3), but when standing alone might
76 | //!   render as an empty keycap symbol visually occupying 2 cells, so it is
77 | //! given width 2. This is a special case—other enclosing marks like U+20DD
78 | //! COMBINING ENCLOSING CIRCLE are width 1. UTS #51 §1.4.6 ED-20 states
79 | //! "Other components (U+20E3 COMBINING ENCLOSING KEYCAP, ...) should never
80 | //! have an emoji presentation in isolation"
81 | //! (https://www.unicode.org/reports/tr51/#def_basic_emoji_set), so this
82 | //! should display with text presentation standing alone. For
83 | //! `wcwidth_zero_in_grapheme`, it is true, as it should usually follow VS16
84 | //! preceded by a digit or '#', and so the entire keycap sequence will be a
85 | //! width of 2 from the special VS16 handling.
86 | //!
87 | //! * Regional indicator symbols (U+1F1E6..U+1F1FF) are treated as width 2,
88 | //! whether paired in valid emoji flag sequences or standing alone. Per UTS #51
89 | //! §1.5 Conformance: "A singleton emoji Regional Indicator may be displayed
90 | //! as a capital A..Z character with a special display"
91 | //! (https://www.unicode.org/reports/tr51/#C3). Unpaired regional indicators
92 | //! commonly render as the corresponding letter in a width-2 box (e.g., 🇺
93 | //! displays as "U" in a box). See the above bullet point (U+20E3) for the
94 | //! text from UTS #51 §1.4.6 ED-20 that also applies to regional indicators,
95 | //! meaning they should have a text presentation in isolation.
96 | //!
97 | //! * Default_Ignorable_Code_Point characters are treated as width 0. These are
98 | //! characters that "should be ignored in rendering (unless explicitly
99 | //! supported)" (UAX #44,
100 | //! https://www.unicode.org/reports/tr44/#Default_Ignorable_Code_Point). This
101 | //! includes variation selectors, join controls (ZWJ/ZWNJ), bidi formatting
102 | //! controls, tag characters, and other invisible format controls.
103 | //!
104 | //! Exception: U+00AD SOFT HYPHEN is treated as width 1 for terminal
105 | //! compatibility despite being default-ignorable. Per the Unicode FAQ: "In a
106 | //! terminal emulation environment, particularly in ISO-8859-1 contexts, one
107 | //! could display the SOFT HYPHEN as a hyphen in all circumstances"
108 | //! (https://www.unicode.org/faq/casemap_charprop.html). Terminals lack
109 | //! sophisticated word-breaking algorithms and typically display SOFT HYPHEN as
110 | //! a visible hyphen, requiring width 1. This matches ecosystem wcwidth
111 | //! implementations.
112 | //!
113 | //! VS15 and VS16 have `wcwidth_zero_in_grapheme` set to true. These are not
114 | //! "zero in grapheme" in the sense that they don't affect width--they change
115 | //! the width of the base char! But they don't have their *own* independent
116 | //! width contribution that should be summed. They are special cased in the
117 | //! `x/grapheme.zig` `wcwidth` calculation.
118 | //!
119 | //! * Hangul Jamo medial vowels and Kirat Rai vowels (all
120 | //! Grapheme_Cluster_Break=V) and Hangul trailing consonants
121 | //! (Grapheme_Cluster_Break=T) are width 1 for wcwidth_standalone since they
122 | //! are General_Category=Other_Letter with East_Asian_Width=Neutral. However,
123 | //! `wcwidth_zero_in_grapheme` is true for these, as they should only be
124 | //! present in a grapheme cluster where the other code points contribute to
125 | //! the width.
126 | //!
127 | //! * Grapheme_Cluster_Break=Prepend characters (e.g., Indic Rephas) are treated
128 | //! as width 1 when standing alone, but join with subsequent code points and
129 | //! are `wcwidth_zero_in_grapheme` true. Note that none of the Prepend
130 | //! characters are default-ignorable.
131 | //!
132 | //! * Surrogates (General_Category=Cs, U+D800..U+DFFF) are treated as width 0.
133 | //! They are not Unicode scalar values (Core Spec 3.9,
134 | //! https://www.unicode.org/versions/Unicode17.0.0/core-spec/chapter-3/#G25539)
135 | //! and "are designated for surrogate code units in the UTF-16 character
136 | //! encoding form. They are unassigned to any abstract character." (Core Spec
137 | //! 3.2.1 C1,
138 | //! https://www.unicode.org/versions/Unicode17.0.0/core-spec/chapter-3/#G22599).
139 | //!
140 | //! * U+2028 LINE SEPARATOR (Zl) and U+2029 PARAGRAPH SEPARATOR (Zp) are
141 | //! treated as width 0. They introduce mandatory line/paragraph breaks (UAX
142 | //! #14, Line_Break=BK, https://www.unicode.org/reports/tr14/#BK) and do not
143 | //! advance horizontally on the same line.
144 | //!
145 | //! * Emoji modifiers (Fitzpatrick skin tone modifiers U+1F3FB..U+1F3FF) have
146 | //! `wcwidth_standalone` = 2, as when standing alone they render as fullwidth
147 | //!   colored squares (and are marked East_Asian_Width=W). However,
148 | //! `wcwidth_zero_in_grapheme` is true, as they are typically used to modify a
149 | //! base emoji which contributes the width.
150 | //!
151 |
152 | const std = @import("std");
153 | const config = @import("config.zig");
154 |
155 | fn compute(
156 | allocator: std.mem.Allocator,
157 | cp: u21,
158 | data: anytype,
159 | backing: anytype,
160 | tracking: anytype,
161 | ) std.mem.Allocator.Error!void {
162 | _ = allocator;
163 | _ = backing;
164 | _ = tracking;
165 | const gc = data.general_category;
166 |
167 | var width: u2 = undefined;
168 |
169 | if (gc == .other_control or
170 | gc == .other_surrogate or
171 | gc == .separator_line or
172 | gc == .separator_paragraph)
173 | {
174 | width = 0;
175 | } else if (cp == 0x00AD) { // Soft hyphen
176 | width = 1;
177 | } else if (data.is_default_ignorable) {
178 | width = 0;
179 | } else if (cp == 0x2E3A) { // Two-em dash
180 | width = 2;
181 | } else if (cp == 0x2E3B) { // Three-em dash
182 | width = 3;
183 | } else if (data.east_asian_width == .wide or data.east_asian_width == .fullwidth) {
184 | width = 2;
185 | } else if (data.grapheme_break == .regional_indicator) {
186 | width = 2;
187 | } else {
188 | width = 1;
189 | }
190 |
191 | const Data = @TypeOf(data.*);
192 | if (@hasField(Data, "wcwidth_standalone")) {
193 | if (cp == 0x20E3) { // Combining enclosing keycap
194 | data.wcwidth_standalone = 2;
195 | } else {
196 | data.wcwidth_standalone = width;
197 | }
198 | }
199 | if (@hasField(Data, "wcwidth_zero_in_grapheme")) {
200 | if (width == 0 or // Includes default_ignorable such as ZWJ and VS
201 | data.is_emoji_modifier or
202 | gc == .mark_nonspacing or
203 | gc == .mark_enclosing or // Including keycap
204 | data.grapheme_break == .v or // Hangul Jamo and Kirat Rai vowels
205 | data.grapheme_break == .t or // Hangul trailing consonants
206 | data.grapheme_break == .prepend // e.g. Indic Rephas
207 | ) {
208 | data.wcwidth_zero_in_grapheme = true;
209 | } else {
210 | data.wcwidth_zero_in_grapheme = false;
211 | }
212 | }
213 | }
214 |
215 | pub const wcwidth = config.Extension{
216 | .inputs = &.{
217 | "east_asian_width",
218 | "general_category",
219 | "grapheme_break",
220 | "is_default_ignorable",
221 | "is_emoji_modifier",
222 | },
223 | .compute = &compute,
224 | .fields = &.{
225 | .{ .name = "wcwidth_standalone", .type = u2 },
226 | .{ .name = "wcwidth_zero_in_grapheme", .type = bool },
227 | },
228 | };
229 |
--------------------------------------------------------------------------------
/ucd/Blocks.txt:
--------------------------------------------------------------------------------
1 | # Blocks-16.0.0.txt
2 | # Date: 2024-02-02
3 | # © 2024 Unicode®, Inc.
4 | # Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries.
5 | # For terms of use and license, see https://www.unicode.org/terms_of_use.html
6 | #
7 | # Unicode Character Database
8 | # For documentation, see https://www.unicode.org/reports/tr44/
9 | #
10 | # Format:
11 | # Start Code..End Code; Block Name
12 |
13 | # ================================================
14 |
15 | # Note: When comparing block names, casing, whitespace, hyphens,
16 | # and underbars are ignored.
17 | # For example, "Latin Extended-A" and "latin extended a" are equivalent.
18 | # For more information on the comparison of property values,
19 | # see UAX #44: https://www.unicode.org/reports/tr44/
20 | #
21 | # All block ranges start with a value where (cp MOD 16) = 0,
22 | # and end with a value where (cp MOD 16) = 15. In other words,
23 | # the last hexadecimal digit of the start of range is ...0
24 | # and the last hexadecimal digit of the end of range is ...F.
25 | # This constraint on block ranges guarantees that allocations
26 | # are done in terms of whole columns, and that code chart display
27 | # never involves splitting columns in the charts.
28 | #
29 | # All code points not explicitly listed for Block
30 | # have the value No_Block.
31 |
32 | # Property: Block
33 | #
34 | # @missing: 0000..10FFFF; No_Block
35 |
36 | 0000..007F; Basic Latin
37 | 0080..00FF; Latin-1 Supplement
38 | 0100..017F; Latin Extended-A
39 | 0180..024F; Latin Extended-B
40 | 0250..02AF; IPA Extensions
41 | 02B0..02FF; Spacing Modifier Letters
42 | 0300..036F; Combining Diacritical Marks
43 | 0370..03FF; Greek and Coptic
44 | 0400..04FF; Cyrillic
45 | 0500..052F; Cyrillic Supplement
46 | 0530..058F; Armenian
47 | 0590..05FF; Hebrew
48 | 0600..06FF; Arabic
49 | 0700..074F; Syriac
50 | 0750..077F; Arabic Supplement
51 | 0780..07BF; Thaana
52 | 07C0..07FF; NKo
53 | 0800..083F; Samaritan
54 | 0840..085F; Mandaic
55 | 0860..086F; Syriac Supplement
56 | 0870..089F; Arabic Extended-B
57 | 08A0..08FF; Arabic Extended-A
58 | 0900..097F; Devanagari
59 | 0980..09FF; Bengali
60 | 0A00..0A7F; Gurmukhi
61 | 0A80..0AFF; Gujarati
62 | 0B00..0B7F; Oriya
63 | 0B80..0BFF; Tamil
64 | 0C00..0C7F; Telugu
65 | 0C80..0CFF; Kannada
66 | 0D00..0D7F; Malayalam
67 | 0D80..0DFF; Sinhala
68 | 0E00..0E7F; Thai
69 | 0E80..0EFF; Lao
70 | 0F00..0FFF; Tibetan
71 | 1000..109F; Myanmar
72 | 10A0..10FF; Georgian
73 | 1100..11FF; Hangul Jamo
74 | 1200..137F; Ethiopic
75 | 1380..139F; Ethiopic Supplement
76 | 13A0..13FF; Cherokee
77 | 1400..167F; Unified Canadian Aboriginal Syllabics
78 | 1680..169F; Ogham
79 | 16A0..16FF; Runic
80 | 1700..171F; Tagalog
81 | 1720..173F; Hanunoo
82 | 1740..175F; Buhid
83 | 1760..177F; Tagbanwa
84 | 1780..17FF; Khmer
85 | 1800..18AF; Mongolian
86 | 18B0..18FF; Unified Canadian Aboriginal Syllabics Extended
87 | 1900..194F; Limbu
88 | 1950..197F; Tai Le
89 | 1980..19DF; New Tai Lue
90 | 19E0..19FF; Khmer Symbols
91 | 1A00..1A1F; Buginese
92 | 1A20..1AAF; Tai Tham
93 | 1AB0..1AFF; Combining Diacritical Marks Extended
94 | 1B00..1B7F; Balinese
95 | 1B80..1BBF; Sundanese
96 | 1BC0..1BFF; Batak
97 | 1C00..1C4F; Lepcha
98 | 1C50..1C7F; Ol Chiki
99 | 1C80..1C8F; Cyrillic Extended-C
100 | 1C90..1CBF; Georgian Extended
101 | 1CC0..1CCF; Sundanese Supplement
102 | 1CD0..1CFF; Vedic Extensions
103 | 1D00..1D7F; Phonetic Extensions
104 | 1D80..1DBF; Phonetic Extensions Supplement
105 | 1DC0..1DFF; Combining Diacritical Marks Supplement
106 | 1E00..1EFF; Latin Extended Additional
107 | 1F00..1FFF; Greek Extended
108 | 2000..206F; General Punctuation
109 | 2070..209F; Superscripts and Subscripts
110 | 20A0..20CF; Currency Symbols
111 | 20D0..20FF; Combining Diacritical Marks for Symbols
112 | 2100..214F; Letterlike Symbols
113 | 2150..218F; Number Forms
114 | 2190..21FF; Arrows
115 | 2200..22FF; Mathematical Operators
116 | 2300..23FF; Miscellaneous Technical
117 | 2400..243F; Control Pictures
118 | 2440..245F; Optical Character Recognition
119 | 2460..24FF; Enclosed Alphanumerics
120 | 2500..257F; Box Drawing
121 | 2580..259F; Block Elements
122 | 25A0..25FF; Geometric Shapes
123 | 2600..26FF; Miscellaneous Symbols
124 | 2700..27BF; Dingbats
125 | 27C0..27EF; Miscellaneous Mathematical Symbols-A
126 | 27F0..27FF; Supplemental Arrows-A
127 | 2800..28FF; Braille Patterns
128 | 2900..297F; Supplemental Arrows-B
129 | 2980..29FF; Miscellaneous Mathematical Symbols-B
130 | 2A00..2AFF; Supplemental Mathematical Operators
131 | 2B00..2BFF; Miscellaneous Symbols and Arrows
132 | 2C00..2C5F; Glagolitic
133 | 2C60..2C7F; Latin Extended-C
134 | 2C80..2CFF; Coptic
135 | 2D00..2D2F; Georgian Supplement
136 | 2D30..2D7F; Tifinagh
137 | 2D80..2DDF; Ethiopic Extended
138 | 2DE0..2DFF; Cyrillic Extended-A
139 | 2E00..2E7F; Supplemental Punctuation
140 | 2E80..2EFF; CJK Radicals Supplement
141 | 2F00..2FDF; Kangxi Radicals
142 | 2FF0..2FFF; Ideographic Description Characters
143 | 3000..303F; CJK Symbols and Punctuation
144 | 3040..309F; Hiragana
145 | 30A0..30FF; Katakana
146 | 3100..312F; Bopomofo
147 | 3130..318F; Hangul Compatibility Jamo
148 | 3190..319F; Kanbun
149 | 31A0..31BF; Bopomofo Extended
150 | 31C0..31EF; CJK Strokes
151 | 31F0..31FF; Katakana Phonetic Extensions
152 | 3200..32FF; Enclosed CJK Letters and Months
153 | 3300..33FF; CJK Compatibility
154 | 3400..4DBF; CJK Unified Ideographs Extension A
155 | 4DC0..4DFF; Yijing Hexagram Symbols
156 | 4E00..9FFF; CJK Unified Ideographs
157 | A000..A48F; Yi Syllables
158 | A490..A4CF; Yi Radicals
159 | A4D0..A4FF; Lisu
160 | A500..A63F; Vai
161 | A640..A69F; Cyrillic Extended-B
162 | A6A0..A6FF; Bamum
163 | A700..A71F; Modifier Tone Letters
164 | A720..A7FF; Latin Extended-D
165 | A800..A82F; Syloti Nagri
166 | A830..A83F; Common Indic Number Forms
167 | A840..A87F; Phags-pa
168 | A880..A8DF; Saurashtra
169 | A8E0..A8FF; Devanagari Extended
170 | A900..A92F; Kayah Li
171 | A930..A95F; Rejang
172 | A960..A97F; Hangul Jamo Extended-A
173 | A980..A9DF; Javanese
174 | A9E0..A9FF; Myanmar Extended-B
175 | AA00..AA5F; Cham
176 | AA60..AA7F; Myanmar Extended-A
177 | AA80..AADF; Tai Viet
178 | AAE0..AAFF; Meetei Mayek Extensions
179 | AB00..AB2F; Ethiopic Extended-A
180 | AB30..AB6F; Latin Extended-E
181 | AB70..ABBF; Cherokee Supplement
182 | ABC0..ABFF; Meetei Mayek
183 | AC00..D7AF; Hangul Syllables
184 | D7B0..D7FF; Hangul Jamo Extended-B
185 | D800..DB7F; High Surrogates
186 | DB80..DBFF; High Private Use Surrogates
187 | DC00..DFFF; Low Surrogates
188 | E000..F8FF; Private Use Area
189 | F900..FAFF; CJK Compatibility Ideographs
190 | FB00..FB4F; Alphabetic Presentation Forms
191 | FB50..FDFF; Arabic Presentation Forms-A
192 | FE00..FE0F; Variation Selectors
193 | FE10..FE1F; Vertical Forms
194 | FE20..FE2F; Combining Half Marks
195 | FE30..FE4F; CJK Compatibility Forms
196 | FE50..FE6F; Small Form Variants
197 | FE70..FEFF; Arabic Presentation Forms-B
198 | FF00..FFEF; Halfwidth and Fullwidth Forms
199 | FFF0..FFFF; Specials
200 | 10000..1007F; Linear B Syllabary
201 | 10080..100FF; Linear B Ideograms
202 | 10100..1013F; Aegean Numbers
203 | 10140..1018F; Ancient Greek Numbers
204 | 10190..101CF; Ancient Symbols
205 | 101D0..101FF; Phaistos Disc
206 | 10280..1029F; Lycian
207 | 102A0..102DF; Carian
208 | 102E0..102FF; Coptic Epact Numbers
209 | 10300..1032F; Old Italic
210 | 10330..1034F; Gothic
211 | 10350..1037F; Old Permic
212 | 10380..1039F; Ugaritic
213 | 103A0..103DF; Old Persian
214 | 10400..1044F; Deseret
215 | 10450..1047F; Shavian
216 | 10480..104AF; Osmanya
217 | 104B0..104FF; Osage
218 | 10500..1052F; Elbasan
219 | 10530..1056F; Caucasian Albanian
220 | 10570..105BF; Vithkuqi
221 | 105C0..105FF; Todhri
222 | 10600..1077F; Linear A
223 | 10780..107BF; Latin Extended-F
224 | 10800..1083F; Cypriot Syllabary
225 | 10840..1085F; Imperial Aramaic
226 | 10860..1087F; Palmyrene
227 | 10880..108AF; Nabataean
228 | 108E0..108FF; Hatran
229 | 10900..1091F; Phoenician
230 | 10920..1093F; Lydian
231 | 10980..1099F; Meroitic Hieroglyphs
232 | 109A0..109FF; Meroitic Cursive
233 | 10A00..10A5F; Kharoshthi
234 | 10A60..10A7F; Old South Arabian
235 | 10A80..10A9F; Old North Arabian
236 | 10AC0..10AFF; Manichaean
237 | 10B00..10B3F; Avestan
238 | 10B40..10B5F; Inscriptional Parthian
239 | 10B60..10B7F; Inscriptional Pahlavi
240 | 10B80..10BAF; Psalter Pahlavi
241 | 10C00..10C4F; Old Turkic
242 | 10C80..10CFF; Old Hungarian
243 | 10D00..10D3F; Hanifi Rohingya
244 | 10D40..10D8F; Garay
245 | 10E60..10E7F; Rumi Numeral Symbols
246 | 10E80..10EBF; Yezidi
247 | 10EC0..10EFF; Arabic Extended-C
248 | 10F00..10F2F; Old Sogdian
249 | 10F30..10F6F; Sogdian
250 | 10F70..10FAF; Old Uyghur
251 | 10FB0..10FDF; Chorasmian
252 | 10FE0..10FFF; Elymaic
253 | 11000..1107F; Brahmi
254 | 11080..110CF; Kaithi
255 | 110D0..110FF; Sora Sompeng
256 | 11100..1114F; Chakma
257 | 11150..1117F; Mahajani
258 | 11180..111DF; Sharada
259 | 111E0..111FF; Sinhala Archaic Numbers
260 | 11200..1124F; Khojki
261 | 11280..112AF; Multani
262 | 112B0..112FF; Khudawadi
263 | 11300..1137F; Grantha
264 | 11380..113FF; Tulu-Tigalari
265 | 11400..1147F; Newa
266 | 11480..114DF; Tirhuta
267 | 11580..115FF; Siddham
268 | 11600..1165F; Modi
269 | 11660..1167F; Mongolian Supplement
270 | 11680..116CF; Takri
271 | 116D0..116FF; Myanmar Extended-C
272 | 11700..1174F; Ahom
273 | 11800..1184F; Dogra
274 | 118A0..118FF; Warang Citi
275 | 11900..1195F; Dives Akuru
276 | 119A0..119FF; Nandinagari
277 | 11A00..11A4F; Zanabazar Square
278 | 11A50..11AAF; Soyombo
279 | 11AB0..11ABF; Unified Canadian Aboriginal Syllabics Extended-A
280 | 11AC0..11AFF; Pau Cin Hau
281 | 11B00..11B5F; Devanagari Extended-A
282 | 11BC0..11BFF; Sunuwar
283 | 11C00..11C6F; Bhaiksuki
284 | 11C70..11CBF; Marchen
285 | 11D00..11D5F; Masaram Gondi
286 | 11D60..11DAF; Gunjala Gondi
287 | 11EE0..11EFF; Makasar
288 | 11F00..11F5F; Kawi
289 | 11FB0..11FBF; Lisu Supplement
290 | 11FC0..11FFF; Tamil Supplement
291 | 12000..123FF; Cuneiform
292 | 12400..1247F; Cuneiform Numbers and Punctuation
293 | 12480..1254F; Early Dynastic Cuneiform
294 | 12F90..12FFF; Cypro-Minoan
295 | 13000..1342F; Egyptian Hieroglyphs
296 | 13430..1345F; Egyptian Hieroglyph Format Controls
297 | 13460..143FF; Egyptian Hieroglyphs Extended-A
298 | 14400..1467F; Anatolian Hieroglyphs
299 | 16100..1613F; Gurung Khema
300 | 16800..16A3F; Bamum Supplement
301 | 16A40..16A6F; Mro
302 | 16A70..16ACF; Tangsa
303 | 16AD0..16AFF; Bassa Vah
304 | 16B00..16B8F; Pahawh Hmong
305 | 16D40..16D7F; Kirat Rai
306 | 16E40..16E9F; Medefaidrin
307 | 16F00..16F9F; Miao
308 | 16FE0..16FFF; Ideographic Symbols and Punctuation
309 | 17000..187FF; Tangut
310 | 18800..18AFF; Tangut Components
311 | 18B00..18CFF; Khitan Small Script
312 | 18D00..18D7F; Tangut Supplement
313 | 1AFF0..1AFFF; Kana Extended-B
314 | 1B000..1B0FF; Kana Supplement
315 | 1B100..1B12F; Kana Extended-A
316 | 1B130..1B16F; Small Kana Extension
317 | 1B170..1B2FF; Nushu
318 | 1BC00..1BC9F; Duployan
319 | 1BCA0..1BCAF; Shorthand Format Controls
320 | 1CC00..1CEBF; Symbols for Legacy Computing Supplement
321 | 1CF00..1CFCF; Znamenny Musical Notation
322 | 1D000..1D0FF; Byzantine Musical Symbols
323 | 1D100..1D1FF; Musical Symbols
324 | 1D200..1D24F; Ancient Greek Musical Notation
325 | 1D2C0..1D2DF; Kaktovik Numerals
326 | 1D2E0..1D2FF; Mayan Numerals
327 | 1D300..1D35F; Tai Xuan Jing Symbols
328 | 1D360..1D37F; Counting Rod Numerals
329 | 1D400..1D7FF; Mathematical Alphanumeric Symbols
330 | 1D800..1DAAF; Sutton SignWriting
331 | 1DF00..1DFFF; Latin Extended-G
332 | 1E000..1E02F; Glagolitic Supplement
333 | 1E030..1E08F; Cyrillic Extended-D
334 | 1E100..1E14F; Nyiakeng Puachue Hmong
335 | 1E290..1E2BF; Toto
336 | 1E2C0..1E2FF; Wancho
337 | 1E4D0..1E4FF; Nag Mundari
338 | 1E5D0..1E5FF; Ol Onal
339 | 1E7E0..1E7FF; Ethiopic Extended-B
340 | 1E800..1E8DF; Mende Kikakui
341 | 1E900..1E95F; Adlam
342 | 1EC70..1ECBF; Indic Siyaq Numbers
343 | 1ED00..1ED4F; Ottoman Siyaq Numbers
344 | 1EE00..1EEFF; Arabic Mathematical Alphabetic Symbols
345 | 1F000..1F02F; Mahjong Tiles
346 | 1F030..1F09F; Domino Tiles
347 | 1F0A0..1F0FF; Playing Cards
348 | 1F100..1F1FF; Enclosed Alphanumeric Supplement
349 | 1F200..1F2FF; Enclosed Ideographic Supplement
350 | 1F300..1F5FF; Miscellaneous Symbols and Pictographs
351 | 1F600..1F64F; Emoticons
352 | 1F650..1F67F; Ornamental Dingbats
353 | 1F680..1F6FF; Transport and Map Symbols
354 | 1F700..1F77F; Alchemical Symbols
355 | 1F780..1F7FF; Geometric Shapes Extended
356 | 1F800..1F8FF; Supplemental Arrows-C
357 | 1F900..1F9FF; Supplemental Symbols and Pictographs
358 | 1FA00..1FA6F; Chess Symbols
359 | 1FA70..1FAFF; Symbols and Pictographs Extended-A
360 | 1FB00..1FBFF; Symbols for Legacy Computing
361 | 20000..2A6DF; CJK Unified Ideographs Extension B
362 | 2A700..2B73F; CJK Unified Ideographs Extension C
363 | 2B740..2B81F; CJK Unified Ideographs Extension D
364 | 2B820..2CEAF; CJK Unified Ideographs Extension E
365 | 2CEB0..2EBEF; CJK Unified Ideographs Extension F
366 | 2EBF0..2EE5F; CJK Unified Ideographs Extension I
367 | 2F800..2FA1F; CJK Compatibility Ideographs Supplement
368 | 30000..3134F; CJK Unified Ideographs Extension G
369 | 31350..323AF; CJK Unified Ideographs Extension H
370 | E0000..E007F; Tags
371 | E0100..E01EF; Variation Selectors Supplement
372 | F0000..FFFFF; Supplementary Private Use Area-A
373 | 100000..10FFFF; Supplementary Private Use Area-B
374 |
375 | # EOF
376 |
--------------------------------------------------------------------------------
/resources/wcwidth/wcwidth.c:
--------------------------------------------------------------------------------
1 | /*
2 | * TODO: add ability to track arbitrary url files in `cpv`
3 | * From: https://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c
4 | *
5 | * This is an implementation of wcwidth() and wcswidth() (defined in
6 | * IEEE Std 1002.1-2001) for Unicode.
7 | *
8 | * http://www.opengroup.org/onlinepubs/007904975/functions/wcwidth.html
9 | * http://www.opengroup.org/onlinepubs/007904975/functions/wcswidth.html
10 | *
11 | * In fixed-width output devices, Latin characters all occupy a single
12 | * "cell" position of equal width, whereas ideographic CJK characters
13 | * occupy two such cells. Interoperability between terminal-line
14 | * applications and (teletype-style) character terminals using the
15 | * UTF-8 encoding requires agreement on which character should advance
16 | * the cursor by how many cell positions. No established formal
17 | * standards exist at present on which Unicode character shall occupy
18 | * how many cell positions on character terminals. These routines are
19 | * a first attempt of defining such behavior based on simple rules
20 | * applied to data provided by the Unicode Consortium.
21 | *
22 | * For some graphical characters, the Unicode standard explicitly
23 | * defines a character-cell width via the definition of the East Asian
24 | * FullWidth (F), Wide (W), Half-width (H), and Narrow (Na) classes.
25 | * In all these cases, there is no ambiguity about which width a
26 | * terminal shall use. For characters in the East Asian Ambiguous (A)
27 | * class, the width choice depends purely on a preference of backward
28 | * compatibility with either historic CJK or Western practice.
29 | * Choosing single-width for these characters is easy to justify as
30 | * the appropriate long-term solution, as the CJK practice of
31 | * displaying these characters as double-width comes from historic
32 | * implementation simplicity (8-bit encoded characters were displayed
33 | * single-width and 16-bit ones double-width, even for Greek,
34 | * Cyrillic, etc.) and not any typographic considerations.
35 | *
36 | * Much less clear is the choice of width for the Not East Asian
37 | * (Neutral) class. Existing practice does not dictate a width for any
38 | * of these characters. It would nevertheless make sense
39 | * typographically to allocate two character cells to characters such
40 | * as for instance EM SPACE or VOLUME INTEGRAL, which cannot be
41 | * represented adequately with a single-width glyph. The following
42 | * routines at present merely assign a single-cell width to all
43 | * neutral characters, in the interest of simplicity. This is not
44 | * entirely satisfactory and should be reconsidered before
45 | * establishing a formal standard in this area. At the moment, the
46 | * decision which Not East Asian (Neutral) characters should be
47 | * represented by double-width glyphs cannot yet be answered by
48 | * applying a simple rule from the Unicode database content. Setting
49 | * up a proper standard for the behavior of UTF-8 character terminals
50 | * will require a careful analysis not only of each Unicode character,
51 | * but also of each presentation form, something the author of these
52 | * routines has avoided to do so far.
53 | *
54 | * http://www.unicode.org/unicode/reports/tr11/
55 | *
56 | * Markus Kuhn -- 2007-05-26 (Unicode 5.0)
57 | *
58 | * Permission to use, copy, modify, and distribute this software
59 | * for any purpose and without fee is hereby granted. The author
60 | * disclaims all warranties with regard to this software.
61 | *
62 | * Latest version: http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c
63 | */
64 |
65 | #include <wchar.h>
66 |
67 | struct interval {
68 | int first;
69 | int last;
70 | };
71 |
72 | /* auxiliary function for binary search in interval table */
73 | static int bisearch(wchar_t ucs, const struct interval *table, int max) {
74 | int min = 0;
75 | int mid;
76 |
77 | if (ucs < table[0].first || ucs > table[max].last)
78 | return 0;
79 | while (max >= min) {
80 | mid = (min + max) / 2;
81 | if (ucs > table[mid].last)
82 | min = mid + 1;
83 | else if (ucs < table[mid].first)
84 | max = mid - 1;
85 | else
86 | return 1;
87 | }
88 |
89 | return 0;
90 | }
91 |
92 | /* The following two functions define the column width of an ISO 10646
93 | * character as follows:
94 | *
95 | * - The null character (U+0000) has a column width of 0.
96 | *
97 | * - Other C0/C1 control characters and DEL will lead to a return
98 | * value of -1.
99 | *
100 | * - Non-spacing and enclosing combining characters (general
101 | * category code Mn or Me in the Unicode database) have a
102 | * column width of 0.
103 | *
104 | * - SOFT HYPHEN (U+00AD) has a column width of 1.
105 | *
106 | * - Other format characters (general category code Cf in the Unicode
107 | * database) and ZERO WIDTH SPACE (U+200B) have a column width of 0.
108 | *
109 | * - Hangul Jamo medial vowels and final consonants (U+1160-U+11FF)
110 | * have a column width of 0.
111 | *
112 | * - Spacing characters in the East Asian Wide (W) or East Asian
113 | * Full-width (F) category as defined in Unicode Technical
114 | * Report #11 have a column width of 2.
115 | *
116 | * - All remaining characters (including all printable
117 | * ISO 8859-1 and WGL4 characters, Unicode control characters,
118 | * etc.) have a column width of 1.
119 | *
120 | * This implementation assumes that wchar_t characters are encoded
121 | * in ISO 10646.
122 | */
123 |
124 | int mk_wcwidth(wchar_t ucs) {
125 | /* sorted list of non-overlapping intervals of non-spacing characters */
126 | /* generated by "uniset +cat=Me +cat=Mn +cat=Cf -00AD +1160-11FF +200B c" */
127 | static const struct interval combining[] = {
128 | {0x0300, 0x036F}, {0x0483, 0x0486}, {0x0488, 0x0489},
129 | {0x0591, 0x05BD}, {0x05BF, 0x05BF}, {0x05C1, 0x05C2},
130 | {0x05C4, 0x05C5}, {0x05C7, 0x05C7}, {0x0600, 0x0603},
131 | {0x0610, 0x0615}, {0x064B, 0x065E}, {0x0670, 0x0670},
132 | {0x06D6, 0x06E4}, {0x06E7, 0x06E8}, {0x06EA, 0x06ED},
133 | {0x070F, 0x070F}, {0x0711, 0x0711}, {0x0730, 0x074A},
134 | {0x07A6, 0x07B0}, {0x07EB, 0x07F3}, {0x0901, 0x0902},
135 | {0x093C, 0x093C}, {0x0941, 0x0948}, {0x094D, 0x094D},
136 | {0x0951, 0x0954}, {0x0962, 0x0963}, {0x0981, 0x0981},
137 | {0x09BC, 0x09BC}, {0x09C1, 0x09C4}, {0x09CD, 0x09CD},
138 | {0x09E2, 0x09E3}, {0x0A01, 0x0A02}, {0x0A3C, 0x0A3C},
139 | {0x0A41, 0x0A42}, {0x0A47, 0x0A48}, {0x0A4B, 0x0A4D},
140 | {0x0A70, 0x0A71}, {0x0A81, 0x0A82}, {0x0ABC, 0x0ABC},
141 | {0x0AC1, 0x0AC5}, {0x0AC7, 0x0AC8}, {0x0ACD, 0x0ACD},
142 | {0x0AE2, 0x0AE3}, {0x0B01, 0x0B01}, {0x0B3C, 0x0B3C},
143 | {0x0B3F, 0x0B3F}, {0x0B41, 0x0B43}, {0x0B4D, 0x0B4D},
144 | {0x0B56, 0x0B56}, {0x0B82, 0x0B82}, {0x0BC0, 0x0BC0},
145 | {0x0BCD, 0x0BCD}, {0x0C3E, 0x0C40}, {0x0C46, 0x0C48},
146 | {0x0C4A, 0x0C4D}, {0x0C55, 0x0C56}, {0x0CBC, 0x0CBC},
147 | {0x0CBF, 0x0CBF}, {0x0CC6, 0x0CC6}, {0x0CCC, 0x0CCD},
148 | {0x0CE2, 0x0CE3}, {0x0D41, 0x0D43}, {0x0D4D, 0x0D4D},
149 | {0x0DCA, 0x0DCA}, {0x0DD2, 0x0DD4}, {0x0DD6, 0x0DD6},
150 | {0x0E31, 0x0E31}, {0x0E34, 0x0E3A}, {0x0E47, 0x0E4E},
151 | {0x0EB1, 0x0EB1}, {0x0EB4, 0x0EB9}, {0x0EBB, 0x0EBC},
152 | {0x0EC8, 0x0ECD}, {0x0F18, 0x0F19}, {0x0F35, 0x0F35},
153 | {0x0F37, 0x0F37}, {0x0F39, 0x0F39}, {0x0F71, 0x0F7E},
154 | {0x0F80, 0x0F84}, {0x0F86, 0x0F87}, {0x0F90, 0x0F97},
155 | {0x0F99, 0x0FBC}, {0x0FC6, 0x0FC6}, {0x102D, 0x1030},
156 | {0x1032, 0x1032}, {0x1036, 0x1037}, {0x1039, 0x1039},
157 | {0x1058, 0x1059}, {0x1160, 0x11FF}, {0x135F, 0x135F},
158 | {0x1712, 0x1714}, {0x1732, 0x1734}, {0x1752, 0x1753},
159 | {0x1772, 0x1773}, {0x17B4, 0x17B5}, {0x17B7, 0x17BD},
160 | {0x17C6, 0x17C6}, {0x17C9, 0x17D3}, {0x17DD, 0x17DD},
161 | {0x180B, 0x180D}, {0x18A9, 0x18A9}, {0x1920, 0x1922},
162 | {0x1927, 0x1928}, {0x1932, 0x1932}, {0x1939, 0x193B},
163 | {0x1A17, 0x1A18}, {0x1B00, 0x1B03}, {0x1B34, 0x1B34},
164 | {0x1B36, 0x1B3A}, {0x1B3C, 0x1B3C}, {0x1B42, 0x1B42},
165 | {0x1B6B, 0x1B73}, {0x1DC0, 0x1DCA}, {0x1DFE, 0x1DFF},
166 | {0x200B, 0x200F}, {0x202A, 0x202E}, {0x2060, 0x2063},
167 | {0x206A, 0x206F}, {0x20D0, 0x20EF}, {0x302A, 0x302F},
168 | {0x3099, 0x309A}, {0xA806, 0xA806}, {0xA80B, 0xA80B},
169 | {0xA825, 0xA826}, {0xFB1E, 0xFB1E}, {0xFE00, 0xFE0F},
170 | {0xFE20, 0xFE23}, {0xFEFF, 0xFEFF}, {0xFFF9, 0xFFFB},
171 | {0x10A01, 0x10A03}, {0x10A05, 0x10A06}, {0x10A0C, 0x10A0F},
172 | {0x10A38, 0x10A3A}, {0x10A3F, 0x10A3F}, {0x1D167, 0x1D169},
173 | {0x1D173, 0x1D182}, {0x1D185, 0x1D18B}, {0x1D1AA, 0x1D1AD},
174 | {0x1D242, 0x1D244}, {0xE0001, 0xE0001}, {0xE0020, 0xE007F},
175 | {0xE0100, 0xE01EF}};
176 |
177 | /* test for 8-bit control characters */
178 | if (ucs == 0)
179 | return 0;
180 | if (ucs < 32 || (ucs >= 0x7f && ucs < 0xa0))
181 | return -1;
182 |
183 | /* binary search in table of non-spacing characters */
184 | if (bisearch(ucs, combining, sizeof(combining) / sizeof(struct interval) - 1))
185 | return 0;
186 |
187 | /* if we arrive here, ucs is not a combining or C0/C1 control character */
188 |
189 | return 1 +
190 | (ucs >= 0x1100 &&
191 | (ucs <= 0x115f || /* Hangul Jamo init. consonants */
192 | ucs == 0x2329 || ucs == 0x232a ||
193 | (ucs >= 0x2e80 && ucs <= 0xa4cf && ucs != 0x303f) || /* CJK ... Yi */
194 | (ucs >= 0xac00 && ucs <= 0xd7a3) || /* Hangul Syllables */
195 | (ucs >= 0xf900 &&
196 | ucs <= 0xfaff) || /* CJK Compatibility Ideographs */
197 | (ucs >= 0xfe10 && ucs <= 0xfe19) || /* Vertical forms */
198 | (ucs >= 0xfe30 && ucs <= 0xfe6f) || /* CJK Compatibility Forms */
199 | (ucs >= 0xff00 && ucs <= 0xff60) || /* Fullwidth Forms */
200 | (ucs >= 0xffe0 && ucs <= 0xffe6) ||
201 | (ucs >= 0x20000 && ucs <= 0x2fffd) ||
202 | (ucs >= 0x30000 && ucs <= 0x3fffd)));
203 | }
204 |
205 | int mk_wcswidth(const wchar_t *pwcs, size_t n) {
206 | int w, width = 0;
207 |
208 | for (; *pwcs && n-- > 0; pwcs++)
209 | if ((w = mk_wcwidth(*pwcs)) < 0)
210 | return -1;
211 | else
212 | width += w;
213 |
214 | return width;
215 | }
216 |
217 | /*
218 | * The following functions are the same as mk_wcwidth() and
219 | * mk_wcswidth(), except that spacing characters in the East Asian
220 | * Ambiguous (A) category as defined in Unicode Technical Report #11
221 | * have a column width of 2. This variant might be useful for users of
222 | * CJK legacy encodings who want to migrate to UCS without changing
223 | * the traditional terminal character-width behaviour. It is not
224 | * otherwise recommended for general use.
225 | */
226 | int mk_wcwidth_cjk(wchar_t ucs) {
227 | /* sorted list of non-overlapping intervals of East Asian Ambiguous
228 | * characters, generated by "uniset +WIDTH-A -cat=Me -cat=Mn -cat=Cf c" */
229 | static const struct interval ambiguous[] = {
230 | {0x00A1, 0x00A1}, {0x00A4, 0x00A4}, {0x00A7, 0x00A8},
231 | {0x00AA, 0x00AA}, {0x00AE, 0x00AE}, {0x00B0, 0x00B4},
232 | {0x00B6, 0x00BA}, {0x00BC, 0x00BF}, {0x00C6, 0x00C6},
233 | {0x00D0, 0x00D0}, {0x00D7, 0x00D8}, {0x00DE, 0x00E1},
234 | {0x00E6, 0x00E6}, {0x00E8, 0x00EA}, {0x00EC, 0x00ED},
235 | {0x00F0, 0x00F0}, {0x00F2, 0x00F3}, {0x00F7, 0x00FA},
236 | {0x00FC, 0x00FC}, {0x00FE, 0x00FE}, {0x0101, 0x0101},
237 | {0x0111, 0x0111}, {0x0113, 0x0113}, {0x011B, 0x011B},
238 | {0x0126, 0x0127}, {0x012B, 0x012B}, {0x0131, 0x0133},
239 | {0x0138, 0x0138}, {0x013F, 0x0142}, {0x0144, 0x0144},
240 | {0x0148, 0x014B}, {0x014D, 0x014D}, {0x0152, 0x0153},
241 | {0x0166, 0x0167}, {0x016B, 0x016B}, {0x01CE, 0x01CE},
242 | {0x01D0, 0x01D0}, {0x01D2, 0x01D2}, {0x01D4, 0x01D4},
243 | {0x01D6, 0x01D6}, {0x01D8, 0x01D8}, {0x01DA, 0x01DA},
244 | {0x01DC, 0x01DC}, {0x0251, 0x0251}, {0x0261, 0x0261},
245 | {0x02C4, 0x02C4}, {0x02C7, 0x02C7}, {0x02C9, 0x02CB},
246 | {0x02CD, 0x02CD}, {0x02D0, 0x02D0}, {0x02D8, 0x02DB},
247 | {0x02DD, 0x02DD}, {0x02DF, 0x02DF}, {0x0391, 0x03A1},
248 | {0x03A3, 0x03A9}, {0x03B1, 0x03C1}, {0x03C3, 0x03C9},
249 | {0x0401, 0x0401}, {0x0410, 0x044F}, {0x0451, 0x0451},
250 | {0x2010, 0x2010}, {0x2013, 0x2016}, {0x2018, 0x2019},
251 | {0x201C, 0x201D}, {0x2020, 0x2022}, {0x2024, 0x2027},
252 | {0x2030, 0x2030}, {0x2032, 0x2033}, {0x2035, 0x2035},
253 | {0x203B, 0x203B}, {0x203E, 0x203E}, {0x2074, 0x2074},
254 | {0x207F, 0x207F}, {0x2081, 0x2084}, {0x20AC, 0x20AC},
255 | {0x2103, 0x2103}, {0x2105, 0x2105}, {0x2109, 0x2109},
256 | {0x2113, 0x2113}, {0x2116, 0x2116}, {0x2121, 0x2122},
257 | {0x2126, 0x2126}, {0x212B, 0x212B}, {0x2153, 0x2154},
258 | {0x215B, 0x215E}, {0x2160, 0x216B}, {0x2170, 0x2179},
259 | {0x2190, 0x2199}, {0x21B8, 0x21B9}, {0x21D2, 0x21D2},
260 | {0x21D4, 0x21D4}, {0x21E7, 0x21E7}, {0x2200, 0x2200},
261 | {0x2202, 0x2203}, {0x2207, 0x2208}, {0x220B, 0x220B},
262 | {0x220F, 0x220F}, {0x2211, 0x2211}, {0x2215, 0x2215},
263 | {0x221A, 0x221A}, {0x221D, 0x2220}, {0x2223, 0x2223},
264 | {0x2225, 0x2225}, {0x2227, 0x222C}, {0x222E, 0x222E},
265 | {0x2234, 0x2237}, {0x223C, 0x223D}, {0x2248, 0x2248},
266 | {0x224C, 0x224C}, {0x2252, 0x2252}, {0x2260, 0x2261},
267 | {0x2264, 0x2267}, {0x226A, 0x226B}, {0x226E, 0x226F},
268 | {0x2282, 0x2283}, {0x2286, 0x2287}, {0x2295, 0x2295},
269 | {0x2299, 0x2299}, {0x22A5, 0x22A5}, {0x22BF, 0x22BF},
270 | {0x2312, 0x2312}, {0x2460, 0x24E9}, {0x24EB, 0x254B},
271 | {0x2550, 0x2573}, {0x2580, 0x258F}, {0x2592, 0x2595},
272 | {0x25A0, 0x25A1}, {0x25A3, 0x25A9}, {0x25B2, 0x25B3},
273 | {0x25B6, 0x25B7}, {0x25BC, 0x25BD}, {0x25C0, 0x25C1},
274 | {0x25C6, 0x25C8}, {0x25CB, 0x25CB}, {0x25CE, 0x25D1},
275 | {0x25E2, 0x25E5}, {0x25EF, 0x25EF}, {0x2605, 0x2606},
276 | {0x2609, 0x2609}, {0x260E, 0x260F}, {0x2614, 0x2615},
277 | {0x261C, 0x261C}, {0x261E, 0x261E}, {0x2640, 0x2640},
278 | {0x2642, 0x2642}, {0x2660, 0x2661}, {0x2663, 0x2665},
279 | {0x2667, 0x266A}, {0x266C, 0x266D}, {0x266F, 0x266F},
280 | {0x273D, 0x273D}, {0x2776, 0x277F}, {0xE000, 0xF8FF},
281 | {0xFFFD, 0xFFFD}, {0xF0000, 0xFFFFD}, {0x100000, 0x10FFFD}};
282 |
283 | /* binary search in table of non-spacing characters */
284 | if (bisearch(ucs, ambiguous, sizeof(ambiguous) / sizeof(struct interval) - 1))
285 | return 2;
286 |
287 | return mk_wcwidth(ucs);
288 | }
289 |
290 | int mk_wcswidth_cjk(const wchar_t *pwcs, size_t n) {
291 | int w, width = 0;
292 |
293 | for (; *pwcs && n-- > 0; pwcs++)
294 | if ((w = mk_wcwidth_cjk(*pwcs)) < 0)
295 | return -1;
296 | else
297 | width += w;
298 |
299 | return width;
300 | }
301 |
--------------------------------------------------------------------------------
/ucd/SpecialCasing.txt:
--------------------------------------------------------------------------------
1 | # SpecialCasing-16.0.0.txt
2 | # Date: 2024-05-10, 22:49:00 GMT
3 | # © 2024 Unicode®, Inc.
4 | # Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries.
5 | # For terms of use and license, see https://www.unicode.org/terms_of_use.html
6 | #
7 | # Unicode Character Database
8 | # For documentation, see https://www.unicode.org/reports/tr44/
9 | #
10 | # Special Casing
11 | #
12 | # This file is a supplement to the UnicodeData.txt file. The data in this file, combined with
13 | # the simple case mappings in UnicodeData.txt, defines the full case mappings
14 | # Lowercase_Mapping (lc), Titlecase_Mapping (tc), and Uppercase_Mapping (uc).
15 | # For compatibility, the UnicodeData.txt file only contains simple case mappings
16 | # for characters where they are one-to-one (and independent of context and language).
17 | #
18 | # For historical reasons, this file also provides additional information about the casing
19 | # of Unicode characters for selected situations when casing is dependent on context or locale.
20 | #
21 | # Note that the preferred mechanism for defining tailored casing operations is
22 | # the Unicode Common Locale Data Repository (CLDR). For more information, see the
23 | # discussion of case mappings and case algorithms in the Unicode Standard.
24 | #
25 | # All code points not listed in this file that do not have simple case mappings
26 | # in UnicodeData.txt map to themselves.
27 | # ================================================================================
28 | # Format
29 | # ================================================================================
30 | # The entries in this file are in the following machine-readable format:
31 | #
32 | # <code>; <lower>; <title>; <upper>; (<condition_list>;)? # <comment>
33 | #
34 | # <code>, <lower>, <title>, and <upper> provide the respective full case mappings
35 | # of <code>, expressed as character values in hex. If there is more than one character,
36 | # they are separated by spaces. Other than as used to separate elements, spaces are
37 | # to be ignored.
38 | #
39 | # The <condition_list> is optional. Where present, it consists of one or more language IDs
40 | # or casing contexts, separated by spaces. In these conditions:
41 | # - A condition list overrides the normal behavior if all of the listed conditions are true.
42 | # - The casing context is always the context of the characters in the original string,
43 | # NOT in the resulting string.
44 | # - Case distinctions in the condition list are not significant.
45 | # - Conditions preceded by "Not_" represent the negation of the condition.
46 | # The condition list is not represented in the UCD as a formal property.
47 | #
48 | # A language ID is defined by BCP 47, with '-' and '_' treated equivalently.
49 | #
50 | # A casing context for a character is defined by Section 3.13 Default Case Algorithms
51 | # of The Unicode Standard.
52 | #
53 | # Parsers of this file must be prepared to deal with future additions to this format:
54 | # * Additional contexts
55 | # * Additional fields
56 | # ================================================================================
57 |
58 | # ================================================================================
59 | # Unconditional mappings
60 | # ================================================================================
61 |
62 | # The German es-zed is special--the normal mapping is to SS.
63 | # Note: the titlecase should never occur in practice. It is equal to titlecase(uppercase())
64 |
65 | 00DF; 00DF; 0053 0073; 0053 0053; # LATIN SMALL LETTER SHARP S
66 |
67 | # Preserve canonical equivalence for I with dot. Turkic is handled below.
68 |
69 | 0130; 0069 0307; 0130; 0130; # LATIN CAPITAL LETTER I WITH DOT ABOVE
70 |
71 | # Ligatures
72 |
73 | FB00; FB00; 0046 0066; 0046 0046; # LATIN SMALL LIGATURE FF
74 | FB01; FB01; 0046 0069; 0046 0049; # LATIN SMALL LIGATURE FI
75 | FB02; FB02; 0046 006C; 0046 004C; # LATIN SMALL LIGATURE FL
76 | FB03; FB03; 0046 0066 0069; 0046 0046 0049; # LATIN SMALL LIGATURE FFI
77 | FB04; FB04; 0046 0066 006C; 0046 0046 004C; # LATIN SMALL LIGATURE FFL
78 | FB05; FB05; 0053 0074; 0053 0054; # LATIN SMALL LIGATURE LONG S T
79 | FB06; FB06; 0053 0074; 0053 0054; # LATIN SMALL LIGATURE ST
80 |
81 | 0587; 0587; 0535 0582; 0535 0552; # ARMENIAN SMALL LIGATURE ECH YIWN
82 | FB13; FB13; 0544 0576; 0544 0546; # ARMENIAN SMALL LIGATURE MEN NOW
83 | FB14; FB14; 0544 0565; 0544 0535; # ARMENIAN SMALL LIGATURE MEN ECH
84 | FB15; FB15; 0544 056B; 0544 053B; # ARMENIAN SMALL LIGATURE MEN INI
85 | FB16; FB16; 054E 0576; 054E 0546; # ARMENIAN SMALL LIGATURE VEW NOW
86 | FB17; FB17; 0544 056D; 0544 053D; # ARMENIAN SMALL LIGATURE MEN XEH
87 |
88 | # No corresponding uppercase precomposed character
89 |
90 | 0149; 0149; 02BC 004E; 02BC 004E; # LATIN SMALL LETTER N PRECEDED BY APOSTROPHE
91 | 0390; 0390; 0399 0308 0301; 0399 0308 0301; # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
92 | 03B0; 03B0; 03A5 0308 0301; 03A5 0308 0301; # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
93 | 01F0; 01F0; 004A 030C; 004A 030C; # LATIN SMALL LETTER J WITH CARON
94 | 1E96; 1E96; 0048 0331; 0048 0331; # LATIN SMALL LETTER H WITH LINE BELOW
95 | 1E97; 1E97; 0054 0308; 0054 0308; # LATIN SMALL LETTER T WITH DIAERESIS
96 | 1E98; 1E98; 0057 030A; 0057 030A; # LATIN SMALL LETTER W WITH RING ABOVE
97 | 1E99; 1E99; 0059 030A; 0059 030A; # LATIN SMALL LETTER Y WITH RING ABOVE
98 | 1E9A; 1E9A; 0041 02BE; 0041 02BE; # LATIN SMALL LETTER A WITH RIGHT HALF RING
99 | 1F50; 1F50; 03A5 0313; 03A5 0313; # GREEK SMALL LETTER UPSILON WITH PSILI
100 | 1F52; 1F52; 03A5 0313 0300; 03A5 0313 0300; # GREEK SMALL LETTER UPSILON WITH PSILI AND VARIA
101 | 1F54; 1F54; 03A5 0313 0301; 03A5 0313 0301; # GREEK SMALL LETTER UPSILON WITH PSILI AND OXIA
102 | 1F56; 1F56; 03A5 0313 0342; 03A5 0313 0342; # GREEK SMALL LETTER UPSILON WITH PSILI AND PERISPOMENI
103 | 1FB6; 1FB6; 0391 0342; 0391 0342; # GREEK SMALL LETTER ALPHA WITH PERISPOMENI
104 | 1FC6; 1FC6; 0397 0342; 0397 0342; # GREEK SMALL LETTER ETA WITH PERISPOMENI
105 | 1FD2; 1FD2; 0399 0308 0300; 0399 0308 0300; # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND VARIA
106 | 1FD3; 1FD3; 0399 0308 0301; 0399 0308 0301; # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND OXIA
107 | 1FD6; 1FD6; 0399 0342; 0399 0342; # GREEK SMALL LETTER IOTA WITH PERISPOMENI
108 | 1FD7; 1FD7; 0399 0308 0342; 0399 0308 0342; # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND PERISPOMENI
109 | 1FE2; 1FE2; 03A5 0308 0300; 03A5 0308 0300; # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND VARIA
110 | 1FE3; 1FE3; 03A5 0308 0301; 03A5 0308 0301; # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND OXIA
111 | 1FE4; 1FE4; 03A1 0313; 03A1 0313; # GREEK SMALL LETTER RHO WITH PSILI
112 | 1FE6; 1FE6; 03A5 0342; 03A5 0342; # GREEK SMALL LETTER UPSILON WITH PERISPOMENI
113 | 1FE7; 1FE7; 03A5 0308 0342; 03A5 0308 0342; # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND PERISPOMENI
114 | 1FF6; 1FF6; 03A9 0342; 03A9 0342; # GREEK SMALL LETTER OMEGA WITH PERISPOMENI
115 |
116 | # IMPORTANT-when iota-subscript (0345) is uppercased or titlecased,
117 | # the result will be incorrect unless the iota-subscript is moved to the end
118 | # of any sequence of combining marks. Otherwise, the accents will go on the capital iota.
119 | # This process can be achieved by first transforming the text to NFC before casing.
120 | # E.g. <alpha><iota_subscript><acute> is uppercased to <ALPHA><acute><iota_subscript>
121 |
122 | # The following cases are already in the UnicodeData.txt file, so are only commented here.
123 |
124 | # 0345; 0345; 0399; 0399; # COMBINING GREEK YPOGEGRAMMENI
125 |
126 | # All letters with YPOGEGRAMMENI (iota-subscript) or PROSGEGRAMMENI (iota adscript)
127 | # have special uppercases.
128 | # Note: characters with PROSGEGRAMMENI are actually titlecase, not uppercase!
129 |
130 | 1F80; 1F80; 1F88; 1F08 0399; # GREEK SMALL LETTER ALPHA WITH PSILI AND YPOGEGRAMMENI
131 | 1F81; 1F81; 1F89; 1F09 0399; # GREEK SMALL LETTER ALPHA WITH DASIA AND YPOGEGRAMMENI
132 | 1F82; 1F82; 1F8A; 1F0A 0399; # GREEK SMALL LETTER ALPHA WITH PSILI AND VARIA AND YPOGEGRAMMENI
133 | 1F83; 1F83; 1F8B; 1F0B 0399; # GREEK SMALL LETTER ALPHA WITH DASIA AND VARIA AND YPOGEGRAMMENI
134 | 1F84; 1F84; 1F8C; 1F0C 0399; # GREEK SMALL LETTER ALPHA WITH PSILI AND OXIA AND YPOGEGRAMMENI
135 | 1F85; 1F85; 1F8D; 1F0D 0399; # GREEK SMALL LETTER ALPHA WITH DASIA AND OXIA AND YPOGEGRAMMENI
136 | 1F86; 1F86; 1F8E; 1F0E 0399; # GREEK SMALL LETTER ALPHA WITH PSILI AND PERISPOMENI AND YPOGEGRAMMENI
137 | 1F87; 1F87; 1F8F; 1F0F 0399; # GREEK SMALL LETTER ALPHA WITH DASIA AND PERISPOMENI AND YPOGEGRAMMENI
138 | 1F88; 1F80; 1F88; 1F08 0399; # GREEK CAPITAL LETTER ALPHA WITH PSILI AND PROSGEGRAMMENI
139 | 1F89; 1F81; 1F89; 1F09 0399; # GREEK CAPITAL LETTER ALPHA WITH DASIA AND PROSGEGRAMMENI
140 | 1F8A; 1F82; 1F8A; 1F0A 0399; # GREEK CAPITAL LETTER ALPHA WITH PSILI AND VARIA AND PROSGEGRAMMENI
141 | 1F8B; 1F83; 1F8B; 1F0B 0399; # GREEK CAPITAL LETTER ALPHA WITH DASIA AND VARIA AND PROSGEGRAMMENI
142 | 1F8C; 1F84; 1F8C; 1F0C 0399; # GREEK CAPITAL LETTER ALPHA WITH PSILI AND OXIA AND PROSGEGRAMMENI
143 | 1F8D; 1F85; 1F8D; 1F0D 0399; # GREEK CAPITAL LETTER ALPHA WITH DASIA AND OXIA AND PROSGEGRAMMENI
144 | 1F8E; 1F86; 1F8E; 1F0E 0399; # GREEK CAPITAL LETTER ALPHA WITH PSILI AND PERISPOMENI AND PROSGEGRAMMENI
145 | 1F8F; 1F87; 1F8F; 1F0F 0399; # GREEK CAPITAL LETTER ALPHA WITH DASIA AND PERISPOMENI AND PROSGEGRAMMENI
146 | 1F90; 1F90; 1F98; 1F28 0399; # GREEK SMALL LETTER ETA WITH PSILI AND YPOGEGRAMMENI
147 | 1F91; 1F91; 1F99; 1F29 0399; # GREEK SMALL LETTER ETA WITH DASIA AND YPOGEGRAMMENI
148 | 1F92; 1F92; 1F9A; 1F2A 0399; # GREEK SMALL LETTER ETA WITH PSILI AND VARIA AND YPOGEGRAMMENI
149 | 1F93; 1F93; 1F9B; 1F2B 0399; # GREEK SMALL LETTER ETA WITH DASIA AND VARIA AND YPOGEGRAMMENI
150 | 1F94; 1F94; 1F9C; 1F2C 0399; # GREEK SMALL LETTER ETA WITH PSILI AND OXIA AND YPOGEGRAMMENI
151 | 1F95; 1F95; 1F9D; 1F2D 0399; # GREEK SMALL LETTER ETA WITH DASIA AND OXIA AND YPOGEGRAMMENI
152 | 1F96; 1F96; 1F9E; 1F2E 0399; # GREEK SMALL LETTER ETA WITH PSILI AND PERISPOMENI AND YPOGEGRAMMENI
153 | 1F97; 1F97; 1F9F; 1F2F 0399; # GREEK SMALL LETTER ETA WITH DASIA AND PERISPOMENI AND YPOGEGRAMMENI
154 | 1F98; 1F90; 1F98; 1F28 0399; # GREEK CAPITAL LETTER ETA WITH PSILI AND PROSGEGRAMMENI
155 | 1F99; 1F91; 1F99; 1F29 0399; # GREEK CAPITAL LETTER ETA WITH DASIA AND PROSGEGRAMMENI
156 | 1F9A; 1F92; 1F9A; 1F2A 0399; # GREEK CAPITAL LETTER ETA WITH PSILI AND VARIA AND PROSGEGRAMMENI
157 | 1F9B; 1F93; 1F9B; 1F2B 0399; # GREEK CAPITAL LETTER ETA WITH DASIA AND VARIA AND PROSGEGRAMMENI
158 | 1F9C; 1F94; 1F9C; 1F2C 0399; # GREEK CAPITAL LETTER ETA WITH PSILI AND OXIA AND PROSGEGRAMMENI
159 | 1F9D; 1F95; 1F9D; 1F2D 0399; # GREEK CAPITAL LETTER ETA WITH DASIA AND OXIA AND PROSGEGRAMMENI
160 | 1F9E; 1F96; 1F9E; 1F2E 0399; # GREEK CAPITAL LETTER ETA WITH PSILI AND PERISPOMENI AND PROSGEGRAMMENI
161 | 1F9F; 1F97; 1F9F; 1F2F 0399; # GREEK CAPITAL LETTER ETA WITH DASIA AND PERISPOMENI AND PROSGEGRAMMENI
162 | 1FA0; 1FA0; 1FA8; 1F68 0399; # GREEK SMALL LETTER OMEGA WITH PSILI AND YPOGEGRAMMENI
163 | 1FA1; 1FA1; 1FA9; 1F69 0399; # GREEK SMALL LETTER OMEGA WITH DASIA AND YPOGEGRAMMENI
164 | 1FA2; 1FA2; 1FAA; 1F6A 0399; # GREEK SMALL LETTER OMEGA WITH PSILI AND VARIA AND YPOGEGRAMMENI
165 | 1FA3; 1FA3; 1FAB; 1F6B 0399; # GREEK SMALL LETTER OMEGA WITH DASIA AND VARIA AND YPOGEGRAMMENI
166 | 1FA4; 1FA4; 1FAC; 1F6C 0399; # GREEK SMALL LETTER OMEGA WITH PSILI AND OXIA AND YPOGEGRAMMENI
167 | 1FA5; 1FA5; 1FAD; 1F6D 0399; # GREEK SMALL LETTER OMEGA WITH DASIA AND OXIA AND YPOGEGRAMMENI
168 | 1FA6; 1FA6; 1FAE; 1F6E 0399; # GREEK SMALL LETTER OMEGA WITH PSILI AND PERISPOMENI AND YPOGEGRAMMENI
169 | 1FA7; 1FA7; 1FAF; 1F6F 0399; # GREEK SMALL LETTER OMEGA WITH DASIA AND PERISPOMENI AND YPOGEGRAMMENI
170 | 1FA8; 1FA0; 1FA8; 1F68 0399; # GREEK CAPITAL LETTER OMEGA WITH PSILI AND PROSGEGRAMMENI
171 | 1FA9; 1FA1; 1FA9; 1F69 0399; # GREEK CAPITAL LETTER OMEGA WITH DASIA AND PROSGEGRAMMENI
172 | 1FAA; 1FA2; 1FAA; 1F6A 0399; # GREEK CAPITAL LETTER OMEGA WITH PSILI AND VARIA AND PROSGEGRAMMENI
173 | 1FAB; 1FA3; 1FAB; 1F6B 0399; # GREEK CAPITAL LETTER OMEGA WITH DASIA AND VARIA AND PROSGEGRAMMENI
174 | 1FAC; 1FA4; 1FAC; 1F6C 0399; # GREEK CAPITAL LETTER OMEGA WITH PSILI AND OXIA AND PROSGEGRAMMENI
175 | 1FAD; 1FA5; 1FAD; 1F6D 0399; # GREEK CAPITAL LETTER OMEGA WITH DASIA AND OXIA AND PROSGEGRAMMENI
176 | 1FAE; 1FA6; 1FAE; 1F6E 0399; # GREEK CAPITAL LETTER OMEGA WITH PSILI AND PERISPOMENI AND PROSGEGRAMMENI
177 | 1FAF; 1FA7; 1FAF; 1F6F 0399; # GREEK CAPITAL LETTER OMEGA WITH DASIA AND PERISPOMENI AND PROSGEGRAMMENI
178 | 1FB3; 1FB3; 1FBC; 0391 0399; # GREEK SMALL LETTER ALPHA WITH YPOGEGRAMMENI
179 | 1FBC; 1FB3; 1FBC; 0391 0399; # GREEK CAPITAL LETTER ALPHA WITH PROSGEGRAMMENI
180 | 1FC3; 1FC3; 1FCC; 0397 0399; # GREEK SMALL LETTER ETA WITH YPOGEGRAMMENI
181 | 1FCC; 1FC3; 1FCC; 0397 0399; # GREEK CAPITAL LETTER ETA WITH PROSGEGRAMMENI
182 | 1FF3; 1FF3; 1FFC; 03A9 0399; # GREEK SMALL LETTER OMEGA WITH YPOGEGRAMMENI
183 | 1FFC; 1FF3; 1FFC; 03A9 0399; # GREEK CAPITAL LETTER OMEGA WITH PROSGEGRAMMENI
184 |
185 | # Some characters with YPOGEGRAMMENI also have no corresponding titlecases
186 |
187 | 1FB2; 1FB2; 1FBA 0345; 1FBA 0399; # GREEK SMALL LETTER ALPHA WITH VARIA AND YPOGEGRAMMENI
188 | 1FB4; 1FB4; 0386 0345; 0386 0399; # GREEK SMALL LETTER ALPHA WITH OXIA AND YPOGEGRAMMENI
189 | 1FC2; 1FC2; 1FCA 0345; 1FCA 0399; # GREEK SMALL LETTER ETA WITH VARIA AND YPOGEGRAMMENI
190 | 1FC4; 1FC4; 0389 0345; 0389 0399; # GREEK SMALL LETTER ETA WITH OXIA AND YPOGEGRAMMENI
191 | 1FF2; 1FF2; 1FFA 0345; 1FFA 0399; # GREEK SMALL LETTER OMEGA WITH VARIA AND YPOGEGRAMMENI
192 | 1FF4; 1FF4; 038F 0345; 038F 0399; # GREEK SMALL LETTER OMEGA WITH OXIA AND YPOGEGRAMMENI
193 |
194 | 1FB7; 1FB7; 0391 0342 0345; 0391 0342 0399; # GREEK SMALL LETTER ALPHA WITH PERISPOMENI AND YPOGEGRAMMENI
195 | 1FC7; 1FC7; 0397 0342 0345; 0397 0342 0399; # GREEK SMALL LETTER ETA WITH PERISPOMENI AND YPOGEGRAMMENI
196 | 1FF7; 1FF7; 03A9 0342 0345; 03A9 0342 0399; # GREEK SMALL LETTER OMEGA WITH PERISPOMENI AND YPOGEGRAMMENI
197 |
198 | # ================================================================================
199 | # Conditional Mappings
200 | # The remainder of this file provides conditional casing data used to produce
201 | # full case mappings.
202 | # ================================================================================
203 | # Language-Insensitive Mappings
204 | # These are characters whose full case mappings do not depend on language, but do
205 | # depend on context (which characters come before or after). For more information
206 | # see the header of this file and the Unicode Standard.
207 | # ================================================================================
208 |
209 | # Special case for final form of sigma
210 |
211 | 03A3; 03C2; 03A3; 03A3; Final_Sigma; # GREEK CAPITAL LETTER SIGMA
212 |
213 | # Note: the following cases for non-final are already in the UnicodeData.txt file.
214 |
215 | # 03A3; 03C3; 03A3; 03A3; # GREEK CAPITAL LETTER SIGMA
216 | # 03C3; 03C3; 03A3; 03A3; # GREEK SMALL LETTER SIGMA
217 | # 03C2; 03C2; 03A3; 03A3; # GREEK SMALL LETTER FINAL SIGMA
218 |
219 | # Note: the following cases are not included, since they would case-fold in lowercasing
220 |
221 | # 03C3; 03C2; 03A3; 03A3; Final_Sigma; # GREEK SMALL LETTER SIGMA
222 | # 03C2; 03C3; 03A3; 03A3; Not_Final_Sigma; # GREEK SMALL LETTER FINAL SIGMA
223 |
224 | # ================================================================================
225 | # Language-Sensitive Mappings
226 | # These are characters whose full case mappings depend on language and perhaps also
227 | # context (which characters come before or after). For more information
228 | # see the header of this file and the Unicode Standard.
229 | # ================================================================================
230 |
231 | # Lithuanian
232 |
233 | # Lithuanian retains the dot in a lowercase i when followed by accents.
234 |
235 | # Remove DOT ABOVE after "i" with upper or titlecase
236 |
237 | 0307; 0307; ; ; lt After_Soft_Dotted; # COMBINING DOT ABOVE
238 |
239 | # Introduce an explicit dot above when lowercasing capital I's and J's
240 | # whenever there are more accents above.
241 | # (of the accents used in Lithuanian: grave, acute, tilde above, and ogonek)
242 |
243 | 0049; 0069 0307; 0049; 0049; lt More_Above; # LATIN CAPITAL LETTER I
244 | 004A; 006A 0307; 004A; 004A; lt More_Above; # LATIN CAPITAL LETTER J
245 | 012E; 012F 0307; 012E; 012E; lt More_Above; # LATIN CAPITAL LETTER I WITH OGONEK
246 | 00CC; 0069 0307 0300; 00CC; 00CC; lt; # LATIN CAPITAL LETTER I WITH GRAVE
247 | 00CD; 0069 0307 0301; 00CD; 00CD; lt; # LATIN CAPITAL LETTER I WITH ACUTE
248 | 0128; 0069 0307 0303; 0128; 0128; lt; # LATIN CAPITAL LETTER I WITH TILDE
249 |
250 | # ================================================================================
251 |
252 | # Turkish and Azeri
253 |
254 | # I and i-dotless; I-dot and i are case pairs in Turkish and Azeri
255 | # The following rules handle those cases.
256 |
257 | 0130; 0069; 0130; 0130; tr; # LATIN CAPITAL LETTER I WITH DOT ABOVE
258 | 0130; 0069; 0130; 0130; az; # LATIN CAPITAL LETTER I WITH DOT ABOVE
259 |
260 | # When lowercasing, remove dot_above in the sequence I + dot_above, which will turn into i.
261 | # This matches the behavior of the canonically equivalent I-dot_above
262 |
263 | 0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE
264 | 0307; ; 0307; 0307; az After_I; # COMBINING DOT ABOVE
265 |
266 | # When lowercasing, unless an I is before a dot_above, it turns into a dotless i.
267 |
268 | 0049; 0131; 0049; 0049; tr Not_Before_Dot; # LATIN CAPITAL LETTER I
269 | 0049; 0131; 0049; 0049; az Not_Before_Dot; # LATIN CAPITAL LETTER I
270 |
271 | # When uppercasing, i turns into a dotted capital I
272 |
273 | 0069; 0069; 0130; 0130; tr; # LATIN SMALL LETTER I
274 | 0069; 0069; 0130; 0130; az; # LATIN SMALL LETTER I
275 |
276 | # Note: the following case is already in the UnicodeData.txt file.
277 |
278 | # 0131; 0131; 0049; 0049; tr; # LATIN SMALL LETTER DOTLESS I
279 |
280 | # EOF
281 |
282 |
--------------------------------------------------------------------------------
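
For illustration only: the entry format documented in the header of SpecialCasing.txt above is line oriented, so a parser needs little more than stripping the trailing comment and splitting on ';'. The following minimal Zig sketch is not part of this repository; the struct and function names are invented for the example, and the space-separated hex mappings and the optional condition list are left as raw text.

const std = @import("std");

/// Raw fields of one SpecialCasing.txt entry. The mapping fields keep the
/// space-separated hex code points as text; an empty string means the mapping
/// removes the character (as in "0307; ; 0307; 0307; tr After_I; ...").
const SpecialCasingEntry = struct {
    code: u21,
    lower: []const u8,
    title: []const u8,
    upper: []const u8,
    conditions: []const u8, // empty when no condition list is present
};

/// Returns null for blank and comment-only lines. Fields beyond the optional
/// condition list are ignored, since the header warns that more may be added.
fn parseEntry(line: []const u8) !?SpecialCasingEntry {
    const end = std.mem.indexOfScalar(u8, line, '#') orelse line.len;
    const body = std.mem.trim(u8, line[0..end], " \t");
    if (body.len == 0) return null;

    var it = std.mem.splitScalar(u8, body, ';');
    const code = try std.fmt.parseInt(u21, std.mem.trim(u8, it.next().?, " \t"), 16);
    const lower = std.mem.trim(u8, it.next() orelse return error.MissingField, " \t");
    const title = std.mem.trim(u8, it.next() orelse return error.MissingField, " \t");
    const upper = std.mem.trim(u8, it.next() orelse return error.MissingField, " \t");
    const conditions = std.mem.trim(u8, it.next() orelse "", " \t");

    return SpecialCasingEntry{
        .code = code,
        .lower = lower,
        .title = title,
        .upper = upper,
        .conditions = conditions,
    };
}
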
/src/config.zig:
--------------------------------------------------------------------------------
1 | const std = @import("std");
2 | const types = @import("types.zig");
3 | pub const quirks = @import("quirks.zig");
4 |
5 | pub const max_code_point = 0x10FFFF;
6 | pub const zero_width_non_joiner = 0x200C;
7 | pub const zero_width_joiner = 0x200D;
8 |
9 | pub const default = Table{
10 | .fields = &.{
11 | // UnicodeData
12 | .{
13 | .name = "name",
14 | .type = []const u8,
15 | .max_len = 88,
16 | .max_offset = 1030461,
17 | .embedded_len = 2,
18 | },
19 | .{ .name = "general_category", .type = types.GeneralCategory },
20 | .{ .name = "canonical_combining_class", .type = u8 },
21 | .{ .name = "bidi_class", .type = types.BidiClass },
22 | .{ .name = "decomposition_type", .type = types.DecompositionType },
23 | .{
24 | .name = "decomposition_mapping",
25 | .type = []const u21,
26 | .cp_packing = .shift,
27 | .shift_low = -181519,
28 | .shift_high = 99324,
29 | .max_len = 18,
30 | .max_offset = 4602,
31 | .embedded_len = 0,
32 | },
33 | .{ .name = "numeric_type", .type = types.NumericType },
34 | .{
35 | .name = "numeric_value_decimal",
36 | .type = ?u4,
37 | .min_value = 0,
38 | .max_value = 9,
39 | },
40 | .{
41 | .name = "numeric_value_digit",
42 | .type = ?u4,
43 | .min_value = 0,
44 | .max_value = 9,
45 | },
46 | .{
47 | .name = "numeric_value_numeric",
48 | .type = []const u8,
49 | .max_len = 13,
50 | .max_offset = 503,
51 | .embedded_len = 1,
52 | },
53 | .{ .name = "is_bidi_mirrored", .type = bool },
54 | .{
55 | .name = "unicode_1_name",
56 | .type = []const u8,
57 | .max_len = 55,
58 | .max_offset = 49956,
59 | .embedded_len = 0,
60 | },
61 | .{
62 | .name = "simple_uppercase_mapping",
63 | .type = ?u21,
64 | .cp_packing = .shift,
65 | .shift_low = -38864,
66 | .shift_high = 42561,
67 | },
68 | .{
69 | .name = "simple_lowercase_mapping",
70 | .type = ?u21,
71 | .cp_packing = .shift,
72 | .shift_low = -42561,
73 | .shift_high = 38864,
74 | },
75 | .{
76 | .name = "simple_titlecase_mapping",
77 | .type = ?u21,
78 | .cp_packing = .shift,
79 | .shift_low = -38864,
80 | .shift_high = 42561,
81 | },
82 |
83 | // CaseFolding
84 | .{
85 | .name = "case_folding_simple",
86 | .type = u21,
87 | .cp_packing = .shift,
88 | .shift_low = -42561,
89 | .shift_high = 35267,
90 | },
91 | .{
92 | .name = "case_folding_full",
93 | .type = []const u21,
94 | .cp_packing = .shift,
95 | .shift_low = -42561,
96 | .shift_high = 35267,
97 | .max_len = 3,
98 | .max_offset = 160,
99 | .embedded_len = 0,
100 | },
101 | .{
102 | .name = "case_folding_turkish_only",
103 | .type = []const u21,
104 | .cp_packing = .direct,
105 | .shift_low = -199,
106 | .shift_high = 232,
107 | .max_len = 1,
108 | .max_offset = 2,
109 | .embedded_len = 0,
110 | },
111 | .{
112 | .name = "case_folding_common_only",
113 | .type = []const u21,
114 | .cp_packing = .direct,
115 | .shift_low = -42561,
116 | .shift_high = 35267,
117 | .max_len = 1,
118 | .max_offset = 1423,
119 | .embedded_len = 0,
120 | },
121 | .{
122 | .name = "case_folding_simple_only",
123 | .type = []const u21,
124 | .cp_packing = .direct,
125 | .shift_low = -7615,
126 | .shift_high = 1,
127 | .max_len = 1,
128 | .max_offset = 31,
129 | .embedded_len = 0,
130 | },
131 | .{
132 | .name = "case_folding_full_only",
133 | .type = []const u21,
134 | .max_len = 3,
135 | .max_offset = 160,
136 | .embedded_len = 0,
137 | },
138 |
139 | // SpecialCasing
140 | .{ .name = "has_special_casing", .type = bool },
141 | .{
142 | .name = "special_lowercase_mapping",
143 | .type = []const u21,
144 | .cp_packing = .shift,
145 | .shift_low = -199,
146 | .shift_high = 232,
147 | .max_len = 3,
148 | .max_offset = 13,
149 | .embedded_len = 0,
150 | },
151 | .{
152 | .name = "special_titlecase_mapping",
153 | .type = []const u21,
154 | .cp_packing = .shift,
155 | .shift_low = 0,
156 | .shift_high = 199,
157 | .max_len = 3,
158 | .max_offset = 104,
159 | .embedded_len = 0,
160 | },
161 | .{
162 | .name = "special_uppercase_mapping",
163 | .type = []const u21,
164 | .cp_packing = .shift,
165 | .shift_low = 0,
166 | .shift_high = 199,
167 | .max_len = 3,
168 | .max_offset = 158,
169 | .embedded_len = 0,
170 | },
171 | .{
172 | .name = "special_casing_condition",
173 | .type = []const types.SpecialCasingCondition,
174 | .max_len = 2,
175 | .max_offset = 9,
176 | .embedded_len = 0,
177 | },
178 |
179 | // Case mappings
180 | .{
181 | .name = "lowercase_mapping",
182 | .type = []const u21,
183 | .cp_packing = .shift,
184 | .shift_low = -42561,
185 | .shift_high = 38864,
186 | .max_len = 1,
187 | .max_offset = 0,
188 | .embedded_len = 0,
189 | },
190 | .{
191 | .name = "titlecase_mapping",
192 | .type = []const u21,
193 | .cp_packing = .shift,
194 | .shift_low = -38864,
195 | .shift_high = 42561,
196 | .max_len = 3,
197 | .max_offset = 104,
198 | .embedded_len = 0,
199 | },
200 | .{
201 | .name = "uppercase_mapping",
202 | .type = []const u21,
203 | .cp_packing = .shift,
204 | .shift_low = -38864,
205 | .shift_high = 42561,
206 | .max_len = 3,
207 | .max_offset = 158,
208 | .embedded_len = 0,
209 | },
210 |
211 | // DerivedCoreProperties
212 | .{ .name = "is_math", .type = bool },
213 | .{ .name = "is_alphabetic", .type = bool },
214 | .{ .name = "is_lowercase", .type = bool },
215 | .{ .name = "is_uppercase", .type = bool },
216 | .{ .name = "is_cased", .type = bool },
217 | .{ .name = "is_case_ignorable", .type = bool },
218 | .{ .name = "changes_when_lowercased", .type = bool },
219 | .{ .name = "changes_when_uppercased", .type = bool },
220 | .{ .name = "changes_when_titlecased", .type = bool },
221 | .{ .name = "changes_when_casefolded", .type = bool },
222 | .{ .name = "changes_when_casemapped", .type = bool },
223 | .{ .name = "is_id_start", .type = bool },
224 | .{ .name = "is_id_continue", .type = bool },
225 | .{ .name = "is_xid_start", .type = bool },
226 | .{ .name = "is_xid_continue", .type = bool },
227 | .{ .name = "is_default_ignorable", .type = bool },
228 | .{ .name = "is_grapheme_extend", .type = bool },
229 | .{ .name = "is_grapheme_base", .type = bool },
230 | .{ .name = "is_grapheme_link", .type = bool },
231 | .{ .name = "indic_conjunct_break", .type = types.IndicConjunctBreak },
232 |
233 | // EastAsianWidth
234 | .{ .name = "east_asian_width", .type = types.EastAsianWidth },
235 |
236 | // OriginalGraphemeBreak
237 | // This is the field from GraphemeBreakProperty.txt, without combining
238 | // `indic_conjunct_break`, `is_emoji_modifier`,
239 | // `is_emoji_modifier_base`, and `is_extended_pictographic`
240 | .{ .name = "original_grapheme_break", .type = types.OriginalGraphemeBreak },
241 |
242 | // EmojiData
243 | .{ .name = "is_emoji", .type = bool },
244 | .{ .name = "is_emoji_presentation", .type = bool },
245 | .{ .name = "is_emoji_modifier", .type = bool },
246 | .{ .name = "is_emoji_modifier_base", .type = bool },
247 | .{ .name = "is_emoji_component", .type = bool },
248 | .{ .name = "is_extended_pictographic", .type = bool },
249 |
250 | // EmojiVariationSequences
251 |         // These are all equivalent in practice, but
252 |         // `emoji-variation-sequences.txt` and UTS #51 split out the emoji and
253 |         // text variation sequences separately. However, from their
254 |         // introduction in Unicode 6.1 (see
255 |         // https://unicode.org/Public/6.1.0/ucd/StandardizedVariants.txt --
256 |         // dated 2011-11-10) to the present, there has never been an emoji
257 |         // variation sequence that isn't also a valid text variation sequence,
258 |         // and vice versa, so the recommendation is to just use
259 | // `is_emoji_vs_base`. Also the "Total sequences" comment at the end of
260 | // emoji-variation-sequences.txt counts the number of sequences as one
261 | // per base code point, rather than counting the "emoji style" and
262 | // "text style" lines separately.
263 | .{ .name = "is_emoji_vs_base", .type = bool },
264 | .{ .name = "is_emoji_vs_text", .type = bool },
265 | .{ .name = "is_emoji_vs_emoji", .type = bool },
266 |
267 | // GraphemeBreak (derived)
268 | // This is derived from `original_grapheme_break`
269 | // (GraphemeBreakProperty.txt), `indic_conjunct_break`,
270 | // `is_emoji_modifier`, `is_emoji_modifier_base`, and
271 | // `is_extended_pictographic`
272 | .{ .name = "grapheme_break", .type = types.GraphemeBreak },
273 |
274 | // BidiPairedBracket
275 | .{
276 | .name = "bidi_paired_bracket",
277 | .type = types.BidiPairedBracket,
278 | .cp_packing = .shift,
279 | .shift_low = -3,
280 | .shift_high = 3,
281 | },
282 |
283 | // Block
284 | .{ .name = "block", .type = types.Block },
285 | },
286 | };
287 |
288 | pub const is_updating_ucd = false;
289 |
290 | pub const Field = struct {
291 | name: [:0]const u8,
292 | type: type,
293 |
294 | // For Shift + Slice fields
295 | cp_packing: CpPacking = .direct,
296 | shift_low: isize = 0,
297 | shift_high: isize = 0,
298 |
299 | // For Slice fields
300 | max_len: usize = 0,
301 | max_offset: usize = 0,
302 | embedded_len: usize = 0,
303 |
304 | // For PackedOptional fields
305 | min_value: isize = 0,
306 | max_value: isize = 0,
307 |
308 | pub const CpPacking = enum {
309 | direct,
310 | shift,
311 | };
312 |
313 | pub const Runtime = struct {
314 | name: []const u8,
315 | type: []const u8,
316 | cp_packing: CpPacking,
317 | shift_low: isize,
318 | shift_high: isize,
319 | max_len: usize,
320 | max_offset: usize,
321 | embedded_len: usize,
322 | min_value: isize,
323 | max_value: isize,
324 |
325 | pub fn eql(a: Runtime, b: Runtime) bool {
326 | return a.cp_packing == b.cp_packing and
327 | a.shift_low == b.shift_low and
328 | a.shift_high == b.shift_high and
329 | a.max_len == b.max_len and
330 | a.max_offset == b.max_offset and
331 | a.embedded_len == b.embedded_len and
332 | a.min_value == b.min_value and
333 | a.max_value == b.max_value and
334 | std.mem.eql(u8, a.type, b.type) and
335 | std.mem.eql(u8, a.name, b.name);
336 | }
337 |
338 | pub fn override(self: Runtime, overrides: anytype) Runtime {
339 | var result: Runtime = .{
340 | .name = self.name,
341 | .type = self.type,
342 | .cp_packing = self.cp_packing,
343 | .shift_low = self.shift_low,
344 | .shift_high = self.shift_high,
345 | .max_len = self.max_len,
346 | .max_offset = self.max_offset,
347 | .embedded_len = self.embedded_len,
348 | .min_value = self.min_value,
349 | .max_value = self.max_value,
350 | };
351 |
352 | inline for (@typeInfo(@TypeOf(overrides)).@"struct".fields) |f| {
353 | @field(result, f.name) = @field(overrides, f.name);
354 | }
355 |
356 | return result;
357 | }
358 |
359 | pub fn compareActual(self: Runtime, actual: Runtime) bool {
360 | var is_okay = true;
361 |
362 | if (self.shift_low != actual.shift_low) {
363 | std.log.err("Config for field '{s}' does not match actual. Set .shift_low = {d}, // change from {d}", .{ self.name, actual.shift_low, self.shift_low });
364 | is_okay = false;
365 | }
366 |
367 | if (self.shift_high != actual.shift_high) {
368 | std.log.err("Config for field '{s}' does not match actual. Set .shift_high = {d}, // change from {d}", .{ self.name, actual.shift_high, self.shift_high });
369 | is_okay = false;
370 | }
371 |
372 | if (self.max_len != actual.max_len) {
373 | std.log.err("Config for field '{s}' does not match actual. Set .max_len = {d}, // change from {d}", .{ self.name, actual.max_len, self.max_len });
374 | is_okay = false;
375 | }
376 |
377 | if (self.max_offset != actual.max_offset) {
378 | std.log.err("Config for field '{s}' does not match actual. Set .max_offset = {d}, // change from {d}", .{ self.name, actual.max_offset, self.max_offset });
379 | is_okay = false;
380 | }
381 |
382 | if (self.min_value != actual.min_value) {
383 | std.log.err("Config for field '{s}' does not match actual. Set .min_value = {d}, // change from {d}", .{ self.name, actual.min_value, self.min_value });
384 | is_okay = false;
385 | }
386 |
387 | if (self.max_value != actual.max_value) {
388 | std.log.err("Config for field '{s}' does not match actual. Set .max_value = {d}, // change from {d}", .{ self.name, actual.max_value, self.max_value });
389 | is_okay = false;
390 | }
391 |
392 | return is_okay;
393 | }
394 |
395 | pub fn write(self: Runtime, writer: *std.Io.Writer) !void {
396 | try writer.print(
397 | \\.{{
398 | \\ .name = "{s}",
399 | \\
400 | , .{self.name});
401 |
402 | var type_parts = std.mem.splitScalar(u8, self.type, '.');
403 | const base_type = type_parts.next().?;
404 | const rest_type = type_parts.rest();
405 |
406 | if (std.mem.endsWith(u8, base_type, "types") or
407 | std.mem.endsWith(u8, base_type, "types_x") or
408 | rest_type.len == 0)
409 | {
410 | try writer.print(
411 | \\ .type = {s},
412 | \\
413 | , .{self.type});
414 | } else {
415 | const prefix = if (base_type[0] == '?') "?" else "";
416 | try writer.print(
417 | \\ .type = {s}build_config.{s},
418 | \\
419 | , .{ prefix, rest_type });
420 | }
421 |
422 | if (self.cp_packing != .direct or
423 | self.shift_low != 0 or
424 | self.shift_high != 0)
425 | {
426 | try writer.print(
427 | \\ .cp_packing = .{s},
428 | \\ .shift_low = {},
429 | \\ .shift_high = {},
430 | \\
431 | , .{ @tagName(self.cp_packing), self.shift_low, self.shift_high });
432 | }
433 | if (self.max_len != 0) {
434 | try writer.print(
435 | \\ .max_len = {},
436 | \\ .max_offset = {},
437 | \\ .embedded_len = {},
438 | \\
439 | , .{ self.max_len, self.max_offset, self.embedded_len });
440 | }
441 | if (self.min_value != 0 or self.max_value != 0) {
442 | try writer.print(
443 | \\ .min_value = {},
444 | \\ .max_value = {},
445 | \\
446 | , .{ self.min_value, self.max_value });
447 | }
448 |
449 | try writer.writeAll(
450 | \\},
451 | \\
452 | );
453 | }
454 | };
455 |
456 | pub const Kind = enum {
457 | basic,
458 | slice,
459 | shift,
460 | optional,
461 | @"union",
462 | };
463 |
464 | pub fn kind(self: Field) Kind {
465 | switch (@typeInfo(self.type)) {
466 | .pointer => return .slice,
467 | .optional => |optional| {
468 | if (!isPackable(optional.child)) {
469 | return .basic;
470 | }
471 |
472 | switch (self.cp_packing) {
473 | .direct => return .optional,
474 | .shift => return .shift,
475 | }
476 | },
477 | .@"union" => return .@"union",
478 | else => {
479 | switch (self.cp_packing) {
480 | .direct => return .basic,
481 | .shift => return .shift,
482 | }
483 | },
484 | }
485 | }
486 |
487 | pub fn canBePacked(self: Field) bool {
488 | if (self.kind() == .slice) {
489 | return false;
490 | }
491 |
492 | switch (@typeInfo(self.type)) {
493 | .optional => |optional| {
494 | return isPackable(optional.child);
495 | },
496 | .@"union" => |info| {
497 | return for (info.fields) |f| {
498 | if (f.type != void and !isPackable(f.type)) {
499 | break false;
500 | }
501 | } else true;
502 | },
503 | else => return true,
504 | }
505 | }
506 |
507 | pub fn runtime(self: Field) Runtime {
508 | return .{
509 | .name = self.name,
510 | .type = @typeName(self.type),
511 | .cp_packing = self.cp_packing,
512 | .shift_low = self.shift_low,
513 | .shift_high = self.shift_high,
514 | .max_len = self.max_len,
515 | .max_offset = self.max_offset,
516 | .embedded_len = self.embedded_len,
517 | .min_value = self.min_value,
518 | .max_value = self.max_value,
519 | };
520 | }
521 |
522 | pub fn eql(a: Field, b: Field) bool {
523 | // Use runtime `eql` just to be lazy
524 | return a.runtime().eql(b.runtime());
525 | }
526 |
527 | pub fn override(self: Field, overrides: anytype) Field {
528 | var result = self;
529 |
530 | inline for (@typeInfo(@TypeOf(overrides)).@"struct".fields) |f| {
531 | if (!is_updating_ucd and (std.mem.eql(u8, f.name, "name") or
532 | std.mem.eql(u8, f.name, "type") or
533 | std.mem.eql(u8, f.name, "shift_low") or
534 | std.mem.eql(u8, f.name, "shift_high") or
535 | std.mem.eql(u8, f.name, "max_len") or
536 | std.mem.eql(u8, f.name, "min_value") or
537 | std.mem.eql(u8, f.name, "max_value")))
538 | {
539 | @compileError("Cannot override field '" ++ f.name ++ "'");
540 | }
541 |
542 | @field(result, f.name) = @field(overrides, f.name);
543 | }
544 |
545 | return result;
546 | }
547 | };
548 |
549 | pub fn isPackable(comptime T: type) bool {
550 | switch (@typeInfo(T)) {
551 | .int => |int| {
552 | return int.bits <= @bitSizeOf(isize);
553 | },
554 | .@"enum" => |e| {
555 | return @typeInfo(e.tag_type).int.bits <= @bitSizeOf(isize);
556 | },
557 | .bool => return true,
558 | else => return false,
559 | }
560 | }
561 |
562 | pub const Table = struct {
563 | name: ?[]const u8 = null,
564 | stages: Stages = .auto,
565 | packing: Packing = .auto,
566 | extensions: []const Extension = &.{},
567 | fields: []const Field,
568 |
569 | pub const Stages = enum {
570 | auto,
571 | two,
572 | three,
573 | };
574 |
575 | pub const Packing = enum {
576 | auto, // as in decide automatically, not as in Type.ContainerLayout.auto
577 | @"packed",
578 | unpacked,
579 |
580 | pub fn write(self: Packing, writer: *std.Io.Writer) !void {
581 | switch (self) {
582 | .auto => unreachable,
583 | .unpacked => try writer.writeAll(".unpacked"),
584 | .@"packed" => try writer.writeAll(".@\"packed\""),
585 | }
586 | }
587 | };
588 |
589 | pub fn hasField(comptime self: *const Table, name: []const u8) bool {
590 | @setEvalBranchQuota(10_000);
591 |
592 | return inline for (self.fields) |f| {
593 | if (std.mem.eql(u8, f.name, name)) {
594 | break true;
595 | }
596 | } else false;
597 | }
598 |
599 | pub fn field(comptime self: *const Table, name: []const u8) Field {
600 | @setEvalBranchQuota(20_000);
601 |
602 | return for (self.fields) |f| {
603 | if (std.mem.eql(u8, f.name, name)) {
604 | break f;
605 | }
606 | } else @compileError("Field '" ++ name ++ "' not found in Table");
607 | }
608 |
609 | // TODO: benchmark this more
610 | const two_stage_size_threshold = 4;
611 |
612 | pub fn resolve(comptime self: *const Table) Table {
613 | if (self.stages != .auto and self.packing != .auto) {
614 | return self.*;
615 | }
616 |
617 | const can_be_packed = switch (self.packing) {
618 | .auto, .@"packed" => blk: {
619 | for (self.fields) |f| {
620 | if (!f.canBePacked()) {
621 | break :blk false;
622 | }
623 | }
624 |
625 | break :blk true;
626 | },
627 | .unpacked => false,
628 | };
629 |
630 | const DataUnpacked = types.Data(.{
631 | .packing = .unpacked,
632 | .fields = self.fields,
633 | });
634 | const DataPacked = if (can_be_packed)
635 | types.Data(.{
636 | .packing = .@"packed",
637 | .fields = self.fields,
638 | })
639 | else
640 | DataUnpacked;
641 |
642 | const unpacked_size = @sizeOf(DataUnpacked);
643 | const packed_size = @sizeOf(DataPacked);
644 | const min_size = @min(unpacked_size, packed_size);
645 |
646 | const stages: Stages = switch (self.stages) {
647 | .auto => blk: {
648 | if (min_size <= two_stage_size_threshold) {
649 | break :blk .two;
650 | } else {
651 | break :blk .three;
652 | }
653 | },
654 | .two => .two,
655 | .three => .three,
656 | };
657 |
658 | const packing: Packing = switch (self.packing) {
659 | .auto => blk: {
660 | if (!can_be_packed) {
661 | break :blk .unpacked;
662 | }
663 |
664 | if (unpacked_size == min_size or unpacked_size <= two_stage_size_threshold) {
665 | break :blk .unpacked;
666 | }
667 |
668 | if (stages == .two) {
669 | if (packed_size <= two_stage_size_threshold) {
670 | break :blk .@"packed";
671 | } else if (3 * packed_size <= 2 * unpacked_size) { // packed saves at least a third
672 | break :blk .@"packed";
673 | } else {
674 | break :blk .unpacked;
675 | }
676 | } else {
677 | if (packed_size <= unpacked_size / 2) { // packed saves at least half
678 | break :blk .@"packed";
679 | } else {
680 | break :blk .unpacked;
681 | }
682 | }
683 | },
684 | .@"packed" => .@"packed",
685 | .unpacked => .unpacked,
686 | };
687 |
688 | return .{
689 | .stages = stages,
690 | .packing = packing,
691 | .name = self.name,
692 | .extensions = self.extensions,
693 | .fields = self.fields,
694 | };
695 | }
696 | };
697 |
698 | pub const Extension = struct {
699 | inputs: []const [:0]const u8,
700 | fields: []const Field,
701 |
702 | compute: *const fn (
703 | allocator: std.mem.Allocator,
704 | cp: u21,
705 | data: anytype,
706 | backing: anytype,
707 | tracking: anytype,
708 | ) std.mem.Allocator.Error!void,
709 |
710 | pub fn hasField(comptime self: *const Extension, name: []const u8) bool {
711 | return inline for (self.fields) |f| {
712 | if (std.mem.eql(u8, f.name, name)) {
713 | break true;
714 | }
715 | } else false;
716 | }
717 |
718 | pub fn field(comptime self: *const Extension, name: []const u8) Field {
719 | return for (self.fields) |f| {
720 | if (std.mem.eql(u8, f.name, name)) {
721 | break f;
722 | }
723 | } else @compileError("Field '" ++ name ++ "' not found in Extension");
724 | }
725 | };
726 |
727 | // This is used by generated build_config.zig, and not intended for direct use
728 | // when using advanced configuration.
729 | pub fn _resolveFields(
730 | comptime config_x: type,
731 | comptime field_names: []const []const u8,
732 | comptime extension_names: []const []const u8,
733 | ) [field_names.len]Field {
734 | @setEvalBranchQuota(100_000);
735 | var result: [field_names.len]Field = undefined;
736 | for (field_names, 0..) |field_name, i| {
737 | result[i] = extensions_loop: inline for (@typeInfo(config_x).@"struct".decls) |decl| {
738 | for (extension_names) |ext_name| {
739 | if (std.mem.eql(u8, decl.name, ext_name)) {
740 | const extension = @field(config_x, decl.name);
741 | if (extension.hasField(field_name)) {
742 | break :extensions_loop extension.field(field_name);
743 | }
744 | }
745 | }
746 | } else default.field(field_name);
747 | }
748 | return result;
749 | }
750 |
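751 | // For illustration only (an assumption about intended advanced usage, not
752 | // code found elsewhere in this repository): a custom table can be assembled
753 | // by reusing entries from `default`, roughly like this:
754 | //
755 | //     const config = @import("config.zig");
756 | //
757 | //     pub const case_only = config.Table{
758 | //         .name = "case_only",
759 | //         .stages = .two,
760 | //         .fields = &.{
761 | //             config.default.field("simple_uppercase_mapping"),
762 | //             config.default.field("simple_lowercase_mapping"),
763 | //             config.default.field("simple_titlecase_mapping"),
764 | //         },
765 | //     };
766 | //
767 | // Because only `stages` is pinned, `Table.resolve` would still choose the
768 | // packing automatically for such a table.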
--------------------------------------------------------------------------------
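
For illustration only (not a test that exists in the repository; it assumes this file is importable as `config.zig`): the `Field.kind` logic above can be exercised against the `default` table at comptime, since `Field` holds a `type` and is therefore comptime-only.

const std = @import("std");
const config = @import("config.zig");

test "kind() distinguishes slice, shift and basic fields" {
    // "name" is a []const u8 slice, "simple_uppercase_mapping" is a shifted
    // optional code point, and "is_alphabetic" is a plain bool.
    const name_kind = comptime config.default.field("name").kind();
    const upper_kind = comptime config.default.field("simple_uppercase_mapping").kind();
    const bool_kind = comptime config.default.field("is_alphabetic").kind();

    try std.testing.expectEqual(config.Field.Kind.slice, name_kind);
    try std.testing.expectEqual(config.Field.Kind.shift, upper_kind);
    try std.testing.expectEqual(config.Field.Kind.basic, bool_kind);
}
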