├── .gitignore ├── mise.toml ├── resources ├── terminal-support-23107.pdf └── wcwidth │ ├── LICENSE_uniseg.txt │ ├── LICENSE_go_runewidth.txt │ ├── LICENSE_ziglyph.txt │ ├── LICENSE_unicode_width.txt │ ├── LICENSE_zg.txt │ ├── LICENSE_wcwidth.txt │ ├── go_runewidth.go │ ├── utf8proc.jl │ ├── uniseg.go │ ├── ziglyph.zig │ ├── LICENSE_utf8proc.md │ ├── zg.zig │ ├── wcwidth.py │ └── wcwidth.c ├── src ├── x │ ├── types.x.zig │ ├── config.x.zig │ ├── types_x │ │ └── grapheme.zig │ ├── config_x │ │ ├── grapheme_break.zig │ │ └── wcwidth.zig │ └── root.zig ├── quirks.zig ├── ascii.zig ├── utf8.zig ├── root.zig ├── code_point.zig ├── get.zig ├── build │ └── test_build_config.zig └── config.zig ├── RESOURCES.md ├── licenses ├── LICENSE_Bjoern_Hoehrmann └── LICENSE_unicode ├── AGENTS.md ├── bin └── fetch-ucd.sh ├── LICENSE.md ├── ucd ├── .gitignore ├── BidiBrackets.txt ├── Blocks.txt └── SpecialCasing.txt └── README.md /.gitignore: -------------------------------------------------------------------------------- 1 | .zig-cache/ 2 | zig-out/ 3 | -------------------------------------------------------------------------------- /mise.toml: -------------------------------------------------------------------------------- 1 | [tools] 2 | zig = "0.15.2" 3 | hyperfine = "1.19.0" 4 | -------------------------------------------------------------------------------- /resources/terminal-support-23107.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jacobsandlund/uucode/HEAD/resources/terminal-support-23107.pdf -------------------------------------------------------------------------------- /src/x/types.x.zig: -------------------------------------------------------------------------------- 1 | pub const grapheme = @import("types_x/grapheme.zig"); 2 | 3 | pub const GraphemeBreakNoControl = grapheme.GraphemeBreakNoControl; 4 | -------------------------------------------------------------------------------- /src/x/config.x.zig: -------------------------------------------------------------------------------- 1 | pub const grapheme_break_no_control = @import("config_x/grapheme_break.zig").grapheme_break_no_control; 2 | pub const wcwidth = @import("config_x/wcwidth.zig").wcwidth; 3 | -------------------------------------------------------------------------------- /RESOURCES.md: -------------------------------------------------------------------------------- 1 | # Resources 2 | 3 | This is an index of the resources used in `uucode`, mostly residing in [./resources](./resources). 
4 | 5 | ## Unicode Character Database 6 | 7 | See [./ucd](./ucd) and 8 | 9 | ## wcwidth 10 | 11 | * `terminal-support-23107.pdf` - 12 | -------------------------------------------------------------------------------- /src/x/types_x/grapheme.zig: -------------------------------------------------------------------------------- 1 | pub const GraphemeBreakNoControl = enum(u5) { 2 | other, 3 | prepend, 4 | regional_indicator, 5 | spacing_mark, 6 | l, 7 | v, 8 | t, 9 | lv, 10 | lvt, 11 | zwj, 12 | zwnj, 13 | extended_pictographic, 14 | emoji_modifier_base, 15 | emoji_modifier, 16 | // extend, == 17 | // zwnj + 18 | // indic_conjunct_break_extend + 19 | // indic_conjunct_break_linker 20 | indic_conjunct_break_extend, 21 | indic_conjunct_break_linker, 22 | indic_conjunct_break_consonant, 23 | }; 24 | -------------------------------------------------------------------------------- /src/x/config_x/grapheme_break.zig: -------------------------------------------------------------------------------- 1 | const std = @import("std"); 2 | const config = @import("config.zig"); 3 | const types_x = @import("types.x.zig"); 4 | 5 | fn compute( 6 | allocator: std.mem.Allocator, 7 | cp: u21, 8 | data: anytype, 9 | backing: anytype, 10 | tracking: anytype, 11 | ) std.mem.Allocator.Error!void { 12 | _ = allocator; 13 | _ = cp; 14 | _ = backing; 15 | _ = tracking; 16 | 17 | data.grapheme_break_no_control = switch (data.grapheme_break) { 18 | .control, .cr, .lf => .other, 19 | inline else => |tag| comptime std.meta.stringToEnum( 20 | types_x.GraphemeBreakNoControl, 21 | @tagName(tag), 22 | ) orelse unreachable, 23 | }; 24 | } 25 | 26 | pub const grapheme_break_no_control = config.Extension{ 27 | .inputs = &.{ 28 | "grapheme_break", 29 | }, 30 | .compute = &compute, 31 | .fields = &.{ 32 | .{ .name = "grapheme_break_no_control", .type = types_x.GraphemeBreakNoControl }, 33 | }, 34 | }; 35 | -------------------------------------------------------------------------------- /licenses/LICENSE_Bjoern_Hoehrmann: -------------------------------------------------------------------------------- 1 | From https://bjoern.hoehrmann.de/utf-8/decoder/dfa/ 2 | 3 | Copyright (c) 2008-2009 Bjoern Hoehrmann 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 6 | 7 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 8 | 9 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
10 | -------------------------------------------------------------------------------- /AGENTS.md: -------------------------------------------------------------------------------- 1 | # uucode (Micro/µ Unicode) 2 | 3 | ## Project Overview 4 | 5 | This library intends to provide a minimal set of unicode functionality to enable Ghostty and similar projects. 6 | 7 | The architecture works in a few layers: 8 | 9 | * Layer 1 (@src/build/Ucd.zig): Parses the Unicode Character Database (UCD). 10 | * Layer 2 (@src/build/tables.zig): Generates table data written to a zig file. 11 | * Layer 3 (@src/root.zig): Exposes methods to fetch information from the built tables. 12 | 13 | ## Build & Commands 14 | 15 | * Build and test with: `zig build test` 16 | * Run a single test: `zig build test -Dtest-filter="test name"` 17 | * Format code with: `zig fmt` 18 | 19 | Always `zig build test` to check that changes still pass. 20 | 21 | ## Code Style 22 | 23 | Follow Zig standard conventions, also keeping imports at the top. 24 | 25 | Prefer self-documenting code to comments, but add detailed comments for anything that needs explanation. 26 | 27 | Never leave trailing whitespace in lines of source code. 28 | 29 | ## Testing 30 | 31 | Add `test ""` blocks directly below code that it is testing, with more blocks at the bottom of module for testing the entire module. 32 | -------------------------------------------------------------------------------- /resources/wcwidth/LICENSE_uniseg.txt: -------------------------------------------------------------------------------- 1 | # cpv: track https://github.com/rivo/uniseg/blob/087b3e4194c1feb0856b68d0e7c425c0994829cf/LICENSE.txt#L1-L21 2 | MIT License 3 | 4 | Copyright (c) 2019 Oliver Kuederle 5 | 6 | Permission is hereby granted, free of charge, to any person obtaining a copy 7 | of this software and associated documentation files (the "Software"), to deal 8 | in the Software without restriction, including without limitation the rights 9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | copies of the Software, and to permit persons to whom the Software is 11 | furnished to do so, subject to the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be included in all 14 | copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 22 | SOFTWARE. 
23 | # cpv: end 24 | -------------------------------------------------------------------------------- /bin/fetch-ucd.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | version="16.0.0" 4 | 5 | base_url="https://www.unicode.org/Public/zipped/${version}" 6 | emoji_url="https://www.unicode.org/Public/emoji/latest" 7 | 8 | mv ucd/.gitignore ucd-gitignore 9 | rm -rf ucd 10 | mkdir -p ucd/Unihan 11 | mv ucd-gitignore ucd/.gitignore 12 | 13 | cd ucd 14 | curl -o ucd.zip "${base_url}/UCD.zip" 15 | unzip ucd.zip 16 | rm ucd.zip 17 | 18 | cd emoji 19 | curl -o emoji-sequences.txt "${emoji_url}/emoji-sequences.txt" 20 | curl -o emoji-test.txt "${emoji_url}/emoji-test.txt" 21 | curl -o emoji-zwj-sequences.txt "${emoji_url}/emoji-zwj-sequences.txt" 22 | cd .. 23 | 24 | cd Unihan 25 | curl -o unihan.zip "${base_url}/Unihan.zip" 26 | unzip unihan.zip 27 | rm unihan.zip 28 | cd .. 29 | 30 | cd .. 31 | 32 | echo 33 | echo "########################################################################" 34 | echo 35 | echo "Done fetching UCD files" 36 | echo 37 | echo "Explicitly add any new files to start parsing to the list of .gitignore" 38 | echo "exceptions. Add a '#' to comment them out, appending '(used)' at the end." 39 | echo 40 | echo "Next, flip the 'is_updating_ucd' flag in 'src/config.zig' to true, and" 41 | echo "'zig build test' once, updating the 'default' config if it needs" 42 | echo "changing, before flipping 'is_updating_ucd' back to false." 43 | echo 44 | -------------------------------------------------------------------------------- /resources/wcwidth/LICENSE_go_runewidth.txt: -------------------------------------------------------------------------------- 1 | # cpv: track https://github.com/mattn/go-runewidth/blob/7770d045cdc691f0fcb87b0364a83f0de2d1a421/LICENSE#L1-L21 2 | The MIT License (MIT) 3 | 4 | Copyright (c) 2016 Yasuhiro Matsumoto 5 | 6 | Permission is hereby granted, free of charge, to any person obtaining a copy 7 | of this software and associated documentation files (the "Software"), to deal 8 | in the Software without restriction, including without limitation the rights 9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | copies of the Software, and to permit persons to whom the Software is 11 | furnished to do so, subject to the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be included in all 14 | copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 22 | SOFTWARE. 
23 | # cpv: end 24 | -------------------------------------------------------------------------------- /resources/wcwidth/LICENSE_ziglyph.txt: -------------------------------------------------------------------------------- 1 | # cpv: track https://codeberg.org/dude_the_builder/ziglyph/src/commit/29760d237219cc4d486f5cd654262d7b0d62d511/LICENSE#L1-L21 2 | MIT License 3 | 4 | Copyright (c) 2021 Jose Colon Rodriguez 5 | 6 | Permission is hereby granted, free of charge, to any person obtaining a copy 7 | of this software and associated documentation files (the "Software"), to deal 8 | in the Software without restriction, including without limitation the rights 9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | copies of the Software, and to permit persons to whom the Software is 11 | furnished to do so, subject to the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be included in all 14 | copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 22 | SOFTWARE. 23 | # cpv: end 24 | -------------------------------------------------------------------------------- /resources/wcwidth/LICENSE_unicode_width.txt: -------------------------------------------------------------------------------- 1 | # cpv: track https://github.com/unicode-rs/unicode-width/blob/9d98411769fe13c7c18cab0b3fbbab29ba8350ea/LICENSE-MIT#L1-L25 2 | Copyright (c) 2015 The Rust Project Developers 3 | 4 | Permission is hereby granted, free of charge, to any 5 | person obtaining a copy of this software and associated 6 | documentation files (the "Software"), to deal in the 7 | Software without restriction, including without 8 | limitation the rights to use, copy, modify, merge, 9 | publish, distribute, sublicense, and/or sell copies of 10 | the Software, and to permit persons to whom the Software 11 | is furnished to do so, subject to the following 12 | conditions: 13 | 14 | The above copyright notice and this permission notice 15 | shall be included in all copies or substantial portions 16 | of the Software. 17 | 18 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF 19 | ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED 20 | TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A 21 | PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT 22 | SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 23 | CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 24 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR 25 | IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 26 | DEALINGS IN THE SOFTWARE. 
27 | # cpv: end 28 | -------------------------------------------------------------------------------- /resources/wcwidth/LICENSE_zg.txt: -------------------------------------------------------------------------------- 1 | # cpv: track https://codeberg.org/atman/zg/src/commit/9427a9e53aaa29ee071f4dcb35b809a699d75aa9/LICENSE#L1-L22 2 | MIT License 3 | 4 | Copyright (c) 2021 Jose Colon Rodriguez 5 | Copyright (c) 2025 Sam Atman and contributors 6 | 7 | Permission is hereby granted, free of charge, to any person obtaining a copy 8 | of this software and associated documentation files (the "Software"), to deal 9 | in the Software without restriction, including without limitation the rights 10 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 11 | copies of the Software, and to permit persons to whom the Software is 12 | furnished to do so, subject to the following conditions: 13 | 14 | The above copyright notice and this permission notice shall be included in all 15 | copies or substantial portions of the Software. 16 | 17 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 20 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 21 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 22 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 | SOFTWARE. 24 | # cpv: end 25 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | # uucode license 2 | 3 | MIT License 4 | 5 | Copyright (c) 2025 Jacob Sandlund 6 | 7 | Permission is hereby granted, free of charge, to any person obtaining a copy of 8 | this software and associated documentation files (the "Software"), to deal in 9 | the Software without restriction, including without limitation the rights to 10 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies 11 | of the Software, and to permit persons to whom the Software is furnished to do 12 | so, subject to the following conditions: 13 | 14 | The above copyright notice and this permission notice shall be included in all 15 | copies or substantial portions of the Software. 16 | 17 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 20 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 21 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 22 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 | SOFTWARE. 
24 | 25 | ## Other licenses 26 | 27 | See [./licenses](./licenses) for licenses of code being used in the repo: 28 | 29 | * [LICENSE_Bjoern_Hoehrmann](./licenses/LICENSE_Bjoern_Hoehrmann) 30 | * [LICENSE_unicode](./licenses/LICENSE_unicode) 31 | -------------------------------------------------------------------------------- /resources/wcwidth/LICENSE_wcwidth.txt: -------------------------------------------------------------------------------- 1 | # cpv: track https://github.com/jquast/wcwidth/blob/915166f9453098a56e87a7fb69e697696cefe206/LICENSE#L1-L27 2 | The MIT License (MIT) 3 | 4 | Copyright (c) 2014 Jeff Quast 5 | 6 | Permission is hereby granted, free of charge, to any person obtaining a copy 7 | of this software and associated documentation files (the "Software"), to deal 8 | in the Software without restriction, including without limitation the rights 9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | copies of the Software, and to permit persons to whom the Software is 11 | furnished to do so, subject to the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be included in all 14 | copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 22 | SOFTWARE. 23 | 24 | Markus Kuhn -- 2007-05-26 (Unicode 5.0) 25 | 26 | Permission to use, copy, modify, and distribute this software 27 | for any purpose and without fee is hereby granted. The author 28 | disclaims all warranties with regard to this software. 29 | # cpv: end 30 | -------------------------------------------------------------------------------- /resources/wcwidth/go_runewidth.go: -------------------------------------------------------------------------------- 1 | // cpv: track https://github.com/mattn/go-runewidth/blob/7770d045cdc691f0fcb87b0364a83f0de2d1a421/runewidth.go#L115-L156 2 | // RuneWidth returns the number of cells in r. 
3 | // See http://www.unicode.org/reports/tr11/ 4 | func (c *Condition) RuneWidth(r rune) int { 5 | if r < 0 || r > 0x10FFFF { 6 | return 0 7 | } 8 | if len(c.combinedLut) > 0 { 9 | return int(c.combinedLut[r>>1]>>(uint(r&1)*4)) & 3 10 | } 11 | // optimized version, verified by TestRuneWidthChecksums() 12 | if !c.EastAsianWidth { 13 | switch { 14 | case r < 0x20: 15 | return 0 16 | case (r >= 0x7F && r <= 0x9F) || r == 0xAD: // nonprint 17 | return 0 18 | case r < 0x300: 19 | return 1 20 | case inTable(r, narrow): 21 | return 1 22 | case inTables(r, nonprint, combining): 23 | return 0 24 | case inTable(r, doublewidth): 25 | return 2 26 | default: 27 | return 1 28 | } 29 | } else { 30 | switch { 31 | case inTables(r, nonprint, combining): 32 | return 0 33 | case inTable(r, narrow): 34 | return 1 35 | case inTables(r, ambiguous, doublewidth): 36 | return 2 37 | case !c.StrictEmojiNeutral && inTables(r, ambiguous, emoji, narrow): 38 | return 2 39 | default: 40 | return 1 41 | } 42 | } 43 | } 44 | // cpv: end 45 | 46 | // cpv: track https://github.com/mattn/go-runewidth/blob/7770d045cdc691f0fcb87b0364a83f0de2d1a421/runewidth.go#L179-L193 47 | // StringWidth return width as you can see 48 | func (c *Condition) StringWidth(s string) (width int) { 49 | g := graphemes.FromString(s) 50 | for g.Next() { 51 | var chWidth int 52 | for _, r := range g.Value() { 53 | chWidth = c.RuneWidth(r) 54 | if chWidth > 0 { 55 | break // Our best guess at this point is to use the width of the first non-zero-width rune. 56 | } 57 | } 58 | width += chWidth 59 | } 60 | return 61 | } 62 | // cpv: end 63 | -------------------------------------------------------------------------------- /src/quirks.zig: -------------------------------------------------------------------------------- 1 | //! cpv: track https://github.com/ghostty-org/ghostty/blob/cb45410dccc381b0dab54110b841dd216eb86d66/src/quirks.zig#L1-L10 2 | //! Inspired by WebKit's quirks.cpp[1], this file centralizes all our 3 | //! sad environment-specific hacks that we have to do to make things work. 4 | //! This is a last resort; if we can find a general solution to a problem, 5 | //! we of course prefer that, but sometimes other software, fonts, etc. are 6 | //! just broken or weird and we have to work around it. 7 | //! 8 | //! [1]: https://github.com/WebKit/WebKit/blob/main/Source/WebCore/page/Quirks.cpp 9 | 10 | const std = @import("std"); 11 | const builtin = @import("builtin"); 12 | // cpv: end 13 | 14 | /// cpv: track https://github.com/ghostty-org/ghostty/blob/cb45410dccc381b0dab54110b841dd216eb86d66/src/quirks.zig#L32-L57 15 | /// We use our own assert function instead of `std.debug.assert`. 16 | /// 17 | /// The only difference between this and the one in 18 | /// the stdlib is that this version is marked inline. 19 | /// 20 | /// The reason for this is that, despite the promises of the doc comment 21 | /// on the stdlib function, the function call to `std.debug.assert` isn't 22 | /// always optimized away in `ReleaseFast` mode, at least in Zig 0.15.2. 23 | /// 24 | /// In the majority of places, the overhead from calling an empty function 25 | /// is negligible, but we have some asserts inside tight loops and hotpaths 26 | /// that cause significant overhead (as much as 15-20%) when they don't get 27 | /// optimized out. 28 | pub const inlineAssert = switch (builtin.mode) { 29 | // In debug builds we just use std.debug.assert because this 30 | // fixes up stack traces. `inline` causes broken stack traces. 
This 31 | // is probably a Zig compiler bug but until it is fixed we have to 32 | // do this for development sanity. 33 | .Debug => std.debug.assert, 34 | 35 | .ReleaseSmall, .ReleaseSafe, .ReleaseFast => (struct { 36 | inline fn assert(ok: bool) void { 37 | if (!ok) unreachable; 38 | } 39 | }).assert, 40 | }; 41 | // cpv: end 42 | -------------------------------------------------------------------------------- /resources/wcwidth/utf8proc.jl: -------------------------------------------------------------------------------- 1 | # cpv: track https://github.com/JuliaStrings/utf8proc/blob/90daf9f396cfec91668758eb9cc54bd5248a6b89/data/data_generator.jl#L202-L249 2 | let ea_widths = read_east_asian_widths("EastAsianWidth.txt") 3 | # Following work by @jiahao, we compute character widths using a combination of 4 | # * character category 5 | # * UAX 11: East Asian Width 6 | # * a few exceptions as needed 7 | # Adapted from http://nbviewer.ipython.org/gist/jiahao/07e8b08bf6d8671e9734 8 | global function derive_char_width(code, category) 9 | # Use a default width of 1 for all character categories that are 10 | # letter/symbol/number-like, as well as for unassigned/private-use chars. 11 | # This provides a useful nonzero fallback for new codepoints when a new 12 | # Unicode version has been released. 13 | width = 1 14 | 15 | # Various zero-width categories 16 | # 17 | # "Sk" not included in zero width - see issue #167 18 | if category in ("Mn", "Mc", "Me", "Zl", "Zp", "Cc", "Cf", "Cs") 19 | width = 0 20 | end 21 | 22 | # Widths from UAX #11: East Asian Width 23 | eaw = get(ea_widths, code, nothing) 24 | if !isnothing(eaw) 25 | width = eaw < 0 ? 1 : eaw 26 | end 27 | 28 | # A few exceptional cases, found by manual comparison to other wcwidth 29 | # functions and similar checks. 30 | if category == "Mn" 31 | width = 0 32 | end 33 | 34 | if code == 0x00ad 35 | # Soft hyphen is typically printed as a hyphen (-) in terminals. 36 | width = 1 37 | elseif code == 0x2028 || code == 0x2029 38 | #By definition, should have zero width (on the same line) 39 | #0x002028 '\u2028' category: Zl name: LINE SEPARATOR/ 40 | #0x002029 '\u2029' category: Zp name: PARAGRAPH SEPARATOR/ 41 | width = 0 42 | end 43 | 44 | return width 45 | end 46 | global function is_ambiguous_width(code) 47 | return get(ea_widths, code, 0) < 0 48 | end 49 | end 50 | # cpv: end 51 | -------------------------------------------------------------------------------- /licenses/LICENSE_unicode: -------------------------------------------------------------------------------- 1 | https://www.unicode.org/license.txt 2 | 3 | UNICODE LICENSE V3 4 | 5 | COPYRIGHT AND PERMISSION NOTICE 6 | 7 | Copyright © 1991-2025 Unicode, Inc. 8 | 9 | NOTICE TO USER: Carefully read the following legal agreement. BY 10 | DOWNLOADING, INSTALLING, COPYING OR OTHERWISE USING DATA FILES, AND/OR 11 | SOFTWARE, YOU UNEQUIVOCALLY ACCEPT, AND AGREE TO BE BOUND BY, ALL OF THE 12 | TERMS AND CONDITIONS OF THIS AGREEMENT. IF YOU DO NOT AGREE, DO NOT 13 | DOWNLOAD, INSTALL, COPY, DISTRIBUTE OR USE THE DATA FILES OR SOFTWARE. 
14 | 15 | Permission is hereby granted, free of charge, to any person obtaining a 16 | copy of data files and any associated documentation (the "Data Files") or 17 | software and any associated documentation (the "Software") to deal in the 18 | Data Files or Software without restriction, including without limitation 19 | the rights to use, copy, modify, merge, publish, distribute, and/or sell 20 | copies of the Data Files or Software, and to permit persons to whom the 21 | Data Files or Software are furnished to do so, provided that either (a) 22 | this copyright and permission notice appear with all copies of the Data 23 | Files or Software, or (b) this copyright and permission notice appear in 24 | associated Documentation. 25 | 26 | THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY 27 | KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 28 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF 29 | THIRD PARTY RIGHTS. 30 | 31 | IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS NOTICE 32 | BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, 33 | OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, 34 | WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, 35 | ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THE DATA 36 | FILES OR SOFTWARE. 37 | 38 | Except as contained in this notice, the name of a copyright holder shall 39 | not be used in advertising or otherwise to promote the sale, use or other 40 | dealings in these Data Files or Software without prior written 41 | authorization of the copyright holder. 42 | -------------------------------------------------------------------------------- /src/ascii.zig: -------------------------------------------------------------------------------- 1 | const std = @import("std"); 2 | 3 | /// Returns whether the code point is alphanumeric: A-Z, a-z, or 0-9. 4 | pub fn isAlphanumeric(c: u21) bool { 5 | return switch (c) { 6 | '0'...'9', 'A'...'Z', 'a'...'z' => true, 7 | else => false, 8 | }; 9 | } 10 | 11 | /// Returns whether the code point is alphabetic: A-Z or a-z. 12 | pub fn isAlphabetic(c: u21) bool { 13 | return switch (c) { 14 | 'A'...'Z', 'a'...'z' => true, 15 | else => false, 16 | }; 17 | } 18 | 19 | /// Returns whether the code point is a control character. 20 | /// 21 | /// See also: `control_code` 22 | pub fn isControl(c: u21) bool { 23 | return c <= std.ascii.control_code.us or c == std.ascii.control_code.del; 24 | } 25 | 26 | /// Returns whether the code point is a digit. 27 | pub fn isDigit(c: u21) bool { 28 | return switch (c) { 29 | '0'...'9' => true, 30 | else => false, 31 | }; 32 | } 33 | 34 | /// Returns whether the code point is a lowercase letter. 35 | pub fn isLower(c: u21) bool { 36 | return switch (c) { 37 | 'a'...'z' => true, 38 | else => false, 39 | }; 40 | } 41 | 42 | /// Returns whether the code point is printable and has some graphical representation, 43 | /// including the space code point. 44 | pub fn isPrint(c: u21) bool { 45 | return isAscii(c) and !isControl(c); 46 | } 47 | 48 | /// Returns whether this code point is included in `whitespace`. 49 | pub fn isWhitespace(c: u21) bool { 50 | return switch (c) { 51 | ' ', '\t'...'\r' => true, 52 | else => false, 53 | }; 54 | } 55 | 56 | /// Returns whether the code point is an uppercase letter. 
57 | pub fn isUpper(c: u21) bool { 58 | return switch (c) { 59 | 'A'...'Z' => true, 60 | else => false, 61 | }; 62 | } 63 | 64 | /// Returns whether the code point is a hexadecimal digit: A-F, a-f, or 0-9. 65 | pub fn isHex(c: u21) bool { 66 | return switch (c) { 67 | '0'...'9', 'A'...'F', 'a'...'f' => true, 68 | else => false, 69 | }; 70 | } 71 | 72 | /// Returns whether the code point is a 7-bit ASCII character. 73 | pub fn isAscii(c: u21) bool { 74 | return c < 128; 75 | } 76 | 77 | /// Uppercases the code point and returns it as-is if already uppercase or not a letter. 78 | pub fn toUpper(c: u21) u21 { 79 | const mask = @as(u21, @intFromBool(isLower(c))) << 5; 80 | return c ^ mask; 81 | } 82 | 83 | /// Lowercases the code point and returns it as-is if already lowercase or not a letter. 84 | pub fn toLower(c: u21) u21 { 85 | const mask = @as(u21, @intFromBool(isUpper(c))) << 5; 86 | return c | mask; 87 | } 88 | -------------------------------------------------------------------------------- /ucd/.gitignore: -------------------------------------------------------------------------------- 1 | # We use explicit ignore instead of a blanket ignore rule 2 | # plus un-ignoring needed files because `fd` doesn't seem 3 | # to list un-ignored files correctly 4 | 5 | ArabicShaping.txt 6 | # BidiBrackets.txt (used) 7 | BidiCharacterTest.txt 8 | BidiMirroring.txt 9 | BidiTest.txt 10 | # Blocks.txt (used) 11 | CJKRadicals.txt 12 | # CaseFolding.txt (used) 13 | CompositionExclusions.txt 14 | DerivedAge.txt 15 | # DerivedCoreProperties.txt (used) 16 | DerivedNormalizationProps.txt 17 | DoNotEmit.txt 18 | EastAsianWidth.txt 19 | EmojiSources.txt 20 | EquivalentUnifiedIdeograph.txt 21 | HangulSyllableType.txt 22 | Index.txt 23 | IndicPositionalCategory.txt 24 | IndicSyllabicCategory.txt 25 | Jamo.txt 26 | LineBreak.txt 27 | NameAliases.txt 28 | NamedSequences.txt 29 | NamedSequencesProv.txt 30 | NamesList.html 31 | NamesList.txt 32 | NormalizationCorrections.txt 33 | NormalizationTest.txt 34 | NushuSources.txt 35 | PropList.txt 36 | PropertyAliases.txt 37 | PropertyValueAliases.txt 38 | ReadMe.txt 39 | ScriptExtensions.txt 40 | Scripts.txt 41 | # SpecialCasing.txt (used) 42 | StandardizedVariants.txt 43 | TangutSources.txt 44 | USourceData.txt 45 | USourceGlyphs.pdf 46 | USourceRSChart.pdf 47 | # UnicodeData.txt (used) 48 | Unihan/Unihan_DictionaryIndices.txt 49 | Unihan/Unihan_DictionaryLikeData.txt 50 | Unihan/Unihan_IRGSources.txt 51 | Unihan/Unihan_NumericValues.txt 52 | Unihan/Unihan_OtherMappings.txt 53 | Unihan/Unihan_RadicalStrokeCounts.txt 54 | Unihan/Unihan_Readings.txt 55 | Unihan/Unihan_Variants.txt 56 | Unikemet.txt 57 | VerticalOrientation.txt 58 | # auxiliary/GraphemeBreakProperty.txt (used) 59 | auxiliary/GraphemeBreakTest.html 60 | # auxiliary/GraphemeBreakTest.txt 61 | auxiliary/LineBreakTest.html 62 | auxiliary/LineBreakTest.txt 63 | auxiliary/SentenceBreakProperty.txt 64 | auxiliary/SentenceBreakTest.html 65 | auxiliary/SentenceBreakTest.txt 66 | auxiliary/WordBreakProperty.txt 67 | auxiliary/WordBreakTest.html 68 | auxiliary/WordBreakTest.txt 69 | emoji/ReadMe.txt 70 | # emoji/emoji-data.txt (used) 71 | emoji/emoji-sequences.txt 72 | emoji/emoji-test.txt 73 | # emoji/emoji-variation-sequences.txt (used) 74 | # emoji/emoji-zwj-sequences.txt (for reference) 75 | # extracted/DerivedBidiClass.txt (used) 76 | extracted/DerivedBinaryProperties.txt 77 | extracted/DerivedCombiningClass.txt 78 | extracted/DerivedDecompositionType.txt 79 | # extracted/DerivedEastAsianWidth.txt (used) 80 | # 
extracted/DerivedGeneralCategory.txt (unused, but useful for reference) 81 | extracted/DerivedJoiningGroup.txt 82 | extracted/DerivedJoiningType.txt 83 | extracted/DerivedLineBreak.txt 84 | extracted/DerivedName.txt 85 | extracted/DerivedNumericType.txt 86 | extracted/DerivedNumericValues.txt 87 | -------------------------------------------------------------------------------- /resources/wcwidth/uniseg.go: -------------------------------------------------------------------------------- 1 | // cpv: track https://github.com/rivo/uniseg/blob/087b3e4194c1feb0856b68d0e7c425c0994829cf/width.go#L3-L61 2 | // EastAsianAmbiguousWidth specifies the monospace width for East Asian 3 | // characters classified as Ambiguous. The default is 1 but some rare fonts 4 | // render them with a width of 2. 5 | var EastAsianAmbiguousWidth = 1 6 | 7 | // runeWidth returns the monospace width for the given rune. The provided 8 | // grapheme property is a value mapped by the [graphemeCodePoints] table. 9 | // 10 | // Every rune has a width of 1, except for runes with the following properties 11 | // (evaluated in this order): 12 | // 13 | // - Control, CR, LF, Extend, ZWJ: Width of 0 14 | // - \u2e3a, TWO-EM DASH: Width of 3 15 | // - \u2e3b, THREE-EM DASH: Width of 4 16 | // - East-Asian width Fullwidth and Wide: Width of 2 (Ambiguous and Neutral 17 | // have a width of 1) 18 | // - Regional Indicator: Width of 2 19 | // - Extended Pictographic: Width of 2, unless Emoji Presentation is "No". 20 | func runeWidth(r rune, graphemeProperty int) int { 21 | switch graphemeProperty { 22 | case prControl, prCR, prLF, prExtend, prZWJ: 23 | return 0 24 | case prRegionalIndicator: 25 | return 2 26 | case prExtendedPictographic: 27 | if property(emojiPresentation, r) == prEmojiPresentation { 28 | return 2 29 | } 30 | return 1 31 | } 32 | 33 | switch r { 34 | case 0x2e3a: 35 | return 3 36 | case 0x2e3b: 37 | return 4 38 | } 39 | 40 | switch propertyEastAsianWidth(r) { 41 | case prW, prF: 42 | return 2 43 | case prA: 44 | return EastAsianAmbiguousWidth 45 | } 46 | 47 | return 1 48 | } 49 | 50 | // StringWidth returns the monospace width for the given string, that is, the 51 | // number of same-size cells to be occupied by the string. 52 | func StringWidth(s string) (width int) { 53 | state := -1 54 | for len(s) > 0 { 55 | var w int 56 | _, s, w, state = FirstGraphemeClusterInString(s, state) 57 | width += w 58 | } 59 | return 60 | } 61 | // cpv: end 62 | 63 | // cpv: track https://github.com/rivo/uniseg/blob/087b3e4194c1feb0856b68d0e7c425c0994829cf/grapheme.go#L287-L345 64 | // FirstGraphemeClusterInString is like [FirstGraphemeCluster] but its input and 65 | // outputs are strings. 66 | func FirstGraphemeClusterInString(str string, state int) (cluster, rest string, width, newState int) { 67 | // An empty string returns nothing. 68 | if len(str) == 0 { 69 | return 70 | } 71 | 72 | // Extract the first rune. 73 | r, length := utf8.DecodeRuneInString(str) 74 | if len(str) <= length { // If we're already past the end, there is nothing else to parse. 75 | var prop int 76 | if state < 0 { 77 | prop = propertyGraphemes(r) 78 | } else { 79 | prop = state >> shiftGraphemePropState 80 | } 81 | return str, "", runeWidth(r, prop), grAny | (prop << shiftGraphemePropState) 82 | } 83 | 84 | // If we don't know the state, determine it now. 
85 | var firstProp int 86 | if state < 0 { 87 | state, firstProp, _ = transitionGraphemeState(state, r) 88 | } else { 89 | firstProp = state >> shiftGraphemePropState 90 | } 91 | width += runeWidth(r, firstProp) 92 | 93 | // Transition until we find a boundary. 94 | for { 95 | var ( 96 | prop int 97 | boundary bool 98 | ) 99 | 100 | r, l := utf8.DecodeRuneInString(str[length:]) 101 | state, prop, boundary = transitionGraphemeState(state&maskGraphemeState, r) 102 | 103 | if boundary { 104 | return str[:length], str[length:], width, state | (prop << shiftGraphemePropState) 105 | } 106 | 107 | if firstProp == prExtendedPictographic { 108 | if r == vs15 { 109 | width = 1 110 | } else if r == vs16 { 111 | width = 2 112 | } 113 | } else if firstProp != prRegionalIndicator && firstProp != prL { 114 | width += runeWidth(r, prop) 115 | } 116 | 117 | length += l 118 | if len(str) <= length { 119 | return str, "", width, grAny | (prop << shiftGraphemePropState) 120 | } 121 | } 122 | } 123 | // cpv: end 124 | -------------------------------------------------------------------------------- /resources/wcwidth/ziglyph.zig: -------------------------------------------------------------------------------- 1 | //! cpv: track https://codeberg.org/dude_the_builder/ziglyph/src/commit/29760d237219cc4d486f5cd654262d7b0d62d511/src/display_width.zig#L15-L124 2 | fn isAsciiStr(str: []const u8) bool { 3 | return for (str) |b| { 4 | if (b > 127) break false; 5 | } else true; 6 | } 7 | 8 | /// AmbiguousWidth determines the width of ambiguous characters according to the context. In an 9 | /// East Asian context, the width of ambiguous code points should be 2 (full), and 1 (half) 10 | /// in non-East Asian contexts. The most common use case is `half`. 11 | pub const AmbiguousWidth = enum(u2) { 12 | half = 1, 13 | full = 2, 14 | }; 15 | 16 | /// codePointWidth returns how many cells (or columns) wide `cp` should be when rendered in a 17 | /// fixed-width font. 18 | pub fn codePointWidth(cp: u21, am_width: AmbiguousWidth) i3 { 19 | if (cp == 0x000 or cp == 0x0005 or cp == 0x0007 or (cp >= 0x000A and cp <= 0x000F)) { 20 | // Control. 21 | return 0; 22 | } else if (cp == 0x0008 or cp == 0x007F) { 23 | // backspace and DEL 24 | return -1; 25 | } else if (cp == 0x00AD) { 26 | // soft-hyphen 27 | return 1; 28 | } else if (cp == 0x2E3A) { 29 | // two-em dash 30 | return 2; 31 | } else if (cp == 0x2E3B) { 32 | // three-em dash 33 | return 3; 34 | } else if (cats.isEnclosingMark(cp) or cats.isNonspacingMark(cp)) { 35 | // Combining Marks. 36 | return 0; 37 | } else if (cats.isFormat(cp) and (!(cp >= 0x0600 and cp <= 0x0605) and cp != 0x061C and 38 | cp != 0x06DD and cp != 0x08E2)) 39 | { 40 | // Format except Arabic. 41 | return 0; 42 | } else if ((cp >= 0x1160 and cp <= 0x11FF) or (cp >= 0x2060 and cp <= 0x206F) or 43 | (cp >= 0xFFF0 and cp <= 0xFFF8) or (cp >= 0xE0000 and cp <= 0xE0FFF)) 44 | { 45 | // Hangul syllable and ignorable. 
46 | return 0; 47 | } else if ((cp >= 0x3400 and cp <= 0x4DBF) or (cp >= 0x4E00 and cp <= 0x9FFF) or 48 | (cp >= 0xF900 and cp <= 0xFAFF) or (cp >= 0x20000 and cp <= 0x2FFFD) or 49 | (cp >= 0x30000 and cp <= 0x3FFFD)) 50 | { 51 | return 2; 52 | } else if (eaw.isWide(cp) or eaw.isFullwidth(cp)) { 53 | return 2; 54 | } else if (gbp.isRegionalIndicator(cp)) { 55 | return 2; 56 | } else if (eaw.isAmbiguous(cp)) { 57 | return @intFromEnum(am_width); 58 | } else { 59 | return 1; 60 | } 61 | } 62 | 63 | /// strWidth returns how many cells (or columns) wide `str` should be when rendered in a 64 | /// fixed-width font. 65 | pub fn strWidth(str: []const u8, am_width: AmbiguousWidth) !usize { 66 | var total: isize = 0; 67 | 68 | // ASCII bytes are all width == 1. 69 | if (isAsciiStr(str)) { 70 | for (str) |b| { 71 | // Backspace and DEL 72 | if (b == 8 or b == 127) { 73 | total -= 1; 74 | continue; 75 | } 76 | 77 | // Control 78 | if (b < 32) continue; 79 | 80 | // All other ASCII. 81 | total += 1; 82 | } 83 | 84 | return if (total > 0) @intCast(total) else 0; 85 | } 86 | 87 | var giter = GraphemeIterator.init(str); 88 | 89 | while (giter.next()) |gc| { 90 | var cp_iter = (try unicode.Utf8View.init(str[gc.offset .. gc.offset + gc.len])).iterator(); 91 | 92 | while (cp_iter.nextCodepoint()) |cp| { 93 | var w = codePointWidth(cp, am_width); 94 | 95 | if (w != 0) { 96 | // Only adding width of first non-zero-width code point. 97 | if (emoji.isExtendedPictographic(cp)) { 98 | if (cp_iter.nextCodepoint()) |ncp| { 99 | // emoji text sequence. 100 | if (ncp == 0xFE0E) w = 1; 101 | if (ncp == 0xFE0F) w = 2; 102 | } 103 | } 104 | total += w; 105 | break; 106 | } 107 | } 108 | } 109 | 110 | return if (total > 0) @intCast(total) else 0; 111 | } 112 | // cpv: end 113 | -------------------------------------------------------------------------------- /src/utf8.zig: -------------------------------------------------------------------------------- 1 | const std = @import("std"); 2 | 3 | // See https://bjoern.hoehrmann.de/utf-8/decoder/dfa/ 4 | // and licenses/LICENSE_Bjoern_Hoehrmann 5 | 6 | const UTF8_ACCEPT = 0; 7 | const UTF8_REJECT = 12; 8 | 9 | // The first part of the table maps bytes to character classes to reduce the 10 | // size of the transition table and create bitmasks. 11 | // zig fmt: off 12 | const utf8d = [_]u8{ 13 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 14 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 15 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 16 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 17 | 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, 18 | 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 19 | 8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 20 | 10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8, 21 | }; 22 | 23 | // The second part is a transition table that maps a combination of a state of 24 | // the automaton and a character class to a state. 
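//
// As a worked example (added commentary; the numbers below are derived from
// these two tables and `decodeByte` below, not taken from the upstream
// decoder): decoding "é" (U+00E9, bytes 0xC3 0xA9) proceeds as
//
//   0xC3: class utf8d[0xC3] = 2, cp = (0xFF >> 2) & 0xC3 = 0x03,
//         state = state_utf8d[UTF8_ACCEPT + 2] = 24 (one continuation byte expected)
//   0xA9: class utf8d[0xA9] = 7, cp = (0x03 << 6) | (0xA9 & 0x3F) = 0xE9,
//         state = state_utf8d[24 + 7] = 0 = UTF8_ACCEPT
//
// so the iterator yields U+00E9. A malformed byte instead drives the state to
// UTF8_REJECT (12), and `Iterator.next` returns U+FFFD.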
25 | const state_utf8d = [_]u8{ 26 | 0,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12, 27 | 12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12, 28 | 12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12, 29 | 12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12, 30 | 12,36,12,12,12,12,12,12,12,12,12,12, 31 | }; 32 | // zig fmt: on 33 | 34 | fn decodeByte(state: *usize, cp: *u21, byte: u8) void { 35 | const class: std.math.IntFittingRange(0, 11) = @intCast(utf8d[byte]); 36 | const mask: u21 = 0xff; 37 | 38 | cp.* = if (state.* != UTF8_ACCEPT) 39 | (byte & 0x3f) | (cp.* << 6) 40 | else 41 | (mask >> class) & byte; 42 | 43 | state.* = state_utf8d[state.* + class]; 44 | } 45 | 46 | fn isDoneDecoding(state: usize) bool { 47 | return state == UTF8_ACCEPT or state == UTF8_REJECT; 48 | } 49 | 50 | pub const Iterator = struct { 51 | // This "i" is part of the documented API of this iterator, pointing to the 52 | // current location of the iterator in `bytes`. 53 | i: usize = 0, 54 | bytes: []const u8, 55 | 56 | const Self = @This(); 57 | 58 | pub fn init(bytes: []const u8) Self { 59 | return .{ 60 | .bytes = bytes, 61 | }; 62 | } 63 | 64 | pub fn next(self: *Self) ?u21 { 65 | if (self.i >= self.bytes.len) return null; 66 | 67 | var cp: u21 = 0; 68 | var state: usize = UTF8_ACCEPT; 69 | 70 | while (true) { 71 | decodeByte(&state, &cp, self.bytes[self.i]); 72 | self.i += 1; 73 | if (isDoneDecoding(state) or self.i >= self.bytes.len) break; 74 | } 75 | 76 | if (state == UTF8_ACCEPT) return cp; 77 | return 0xFFFD; // Replacement character 78 | } 79 | 80 | pub fn peek(self: Self) ?u21 { 81 | var it = self; 82 | return it.next(); 83 | } 84 | }; 85 | 86 | test "Iterator for ascii" { 87 | var it = Iterator.init("abc"); 88 | try std.testing.expectEqual('a', it.next()); 89 | try std.testing.expectEqual(1, it.i); 90 | try std.testing.expectEqual('b', it.peek()); 91 | try std.testing.expectEqual('b', it.next()); 92 | try std.testing.expectEqual('c', it.next()); 93 | try std.testing.expectEqual(null, it.peek()); 94 | try std.testing.expectEqual(null, it.next()); 95 | try std.testing.expectEqual(null, it.next()); 96 | } 97 | 98 | test "Iterator for emoji" { 99 | var it = Iterator.init("😀😅😻👺"); 100 | try std.testing.expectEqual(0x1F600, it.next()); 101 | try std.testing.expectEqual(4, it.i); 102 | try std.testing.expectEqual(0x1F605, it.peek()); 103 | try std.testing.expectEqual(4, it.i); 104 | try std.testing.expectEqual(0x1F605, it.next()); 105 | try std.testing.expectEqual(8, it.i); 106 | try std.testing.expectEqual(0x1F63B, it.next()); 107 | try std.testing.expectEqual(12, it.i); 108 | try std.testing.expectEqual(0x1F47A, it.next()); 109 | try std.testing.expectEqual(16, it.i); 110 | try std.testing.expectEqual(null, it.next()); 111 | try std.testing.expectEqual(16, it.i); 112 | } 113 | 114 | test "Iterator overlong utf8" { 115 | var it = Iterator.init("\xf0\x80\x80\xaf"); 116 | try std.testing.expectEqual(0xFFFD, it.next()); 117 | try std.testing.expectEqual(0xFFFD, it.next()); 118 | try std.testing.expectEqual(0xFFFD, it.next()); 119 | try std.testing.expectEqual(null, it.next()); 120 | try std.testing.expectEqual(null, it.next()); 121 | } 122 | -------------------------------------------------------------------------------- /resources/wcwidth/LICENSE_utf8proc.md: -------------------------------------------------------------------------------- 1 | 2 | ## utf8proc license ## 3 | 4 | **utf8proc** is a software package 
originally developed 5 | by Jan Behrens and the rest of the Public Software Group, who 6 | deserve nearly all of the credit for this library, that is now maintained by the Julia-language developers. Like the original utf8proc, 7 | whose copyright and license statements are reproduced below, all new 8 | work on the utf8proc library is licensed under the [MIT "expat" 9 | license](http://opensource.org/licenses/MIT): 10 | 11 | *Copyright © 2014-2021 by Steven G. Johnson, Jiahao Chen, Tony Kelman, Jonas Fonseca, and other contributors listed in the git history.* 12 | 13 | Permission is hereby granted, free of charge, to any person obtaining a 14 | copy of this software and associated documentation files (the "Software"), 15 | to deal in the Software without restriction, including without limitation 16 | the rights to use, copy, modify, merge, publish, distribute, sublicense, 17 | and/or sell copies of the Software, and to permit persons to whom the 18 | Software is furnished to do so, subject to the following conditions: 19 | 20 | The above copyright notice and this permission notice shall be included in 21 | all copies or substantial portions of the Software. 22 | 23 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 24 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 25 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 26 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 27 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 28 | FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 29 | DEALINGS IN THE SOFTWARE. 30 | 31 | ## Original utf8proc license ## 32 | 33 | *Copyright (c) 2009, 2013 Public Software Group e. V., Berlin, Germany* 34 | 35 | Permission is hereby granted, free of charge, to any person obtaining a 36 | copy of this software and associated documentation files (the "Software"), 37 | to deal in the Software without restriction, including without limitation 38 | the rights to use, copy, modify, merge, publish, distribute, sublicense, 39 | and/or sell copies of the Software, and to permit persons to whom the 40 | Software is furnished to do so, subject to the following conditions: 41 | 42 | The above copyright notice and this permission notice shall be included in 43 | all copies or substantial portions of the Software. 44 | 45 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 46 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 47 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 48 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 49 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 50 | FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 51 | DEALINGS IN THE SOFTWARE. 52 | 53 | ## Unicode data license ## 54 | 55 | This software contains data (`utf8proc_data.c`) derived from processing 56 | the Unicode data files. The following license applies to that data: 57 | 58 | **COPYRIGHT AND PERMISSION NOTICE** 59 | 60 | *Copyright (c) 1991-2007 Unicode, Inc. All rights reserved. 
Distributed 61 | under the Terms of Use in http://www.unicode.org/copyright.html.* 62 | 63 | Permission is hereby granted, free of charge, to any person obtaining a 64 | copy of the Unicode data files and any associated documentation (the "Data 65 | Files") or Unicode software and any associated documentation (the 66 | "Software") to deal in the Data Files or Software without restriction, 67 | including without limitation the rights to use, copy, modify, merge, 68 | publish, distribute, and/or sell copies of the Data Files or Software, and 69 | to permit persons to whom the Data Files or Software are furnished to do 70 | so, provided that (a) the above copyright notice(s) and this permission 71 | notice appear with all copies of the Data Files or Software, (b) both the 72 | above copyright notice(s) and this permission notice appear in associated 73 | documentation, and (c) there is clear notice in each modified Data File or 74 | in the Software as well as in the documentation associated with the Data 75 | File(s) or Software that the data or software has been modified. 76 | 77 | THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY 78 | KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 79 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF 80 | THIRD PARTY RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS 81 | INCLUDED IN THIS NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR 82 | CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF 83 | USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER 84 | TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR 85 | PERFORMANCE OF THE DATA FILES OR SOFTWARE. 86 | 87 | Except as contained in this notice, the name of a copyright holder shall 88 | not be used in advertising or otherwise to promote the sale, use or other 89 | dealings in these Data Files or Software without prior written 90 | authorization of the copyright holder. 91 | 92 | Unicode and the Unicode logo are trademarks of Unicode, Inc., and may be 93 | registered in some jurisdictions. All other trademarks and registered 94 | trademarks mentioned herein are the property of their respective owners. 
95 | 96 | -------------------------------------------------------------------------------- /src/root.zig: -------------------------------------------------------------------------------- 1 | const std = @import("std"); 2 | const getpkg = @import("get.zig"); 3 | pub const config = @import("config.zig"); 4 | pub const types = @import("types.zig"); 5 | pub const ascii = @import("ascii.zig"); 6 | pub const grapheme = @import("grapheme.zig"); 7 | pub const code_point = @import("code_point.zig"); 8 | pub const utf8 = @import("utf8.zig"); 9 | pub const x = @import("x/root.zig"); 10 | const testing = std.testing; 11 | 12 | pub const FieldEnum = getpkg.FieldEnum; 13 | pub const TypeOf = getpkg.TypeOf; 14 | pub const TypeOfAll = getpkg.TypeOfAll; 15 | pub const get = getpkg.get; 16 | pub const getAll = getpkg.getAll; 17 | pub const hasField = getpkg.hasField; 18 | 19 | test { 20 | std.testing.refAllDeclsRecursive(@This()); 21 | } 22 | 23 | test "name" { 24 | try testing.expect(std.mem.eql(u8, get(.name, 65), "LATIN CAPITAL LETTER A")); 25 | } 26 | 27 | test "is_alphabetic" { 28 | try testing.expect(get(.is_alphabetic, 65)); // 'A' 29 | try testing.expect(get(.is_alphabetic, 97)); // 'a' 30 | try testing.expect(!get(.is_alphabetic, 0)); 31 | } 32 | 33 | test "case_folding_simple" { 34 | try testing.expectEqual(97, get(.case_folding_simple, 65)); // 'a' 35 | try testing.expectEqual(97, get(.case_folding_simple, 97)); // 'a' 36 | } 37 | 38 | test "simple_uppercase_mapping" { 39 | try testing.expectEqual(65, get(.simple_uppercase_mapping, 97)); // 'a' 40 | try testing.expectEqual(null, get(.simple_uppercase_mapping, 65)); // 'A' 41 | } 42 | 43 | test "generalCategory" { 44 | try testing.expect(get(.general_category, 65) == .letter_uppercase); // 'A' 45 | } 46 | 47 | test "getAll" { 48 | const d1 = getAll("1", 65); 49 | try testing.expect(d1.general_category == .letter_uppercase); 50 | try testing.expect(d1.case_folding_simple.unshift(65) == 97); 51 | 52 | const d_checks = getAll("checks", 65); 53 | // auto should become packed for these checks 54 | try testing.expectEqual(.@"packed", @typeInfo(TypeOfAll("checks")).@"struct".layout); 55 | try testing.expect(d_checks.simple_uppercase_mapping.unshift(65) == null); 56 | try testing.expect(d_checks.is_alphabetic); 57 | try testing.expect(d_checks.is_uppercase); 58 | try testing.expect(!d_checks.is_lowercase); 59 | } 60 | 61 | test "get extension foo" { 62 | try testing.expectEqual(0, get(.foo, 65)); 63 | try testing.expectEqual(3, get(.foo, 0)); 64 | } 65 | 66 | test "get extension emoji_odd_or_even" { 67 | try testing.expectEqual(.odd_emoji, get(.emoji_odd_or_even, 0x1F34B)); // 🍋 68 | } 69 | 70 | test "get packed optional enum works" { 71 | try testing.expectEqual(.odd_emoji, get(.opt_emoji_odd_or_even, 0x1F34B)); // 🍋 72 | try testing.expectEqual(null, get(.opt_emoji_odd_or_even, 0x01D8)); // ǘ 73 | } 74 | 75 | test "get packed optional bool works" { 76 | try testing.expectEqual(true, get(.maybe_bit, 0x1200)); 77 | try testing.expectEqual(false, get(.maybe_bit, 0x1235)); 78 | try testing.expectEqual(null, get(.maybe_bit, 0x1236)); 79 | } 80 | 81 | test "get union unpacked, shift" { 82 | try testing.expectEqual(0x1234, get(.next_or_prev, 0x1233).next); 83 | try testing.expectEqual(0x1200, get(.next_or_prev, 0x1201).prev); 84 | try testing.expectEqual(.none, get(.next_or_prev, 0x1235)); 85 | } 86 | 87 | test "get union unpacked, direct" { 88 | try testing.expectEqual(0x1234, get(.next_or_prev_direct, 0x1233).next); 89 | try testing.expectEqual(0x1200, 
get(.next_or_prev_direct, 0x1201).prev); 90 | try testing.expectEqual(.none, get(.next_or_prev_direct, 0x1235)); 91 | } 92 | 93 | test "get union packed, shift" { 94 | try testing.expectEqual(5, @bitSizeOf(@FieldType(TypeOfAll("pack"), "bidi_paired_bracket"))); 95 | try testing.expectEqual(0x0029, get(.bidi_paired_bracket, 0x0028).open); 96 | try testing.expectEqual(0x2997, get(.bidi_paired_bracket, 0x2998).close); 97 | try testing.expectEqual(.none, get(.bidi_paired_bracket, 0x4000)); 98 | } 99 | 100 | test "get union packed, direct" { 101 | try testing.expectEqual(0x0029, get(.bidi_paired_bracket_direct, 0x0028).open); 102 | try testing.expectEqual(0x2997, get(.bidi_paired_bracket_direct, 0x2998).close); 103 | try testing.expectEqual(.none, get(.bidi_paired_bracket_direct, 0x4000)); 104 | } 105 | 106 | test "get bidi_class" { 107 | try testing.expectEqual(.arabic_number, get(.bidi_class, 0x0600)); 108 | } 109 | 110 | test "special_casing_condition" { 111 | const conditions1 = get(.special_casing_condition, 65); // 'A' 112 | try testing.expectEqual(0, conditions1.len); 113 | 114 | // Greek Capital Sigma (U+03A3) which has Final_Sigma condition 115 | const conditions = get(.special_casing_condition, 0x03A3); 116 | try testing.expectEqual(1, conditions.len); 117 | try testing.expectEqual(types.SpecialCasingCondition.final_sigma, conditions[0]); 118 | } 119 | 120 | test "special_lowercase_mapping" { 121 | var buffer: [1]u21 = undefined; 122 | 123 | // Greek Capital Sigma (U+03A3) which has Final_Sigma condition 124 | const mapping = get(.special_lowercase_mapping, 0x03A3).with(&buffer, 0x03A3); 125 | try testing.expectEqual(1, mapping.len); 126 | try testing.expectEqual(0x03C2, mapping[0]); // Should map to Greek Small Letter Final Sigma 127 | } 128 | 129 | test "info extension" { 130 | // ǰ -> J 131 | try testing.expectEqual(0x004A, get(.uppercase_mapping_first_char, 0x01F0)); 132 | 133 | try testing.expect(get(.has_simple_lowercase, 0x1FD9)); // Ῑ 134 | try testing.expect(!get(.has_simple_lowercase, 0x1FE0)); // ῠ 135 | 136 | // MALAYALAM FRACTION ONE ONE-HUNDRED-AND-SIXTIETH 137 | try testing.expect(std.mem.eql(u8, "061/1", get(.numeric_value_numeric_reversed, 0x0D58))); 138 | } 139 | 140 | test "is_emoji_vs_base" { 141 | try testing.expect(get(.is_emoji_vs_base, 0x231B)); // ⌛ 142 | try testing.expect(get(.is_emoji_vs_base, 0x1F327)); // 🌧 143 | try testing.expect(!get(.is_emoji_vs_base, 0x1F46C)); // 👬 144 | } 145 | -------------------------------------------------------------------------------- /src/code_point.zig: -------------------------------------------------------------------------------- 1 | const std = @import("std"); 2 | 3 | pub const Iterator = struct { 4 | // This "i" is part of the documented API of this iterator, pointing to the 5 | // current location of the iterator in `code_points`. 
6 | i: usize = 0, 7 | code_points: []const u21, 8 | 9 | const Self = @This(); 10 | 11 | pub fn init(code_points: []const u21) Self { 12 | return .{ 13 | .code_points = code_points, 14 | }; 15 | } 16 | 17 | pub fn next(self: *Self) ?u21 { 18 | if (self.i >= self.code_points.len) return null; 19 | defer self.i += 1; 20 | return self.code_points[self.i]; 21 | } 22 | 23 | pub fn peek(self: Self) ?u21 { 24 | if (self.i >= self.code_points.len) return null; 25 | return self.code_points[self.i]; 26 | } 27 | }; 28 | 29 | test "Iterator for emoji code points" { 30 | const code_points = &[_]u21{ 31 | 0x1F600, // 😀 32 | 0x1F605, // 😅 33 | 0x1F63B, // 😻 34 | 0x1F47A, // 👺 35 | }; 36 | 37 | var it = Iterator.init(code_points); 38 | try std.testing.expectEqual(0x1F600, it.next()); 39 | try std.testing.expectEqual(1, it.i); 40 | try std.testing.expectEqual(0x1F605, it.peek()); 41 | try std.testing.expectEqual(1, it.i); 42 | try std.testing.expectEqual(0x1F605, it.next()); 43 | try std.testing.expectEqual(2, it.i); 44 | try std.testing.expectEqual(0x1F63B, it.next()); 45 | try std.testing.expectEqual(3, it.i); 46 | try std.testing.expectEqual(0x1F47A, it.next()); 47 | try std.testing.expectEqual(4, it.i); 48 | try std.testing.expectEqual(null, it.next()); 49 | try std.testing.expectEqual(4, it.i); 50 | } 51 | 52 | /// Returns a custom iterator for a given Context type. 53 | /// 54 | /// The Context must have the following methods: 55 | /// 56 | /// * len(self: *Context) usize 57 | /// * get(self: *Context, i: usize) ?u21 // or u21 58 | /// 59 | /// If `get` returns null, the code continues incrementing `i` until it returns 60 | /// a non-null result or `len` is reached, with `len` being called every 61 | /// iteration to allow for `Context` to end early. If instead `get` has a 62 | /// return type of non-optional `u21`, we don't loop. 63 | pub fn CustomIterator(comptime Context: type) type { 64 | return struct { 65 | // This "i" is part of the documented API of this iterator, pointing to the 66 | // current location of the iterator in `code_points`. 67 | i: usize = 0, 68 | ctx: Context, 69 | 70 | const Self = @This(); 71 | 72 | pub fn init(ctx: Context) Self { 73 | return .{ 74 | .ctx = ctx, 75 | }; 76 | } 77 | 78 | pub fn next(self: *Self) ?u21 { 79 | const getFn = @typeInfo(@TypeOf(@TypeOf(self.ctx).get)).@"fn"; 80 | if (comptime getFn.return_type.? 
== ?u21) { 81 | while (self.i < self.ctx.len()) : (self.i += 1) { 82 | const value = self.ctx.get(self.i); 83 | if (value) |cp| { 84 | @branchHint(.likely); 85 | self.i += 1; 86 | return cp; 87 | } 88 | } 89 | } else { 90 | if (self.i < self.ctx.len()) { 91 | defer self.i += 1; 92 | return self.ctx.get(self.i); 93 | } 94 | } 95 | 96 | return null; 97 | } 98 | 99 | pub fn peek(self: Self) ?u21 { 100 | var it = self; 101 | return it.next(); 102 | } 103 | }; 104 | } 105 | 106 | test "CustomIterator for emoji code points" { 107 | const Wrapper = struct { 108 | cp: u21, 109 | }; 110 | 111 | const code_points = &[_]Wrapper{ 112 | .{ .cp = 0x1F600 }, // 😀 113 | .{ .cp = 0x1F605 }, // 😅 114 | .{ .cp = 0x1F63B }, // 😻 115 | .{ .cp = 0x1F47A }, // 👺 116 | }; 117 | 118 | var it = CustomIterator(struct { 119 | points: []const Wrapper, 120 | 121 | pub fn len(self: @This()) usize { 122 | return self.points.len; 123 | } 124 | 125 | pub fn get(self: @This(), i: usize) u21 { 126 | return self.points[i].cp; 127 | } 128 | }).init(.{ .points = code_points }); 129 | try std.testing.expectEqual(0x1F600, it.next()); 130 | try std.testing.expectEqual(1, it.i); 131 | try std.testing.expectEqual(0x1F605, it.peek()); 132 | try std.testing.expectEqual(1, it.i); 133 | try std.testing.expectEqual(0x1F605, it.next()); 134 | try std.testing.expectEqual(2, it.i); 135 | try std.testing.expectEqual(0x1F63B, it.next()); 136 | try std.testing.expectEqual(3, it.i); 137 | try std.testing.expectEqual(0x1F47A, it.next()); 138 | try std.testing.expectEqual(4, it.i); 139 | try std.testing.expectEqual(null, it.next()); 140 | try std.testing.expectEqual(4, it.i); 141 | } 142 | 143 | test "CustomIterator for emoji code points with gaps and optional get" { 144 | const Wrapper = struct { 145 | cp: ?u21, 146 | }; 147 | 148 | const code_points = &[_]Wrapper{ 149 | .{ .cp = 0x1F600 }, // 😀 150 | .{ .cp = null }, 151 | .{ .cp = 0x1F605 }, // 😅 152 | .{ .cp = 0x1F63B }, // 😻 153 | .{ .cp = 0x1F47A }, // 👺 154 | .{ .cp = null }, 155 | .{ .cp = null }, 156 | }; 157 | 158 | var it = CustomIterator(struct { 159 | points: []const Wrapper, 160 | 161 | pub fn len(self: @This()) usize { 162 | return self.points.len; 163 | } 164 | 165 | pub fn get(self: @This(), i: usize) ?u21 { 166 | return self.points[i].cp; 167 | } 168 | }).init(.{ .points = code_points }); 169 | try std.testing.expectEqual(0x1F600, it.next()); 170 | try std.testing.expectEqual(1, it.i); 171 | try std.testing.expectEqual(0x1F605, it.peek()); 172 | try std.testing.expectEqual(1, it.i); 173 | try std.testing.expectEqual(0x1F605, it.next()); 174 | try std.testing.expectEqual(3, it.i); 175 | try std.testing.expectEqual(0x1F63B, it.next()); 176 | try std.testing.expectEqual(4, it.i); 177 | try std.testing.expectEqual(0x1F47A, it.next()); 178 | try std.testing.expectEqual(5, it.i); 179 | try std.testing.expectEqual(null, it.next()); 180 | try std.testing.expectEqual(7, it.i); 181 | } 182 | -------------------------------------------------------------------------------- /src/get.zig: -------------------------------------------------------------------------------- 1 | //! This file defines the low(er)-level `get` method, returning `Data`. 2 | //! (It also must be separate from `root.zig` so that `types.zig` can use it to 3 | //! allow for a better API on `Slice` fields.) 
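//!
//! Lookups go through the staged tables in `data()` below: `stage1` is indexed
//! by the high bits of the code point (`cp >> 8`) and yields a block offset;
//! adding the low byte (`cp & 0xFF`) gives the index into `stage2`. When a
//! table also has a `stage3` array, the `stage2` value is itself an index into
//! `stage3`, which holds the final `Data` rows.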
4 | const std = @import("std"); 5 | const tables = @import("tables").tables; 6 | const types = @import("types.zig"); 7 | 8 | fn TableData(comptime Table: anytype) type { 9 | const DataSlice = if (@hasField(Table, "stage3")) 10 | @FieldType(Table, "stage3") 11 | else 12 | @FieldType(Table, "stage2"); 13 | return @typeInfo(DataSlice).pointer.child; 14 | } 15 | 16 | fn tableInfoFor(comptime field: []const u8) std.builtin.Type.StructField { 17 | inline for (@typeInfo(@TypeOf(tables)).@"struct".fields) |tableInfo| { 18 | if (@hasField(TableData(tableInfo.type), field)) { 19 | return tableInfo; 20 | } 21 | } 22 | 23 | @compileError("Table not found for field: " ++ field); 24 | } 25 | 26 | pub fn hasField(comptime field: []const u8) bool { 27 | inline for (@typeInfo(@TypeOf(tables)).@"struct".fields) |tableInfo| { 28 | if (@hasField(TableData(tableInfo.type), field)) { 29 | return true; 30 | } 31 | } 32 | 33 | return false; 34 | } 35 | 36 | fn getTableInfo(comptime table_name: []const u8) std.builtin.Type.StructField { 37 | inline for (@typeInfo(@TypeOf(tables)).@"struct".fields) |tableInfo| { 38 | if (std.mem.eql(u8, tableInfo.name, table_name)) { 39 | return tableInfo; 40 | } 41 | } 42 | 43 | @compileError("Table '" ++ table_name ++ "' not found in tables"); 44 | } 45 | 46 | fn BackingFor(comptime field: []const u8) type { 47 | const tableInfo = tableInfoFor(field); 48 | const Backing = @FieldType(@FieldType(@TypeOf(tables), tableInfo.name), "backing"); 49 | return @FieldType(@typeInfo(Backing).pointer.child, field); 50 | } 51 | 52 | pub fn backingFor(comptime field: []const u8) BackingFor(field) { 53 | const tableInfo = tableInfoFor(field); 54 | return @field(@field(tables, tableInfo.name).backing, field); 55 | } 56 | 57 | fn TableFor(comptime field: []const u8) type { 58 | const tableInfo = tableInfoFor(field); 59 | return @FieldType(@TypeOf(tables), tableInfo.name); 60 | } 61 | 62 | fn tableFor(comptime field: []const u8) TableFor(field) { 63 | return @field(tables, tableInfoFor(field).name); 64 | } 65 | 66 | fn GetTable(comptime table_name: []const u8) type { 67 | const tableInfo = getTableInfo(table_name); 68 | return @FieldType(@TypeOf(tables), tableInfo.name); 69 | } 70 | 71 | fn getTable(comptime table_name: []const u8) GetTable(table_name) { 72 | return @field(tables, getTableInfo(table_name).name); 73 | } 74 | 75 | fn data(comptime table: anytype, cp: u21) TableData(@TypeOf(table)) { 76 | const stage1_idx = cp >> 8; 77 | const stage2_idx = cp & 0xFF; 78 | if (@hasField(@TypeOf(table), "stage3")) { 79 | return table.stage3[table.stage2[table.stage1[stage1_idx] + stage2_idx]]; 80 | } else { 81 | return table.stage2[table.stage1[stage1_idx] + stage2_idx]; 82 | } 83 | } 84 | 85 | pub fn getAll(comptime table_name: []const u8, cp: u21) TypeOfAll(table_name) { 86 | const table = comptime getTable(table_name); 87 | return data(table, cp); 88 | } 89 | 90 | pub fn TypeOfAll(comptime table_name: []const u8) type { 91 | return TableData(getTableInfo(table_name).type); 92 | } 93 | 94 | pub const FieldEnum = blk: { 95 | var fields_len: usize = 0; 96 | for (@typeInfo(@TypeOf(tables)).@"struct".fields) |tableInfo| { 97 | fields_len += @typeInfo(TableData(tableInfo.type)).@"struct".fields.len; 98 | } 99 | 100 | var fields: [fields_len]std.builtin.Type.EnumField = undefined; 101 | var i: usize = 0; 102 | 103 | for (@typeInfo(@TypeOf(tables)).@"struct".fields) |tableInfo| { 104 | for (@typeInfo(TableData(tableInfo.type)).@"struct".fields) |f| { 105 | fields[i] = .{ 106 | .name = f.name, 107 | .value = 
i, 108 | }; 109 | i += 1; 110 | } 111 | } 112 | 113 | break :blk @Type(.{ 114 | .@"enum" = .{ 115 | .tag_type = std.math.IntFittingRange(0, fields_len - 1), 116 | .fields = &fields, 117 | .decls = &[_]std.builtin.Type.Declaration{}, 118 | .is_exhaustive = true, 119 | }, 120 | }); 121 | }; 122 | 123 | fn DataField(comptime field: []const u8) type { 124 | return @FieldType(TableData(tableInfoFor(field).type), field); 125 | } 126 | 127 | fn FieldValue(comptime field: []const u8) type { 128 | const D = DataField(field); 129 | if (@typeInfo(D) == .@"struct") { 130 | if (@hasDecl(D, "unshift") and @TypeOf(D.unshift) != void) { 131 | return @typeInfo(@TypeOf(D.unshift)).@"fn".return_type.?; 132 | } else if (@hasDecl(D, "unpack")) { 133 | return @typeInfo(@TypeOf(D.unpack)).@"fn".return_type.?; 134 | } else if (@hasDecl(D, "value") and @TypeOf(D.value) != void) { 135 | return @typeInfo(@TypeOf(D.value)).@"fn".return_type.?; 136 | } else { 137 | return D; 138 | } 139 | } else { 140 | return D; 141 | } 142 | } 143 | 144 | // Note: I tried using a union with members that are the known types, and using 145 | // @FieldType(KnownFieldsForLspUnion, field) but the LSP was still unable to 146 | // figure out the type. It seems like the only way to get the LSP to know the 147 | // type would be having dedicated `get` functions for each field, but I don't 148 | // want to go that route. 149 | pub fn get(comptime field: FieldEnum, cp: u21) TypeOf(field) { 150 | const name = @tagName(field); 151 | const D = DataField(name); 152 | const table = comptime tableFor(name); 153 | 154 | if (@typeInfo(D) == .@"struct" and (@hasDecl(D, "unpack") or @hasDecl(D, "unshift") or (@hasDecl(D, "value") and @TypeOf(D.value) != void))) { 155 | const d = @field(data(table, cp), name); 156 | if (@hasDecl(D, "unshift") and @TypeOf(D.unshift) != void) { 157 | return d.unshift(cp); 158 | } else if (@hasDecl(D, "unpack")) { 159 | return d.unpack(); 160 | } else { 161 | return d.value(); 162 | } 163 | } else { 164 | return @field(data(table, cp), name); 165 | } 166 | } 167 | 168 | pub fn TypeOf(comptime field: FieldEnum) type { 169 | return FieldValue(@tagName(field)); 170 | } 171 | -------------------------------------------------------------------------------- /resources/wcwidth/zg.zig: -------------------------------------------------------------------------------- 1 | // cpv: track https://codeberg.org/atman/zg/src/commit/9427a9e53aaa29ee071f4dcb35b809a699d75aa9/codegen/dwp.zig#L31-L223 2 | var flat_map = std.AutoHashMap(u21, i4).init(allocator); 3 | defer flat_map.deinit(); 4 | 5 | var line_buf: [4096]u8 = undefined; 6 | 7 | // Process DerivedEastAsianWidth.txt 8 | var deaw_file = try std.fs.cwd().openFile("data/unicode/extracted/DerivedEastAsianWidth.txt", .{}); 9 | defer deaw_file.close(); 10 | var deaw_buf = std.io.bufferedReader(deaw_file.reader()); 11 | const deaw_reader = deaw_buf.reader(); 12 | 13 | while (try deaw_reader.readUntilDelimiterOrEof(&line_buf, '\n')) |line| { 14 | if (line.len == 0) continue; 15 | 16 | // @missing ranges 17 | if (std.mem.startsWith(u8, line, "# @missing: ")) { 18 | const semi = std.mem.indexOfScalar(u8, line, ';').?; 19 | const field = line[12..semi]; 20 | const dots = std.mem.indexOf(u8, field, "..").?; 21 | const from = try std.fmt.parseInt(u21, field[0..dots], 16); 22 | const to = try std.fmt.parseInt(u21, field[dots + 2 ..], 16); 23 | if (from == 0 and to == 0x10ffff) continue; 24 | for (from..to + 1) |cp| try flat_map.put(@intCast(cp), 2); 25 | continue; 26 | } 27 | 28 | if (line[0] == 
'#') continue; 29 | 30 | const no_comment = if (std.mem.indexOfScalar(u8, line, '#')) |octo| line[0..octo] else line; 31 | 32 | var field_iter = std.mem.tokenizeAny(u8, no_comment, "; "); 33 | var current_code: [2]u21 = undefined; 34 | 35 | var i: usize = 0; 36 | while (field_iter.next()) |field| : (i += 1) { 37 | switch (i) { 38 | 0 => { 39 | // Code point(s) 40 | if (std.mem.indexOf(u8, field, "..")) |dots| { 41 | current_code = .{ 42 | try std.fmt.parseInt(u21, field[0..dots], 16), 43 | try std.fmt.parseInt(u21, field[dots + 2 ..], 16), 44 | }; 45 | } else { 46 | const code = try std.fmt.parseInt(u21, field, 16); 47 | current_code = .{ code, code }; 48 | } 49 | }, 50 | 1 => { 51 | // Width 52 | if (std.mem.eql(u8, field, "W") or 53 | std.mem.eql(u8, field, "F") or 54 | (options.cjk and std.mem.eql(u8, field, "A"))) 55 | { 56 | for (current_code[0]..current_code[1] + 1) |cp| try flat_map.put(@intCast(cp), 2); 57 | } 58 | }, 59 | else => {}, 60 | } 61 | } 62 | } 63 | 64 | // Process DerivedGeneralCategory.txt 65 | var dgc_file = try std.fs.cwd().openFile("data/unicode/extracted/DerivedGeneralCategory.txt", .{}); 66 | defer dgc_file.close(); 67 | var dgc_buf = std.io.bufferedReader(dgc_file.reader()); 68 | const dgc_reader = dgc_buf.reader(); 69 | 70 | while (try dgc_reader.readUntilDelimiterOrEof(&line_buf, '\n')) |line| { 71 | if (line.len == 0 or line[0] == '#') continue; 72 | const no_comment = if (std.mem.indexOfScalar(u8, line, '#')) |octo| line[0..octo] else line; 73 | 74 | var field_iter = std.mem.tokenizeAny(u8, no_comment, "; "); 75 | var current_code: [2]u21 = undefined; 76 | 77 | var i: usize = 0; 78 | while (field_iter.next()) |field| : (i += 1) { 79 | switch (i) { 80 | 0 => { 81 | // Code point(s) 82 | if (std.mem.indexOf(u8, field, "..")) |dots| { 83 | current_code = .{ 84 | try std.fmt.parseInt(u21, field[0..dots], 16), 85 | try std.fmt.parseInt(u21, field[dots + 2 ..], 16), 86 | }; 87 | } else { 88 | const code = try std.fmt.parseInt(u21, field, 16); 89 | current_code = .{ code, code }; 90 | } 91 | }, 92 | 1 => { 93 | // General category 94 | if (std.mem.eql(u8, field, "Mn")) { 95 | // Nonspacing_Mark 96 | for (current_code[0]..current_code[1] + 1) |cp| try flat_map.put(@intCast(cp), 0); 97 | } else if (std.mem.eql(u8, field, "Me")) { 98 | // Enclosing_Mark 99 | for (current_code[0]..current_code[1] + 1) |cp| try flat_map.put(@intCast(cp), 0); 100 | } else if (std.mem.eql(u8, field, "Mc")) { 101 | // Spacing_Mark 102 | for (current_code[0]..current_code[1] + 1) |cp| try flat_map.put(@intCast(cp), 0); 103 | } else if (std.mem.eql(u8, field, "Cf")) { 104 | if (std.mem.indexOf(u8, line, "ARABIC") == null) { 105 | // Format except Arabic 106 | for (current_code[0]..current_code[1] + 1) |cp| try flat_map.put(@intCast(cp), 0); 107 | } 108 | } 109 | }, 110 | else => {}, 111 | } 112 | } 113 | } 114 | 115 | var blocks_map = BlockMap.init(allocator); 116 | defer blocks_map.deinit(); 117 | 118 | var stage1 = std.ArrayList(u16).init(allocator); 119 | defer stage1.deinit(); 120 | 121 | var stage2 = std.ArrayList(i4).init(allocator); 122 | defer stage2.deinit(); 123 | 124 | var block: Block = [_]i4{0} ** block_size; 125 | var block_len: u16 = 0; 126 | 127 | for (0..0x110000) |i| { 128 | const cp: u21 = @intCast(i); 129 | var width = flat_map.get(cp) orelse 1; 130 | 131 | // Specific overrides 132 | switch (cp) { 133 | // Three-em dash 134 | 0x2e3b => width = 3, 135 | 136 | // C0/C1 control codes 137 | 0...0x20 => width = if (options.c0_width) |c0| c0 else 0, 138 | 0x80...0x9f => width = 
if (options.c1_width) |c1| c1 else 0, 139 | 140 | // Line separator 141 | 0x2028, 142 | 143 | // Paragraph separator 144 | 0x2029, 145 | 146 | // Hangul syllable and ignorable. 147 | 0x1160...0x11ff, 148 | 0xd7b0...0xd7ff, 149 | 0x2060...0x206f, 150 | 0xfff0...0xfff8, 151 | 0xe0000...0xE0fff, 152 | => width = 0, 153 | 154 | // Two-em dash 155 | 0x2e3a, 156 | 157 | // Regional indicators 158 | 0x1f1e6...0x1f200, 159 | 160 | // CJK Blocks 161 | 0x3400...0x4dbf, // CJK Unified Ideographs Extension A 162 | 0x4e00...0x9fff, // CJK Unified Ideographs 163 | 0xf900...0xfaff, // CJK Compatibility Ideographs 164 | 0x20000...0x2fffd, // Plane 2 165 | 0x30000...0x3fffd, // Plane 3 166 | => width = 2, 167 | 168 | else => {}, 169 | } 170 | 171 | // ASCII 172 | if (0x20 <= cp and cp < 0x7f) width = 1; 173 | 174 | // Soft hyphen 175 | if (cp == 0xad) width = 1; 176 | 177 | // Backspace and delete 178 | if (cp == 0x8 or cp == 0x7f) width = if (options.c0_width) |c0| c0 else -1; 179 | 180 | // Process block 181 | block[block_len] = width; 182 | block_len += 1; 183 | 184 | if (block_len < block_size and cp != 0x10ffff) continue; 185 | 186 | const gop = try blocks_map.getOrPut(block); 187 | if (!gop.found_existing) { 188 | gop.value_ptr.* = @intCast(stage2.items.len); 189 | try stage2.appendSlice(&block); 190 | } 191 | 192 | try stage1.append(gop.value_ptr.*); 193 | block_len = 0; 194 | } 195 | // cpv: end 196 | 197 | /// cpv: track https://codeberg.org/atman/zg/src/commit/9427a9e53aaa29ee071f4dcb35b809a699d75aa9/src/DisplayWidth.zig#L105-L145 198 | /// strWidth returns the total display width of `str` as the number of cells 199 | /// required in a fixed-pitch font (i.e. a terminal screen). 200 | pub fn strWidth(dw: DisplayWidth, str: []const u8) usize { 201 | var total: isize = 0; 202 | 203 | // ASCII fast path 204 | if (ascii.isAsciiOnly(str)) { 205 | for (str) |b| total += dw.codePointWidth(b); 206 | return @intCast(@max(0, total)); 207 | } 208 | 209 | var giter = dw.graphemes.iterator(str); 210 | 211 | while (giter.next()) |gc| { 212 | var cp_iter = CodePointIterator{ .bytes = gc.bytes(str) }; 213 | var gc_total: isize = 0; 214 | 215 | while (cp_iter.next()) |cp| { 216 | var w = dw.codePointWidth(cp.code); 217 | 218 | if (w != 0) { 219 | // Handle text emoji sequence. 220 | if (cp_iter.next()) |ncp| { 221 | // emoji text sequence. 222 | if (ncp.code == 0xFE0E) w = 1; 223 | if (ncp.code == 0xFE0F) w = 2; 224 | } 225 | 226 | // Only adding width of first non-zero-width code point. 
227 | if (gc_total == 0) { 228 | gc_total = w; 229 | break; 230 | } 231 | } 232 | } 233 | 234 | total += gc_total; 235 | } 236 | 237 | return @intCast(@max(0, total)); 238 | } 239 | // cpv: end 240 | -------------------------------------------------------------------------------- /src/x/root.zig: -------------------------------------------------------------------------------- 1 | const std = @import("std"); 2 | pub const types = @import("types.x.zig"); 3 | pub const grapheme = @import("grapheme.zig"); 4 | const testing = std.testing; 5 | 6 | test { 7 | std.testing.refAllDeclsRecursive(@This()); 8 | } 9 | 10 | // wcwidth tests 11 | 12 | test "wcwidth_standalone control characters are width 0" { 13 | const get = @import("get.zig").get; 14 | try testing.expectEqual(0, get(.wcwidth_standalone, 0x0000)); // NULL (C0) 15 | try testing.expectEqual(0, get(.wcwidth_standalone, 0x001F)); // UNIT SEPARATOR (C0) 16 | try testing.expectEqual(0, get(.wcwidth_standalone, 0x007F)); // DELETE (C0) 17 | try testing.expectEqual(0, get(.wcwidth_standalone, 0x0080)); // C1 control 18 | try testing.expectEqual(0, get(.wcwidth_standalone, 0x009F)); // C1 control 19 | } 20 | 21 | test "wcwidth_standalone surrogates are width 0" { 22 | const get = @import("get.zig").get; 23 | try testing.expectEqual(0, get(.wcwidth_standalone, 0xD800)); // High surrogate start 24 | try testing.expectEqual(0, get(.wcwidth_standalone, 0xDBFF)); // High surrogate end 25 | try testing.expectEqual(0, get(.wcwidth_standalone, 0xDC00)); // Low surrogate start 26 | try testing.expectEqual(0, get(.wcwidth_standalone, 0xDFFF)); // Low surrogate end 27 | } 28 | 29 | test "wcwidth_standalone line and paragraph separators are width 0" { 30 | const get = @import("get.zig").get; 31 | try testing.expectEqual(0, get(.wcwidth_standalone, 0x2028)); // LINE SEPARATOR (Zl) 32 | try testing.expectEqual(0, get(.wcwidth_standalone, 0x2029)); // PARAGRAPH SEPARATOR (Zp) 33 | } 34 | 35 | test "wcwidth_standalone default ignorable characters are width 0" { 36 | const get = @import("get.zig").get; 37 | try testing.expectEqual(0, get(.wcwidth_standalone, 0x200B)); // ZERO WIDTH SPACE 38 | try testing.expectEqual(0, get(.wcwidth_standalone, 0x200C)); // ZERO WIDTH NON-JOINER (ZWNJ) 39 | try testing.expectEqual(0, get(.wcwidth_standalone, 0x200D)); // ZERO WIDTH JOINER (ZWJ) 40 | try testing.expectEqual(0, get(.wcwidth_standalone, 0xFE00)); // VARIATION SELECTOR-1 41 | try testing.expectEqual(0, get(.wcwidth_standalone, 0xFE0F)); // VARIATION SELECTOR-16 42 | try testing.expectEqual(0, get(.wcwidth_standalone, 0xFEFF)); // ZERO WIDTH NO-BREAK SPACE 43 | } 44 | 45 | test "wcwidth_standalone soft hyphen exception is width 1" { 46 | const get = @import("get.zig").get; 47 | try testing.expectEqual(1, get(.wcwidth_standalone, 0x00AD)); // SOFT HYPHEN 48 | } 49 | 50 | test "wcwidth_standalone combining marks are width 1" { 51 | const get = @import("get.zig").get; 52 | try testing.expectEqual(1, get(.wcwidth_standalone, 0x0300)); // COMBINING GRAVE ACCENT (Mn) 53 | try testing.expectEqual(1, get(.wcwidth_standalone, 0x0903)); // DEVANAGARI SIGN VISARGA (Mc) 54 | try testing.expectEqual(1, get(.wcwidth_standalone, 0x20DD)); // COMBINING ENCLOSING CIRCLE (Me) 55 | } 56 | 57 | test "wcwidth_zero_in_grapheme combining marks" { 58 | const get = @import("get.zig").get; 59 | // mark_nonspacing (Mn) are true 60 | try testing.expect(get(.wcwidth_zero_in_grapheme, 0x0300)); // COMBINING GRAVE ACCENT (Mn) 61 | try testing.expect(get(.wcwidth_zero_in_grapheme, 0x0341)); // 
COMBINING GREEK PERISPOMENI (Mn) 62 | // mark_enclosing (Me) are true 63 | try testing.expect(get(.wcwidth_zero_in_grapheme, 0x20DD)); // COMBINING ENCLOSING CIRCLE (Me) 64 | try testing.expect(get(.wcwidth_zero_in_grapheme, 0x20DE)); // COMBINING ENCLOSING SQUARE (Me) 65 | // mark_spacing_combining (Mc) follow EAW - Neutral=1, so false 66 | try testing.expect(!get(.wcwidth_zero_in_grapheme, 0x0903)); // DEVANAGARI SIGN VISARGA (Mc, N) 67 | try testing.expect(!get(.wcwidth_zero_in_grapheme, 0x093E)); // DEVANAGARI VOWEL SIGN AA (Mc, N) 68 | // mark_spacing_combining with EAW=Wide are width 2, so false 69 | try testing.expect(!get(.wcwidth_zero_in_grapheme, 0x302E)); // HANGUL SINGLE DOT TONE MARK (Mc, W) 70 | try testing.expect(!get(.wcwidth_zero_in_grapheme, 0x302F)); // HANGUL DOUBLE DOT TONE MARK (Mc, W) 71 | try testing.expect(!get(.wcwidth_zero_in_grapheme, 0x16FF0)); // VIETNAMESE ALTERNATE READING MARK CA (Mc, W) 72 | try testing.expect(!get(.wcwidth_zero_in_grapheme, 0x16FF1)); // VIETNAMESE ALTERNATE READING MARK NHAY (Mc, W) 73 | } 74 | 75 | test "wcwidth_standalone combining enclosing keycap exception is width 2" { 76 | const get = @import("get.zig").get; 77 | try testing.expectEqual(2, get(.wcwidth_standalone, 0x20E3)); // COMBINING ENCLOSING KEYCAP 78 | } 79 | 80 | test "wcwidth_zero_in_grapheme combining enclosing keycap exception is true" { 81 | const get = @import("get.zig").get; 82 | try testing.expect(get(.wcwidth_zero_in_grapheme, 0x20E3)); // COMBINING ENCLOSING KEYCAP 83 | } 84 | 85 | test "wcwidth_standalone regional indicators are width 2" { 86 | const get = @import("get.zig").get; 87 | try testing.expectEqual(2, get(.wcwidth_standalone, 0x1F1E6)); // Regional Indicator A 88 | try testing.expectEqual(2, get(.wcwidth_standalone, 0x1F1FA)); // Regional Indicator U 89 | try testing.expectEqual(2, get(.wcwidth_standalone, 0x1F1F8)); // Regional Indicator S 90 | try testing.expectEqual(2, get(.wcwidth_standalone, 0x1F1FF)); // Regional Indicator Z 91 | } 92 | 93 | test "wcwidth_standalone em dashes have special widths" { 94 | const get = @import("get.zig").get; 95 | try testing.expectEqual(2, get(.wcwidth_standalone, 0x2E3A)); // TWO-EM DASH 96 | try testing.expectEqual(3, get(.wcwidth_standalone, 0x2E3B)); // THREE-EM DASH 97 | } 98 | 99 | test "wcwidth_standalone ambiguous width characters are width 1" { 100 | const get = @import("get.zig").get; 101 | try testing.expectEqual(1, get(.wcwidth_standalone, 0x00A1)); // INVERTED EXCLAMATION MARK (A) 102 | try testing.expectEqual(1, get(.wcwidth_standalone, 0x00B1)); // PLUS-MINUS SIGN (A) 103 | try testing.expectEqual(1, get(.wcwidth_standalone, 0x2664)); // WHITE SPADE SUIT (A) 104 | } 105 | 106 | test "wcwidth_standalone east asian wide and fullwidth are width 2" { 107 | const get = @import("get.zig").get; 108 | try testing.expectEqual(2, get(.wcwidth_standalone, 0x3000)); // IDEOGRAPHIC SPACE (F) 109 | try testing.expectEqual(2, get(.wcwidth_standalone, 0xFF01)); // FULLWIDTH EXCLAMATION MARK (F) 110 | try testing.expectEqual(2, get(.wcwidth_standalone, 0x4E00)); // CJK UNIFIED IDEOGRAPH (W) 111 | try testing.expectEqual(2, get(.wcwidth_standalone, 0xAC00)); // HANGUL SYLLABLE (W) 112 | } 113 | 114 | test "wcwidth_standalone hangul jamo V and T are width 1" { 115 | const get = @import("get.zig").get; 116 | try testing.expectEqual(1, get(.wcwidth_standalone, 0x1161)); // HANGUL JUNGSEONG A (V) 117 | try testing.expectEqual(1, get(.wcwidth_standalone, 0x11A8)); // HANGUL JONGSEONG KIYEOK (T) 118 | try testing.expectEqual(1, 
get(.wcwidth_standalone, 0xD7B0)); // HANGUL JUNGSEONG O-YEO (V) 119 | try testing.expectEqual(1, get(.wcwidth_standalone, 0xD7CB)); // HANGUL JONGSEONG NIEUN-RIEUL (T) 120 | } 121 | 122 | test "wcwidth_zero_in_grapheme hangul jamo V and T are true" { 123 | const get = @import("get.zig").get; 124 | try testing.expect(get(.wcwidth_zero_in_grapheme, 0x1161)); // HANGUL JUNGSEONG A (V) 125 | try testing.expect(get(.wcwidth_zero_in_grapheme, 0x11A8)); // HANGUL JONGSEONG KIYEOK (T) 126 | try testing.expect(get(.wcwidth_zero_in_grapheme, 0xD7B0)); // HANGUL JUNGSEONG O-YEO (V) 127 | try testing.expect(get(.wcwidth_zero_in_grapheme, 0xD7CB)); // HANGUL JONGSEONG NIEUN-RIEUL (T) 128 | try testing.expect(get(.wcwidth_zero_in_grapheme, 0x16D63)); // KIRAT RAI VOWEL SIGN AA (V) 129 | } 130 | 131 | test "wcwidth_standalone format characters non-DI are width 1" { 132 | const get = @import("get.zig").get; 133 | try testing.expectEqual(1, get(.wcwidth_standalone, 0x0600)); // ARABIC NUMBER SIGN (Cf, not DI) 134 | } 135 | 136 | test "wcwidth_zero_in_grapheme format characters non-DI is true" { 137 | const get = @import("get.zig").get; 138 | try testing.expect(get(.wcwidth_zero_in_grapheme, 0x0600)); // ARABIC NUMBER SIGN (Cf, not DI) 139 | } 140 | 141 | test "wcwidth_standalone prepend characters are width 1" { 142 | const get = @import("get.zig").get; 143 | // Lo Prepend (0D4E) 144 | try testing.expectEqual(1, get(.wcwidth_standalone, 0x0D4E)); 145 | } 146 | 147 | test "wcwidth_zero_in_grapheme prepend characters are true" { 148 | const get = @import("get.zig").get; 149 | // Lo Prepend (0D4E) 150 | try testing.expect(get(.wcwidth_zero_in_grapheme, 0x0D4E)); 151 | } 152 | 153 | test "wcwidth_standalone emoji with default text presentation is 1" { 154 | const get = @import("get.zig").get; 155 | // weight lifter 156 | try testing.expectEqual(1, get(.wcwidth_standalone, 0x1F3CB)); 157 | } 158 | 159 | test "wcwidth_standalone emoji_modifier is 2" { 160 | const get = @import("get.zig").get; 161 | try testing.expectEqual(2, get(.wcwidth_standalone, 0x1F3FB)); // 🏻 EMOJI MODIFIER FITZPATRICK TYPE-1-2 162 | try testing.expectEqual(2, get(.wcwidth_standalone, 0x1F3FF)); // 🏿 EMOJI MODIFIER FITZPATRICK TYPE-6 163 | } 164 | 165 | test "wcwidth_zero_in_grapheme emoji_modifier is true" { 166 | const get = @import("get.zig").get; 167 | try testing.expect(get(.wcwidth_zero_in_grapheme, 0x1F3FB)); // 🏻 EMOJI MODIFIER FITZPATRICK TYPE-1-2 168 | try testing.expect(get(.wcwidth_zero_in_grapheme, 0x1F3FF)); // 🏿 EMOJI MODIFIER FITZPATRICK TYPE-6 169 | } 170 | -------------------------------------------------------------------------------- /ucd/BidiBrackets.txt: -------------------------------------------------------------------------------- 1 | # BidiBrackets-16.0.0.txt 2 | # Date: 2024-02-02 3 | # © 2024 Unicode®, Inc. 4 | # Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries. 5 | # For terms of use and license, see https://www.unicode.org/terms_of_use.html 6 | # 7 | # Unicode Character Database 8 | # For documentation, see https://www.unicode.org/reports/tr44/ 9 | # 10 | # Bidi_Paired_Bracket and Bidi_Paired_Bracket_Type Properties 11 | # 12 | # This file is a normative contributory data file in the Unicode 13 | # Character Database. 14 | # 15 | # Bidi_Paired_Bracket is a normative property 16 | # which establishes a mapping between characters that are treated as 17 | # bracket pairs by the Unicode Bidirectional Algorithm. 
18 | # 19 | # Bidi_Paired_Bracket_Type is a normative property 20 | # which classifies characters into opening and closing paired brackets 21 | # for the purposes of the Unicode Bidirectional Algorithm. 22 | # 23 | # This file lists the set of code points with Bidi_Paired_Bracket_Type 24 | # property values Open and Close. The set is derived from the character 25 | # properties General_Category (gc), Bidi_Class (bc), Bidi_Mirrored (Bidi_M), 26 | # and Bidi_Mirroring_Glyph (bmg), as follows: two characters, A and B, 27 | # form a bracket pair if A has gc=Ps and B has gc=Pe, both have bc=ON and 28 | # Bidi_M=Y, and bmg of A is B. Bidi_Paired_Bracket (bpb) maps A to B and 29 | # vice versa, and their Bidi_Paired_Bracket_Type (bpt) property values are 30 | # Open (o) and Close (c), respectively. 31 | # 32 | # The brackets with ticks U+298D LEFT SQUARE BRACKET WITH TICK IN TOP CORNER 33 | # through U+2990 RIGHT SQUARE BRACKET WITH TICK IN TOP CORNER are paired the 34 | # same way their glyphs form mirror pairs, according to their bmg property 35 | # values. They are not paired on the basis of a diagonal or antidiagonal 36 | # matching of the corner ticks inferred from code point order. 37 | # 38 | # For legacy reasons, the characters U+FD3E ORNATE LEFT PARENTHESIS and 39 | # U+FD3F ORNATE RIGHT PARENTHESIS do not mirror in bidirectional display 40 | # and therefore do not form a bracket pair. 41 | # 42 | # The Unicode property value stability policy guarantees that characters 43 | # which have bpt=o or bpt=c also have bc=ON and Bidi_M=Y. As a result, an 44 | # implementation can optimize the lookup of the Bidi_Paired_Bracket_Type 45 | # property values Open and Close by restricting the processing to characters 46 | # with bc=ON. 47 | # 48 | # The format of the file is three fields separated by a semicolon. 49 | # Field 0: Unicode code point value, represented as a hexadecimal value 50 | # Field 1: Bidi_Paired_Bracket property value, a code point value or <none> 51 | # Field 2: Bidi_Paired_Bracket_Type property value, one of the following: 52 | # o Open 53 | # c Close 54 | # n None 55 | # The names of the characters in field 0 are given in comments at the end 56 | # of each line. 57 | # 58 | # For information on bidirectional paired brackets, see UAX #9: Unicode 59 | # Bidirectional Algorithm, at https://www.unicode.org/reports/tr9/ 60 | # 61 | # This file was originally created by Andrew Glass and Laurentiu Iancu 62 | # for Unicode 6.3.
63 | 64 | 0028; 0029; o # LEFT PARENTHESIS 65 | 0029; 0028; c # RIGHT PARENTHESIS 66 | 005B; 005D; o # LEFT SQUARE BRACKET 67 | 005D; 005B; c # RIGHT SQUARE BRACKET 68 | 007B; 007D; o # LEFT CURLY BRACKET 69 | 007D; 007B; c # RIGHT CURLY BRACKET 70 | 0F3A; 0F3B; o # TIBETAN MARK GUG RTAGS GYON 71 | 0F3B; 0F3A; c # TIBETAN MARK GUG RTAGS GYAS 72 | 0F3C; 0F3D; o # TIBETAN MARK ANG KHANG GYON 73 | 0F3D; 0F3C; c # TIBETAN MARK ANG KHANG GYAS 74 | 169B; 169C; o # OGHAM FEATHER MARK 75 | 169C; 169B; c # OGHAM REVERSED FEATHER MARK 76 | 2045; 2046; o # LEFT SQUARE BRACKET WITH QUILL 77 | 2046; 2045; c # RIGHT SQUARE BRACKET WITH QUILL 78 | 207D; 207E; o # SUPERSCRIPT LEFT PARENTHESIS 79 | 207E; 207D; c # SUPERSCRIPT RIGHT PARENTHESIS 80 | 208D; 208E; o # SUBSCRIPT LEFT PARENTHESIS 81 | 208E; 208D; c # SUBSCRIPT RIGHT PARENTHESIS 82 | 2308; 2309; o # LEFT CEILING 83 | 2309; 2308; c # RIGHT CEILING 84 | 230A; 230B; o # LEFT FLOOR 85 | 230B; 230A; c # RIGHT FLOOR 86 | 2329; 232A; o # LEFT-POINTING ANGLE BRACKET 87 | 232A; 2329; c # RIGHT-POINTING ANGLE BRACKET 88 | 2768; 2769; o # MEDIUM LEFT PARENTHESIS ORNAMENT 89 | 2769; 2768; c # MEDIUM RIGHT PARENTHESIS ORNAMENT 90 | 276A; 276B; o # MEDIUM FLATTENED LEFT PARENTHESIS ORNAMENT 91 | 276B; 276A; c # MEDIUM FLATTENED RIGHT PARENTHESIS ORNAMENT 92 | 276C; 276D; o # MEDIUM LEFT-POINTING ANGLE BRACKET ORNAMENT 93 | 276D; 276C; c # MEDIUM RIGHT-POINTING ANGLE BRACKET ORNAMENT 94 | 276E; 276F; o # HEAVY LEFT-POINTING ANGLE QUOTATION MARK ORNAMENT 95 | 276F; 276E; c # HEAVY RIGHT-POINTING ANGLE QUOTATION MARK ORNAMENT 96 | 2770; 2771; o # HEAVY LEFT-POINTING ANGLE BRACKET ORNAMENT 97 | 2771; 2770; c # HEAVY RIGHT-POINTING ANGLE BRACKET ORNAMENT 98 | 2772; 2773; o # LIGHT LEFT TORTOISE SHELL BRACKET ORNAMENT 99 | 2773; 2772; c # LIGHT RIGHT TORTOISE SHELL BRACKET ORNAMENT 100 | 2774; 2775; o # MEDIUM LEFT CURLY BRACKET ORNAMENT 101 | 2775; 2774; c # MEDIUM RIGHT CURLY BRACKET ORNAMENT 102 | 27C5; 27C6; o # LEFT S-SHAPED BAG DELIMITER 103 | 27C6; 27C5; c # RIGHT S-SHAPED BAG DELIMITER 104 | 27E6; 27E7; o # MATHEMATICAL LEFT WHITE SQUARE BRACKET 105 | 27E7; 27E6; c # MATHEMATICAL RIGHT WHITE SQUARE BRACKET 106 | 27E8; 27E9; o # MATHEMATICAL LEFT ANGLE BRACKET 107 | 27E9; 27E8; c # MATHEMATICAL RIGHT ANGLE BRACKET 108 | 27EA; 27EB; o # MATHEMATICAL LEFT DOUBLE ANGLE BRACKET 109 | 27EB; 27EA; c # MATHEMATICAL RIGHT DOUBLE ANGLE BRACKET 110 | 27EC; 27ED; o # MATHEMATICAL LEFT WHITE TORTOISE SHELL BRACKET 111 | 27ED; 27EC; c # MATHEMATICAL RIGHT WHITE TORTOISE SHELL BRACKET 112 | 27EE; 27EF; o # MATHEMATICAL LEFT FLATTENED PARENTHESIS 113 | 27EF; 27EE; c # MATHEMATICAL RIGHT FLATTENED PARENTHESIS 114 | 2983; 2984; o # LEFT WHITE CURLY BRACKET 115 | 2984; 2983; c # RIGHT WHITE CURLY BRACKET 116 | 2985; 2986; o # LEFT WHITE PARENTHESIS 117 | 2986; 2985; c # RIGHT WHITE PARENTHESIS 118 | 2987; 2988; o # Z NOTATION LEFT IMAGE BRACKET 119 | 2988; 2987; c # Z NOTATION RIGHT IMAGE BRACKET 120 | 2989; 298A; o # Z NOTATION LEFT BINDING BRACKET 121 | 298A; 2989; c # Z NOTATION RIGHT BINDING BRACKET 122 | 298B; 298C; o # LEFT SQUARE BRACKET WITH UNDERBAR 123 | 298C; 298B; c # RIGHT SQUARE BRACKET WITH UNDERBAR 124 | 298D; 2990; o # LEFT SQUARE BRACKET WITH TICK IN TOP CORNER 125 | 298E; 298F; c # RIGHT SQUARE BRACKET WITH TICK IN BOTTOM CORNER 126 | 298F; 298E; o # LEFT SQUARE BRACKET WITH TICK IN BOTTOM CORNER 127 | 2990; 298D; c # RIGHT SQUARE BRACKET WITH TICK IN TOP CORNER 128 | 2991; 2992; o # LEFT ANGLE BRACKET WITH DOT 129 | 2992; 2991; c # RIGHT ANGLE BRACKET WITH 
DOT 130 | 2993; 2994; o # LEFT ARC LESS-THAN BRACKET 131 | 2994; 2993; c # RIGHT ARC GREATER-THAN BRACKET 132 | 2995; 2996; o # DOUBLE LEFT ARC GREATER-THAN BRACKET 133 | 2996; 2995; c # DOUBLE RIGHT ARC LESS-THAN BRACKET 134 | 2997; 2998; o # LEFT BLACK TORTOISE SHELL BRACKET 135 | 2998; 2997; c # RIGHT BLACK TORTOISE SHELL BRACKET 136 | 29D8; 29D9; o # LEFT WIGGLY FENCE 137 | 29D9; 29D8; c # RIGHT WIGGLY FENCE 138 | 29DA; 29DB; o # LEFT DOUBLE WIGGLY FENCE 139 | 29DB; 29DA; c # RIGHT DOUBLE WIGGLY FENCE 140 | 29FC; 29FD; o # LEFT-POINTING CURVED ANGLE BRACKET 141 | 29FD; 29FC; c # RIGHT-POINTING CURVED ANGLE BRACKET 142 | 2E22; 2E23; o # TOP LEFT HALF BRACKET 143 | 2E23; 2E22; c # TOP RIGHT HALF BRACKET 144 | 2E24; 2E25; o # BOTTOM LEFT HALF BRACKET 145 | 2E25; 2E24; c # BOTTOM RIGHT HALF BRACKET 146 | 2E26; 2E27; o # LEFT SIDEWAYS U BRACKET 147 | 2E27; 2E26; c # RIGHT SIDEWAYS U BRACKET 148 | 2E28; 2E29; o # LEFT DOUBLE PARENTHESIS 149 | 2E29; 2E28; c # RIGHT DOUBLE PARENTHESIS 150 | 2E55; 2E56; o # LEFT SQUARE BRACKET WITH STROKE 151 | 2E56; 2E55; c # RIGHT SQUARE BRACKET WITH STROKE 152 | 2E57; 2E58; o # LEFT SQUARE BRACKET WITH DOUBLE STROKE 153 | 2E58; 2E57; c # RIGHT SQUARE BRACKET WITH DOUBLE STROKE 154 | 2E59; 2E5A; o # TOP HALF LEFT PARENTHESIS 155 | 2E5A; 2E59; c # TOP HALF RIGHT PARENTHESIS 156 | 2E5B; 2E5C; o # BOTTOM HALF LEFT PARENTHESIS 157 | 2E5C; 2E5B; c # BOTTOM HALF RIGHT PARENTHESIS 158 | 3008; 3009; o # LEFT ANGLE BRACKET 159 | 3009; 3008; c # RIGHT ANGLE BRACKET 160 | 300A; 300B; o # LEFT DOUBLE ANGLE BRACKET 161 | 300B; 300A; c # RIGHT DOUBLE ANGLE BRACKET 162 | 300C; 300D; o # LEFT CORNER BRACKET 163 | 300D; 300C; c # RIGHT CORNER BRACKET 164 | 300E; 300F; o # LEFT WHITE CORNER BRACKET 165 | 300F; 300E; c # RIGHT WHITE CORNER BRACKET 166 | 3010; 3011; o # LEFT BLACK LENTICULAR BRACKET 167 | 3011; 3010; c # RIGHT BLACK LENTICULAR BRACKET 168 | 3014; 3015; o # LEFT TORTOISE SHELL BRACKET 169 | 3015; 3014; c # RIGHT TORTOISE SHELL BRACKET 170 | 3016; 3017; o # LEFT WHITE LENTICULAR BRACKET 171 | 3017; 3016; c # RIGHT WHITE LENTICULAR BRACKET 172 | 3018; 3019; o # LEFT WHITE TORTOISE SHELL BRACKET 173 | 3019; 3018; c # RIGHT WHITE TORTOISE SHELL BRACKET 174 | 301A; 301B; o # LEFT WHITE SQUARE BRACKET 175 | 301B; 301A; c # RIGHT WHITE SQUARE BRACKET 176 | FE59; FE5A; o # SMALL LEFT PARENTHESIS 177 | FE5A; FE59; c # SMALL RIGHT PARENTHESIS 178 | FE5B; FE5C; o # SMALL LEFT CURLY BRACKET 179 | FE5C; FE5B; c # SMALL RIGHT CURLY BRACKET 180 | FE5D; FE5E; o # SMALL LEFT TORTOISE SHELL BRACKET 181 | FE5E; FE5D; c # SMALL RIGHT TORTOISE SHELL BRACKET 182 | FF08; FF09; o # FULLWIDTH LEFT PARENTHESIS 183 | FF09; FF08; c # FULLWIDTH RIGHT PARENTHESIS 184 | FF3B; FF3D; o # FULLWIDTH LEFT SQUARE BRACKET 185 | FF3D; FF3B; c # FULLWIDTH RIGHT SQUARE BRACKET 186 | FF5B; FF5D; o # FULLWIDTH LEFT CURLY BRACKET 187 | FF5D; FF5B; c # FULLWIDTH RIGHT CURLY BRACKET 188 | FF5F; FF60; o # FULLWIDTH LEFT WHITE PARENTHESIS 189 | FF60; FF5F; c # FULLWIDTH RIGHT WHITE PARENTHESIS 190 | FF62; FF63; o # HALFWIDTH LEFT CORNER BRACKET 191 | FF63; FF62; c # HALFWIDTH RIGHT CORNER BRACKET 192 | 193 | # EOF 194 | -------------------------------------------------------------------------------- /resources/wcwidth/wcwidth.py: -------------------------------------------------------------------------------- 1 | # cpv: track https://github.com/jquast/wcwidth/blob/915166f9453098a56e87a7fb69e697696cefe206/wcwidth/wcwidth.py#L103-L203 2 | @lru_cache(maxsize=1000) 3 | def wcwidth(wc, unicode_version='auto'): 4 | 
r""" 5 | Given one Unicode character, return its printable length on a terminal. 6 | 7 | :param str wc: A single Unicode character. 8 | :param str unicode_version: A Unicode version number, such as 9 | ``'6.0.0'``. A list of version levels suported by wcwidth 10 | is returned by :func:`list_versions`. 11 | 12 | Any version string may be specified without error -- the nearest 13 | matching version is selected. When ``latest`` (default), the 14 | highest Unicode version level is used. 15 | :return: The width, in cells, necessary to display the character of 16 | Unicode string character, ``wc``. Returns 0 if the ``wc`` argument has 17 | no printable effect on a terminal (such as NUL '\0'), -1 if ``wc`` is 18 | not printable, or has an indeterminate effect on the terminal, such as 19 | a control character. Otherwise, the number of column positions the 20 | character occupies on a graphic terminal (1 or 2) is returned. 21 | :rtype: int 22 | 23 | See :ref:`Specification` for details of cell measurement. 24 | """ 25 | ucs = ord(wc) if wc else 0 26 | 27 | # small optimization: early return of 1 for printable ASCII, this provides 28 | # approximately 40% performance improvement for mostly-ascii documents, with 29 | # less than 1% impact to others. 30 | if 32 <= ucs < 0x7f: 31 | return 1 32 | 33 | # C0/C1 control characters are -1 for compatibility with POSIX-like calls 34 | if ucs and ucs < 32 or 0x07F <= ucs < 0x0A0: 35 | return -1 36 | 37 | _unicode_version = _wcmatch_version(unicode_version) 38 | 39 | # Zero width 40 | if _bisearch(ucs, ZERO_WIDTH[_unicode_version]): 41 | return 0 42 | 43 | # 1 or 2 width 44 | return 1 + _bisearch(ucs, WIDE_EASTASIAN[_unicode_version]) 45 | 46 | 47 | def wcswidth(pwcs, n=None, unicode_version='auto'): 48 | """ 49 | Given a unicode string, return its printable length on a terminal. 50 | 51 | :param str pwcs: Measure width of given unicode string. 52 | :param int n: When ``n`` is None (default), return the length of the entire 53 | string, otherwise only the first ``n`` characters are measured. This 54 | argument exists only for compatibility with the C POSIX function 55 | signature. It is suggested instead to use python's string slicing 56 | capability, ``wcswidth(pwcs[:n])`` 57 | :param str unicode_version: An explicit definition of the unicode version 58 | level to use for determination, may be ``auto`` (default), which uses 59 | the Environment Variable, ``UNICODE_VERSION`` if defined, or the latest 60 | available unicode version, otherwise. 61 | :rtype: int 62 | :returns: The width, in cells, needed to display the first ``n`` characters 63 | of the unicode string ``pwcs``. Returns ``-1`` for C0 and C1 control 64 | characters! 65 | 66 | See :ref:`Specification` for details of cell measurement. 67 | """ 68 | # this 'n' argument is a holdover for POSIX function 69 | _unicode_version = None 70 | end = len(pwcs) if n is None else n 71 | width = 0 72 | idx = 0 73 | last_measured_char = None 74 | while idx < end: 75 | char = pwcs[idx] 76 | if char == '\u200D': 77 | # Zero Width Joiner, do not measure this or next character 78 | idx += 2 79 | continue 80 | if char == '\uFE0F' and last_measured_char: 81 | # on variation selector 16 (VS16) following another character, 82 | # conditionally add '1' to the measured width if that character is 83 | # known to be converted from narrow to wide by the VS16 character. 
84 | if _unicode_version is None: 85 | _unicode_version = _wcversion_value(_wcmatch_version(unicode_version)) 86 | if _unicode_version >= (9, 0, 0): 87 | width += _bisearch(ord(last_measured_char), VS16_NARROW_TO_WIDE["9.0.0"]) 88 | last_measured_char = None 89 | idx += 1 90 | continue 91 | # measure character at current index 92 | wcw = wcwidth(char, unicode_version) 93 | if wcw < 0: 94 | # early return -1 on C0 and C1 control characters 95 | return wcw 96 | if wcw > 0: 97 | # track last character measured to contain a cell, so that 98 | # subsequent VS-16 modifiers may be understood 99 | last_measured_char = char 100 | width += wcw 101 | idx += 1 102 | return width 103 | # cpv: end 104 | 105 | # cpv: track https://github.com/jquast/wcwidth/blob/915166f9453098a56e87a7fb69e697696cefe206/bin/update-tables.py#L122-L160 106 | @dataclass(frozen=True) 107 | class TableEntry: 108 | """An entry of a unicode table.""" 109 | code_range: tuple[int, int] | None 110 | properties: tuple[str, ...] 111 | comment: str 112 | 113 | def filter_by_category_width(self, wide: int) -> bool: 114 | """ 115 | Return whether entry matches displayed width. 116 | 117 | Parses both DerivedGeneralCategory.txt and EastAsianWidth.txt 118 | """ 119 | if self.code_range is None: 120 | return False 121 | elif self.properties[0] == 'Sk': 122 | if 'EMOJI MODIFIER' in self.comment: 123 | # These codepoints are fullwidth when used without emoji, 0-width with. 124 | # Generate code that expects the best case, that is always combined 125 | return wide == 0 126 | elif 'FULLWIDTH' in self.comment: 127 | # Some codepoints in 'Sk' categories are fullwidth(!) 128 | # at this time just 3, FULLWIDTH: CIRCUMFLEX ACCENT, GRAVE ACCENT, and MACRON 129 | return wide == 2 130 | else: 131 | # the rest are narrow 132 | return wide == 1 133 | # Me Enclosing Mark 134 | # Mn Nonspacing Mark 135 | # Cf Format 136 | # Zl Line Separator 137 | # Zp Paragraph Separator 138 | if self.properties[0] in ('Me', 'Mn', 'Mc', 'Cf', 'Zl', 'Zp'): 139 | return wide == 0 140 | # F Fullwidth 141 | # W Wide 142 | if self.properties[0] in ('W', 'F'): 143 | return wide == 2 144 | return wide == 1 145 | # cpv: end 146 | 147 | # cpv: track https://github.com/jquast/wcwidth/blob/915166f9453098a56e87a7fb69e697696cefe206/bin/update-tables.py#L336-L391 148 | def fetch_table_wide_data() -> UnicodeTableRenderCtx: 149 | """Fetch east-asian tables.""" 150 | table: dict[UnicodeVersion, TableDef] = {} 151 | for version in fetch_unicode_versions(): 152 | # parse typical 'wide' characters by categories 'W' and 'F', 153 | table[version] = parse_category(fname=UnicodeDataFile.EastAsianWidth(version), 154 | wide=2) 155 | 156 | # subtract(!) wide characters that were defined above as 'W' category in EastAsianWidth, 157 | # but also zero-width category 'Mn' or 'Mc' in DerivedGeneralCategory! 
158 | table[version].values = table[version].values.difference(parse_category( 159 | fname=UnicodeDataFile.DerivedGeneralCategory(version), 160 | wide=0).values) 161 | 162 | # Also subtract Hangul Jamo Vowels and Hangul Trailing Consonants 163 | table[version].values = table[version].values.difference(HANGUL_JAMO_ZEROWIDTH) 164 | 165 | # finally, join with atypical 'wide' characters defined by category 'Sk', 166 | table[version].values.update(parse_category(fname=UnicodeDataFile.DerivedGeneralCategory(version), 167 | wide=2).values) 168 | return UnicodeTableRenderCtx('WIDE_EASTASIAN', table) 169 | 170 | 171 | def fetch_table_zero_data() -> UnicodeTableRenderCtx: 172 | """ 173 | Fetch zero width tables. 174 | 175 | See also: https://unicode.org/L2/L2002/02368-default-ignorable.html 176 | """ 177 | table: dict[UnicodeVersion, TableDef] = {} 178 | for version in fetch_unicode_versions(): 179 | # Determine values of zero-width character lookup table by the following category codes 180 | table[version] = parse_category(fname=UnicodeDataFile.DerivedGeneralCategory(version), 181 | wide=0) 182 | 183 | # Include NULL 184 | table[version].values.add(0) 185 | 186 | # Add Hangul Jamo Vowels and Hangul Trailing Consonants 187 | table[version].values.update(HANGUL_JAMO_ZEROWIDTH) 188 | 189 | # Remove u+00AD categoryCode=Cf name="SOFT HYPHEN", 190 | # > https://www.unicode.org/faq/casemap_charprop.html 191 | # 192 | # > Q: Unicode now treats the SOFT HYPHEN as format control (Cf) 193 | # > character when formerly it was a punctuation character (Pd). 194 | # > Doesn't this break ISO 8859-1 compatibility? 195 | # 196 | # > [..] In a terminal emulation environment, particularly in 197 | # > ISO-8859-1 contexts, one could display the SOFT HYPHEN as a hyphen 198 | # > in all circumstances. 199 | # 200 | # This value was wrongly measured as a width of '0' in this wcwidth 201 | # versions 0.2.9 - 0.2.13. 
Fixed in 0.2.14 202 | table[version].values.discard(0x00AD) # SOFT HYPHEN 203 | return UnicodeTableRenderCtx('ZERO_WIDTH', table) 204 | # cpv: end 205 | -------------------------------------------------------------------------------- /src/build/test_build_config.zig: -------------------------------------------------------------------------------- 1 | const std = @import("std"); 2 | const config = @import("config.zig"); 3 | const config_x = @import("config.x.zig"); 4 | const types = @import("types.zig"); 5 | const d = config.default; 6 | 7 | const Allocator = std.mem.Allocator; 8 | pub const log_level = .debug; 9 | 10 | fn computeFoo( 11 | allocator: Allocator, 12 | cp: u21, 13 | data: anytype, 14 | b: anytype, 15 | t: anytype, 16 | ) Allocator.Error!void { 17 | _ = allocator; 18 | _ = cp; 19 | _ = b; 20 | _ = t; 21 | data.foo = switch (data.original_grapheme_break) { 22 | .other => 0, 23 | .control => 3, 24 | else => 10, 25 | }; 26 | } 27 | 28 | const foo = config.Extension{ 29 | .inputs = &.{"original_grapheme_break"}, 30 | .compute = &computeFoo, 31 | .fields = &.{ 32 | .{ .name = "foo", .type = u8 }, 33 | }, 34 | }; 35 | 36 | // Or build your own extension: 37 | const emoji_odd_or_even = config.Extension{ 38 | .inputs = &.{"is_emoji"}, 39 | .compute = &computeEmojiOddOrEven, 40 | .fields = &.{ 41 | .{ .name = "emoji_odd_or_even", .type = EmojiOddOrEven }, 42 | }, 43 | }; 44 | 45 | fn computeEmojiOddOrEven( 46 | allocator: Allocator, 47 | cp: u21, 48 | data: anytype, 49 | backing: anytype, 50 | tracking: anytype, 51 | ) Allocator.Error!void { 52 | // allocator is an ArenaAllocator, so don't worry about freeing 53 | _ = allocator; 54 | 55 | // backing and tracking are only used for slice types (see 56 | // src/build/test_build_config.zig for examples). 57 | _ = backing; 58 | _ = tracking; 59 | 60 | if (!data.is_emoji) { 61 | data.emoji_odd_or_even = .not_emoji; 62 | } else if (cp % 2 == 0) { 63 | data.emoji_odd_or_even = .even_emoji; 64 | } else { 65 | data.emoji_odd_or_even = .odd_emoji; 66 | } 67 | } 68 | 69 | // Types must be marked `pub` 70 | pub const EmojiOddOrEven = enum(u2) { 71 | not_emoji, 72 | even_emoji, 73 | odd_emoji, 74 | }; 75 | 76 | const info = config.Extension{ 77 | .inputs = &.{ 78 | "uppercase_mapping", 79 | "numeric_value_numeric", 80 | "numeric_value_decimal", 81 | "simple_lowercase_mapping", 82 | }, 83 | .compute = &computeInfo, 84 | .fields = &.{ 85 | .{ 86 | .name = "uppercase_mapping_first_char", 87 | .type = u21, 88 | .cp_packing = .shift, 89 | .shift_low = -64190, 90 | .shift_high = 42561, 91 | }, 92 | .{ .name = "has_simple_lowercase", .type = bool }, 93 | .{ 94 | .name = "numeric_value_numeric_reversed", 95 | .type = []const u8, 96 | .max_len = 13, 97 | .max_offset = 503, 98 | .embedded_len = 1, 99 | }, 100 | }, 101 | }; 102 | 103 | fn computeInfo( 104 | allocator: Allocator, 105 | cp: u21, 106 | data: anytype, 107 | backing: anytype, 108 | tracking: anytype, 109 | ) Allocator.Error!void { 110 | var single_item_buffer: [1]u21 = undefined; 111 | types.fieldInit( 112 | "uppercase_mapping_first_char", 113 | cp, 114 | data, 115 | tracking, 116 | data.uppercase_mapping.sliceWith( 117 | backing.uppercase_mapping, 118 | &single_item_buffer, 119 | cp, 120 | )[0], 121 | ); 122 | 123 | data.has_simple_lowercase = data.simple_lowercase_mapping.unshift(cp) != null; 124 | 125 | var buffer: [13]u8 = undefined; 126 | for (data.numeric_value_numeric.slice(backing.numeric_value_numeric), 0..) 
|digit, i| { 127 | buffer[data.numeric_value_numeric.len - i - 1] = digit; 128 | } 129 | 130 | try types.sliceFieldInit( 131 | "numeric_value_numeric_reversed", 132 | allocator, 133 | cp, 134 | data, 135 | backing, 136 | tracking, 137 | buffer[0..data.numeric_value_numeric.len], 138 | ); 139 | } 140 | 141 | fn computeOptEmojiOddOrEven( 142 | allocator: Allocator, 143 | cp: u21, 144 | data: anytype, 145 | b: anytype, 146 | tracking: anytype, 147 | ) Allocator.Error!void { 148 | _ = allocator; 149 | _ = b; 150 | types.fieldInit( 151 | "opt_emoji_odd_or_even", 152 | cp, 153 | data, 154 | tracking, 155 | @as(?EmojiOddOrEven, switch (data.emoji_odd_or_even) { 156 | .even_emoji => .even_emoji, 157 | .odd_emoji => .odd_emoji, 158 | .not_emoji => null, 159 | }), 160 | ); 161 | } 162 | 163 | const opt_emoji_odd_or_even = config.Extension{ 164 | .inputs = &.{"emoji_odd_or_even"}, 165 | .compute = &computeOptEmojiOddOrEven, 166 | .fields = &.{ 167 | .{ 168 | .name = "opt_emoji_odd_or_even", 169 | .type = ?EmojiOddOrEven, 170 | .min_value = 0, 171 | .max_value = 2, 172 | }, 173 | }, 174 | }; 175 | 176 | pub const NextOrPrev = union(enum) { 177 | none: void, 178 | next: u21, 179 | prev: u21, 180 | }; 181 | 182 | fn computeNextOrPrev( 183 | allocator: Allocator, 184 | cp: u21, 185 | data: anytype, 186 | b: anytype, 187 | tracking: anytype, 188 | ) Allocator.Error!void { 189 | _ = allocator; 190 | _ = b; 191 | var nop: NextOrPrev = .none; 192 | if (0x1200 <= cp and cp <= 0x1235) { 193 | nop = switch (cp % 3) { 194 | 0 => .{ .next = cp + 1 }, 195 | 1 => .{ .prev = cp - 1 }, 196 | 2 => .none, 197 | else => unreachable, 198 | }; 199 | } 200 | 201 | types.fieldInit( 202 | "next_or_prev", 203 | cp, 204 | data, 205 | tracking, 206 | nop, 207 | ); 208 | } 209 | 210 | const next_or_prev = config.Extension{ 211 | .inputs = &.{}, 212 | .compute = &computeNextOrPrev, 213 | .fields = &.{ 214 | .{ 215 | .name = "next_or_prev", 216 | .type = NextOrPrev, 217 | .cp_packing = .shift, 218 | .shift_low = -1, 219 | .shift_high = 1, 220 | }, 221 | }, 222 | }; 223 | 224 | fn computeNextOrPrevDirect( 225 | allocator: Allocator, 226 | cp: u21, 227 | data: anytype, 228 | b: anytype, 229 | tracking: anytype, 230 | ) Allocator.Error!void { 231 | _ = allocator; 232 | _ = b; 233 | types.fieldInit( 234 | "next_or_prev_direct", 235 | cp, 236 | data, 237 | tracking, 238 | data.next_or_prev.unshift(cp), 239 | ); 240 | } 241 | 242 | const next_or_prev_direct = config.Extension{ 243 | .inputs = &.{"next_or_prev"}, 244 | .compute = &computeNextOrPrevDirect, 245 | .fields = &.{ 246 | .{ 247 | .name = "next_or_prev_direct", 248 | .type = NextOrPrev, 249 | }, 250 | }, 251 | }; 252 | 253 | fn computeBidiPairedBracketDirect( 254 | allocator: Allocator, 255 | cp: u21, 256 | data: anytype, 257 | b: anytype, 258 | tracking: anytype, 259 | ) Allocator.Error!void { 260 | _ = allocator; 261 | _ = b; 262 | types.fieldInit( 263 | "bidi_paired_bracket_direct", 264 | cp, 265 | data, 266 | tracking, 267 | data.bidi_paired_bracket.unshift(cp), 268 | ); 269 | } 270 | 271 | const bidi_paired_bracket_direct = config.Extension{ 272 | .inputs = &.{"bidi_paired_bracket"}, 273 | .compute = &computeBidiPairedBracketDirect, 274 | .fields = &.{ 275 | .{ 276 | .name = "bidi_paired_bracket_direct", 277 | .type = types.BidiPairedBracket, 278 | }, 279 | }, 280 | }; 281 | 282 | fn computeMaybeBit( 283 | allocator: Allocator, 284 | cp: u21, 285 | data: anytype, 286 | b: anytype, 287 | tracking: anytype, 288 | ) Allocator.Error!void { 289 | _ = allocator; 290 | _ = b; 291 
| var maybe: ?bool = null; 292 | if (0x1200 <= cp and cp <= 0x1235) { 293 | maybe = cp % 2 == 0; 294 | } 295 | 296 | types.fieldInit( 297 | "maybe_bit", 298 | cp, 299 | data, 300 | tracking, 301 | maybe, 302 | ); 303 | } 304 | 305 | const maybe_bit = config.Extension{ 306 | .inputs = &.{}, 307 | .compute = &computeMaybeBit, 308 | .fields = &.{ 309 | .{ 310 | .name = "maybe_bit", 311 | .type = ?bool, 312 | .min_value = 0, 313 | .max_value = 1, 314 | }, 315 | }, 316 | }; 317 | 318 | pub const tables = [_]config.Table{ 319 | .{ 320 | .extensions = &.{ 321 | foo, 322 | emoji_odd_or_even, 323 | info, 324 | next_or_prev, 325 | next_or_prev_direct, 326 | bidi_paired_bracket_direct, 327 | }, 328 | .fields = &.{ 329 | foo.field("foo"), 330 | emoji_odd_or_even.field("emoji_odd_or_even"), 331 | info.field("uppercase_mapping_first_char"), 332 | info.field("has_simple_lowercase"), 333 | info.field("numeric_value_numeric_reversed"), 334 | next_or_prev.field("next_or_prev"), 335 | next_or_prev_direct.field("next_or_prev_direct"), 336 | bidi_paired_bracket_direct.field("bidi_paired_bracket_direct"), 337 | d.field("name").override(.{ 338 | .embedded_len = 15, 339 | .max_offset = 986096, 340 | }), 341 | d.field("grapheme_break"), 342 | d.field("special_lowercase_mapping"), 343 | }, 344 | }, 345 | .{ 346 | .stages = .two, 347 | .fields = &.{ 348 | d.field("general_category"), 349 | d.field("case_folding_simple"), 350 | }, 351 | }, 352 | .{ 353 | .name = "pack", 354 | .packing = .@"packed", 355 | .extensions = &.{ 356 | emoji_odd_or_even, 357 | opt_emoji_odd_or_even, 358 | maybe_bit, 359 | }, 360 | .fields = &.{ 361 | opt_emoji_odd_or_even.field("opt_emoji_odd_or_even"), 362 | maybe_bit.field("maybe_bit"), 363 | d.field("bidi_paired_bracket"), 364 | }, 365 | }, 366 | .{ 367 | .name = "checks", 368 | .extensions = &.{}, 369 | .fields = &.{ 370 | d.field("simple_uppercase_mapping"), 371 | d.field("is_alphabetic"), 372 | d.field("is_lowercase"), 373 | d.field("is_uppercase"), 374 | d.field("is_emoji_vs_base"), 375 | d.field("is_emoji_modifier_base"), 376 | }, 377 | }, 378 | .{ 379 | .name = "needed_for_tests_and_build_build_config", 380 | .extensions = &.{ 381 | config_x.wcwidth, 382 | config_x.grapheme_break_no_control, 383 | }, 384 | .fields = &config._resolveFields( 385 | config_x, 386 | &.{ 387 | "wcwidth_standalone", 388 | "wcwidth_zero_in_grapheme", 389 | "grapheme_break_no_control", 390 | "special_casing_condition", 391 | "bidi_class", 392 | }, 393 | &.{ "wcwidth", "grapheme_break_no_control" }, 394 | ), 395 | }, 396 | }; 397 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # uucode (Micro/µ Unicode) 2 | 3 | A fast and flexible unicode library, fully configurable at build time. 
4 | 5 | ## Basic usage 6 | 7 | ``` zig 8 | const uucode = @import("uucode"); 9 | 10 | var cp: u21 = undefined; 11 | 12 | ////////////////////// 13 | // `get` properties 14 | 15 | cp = 0x2200; // ∀ 16 | uucode.get(.general_category, cp) // .symbol_math 17 | 18 | cp = 0x03C2; // ς 19 | uucode.get(.simple_uppercase_mapping, cp) // U+03A3 == Σ 20 | 21 | cp = 0x21C1; // ⇁ 22 | uucode.get(.name, cp) // "RIGHTWARDS HARPOON WITH BARB DOWNWARDS" 23 | 24 | // Many of the []const u21 fields need a single item buffer passed to `with`: 25 | var buffer: [1]u21 = undefined; 26 | cp = 0x00DF; // ß 27 | uucode.get(.uppercase_mapping, cp).with(&buffer, cp) // "SS" 28 | 29 | ////////////////////// 30 | // `getAll` to get a group of properties for a code point together. 31 | 32 | cp = 0x03C2; // ς 33 | 34 | // The first argument is the name/index of the table. 35 | const data = uucode.getAll("0", cp); 36 | 37 | data.simple_uppercase_mapping // U+03A3 == Σ 38 | data.general_category // .letter_lowercase 39 | 40 | ////////////////////// 41 | // utf8.Iterator 42 | 43 | var it = uucode.utf8.Iterator.init("😀😅😻👺"); 44 | it.next(); // 0x1F600 45 | it.i; // 4 (bytes into the utf8 string) 46 | it.peek(); // 0x1F605 47 | it.next(); // 0x1F605 48 | it.next(); // 0x1F63B 49 | it.next(); // 0x1F47A 50 | 51 | ////////////////////// 52 | // grapheme.Iterator / grapheme.utf8Iterator 53 | 54 | const str = "👩🏽‍🚀🇨🇭👨🏻‍🍼"; var it = uucode.grapheme.utf8Iterator(str); 55 | 56 | // (which is equivalent to:) 57 | var it = uucode.grapheme.Iterator(uucode.utf8.Iterator).init(.init(str)); 58 | 59 | // `nextCodePoint` advances one code point at a time, indicating a new grapheme 60 | // with `is_break = true`. 61 | it.nextCodePoint(); // { .code_point = 0x1F469; .is_break = false } // 👩 62 | it.i; // 4 (bytes into the utf8 string) 63 | 64 | it.peekCodePoint(); // { .code_point = 0x1F3FD; .is_break = false } // 🏽 65 | it.nextCodePoint(); // { .code_point = 0x1F3FD; .is_break = false } // 🏽 66 | it.nextCodePoint(); // { .code_point = 0x200D; .is_break = false } // Zero width joiner 67 | it.nextCodePoint(); // { .code_point = 0x1F680; .is_break = true } // 🚀 68 | 69 | // `nextGrapheme` advances until the start of the next grapheme cluster 70 | const result = it.nextGrapheme(); // { .start = 15; .end = 23 } 71 | it.i; // "👩🏽‍🚀🇨🇭".len 72 | str[result.?.start..result.?.end]; // "🇨🇭" 73 | 74 | const result = it.peekGrapheme(); 75 | str[result.?.start..result.?.end]; // "👨🏻‍🍼" 76 | 77 | ////////////////////// 78 | // grapheme.isBreak 79 | 80 | var break_state: uucode.grapheme.BreakState = .default; 81 | 82 | var cp1: u21 = 0x1F469; // 👩 83 | var cp2: u21 = 0x1F3FD; // 🏽 84 | uucode.grapheme.isBreak(cp1, cp2, &break_state); // false 85 | 86 | cp1 = cp2; 87 | cp2 = 0x200D; // Zero width joiner 88 | uucode.grapheme.isBreak(cp1, cp2, &break_state); // false 89 | 90 | cp1 = cp2; 91 | cp2 = 0x1F680; // 🚀 92 | // The combined grapheme cluster is 👩🏽‍🚀 (woman astronaut) 93 | uucode.grapheme.isBreak(cp1, cp2, &break_state); // false 94 | 95 | cp1 = cp2; 96 | cp2 = 0x1F468; // 👨 97 | uucode.grapheme.isBreak(cp1, cp2, &break_state); // true 98 | 99 | ////////////////////// 100 | // x.grapheme.wcwidth{,Next,Remaining} / x.grapheme.utf8Wcwidth 101 | 102 | const str = "ò👨🏻‍❤️‍👨🏿_"; 103 | var it = uucode.grapheme.utf8Iterator(str); 104 | 105 | // Requires the `wcwidth` builtin extension (see below) 106 | uucode.x.grapheme.wcwidth(it); // 1 for 'ò' 107 | 108 | uucode.x.grapheme.wcwidthNext(&it); // 1 for 'ò' 109 | const result = it.peekGrapheme(); 110 | 
str[result.?.start..result.?.end]; // "👨🏻‍❤️‍👨🏿" 111 | 112 | uucode.x.grapheme.wcwidthRemaining(&it); // 3 for "👨🏻‍❤️‍👨🏿_" 113 | 114 | uucode.x.grapheme.utf8Wcwidth(str); // 4 for the whole string 115 | 116 | ////////////////////// 117 | // TypeOf / TypeOfAll / hasField 118 | 119 | uucode.TypeOf(.general_category) // uucode.types.GeneralCategory 120 | uucode.TypeOfAll("0") // @TypeOf(uucode.getAll("0")) 121 | uucode.hasField("is_emoji") // true if `is_emoji` is in one of your tables 122 | ``` 123 | 124 | See [src/config.zig](./src/config.zig) for the names of all fields. 125 | 126 | ## Configuration 127 | 128 | Only include the Unicode fields you actually use: 129 | 130 | ``` zig 131 | // In `build.zig`: 132 | if (b.lazyDependency("uucode", .{ 133 | .target = target, 134 | .optimize = optimize, 135 | .fields = @as([]const []const u8, &.{ 136 | "name", 137 | "general_category", 138 | "case_folding_simple", 139 | "is_alphabetic", 140 | // ... 141 | }), 142 | })) |dep| { 143 | step.root_module.addImport("uucode", dep.module("uucode")); 144 | } 145 | ``` 146 | 147 | ### Multiple tables 148 | 149 | Fields can be split into multiple tables using `fields_0` through `fields_9`, to optimize how fields are stored and accessed (with no code changes needed). 150 | 151 | 152 | ``` zig 153 | // In `build.zig`: 154 | if (b.lazyDependency("uucode", .{ 155 | .target = target, 156 | .optimize = optimize, 157 | .fields_0 = @as([]const []const u8, &.{ 158 | "general_category", 159 | "case_folding_simple", 160 | "is_alphabetic", 161 | }), 162 | .fields_1 = @as([]const []const u8, &.{ 163 | // ... 164 | }), 165 | .fields_2 = @as([]const []const u8, &.{ 166 | // ... 167 | }), 168 | // ... `fields_3` to `fields_9` 169 | })) |dep| { 170 | step.root_module.addImport("uucode", dep.module("uucode")); 171 | } 172 | ``` 173 | 174 | ### Builtin extensions 175 | 176 | `uucode` includes builtin extensions that add derived properties. Use `extensions` or `extensions_0` through `extensions_9` to include them: 177 | 178 | ``` zig 179 | // In `build.zig`: 180 | if (b.lazyDependency("uucode", .{ 181 | .target = target, 182 | .optimize = optimize, 183 | .extensions = @as([]const []const u8, &.{ 184 | "wcwidth", 185 | }), 186 | .fields = @as([]const []const u8, &.{ 187 | // Make sure to also include the extension's fields here: 188 | "wcwidth_standalone", 189 | "wcwidth_zero_in_grapheme", 190 | ... 191 | "general_category", 192 | }), 193 | })) |dep| { 194 | step.root_module.addImport("uucode", dep.module("uucode")); 195 | } 196 | 197 | // In your code: 198 | uucode.get(.wcwidth_standalone, 0x26F5) // ⛵ == 2 199 | ``` 200 | 201 | See [src/x/config.x.zig](src/x/config.x.zig) for the full list of builtin extensions. 202 | 203 | ### Advanced configuration 204 | 205 | ``` zig 206 | /////////////////////////////////////////////////////////// 207 | // In `build.zig`: 208 | 209 | b.dependency("uucode", .{ 210 | .target = target, 211 | .optimize = optimize, 212 | .build_config_path = b.path("src/build/uucode_config.zig"), 213 | 214 | // Alternatively, use a string literal: 215 | //.@"build_config.zig" = "..." 
216 | }) 217 | 218 | /////////////////////////////////////////////////////////// 219 | // In `src/build/uucode_config.zig`: 220 | 221 | const std = @import("std"); 222 | const config = @import("config.zig"); 223 | 224 | // Use `config.x.zig` for builtin extensions: 225 | const config_x = @import("config.x.zig"); 226 | 227 | const d = config.default; 228 | const wcwidth = config_x.wcwidth; 229 | 230 | // Or build your own extension: 231 | const emoji_odd_or_even = config.Extension{ 232 | .inputs = &.{"is_emoji"}, 233 | .compute = &computeEmojiOddOrEven, 234 | .fields = &.{ 235 | .{ .name = "emoji_odd_or_even", .type = EmojiOddOrEven }, 236 | }, 237 | }; 238 | 239 | fn computeEmojiOddOrEven( 240 | allocator: std.mem.Allocator, 241 | cp: u21, 242 | data: anytype, 243 | backing: anytype, 244 | tracking: anytype, 245 | ) std.mem.Allocator.Error!void { 246 | // allocator is an ArenaAllocator, so don't worry about freeing 247 | _ = allocator; 248 | 249 | // backing and tracking are only used for slice types (see 250 | // src/build/test_build_config.zig for examples). 251 | _ = backing; 252 | _ = tracking; 253 | 254 | if (!data.is_emoji) { 255 | data.emoji_odd_or_even = .not_emoji; 256 | } else if (cp % 2 == 0) { 257 | data.emoji_odd_or_even = .even_emoji; 258 | } else { 259 | data.emoji_odd_or_even = .odd_emoji; 260 | } 261 | } 262 | 263 | // Types must be marked `pub` 264 | pub const EmojiOddOrEven = enum(u2) { 265 | not_emoji, 266 | even_emoji, 267 | odd_emoji, 268 | }; 269 | 270 | // Configure tables with the `tables` declaration. 271 | // The only required field is `fields`, and the rest have reasonable defaults. 272 | pub const tables = [_]config.Table{ 273 | .{ 274 | // Optional name, to be able to `getAll("foo")` rather than e.g. 275 | // `getAll("0")` 276 | .name = "foo", 277 | 278 | // A two-stage table can be slightly faster if the data is small. The 279 | // default `.auto` will pick a reasonable value, but to get the 280 | // absolute best performance run benchmarks with `.two` or `.three` 281 | // on realistic data. 282 | .stages = .three, 283 | 284 | // The default `.auto` value decides whether the final data stage struct 285 | // should be a `packed struct` (.@"packed") or a regular Zig `struct`. 286 | .packing = .unpacked, 287 | 288 | .extensions = &.{ 289 | emoji_odd_or_even, 290 | wcwidth, 291 | }, 292 | 293 | .fields = &.{ 294 | // Don't forget to include the extension's fields here. 295 | emoji_odd_or_even.field("emoji_odd_or_even"), 296 | wcwidth.field("wcwidth_standalone"), 297 | wcwidth.field("wcwidth_zero_in_grapheme"), 298 | 299 | // See `src/config.zig` for everything that can be overridden. 300 | // In this example, we're embedding 15 bytes into the `stage3` data, 301 | // and only names longer than that need to use the `backing` slice. 302 | d.field("name").override(.{ 303 | .embedded_len = 15, 304 | .max_offset = 986096, // run once to get the correct number 305 | }), 306 | 307 | d.field("general_category"), 308 | d.field("block"), 309 | // ...
310 | }, 311 | }, 312 | }; 313 | 314 | // Turn on debug logging: 315 | pub const log_level = .debug; 316 | 317 | /////////////////////////////////////////////////////////// 318 | // In your code: 319 | 320 | const uucode = @import("uucode"); 321 | 322 | uucode.get(.wcwidth_standalone, 0x26F5) // ⛵ == 2 323 | 324 | uucode.get(.emoji_odd_or_even, 0x1F34B) // 🍋 == .odd_emoji 325 | 326 | ``` 327 | 328 | ## Code architecture 329 | 330 | The architecture works in a few layers: 331 | 332 | * Layer 1 (`src/build/Ucd.zig`): Parses the Unicode Character Database (UCD). 333 | * Layer 2 (`src/build/tables.zig`): Generates table data written to a zig file. 334 | * Layer 3 (`src/root.zig`): Exposes methods to fetch information from the built tables. 335 | 336 | ## History and acknowledgments 337 | 338 | `uucode` grew out of work on the [Ghostty terminal](https://ghostty.org/) on [an issue to upgrade dependencies](https://github.com/ghostty-org/ghostty/issues/5694), where the experience of modifying [zg](https://codeberg.org/atman/zg/) gave the confidence to build a fresh library. 339 | 340 | `uucode` builds upon the Unicode performance work done in Ghostty, [as outlined in this excellent Devlog](https://mitchellh.com/writing/ghostty-devlog-006). The 3-stage lookup tables, as mentioned in that Devlog, come from [this article](https://here-be-braces.com/fast-lookup-of-unicode-properties/). 341 | 342 | ## License 343 | 344 | `uucode` is available under an MIT License. See [./LICENSE.md](./LICENSE.md) for the license text and an index of licenses for code used in the repo. 345 | 346 | ## Resources 347 | 348 | See [./RESOURCES.md](./RESOURCES.md) for a list of resources used to build `uucode`. 349 | -------------------------------------------------------------------------------- /src/x/config_x/wcwidth.zig: -------------------------------------------------------------------------------- 1 | //! `wcwidth` is a calculation of the expected width of a code point in 2 | //! cells of a monospaced font. It is not part of the Unicode standard. 3 | //! 4 | //! IMPORTANT: in general, calculate the width of a grapheme cluster with 5 | //! `uucode.x.grapheme.wcwidth(it)` instead of using this `wcwidth` 6 | //! directly. If it's already known that a code point is standing alone and not 7 | //! part of a multiple-code-point grapheme cluster, it's acceptable to use 8 | //! `wcwidth_standalone` directly. 9 | //! 10 | //! This `wcwidth` calculates two related values: 11 | //! 12 | //! * `wcwidth_standalone`: The width for a code point as it would display 13 | //! **standing alone** without being combined with other code points in a 14 | //! grapheme cluster. Put another way, this is the width of a grapheme 15 | //! cluster consisting of only this code point. For some code points, it is 16 | //! rare or even technically "invalid" to be alone in a grapheme cluster, but 17 | //! despite that, we provide a width for them. See `wcwidth` in 18 | //! `src/x/grapheme.zig` for the code and documentation for determining the 19 | //! width of a grapheme cluster that may contain multiple code points, and 20 | //! note how it uses this `wcwidth_standalone` when there is only one code 21 | //! point. 22 | //! 23 | //! * `wcwidth_zero_in_grapheme`: This indicates whether a code point does not 24 | //! contribute to width within a grapheme cluster, even if the code point 25 | //! might have width when standing alone (`wcwidth_standalone`). Emoji 26 | //! modifiers, nonspacing and enclosing marks, and Hangul/Kirat V/T are all 27 | //! 
in this category. 28 | //! 29 | //! See resources/wcwidth for other implementations that help inform the 30 | //! implementation here. 31 | //! 32 | //! This implementation makes the following choices: 33 | //! 34 | //! * The returned width is never negative. C0 and C1 control characters are 35 | //! treated as zero width, diverging from some implementations that return 36 | //! -1. 37 | //! 38 | //! * When a combining mark (Mn, Mc, Me) stands alone (not preceded by a base 39 | //! character), it forms a "defective combining character sequence" (Core Spec 40 | //! 3.6, 41 | //! https://www.unicode.org/versions/Unicode17.0.0/core-spec/chapter-3/#G20665). 42 | //! Per Core Spec 5.13: "Defective combining character sequences should be 43 | //! rendered as if they had a no-break space as a base character" 44 | //! (https://www.unicode.org/versions/Unicode17.0.0/core-spec/chapter-5/#G1099). 45 | //! Therefore, `wcwidth_standalone` is given a width of 1. 46 | //! 47 | //! Note: Per UAX #44, nonspacing marks (Mn) have "zero advance width" while 48 | //! spacing marks (Mc) have "positive advance width" 49 | //! (https://www.unicode.org/reports/tr44/#General_Category_Values). 50 | //! Enclosing marks (Me) are not explicitly specified, but in terminal 51 | //! rendering contexts they behave similarly to nonspacing marks. See also 52 | //! Core Spec 2.11, "Nonspacing combining characters do not occupy a spacing 53 | //! position by themselves" 54 | //! (https://www.unicode.org/versions/Unicode17.0.0/core-spec/chapter-2/#G1789). 55 | //! Therefore, `wcwidth_zero_in_grapheme` is true for nonspacing marks (Mn) 56 | //! and enclosing marks (Me). 57 | //! 58 | //! * East Asian Width (UAX #11, https://www.unicode.org/reports/tr11/) is used 59 | //! to determine width, but only as a starting point. UAX #11 warns that 60 | //! East_Asian_Width "is not intended for use by modern terminal emulators 61 | //! without appropriate tailoring" (UAX #11 §2, 62 | //! https://www.unicode.org/reports/tr11/#Scope). This implementation applies 63 | //! tailoring for specific cases such as regional indicators. 64 | //! 65 | //! Ambiguous width (A) characters are treated as width 1. Per UAX #11 §5 66 | //! Recommendations: "If the context cannot be established reliably, they 67 | //! should be treated as narrow characters by default" 68 | //! (https://www.unicode.org/reports/tr11/#Recommendations), and per UAX #11 69 | //! §4.2 Ambiguous Characters: "Modern practice is evolving toward rendering 70 | //! ever more of the ambiguous characters with proportionally spaced, narrow 71 | //! forms that rotate with the direction of writing, independent of their 72 | //! treatment in one or more legacy character sets." 73 | //! 74 | //! * U+20E3 COMBINING ENCLOSING KEYCAP is commonly used in emoji keycap 75 | //! sequences like 1️⃣ (digit + VS16 + U+20E3), but when standing alone might 76 | //! render as an empty keycap symbol visually occupying 2 cells, so it is 77 | //! given width 2. This is a special case—other enclosing marks like U+20DD 78 | //! COMBINING ENCLOSING CIRCLE are width 1. UTS #51 §1.4.6 ED-20 states 79 | //! "Other components (U+20E3 COMBINING ENCLOSING KEYCAP, ...) should never 80 | //! have an emoji presentation in isolation" 81 | //! (https://www.unicode.org/reports/tr51/#def_basic_emoji_set), so this 82 | //! should display with text presentation standing alone. For 83 | //! `wcwidth_zero_in_grapheme`, it is true, as it should usually follow VS16 84 | //! 
preceded by a digit or '#', and so the entire keycap sequence will be a 85 | //! width of 2 from the special VS16 handling. 86 | //! 87 | //! * Regional indicator symbols (U+1F1E6..U+1F1FF) are treated as width 2, 88 | //! whether paired in valid emoji flag sequences or standing alone. Per UTS #51 89 | //! §1.5 Conformance: "A singleton emoji Regional Indicator may be displayed 90 | //! as a capital A..Z character with a special display" 91 | //! (https://www.unicode.org/reports/tr51/#C3). Unpaired regional indicators 92 | //! commonly render as the corresponding letter in a width-2 box (e.g., 🇺 93 | //! displays as "U" in a box). See the above bullet point (U+20E3) for the 94 | //! text from UTS #51 §1.4.6 ED-20 that also applies to regional indicators, 95 | //! meaning they should have a text presentation in isolation. 96 | //! 97 | //! * Default_Ignorable_Code_Point characters are treated as width 0. These are 98 | //! characters that "should be ignored in rendering (unless explicitly 99 | //! supported)" (UAX #44, 100 | //! https://www.unicode.org/reports/tr44/#Default_Ignorable_Code_Point). This 101 | //! includes variation selectors, join controls (ZWJ/ZWNJ), bidi formatting 102 | //! controls, tag characters, and other invisible format controls. 103 | //! 104 | //! Exception: U+00AD SOFT HYPHEN is treated as width 1 for terminal 105 | //! compatibility despite being default-ignorable. Per the Unicode FAQ: "In a 106 | //! terminal emulation environment, particularly in ISO-8859-1 contexts, one 107 | //! could display the SOFT HYPHEN as a hyphen in all circumstances" 108 | //! (https://www.unicode.org/faq/casemap_charprop.html). Terminals lack 109 | //! sophisticated word-breaking algorithms and typically display SOFT HYPHEN as 110 | //! a visible hyphen, requiring width 1. This matches ecosystem wcwidth 111 | //! implementations. 112 | //! 113 | //! VS15 and VS16 have `wcwidth_zero_in_grapheme` set to true. These are not 114 | //! "zero in grapheme" in the sense that they don't affect width--they change 115 | //! the width of the base char! But they don't have their *own* independent 116 | //! width contribution that should be summed. They are special cased in the 117 | //! `x/grapheme.zig` `wcwidth` calculation. 118 | //! 119 | //! * Hangul Jamo medial vowels and Kirat Rai vowels (all 120 | //! Grapheme_Cluster_Break=V) and Hangul trailing consonants 121 | //! (Grapheme_Cluster_Break=T) are width 1 for wcwidth_standalone since they 122 | //! are General_Category=Other_Letter with East_Asian_Width=Neutral. However, 123 | //! `wcwidth_zero_in_grapheme` is true for these, as they should only be 124 | //! present in a grapheme cluster where the other code points contribute to 125 | //! the width. 126 | //! 127 | //! * Grapheme_Cluster_Break=Prepend characters (e.g., Indic Rephas) are treated 128 | //! as width 1 when standing alone, but join with subsequent code points and 129 | //! are `wcwidth_zero_in_grapheme` true. Note that none of the Prepend 130 | //! characters are default-ignorable. 131 | //! 132 | //! * Surrogates (General_Category=Cs, U+D800..U+DFFF) are treated as width 0. 133 | //! They are not Unicode scalar values (Core Spec 3.9, 134 | //! https://www.unicode.org/versions/Unicode17.0.0/core-spec/chapter-3/#G25539) 135 | //! and "are designated for surrogate code units in the UTF-16 character 136 | //! encoding form. They are unassigned to any abstract character." (Core Spec 137 | //! 3.2.1 C1, 138 | //! 
https://www.unicode.org/versions/Unicode17.0.0/core-spec/chapter-3/#G22599). 139 | //! 140 | //! * U+2028 LINE SEPARATOR (Zl) and U+2029 PARAGRAPH SEPARATOR (Zp) are 141 | //! treated as width 0. They introduce mandatory line/paragraph breaks (UAX 142 | //! #14, Line_Break=BK, https://www.unicode.org/reports/tr14/#BK) and do not 143 | //! advance horizontally on the same line. 144 | //! 145 | //! * Emoji modifiers (Fitzpatrick skin tone modifiers U+1F3FB..U+1F3FF) have 146 | //! `wcwidth_standalone` = 2, as when standing alone they render as fullwidth 147 | //! colored squares (and are marked East_Asian_Width=W) However, 148 | //! `wcwidth_zero_in_grapheme` is true, as they are typically used to modify a 149 | //! base emoji which contributes the width. 150 | //! 151 | 152 | const std = @import("std"); 153 | const config = @import("config.zig"); 154 | 155 | fn compute( 156 | allocator: std.mem.Allocator, 157 | cp: u21, 158 | data: anytype, 159 | backing: anytype, 160 | tracking: anytype, 161 | ) std.mem.Allocator.Error!void { 162 | _ = allocator; 163 | _ = backing; 164 | _ = tracking; 165 | const gc = data.general_category; 166 | 167 | var width: u2 = undefined; 168 | 169 | if (gc == .other_control or 170 | gc == .other_surrogate or 171 | gc == .separator_line or 172 | gc == .separator_paragraph) 173 | { 174 | width = 0; 175 | } else if (cp == 0x00AD) { // Soft hyphen 176 | width = 1; 177 | } else if (data.is_default_ignorable) { 178 | width = 0; 179 | } else if (cp == 0x2E3A) { // Two-em dash 180 | width = 2; 181 | } else if (cp == 0x2E3B) { // Three-em dash 182 | width = 3; 183 | } else if (data.east_asian_width == .wide or data.east_asian_width == .fullwidth) { 184 | width = 2; 185 | } else if (data.grapheme_break == .regional_indicator) { 186 | width = 2; 187 | } else { 188 | width = 1; 189 | } 190 | 191 | const Data = @TypeOf(data.*); 192 | if (@hasField(Data, "wcwidth_standalone")) { 193 | if (cp == 0x20E3) { // Combining enclosing keycap 194 | data.wcwidth_standalone = 2; 195 | } else { 196 | data.wcwidth_standalone = width; 197 | } 198 | } 199 | if (@hasField(Data, "wcwidth_zero_in_grapheme")) { 200 | if (width == 0 or // Includes default_ignorable such as ZWJ and VS 201 | data.is_emoji_modifier or 202 | gc == .mark_nonspacing or 203 | gc == .mark_enclosing or // Including keycap 204 | data.grapheme_break == .v or // Hangul Jamo and Kirat Rai vowels 205 | data.grapheme_break == .t or // Hangul trailing consonants 206 | data.grapheme_break == .prepend // e.g. Indic Rephas 207 | ) { 208 | data.wcwidth_zero_in_grapheme = true; 209 | } else { 210 | data.wcwidth_zero_in_grapheme = false; 211 | } 212 | } 213 | } 214 | 215 | pub const wcwidth = config.Extension{ 216 | .inputs = &.{ 217 | "east_asian_width", 218 | "general_category", 219 | "grapheme_break", 220 | "is_default_ignorable", 221 | "is_emoji_modifier", 222 | }, 223 | .compute = &compute, 224 | .fields = &.{ 225 | .{ .name = "wcwidth_standalone", .type = u2 }, 226 | .{ .name = "wcwidth_zero_in_grapheme", .type = bool }, 227 | }, 228 | }; 229 | -------------------------------------------------------------------------------- /ucd/Blocks.txt: -------------------------------------------------------------------------------- 1 | # Blocks-16.0.0.txt 2 | # Date: 2024-02-02 3 | # © 2024 Unicode®, Inc. 4 | # Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries. 
5 | # For terms of use and license, see https://www.unicode.org/terms_of_use.html 6 | # 7 | # Unicode Character Database 8 | # For documentation, see https://www.unicode.org/reports/tr44/ 9 | # 10 | # Format: 11 | # Start Code..End Code; Block Name 12 | 13 | # ================================================ 14 | 15 | # Note: When comparing block names, casing, whitespace, hyphens, 16 | # and underbars are ignored. 17 | # For example, "Latin Extended-A" and "latin extended a" are equivalent. 18 | # For more information on the comparison of property values, 19 | # see UAX #44: https://www.unicode.org/reports/tr44/ 20 | # 21 | # All block ranges start with a value where (cp MOD 16) = 0, 22 | # and end with a value where (cp MOD 16) = 15. In other words, 23 | # the last hexadecimal digit of the start of range is ...0 24 | # and the last hexadecimal digit of the end of range is ...F. 25 | # This constraint on block ranges guarantees that allocations 26 | # are done in terms of whole columns, and that code chart display 27 | # never involves splitting columns in the charts. 28 | # 29 | # All code points not explicitly listed for Block 30 | # have the value No_Block. 31 | 32 | # Property: Block 33 | # 34 | # @missing: 0000..10FFFF; No_Block 35 | 36 | 0000..007F; Basic Latin 37 | 0080..00FF; Latin-1 Supplement 38 | 0100..017F; Latin Extended-A 39 | 0180..024F; Latin Extended-B 40 | 0250..02AF; IPA Extensions 41 | 02B0..02FF; Spacing Modifier Letters 42 | 0300..036F; Combining Diacritical Marks 43 | 0370..03FF; Greek and Coptic 44 | 0400..04FF; Cyrillic 45 | 0500..052F; Cyrillic Supplement 46 | 0530..058F; Armenian 47 | 0590..05FF; Hebrew 48 | 0600..06FF; Arabic 49 | 0700..074F; Syriac 50 | 0750..077F; Arabic Supplement 51 | 0780..07BF; Thaana 52 | 07C0..07FF; NKo 53 | 0800..083F; Samaritan 54 | 0840..085F; Mandaic 55 | 0860..086F; Syriac Supplement 56 | 0870..089F; Arabic Extended-B 57 | 08A0..08FF; Arabic Extended-A 58 | 0900..097F; Devanagari 59 | 0980..09FF; Bengali 60 | 0A00..0A7F; Gurmukhi 61 | 0A80..0AFF; Gujarati 62 | 0B00..0B7F; Oriya 63 | 0B80..0BFF; Tamil 64 | 0C00..0C7F; Telugu 65 | 0C80..0CFF; Kannada 66 | 0D00..0D7F; Malayalam 67 | 0D80..0DFF; Sinhala 68 | 0E00..0E7F; Thai 69 | 0E80..0EFF; Lao 70 | 0F00..0FFF; Tibetan 71 | 1000..109F; Myanmar 72 | 10A0..10FF; Georgian 73 | 1100..11FF; Hangul Jamo 74 | 1200..137F; Ethiopic 75 | 1380..139F; Ethiopic Supplement 76 | 13A0..13FF; Cherokee 77 | 1400..167F; Unified Canadian Aboriginal Syllabics 78 | 1680..169F; Ogham 79 | 16A0..16FF; Runic 80 | 1700..171F; Tagalog 81 | 1720..173F; Hanunoo 82 | 1740..175F; Buhid 83 | 1760..177F; Tagbanwa 84 | 1780..17FF; Khmer 85 | 1800..18AF; Mongolian 86 | 18B0..18FF; Unified Canadian Aboriginal Syllabics Extended 87 | 1900..194F; Limbu 88 | 1950..197F; Tai Le 89 | 1980..19DF; New Tai Lue 90 | 19E0..19FF; Khmer Symbols 91 | 1A00..1A1F; Buginese 92 | 1A20..1AAF; Tai Tham 93 | 1AB0..1AFF; Combining Diacritical Marks Extended 94 | 1B00..1B7F; Balinese 95 | 1B80..1BBF; Sundanese 96 | 1BC0..1BFF; Batak 97 | 1C00..1C4F; Lepcha 98 | 1C50..1C7F; Ol Chiki 99 | 1C80..1C8F; Cyrillic Extended-C 100 | 1C90..1CBF; Georgian Extended 101 | 1CC0..1CCF; Sundanese Supplement 102 | 1CD0..1CFF; Vedic Extensions 103 | 1D00..1D7F; Phonetic Extensions 104 | 1D80..1DBF; Phonetic Extensions Supplement 105 | 1DC0..1DFF; Combining Diacritical Marks Supplement 106 | 1E00..1EFF; Latin Extended Additional 107 | 1F00..1FFF; Greek Extended 108 | 2000..206F; General Punctuation 109 | 2070..209F; Superscripts and Subscripts 110 | 
20A0..20CF; Currency Symbols 111 | 20D0..20FF; Combining Diacritical Marks for Symbols 112 | 2100..214F; Letterlike Symbols 113 | 2150..218F; Number Forms 114 | 2190..21FF; Arrows 115 | 2200..22FF; Mathematical Operators 116 | 2300..23FF; Miscellaneous Technical 117 | 2400..243F; Control Pictures 118 | 2440..245F; Optical Character Recognition 119 | 2460..24FF; Enclosed Alphanumerics 120 | 2500..257F; Box Drawing 121 | 2580..259F; Block Elements 122 | 25A0..25FF; Geometric Shapes 123 | 2600..26FF; Miscellaneous Symbols 124 | 2700..27BF; Dingbats 125 | 27C0..27EF; Miscellaneous Mathematical Symbols-A 126 | 27F0..27FF; Supplemental Arrows-A 127 | 2800..28FF; Braille Patterns 128 | 2900..297F; Supplemental Arrows-B 129 | 2980..29FF; Miscellaneous Mathematical Symbols-B 130 | 2A00..2AFF; Supplemental Mathematical Operators 131 | 2B00..2BFF; Miscellaneous Symbols and Arrows 132 | 2C00..2C5F; Glagolitic 133 | 2C60..2C7F; Latin Extended-C 134 | 2C80..2CFF; Coptic 135 | 2D00..2D2F; Georgian Supplement 136 | 2D30..2D7F; Tifinagh 137 | 2D80..2DDF; Ethiopic Extended 138 | 2DE0..2DFF; Cyrillic Extended-A 139 | 2E00..2E7F; Supplemental Punctuation 140 | 2E80..2EFF; CJK Radicals Supplement 141 | 2F00..2FDF; Kangxi Radicals 142 | 2FF0..2FFF; Ideographic Description Characters 143 | 3000..303F; CJK Symbols and Punctuation 144 | 3040..309F; Hiragana 145 | 30A0..30FF; Katakana 146 | 3100..312F; Bopomofo 147 | 3130..318F; Hangul Compatibility Jamo 148 | 3190..319F; Kanbun 149 | 31A0..31BF; Bopomofo Extended 150 | 31C0..31EF; CJK Strokes 151 | 31F0..31FF; Katakana Phonetic Extensions 152 | 3200..32FF; Enclosed CJK Letters and Months 153 | 3300..33FF; CJK Compatibility 154 | 3400..4DBF; CJK Unified Ideographs Extension A 155 | 4DC0..4DFF; Yijing Hexagram Symbols 156 | 4E00..9FFF; CJK Unified Ideographs 157 | A000..A48F; Yi Syllables 158 | A490..A4CF; Yi Radicals 159 | A4D0..A4FF; Lisu 160 | A500..A63F; Vai 161 | A640..A69F; Cyrillic Extended-B 162 | A6A0..A6FF; Bamum 163 | A700..A71F; Modifier Tone Letters 164 | A720..A7FF; Latin Extended-D 165 | A800..A82F; Syloti Nagri 166 | A830..A83F; Common Indic Number Forms 167 | A840..A87F; Phags-pa 168 | A880..A8DF; Saurashtra 169 | A8E0..A8FF; Devanagari Extended 170 | A900..A92F; Kayah Li 171 | A930..A95F; Rejang 172 | A960..A97F; Hangul Jamo Extended-A 173 | A980..A9DF; Javanese 174 | A9E0..A9FF; Myanmar Extended-B 175 | AA00..AA5F; Cham 176 | AA60..AA7F; Myanmar Extended-A 177 | AA80..AADF; Tai Viet 178 | AAE0..AAFF; Meetei Mayek Extensions 179 | AB00..AB2F; Ethiopic Extended-A 180 | AB30..AB6F; Latin Extended-E 181 | AB70..ABBF; Cherokee Supplement 182 | ABC0..ABFF; Meetei Mayek 183 | AC00..D7AF; Hangul Syllables 184 | D7B0..D7FF; Hangul Jamo Extended-B 185 | D800..DB7F; High Surrogates 186 | DB80..DBFF; High Private Use Surrogates 187 | DC00..DFFF; Low Surrogates 188 | E000..F8FF; Private Use Area 189 | F900..FAFF; CJK Compatibility Ideographs 190 | FB00..FB4F; Alphabetic Presentation Forms 191 | FB50..FDFF; Arabic Presentation Forms-A 192 | FE00..FE0F; Variation Selectors 193 | FE10..FE1F; Vertical Forms 194 | FE20..FE2F; Combining Half Marks 195 | FE30..FE4F; CJK Compatibility Forms 196 | FE50..FE6F; Small Form Variants 197 | FE70..FEFF; Arabic Presentation Forms-B 198 | FF00..FFEF; Halfwidth and Fullwidth Forms 199 | FFF0..FFFF; Specials 200 | 10000..1007F; Linear B Syllabary 201 | 10080..100FF; Linear B Ideograms 202 | 10100..1013F; Aegean Numbers 203 | 10140..1018F; Ancient Greek Numbers 204 | 10190..101CF; Ancient Symbols 205 | 101D0..101FF; Phaistos Disc 
206 | 10280..1029F; Lycian 207 | 102A0..102DF; Carian 208 | 102E0..102FF; Coptic Epact Numbers 209 | 10300..1032F; Old Italic 210 | 10330..1034F; Gothic 211 | 10350..1037F; Old Permic 212 | 10380..1039F; Ugaritic 213 | 103A0..103DF; Old Persian 214 | 10400..1044F; Deseret 215 | 10450..1047F; Shavian 216 | 10480..104AF; Osmanya 217 | 104B0..104FF; Osage 218 | 10500..1052F; Elbasan 219 | 10530..1056F; Caucasian Albanian 220 | 10570..105BF; Vithkuqi 221 | 105C0..105FF; Todhri 222 | 10600..1077F; Linear A 223 | 10780..107BF; Latin Extended-F 224 | 10800..1083F; Cypriot Syllabary 225 | 10840..1085F; Imperial Aramaic 226 | 10860..1087F; Palmyrene 227 | 10880..108AF; Nabataean 228 | 108E0..108FF; Hatran 229 | 10900..1091F; Phoenician 230 | 10920..1093F; Lydian 231 | 10980..1099F; Meroitic Hieroglyphs 232 | 109A0..109FF; Meroitic Cursive 233 | 10A00..10A5F; Kharoshthi 234 | 10A60..10A7F; Old South Arabian 235 | 10A80..10A9F; Old North Arabian 236 | 10AC0..10AFF; Manichaean 237 | 10B00..10B3F; Avestan 238 | 10B40..10B5F; Inscriptional Parthian 239 | 10B60..10B7F; Inscriptional Pahlavi 240 | 10B80..10BAF; Psalter Pahlavi 241 | 10C00..10C4F; Old Turkic 242 | 10C80..10CFF; Old Hungarian 243 | 10D00..10D3F; Hanifi Rohingya 244 | 10D40..10D8F; Garay 245 | 10E60..10E7F; Rumi Numeral Symbols 246 | 10E80..10EBF; Yezidi 247 | 10EC0..10EFF; Arabic Extended-C 248 | 10F00..10F2F; Old Sogdian 249 | 10F30..10F6F; Sogdian 250 | 10F70..10FAF; Old Uyghur 251 | 10FB0..10FDF; Chorasmian 252 | 10FE0..10FFF; Elymaic 253 | 11000..1107F; Brahmi 254 | 11080..110CF; Kaithi 255 | 110D0..110FF; Sora Sompeng 256 | 11100..1114F; Chakma 257 | 11150..1117F; Mahajani 258 | 11180..111DF; Sharada 259 | 111E0..111FF; Sinhala Archaic Numbers 260 | 11200..1124F; Khojki 261 | 11280..112AF; Multani 262 | 112B0..112FF; Khudawadi 263 | 11300..1137F; Grantha 264 | 11380..113FF; Tulu-Tigalari 265 | 11400..1147F; Newa 266 | 11480..114DF; Tirhuta 267 | 11580..115FF; Siddham 268 | 11600..1165F; Modi 269 | 11660..1167F; Mongolian Supplement 270 | 11680..116CF; Takri 271 | 116D0..116FF; Myanmar Extended-C 272 | 11700..1174F; Ahom 273 | 11800..1184F; Dogra 274 | 118A0..118FF; Warang Citi 275 | 11900..1195F; Dives Akuru 276 | 119A0..119FF; Nandinagari 277 | 11A00..11A4F; Zanabazar Square 278 | 11A50..11AAF; Soyombo 279 | 11AB0..11ABF; Unified Canadian Aboriginal Syllabics Extended-A 280 | 11AC0..11AFF; Pau Cin Hau 281 | 11B00..11B5F; Devanagari Extended-A 282 | 11BC0..11BFF; Sunuwar 283 | 11C00..11C6F; Bhaiksuki 284 | 11C70..11CBF; Marchen 285 | 11D00..11D5F; Masaram Gondi 286 | 11D60..11DAF; Gunjala Gondi 287 | 11EE0..11EFF; Makasar 288 | 11F00..11F5F; Kawi 289 | 11FB0..11FBF; Lisu Supplement 290 | 11FC0..11FFF; Tamil Supplement 291 | 12000..123FF; Cuneiform 292 | 12400..1247F; Cuneiform Numbers and Punctuation 293 | 12480..1254F; Early Dynastic Cuneiform 294 | 12F90..12FFF; Cypro-Minoan 295 | 13000..1342F; Egyptian Hieroglyphs 296 | 13430..1345F; Egyptian Hieroglyph Format Controls 297 | 13460..143FF; Egyptian Hieroglyphs Extended-A 298 | 14400..1467F; Anatolian Hieroglyphs 299 | 16100..1613F; Gurung Khema 300 | 16800..16A3F; Bamum Supplement 301 | 16A40..16A6F; Mro 302 | 16A70..16ACF; Tangsa 303 | 16AD0..16AFF; Bassa Vah 304 | 16B00..16B8F; Pahawh Hmong 305 | 16D40..16D7F; Kirat Rai 306 | 16E40..16E9F; Medefaidrin 307 | 16F00..16F9F; Miao 308 | 16FE0..16FFF; Ideographic Symbols and Punctuation 309 | 17000..187FF; Tangut 310 | 18800..18AFF; Tangut Components 311 | 18B00..18CFF; Khitan Small Script 312 | 18D00..18D7F; Tangut Supplement 313 | 
1AFF0..1AFFF; Kana Extended-B 314 | 1B000..1B0FF; Kana Supplement 315 | 1B100..1B12F; Kana Extended-A 316 | 1B130..1B16F; Small Kana Extension 317 | 1B170..1B2FF; Nushu 318 | 1BC00..1BC9F; Duployan 319 | 1BCA0..1BCAF; Shorthand Format Controls 320 | 1CC00..1CEBF; Symbols for Legacy Computing Supplement 321 | 1CF00..1CFCF; Znamenny Musical Notation 322 | 1D000..1D0FF; Byzantine Musical Symbols 323 | 1D100..1D1FF; Musical Symbols 324 | 1D200..1D24F; Ancient Greek Musical Notation 325 | 1D2C0..1D2DF; Kaktovik Numerals 326 | 1D2E0..1D2FF; Mayan Numerals 327 | 1D300..1D35F; Tai Xuan Jing Symbols 328 | 1D360..1D37F; Counting Rod Numerals 329 | 1D400..1D7FF; Mathematical Alphanumeric Symbols 330 | 1D800..1DAAF; Sutton SignWriting 331 | 1DF00..1DFFF; Latin Extended-G 332 | 1E000..1E02F; Glagolitic Supplement 333 | 1E030..1E08F; Cyrillic Extended-D 334 | 1E100..1E14F; Nyiakeng Puachue Hmong 335 | 1E290..1E2BF; Toto 336 | 1E2C0..1E2FF; Wancho 337 | 1E4D0..1E4FF; Nag Mundari 338 | 1E5D0..1E5FF; Ol Onal 339 | 1E7E0..1E7FF; Ethiopic Extended-B 340 | 1E800..1E8DF; Mende Kikakui 341 | 1E900..1E95F; Adlam 342 | 1EC70..1ECBF; Indic Siyaq Numbers 343 | 1ED00..1ED4F; Ottoman Siyaq Numbers 344 | 1EE00..1EEFF; Arabic Mathematical Alphabetic Symbols 345 | 1F000..1F02F; Mahjong Tiles 346 | 1F030..1F09F; Domino Tiles 347 | 1F0A0..1F0FF; Playing Cards 348 | 1F100..1F1FF; Enclosed Alphanumeric Supplement 349 | 1F200..1F2FF; Enclosed Ideographic Supplement 350 | 1F300..1F5FF; Miscellaneous Symbols and Pictographs 351 | 1F600..1F64F; Emoticons 352 | 1F650..1F67F; Ornamental Dingbats 353 | 1F680..1F6FF; Transport and Map Symbols 354 | 1F700..1F77F; Alchemical Symbols 355 | 1F780..1F7FF; Geometric Shapes Extended 356 | 1F800..1F8FF; Supplemental Arrows-C 357 | 1F900..1F9FF; Supplemental Symbols and Pictographs 358 | 1FA00..1FA6F; Chess Symbols 359 | 1FA70..1FAFF; Symbols and Pictographs Extended-A 360 | 1FB00..1FBFF; Symbols for Legacy Computing 361 | 20000..2A6DF; CJK Unified Ideographs Extension B 362 | 2A700..2B73F; CJK Unified Ideographs Extension C 363 | 2B740..2B81F; CJK Unified Ideographs Extension D 364 | 2B820..2CEAF; CJK Unified Ideographs Extension E 365 | 2CEB0..2EBEF; CJK Unified Ideographs Extension F 366 | 2EBF0..2EE5F; CJK Unified Ideographs Extension I 367 | 2F800..2FA1F; CJK Compatibility Ideographs Supplement 368 | 30000..3134F; CJK Unified Ideographs Extension G 369 | 31350..323AF; CJK Unified Ideographs Extension H 370 | E0000..E007F; Tags 371 | E0100..E01EF; Variation Selectors Supplement 372 | F0000..FFFFF; Supplementary Private Use Area-A 373 | 100000..10FFFF; Supplementary Private Use Area-B 374 | 375 | # EOF 376 | -------------------------------------------------------------------------------- /resources/wcwidth/wcwidth.c: -------------------------------------------------------------------------------- 1 | /* 2 | * TODO: add ability to track arbitrary url files in `cpv` 3 | * From: https://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c 4 | * 5 | * This is an implementation of wcwidth() and wcswidth() (defined in 6 | * IEEE Std 1002.1-2001) for Unicode. 7 | * 8 | * http://www.opengroup.org/onlinepubs/007904975/functions/wcwidth.html 9 | * http://www.opengroup.org/onlinepubs/007904975/functions/wcswidth.html 10 | * 11 | * In fixed-width output devices, Latin characters all occupy a single 12 | * "cell" position of equal width, whereas ideographic CJK characters 13 | * occupy two such cells. 
Interoperability between terminal-line 14 | * applications and (teletype-style) character terminals using the 15 | * UTF-8 encoding requires agreement on which character should advance 16 | * the cursor by how many cell positions. No established formal 17 | * standards exist at present on which Unicode character shall occupy 18 | * how many cell positions on character terminals. These routines are 19 | * a first attempt of defining such behavior based on simple rules 20 | * applied to data provided by the Unicode Consortium. 21 | * 22 | * For some graphical characters, the Unicode standard explicitly 23 | * defines a character-cell width via the definition of the East Asian 24 | * FullWidth (F), Wide (W), Half-width (H), and Narrow (Na) classes. 25 | * In all these cases, there is no ambiguity about which width a 26 | * terminal shall use. For characters in the East Asian Ambiguous (A) 27 | * class, the width choice depends purely on a preference of backward 28 | * compatibility with either historic CJK or Western practice. 29 | * Choosing single-width for these characters is easy to justify as 30 | * the appropriate long-term solution, as the CJK practice of 31 | * displaying these characters as double-width comes from historic 32 | * implementation simplicity (8-bit encoded characters were displayed 33 | * single-width and 16-bit ones double-width, even for Greek, 34 | * Cyrillic, etc.) and not any typographic considerations. 35 | * 36 | * Much less clear is the choice of width for the Not East Asian 37 | * (Neutral) class. Existing practice does not dictate a width for any 38 | * of these characters. It would nevertheless make sense 39 | * typographically to allocate two character cells to characters such 40 | * as for instance EM SPACE or VOLUME INTEGRAL, which cannot be 41 | * represented adequately with a single-width glyph. The following 42 | * routines at present merely assign a single-cell width to all 43 | * neutral characters, in the interest of simplicity. This is not 44 | * entirely satisfactory and should be reconsidered before 45 | * establishing a formal standard in this area. At the moment, the 46 | * decision which Not East Asian (Neutral) characters should be 47 | * represented by double-width glyphs cannot yet be answered by 48 | * applying a simple rule from the Unicode database content. Setting 49 | * up a proper standard for the behavior of UTF-8 character terminals 50 | * will require a careful analysis not only of each Unicode character, 51 | * but also of each presentation form, something the author of these 52 | * routines has avoided to do so far. 53 | * 54 | * http://www.unicode.org/unicode/reports/tr11/ 55 | * 56 | * Markus Kuhn -- 2007-05-26 (Unicode 5.0) 57 | * 58 | * Permission to use, copy, modify, and distribute this software 59 | * for any purpose and without fee is hereby granted. The author 60 | * disclaims all warranties with regard to this software. 
61 | * 62 | * Latest version: http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c 63 | */ 64 | 65 | #include <wchar.h> 66 | 67 | struct interval { 68 | int first; 69 | int last; 70 | }; 71 | 72 | /* auxiliary function for binary search in interval table */ 73 | static int bisearch(wchar_t ucs, const struct interval *table, int max) { 74 | int min = 0; 75 | int mid; 76 | 77 | if (ucs < table[0].first || ucs > table[max].last) 78 | return 0; 79 | while (max >= min) { 80 | mid = (min + max) / 2; 81 | if (ucs > table[mid].last) 82 | min = mid + 1; 83 | else if (ucs < table[mid].first) 84 | max = mid - 1; 85 | else 86 | return 1; 87 | } 88 | 89 | return 0; 90 | } 91 | 92 | /* The following two functions define the column width of an ISO 10646 93 | * character as follows: 94 | * 95 | * - The null character (U+0000) has a column width of 0. 96 | * 97 | * - Other C0/C1 control characters and DEL will lead to a return 98 | * value of -1. 99 | * 100 | * - Non-spacing and enclosing combining characters (general 101 | * category code Mn or Me in the Unicode database) have a 102 | * column width of 0. 103 | * 104 | * - SOFT HYPHEN (U+00AD) has a column width of 1. 105 | * 106 | * - Other format characters (general category code Cf in the Unicode 107 | * database) and ZERO WIDTH SPACE (U+200B) have a column width of 0. 108 | * 109 | * - Hangul Jamo medial vowels and final consonants (U+1160-U+11FF) 110 | * have a column width of 0. 111 | * 112 | * - Spacing characters in the East Asian Wide (W) or East Asian 113 | * Full-width (F) category as defined in Unicode Technical 114 | * Report #11 have a column width of 2. 115 | * 116 | * - All remaining characters (including all printable 117 | * ISO 8859-1 and WGL4 characters, Unicode control characters, 118 | * etc.) have a column width of 1. 119 | * 120 | * This implementation assumes that wchar_t characters are encoded 121 | * in ISO 10646. 
122 | */ 123 | 124 | int mk_wcwidth(wchar_t ucs) { 125 | /* sorted list of non-overlapping intervals of non-spacing characters */ 126 | /* generated by "uniset +cat=Me +cat=Mn +cat=Cf -00AD +1160-11FF +200B c" */ 127 | static const struct interval combining[] = { 128 | {0x0300, 0x036F}, {0x0483, 0x0486}, {0x0488, 0x0489}, 129 | {0x0591, 0x05BD}, {0x05BF, 0x05BF}, {0x05C1, 0x05C2}, 130 | {0x05C4, 0x05C5}, {0x05C7, 0x05C7}, {0x0600, 0x0603}, 131 | {0x0610, 0x0615}, {0x064B, 0x065E}, {0x0670, 0x0670}, 132 | {0x06D6, 0x06E4}, {0x06E7, 0x06E8}, {0x06EA, 0x06ED}, 133 | {0x070F, 0x070F}, {0x0711, 0x0711}, {0x0730, 0x074A}, 134 | {0x07A6, 0x07B0}, {0x07EB, 0x07F3}, {0x0901, 0x0902}, 135 | {0x093C, 0x093C}, {0x0941, 0x0948}, {0x094D, 0x094D}, 136 | {0x0951, 0x0954}, {0x0962, 0x0963}, {0x0981, 0x0981}, 137 | {0x09BC, 0x09BC}, {0x09C1, 0x09C4}, {0x09CD, 0x09CD}, 138 | {0x09E2, 0x09E3}, {0x0A01, 0x0A02}, {0x0A3C, 0x0A3C}, 139 | {0x0A41, 0x0A42}, {0x0A47, 0x0A48}, {0x0A4B, 0x0A4D}, 140 | {0x0A70, 0x0A71}, {0x0A81, 0x0A82}, {0x0ABC, 0x0ABC}, 141 | {0x0AC1, 0x0AC5}, {0x0AC7, 0x0AC8}, {0x0ACD, 0x0ACD}, 142 | {0x0AE2, 0x0AE3}, {0x0B01, 0x0B01}, {0x0B3C, 0x0B3C}, 143 | {0x0B3F, 0x0B3F}, {0x0B41, 0x0B43}, {0x0B4D, 0x0B4D}, 144 | {0x0B56, 0x0B56}, {0x0B82, 0x0B82}, {0x0BC0, 0x0BC0}, 145 | {0x0BCD, 0x0BCD}, {0x0C3E, 0x0C40}, {0x0C46, 0x0C48}, 146 | {0x0C4A, 0x0C4D}, {0x0C55, 0x0C56}, {0x0CBC, 0x0CBC}, 147 | {0x0CBF, 0x0CBF}, {0x0CC6, 0x0CC6}, {0x0CCC, 0x0CCD}, 148 | {0x0CE2, 0x0CE3}, {0x0D41, 0x0D43}, {0x0D4D, 0x0D4D}, 149 | {0x0DCA, 0x0DCA}, {0x0DD2, 0x0DD4}, {0x0DD6, 0x0DD6}, 150 | {0x0E31, 0x0E31}, {0x0E34, 0x0E3A}, {0x0E47, 0x0E4E}, 151 | {0x0EB1, 0x0EB1}, {0x0EB4, 0x0EB9}, {0x0EBB, 0x0EBC}, 152 | {0x0EC8, 0x0ECD}, {0x0F18, 0x0F19}, {0x0F35, 0x0F35}, 153 | {0x0F37, 0x0F37}, {0x0F39, 0x0F39}, {0x0F71, 0x0F7E}, 154 | {0x0F80, 0x0F84}, {0x0F86, 0x0F87}, {0x0F90, 0x0F97}, 155 | {0x0F99, 0x0FBC}, {0x0FC6, 0x0FC6}, {0x102D, 0x1030}, 156 | {0x1032, 0x1032}, {0x1036, 0x1037}, {0x1039, 0x1039}, 157 | {0x1058, 0x1059}, {0x1160, 0x11FF}, {0x135F, 0x135F}, 158 | {0x1712, 0x1714}, {0x1732, 0x1734}, {0x1752, 0x1753}, 159 | {0x1772, 0x1773}, {0x17B4, 0x17B5}, {0x17B7, 0x17BD}, 160 | {0x17C6, 0x17C6}, {0x17C9, 0x17D3}, {0x17DD, 0x17DD}, 161 | {0x180B, 0x180D}, {0x18A9, 0x18A9}, {0x1920, 0x1922}, 162 | {0x1927, 0x1928}, {0x1932, 0x1932}, {0x1939, 0x193B}, 163 | {0x1A17, 0x1A18}, {0x1B00, 0x1B03}, {0x1B34, 0x1B34}, 164 | {0x1B36, 0x1B3A}, {0x1B3C, 0x1B3C}, {0x1B42, 0x1B42}, 165 | {0x1B6B, 0x1B73}, {0x1DC0, 0x1DCA}, {0x1DFE, 0x1DFF}, 166 | {0x200B, 0x200F}, {0x202A, 0x202E}, {0x2060, 0x2063}, 167 | {0x206A, 0x206F}, {0x20D0, 0x20EF}, {0x302A, 0x302F}, 168 | {0x3099, 0x309A}, {0xA806, 0xA806}, {0xA80B, 0xA80B}, 169 | {0xA825, 0xA826}, {0xFB1E, 0xFB1E}, {0xFE00, 0xFE0F}, 170 | {0xFE20, 0xFE23}, {0xFEFF, 0xFEFF}, {0xFFF9, 0xFFFB}, 171 | {0x10A01, 0x10A03}, {0x10A05, 0x10A06}, {0x10A0C, 0x10A0F}, 172 | {0x10A38, 0x10A3A}, {0x10A3F, 0x10A3F}, {0x1D167, 0x1D169}, 173 | {0x1D173, 0x1D182}, {0x1D185, 0x1D18B}, {0x1D1AA, 0x1D1AD}, 174 | {0x1D242, 0x1D244}, {0xE0001, 0xE0001}, {0xE0020, 0xE007F}, 175 | {0xE0100, 0xE01EF}}; 176 | 177 | /* test for 8-bit control characters */ 178 | if (ucs == 0) 179 | return 0; 180 | if (ucs < 32 || (ucs >= 0x7f && ucs < 0xa0)) 181 | return -1; 182 | 183 | /* binary search in table of non-spacing characters */ 184 | if (bisearch(ucs, combining, sizeof(combining) / sizeof(struct interval) - 1)) 185 | return 0; 186 | 187 | /* if we arrive here, ucs is not a combining or C0/C1 control character */ 188 | 
189 | return 1 + 190 | (ucs >= 0x1100 && 191 | (ucs <= 0x115f || /* Hangul Jamo init. consonants */ 192 | ucs == 0x2329 || ucs == 0x232a || 193 | (ucs >= 0x2e80 && ucs <= 0xa4cf && ucs != 0x303f) || /* CJK ... Yi */ 194 | (ucs >= 0xac00 && ucs <= 0xd7a3) || /* Hangul Syllables */ 195 | (ucs >= 0xf900 && 196 | ucs <= 0xfaff) || /* CJK Compatibility Ideographs */ 197 | (ucs >= 0xfe10 && ucs <= 0xfe19) || /* Vertical forms */ 198 | (ucs >= 0xfe30 && ucs <= 0xfe6f) || /* CJK Compatibility Forms */ 199 | (ucs >= 0xff00 && ucs <= 0xff60) || /* Fullwidth Forms */ 200 | (ucs >= 0xffe0 && ucs <= 0xffe6) || 201 | (ucs >= 0x20000 && ucs <= 0x2fffd) || 202 | (ucs >= 0x30000 && ucs <= 0x3fffd))); 203 | } 204 | 205 | int mk_wcswidth(const wchar_t *pwcs, size_t n) { 206 | int w, width = 0; 207 | 208 | for (; *pwcs && n-- > 0; pwcs++) 209 | if ((w = mk_wcwidth(*pwcs)) < 0) 210 | return -1; 211 | else 212 | width += w; 213 | 214 | return width; 215 | } 216 | 217 | /* 218 | * The following functions are the same as mk_wcwidth() and 219 | * mk_wcswidth(), except that spacing characters in the East Asian 220 | * Ambiguous (A) category as defined in Unicode Technical Report #11 221 | * have a column width of 2. This variant might be useful for users of 222 | * CJK legacy encodings who want to migrate to UCS without changing 223 | * the traditional terminal character-width behaviour. It is not 224 | * otherwise recommended for general use. 225 | */ 226 | int mk_wcwidth_cjk(wchar_t ucs) { 227 | /* sorted list of non-overlapping intervals of East Asian Ambiguous 228 | * characters, generated by "uniset +WIDTH-A -cat=Me -cat=Mn -cat=Cf c" */ 229 | static const struct interval ambiguous[] = { 230 | {0x00A1, 0x00A1}, {0x00A4, 0x00A4}, {0x00A7, 0x00A8}, 231 | {0x00AA, 0x00AA}, {0x00AE, 0x00AE}, {0x00B0, 0x00B4}, 232 | {0x00B6, 0x00BA}, {0x00BC, 0x00BF}, {0x00C6, 0x00C6}, 233 | {0x00D0, 0x00D0}, {0x00D7, 0x00D8}, {0x00DE, 0x00E1}, 234 | {0x00E6, 0x00E6}, {0x00E8, 0x00EA}, {0x00EC, 0x00ED}, 235 | {0x00F0, 0x00F0}, {0x00F2, 0x00F3}, {0x00F7, 0x00FA}, 236 | {0x00FC, 0x00FC}, {0x00FE, 0x00FE}, {0x0101, 0x0101}, 237 | {0x0111, 0x0111}, {0x0113, 0x0113}, {0x011B, 0x011B}, 238 | {0x0126, 0x0127}, {0x012B, 0x012B}, {0x0131, 0x0133}, 239 | {0x0138, 0x0138}, {0x013F, 0x0142}, {0x0144, 0x0144}, 240 | {0x0148, 0x014B}, {0x014D, 0x014D}, {0x0152, 0x0153}, 241 | {0x0166, 0x0167}, {0x016B, 0x016B}, {0x01CE, 0x01CE}, 242 | {0x01D0, 0x01D0}, {0x01D2, 0x01D2}, {0x01D4, 0x01D4}, 243 | {0x01D6, 0x01D6}, {0x01D8, 0x01D8}, {0x01DA, 0x01DA}, 244 | {0x01DC, 0x01DC}, {0x0251, 0x0251}, {0x0261, 0x0261}, 245 | {0x02C4, 0x02C4}, {0x02C7, 0x02C7}, {0x02C9, 0x02CB}, 246 | {0x02CD, 0x02CD}, {0x02D0, 0x02D0}, {0x02D8, 0x02DB}, 247 | {0x02DD, 0x02DD}, {0x02DF, 0x02DF}, {0x0391, 0x03A1}, 248 | {0x03A3, 0x03A9}, {0x03B1, 0x03C1}, {0x03C3, 0x03C9}, 249 | {0x0401, 0x0401}, {0x0410, 0x044F}, {0x0451, 0x0451}, 250 | {0x2010, 0x2010}, {0x2013, 0x2016}, {0x2018, 0x2019}, 251 | {0x201C, 0x201D}, {0x2020, 0x2022}, {0x2024, 0x2027}, 252 | {0x2030, 0x2030}, {0x2032, 0x2033}, {0x2035, 0x2035}, 253 | {0x203B, 0x203B}, {0x203E, 0x203E}, {0x2074, 0x2074}, 254 | {0x207F, 0x207F}, {0x2081, 0x2084}, {0x20AC, 0x20AC}, 255 | {0x2103, 0x2103}, {0x2105, 0x2105}, {0x2109, 0x2109}, 256 | {0x2113, 0x2113}, {0x2116, 0x2116}, {0x2121, 0x2122}, 257 | {0x2126, 0x2126}, {0x212B, 0x212B}, {0x2153, 0x2154}, 258 | {0x215B, 0x215E}, {0x2160, 0x216B}, {0x2170, 0x2179}, 259 | {0x2190, 0x2199}, {0x21B8, 0x21B9}, {0x21D2, 0x21D2}, 260 | {0x21D4, 0x21D4}, {0x21E7, 0x21E7}, {0x2200, 
0x2200}, 261 | {0x2202, 0x2203}, {0x2207, 0x2208}, {0x220B, 0x220B}, 262 | {0x220F, 0x220F}, {0x2211, 0x2211}, {0x2215, 0x2215}, 263 | {0x221A, 0x221A}, {0x221D, 0x2220}, {0x2223, 0x2223}, 264 | {0x2225, 0x2225}, {0x2227, 0x222C}, {0x222E, 0x222E}, 265 | {0x2234, 0x2237}, {0x223C, 0x223D}, {0x2248, 0x2248}, 266 | {0x224C, 0x224C}, {0x2252, 0x2252}, {0x2260, 0x2261}, 267 | {0x2264, 0x2267}, {0x226A, 0x226B}, {0x226E, 0x226F}, 268 | {0x2282, 0x2283}, {0x2286, 0x2287}, {0x2295, 0x2295}, 269 | {0x2299, 0x2299}, {0x22A5, 0x22A5}, {0x22BF, 0x22BF}, 270 | {0x2312, 0x2312}, {0x2460, 0x24E9}, {0x24EB, 0x254B}, 271 | {0x2550, 0x2573}, {0x2580, 0x258F}, {0x2592, 0x2595}, 272 | {0x25A0, 0x25A1}, {0x25A3, 0x25A9}, {0x25B2, 0x25B3}, 273 | {0x25B6, 0x25B7}, {0x25BC, 0x25BD}, {0x25C0, 0x25C1}, 274 | {0x25C6, 0x25C8}, {0x25CB, 0x25CB}, {0x25CE, 0x25D1}, 275 | {0x25E2, 0x25E5}, {0x25EF, 0x25EF}, {0x2605, 0x2606}, 276 | {0x2609, 0x2609}, {0x260E, 0x260F}, {0x2614, 0x2615}, 277 | {0x261C, 0x261C}, {0x261E, 0x261E}, {0x2640, 0x2640}, 278 | {0x2642, 0x2642}, {0x2660, 0x2661}, {0x2663, 0x2665}, 279 | {0x2667, 0x266A}, {0x266C, 0x266D}, {0x266F, 0x266F}, 280 | {0x273D, 0x273D}, {0x2776, 0x277F}, {0xE000, 0xF8FF}, 281 | {0xFFFD, 0xFFFD}, {0xF0000, 0xFFFFD}, {0x100000, 0x10FFFD}}; 282 | 283 | /* binary search in table of non-spacing characters */ 284 | if (bisearch(ucs, ambiguous, sizeof(ambiguous) / sizeof(struct interval) - 1)) 285 | return 2; 286 | 287 | return mk_wcwidth(ucs); 288 | } 289 | 290 | int mk_wcswidth_cjk(const wchar_t *pwcs, size_t n) { 291 | int w, width = 0; 292 | 293 | for (; *pwcs && n-- > 0; pwcs++) 294 | if ((w = mk_wcwidth_cjk(*pwcs)) < 0) 295 | return -1; 296 | else 297 | width += w; 298 | 299 | return width; 300 | } 301 | -------------------------------------------------------------------------------- /ucd/SpecialCasing.txt: -------------------------------------------------------------------------------- 1 | # SpecialCasing-16.0.0.txt 2 | # Date: 2024-05-10, 22:49:00 GMT 3 | # © 2024 Unicode®, Inc. 4 | # Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries. 5 | # For terms of use and license, see https://www.unicode.org/terms_of_use.html 6 | # 7 | # Unicode Character Database 8 | # For documentation, see https://www.unicode.org/reports/tr44/ 9 | # 10 | # Special Casing 11 | # 12 | # This file is a supplement to the UnicodeData.txt file. The data in this file, combined with 13 | # the simple case mappings in UnicodeData.txt, defines the full case mappings 14 | # Lowercase_Mapping (lc), Titlecase_Mapping (tc), and Uppercase_Mapping (uc). 15 | # For compatibility, the UnicodeData.txt file only contains simple case mappings 16 | # for characters where they are one-to-one (and independent of context and language). 17 | # 18 | # For historical reasons, this file also provides additional information about the casing 19 | # of Unicode characters for selected situations when casing is dependent on context or locale. 20 | # 21 | # Note that the preferred mechanism for defining tailored casing operations is 22 | # the Unicode Common Locale Data Repository (CLDR). For more information, see the 23 | # discussion of case mappings and case algorithms in the Unicode Standard. 24 | # 25 | # All code points not listed in this file that do not have simple case mappings 26 | # in UnicodeData.txt map to themselves. 
27 | # ================================================================================ 28 | # Format 29 | # ================================================================================ 30 | # The entries in this file are in the following machine-readable format: 31 | # 32 | # <code>; <lower>; <title>; <upper>; (<condition_list>;)? # <comment> 33 | # 34 | # <code>, <lower>, <title>, and <upper> provide the respective full case mappings 35 | # of <code>, expressed as character values in hex. If there is more than one character, 36 | # they are separated by spaces. Other than as used to separate elements, spaces are 37 | # to be ignored. 38 | # 39 | # The <condition_list> is optional. Where present, it consists of one or more language IDs 40 | # or casing contexts, separated by spaces. In these conditions: 41 | # - A condition list overrides the normal behavior if all of the listed conditions are true. 42 | # - The casing context is always the context of the characters in the original string, 43 | # NOT in the resulting string. 44 | # - Case distinctions in the condition list are not significant. 45 | # - Conditions preceded by "Not_" represent the negation of the condition. 46 | # The condition list is not represented in the UCD as a formal property. 47 | # 48 | # A language ID is defined by BCP 47, with '-' and '_' treated equivalently. 49 | # 50 | # A casing context for a character is defined by Section 3.13 Default Case Algorithms 51 | # of The Unicode Standard. 52 | # 53 | # Parsers of this file must be prepared to deal with future additions to this format: 54 | # * Additional contexts 55 | # * Additional fields 56 | # ================================================================================ 57 | 58 | # ================================================================================ 59 | # Unconditional mappings 60 | # ================================================================================ 61 | 62 | # The German es-zed is special--the normal mapping is to SS. 63 | # Note: the titlecase should never occur in practice. It is equal to titlecase(uppercase(<es-zed>)) 64 | 65 | 00DF; 00DF; 0053 0073; 0053 0053; # LATIN SMALL LETTER SHARP S 66 | 67 | # Preserve canonical equivalence for I with dot. Turkic is handled below. 
68 | 69 | 0130; 0069 0307; 0130; 0130; # LATIN CAPITAL LETTER I WITH DOT ABOVE 70 | 71 | # Ligatures 72 | 73 | FB00; FB00; 0046 0066; 0046 0046; # LATIN SMALL LIGATURE FF 74 | FB01; FB01; 0046 0069; 0046 0049; # LATIN SMALL LIGATURE FI 75 | FB02; FB02; 0046 006C; 0046 004C; # LATIN SMALL LIGATURE FL 76 | FB03; FB03; 0046 0066 0069; 0046 0046 0049; # LATIN SMALL LIGATURE FFI 77 | FB04; FB04; 0046 0066 006C; 0046 0046 004C; # LATIN SMALL LIGATURE FFL 78 | FB05; FB05; 0053 0074; 0053 0054; # LATIN SMALL LIGATURE LONG S T 79 | FB06; FB06; 0053 0074; 0053 0054; # LATIN SMALL LIGATURE ST 80 | 81 | 0587; 0587; 0535 0582; 0535 0552; # ARMENIAN SMALL LIGATURE ECH YIWN 82 | FB13; FB13; 0544 0576; 0544 0546; # ARMENIAN SMALL LIGATURE MEN NOW 83 | FB14; FB14; 0544 0565; 0544 0535; # ARMENIAN SMALL LIGATURE MEN ECH 84 | FB15; FB15; 0544 056B; 0544 053B; # ARMENIAN SMALL LIGATURE MEN INI 85 | FB16; FB16; 054E 0576; 054E 0546; # ARMENIAN SMALL LIGATURE VEW NOW 86 | FB17; FB17; 0544 056D; 0544 053D; # ARMENIAN SMALL LIGATURE MEN XEH 87 | 88 | # No corresponding uppercase precomposed character 89 | 90 | 0149; 0149; 02BC 004E; 02BC 004E; # LATIN SMALL LETTER N PRECEDED BY APOSTROPHE 91 | 0390; 0390; 0399 0308 0301; 0399 0308 0301; # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS 92 | 03B0; 03B0; 03A5 0308 0301; 03A5 0308 0301; # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS 93 | 01F0; 01F0; 004A 030C; 004A 030C; # LATIN SMALL LETTER J WITH CARON 94 | 1E96; 1E96; 0048 0331; 0048 0331; # LATIN SMALL LETTER H WITH LINE BELOW 95 | 1E97; 1E97; 0054 0308; 0054 0308; # LATIN SMALL LETTER T WITH DIAERESIS 96 | 1E98; 1E98; 0057 030A; 0057 030A; # LATIN SMALL LETTER W WITH RING ABOVE 97 | 1E99; 1E99; 0059 030A; 0059 030A; # LATIN SMALL LETTER Y WITH RING ABOVE 98 | 1E9A; 1E9A; 0041 02BE; 0041 02BE; # LATIN SMALL LETTER A WITH RIGHT HALF RING 99 | 1F50; 1F50; 03A5 0313; 03A5 0313; # GREEK SMALL LETTER UPSILON WITH PSILI 100 | 1F52; 1F52; 03A5 0313 0300; 03A5 0313 0300; # GREEK SMALL LETTER UPSILON WITH PSILI AND VARIA 101 | 1F54; 1F54; 03A5 0313 0301; 03A5 0313 0301; # GREEK SMALL LETTER UPSILON WITH PSILI AND OXIA 102 | 1F56; 1F56; 03A5 0313 0342; 03A5 0313 0342; # GREEK SMALL LETTER UPSILON WITH PSILI AND PERISPOMENI 103 | 1FB6; 1FB6; 0391 0342; 0391 0342; # GREEK SMALL LETTER ALPHA WITH PERISPOMENI 104 | 1FC6; 1FC6; 0397 0342; 0397 0342; # GREEK SMALL LETTER ETA WITH PERISPOMENI 105 | 1FD2; 1FD2; 0399 0308 0300; 0399 0308 0300; # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND VARIA 106 | 1FD3; 1FD3; 0399 0308 0301; 0399 0308 0301; # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND OXIA 107 | 1FD6; 1FD6; 0399 0342; 0399 0342; # GREEK SMALL LETTER IOTA WITH PERISPOMENI 108 | 1FD7; 1FD7; 0399 0308 0342; 0399 0308 0342; # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND PERISPOMENI 109 | 1FE2; 1FE2; 03A5 0308 0300; 03A5 0308 0300; # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND VARIA 110 | 1FE3; 1FE3; 03A5 0308 0301; 03A5 0308 0301; # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND OXIA 111 | 1FE4; 1FE4; 03A1 0313; 03A1 0313; # GREEK SMALL LETTER RHO WITH PSILI 112 | 1FE6; 1FE6; 03A5 0342; 03A5 0342; # GREEK SMALL LETTER UPSILON WITH PERISPOMENI 113 | 1FE7; 1FE7; 03A5 0308 0342; 03A5 0308 0342; # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND PERISPOMENI 114 | 1FF6; 1FF6; 03A9 0342; 03A9 0342; # GREEK SMALL LETTER OMEGA WITH PERISPOMENI 115 | 116 | # IMPORTANT-when iota-subscript (0345) is uppercased or titlecased, 117 | # the result will be incorrect unless the iota-subscript is moved to the end 118 | # of any sequence of 
combining marks. Otherwise, the accents will go on the capital iota. 119 | # This process can be achieved by first transforming the text to NFC before casing. 120 | # E.g. <alpha><iota_subscript><acute> is uppercased to <ALPHA><acute><IOTA> 121 | 122 | # The following cases are already in the UnicodeData.txt file, so are only commented here. 123 | 124 | # 0345; 0345; 0399; 0399; # COMBINING GREEK YPOGEGRAMMENI 125 | 126 | # All letters with YPOGEGRAMMENI (iota-subscript) or PROSGEGRAMMENI (iota adscript) 127 | # have special uppercases. 128 | # Note: characters with PROSGEGRAMMENI are actually titlecase, not uppercase! 129 | 130 | 1F80; 1F80; 1F88; 1F08 0399; # GREEK SMALL LETTER ALPHA WITH PSILI AND YPOGEGRAMMENI 131 | 1F81; 1F81; 1F89; 1F09 0399; # GREEK SMALL LETTER ALPHA WITH DASIA AND YPOGEGRAMMENI 132 | 1F82; 1F82; 1F8A; 1F0A 0399; # GREEK SMALL LETTER ALPHA WITH PSILI AND VARIA AND YPOGEGRAMMENI 133 | 1F83; 1F83; 1F8B; 1F0B 0399; # GREEK SMALL LETTER ALPHA WITH DASIA AND VARIA AND YPOGEGRAMMENI 134 | 1F84; 1F84; 1F8C; 1F0C 0399; # GREEK SMALL LETTER ALPHA WITH PSILI AND OXIA AND YPOGEGRAMMENI 135 | 1F85; 1F85; 1F8D; 1F0D 0399; # GREEK SMALL LETTER ALPHA WITH DASIA AND OXIA AND YPOGEGRAMMENI 136 | 1F86; 1F86; 1F8E; 1F0E 0399; # GREEK SMALL LETTER ALPHA WITH PSILI AND PERISPOMENI AND YPOGEGRAMMENI 137 | 1F87; 1F87; 1F8F; 1F0F 0399; # GREEK SMALL LETTER ALPHA WITH DASIA AND PERISPOMENI AND YPOGEGRAMMENI 138 | 1F88; 1F80; 1F88; 1F08 0399; # GREEK CAPITAL LETTER ALPHA WITH PSILI AND PROSGEGRAMMENI 139 | 1F89; 1F81; 1F89; 1F09 0399; # GREEK CAPITAL LETTER ALPHA WITH DASIA AND PROSGEGRAMMENI 140 | 1F8A; 1F82; 1F8A; 1F0A 0399; # GREEK CAPITAL LETTER ALPHA WITH PSILI AND VARIA AND PROSGEGRAMMENI 141 | 1F8B; 1F83; 1F8B; 1F0B 0399; # GREEK CAPITAL LETTER ALPHA WITH DASIA AND VARIA AND PROSGEGRAMMENI 142 | 1F8C; 1F84; 1F8C; 1F0C 0399; # GREEK CAPITAL LETTER ALPHA WITH PSILI AND OXIA AND PROSGEGRAMMENI 143 | 1F8D; 1F85; 1F8D; 1F0D 0399; # GREEK CAPITAL LETTER ALPHA WITH DASIA AND OXIA AND PROSGEGRAMMENI 144 | 1F8E; 1F86; 1F8E; 1F0E 0399; # GREEK CAPITAL LETTER ALPHA WITH PSILI AND PERISPOMENI AND PROSGEGRAMMENI 145 | 1F8F; 1F87; 1F8F; 1F0F 0399; # GREEK CAPITAL LETTER ALPHA WITH DASIA AND PERISPOMENI AND PROSGEGRAMMENI 146 | 1F90; 1F90; 1F98; 1F28 0399; # GREEK SMALL LETTER ETA WITH PSILI AND YPOGEGRAMMENI 147 | 1F91; 1F91; 1F99; 1F29 0399; # GREEK SMALL LETTER ETA WITH DASIA AND YPOGEGRAMMENI 148 | 1F92; 1F92; 1F9A; 1F2A 0399; # GREEK SMALL LETTER ETA WITH PSILI AND VARIA AND YPOGEGRAMMENI 149 | 1F93; 1F93; 1F9B; 1F2B 0399; # GREEK SMALL LETTER ETA WITH DASIA AND VARIA AND YPOGEGRAMMENI 150 | 1F94; 1F94; 1F9C; 1F2C 0399; # GREEK SMALL LETTER ETA WITH PSILI AND OXIA AND YPOGEGRAMMENI 151 | 1F95; 1F95; 1F9D; 1F2D 0399; # GREEK SMALL LETTER ETA WITH DASIA AND OXIA AND YPOGEGRAMMENI 152 | 1F96; 1F96; 1F9E; 1F2E 0399; # GREEK SMALL LETTER ETA WITH PSILI AND PERISPOMENI AND YPOGEGRAMMENI 153 | 1F97; 1F97; 1F9F; 1F2F 0399; # GREEK SMALL LETTER ETA WITH DASIA AND PERISPOMENI AND YPOGEGRAMMENI 154 | 1F98; 1F90; 1F98; 1F28 0399; # GREEK CAPITAL LETTER ETA WITH PSILI AND PROSGEGRAMMENI 155 | 1F99; 1F91; 1F99; 1F29 0399; # GREEK CAPITAL LETTER ETA WITH DASIA AND PROSGEGRAMMENI 156 | 1F9A; 1F92; 1F9A; 1F2A 0399; # GREEK CAPITAL LETTER ETA WITH PSILI AND VARIA AND PROSGEGRAMMENI 157 | 1F9B; 1F93; 1F9B; 1F2B 0399; # GREEK CAPITAL LETTER ETA WITH DASIA AND VARIA AND PROSGEGRAMMENI 158 | 1F9C; 1F94; 1F9C; 1F2C 0399; # GREEK CAPITAL LETTER ETA WITH PSILI AND OXIA AND PROSGEGRAMMENI 159 | 1F9D; 1F95; 1F9D; 1F2D 
0399; # GREEK CAPITAL LETTER ETA WITH DASIA AND OXIA AND PROSGEGRAMMENI 160 | 1F9E; 1F96; 1F9E; 1F2E 0399; # GREEK CAPITAL LETTER ETA WITH PSILI AND PERISPOMENI AND PROSGEGRAMMENI 161 | 1F9F; 1F97; 1F9F; 1F2F 0399; # GREEK CAPITAL LETTER ETA WITH DASIA AND PERISPOMENI AND PROSGEGRAMMENI 162 | 1FA0; 1FA0; 1FA8; 1F68 0399; # GREEK SMALL LETTER OMEGA WITH PSILI AND YPOGEGRAMMENI 163 | 1FA1; 1FA1; 1FA9; 1F69 0399; # GREEK SMALL LETTER OMEGA WITH DASIA AND YPOGEGRAMMENI 164 | 1FA2; 1FA2; 1FAA; 1F6A 0399; # GREEK SMALL LETTER OMEGA WITH PSILI AND VARIA AND YPOGEGRAMMENI 165 | 1FA3; 1FA3; 1FAB; 1F6B 0399; # GREEK SMALL LETTER OMEGA WITH DASIA AND VARIA AND YPOGEGRAMMENI 166 | 1FA4; 1FA4; 1FAC; 1F6C 0399; # GREEK SMALL LETTER OMEGA WITH PSILI AND OXIA AND YPOGEGRAMMENI 167 | 1FA5; 1FA5; 1FAD; 1F6D 0399; # GREEK SMALL LETTER OMEGA WITH DASIA AND OXIA AND YPOGEGRAMMENI 168 | 1FA6; 1FA6; 1FAE; 1F6E 0399; # GREEK SMALL LETTER OMEGA WITH PSILI AND PERISPOMENI AND YPOGEGRAMMENI 169 | 1FA7; 1FA7; 1FAF; 1F6F 0399; # GREEK SMALL LETTER OMEGA WITH DASIA AND PERISPOMENI AND YPOGEGRAMMENI 170 | 1FA8; 1FA0; 1FA8; 1F68 0399; # GREEK CAPITAL LETTER OMEGA WITH PSILI AND PROSGEGRAMMENI 171 | 1FA9; 1FA1; 1FA9; 1F69 0399; # GREEK CAPITAL LETTER OMEGA WITH DASIA AND PROSGEGRAMMENI 172 | 1FAA; 1FA2; 1FAA; 1F6A 0399; # GREEK CAPITAL LETTER OMEGA WITH PSILI AND VARIA AND PROSGEGRAMMENI 173 | 1FAB; 1FA3; 1FAB; 1F6B 0399; # GREEK CAPITAL LETTER OMEGA WITH DASIA AND VARIA AND PROSGEGRAMMENI 174 | 1FAC; 1FA4; 1FAC; 1F6C 0399; # GREEK CAPITAL LETTER OMEGA WITH PSILI AND OXIA AND PROSGEGRAMMENI 175 | 1FAD; 1FA5; 1FAD; 1F6D 0399; # GREEK CAPITAL LETTER OMEGA WITH DASIA AND OXIA AND PROSGEGRAMMENI 176 | 1FAE; 1FA6; 1FAE; 1F6E 0399; # GREEK CAPITAL LETTER OMEGA WITH PSILI AND PERISPOMENI AND PROSGEGRAMMENI 177 | 1FAF; 1FA7; 1FAF; 1F6F 0399; # GREEK CAPITAL LETTER OMEGA WITH DASIA AND PERISPOMENI AND PROSGEGRAMMENI 178 | 1FB3; 1FB3; 1FBC; 0391 0399; # GREEK SMALL LETTER ALPHA WITH YPOGEGRAMMENI 179 | 1FBC; 1FB3; 1FBC; 0391 0399; # GREEK CAPITAL LETTER ALPHA WITH PROSGEGRAMMENI 180 | 1FC3; 1FC3; 1FCC; 0397 0399; # GREEK SMALL LETTER ETA WITH YPOGEGRAMMENI 181 | 1FCC; 1FC3; 1FCC; 0397 0399; # GREEK CAPITAL LETTER ETA WITH PROSGEGRAMMENI 182 | 1FF3; 1FF3; 1FFC; 03A9 0399; # GREEK SMALL LETTER OMEGA WITH YPOGEGRAMMENI 183 | 1FFC; 1FF3; 1FFC; 03A9 0399; # GREEK CAPITAL LETTER OMEGA WITH PROSGEGRAMMENI 184 | 185 | # Some characters with YPOGEGRAMMENI also have no corresponding titlecases 186 | 187 | 1FB2; 1FB2; 1FBA 0345; 1FBA 0399; # GREEK SMALL LETTER ALPHA WITH VARIA AND YPOGEGRAMMENI 188 | 1FB4; 1FB4; 0386 0345; 0386 0399; # GREEK SMALL LETTER ALPHA WITH OXIA AND YPOGEGRAMMENI 189 | 1FC2; 1FC2; 1FCA 0345; 1FCA 0399; # GREEK SMALL LETTER ETA WITH VARIA AND YPOGEGRAMMENI 190 | 1FC4; 1FC4; 0389 0345; 0389 0399; # GREEK SMALL LETTER ETA WITH OXIA AND YPOGEGRAMMENI 191 | 1FF2; 1FF2; 1FFA 0345; 1FFA 0399; # GREEK SMALL LETTER OMEGA WITH VARIA AND YPOGEGRAMMENI 192 | 1FF4; 1FF4; 038F 0345; 038F 0399; # GREEK SMALL LETTER OMEGA WITH OXIA AND YPOGEGRAMMENI 193 | 194 | 1FB7; 1FB7; 0391 0342 0345; 0391 0342 0399; # GREEK SMALL LETTER ALPHA WITH PERISPOMENI AND YPOGEGRAMMENI 195 | 1FC7; 1FC7; 0397 0342 0345; 0397 0342 0399; # GREEK SMALL LETTER ETA WITH PERISPOMENI AND YPOGEGRAMMENI 196 | 1FF7; 1FF7; 03A9 0342 0345; 03A9 0342 0399; # GREEK SMALL LETTER OMEGA WITH PERISPOMENI AND YPOGEGRAMMENI 197 | 198 | # ================================================================================ 199 | # Conditional Mappings 200 | # The remainder of this 
file provides conditional casing data used to produce 201 | # full case mappings. 202 | # ================================================================================ 203 | # Language-Insensitive Mappings 204 | # These are characters whose full case mappings do not depend on language, but do 205 | # depend on context (which characters come before or after). For more information 206 | # see the header of this file and the Unicode Standard. 207 | # ================================================================================ 208 | 209 | # Special case for final form of sigma 210 | 211 | 03A3; 03C2; 03A3; 03A3; Final_Sigma; # GREEK CAPITAL LETTER SIGMA 212 | 213 | # Note: the following cases for non-final are already in the UnicodeData.txt file. 214 | 215 | # 03A3; 03C3; 03A3; 03A3; # GREEK CAPITAL LETTER SIGMA 216 | # 03C3; 03C3; 03A3; 03A3; # GREEK SMALL LETTER SIGMA 217 | # 03C2; 03C2; 03A3; 03A3; # GREEK SMALL LETTER FINAL SIGMA 218 | 219 | # Note: the following cases are not included, since they would case-fold in lowercasing 220 | 221 | # 03C3; 03C2; 03A3; 03A3; Final_Sigma; # GREEK SMALL LETTER SIGMA 222 | # 03C2; 03C3; 03A3; 03A3; Not_Final_Sigma; # GREEK SMALL LETTER FINAL SIGMA 223 | 224 | # ================================================================================ 225 | # Language-Sensitive Mappings 226 | # These are characters whose full case mappings depend on language and perhaps also 227 | # context (which characters come before or after). For more information 228 | # see the header of this file and the Unicode Standard. 229 | # ================================================================================ 230 | 231 | # Lithuanian 232 | 233 | # Lithuanian retains the dot in a lowercase i when followed by accents. 234 | 235 | # Remove DOT ABOVE after "i" with upper or titlecase 236 | 237 | 0307; 0307; ; ; lt After_Soft_Dotted; # COMBINING DOT ABOVE 238 | 239 | # Introduce an explicit dot above when lowercasing capital I's and J's 240 | # whenever there are more accents above. 241 | # (of the accents used in Lithuanian: grave, acute, tilde above, and ogonek) 242 | 243 | 0049; 0069 0307; 0049; 0049; lt More_Above; # LATIN CAPITAL LETTER I 244 | 004A; 006A 0307; 004A; 004A; lt More_Above; # LATIN CAPITAL LETTER J 245 | 012E; 012F 0307; 012E; 012E; lt More_Above; # LATIN CAPITAL LETTER I WITH OGONEK 246 | 00CC; 0069 0307 0300; 00CC; 00CC; lt; # LATIN CAPITAL LETTER I WITH GRAVE 247 | 00CD; 0069 0307 0301; 00CD; 00CD; lt; # LATIN CAPITAL LETTER I WITH ACUTE 248 | 0128; 0069 0307 0303; 0128; 0128; lt; # LATIN CAPITAL LETTER I WITH TILDE 249 | 250 | # ================================================================================ 251 | 252 | # Turkish and Azeri 253 | 254 | # I and i-dotless; I-dot and i are case pairs in Turkish and Azeri 255 | # The following rules handle those cases. 256 | 257 | 0130; 0069; 0130; 0130; tr; # LATIN CAPITAL LETTER I WITH DOT ABOVE 258 | 0130; 0069; 0130; 0130; az; # LATIN CAPITAL LETTER I WITH DOT ABOVE 259 | 260 | # When lowercasing, remove dot_above in the sequence I + dot_above, which will turn into i. 261 | # This matches the behavior of the canonically equivalent I-dot_above 262 | 263 | 0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE 264 | 0307; ; 0307; 0307; az After_I; # COMBINING DOT ABOVE 265 | 266 | # When lowercasing, unless an I is before a dot_above, it turns into a dotless i. 
267 | 268 | 0049; 0131; 0049; 0049; tr Not_Before_Dot; # LATIN CAPITAL LETTER I 269 | 0049; 0131; 0049; 0049; az Not_Before_Dot; # LATIN CAPITAL LETTER I 270 | 271 | # When uppercasing, i turns into a dotted capital I 272 | 273 | 0069; 0069; 0130; 0130; tr; # LATIN SMALL LETTER I 274 | 0069; 0069; 0130; 0130; az; # LATIN SMALL LETTER I 275 | 276 | # Note: the following case is already in the UnicodeData.txt file. 277 | 278 | # 0131; 0131; 0049; 0049; tr; # LATIN SMALL LETTER DOTLESS I 279 | 280 | # EOF 281 | 282 | -------------------------------------------------------------------------------- /src/config.zig: -------------------------------------------------------------------------------- 1 | const std = @import("std"); 2 | const types = @import("types.zig"); 3 | pub const quirks = @import("quirks.zig"); 4 | 5 | pub const max_code_point = 0x10FFFF; 6 | pub const zero_width_non_joiner = 0x200C; 7 | pub const zero_width_joiner = 0x200D; 8 | 9 | pub const default = Table{ 10 | .fields = &.{ 11 | // UnicodeData 12 | .{ 13 | .name = "name", 14 | .type = []const u8, 15 | .max_len = 88, 16 | .max_offset = 1030461, 17 | .embedded_len = 2, 18 | }, 19 | .{ .name = "general_category", .type = types.GeneralCategory }, 20 | .{ .name = "canonical_combining_class", .type = u8 }, 21 | .{ .name = "bidi_class", .type = types.BidiClass }, 22 | .{ .name = "decomposition_type", .type = types.DecompositionType }, 23 | .{ 24 | .name = "decomposition_mapping", 25 | .type = []const u21, 26 | .cp_packing = .shift, 27 | .shift_low = -181519, 28 | .shift_high = 99324, 29 | .max_len = 18, 30 | .max_offset = 4602, 31 | .embedded_len = 0, 32 | }, 33 | .{ .name = "numeric_type", .type = types.NumericType }, 34 | .{ 35 | .name = "numeric_value_decimal", 36 | .type = ?u4, 37 | .min_value = 0, 38 | .max_value = 9, 39 | }, 40 | .{ 41 | .name = "numeric_value_digit", 42 | .type = ?u4, 43 | .min_value = 0, 44 | .max_value = 9, 45 | }, 46 | .{ 47 | .name = "numeric_value_numeric", 48 | .type = []const u8, 49 | .max_len = 13, 50 | .max_offset = 503, 51 | .embedded_len = 1, 52 | }, 53 | .{ .name = "is_bidi_mirrored", .type = bool }, 54 | .{ 55 | .name = "unicode_1_name", 56 | .type = []const u8, 57 | .max_len = 55, 58 | .max_offset = 49956, 59 | .embedded_len = 0, 60 | }, 61 | .{ 62 | .name = "simple_uppercase_mapping", 63 | .type = ?u21, 64 | .cp_packing = .shift, 65 | .shift_low = -38864, 66 | .shift_high = 42561, 67 | }, 68 | .{ 69 | .name = "simple_lowercase_mapping", 70 | .type = ?u21, 71 | .cp_packing = .shift, 72 | .shift_low = -42561, 73 | .shift_high = 38864, 74 | }, 75 | .{ 76 | .name = "simple_titlecase_mapping", 77 | .type = ?u21, 78 | .cp_packing = .shift, 79 | .shift_low = -38864, 80 | .shift_high = 42561, 81 | }, 82 | 83 | // CaseFolding 84 | .{ 85 | .name = "case_folding_simple", 86 | .type = u21, 87 | .cp_packing = .shift, 88 | .shift_low = -42561, 89 | .shift_high = 35267, 90 | }, 91 | .{ 92 | .name = "case_folding_full", 93 | .type = []const u21, 94 | .cp_packing = .shift, 95 | .shift_low = -42561, 96 | .shift_high = 35267, 97 | .max_len = 3, 98 | .max_offset = 160, 99 | .embedded_len = 0, 100 | }, 101 | .{ 102 | .name = "case_folding_turkish_only", 103 | .type = []const u21, 104 | .cp_packing = .direct, 105 | .shift_low = -199, 106 | .shift_high = 232, 107 | .max_len = 1, 108 | .max_offset = 2, 109 | .embedded_len = 0, 110 | }, 111 | .{ 112 | .name = "case_folding_common_only", 113 | .type = []const u21, 114 | .cp_packing = .direct, 115 | .shift_low = -42561, 116 | .shift_high = 35267, 117 | .max_len = 1, 118 
| .max_offset = 1423, 119 | .embedded_len = 0, 120 | }, 121 | .{ 122 | .name = "case_folding_simple_only", 123 | .type = []const u21, 124 | .cp_packing = .direct, 125 | .shift_low = -7615, 126 | .shift_high = 1, 127 | .max_len = 1, 128 | .max_offset = 31, 129 | .embedded_len = 0, 130 | }, 131 | .{ 132 | .name = "case_folding_full_only", 133 | .type = []const u21, 134 | .max_len = 3, 135 | .max_offset = 160, 136 | .embedded_len = 0, 137 | }, 138 | 139 | // SpecialCasing 140 | .{ .name = "has_special_casing", .type = bool }, 141 | .{ 142 | .name = "special_lowercase_mapping", 143 | .type = []const u21, 144 | .cp_packing = .shift, 145 | .shift_low = -199, 146 | .shift_high = 232, 147 | .max_len = 3, 148 | .max_offset = 13, 149 | .embedded_len = 0, 150 | }, 151 | .{ 152 | .name = "special_titlecase_mapping", 153 | .type = []const u21, 154 | .cp_packing = .shift, 155 | .shift_low = 0, 156 | .shift_high = 199, 157 | .max_len = 3, 158 | .max_offset = 104, 159 | .embedded_len = 0, 160 | }, 161 | .{ 162 | .name = "special_uppercase_mapping", 163 | .type = []const u21, 164 | .cp_packing = .shift, 165 | .shift_low = 0, 166 | .shift_high = 199, 167 | .max_len = 3, 168 | .max_offset = 158, 169 | .embedded_len = 0, 170 | }, 171 | .{ 172 | .name = "special_casing_condition", 173 | .type = []const types.SpecialCasingCondition, 174 | .max_len = 2, 175 | .max_offset = 9, 176 | .embedded_len = 0, 177 | }, 178 | 179 | // Case mappings 180 | .{ 181 | .name = "lowercase_mapping", 182 | .type = []const u21, 183 | .cp_packing = .shift, 184 | .shift_low = -42561, 185 | .shift_high = 38864, 186 | .max_len = 1, 187 | .max_offset = 0, 188 | .embedded_len = 0, 189 | }, 190 | .{ 191 | .name = "titlecase_mapping", 192 | .type = []const u21, 193 | .cp_packing = .shift, 194 | .shift_low = -38864, 195 | .shift_high = 42561, 196 | .max_len = 3, 197 | .max_offset = 104, 198 | .embedded_len = 0, 199 | }, 200 | .{ 201 | .name = "uppercase_mapping", 202 | .type = []const u21, 203 | .cp_packing = .shift, 204 | .shift_low = -38864, 205 | .shift_high = 42561, 206 | .max_len = 3, 207 | .max_offset = 158, 208 | .embedded_len = 0, 209 | }, 210 | 211 | // DerivedCoreProperties 212 | .{ .name = "is_math", .type = bool }, 213 | .{ .name = "is_alphabetic", .type = bool }, 214 | .{ .name = "is_lowercase", .type = bool }, 215 | .{ .name = "is_uppercase", .type = bool }, 216 | .{ .name = "is_cased", .type = bool }, 217 | .{ .name = "is_case_ignorable", .type = bool }, 218 | .{ .name = "changes_when_lowercased", .type = bool }, 219 | .{ .name = "changes_when_uppercased", .type = bool }, 220 | .{ .name = "changes_when_titlecased", .type = bool }, 221 | .{ .name = "changes_when_casefolded", .type = bool }, 222 | .{ .name = "changes_when_casemapped", .type = bool }, 223 | .{ .name = "is_id_start", .type = bool }, 224 | .{ .name = "is_id_continue", .type = bool }, 225 | .{ .name = "is_xid_start", .type = bool }, 226 | .{ .name = "is_xid_continue", .type = bool }, 227 | .{ .name = "is_default_ignorable", .type = bool }, 228 | .{ .name = "is_grapheme_extend", .type = bool }, 229 | .{ .name = "is_grapheme_base", .type = bool }, 230 | .{ .name = "is_grapheme_link", .type = bool }, 231 | .{ .name = "indic_conjunct_break", .type = types.IndicConjunctBreak }, 232 | 233 | // EastAsianWidth 234 | .{ .name = "east_asian_width", .type = types.EastAsianWidth }, 235 | 236 | // OriginalGraphemeBreak 237 | // This is the field from GraphemeBreakProperty.txt, without combining 238 | // `indic_conjunct_break`, `is_emoji_modifier`, 239 | // 
`is_emoji_modifier_base`, and `is_extended_pictographic` 240 | .{ .name = "original_grapheme_break", .type = types.OriginalGraphemeBreak }, 241 | 242 | // EmojiData 243 | .{ .name = "is_emoji", .type = bool }, 244 | .{ .name = "is_emoji_presentation", .type = bool }, 245 | .{ .name = "is_emoji_modifier", .type = bool }, 246 | .{ .name = "is_emoji_modifier_base", .type = bool }, 247 | .{ .name = "is_emoji_component", .type = bool }, 248 | .{ .name = "is_extended_pictographic", .type = bool }, 249 | 250 | // EmojiVariationSequences 251 | // These are all going to be equivalent, but 252 | // `emoji-variation-sequences.txt` and UTS #51 split out the emoji and 253 | // text variation sequences separately. However, ever since these were 254 | // introduced in Unicode 6.1 (see 255 | // https://unicode.org/Public/6.1.0/ucd/StandardizedVariants.txt -- 256 | // dated 2011-11-10), until present, there has never been an emoji 257 | // variation sequence that isn't also a valid text variation sequence, 258 | // and vice versa, so the recommendation is to just use 259 | // `is_emoji_vs_base`. Also the "Total sequences" comment at the end of 260 | // emoji-variation-sequences.txt counts the number of sequences as one 261 | // per base code point, rather than counting the "emoji style" and 262 | // "text style" lines separately. 263 | .{ .name = "is_emoji_vs_base", .type = bool }, 264 | .{ .name = "is_emoji_vs_text", .type = bool }, 265 | .{ .name = "is_emoji_vs_emoji", .type = bool }, 266 | 267 | // GraphemeBreak (derived) 268 | // This is derived from `original_grapheme_break` 269 | // (GraphemeBreakProperty.txt), `indic_conjunct_break`, 270 | // `is_emoji_modifier`, `is_emoji_modifier_base`, and 271 | // `is_extended_pictographic` 272 | .{ .name = "grapheme_break", .type = types.GraphemeBreak }, 273 | 274 | // BidiPairedBracket 275 | .{ 276 | .name = "bidi_paired_bracket", 277 | .type = types.BidiPairedBracket, 278 | .cp_packing = .shift, 279 | .shift_low = -3, 280 | .shift_high = 3, 281 | }, 282 | 283 | // Block 284 | .{ .name = "block", .type = types.Block }, 285 | }, 286 | }; 287 | 288 | pub const is_updating_ucd = false; 289 | 290 | pub const Field = struct { 291 | name: [:0]const u8, 292 | type: type, 293 | 294 | // For Shift + Slice fields 295 | cp_packing: CpPacking = .direct, 296 | shift_low: isize = 0, 297 | shift_high: isize = 0, 298 | 299 | // For Slice fields 300 | max_len: usize = 0, 301 | max_offset: usize = 0, 302 | embedded_len: usize = 0, 303 | 304 | // For PackedOptional fields 305 | min_value: isize = 0, 306 | max_value: isize = 0, 307 | 308 | pub const CpPacking = enum { 309 | direct, 310 | shift, 311 | }; 312 | 313 | pub const Runtime = struct { 314 | name: []const u8, 315 | type: []const u8, 316 | cp_packing: CpPacking, 317 | shift_low: isize, 318 | shift_high: isize, 319 | max_len: usize, 320 | max_offset: usize, 321 | embedded_len: usize, 322 | min_value: isize, 323 | max_value: isize, 324 | 325 | pub fn eql(a: Runtime, b: Runtime) bool { 326 | return a.cp_packing == b.cp_packing and 327 | a.shift_low == b.shift_low and 328 | a.shift_high == b.shift_high and 329 | a.max_len == b.max_len and 330 | a.max_offset == b.max_offset and 331 | a.embedded_len == b.embedded_len and 332 | a.min_value == b.min_value and 333 | a.max_value == b.max_value and 334 | std.mem.eql(u8, a.type, b.type) and 335 | std.mem.eql(u8, a.name, b.name); 336 | } 337 | 338 | pub fn override(self: Runtime, overrides: anytype) Runtime { 339 | var result: Runtime = .{ 340 | .name = self.name, 341 | .type = 
self.type, 342 | .cp_packing = self.cp_packing, 343 | .shift_low = self.shift_low, 344 | .shift_high = self.shift_high, 345 | .max_len = self.max_len, 346 | .max_offset = self.max_offset, 347 | .embedded_len = self.embedded_len, 348 | .min_value = self.min_value, 349 | .max_value = self.max_value, 350 | }; 351 | 352 | inline for (@typeInfo(@TypeOf(overrides)).@"struct".fields) |f| { 353 | @field(result, f.name) = @field(overrides, f.name); 354 | } 355 | 356 | return result; 357 | } 358 | 359 | pub fn compareActual(self: Runtime, actual: Runtime) bool { 360 | var is_okay = true; 361 | 362 | if (self.shift_low != actual.shift_low) { 363 | std.log.err("Config for field '{s}' does not match actual. Set .shift_low = {d}, // change from {d}", .{ self.name, actual.shift_low, self.shift_low }); 364 | is_okay = false; 365 | } 366 | 367 | if (self.shift_high != actual.shift_high) { 368 | std.log.err("Config for field '{s}' does not match actual. Set .shift_high = {d}, // change from {d}", .{ self.name, actual.shift_high, self.shift_high }); 369 | is_okay = false; 370 | } 371 | 372 | if (self.max_len != actual.max_len) { 373 | std.log.err("Config for field '{s}' does not match actual. Set .max_len = {d}, // change from {d}", .{ self.name, actual.max_len, self.max_len }); 374 | is_okay = false; 375 | } 376 | 377 | if (self.max_offset != actual.max_offset) { 378 | std.log.err("Config for field '{s}' does not match actual. Set .max_offset = {d}, // change from {d}", .{ self.name, actual.max_offset, self.max_offset }); 379 | is_okay = false; 380 | } 381 | 382 | if (self.min_value != actual.min_value) { 383 | std.log.err("Config for field '{s}' does not match actual. Set .min_value = {d}, // change from {d}", .{ self.name, actual.min_value, self.min_value }); 384 | is_okay = false; 385 | } 386 | 387 | if (self.max_value != actual.max_value) { 388 | std.log.err("Config for field '{s}' does not match actual. Set .max_value = {d}, // change from {d}", .{ self.name, actual.max_value, self.max_value }); 389 | is_okay = false; 390 | } 391 | 392 | return is_okay; 393 | } 394 | 395 | pub fn write(self: Runtime, writer: *std.Io.Writer) !void { 396 | try writer.print( 397 | \\.{{ 398 | \\ .name = "{s}", 399 | \\ 400 | , .{self.name}); 401 | 402 | var type_parts = std.mem.splitScalar(u8, self.type, '.'); 403 | const base_type = type_parts.next().?; 404 | const rest_type = type_parts.rest(); 405 | 406 | if (std.mem.endsWith(u8, base_type, "types") or 407 | std.mem.endsWith(u8, base_type, "types_x") or 408 | rest_type.len == 0) 409 | { 410 | try writer.print( 411 | \\ .type = {s}, 412 | \\ 413 | , .{self.type}); 414 | } else { 415 | const prefix = if (base_type[0] == '?') "?" 
else ""; 416 | try writer.print( 417 | \\ .type = {s}build_config.{s}, 418 | \\ 419 | , .{ prefix, rest_type }); 420 | } 421 | 422 | if (self.cp_packing != .direct or 423 | self.shift_low != 0 or 424 | self.shift_high != 0) 425 | { 426 | try writer.print( 427 | \\ .cp_packing = .{s}, 428 | \\ .shift_low = {}, 429 | \\ .shift_high = {}, 430 | \\ 431 | , .{ @tagName(self.cp_packing), self.shift_low, self.shift_high }); 432 | } 433 | if (self.max_len != 0) { 434 | try writer.print( 435 | \\ .max_len = {}, 436 | \\ .max_offset = {}, 437 | \\ .embedded_len = {}, 438 | \\ 439 | , .{ self.max_len, self.max_offset, self.embedded_len }); 440 | } 441 | if (self.min_value != 0 or self.max_value != 0) { 442 | try writer.print( 443 | \\ .min_value = {}, 444 | \\ .max_value = {}, 445 | \\ 446 | , .{ self.min_value, self.max_value }); 447 | } 448 | 449 | try writer.writeAll( 450 | \\}, 451 | \\ 452 | ); 453 | } 454 | }; 455 | 456 | pub const Kind = enum { 457 | basic, 458 | slice, 459 | shift, 460 | optional, 461 | @"union", 462 | }; 463 | 464 | pub fn kind(self: Field) Kind { 465 | switch (@typeInfo(self.type)) { 466 | .pointer => return .slice, 467 | .optional => |optional| { 468 | if (!isPackable(optional.child)) { 469 | return .basic; 470 | } 471 | 472 | switch (self.cp_packing) { 473 | .direct => return .optional, 474 | .shift => return .shift, 475 | } 476 | }, 477 | .@"union" => return .@"union", 478 | else => { 479 | switch (self.cp_packing) { 480 | .direct => return .basic, 481 | .shift => return .shift, 482 | } 483 | }, 484 | } 485 | } 486 | 487 | pub fn canBePacked(self: Field) bool { 488 | if (self.kind() == .slice) { 489 | return false; 490 | } 491 | 492 | switch (@typeInfo(self.type)) { 493 | .optional => |optional| { 494 | return isPackable(optional.child); 495 | }, 496 | .@"union" => |info| { 497 | return for (info.fields) |f| { 498 | if (f.type != void and !isPackable(f.type)) { 499 | break false; 500 | } 501 | } else true; 502 | }, 503 | else => return true, 504 | } 505 | } 506 | 507 | pub fn runtime(self: Field) Runtime { 508 | return .{ 509 | .name = self.name, 510 | .type = @typeName(self.type), 511 | .cp_packing = self.cp_packing, 512 | .shift_low = self.shift_low, 513 | .shift_high = self.shift_high, 514 | .max_len = self.max_len, 515 | .max_offset = self.max_offset, 516 | .embedded_len = self.embedded_len, 517 | .min_value = self.min_value, 518 | .max_value = self.max_value, 519 | }; 520 | } 521 | 522 | pub fn eql(a: Field, b: Field) bool { 523 | // Use runtime `eql` just to be lazy 524 | return a.runtime().eql(b.runtime()); 525 | } 526 | 527 | pub fn override(self: Field, overrides: anytype) Field { 528 | var result = self; 529 | 530 | inline for (@typeInfo(@TypeOf(overrides)).@"struct".fields) |f| { 531 | if (!is_updating_ucd and (std.mem.eql(u8, f.name, "name") or 532 | std.mem.eql(u8, f.name, "type") or 533 | std.mem.eql(u8, f.name, "shift_low") or 534 | std.mem.eql(u8, f.name, "shift_high") or 535 | std.mem.eql(u8, f.name, "max_len") or 536 | std.mem.eql(u8, f.name, "min_value") or 537 | std.mem.eql(u8, f.name, "max_value"))) 538 | { 539 | @compileError("Cannot override field '" ++ f.name ++ "'"); 540 | } 541 | 542 | @field(result, f.name) = @field(overrides, f.name); 543 | } 544 | 545 | return result; 546 | } 547 | }; 548 | 549 | pub fn isPackable(comptime T: type) bool { 550 | switch (@typeInfo(T)) { 551 | .int => |int| { 552 | return int.bits <= @bitSizeOf(isize); 553 | }, 554 | .@"enum" => |e| { 555 | return @typeInfo(e.tag_type).int.bits <= @bitSizeOf(isize); 556 | }, 557 
| .bool => return true, 558 | else => return false, 559 | } 560 | } 561 | 562 | pub const Table = struct { 563 | name: ?[]const u8 = null, 564 | stages: Stages = .auto, 565 | packing: Packing = .auto, 566 | extensions: []const Extension = &.{}, 567 | fields: []const Field, 568 | 569 | pub const Stages = enum { 570 | auto, 571 | two, 572 | three, 573 | }; 574 | 575 | pub const Packing = enum { 576 | auto, // as in decide automatically, not as in Type.ContainerLayout.auto 577 | @"packed", 578 | unpacked, 579 | 580 | pub fn write(self: Packing, writer: *std.Io.Writer) !void { 581 | switch (self) { 582 | .auto => unreachable, 583 | .unpacked => try writer.writeAll(".unpacked"), 584 | .@"packed" => try writer.writeAll(".@\"packed\""), 585 | } 586 | } 587 | }; 588 | 589 | pub fn hasField(comptime self: *const Table, name: []const u8) bool { 590 | @setEvalBranchQuota(10_000); 591 | 592 | return inline for (self.fields) |f| { 593 | if (std.mem.eql(u8, f.name, name)) { 594 | break true; 595 | } 596 | } else false; 597 | } 598 | 599 | pub fn field(comptime self: *const Table, name: []const u8) Field { 600 | @setEvalBranchQuota(20_000); 601 | 602 | return for (self.fields) |f| { 603 | if (std.mem.eql(u8, f.name, name)) { 604 | break f; 605 | } 606 | } else @compileError("Field '" ++ name ++ "' not found in Table"); 607 | } 608 | 609 | // TODO: benchmark this more 610 | const two_stage_size_threshold = 4; 611 | 612 | pub fn resolve(comptime self: *const Table) Table { 613 | if (self.stages != .auto and self.packing != .auto) { 614 | return self; 615 | } 616 | 617 | const can_be_packed = switch (self.packing) { 618 | .auto, .@"packed" => blk: { 619 | for (self.fields) |f| { 620 | if (!f.canBePacked()) { 621 | break :blk false; 622 | } 623 | } 624 | 625 | break :blk true; 626 | }, 627 | .unpacked => false, 628 | }; 629 | 630 | const DataUnpacked = types.Data(.{ 631 | .packing = .unpacked, 632 | .fields = self.fields, 633 | }); 634 | const DataPacked = if (can_be_packed) 635 | types.Data(.{ 636 | .packing = .@"packed", 637 | .fields = self.fields, 638 | }) 639 | else 640 | DataUnpacked; 641 | 642 | const unpacked_size = @sizeOf(DataUnpacked); 643 | const packed_size = @sizeOf(DataPacked); 644 | const min_size = @min(unpacked_size, packed_size); 645 | 646 | const stages: Stages = switch (self.stages) { 647 | .auto => blk: { 648 | if (min_size <= two_stage_size_threshold) { 649 | break :blk .two; 650 | } else { 651 | break :blk .three; 652 | } 653 | }, 654 | .two => .two, 655 | .three => .three, 656 | }; 657 | 658 | const packing: Packing = switch (self.packing) { 659 | .auto => blk: { 660 | if (!can_be_packed) { 661 | break :blk .unpacked; 662 | } 663 | 664 | if (unpacked_size == min_size or unpacked_size <= two_stage_size_threshold) { 665 | break :blk .unpacked; 666 | } 667 | 668 | if (stages == .two) { 669 | if (packed_size <= two_stage_size_threshold) { 670 | break :blk .@"packed"; 671 | } else if (3 * packed_size <= 2 * unpacked_size) { 672 | break :blk .@"packed"; 673 | } else { 674 | break :blk .unpacked; 675 | } 676 | } else { 677 | if (packed_size <= unpacked_size / 2) { 678 | break :blk .@"packed"; 679 | } else { 680 | break :blk .unpacked; 681 | } 682 | } 683 | }, 684 | .@"packed" => .@"packed", 685 | .unpacked => .unpacked, 686 | }; 687 | 688 | return .{ 689 | .stages = stages, 690 | .packing = packing, 691 | .name = self.name, 692 | .extensions = self.extensions, 693 | .fields = self.fields, 694 | }; 695 | } 696 | }; 697 | 698 | pub const Extension = struct { 699 | inputs: []const [:0]const 
u8, 700 | fields: []const Field, 701 | 702 | compute: *const fn ( 703 | allocator: std.mem.Allocator, 704 | cp: u21, 705 | data: anytype, 706 | backing: anytype, 707 | tracking: anytype, 708 | ) std.mem.Allocator.Error!void, 709 | 710 | pub fn hasField(comptime self: *const Extension, name: []const u8) bool { 711 | return inline for (self.fields) |f| { 712 | if (std.mem.eql(u8, f.name, name)) { 713 | break true; 714 | } 715 | } else false; 716 | } 717 | 718 | pub fn field(comptime self: *const Extension, name: []const u8) Field { 719 | return for (self.fields) |f| { 720 | if (std.mem.eql(u8, f.name, name)) { 721 | break f; 722 | } 723 | } else @compileError("Field '" ++ name ++ "' not found in Extension"); 724 | } 725 | }; 726 | 727 | // This is used by generated build_config.zig, and not intended for direct use 728 | // when using advanced configuration. 729 | pub fn _resolveFields( 730 | comptime config_x: type, 731 | comptime field_names: []const []const u8, 732 | comptime extension_names: []const []const u8, 733 | ) [field_names.len]Field { 734 | @setEvalBranchQuota(100_000); 735 | var result: [field_names.len]Field = undefined; 736 | for (field_names, 0..) |field_name, i| { 737 | result[i] = extensions_loop: inline for (@typeInfo(config_x).@"struct".decls) |decl| { 738 | for (extension_names) |ext_name| { 739 | if (std.mem.eql(u8, decl.name, ext_name)) { 740 | const extension = @field(config_x, decl.name); 741 | if (extension.hasField(field_name)) { 742 | break :extensions_loop extension.field(field_name); 743 | } 744 | } 745 | } 746 | } else default.field(field_name); 747 | } 748 | return result; 749 | } 750 | --------------------------------------------------------------------------------