├── .gitignore ├── .gitmodules ├── LICENSE.md ├── Makefile ├── README.md ├── config.mk ├── dat ├── UnicodeData.txt ├── charwidths.c ├── logo.png └── logo.txt ├── man ├── CHANGELOG-v1.0.0.md ├── TODO.md └── chmap.scd ├── src ├── arg.h ├── display.c ├── main.c ├── range.c ├── unicode.c ├── unicode.h ├── utf8.c ├── utf8.h ├── util.c └── util.h ├── tests ├── .gitignore ├── decode.sh ├── lib.sh ├── pilot_decode.c ├── pilot_range.c └── range.sh └── tool ├── gencharsh.lua └── install.sh /.gitignore: -------------------------------------------------------------------------------- 1 | # ignore vim's swap files 2 | *.sw[po] 3 | 4 | # ignore generated files 5 | charinfo.c 6 | chmap.1 7 | 8 | # ignore compiled files 9 | *.[ao] 10 | *.xz 11 | chmap 12 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "sub/argoat"] 2 | path = sub/argoat 3 | url = https://github.com/cylgom/argoat 4 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | MIT License 2 | =========== 3 | 4 | - Copyright © 2019-2021 Kiëd Llaentenn and contributors 5 | 6 | Permission is hereby granted, free of charge, to any person obtaining a copy 7 | of this software and associated documentation files (the "Software"), to deal 8 | in the Software without restriction, including without limitation the rights to 9 | (mis)use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | copies of the Software, and to permit persons to whom the Software is furnished 11 | to do so, subject to the following conditions: 12 | 13 | - The above copyright notice and this permission notice shall be included in all 14 | copies or substantial portions of the Software. 
15 | 16 | THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 22 | THE SOFTWARE. 23 | -------------------------------------------------------------------------------- /Makefile: --------------------------------------------------------------------------------
#
# chmap: retrieve information about Unicode characters
# https://github.com/lptstr/chmap
#
# (c) Kiëd Llaentenn and contributors
# See the LICENSE.md file for more information
#

# config.mk supplies the user-tunable knobs used below: CMD, DESTDIR, PREFIX,
# CC, LD and the DEBUG_*/RELEASE_* flag sets.
include config.mk

# Remove a target whose recipe failed, so that a truncated dat/charinfo.c or
# man page (both are written through shell redirection) is never mistaken for
# an up-to-date file on the next run.
.DELETE_ON_ERROR:

BIN      = chmap
VERSION  = 1.1.0
# Simply expanded: run uname once at parse time, not on every reference.
PKGNAME := $(BIN)-$(shell uname -s)-$(shell uname -m)-$(VERSION)

SRC      = src/utf8.c src/unicode.c src/util.c src/main.c
OBJ      = $(SRC:.c=.o)

WARNING  = -Wall -Wpedantic -Wextra -Wold-style-definition -Wformat=2 \
	   -Wmissing-prototypes -Winit-self -Wfloat-equal -Wstrict-prototypes \
	   -Wredundant-decls -Wendif-labels -Wstrict-aliasing=2 -Woverflow \
	   -Werror=implicit-function-declaration -Werror=return-type

INC      = -I ~/local/include/ -I. -Isrc/
DEF      = -D_DEFAULT_SOURCE -D_XOPEN_SOURCE=1000 -DVERSION=\"$(VERSION)\"

CFLAGS   = -std=c99 $(WARNING) $(DEF) $(INC)
LDFLAGS  = -fuse-ld=$(LD) -L ~/local/lib -static

all: man/$(BIN).1 debug

# Compile one translation unit. CFLAGS_OPT is chosen per build flavour by the
# target-specific assignments on `debug` and `release` below.
%.o: %.c
	@printf " %-8s%s\n" "CC" $@
	$(CMD)$(CC) -c $< -o $@ $(CFLAGS) $(CFLAGS_OPT)

debug: CFLAGS_OPT  := $(DEBUG_CFLAGS)
debug: LDFLAGS_OPT := $(DEBUG_LDFLAGS)
debug: $(BIN)

release: CFLAGS_OPT  := $(RELEASE_CFLAGS)
release: LDFLAGS_OPT := $(RELEASE_LDFLAGS)
release: $(BIN) man/$(BIN).1

# Extra prerequisites for sources that textually #include other files:
# main.c pulls in range.c and display.c; unicode.c pulls in the generated
# dat/charinfo.c and dat/charwidths.c. Without the second line a parallel
# build may compile unicode.o before charinfo.c exists, and a regenerated
# table would not trigger a rebuild.
src/main.o: src/range.c src/display.c
src/unicode.o: dat/charinfo.c dat/charwidths.c

$(BIN): dat/charinfo.c $(OBJ)
	@printf " %-8s%s\n" "CCLD" $@
	$(CMD)$(CC) -o $@ $(OBJ) $(CFLAGS) $(CFLAGS_OPT) $(LDFLAGS) $(LDFLAGS_OPT)

# Character table generated from the Unicode data file.
dat/charinfo.c: tool/gencharsh.lua dat/UnicodeData.txt
	@printf " %-8s%s\n" "GEN" $@
	$(CMD)tool/gencharsh.lua < dat/UnicodeData.txt > $@

man/$(BIN).1: man/$(BIN).scd
	@printf " %-8s%s\n" "SCDOC" $@
	$(CMD)scdoc < $^ > $@

clean:
	rm -rf $(BIN) $(OBJ) man/$(BIN).1
	rm -rf *.xz $(PKGNAME)*
	rm -rf dat/charinfo.c

# Build a release tarball named after the host OS/architecture and version.
dist: release man/$(BIN).1
	$(CMD)mkdir -p $(PKGNAME)
	$(CMD)cp $(BIN) $(PKGNAME)
	$(CMD)cp man/$(BIN).1 $(PKGNAME)
	$(CMD)cp tool/install.sh $(PKGNAME)
	$(CMD)tar -cf - $(PKGNAME) | xz -qcT0 > $(PKGNAME).tar.xz
	$(CMD)rm -rf $(PKGNAME)


install: $(BIN) man/$(BIN).1
	install -Dm755 $(BIN) $(DESTDIR)/$(PREFIX)/bin/$(BIN)
	install -Dm644 man/$(BIN).1 $(DESTDIR)/$(PREFIX)/share/man/man1/$(BIN).1

uninstall:
	rm -f $(DESTDIR)/$(PREFIX)/bin/$(BIN)
	rm -f $(DESTDIR)/$(PREFIX)/share/man/man1/$(BIN).1

# Build the test pilots with the configured compiler, then run every
# owner-executable test script. The scripts are run from a shell loop rather
# than `find -exec ... \;` because find's own exit status does not reflect a
# failing script, which would let `make check` succeed on a failed test.
check:
	$(CMD)for i in tests/pilot_*.c; do \
		$(CC) -Isrc/ $$i -o $${i%%.c} -O0 -g || exit 1; \
	done
	$(CMD)for t in $$(find tests -name '*.sh' -perm -700); do \
		"$$t" || exit 1; \
	done


.PHONY: all debug release clean dist install uninstall check

-------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # `chmap` (formerly `lcharmap`) 2 | 3 | > A CLI utility to get information for Unicode characters. 4 | 5 | ## What? 6 | 7 | `chmap` is a little utility to get information on Unicode characters, 8 | such as its description, hexadecimal/octal representation, Unicode 9 | category, UTF-8 encoding, and more. 10 | 11 | ``` 12 | $ chmap -r 9-10,13935,255-258 13 | codepoint glyph encoded case description 14 | 9 9 other character tabulation 15 | 10 A other line feed (lf) 16 | 13935 㙯 E3 99 AF other modifier letter chinese tone yin ping 17 | 255 ÿ C3 BF lower latin small letter y with diaeresis 18 | 256 Ā C4 80 upper latin capital letter a with macron 19 | 257 ā C4 81 lower latin small letter a with macron 20 | 258 Ă C4 82 upper latin capital letter a with breve 21 | ``` 22 | 23 | `chmap` was inspired by the `charmap.exe` tool present in Windows XP and 24 | later. 25 | 26 | ## Where? 27 | 28 | Tarballs are available from GitHub releases for Linux-x86_64 and 29 | Linux-armv6l. Extract, `cd`, and run: 30 | 31 | ``` 32 | $ sudo ./install.sh /usr/local 33 | ``` 34 | 35 | ### Building from Source 36 | 37 | #### Build dependencies 38 | - a C99 compiler, GNU Make 39 | - [scdoc](https://git.sr.ht/~sircmpwn/scdoc) (manpage) 40 | 41 | Download the latest source tarball from GitHub releases 42 | (`chmap-v$VERSION.tar.xz`), extract, and build: 43 | 44 | ``` 45 | # make clean install 46 | ``` 47 | 48 | ## How? 
49 | 50 | Run `chmap` with the `-r` flag (for *r*ange) to get info for a range of 51 | Unicode runes: 52 | 53 | ``` 54 | $ chmap -r 0-5 55 | codepoint glyph encoded case description 56 | 0 other null 57 | 1 1 other start of heading 58 | 2 2 other start of text 59 | 3 3 other end of text 60 | 4 4 other end of transmission 61 | 5 5 other enquiry 62 | ``` 63 | 64 | Example ranges: `0-1`, `355-369`, `34`, `189-192,12,45-49`. 65 | 66 | You may also list info for a list of given characters with the `-c` flag: 67 | 68 | ``` 69 | $ chmap -c Lovecraft 70 | codepoint glyph encoded case description 71 | 76 L 4C upper latin capital letter l 72 | 111 o 6F lower latin small letter o 73 | 118 v 76 lower latin small letter v 74 | 101 e 65 lower latin small letter e 75 | 99 c 63 lower latin small letter c 76 | 114 r 72 lower latin small letter r 77 | 97 a 61 lower latin small letter a 78 | 102 f 66 lower latin small letter f 79 | 116 t 74 lower latin small letter t 80 | ``` 81 | 82 | You can search for characters that match a regex, too, with the `-s` flag: 83 | 84 | ``` 85 | $ chmap -s '^latin.*capital letter z' 86 | codepoint glyph encoded case description 87 | 90 Z 5A upper latin capital letter z 88 | 377 Ź C5 B9 upper latin capital letter z with acute 89 | 379 Ż C5 BB upper latin capital letter z with dot above 90 | 381 Ž C5 BD upper latin capital letter z with caron 91 | 437 Ƶ C6 B5 upper latin capital letter z with stroke 92 | 548 Ȥ C8 A4 upper latin capital letter z with hook 93 | 7824 Ẑ E1 BA 90 upper latin capital letter z with circumflex 94 | 7826 Ẓ E1 BA 92 upper latin capital letter z with dot below 95 | 7828 Ẕ E1 BA 94 upper latin capital letter z with line below 96 | 11371 Ⱬ E2 B1 AB upper latin capital letter z with descender 97 | 11391 Ɀ E2 B1 BF upper latin capital letter z with swash tail 98 | 42950 Ᶎ EA 9F 86 upper latin capital letter z with palatal hook 99 | ``` 100 | 101 | Adding the `-l` flag causes `chmap` to print more information: 102 | 103 | ``` 104 | $ 
chmap -l -r 13000 105 | codepoint 13000 0x32C8 0o31310 106 | encoding UTF8(E3 8B 88) 107 | glyph ㋈ (2 columns) 108 | description ideographic telegraph symbol for september 109 | case other 110 | category Symbol (other) 111 | ``` 112 | 113 | See `man chmap` for more. 114 | 115 | ## Why? 116 | 117 | I miss some Windows utilities. 118 | 119 | I found [`chars`](https://github.com/antifuchs/chars), but it lacked 120 | several features which I'd like: 121 | 122 | - A table-like output mode (which `chmap` has by default). This makes 123 | viewing many entries more comfortable in a short terminal. 124 | - A `category` field to specify the Unicode category that the rune falls 125 | under. 126 | 127 | ## Inspiration 128 | 129 | - Windows' `charmap.exe` 130 | - Eric Raymond's `ascii` utility 131 | - `chars` (https://github.com/antifuchs/chars) 132 | - `uniname` from uniutils 133 | 134 | ## License 135 | 136 | This lame little utility is licensed under the MIT License. See 137 | the `LICENSE.md` file for more information. 
138 | -------------------------------------------------------------------------------- /config.mk: -------------------------------------------------------------------------------- 1 | # 2 | # lcharmap: retrieve information about Unicode characters 3 | # https://github.com/lptstr/lcharmap 4 | # 5 | # (c) Kiëd Llaentenn and contributors 6 | # See the LICENSE.md file for more information 7 | # 8 | 9 | CMD = @ 10 | 11 | DESTDIR = 12 | PREFIX = /usr/local 13 | 14 | AR = ar 15 | CC = cc 16 | LD = bfd 17 | 18 | RELEASE_CFLAGS = -O3 -mtune=native -march=native 19 | RELEASE_LDFLAGS = -flto -s 20 | DEBUG_CFLAGS = -Og -g 21 | DEBUG_LDFLAGS = 22 | -------------------------------------------------------------------------------- /dat/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kiedtl/chmap/af3fd2fb798040e14a69f78ae317c2ec6668bed5/dat/logo.png -------------------------------------------------------------------------------- /dat/logo.txt: -------------------------------------------------------------------------------- 1 | ƚçĦârɱáƤ 2 | -------------------------------------------------------------------------------- /man/CHANGELOG-v1.0.0.md: -------------------------------------------------------------------------------- 1 | # Changelog v1.0.0 2 | 3 | - `lcharmap` rewritten in C 4 | - `--range` syntax converted to the more common `START-END` from `START,END`. 5 | - `--range` now accepts numbers in binary, octal, hexadecimal. 6 | - `--search` engine now uses `musl`'s buggy regex implementation, instead of 7 | the Rust crate. 8 | -------------------------------------------------------------------------------- /man/TODO.md: -------------------------------------------------------------------------------- 1 | # TODO 2 | 3 | Sorted in order of the chance that they'll be implemented in the next 10 4 | years. 
5 | 6 | - Upgrade to Unicode 14 7 | - "--pager" flag 8 | - "-e" command to search emojis by CLDR (see https://github.com/arp242/uni) 9 | - "--tone" flag to change skin tone 10 | - "--gender" flag to change gender 11 | - "-g" to print/search from Unicode groups 12 | - Move "-s" command to "--search"/"--filter" flag, so that it can be 13 | combined with other commands 14 | - Search modifiers: -or 15 | - new field: UTF-16 encoding 16 | - -c should sort and deduplicate input 17 | - an -f flag to filter output by property (e.g. `-f case=upper`) 18 | - simplify display code 19 | - X11/ncurses client, with an interface similar to charmap.exe 20 | - use cross-platform regex library instead of `regex.h` (PCRE?) 21 | 22 | --- 23 | 24 | **other:** `grep -nRE 'FIXME|XXX|TODO' src` 25 | -------------------------------------------------------------------------------- /man/chmap.scd: -------------------------------------------------------------------------------- 1 | chmap(1) "Something Something Manual" "Something Something Manual" 2 | 3 | # NAME 4 | 5 | chmap - Get information for Unicode characters 6 | 7 | # SYNOPSIS 8 | 9 | *chmap* [-C always|never|auto] [-l] [-r RANGE] [-c CHARS] [-s REGEX] 10 | 11 | # DESCRIPTION 12 | 13 | chmap (formerly lcharmap) retrieves information regarding Unicode 14 | characters, including a short description, its UTF8 encoding, the Unicode 15 | category it belongs to, its casing, and its decimal/hexadecimal/octal 16 | representation. 17 | 18 | # OPTIONS 19 | 20 | *-h* 21 | Print a short help message and exit. 22 | 23 | *-V* 24 | Print chmap's version and exit. 25 | 26 | *-l* 27 | Print information in the long format. 28 | 29 | *-C* _WHEN_ 30 | Control color usage (_WHEN_ can be *always*, *auto*, *never*). 31 | 32 | # COMMANDS 33 | 34 | *-r* _RANGE_ 35 | Print a range of Unicode codepoints. (e.g. `0x033-0x035', 36 | or `1,3,6-9') 37 | 38 | Range values can be in binary, octal, hexadecimal, in addition 39 | to decimal. 
In that case, they must be prefixed with a `0b', `0o', 40 | or `0x', respectively. 41 | 42 | *-c* _CHARS_ 43 | Print a range of Unicode codepoints that match the provided 44 | character(s). (e.g., `Kiëd') 45 | 46 | The input must be a valid UTF8-encoded string. 47 | 48 | *-s* _REGEX_ 49 | Search character descriptions for _REGEX_. 50 | 51 | # EXAMPLES 52 | 53 | chmap -r 33,36,45-47 54 | Print information for characters 33, 36, 45, 46, and 47. 55 | 56 | chmap -r 0o033,0xAB,0b1101 57 | Print information for characters 27, 171, and 13. 58 | 59 | chmap -c Asbjørn 60 | Print information for all characters in the name `Asbjørn'. 61 | 62 | chmap -s "^latin .\* a$" 63 | Print all characters where the description matches the regular 64 | expression "^latin .\* a$". 65 | 66 | # KNOWN ISSUES 67 | 68 | - chmap is currently buggy and untested on Windows and macOS. 69 | - The display code is utterly incomprehensible. 70 | 71 | # AUTHORS 72 | 73 | Kiëd Llaentenn 74 | 75 | # REPORTING BUGS 76 | 77 | Send bug reports, hate mail, and other chmap-related bikeshedding to the 78 | author's email above, or */msg kiedtl* on freenode. 79 | 80 | # SEE ALSO 81 | 82 | The full documentation for *chmap* is not maintained as a Texinfo manual. 83 | If the *info* and *chmap* programs are properly installed on your system, 84 | the command 85 | 86 | *info chmap* 87 | 88 | should not give you access to the complete manual. 89 | -------------------------------------------------------------------------------- /src/arg.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copy me if you can. 
3 | * by 20h 4 | */ 5 | 6 | #ifndef ARG_H__ 7 | #define ARG_H__ 8 | 9 | char *argv0; 10 | 11 | /* use main(int argc, char *argv[]) */ 12 | #define ARGBEGIN for (argv0 = *argv, argv++, argc--;\ 13 | argv[0] && argv[0][0] == '-'\ 14 | && argv[0][1];\ 15 | argc--, argv++) {\ 16 | char argc_;\ 17 | char **argv_;\ 18 | int brk_;\ 19 | if (argv[0][1] == '-' && argv[0][2] == '\0') {\ 20 | argv++;\ 21 | argc--;\ 22 | break;\ 23 | }\ 24 | for (brk_ = 0, argv[0]++, argv_ = argv;\ 25 | argv[0][0] && !brk_;\ 26 | argv[0]++) {\ 27 | if (argv_ != argv)\ 28 | break;\ 29 | argc_ = argv[0][0];\ 30 | switch (argc_) 31 | 32 | /* Handles obsolete -NUM syntax */ 33 | #define ARGNUM case '0':\ 34 | case '1':\ 35 | case '2':\ 36 | case '3':\ 37 | case '4':\ 38 | case '5':\ 39 | case '6':\ 40 | case '7':\ 41 | case '8':\ 42 | case '9' 43 | 44 | #define ARGEND }\ 45 | } 46 | 47 | #define ARGC() argc_ 48 | 49 | #define EARGF(x) ((argv[0][1] == '\0' && argv[1] == NULL)?\ 50 | ((x), abort(), (char *)0) :\ 51 | (brk_ = 1, (argv[0][1] != '\0')?\ 52 | (&argv[0][1]) :\ 53 | (argc--, argv++, argv[0]))) 54 | 55 | #define LNGARG() &argv[0][0] 56 | 57 | #endif 58 | -------------------------------------------------------------------------------- /src/display.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | #include "util.h" 7 | #include "unicode.h" 8 | #include "utf8.h" 9 | 10 | static size_t _state; 11 | 12 | /* Format a NUL-terminated byte string as space-separated uppercase hex in a static buffer (overwritten by the next call). */ static char * 13 | fmt_bytes(char *bytes) 14 | { 15 | static char buf[255]; 16 | memset(buf, 0x0, sizeof(buf)); 17 | 18 | size_t i = 0; 19 | for (; bytes[i]; ++i) 20 | strcat(buf, format("%hhX ", bytes[i])); 21 | /* Trim the trailing separator by measured length: bytes below 0x10 print as one digit, and empty input (U+0000) must not index buf[-1]. */ 22 | size_t len = strlen(buf); if (len > 0) buf[len - 1] = '\0'; 23 | return (char *)&buf; 24 | } 25 | 26 | static void 27 | printentry_short(uint32_t entry, char *description, _Bool fancy) 28 | { 29 | char glyph[7]; 30 | bzero(glyph, sizeof(glyph)); 31 | utf8_encode(glyph, entry); 32 | 33 | char *padding = &"
"[charwidths[entry]]; 34 | size_t category = charinfos[entry].category; 35 | _Bool iscontrol = category == UC_Cc; 36 | 37 | char *casestr = "other"; 38 | if (unicodeisupper(entry)) 39 | casestr = "upper"; 40 | else if (unicodeislower(entry)) 41 | casestr = "lower"; 42 | 43 | if (fancy && (_state & 1) == 0) 44 | printf("\x1b[100m"); 45 | 46 | printf("%8s %s%s %-11s %s %s", 47 | format("U+%04X", entry), iscontrol ? "" : glyph, 48 | padding, fmt_bytes(glyph), 49 | casestr, description ? description : "-"); 50 | 51 | if (fancy && (_state & 1) == 0) 52 | printf("\x1b[K\x1b[m"); 53 | 54 | printf("\n"); 55 | } 56 | 57 | static void 58 | fmt_entry(_Bool fancy, char *key, char *value) 59 | { 60 | if (fancy) 61 | printf("\033[1m%-14s\033[m %s\n", key, value); 62 | else 63 | printf("%-14s %s\n", key, value); 64 | } 65 | 66 | static void 67 | printentry_long(uint32_t entry, char *description, _Bool fancy) 68 | { 69 | char charbuf[7], charbuf2[7]; 70 | bzero(charbuf, sizeof(charbuf)); 71 | bzero(charbuf2, sizeof(charbuf2)); 72 | 73 | utf8_encode(charbuf, entry); 74 | 75 | size_t colwidth = charwidths[entry]; 76 | struct CharInfo ci = charinfos[entry]; 77 | _Bool iscontrol = ci.category == UC_Cc; 78 | 79 | fmt_entry(fancy, "codepoint", format("U+%04X %-5d 0o%o", entry, entry, entry)); 80 | fmt_entry(fancy, "UTF8 encoding", fmt_bytes(charbuf)); 81 | fmt_entry(fancy, "glyph", format("%s (%zd %s)", iscontrol ? "" : charbuf, 82 | colwidth, colwidth == 1 ? "column" : "columns")); 83 | fmt_entry(fancy, "description", description ? 
description : "(none)"); 84 | /* The case counterpart is encoded into charbuf2, the buffer the format() calls below print; charbuf still holds the glyph of entry itself. */ 85 | if (unicodeisupper(entry)) { 86 | int32_t lower = charinfos[entry].lower; 87 | assert(lower != -1); 88 | utf8_encode(charbuf2, lower); 89 | fmt_entry(fancy, "case", 90 | format("uppercase, lowercase: 0x%X %s", lower, charbuf2)); 91 | } else if (unicodeislower(entry)) { 92 | int32_t upper = charinfos[entry].upper; 93 | assert(upper != -1); 94 | utf8_encode(charbuf2, upper); 95 | fmt_entry(fancy, "case", 96 | format("lowercase, uppercase: 0x%X %s", upper, charbuf2)); 97 | } else { 98 | fmt_entry(fancy, "case", "other"); 99 | } 100 | 101 | fmt_entry(fancy, "category", (char *)category_strs[ci.category]); 102 | 103 | printf("\n"); 104 | } 105 | 106 | static void 107 | printheader(_Bool flong, _Bool fancy) 108 | { 109 | _state = 0; 110 | 111 | if (!flong) { 112 | if (fancy) printf("\x1b[1m"); 113 | printf("codepoint glyph encoded case description\n"); 114 | if (fancy) printf("\x1b[0m"); 115 | } 116 | } 117 | 118 | static void 119 | printentry(uint32_t entry, char *description, _Bool fancy, _Bool flong) 120 | { 121 | ++_state; 122 | 123 | if (!flong) 124 | printentry_short(entry, description, fancy); 125 | else 126 | printentry_long(entry, description, fancy); 127 | } 128 | -------------------------------------------------------------------------------- /src/main.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | #include "arg.h" 12 | #include "display.c" 13 | #include "util.h" 14 | #include "range.c" 15 | #include "unicode.h" 16 | 17 | sqlite3 *db; 18 | _Bool istty = false; 19 | _Bool flong = false; 20 | 21 | static void 22 | range(char *param) 23 | { 24 | /* 25 | * TODO: support number characters from other languages 26 | * e.g.
Chinese 27 | */ 28 | 29 | uint32_t entries[262144]; 30 | ssize_t entries_len = -1; 31 | 32 | if ((entries_len = expand_range(param, entries)) < 0) 33 | errx(1, "'%s': invalid range.", param); 34 | 35 | printheader(flong, istty); 36 | 37 | for (size_t i = 0; i < (size_t)entries_len; ++i) { 38 | if (entries[i] > UNICODE_MAX) { 39 | warnx("%u is above maximum Unicode value", entries[i]); 40 | continue; 41 | } 42 | char *desc = charinfos[entries[i]].desc; 43 | printentry(entries[i], desc, istty, flong); 44 | } 45 | } 46 | 47 | static void 48 | chars(char *param) 49 | { 50 | char *inp = param; 51 | size_t len = strlen(inp); 52 | 53 | printheader(flong, istty); 54 | 55 | while (*inp) { 56 | size_t offset = inp - param; 57 | uint32_t charbuf = 0; 58 | ssize_t runelen = utf8_decode(&charbuf, inp, len - offset); 59 | 60 | if (runelen < 0) { 61 | warnx("invalid UTF8 rune at offset %zu", offset); 62 | ++inp; 63 | continue; 64 | } 65 | 66 | printentry(charbuf, charinfos[charbuf].desc, istty, flong); 67 | inp += runelen; 68 | } 69 | } 70 | 71 | static void 72 | search(char *query) 73 | { 74 | regex_t re; 75 | 76 | /* TODO: get char of error and error message */ 77 | if (regcomp(&re, query, REG_ICASE)) 78 | errx(1, "'%s': invalid regex.", query); 79 | 80 | printheader(flong, istty); 81 | 82 | for (size_t i = 0; i < UNICODE_MAX; ++i) { 83 | char *desc = charinfos[i].desc; 84 | if (desc == NULL) 85 | continue; 86 | if (regexec(&re, desc, 0, NULL, 0) != REG_NOMATCH) 87 | printentry(i, desc, istty, flong); 88 | } 89 | 90 | regfree(&re); 91 | } 92 | 93 | static void 94 | usage(_Bool _short) 95 | { 96 | printf("Usage: chmap [-C always|never|auto] [-l] [-r RANGE] [-c CHARS] [-s REGEX]\n"); 97 | 98 | if (_short) 99 | exit(0); 100 | 101 | printf("\n"); 102 | printf("Print information for Unicode characters.\n"); 103 | printf("\n"); 104 | printf("OPTIONS:\n"); 105 | printf(" -l, --long Show character entries in the long format.\n"); 106 | printf(" -h, --help print this help message and 
exit.\n"); 107 | printf(" -V, --version print version and exit.\n"); 108 | printf("\n"); 109 | printf("FLAGS:\n"); 110 | printf(" -r, --range RANGE print a range of Unicode characters.\n"); 111 | printf(" -c, --chars CHARS print a range of Unicode codepoints that match\n"); 112 | printf(" provided character(s).\n"); 113 | printf(" -s, --search REGEX search character descriptions for REGEX.\n"); 114 | printf("\n"); 115 | printf("Full documentation is available locally at chmap(1).\n"); 116 | 117 | exit(0); 118 | } 119 | 120 | static _Bool 121 | usecolor(void) 122 | { 123 | if (!isatty(STDOUT_FILENO)) 124 | return false; 125 | 126 | char *env_NOCOLOR = getenv("NO_COLOR"); 127 | char *env_TERM = getenv("TERM"); 128 | 129 | if (env_NOCOLOR) 130 | return false; 131 | 132 | if (!env_TERM || !strcmp(env_TERM, "dumb")) 133 | return false; 134 | 135 | return true; 136 | } 137 | 138 | int 139 | main(int argc, char **argv) 140 | { 141 | istty = usecolor(); 142 | 143 | ARGBEGIN { 144 | break; case 'l': 145 | flong = !flong; 146 | break; case 'r': 147 | range(EARGF(usage(true))); 148 | break; case 'c': 149 | chars(EARGF(usage(true))); 150 | break; case 's': 151 | search(EARGF(usage(true))); 152 | break; case 'C': 153 | optarg = EARGF(usage(true)); 154 | if (!strncmp(optarg, "au", 2)) 155 | istty = usecolor(); 156 | else if (!strncmp(optarg, "al", 2)) 157 | istty = true; 158 | else if (!strncmp(optarg, "ne", 2)) 159 | istty = false; 160 | else 161 | usage(true); 162 | break; case 'V': case 'v': 163 | printf("chmap v%s\n", VERSION); 164 | return 0; 165 | break; case 'h': 166 | usage(false); 167 | break; default: 168 | usage(true); 169 | } ARGEND 170 | } 171 | -------------------------------------------------------------------------------- /src/range.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | static size_t _buf_len = 0; 10 | 11 | static _Bool 
12 | parse_int(int *x, char *s, char **e, _Bool add, uint32_t *buf) 13 | { 14 | size_t base; 15 | if (!strncmp(s, "0x", 2) || !strncmp(s, "U+", 2)) { 16 | base = 16; 17 | s += 2; 18 | } else if (!strncmp(s, "0o", 2)) { 19 | base = 8; 20 | s += 2; 21 | } else if (!strncmp(s, "0b", 2)) { 22 | base = 2; 23 | s += 2; 24 | } else { 25 | base = 10; 26 | } 27 | 28 | *x = strtol(s, e, base); 29 | _Bool ok = *e != s; 30 | 31 | /* HACK: the add parameter controls whether a parsed integer is 32 | * added to the entries if it succeeds in parsing it. 33 | * 34 | * The reason it is needed is because parsed_int is used in two 35 | * places: 1) in expand_range (where we *want* successfully parse 36 | * integers to be added to the entries) and 2) in parse_range (where 37 | * we *don't want* successfully parsed integers to be added to the 38 | * entries. 39 | */ 40 | 41 | if (ok && add) buf[_buf_len] = *x, ++_buf_len; 42 | return ok; 43 | } 44 | 45 | 46 | static _Bool 47 | parse_range(char *s, char **e, uint32_t *buf) 48 | { 49 | int x = 0, y = 0; 50 | char *ee; 51 | char *start = s; 52 | 53 | /* try to parse left-hand side of range */ 54 | if (!parse_int(&x, s, &ee, false, buf)) 55 | return false; 56 | s = ee; 57 | 58 | /* check if this is really a range, or just 59 | * a single integer */ 60 | if (*s != '-') { 61 | e = &start; 62 | return false; 63 | } else { 64 | ++s; 65 | } 66 | 67 | /* try to parse right-hand side of range */ 68 | if (!parse_int(&y, s, e, false, buf)) 69 | return false; 70 | 71 | /* check if left-hand size is greater than 72 | * right-hand side of range */ 73 | if (y < x) return false; 74 | 75 | /* copy onto accumulator */ 76 | for (size_t i = x; i <= (size_t)y; ++i) 77 | buf[_buf_len] = i, ++_buf_len; 78 | return true; 79 | } 80 | 81 | static ssize_t 82 | expand_range(char *s, uint32_t *buf) 83 | { 84 | _buf_len = 0; 85 | int x = 0; 86 | char **e = &s; 87 | 88 | for (;;) { 89 | while (isspace(*s)) ++s; 90 | 91 | /* 92 | * try to parse input as a range, and 
fall back 93 | * to parsing input as a single integer if that 94 | * failed. 95 | * if both failed, it's probably a syntax error. 96 | */ 97 | if (!parse_range(s, e, buf)) { 98 | if (!parse_int(&x, s, e, true, buf)) { 99 | break; 100 | } 101 | } 102 | s = *e; 103 | 104 | while (isspace(*s)) ++s; 105 | if (strlen(s) == 0) return _buf_len; 106 | 107 | /* check if there's something more to parse */ 108 | if ((*s) == ',') { 109 | ++s; 110 | continue; 111 | } 112 | 113 | break; 114 | } 115 | 116 | /* if we broke out of the main loop then a syntax 117 | * error must have occurred */ 118 | return -1; 119 | } 120 | -------------------------------------------------------------------------------- /src/unicode.c: -------------------------------------------------------------------------------- 1 | #include "unicode.h" 2 | 3 | const char *category_strs[30] = { 4 | [UC_Cn] = "Other (not assigned)", 5 | [UC_Lu] = "Letter (uppercase)", 6 | [UC_Ll] = "Letter (lowercase)", 7 | [UC_Lt] = "Letter (titlecase)", 8 | [UC_Lm] = "Letter (modifier)", 9 | [UC_Lo] = "Letter (other)", 10 | [UC_Mn] = "Mark (nonspacing)", 11 | [UC_Mc] = "Mark (space combining)", 12 | [UC_Me] = "Mark (enclosing)", 13 | [UC_Nd] = "Number (decimal digit)", 14 | [UC_Nl] = "Number (letter)", 15 | [UC_No] = "Number (other)", 16 | [UC_Pc] = "Punctuation (connector)", 17 | [UC_Pd] = "Punctuation (dash)", 18 | [UC_Ps] = "Punctuation (open)", 19 | [UC_Pe] = "Punctuation (close)", 20 | [UC_Pi] = "Punctuation (initial quote)", 21 | [UC_Pf] = "Punctuation (final quote)", 22 | [UC_Po] = "Punctuation (other)", 23 | [UC_Sm] = "Symbol (math)", 24 | [UC_Sc] = "Symbol (currency)", 25 | [UC_Sk] = "Symbol (modifier)", 26 | [UC_So] = "Symbol (other)", 27 | [UC_Zs] = "Separator (space)", 28 | [UC_Zl] = "Separator (line)", 29 | [UC_Zp] = "Separator (paragraph)", 30 | [UC_Cc] = "Other (control)", 31 | [UC_Cf] = "Other (format)", 32 | [UC_Cs] = "Other (surrogate)", 33 | [UC_Co] = "Other (private use)", 34 | }; 35 | 36 | #include 
"dat/charinfo.c" 37 | #include "dat/charwidths.c" 38 | 39 | _Bool 40 | unicodeisupper(uint32_t c) 41 | { 42 | struct CharInfo ci = charinfos[c]; 43 | return ci.lower != ci.upper && ci.upper == -1 && ci.category != UC_Lt; 44 | } 45 | 46 | _Bool 47 | unicodeislower(uint32_t c) 48 | { 49 | struct CharInfo ci = charinfos[c]; 50 | return ci.lower != ci.upper && ci.lower == -1; 51 | } 52 | -------------------------------------------------------------------------------- /src/unicode.h: -------------------------------------------------------------------------------- 1 | #ifndef UNICODE_H 2 | #define UNICODE_H 3 | 4 | #include 5 | 6 | #define UNICODE_MAX 0x10FFFF 7 | 8 | enum { 9 | UC_Lu, 10 | UC_Ll, 11 | UC_Lt, 12 | UC_Mn, 13 | UC_Mc, 14 | UC_Me, 15 | UC_Nd, 16 | UC_Nl, 17 | UC_No, 18 | UC_Zs, 19 | UC_Zl, 20 | UC_Zp, 21 | UC_Cc, 22 | UC_Cf, 23 | UC_Cs, 24 | UC_Co, 25 | UC_Cn, 26 | 27 | UC_Lm, 28 | UC_Lo, 29 | UC_Pc, 30 | UC_Pd, 31 | UC_Ps, 32 | UC_Pe, 33 | UC_Pi, 34 | UC_Pf, 35 | UC_Po, 36 | UC_Sm, 37 | UC_Sc, 38 | UC_Sk, 39 | UC_So 40 | }; 41 | 42 | enum { 43 | UBIDI_AL, 44 | UBIDI_AN, 45 | UBIDI_B, 46 | UBIDI_BN, 47 | UBIDI_CS, 48 | UBIDI_EN, 49 | UBIDI_ES, 50 | UBIDI_ET, 51 | UBIDI_FSI, 52 | UBIDI_L, 53 | UBIDI_LRE, 54 | UBIDI_LRI, 55 | UBIDI_LRO, 56 | UBIDI_NSM, 57 | UBIDI_ON, 58 | UBIDI_PDF, 59 | UBIDI_PDI, 60 | UBIDI_R, 61 | UBIDI_RLE, 62 | UBIDI_RLI, 63 | UBIDI_RLO, 64 | UBIDI_S, 65 | UBIDI_WS, 66 | }; 67 | 68 | extern const char *category_strs[30]; 69 | 70 | struct CharInfo { 71 | uint8_t category; 72 | char bidirect; 73 | int32_t decimal; 74 | char *desc; 75 | int32_t upper; 76 | int32_t lower; 77 | }; 78 | 79 | extern const struct CharInfo charinfos[UNICODE_MAX]; 80 | extern const uint8_t charwidths[UNICODE_MAX]; 81 | 82 | _Bool unicodeisupper(uint32_t c); 83 | _Bool unicodeislower(uint32_t c); 84 | 85 | #endif 86 | -------------------------------------------------------------------------------- /src/utf8.c: 
-------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2010-2013 nsf 3 | * 2021 kiedtl 4 | * 5 | * Permission is hereby granted, free of charge, to any person obtaining a copy 6 | * of this software and associated documentation files (the "Software"), to deal 7 | * in the Software without restriction, including without limitation the rights 8 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | * copies of the Software, and to permit persons to whom the Software is 10 | * furnished to do so, subject to the following conditions: 11 | * 12 | * The above copyright notice and this permission notice shall be included in 13 | * all copies or substantial portions of the Software. 14 | * 15 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | * THE SOFTWARE. 22 | */ 23 | 24 | #include 25 | #include 26 | #include 27 | 28 | #include "utf8.h" 29 | #include "unicode.h" 30 | 31 | /* 32 | * Invalid starter bytes are marked with 0: 33 | * 34 | * The first two... cells (C0 and C1) could be used only for a 2-byte encoding 35 | * of a 7-bit ASCII character which should be encoded in 1 byte... such 36 | * "overlong" sequences are disallowed. The red cells in the F_ row (F5 to FD) 37 | * indicate leading bytes of 4-byte or longer sequences that cannot be valid 38 | * because they would encode code points larger than the U+10FFFF limit of 39 | * Unicode (a limit derived from the maximum code point encodable in UTF-16). 
40 | * FE and FF do not match any allowed character pattern and are therefore not 41 | * valid start bytes. -- Wikipedia article on UTF8 42 | * 43 | * Additionally, values between 0x80 and 0xbf inclusive are marked with 0, as 44 | * they are continuation bytes and may not appear at the beginning of an 45 | * encoded rune sequence. 46 | */ 47 | static const uint8_t utf8_length[256] = { 48 | /* 0 1 2 3 4 5 6 7 8 9 A B C D E F */ 49 | /* 0 */ 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 50 | /* 1 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 51 | /* 2 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 52 | /* 3 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 53 | /* 4 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 54 | /* 5 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 55 | /* 6 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 56 | /* 7 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 57 | /* 8 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 58 | /* 9 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 59 | /* A */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 60 | /* B */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 61 | /* C */ 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 62 | /* D */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 63 | /* E */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 64 | /* F */ 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 65 | }; 66 | 67 | static const uint8_t utf8_mask[6] = { 68 | 0x7F, 0x1F, 0x0F, 0x07, 0x03, 0x01 69 | }; 70 | 71 | ssize_t utf8_char_to_unicode(uint32_t *out, const char *c); 72 | size_t utf8_unicode_to_char(char *out, uint32_t c); 73 | /* Return the length in bytes of the sequence introduced by lead byte c, or 0 if c cannot start one. */ 74 | uint8_t 75 | utf8_bytesz(char c) 76 | { 77 | return utf8_length[(uint8_t)c]; 78 | } 79 | /* Decode one rune from c (sz bytes available) into *out; returns the bytes consumed, or -1 for NUL, truncated, malformed, overlong, surrogate or non-character input. */ 80 | ssize_t 81 | utf8_decode(uint32_t *out, char *c, size_t sz) 82 | { 83 | if (sz == 0 || c[0] == 0) /* test sz first so an empty buffer is never read */ 84 | return -1; 85 | 86 | uint8_t len = utf8_bytesz(*c); 87 | 88 | if (len == 0 || len > sz) 89 | return -1; 90 | 91 | uint32_t result = c[0] & 
utf8_mask[len-1]; 92 | 93 | for (size_t i = 1; i < len; ++i) { 94 | if ((c[i] & 0xc0) != 0x80) 95 | return -1; /* not a continuation byte */ 96 | result <<= 6; 97 | result |= c[i] & 0x3f; 98 | } 99 | if ((len == 3 && result < 0x800) || (len == 4 && result < 0x10000)) return -1; /* overlong encoding; 2-byte overlongs (C0/C1) already have length 0 */ 100 | if (result > UNICODE_MAX) 101 | return -1; /* value beyond unicode's 21-bit max */ 102 | if (result >= 0xD800 && result <= 0xDFFF) 103 | return -1; /* surrogate chars */ 104 | if (result >= 0xFDD0 && result <= 0xFDEF) 105 | return -1; /* non-character range */ 106 | if ((result & 0xFFFE) == 0xFFFE) 107 | return -1; /* non-character at plane end */ 108 | 109 | *out = result; 110 | return (ssize_t)len; 111 | } 112 | /* Encode code point c as UTF-8 into out (1 to 4 bytes, no terminator is written); returns the byte count, or -1 if c is above 0x10FFFF. */ 113 | ssize_t 114 | utf8_encode(char *out, uint32_t c) 115 | { 116 | size_t len = 0, first, i; 117 | 118 | if (c < 0x80) { 119 | first = 0; 120 | len = 1; 121 | } else if (c < 0x800) { 122 | /* XXX: surrogate chars (which fall in the 3-byte 123 | * range below) are still encoded, even though that's invalid UTF8 */ 124 | first = 0xc0; 125 | len = 2; 126 | } else if (c < 0x10000) { 127 | first = 0xe0; 128 | len = 3; 129 | } else if (c < 0x110000) { 130 | first = 0xf0; 131 | len = 4; 132 | } else { 133 | return -1; 134 | } 135 | 136 | for (i = len - 1; i > 0; --i) { 137 | out[i] = (c & 0x3f) | 0x80; 138 | c >>= 6; 139 | } 140 | out[0] = c | first; 141 | 142 | return len; 143 | } 144 | -------------------------------------------------------------------------------- /src/utf8.h: -------------------------------------------------------------------------------- 1 | #ifndef UTF8_H 2 | #define UTF8_H 3 | 4 | #include 5 | 6 | uint8_t utf8_bytesz(char c); 7 | ssize_t utf8_decode(uint32_t *out, char *c, size_t sz); 8 | ssize_t utf8_encode(char *out, uint32_t c); 9 | 10 | #endif 11 | -------------------------------------------------------------------------------- /src/util.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | #include "util.h" 8 | #include "unicode.h" 9 | 10 | char * 
__attribute__((format(printf, 1, 2))) 11 | format(const char *fmt, ...) 12 | { 13 | static char buf[8192]; /* static storage: the returned pointer outlives the call, but the next call overwrites it */ 14 | memset(buf, 0x0, sizeof(buf)); 15 | va_list ap; 16 | va_start(ap, fmt); 17 | int len = vsnprintf(buf, sizeof(buf), fmt, ap); 18 | va_end(ap); 19 | assert((size_t) len < sizeof(buf)); /* trips on truncation; a negative len also fails once cast to size_t */ 20 | return (char *) &buf; 21 | } 22 | -------------------------------------------------------------------------------- /src/util.h: -------------------------------------------------------------------------------- 1 | #ifndef UTIL_H 2 | #define UTIL_H 3 | 4 | #define UNUSED(V) ((void)(V)) 5 | 6 | char * __attribute__((format(printf, 1, 2))) format(const char *fmt, ...); 7 | 8 | #endif 9 | -------------------------------------------------------------------------------- /tests/.gitignore: -------------------------------------------------------------------------------- 1 | pilot_range 2 | pilot_decode 3 | -------------------------------------------------------------------------------- /tests/decode.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | # 3 | # Require GNU printf(3) and GNU bc 4 | 5 | . tests/lib.sh 6 | 7 | hex() { 8 | echo "obase=16;ibase=A;${1##0x}" | bc 9 | } 10 | 11 | decode() { 12 | tests/pilot_decode $(/usr/bin/printf "$1") >/dev/null 13 | } 14 | 15 | begin "Decode capital a" 16 | cmdout decode '\x41' 'U+0041' 17 | 18 | begin "Decode small o with diaeresis" 19 | cmdout decode '\xc3\xb6' 'U+00F6' 20 | 21 | begin "Decode cyrillic capital letter zhe" 22 | cmdout decode '\xd0\x96' 'U+0416' 23 | 24 | begin "Decode euro sign" 25 | cmdout decode '\xe2\x82\xac' 'U+20AC' 26 | 27 | begin "Decode musical symbol g cleff" 28 | cmdout decode '\xf0\x9d\x84\x9e' 'U+1D11E' 29 | 30 | begin "Fail on decoding surrogate chars" 31 | cmdend decode '\xed\xa0\x82' 1 32 | 33 | begin "Fail on first byte == 0xFF" 34 | cmdend decode '\xff\xc3\xb1' 1 35 | 36 | # The following tests were directly stolen from libutf8proc's 37 | # test suite. 
38 | 39 | test_fail_missing_cont_byte() { 40 | for byte in $(seq 0xc0 0xff); do decode "\x$(hex $byte)" && return 1; done 41 | return 0 42 | } 43 | begin "Fail on missing continuation bytes" 44 | test_fail_missing_cont_byte; chkend 45 | 46 | test_fail_missing_cont_byte_before_noncont_0x41() { 47 | for byte in $(seq 0xc0 0xff); do decode "\x$(hex $byte)\x41" && return 1; done 48 | return 0 49 | } 50 | begin "Fail on lead followed by non-continuation byte 0x41" 51 | test_fail_missing_cont_byte_before_noncont_0x41; chkend 52 | # Same check as the 0x41 case, but the byte after the lead is 0xc0, which is also not a continuation byte. 53 | test_fail_missing_cont_byte_before_noncont_0xc0() { 54 | for byte in $(seq 0xc0 0xff); do decode "\x$(hex $byte)\xc0" && return 1; done 55 | return 0 56 | } 57 | begin "Fail on lead followed by non-continuation byte 0xc0" 58 | test_fail_missing_cont_byte_before_noncont_0xc0; chkend 59 | 60 | test_fail_lead_cont_byte() { 61 | for byte in $(seq 0x80 0xc3); do decode "\x$(hex $byte)" && return 1; done 62 | return 0 63 | } 64 | begin "Fail on leading continuation byte" 65 | test_fail_lead_cont_byte; chkend 66 | 67 | test_fail_overlong_2() { 68 | for byte in $(seq 0x81 0xbf); do 69 | decode "\xc0\x$(hex $byte)" && return 1 70 | decode "\xc1\x$(hex $byte)" && return 1 71 | done 72 | return 0 73 | } 74 | begin "Fail on overlong 2byte sequence" 75 | test_fail_overlong_2; chkend 76 | 77 | test_fail_overlong_3() { 78 | for byte in $(seq 0x80 0x9f); do 79 | decode "\xe0\x$(hex $byte)\x80" && return 1 80 | done 81 | return 0 82 | } 83 | begin "Fail on overlong 3byte sequence" 84 | test_fail_overlong_3; chkend 85 | 86 | test_fail_overlong_4() { 87 | for byte in $(seq 0x80 0x8f); do 88 | decode "\xf0\x$(hex $byte)\x80\x80" && return 1 89 | done 90 | return 0 91 | } 92 | begin "Fail on overlong 4byte sequence" 93 | test_fail_overlong_4; chkend 94 | 95 | test_fail_abovemax_4() { 96 | for byte in $(seq 0x90 0xbf); do decode "\xf4\x$(hex $byte)\x80\x80" && return 1; done 97 | for byte in $(seq 0xf5 0xf7); do decode "\x$(hex $byte)\x80\x80\x80" && return 1; 
done 98 | return 0 99 | } 100 | begin "Fail on 4byte sequence encoding > UNICODE_MAX" 101 | test_fail_abovemax_4; chkend 102 | 103 | test_fail_invalid_5() { 104 | for byte in $(seq 0xf8 0xfd); do 105 | decode "\xf7\x80\x80\x80\x$(hex $byte)" && return 1 106 | done 107 | return 0 108 | } 109 | begin "Fail on invalid 5byte sequence" 110 | test_fail_invalid_5; chkend 111 | 112 | test_fail_invalid_6() { 113 | for byte in $(seq 0xfc 0xfd); do 114 | decode "\xf7\x80\x80\x80\x80\x$(hex $byte)" && return 1 115 | done 116 | return 0 117 | } 118 | begin "Fail on invalid 6byte sequence" 119 | test_fail_invalid_6; chkend 120 | -------------------------------------------------------------------------------- /tests/lib.sh: -------------------------------------------------------------------------------- 1 | printf '== %s\n' "$0" 2 | trap "printf '\n'" EXIT 3 | # begin NAME: print NAME left-justified in (terminal width - 5) columns; 80 columns are assumed when stty cannot report a size (no tty). 4 | begin() { 5 | trmcols=$(stty size 2>/dev/null | cut -d' ' -f2) 6 | padding=$((${trmcols:-80} - 5)) 7 | printf "%-${padding}s" "$1" 8 | } 9 | 10 | failure() { 11 | printf 'FAIL\n' 12 | } 13 | 14 | success() { 15 | printf ' OK\n' 16 | } 17 | 18 | cmdout() { 19 | arg=$(/usr/bin/printf '%b' "$2") 20 | if [ "$(tests/pilot_$1 $arg)" != "$3" ]; then 21 | failure 22 | else 23 | success 24 | fi 25 | } 26 | 27 | cmdend() { 28 | arg=$(/usr/bin/printf '%b' "$2") 29 | tests/pilot_$1 $arg 2>/dev/null >&2 30 | 31 | if [ $? -ne "$3" ]; then 32 | failure 33 | else 34 | success 35 | fi 36 | } 37 | 38 | chkend() { 39 | if [ $? 
-eq 0 ]; then success; else failure; fi 40 | } 41 | -------------------------------------------------------------------------------- /tests/pilot_decode.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include "utf8.c" 4 | /* Test pilot: decode every rune in argv[1], printing one U+XXXX line each; exits 1 on the first decode failure or when no argument is given. */ 5 | int 6 | main(int argc, char **argv) 7 | { 8 | uint32_t charbuf = 0; 9 | ssize_t runelen = 0; 10 | if (argc < 2) return 1; /* no input: fail instead of dereferencing argv[1] == NULL */ 11 | while (*argv[1]) { 12 | charbuf = 0; 13 | if ((runelen = utf8_decode(&charbuf, argv[1], strlen(argv[1]))) < 0) 14 | return 1; 15 | printf("U+%04X\n", charbuf); 16 | argv[1] += runelen; 17 | } 18 | } 19 | -------------------------------------------------------------------------------- /tests/pilot_range.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include "range.c" 3 | /* Test pilot: expand the range expression in argv[1] and print the entries space-separated; exits 1 on failure or when no argument is given. */ 4 | int 5 | main(int argc, char **argv) 6 | { 7 | uint32_t entries[4096] = {0}; 8 | ssize_t entries_len = 0; 9 | if (argc < 2 || (entries_len = expand_range(argv[1], entries)) < 0) 10 | return 1; 11 | 12 | printf("%d", entries[0]); 13 | for (size_t i = 1; i < (size_t)entries_len; ++i) { 14 | printf(" %d", entries[i]); 15 | } 16 | 17 | printf("\n"); 18 | } 19 | -------------------------------------------------------------------------------- /tests/range.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | . 
tests/lib.sh 4 | 5 | begin "Lone stranger" 6 | cmdout range "128" "128" 7 | 8 | begin "Simple start-to-end" 9 | cmdout range "0-5" "0 1 2 3 4 5" 10 | 11 | begin "Start-to-end with stranger" 12 | cmdout range "0-5,8" "0 1 2 3 4 5 8" 13 | 14 | begin "Start-to-end with hexadecimal stranger" 15 | cmdout range "0-5,0xFF" "0 1 2 3 4 5 255" 16 | 17 | begin "Strangers with negative values" 18 | cmdout range "12,-9,-5--2" "12 -9 -5 -4 -3 -2" 19 | 20 | begin "Fail when end < start" 21 | cmdend range "5-2" 1 22 | 23 | begin "Fail on invalid number" 24 | cmdend range "0-abcd" 1 25 | 26 | begin "Fail on invalid hexadecimal number" 27 | cmdend range "0-12,24,0xFZ" 1 28 | 29 | begin "Fail when missing end of start-to-end" 30 | cmdend range "12,4-9,0-" 1 31 | 32 | begin "Fail when missing start of start-to-end" 33 | cmdend range "12,--9,0-8" 1 34 | -------------------------------------------------------------------------------- /tool/gencharsh.lua: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env lua 2 | -- 3 | -- (c) Kiëd Llaentenn 4 | -- See the LICENSE.md file for copyright information. 5 | -- mytonumber: like tonumber, but yields -1 when the field is nil or empty. 6 | function mytonumber(a, ...) 7 | if a and a ~= "" then 8 | return tonumber(a, ...) 9 | else 10 | return -1 11 | end 12 | end 13 | 14 | print("\ 15 | #include \ 16 | #include \"unicode.h\"\ 17 | \ 18 | /* This file has automatically generated. */\ 19 | \ 20 | const struct CharInfo charinfos[UNICODE_MAX] = {\ 21 | ") 22 | 23 | local data = io.stdin:read('*all') 24 | for line in data:gmatch("([^\n]+)\n?") do 25 | local ch, desc, category, bidi, decimal, olddesc, upper, lower = line:match("(.-);(.-);(.-);.-;(.-);.-;(.-);.-;.-;.-;(.-);.-;(.-);(.-);") 26 | 27 | ch = tonumber(ch, 16) 28 | category = "UC_" .. category 29 | bidi = "UBIDI_" .. 
bidi 30 | decimal = mytonumber(decimal) 31 | if desc == "" then desc = olddesc end 32 | upper = mytonumber(upper, 16) 33 | lower = mytonumber(lower, 16) 34 | 35 | desc = desc:lower() 36 | 37 | print(string.format("\t[%5d] = { %s, %s, %d, \"%s\", %d, %d },", 38 | ch, category, bidi, decimal, desc, upper, lower)) 39 | end 40 | 41 | print("};") 42 | -------------------------------------------------------------------------------- /tool/install.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | # die MSG: print MSG to stderr and exit with status 1. 3 | die() { 4 | printf '%s\n' "$1" >&2; exit 1 5 | } 6 | 7 | [ -z "$1" ] && die "usage: $0 [destination]" 8 | [ -f "chmap" ] || die "can't find chmap" 9 | [ -f "chmap.1" ] || die "can't find chmap.1 (man page)" 10 | 11 | install -Dm755 chmap "$1/bin/chmap" 12 | install -Dm644 chmap.1 "$1/share/man/man1/chmap.1" 13 | --------------------------------------------------------------------------------