├── .gitignore
├── .gitmodules
├── LICENSE.md
├── Makefile
├── README.md
├── config.mk
├── dat
    ├── UnicodeData.txt
    ├── charwidths.c
    ├── logo.png
    └── logo.txt
├── man
    ├── CHANGELOG-v1.0.0.md
    ├── TODO.md
    └── chmap.scd
├── src
    ├── arg.h
    ├── display.c
    ├── main.c
    ├── range.c
    ├── unicode.c
    ├── unicode.h
    ├── utf8.c
    ├── utf8.h
    ├── util.c
    └── util.h
├── tests
    ├── .gitignore
    ├── decode.sh
    ├── lib.sh
    ├── pilot_decode.c
    ├── pilot_range.c
    └── range.sh
└── tool
    ├── gencharsh.lua
    └── install.sh


/.gitignore:
--------------------------------------------------------------------------------
 1 | # ignore vim's swap files
 2 | *.sw[po]
 3 | 
 4 | # ignore generated files
 5 | charinfo.c
 6 | chmap.1
 7 | 
 8 | # ignore compiled files
 9 | *.[ao]
10 | *.xz
11 | chmap
12 | 


--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
1 | [submodule "sub/argoat"]
2 | 	path = sub/argoat
3 | 	url = https://github.com/cylgom/argoat
4 | 


--------------------------------------------------------------------------------
/LICENSE.md:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | ===========
 3 | 
 4 | - Copyright © 2019-2021 Kiëd Llaentenn and contributors
 5 | 
 6 | Permission is hereby granted, free of charge, to any person obtaining a copy
 7 | of this software and associated documentation files (the "Software"), to deal
 8 | in the Software without restriction, including without limitation the rights to
 9 | (mis)use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 | copies of the Software, and to permit persons to whom the Software is furnished
11 | to do so, subject to the following conditions:
12 | 
13 | - The above copyright notice and this permission notice shall be included in all
14 | copies or substantial portions of the Software.
15 | 
16 | THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22 | THE SOFTWARE.
23 | 


--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
 1 | #
 2 | # chmap: retrieve information about Unicode characters
 3 | # https://github.com/lptstr/chmap
 4 | #
 5 | # (c) Kiëd Llaentenn and contributors
 6 | # See the LICENSE.md file for more information
 7 | #
 8 | 
 9 | include config.mk
10 | 
11 | BIN     = chmap
12 | VERSION = 1.1.0
13 | PKGNAME = $(BIN)-$(shell uname -s)-$(shell uname -m)-$(VERSION)
14 | 
15 | SRC     = src/utf8.c src/unicode.c src/util.c src/main.c
16 | OBJ     = $(SRC:.c=.o)
17 | 
18 | WARNING = -Wall -Wpedantic -Wextra -Wold-style-definition -Wformat=2 \
19 | 	  -Wmissing-prototypes -Winit-self -Wfloat-equal -Wstrict-prototypes \
20 | 	  -Wredundant-decls -Wendif-labels -Wstrict-aliasing=2 -Woverflow \
21 | 	  -Werror=implicit-function-declaration -Werror=return-type
22 | 
23 | INC     = -I ~/local/include/ -I. -Isrc/
24 | DEF     = -D_DEFAULT_SOURCE -D_XOPEN_SOURCE=1000 -DVERSION=\"$(VERSION)\"
25 | 
26 | CFLAGS  = -std=c99 $(WARNING) $(DEF) $(INC)
27 | LDFLAGS = -fuse-ld=$(LD) -L ~/local/lib -static
28 | 
29 | all: man/$(BIN).1 debug
30 | 
31 | .c.o:
32 | 	@printf "    %-8s%s\n" "CC" $@
33 | 	$(CMD)$(CC) -c $< -o $(<:.c=.o) $(CFLAGS) $(CFLAGS_OPT)
34 | 
35 | debug: CFLAGS_OPT  := $(DEBUG_CFLAGS)
36 | debug: LDFLAGS_OPT := $(DEBUG_LDFLAGS)
37 | debug: $(BIN)
38 | 
39 | release: CFLAGS_OPT  := $(RELEASE_CFLAGS)
40 | release: LDFLAGS_OPT := $(RELEASE_LDFLAGS)
41 | release: $(BIN) man/$(BIN).1
42 | 
43 | src/main.o: src/range.c src/display.c
44 | 
45 | # src/unicode.c #includes the generated character table and the width
46 | # table, so its object must be rebuilt whenever either of them changes
47 | # (and the table must be generated before the object is compiled).
48 | src/unicode.o: dat/charinfo.c dat/charwidths.c
44 | 
45 | $(BIN): dat/charinfo.c $(OBJ)
46 | 	@printf "    %-8s%s\n" "CCLD" $@
47 | 	$(CMD)$(CC) -o $@ $(OBJ) $(CFLAGS) $(CFLAGS_OPT) $(LDFLAGS) $(LDFLAGS_OPT)
48 | 
49 | dat/charinfo.c: tool/gencharsh.lua dat/UnicodeData.txt
50 | 	@printf "    %-8s%s\n" "GEN" $@
51 | 	$(CMD)tool/gencharsh.lua < dat/UnicodeData.txt > $@
52 | 
53 | man/$(BIN).1: man/$(BIN).scd
54 | 	@printf "    %-8s%s\n" "SCDOC" $@
55 | 	$(CMD)scdoc < $^ > $@
56 | 
57 | clean:
58 | 	rm -rf $(BIN) $(OBJ) man/$(BIN).1
59 | 	rm -rf *.xz $(PKGNAME)*
60 | 	rm -rf dat/charinfo.c
61 | 
62 | dist: release man/$(BIN).1
63 | 	$(CMD)mkdir $(PKGNAME)
64 | 	$(CMD)cp $(BIN)          $(PKGNAME)
65 | 	$(CMD)cp man/$(BIN).1    $(PKGNAME)
66 | 	$(CMD)cp tool/install.sh $(PKGNAME)
67 | 	$(CMD)tar -cf - $(PKGNAME) | xz -qcT0 > $(PKGNAME).tar.xz
68 | 	$(CMD)rm -rf $(PKGNAME)
69 | 
70 | 
71 | install: $(BIN) man/$(BIN).1
72 | 	install -Dm755 $(BIN) $(DESTDIR)/$(PREFIX)/bin/$(BIN)
73 | 	install -Dm644 man/$(BIN).1 $(DESTDIR)/$(PREFIX)/share/man/man1/$(BIN).1
74 | 
75 | uninstall:
76 | 	rm -f $(DESTDIR)/$(PREFIX)/bin/$(BIN)
77 | 	rm -f $(DESTDIR)/$(PREFIX)/share/man/man1/$(BIN).1
78 | 
79 | check:
80 | 	$(CMD)for i in tests/pilot_*.c; do \
81 | 		cc -Isrc/ $$i -o $${i%%.c} -O0 -g || exit 1; \
82 | 	done
83 | 	$(CMD)find tests -name '*.sh' -perm -700 -exec '{}' \;
84 | 
85 | 
86 | .PHONY: all debug release clean dist install uninstall check
87 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # `chmap` (formerly `lcharmap`)
  2 | 
  3 | > A CLI utility to get information for Unicode characters.
  4 | 
  5 | ## What?
  6 | 
  7 | `chmap` is a little utility to get information on Unicode characters,
  8 | such as their descriptions, hexadecimal/octal representations, Unicode
  9 | categories, UTF-8 encodings, and more.
 10 | 
 11 | ```
 12 | $ chmap -r 9-10,13935,255-258
 13 | codepoint  glyph  encoded     case   description
 14 |         9         9           other  character tabulation
 15 |        10         A           other  line feed (lf)
 16 |     13935  㙯     E3 99 AF    other  modifier letter chinese tone yin ping
 17 |       255  ÿ      C3 BF       lower  latin small letter y with diaeresis
 18 |       256  Ā      C4 80       upper  latin capital letter a with macron
 19 |       257  ā      C4 81       lower  latin small letter a with macron
 20 |       258  Ă      C4 82       upper  latin capital letter a with breve
 21 | ```
 22 | 
 23 | `chmap` was inspired by the `charmap.exe` tool present in Windows XP and
 24 | later.
 25 | 
 26 | ## Where?
 27 | 
 28 | Tarballs are available from GitHub releases for Linux-x86_64 and
 29 | Linux-armv6l. Extract, `cd`, and run:
 30 | 
 31 | ```
 32 | $ sudo ./install.sh /usr/local
 33 | ```
 34 | 
 35 | ### Building from Source
 36 | 
 37 | #### Build dependencies
 38 | - a C99 compiler, GNU Make
 39 | - [scdoc](https://git.sr.ht/~sircmpwn/scdoc) (manpage)
 40 | 
 41 | Download the latest source tarball from GitHub releases
 42 | (`chmap-v$VERSION.tar.xz`), extract, and build:
 43 | 
 44 | ```
 45 | # make clean install
 46 | ```
 47 | 
 48 | ## How?
 49 | 
 50 | Run `chmap` with the `-r` flag (for *r*ange) to get info for a range of
 51 | Unicode runes:
 52 | 
 53 | ```
 54 | $ chmap -r 0-5
 55 | codepoint  glyph  encoded      case   description
 56 |         0                      other  null
 57 |         1         1            other  start of heading
 58 |         2         2            other  start of text
 59 |         3         3            other  end of text
 60 |         4         4            other  end of transmission
 61 |         5         5            other  enquiry
 62 | ```
 63 | 
 64 | Example ranges: `0-1`, `355-369`, `34`, `189-192,12,45-49`.
 65 | 
 66 | You may also list info for a list of given characters with the `-c` flag:
 67 | 
 68 | ```
 69 | $ chmap -c Lovecraft
 70 | codepoint  glyph  encoded      case   description
 71 |        76  L      4C           upper  latin capital letter l
 72 |       111  o      6F           lower  latin small letter o
 73 |       118  v      76           lower  latin small letter v
 74 |       101  e      65           lower  latin small letter e
 75 |        99  c      63           lower  latin small letter c
 76 |       114  r      72           lower  latin small letter r
 77 |        97  a      61           lower  latin small letter a
 78 |       102  f      66           lower  latin small letter f
 79 |       116  t      74           lower  latin small letter t
 80 | ```
 81 | 
 82 | You can search for characters that match a regex, too, with the `-s` flag:
 83 | 
 84 | ```
 85 | $ chmap -s '^latin.*capital letter z'                                                       
 86 | codepoint  glyph  encoded      case   description
 87 |        90  Z      5A           upper  latin capital letter z
 88 |       377  Ź      C5 B9        upper  latin capital letter z with acute
 89 |       379  Ż      C5 BB        upper  latin capital letter z with dot above
 90 |       381  Ž      C5 BD        upper  latin capital letter z with caron
 91 |       437  Ƶ      C6 B5        upper  latin capital letter z with stroke
 92 |       548  Ȥ      C8 A4        upper  latin capital letter z with hook
 93 |      7824  Ẑ      E1 BA 90     upper  latin capital letter z with circumflex
 94 |      7826  Ẓ      E1 BA 92     upper  latin capital letter z with dot below
 95 |      7828  Ẕ      E1 BA 94     upper  latin capital letter z with line below
 96 |     11371  Ⱬ      E2 B1 AB     upper  latin capital letter z with descender
 97 |     11391  Ɀ      E2 B1 BF     upper  latin capital letter z with swash tail
 98 |     42950  Ᶎ      EA 9F 86     upper  latin capital letter z with palatal hook
 99 | ```
100 | 
101 | Adding the `-l` flag causes `chmap` to print more information:
102 | 
103 | ```
104 | $ chmap -l -r 13000                                                                         
105 | codepoint    13000 0x32C8  0o31310
106 | encoding     UTF8(E3 8B 88)
107 | glyph        ㋈ (2 columns)
108 | description  ideographic telegraph symbol for september
109 | case         other
110 | category     Symbol (other)
111 | ```
112 | 
113 | See `man chmap` for more.
114 | 
115 | ## Why?
116 | 
117 | I miss some Windows utilities.
118 | 
119 | I found [`chars`](https://github.com/antifuchs/chars), but it lacked
120 | several features which I'd like:
121 | 
122 | - A table-like output mode (which `chmap` has by default). This makes
123 |   viewing many entries more comfortable in a short terminal.
124 | - A `category` field to specify the Unicode category that the rune falls
125 |   under.
126 | 
127 | ## Inspiration
128 | 
129 | - Windows' `charmap.exe`
130 | - Eric Raymond's `ascii` utility
131 | - `chars` (https://github.com/antifuchs/chars)
132 | - `uniname` from uniutils
133 | 
134 | ## License
135 | 
136 | This lame little utility is licensed under the MIT License. See
137 | the `LICENSE.md` file for more information.
138 | 


--------------------------------------------------------------------------------
/config.mk:
--------------------------------------------------------------------------------
 1 | #
 2 | # chmap: retrieve information about Unicode characters
 3 | # https://github.com/lptstr/chmap
 4 | #
 5 | # (c) Kiëd Llaentenn and contributors
 6 | # See the LICENSE.md file for more information
 7 | #
 8 | 
 9 | CMD = @
10 | 
11 | DESTDIR =
12 | PREFIX  = /usr/local
13 | 
14 | AR = ar
15 | CC = cc
16 | LD = bfd
17 | 
18 | RELEASE_CFLAGS  = -O3 -mtune=native -march=native
19 | RELEASE_LDFLAGS = -flto -s
20 | DEBUG_CFLAGS    = -Og -g
21 | DEBUG_LDFLAGS   =
22 | 


--------------------------------------------------------------------------------
/dat/logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kiedtl/chmap/af3fd2fb798040e14a69f78ae317c2ec6668bed5/dat/logo.png


--------------------------------------------------------------------------------
/dat/logo.txt:
--------------------------------------------------------------------------------
1 |    ƚçĦârɱáƤ
2 | 


--------------------------------------------------------------------------------
/man/CHANGELOG-v1.0.0.md:
--------------------------------------------------------------------------------
1 | # Changelog v1.0.0
2 | 
3 | - `lcharmap` rewritten in C
4 | - `--range` syntax converted to the more common `START-END` from `START,END`.
5 | - `--range` now accepts numbers in binary, octal, hexadecimal.
6 | - `--search` engine now uses `musl`'s buggy regex implementation, instead of
7 |   the Rust crate.
8 | 


--------------------------------------------------------------------------------
/man/TODO.md:
--------------------------------------------------------------------------------
 1 | # TODO
 2 | 
 3 | Sorted in order of the chance that they'll be implemented in the next 10
 4 | years.
 5 | 
 6 | - Upgrade to Unicode 14
 7 | - "--pager" flag
 8 | - "-e" command to search emojis by CLDR (see https://github.com/arp242/uni)
 9 |   - "--tone" flag to change skin tone
10 |   - "--gender" flag to change gender
11 | - "-g" to print/search from Unicode groups
12 | - Move "-s" command to "--search"/"--filter" flag, so that it can be
13 |   combined with other commands
14 |   - Search modifiers: -or
15 | - new field: UTF-16 encoding
16 | - -c should sort and deduplicate input
17 | - an -f flag to filter output by property (e.g. `-f case=upper`)
18 | - simplify display code
19 | - X11/ncurses client, with an interface similar to charmap.exe
20 | - use cross-platform regex library instead of `regex.h` (PCRE?)
21 | 
22 | ---
23 | 
24 | **other:** `grep -nRE 'FIXME|XXX|TODO' src`
25 | 


--------------------------------------------------------------------------------
/man/chmap.scd:
--------------------------------------------------------------------------------
 1 | chmap(1) "Something Something Manual" "Something Something Manual"
 2 | 
 3 | # NAME
 4 | 
 5 | chmap - Get information for unicode characters
 6 | 
 7 | # SYNOPSIS
 8 | 
 9 | *chmap* [-C always|never|auto] [-l] [-r RANGE] [-c CHARS] [-s REGEX]
10 | 
11 | # DESCRIPTION
12 | 
13 | chmap (formerly lcharmap) retrieves information regarding Unicode
14 | characters, including a short description, its UTF8 encoding, the Unicode
15 | category it belongs to, its casing, and its decimal/hexadecimal/octal
16 | representation.
17 | 
18 | # OPTIONS
19 | 
20 | *-h*
21 | 	Print a short help message and exit.
22 | 
23 | *-V*
24 | 	Print chmap's version and exit.
25 | 
26 | *-l*
27 | 	Print information in the long format.
28 | 
29 | *-C* _WHEN_
30 | 	Control color usage (_WHEN_ can be *always*, *auto*, *never*).
31 | 
32 | # COMMANDS
33 | 
34 | *-r* _RANGE_
35 | 	Print a range of Unicode codepoints. (e.g. `0x033-0x035',
36 | 	or `1,3,6-9')
37 | 
38 | 	Range values can be in binary, octal, hexadecimal, in addition
39 | 	to decimal. In that case, they must be prefixed with a `0b', `0o',
40 | 	or `0x', respectively.
41 | 
42 | *-c* _CHARS_
43 | 	Print a range of Unicode codepoints that match the provided
44 | 	character(s). (e.g., `Kiëd')
45 | 
46 | 	The input must be a valid UTF8-encoded string.
47 | 
48 | *-s* _REGEX_
49 | 	Search character descriptions for _REGEX_.
50 | 
51 | # EXAMPLES
52 | 
53 | chmap -r 33,36,45-47
54 | 	Print information for characters 33, 36, 45, 46, and 47.
55 | 
56 | chmap -r 0o033,0xAB,0b1101
57 | 	Print information for characters 27, 171, and 13.
58 | 
59 | chmap -c Asbjørn
60 | 	Print information for all characters in the name `Asbjørn'.
61 | 
62 | chmap -s "^latin .\* a$"
63 | 	Print all characters where the description matches the regular
64 | 	expression "^latin .\* a$".
65 | 
66 | # KNOWN ISSUES
67 | 
68 | - chmap is currently buggy and untested on Windows and macOS.
69 | - The display code is utterly incomprehensible.
70 | 
71 | # AUTHORS
72 | 
73 | Kiëd Llaentenn <kiedtl@tilde.team>
74 | 
75 | # REPORTING BUGS
76 | 
77 | Send bug reports, hate mail, and other chmap-related bikeshedding to the
78 | author's email above, or */msg kiedtl* on freenode.
79 | 
80 | # SEE ALSO
81 | 
82 | The full documentation for *chmap* is not maintained as a Texinfo manual.
83 | If the *info* and *chmap* programs are properly installed on your system,
84 | the command
85 | 
86 | 	*info chmap*
87 | 
88 | should not give you access to the complete manual.
89 | 


--------------------------------------------------------------------------------
/src/arg.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copy me if you can.
 3 |  * by 20h
 4 |  */
 5 | 
 6 | #ifndef ARG_H__
 7 | #define ARG_H__
 8 | 
 9 | char *argv0;
10 | 
11 | /* use main(int argc, char *argv[]) */
12 | #define ARGBEGIN	for (argv0 = *argv, argv++, argc--;\
13 | 					argv[0] && argv[0][0] == '-'\
14 | 					&& argv[0][1];\
15 | 					argc--, argv++) {\
16 | 				char argc_;\
17 | 				char **argv_;\
18 | 				int brk_;\
19 | 				if (argv[0][1] == '-' && argv[0][2] == '\0') {\
20 | 					argv++;\
21 | 					argc--;\
22 | 					break;\
23 | 				}\
24 | 				for (brk_ = 0, argv[0]++, argv_ = argv;\
25 | 						argv[0][0] && !brk_;\
26 | 						argv[0]++) {\
27 | 					if (argv_ != argv)\
28 | 						break;\
29 | 					argc_ = argv[0][0];\
30 | 					switch (argc_)
31 | 
32 | /* Handles obsolete -NUM syntax */
33 | #define ARGNUM				case '0':\
34 | 					case '1':\
35 | 					case '2':\
36 | 					case '3':\
37 | 					case '4':\
38 | 					case '5':\
39 | 					case '6':\
40 | 					case '7':\
41 | 					case '8':\
42 | 					case '9'
43 | 
44 | #define ARGEND			}\
45 | 			}
46 | 
47 | #define ARGC()		argc_
48 | 
49 | #define EARGF(x)	((argv[0][1] == '\0' && argv[1] == NULL)?\
50 | 				((x), abort(), (char *)0) :\
51 | 				(brk_ = 1, (argv[0][1] != '\0')?\
52 | 					(&argv[0][1]) :\
53 | 					(argc--, argv++, argv[0])))
54 | 
55 | #define LNGARG()	&argv[0][0]
56 | 
57 | #endif
58 | 


--------------------------------------------------------------------------------
/src/display.c:
--------------------------------------------------------------------------------
  1 | #include <assert.h>
  2 | #include <stdio.h>
  3 | #include <string.h>
  4 | #include <strings.h>
  5 | 
  6 | #include "util.h"
  7 | #include "unicode.h"
  8 | #include "utf8.h"
  9 | 
 10 | static size_t _state;
 11 | 
 12 | /*
 13 |  * Render a NUL-terminated byte string as space-separated uppercase hex,
 14 |  * e.g. "\xE3\x99\xAF" becomes "E3 99 AF".
 15 |  *
 16 |  * Returns a pointer to a static buffer that is overwritten by the next
 17 |  * call. An empty input yields an empty string.
 18 |  */
 19 | static char *
 20 | fmt_bytes(char *bytes)
 21 | {
 22 | 	static char buf[255];
 23 | 	size_t len = 0;
 24 | 
 25 | 	buf[0] = '\0';
 26 | 
 27 | 	/* each byte needs at most "XX " plus the terminating NUL */
 28 | 	for (size_t i = 0; bytes[i] && len + 4 <= sizeof(buf); ++i) {
 29 | 		len += (size_t)snprintf(buf + len, sizeof(buf) - len, "%hhX ",
 30 | 			(unsigned char)bytes[i]);
 31 | 	}
 32 | 
 33 | 	/*
 34 | 	 * Drop the trailing separator. Trim by the real length: a byte below
 35 | 	 * 0x10 prints as a single digit and an empty input prints nothing,
 36 | 	 * so a fixed (count * 3) - 1 offset is either wrong or out of bounds.
 37 | 	 */
 38 | 	if (len > 0)
 39 | 		buf[len - 1] = '\0';
 40 | 
 41 | 	return buf;
 42 | }
 25 | 
 26 | /*
 27 |  * Print one table row for `entry`: codepoint, glyph, UTF-8 bytes, case and
 28 |  * description ("-" when `description` is NULL).
 29 |  *
 30 |  * With `fancy` set, rows for which the running row counter is even are
 31 |  * wrapped in a background-colour escape sequence.
 32 |  */
 33 | static void
 34 | printentry_short(uint32_t entry, char *description, _Bool fancy)
 35 | {
 36 | 	const _Bool shaded = fancy && (_state & 1) == 0;
 37 | 
 38 | 	char glyph[7] = { 0 };
 39 | 	utf8_encode(glyph, entry);
 40 | 
 41 | 	/* the wider the glyph, the fewer spaces are needed after it */
 42 | 	const char *padding = "     " + charwidths[entry];
 43 | 
 44 | 	/* control characters are not echoed to the terminal */
 45 | 	const _Bool iscontrol = charinfos[entry].category == UC_Cc;
 46 | 
 47 | 	const char *casestr;
 48 | 	if (unicodeisupper(entry))
 49 | 		casestr = "upper";
 50 | 	else if (unicodeislower(entry))
 51 | 		casestr = "lower";
 52 | 	else
 53 | 		casestr = "other";
 54 | 
 55 | 	if (shaded)
 56 | 		printf("\x1b[100m");
 57 | 
 58 | 	printf("%8s  %s%s  %-11s  %s  %s",
 59 | 		format("U+%04X", entry), iscontrol ? "" : glyph,
 60 | 		padding, fmt_bytes(glyph),
 61 | 		casestr, description ? description : "-");
 62 | 
 63 | 	if (shaded)
 64 | 		printf("\x1b[K\x1b[m");
 65 | 
 66 | 	putchar('\n');
 67 | }
 56 | 
 57 | /*
 58 |  * Print one "key value" line of the long format. The key is padded to 14
 59 |  * columns and, when `fancy` is set, rendered in bold.
 60 |  */
 61 | static void
 62 | fmt_entry(_Bool fancy, char *key, char *value)
 63 | {
 64 | 	if (!fancy) {
 65 | 		printf("%-14s %s\n", key, value);
 66 | 		return;
 67 | 	}
 68 | 
 69 | 	printf("\033[1m%-14s\033[m %s\n", key, value);
 70 | }
 65 | 
 66 | /*
 67 |  * Print the multi-line ("long") record for `entry`: codepoint in several
 68 |  * bases, UTF-8 bytes, glyph and column width, description ("(none)" when
 69 |  * `description` is NULL), case information and category, followed by a
 70 |  * blank line.
 71 |  */
 72 | static void
 73 | printentry_long(uint32_t entry, char *description, _Bool fancy)
 74 | {
 75 | 	/* charbuf holds the glyph itself, charbuf2 its case counterpart */
 76 | 	char charbuf[7], charbuf2[7];
 77 | 	bzero(charbuf, sizeof(charbuf));
 78 | 	bzero(charbuf2, sizeof(charbuf2));
 79 | 
 80 | 	utf8_encode(charbuf, entry);
 81 | 
 82 | 	size_t colwidth = charwidths[entry];
 83 | 	struct CharInfo ci = charinfos[entry];
 84 | 	_Bool iscontrol = ci.category == UC_Cc;
 85 | 
 86 | 	fmt_entry(fancy, "codepoint",     format("U+%04X    %-5d 0o%o", entry, entry, entry));
 87 | 	fmt_entry(fancy, "UTF8 encoding", fmt_bytes(charbuf));
 88 | 	fmt_entry(fancy, "glyph",         format("%s (%zu %s)", iscontrol ? "<control>" : charbuf,
 89 | 				colwidth, colwidth == 1 ? "column" : "columns"));
 90 | 	fmt_entry(fancy, "description",   description ? description : "(none)");
 91 | 
 92 | 	if (unicodeisupper(entry)) {
 93 | 		int32_t lower = ci.lower;
 94 | 		assert(lower != -1);
 95 | 		/* encode into the buffer that is actually printed below */
 96 | 		utf8_encode(charbuf2, lower);
 97 | 		fmt_entry(fancy, "case",
 98 | 			format("uppercase, lowercase: 0x%X %s", (unsigned)lower, charbuf2));
 99 | 	} else if (unicodeislower(entry)) {
100 | 		int32_t upper = ci.upper;
101 | 		assert(upper != -1);
102 | 		utf8_encode(charbuf2, upper);
103 | 		fmt_entry(fancy, "case",
104 | 			format("lowercase, uppercase: 0x%X %s", (unsigned)upper, charbuf2));
105 | 	} else {
106 | 		fmt_entry(fancy, "case", "other");
107 | 	}
108 | 
109 | 	fmt_entry(fancy, "category", (char *)category_strs[ci.category]);
110 | 
111 | 	printf("\n");
112 | }
105 | 
106 | /*
107 |  * Reset the row counter and, for the short (table) format only, print the
108 |  * column header line. With `fancy` set the header is rendered in bold.
109 |  */
110 | static void
111 | printheader(_Bool flong, _Bool fancy)
112 | {
113 | 	_state = 0;
114 | 
115 | 	/* the long format has no table header */
116 | 	if (flong)
117 | 		return;
118 | 
119 | 	if (fancy)
120 | 		printf("\x1b[1m");
121 | 
122 | 	printf("codepoint  glyph  encoded      case   description\n");
123 | 
124 | 	if (fancy)
125 | 		printf("\x1b[0m");
126 | }
117 | 
118 | /*
119 |  * Advance the row counter and print `entry` in the long format when
120 |  * `flong` is set, otherwise as a short table row.
121 |  */
122 | static void
123 | printentry(uint32_t entry, char *description, _Bool fancy, _Bool flong)
124 | {
125 | 	_state += 1;
126 | 
127 | 	if (flong) {
128 | 		printentry_long(entry, description, fancy);
129 | 		return;
130 | 	}
131 | 
132 | 	printentry_short(entry, description, fancy);
133 | }
128 | 


--------------------------------------------------------------------------------
/src/main.c:
--------------------------------------------------------------------------------
  1 | #include <err.h>
  2 | #include <regex.h>
  3 | #include <stdbool.h>
  4 | #include <stdint.h>
  5 | #include <stdio.h>
  6 | #include <stdlib.h>
  7 | #include <sqlite3.h>
  8 | #include <string.h>
  9 | #include <unistd.h>
 10 | 
 11 | #include "arg.h"
 12 | #include "display.c"
 13 | #include "util.h"
 14 | #include "range.c"
 15 | #include "unicode.h"
 16 | 
 17 | sqlite3 *db;
 18 | _Bool istty = false;
 19 | _Bool flong = false;
 20 | 
 21 | /*
 22 |  * Handle `-r RANGE`: expand the range expression in `param` and print an
 23 |  * entry for every codepoint it names. Exits with an error message when the
 24 |  * expression cannot be parsed; codepoints that fall outside the character
 25 |  * tables are reported with a warning and skipped.
 26 |  */
 27 | static void
 28 | range(char *param)
 29 | {
 30 | 	/*
 31 | 	 * TODO: support number characters from other languages
 32 | 	 * e.g. Chinese
 33 | 	 */
 34 | 
 35 | 	uint32_t entries[262144];
 36 | 	ssize_t entries_len = expand_range(param, entries);
 37 | 
 38 | 	if (entries_len < 0)
 39 | 		errx(1, "'%s': invalid range.", param);
 40 | 
 41 | 	printheader(flong, istty);
 42 | 
 43 | 	for (size_t i = 0; i < (size_t)entries_len; ++i) {
 44 | 		/*
 45 | 		 * charinfos[] and charwidths[] are declared with UNICODE_MAX
 46 | 		 * elements, so index UNICODE_MAX itself is already one past
 47 | 		 * the end and must be rejected as well.
 48 | 		 */
 49 | 		if (entries[i] >= UNICODE_MAX) {
 50 | 			warnx("%u is above maximum Unicode value", entries[i]);
 51 | 			continue;
 52 | 		}
 53 | 
 54 | 		char *desc = charinfos[entries[i]].desc;
 55 | 		printentry(entries[i], desc, istty, flong);
 56 | 	}
 57 | }
 46 | 
 47 | /*
 48 |  * Handle `-c CHARS`: decode `param` as UTF-8 and print an entry for each
 49 |  * decoded codepoint, in input order. Bytes that do not decode are reported
 50 |  * with a warning and skipped one at a time; decoded values that fall
 51 |  * outside the character tables are reported and skipped as well.
 52 |  */
 53 | static void
 54 | chars(char *param)
 55 | {
 56 | 	char *inp = param;
 57 | 	size_t len = strlen(inp);
 58 | 
 59 | 	printheader(flong, istty);
 60 | 
 61 | 	while (*inp) {
 62 | 		size_t   offset = inp - param;
 63 | 		uint32_t charbuf = 0;
 64 | 		ssize_t  runelen = utf8_decode(&charbuf, inp, len - offset);
 65 | 
 66 | 		if (runelen < 0) {
 67 | 			warnx("invalid UTF8 rune at offset %zu", offset);
 68 | 			++inp;
 69 | 			continue;
 70 | 		}
 71 | 
 72 | 		inp += runelen;
 73 | 
 74 | 		/*
 75 | 		 * charinfos[] has UNICODE_MAX elements; never index it with a
 76 | 		 * decoded value at or beyond that bound.
 77 | 		 */
 78 | 		if (charbuf >= UNICODE_MAX) {
 79 | 			warnx("U+%X at offset %zu is outside the supported range",
 80 | 				charbuf, offset);
 81 | 			continue;
 82 | 		}
 83 | 
 84 | 		printentry(charbuf, charinfos[charbuf].desc, istty, flong);
 85 | 	}
 86 | }
 70 | 
 71 | /*
 72 |  * Handle `-s REGEX`: compile `query` as a case-insensitive regular
 73 |  * expression and print an entry for every codepoint whose description
 74 |  * matches it. Exits with the regex library's own diagnostic when the
 75 |  * expression does not compile.
 76 |  */
 77 | static void
 78 | search(char *query)
 79 | {
 80 | 	regex_t re;
 81 | 
 82 | 	/* only a yes/no answer is needed, so skip sub-match bookkeeping */
 83 | 	int rc = regcomp(&re, query, REG_ICASE | REG_NOSUB);
 84 | 	if (rc != 0) {
 85 | 		char msg[256];
 86 | 		regerror(rc, &re, msg, sizeof(msg));
 87 | 		errx(1, "'%s': invalid regex: %s", query, msg);
 88 | 	}
 89 | 
 90 | 	printheader(flong, istty);
 91 | 
 92 | 	for (size_t i = 0; i < UNICODE_MAX; ++i) {
 93 | 		char *desc = charinfos[i].desc;
 94 | 		if (desc == NULL)
 95 | 			continue;
 96 | 
 97 | 		/* 0 means "matched"; any other value is no match or an error */
 98 | 		if (regexec(&re, desc, 0, NULL, 0) == 0)
 99 | 			printentry((uint32_t)i, desc, istty, flong);
100 | 	}
101 | 
102 | 	regfree(&re);
103 | }
 92 | 
 93 | /*
 94 |  * Print the usage line and, unless `_short` is set, the full help text;
 95 |  * then exit with status 0. Never returns.
 96 |  *
 97 |  * Only single-letter options are listed because those are the only ones
 98 |  * the argument parser in main() accepts.
 99 |  */
100 | static void
101 | usage(_Bool _short)
102 | {
103 | 	printf("Usage: chmap [-C always|never|auto] [-l] [-r RANGE] [-c CHARS] [-s REGEX]\n");
104 | 
105 | 	if (_short)
106 | 		exit(0);
107 | 
108 | 	printf("\n");
109 | 	printf("Print information for Unicode characters.\n");
110 | 	printf("\n");
111 | 	printf("OPTIONS:\n");
112 | 	printf("    -l          show character entries in the long format.\n");
113 | 	printf("    -C WHEN     control color usage (always, never, auto).\n");
114 | 	printf("    -h          print this help message and exit.\n");
115 | 	printf("    -V          print version and exit.\n");
116 | 	printf("\n");
117 | 	printf("COMMANDS:\n");
118 | 	printf("    -r RANGE    print a range of Unicode characters.\n");
119 | 	printf("    -c CHARS    print a range of Unicode codepoints that match\n");
120 | 	printf("                provided character(s).\n");
121 | 	printf("    -s REGEX    search character descriptions for REGEX.\n");
122 | 	printf("\n");
123 | 	printf("Full documentation is available locally at chmap(1).\n");
124 | 
125 | 	exit(0);
126 | }
119 | 
120 | /*
121 |  * Decide whether colored output should be used by default: only when
122 |  * standard output is a terminal, NO_COLOR is not set in the environment,
123 |  * and TERM is set to something other than "dumb".
124 |  */
125 | static _Bool
126 | usecolor(void)
127 | {
128 | 	const char *term;
129 | 
130 | 	if (!isatty(STDOUT_FILENO))
131 | 		return false;
132 | 
133 | 	/* the mere presence of NO_COLOR disables color, whatever its value */
134 | 	if (getenv("NO_COLOR") != NULL)
135 | 		return false;
136 | 
137 | 	term = getenv("TERM");
138 | 	return term != NULL && strcmp(term, "dumb") != 0;
139 | }
137 | 
138 | /*
139 |  * Entry point. Options are processed strictly left to right, and the
140 |  * commands (-r, -c, -s) run as soon as they are encountered, so modifiers
141 |  * such as -l and -C only affect the commands that follow them on the
142 |  * command line.
143 |  */
144 | int
145 | main(int argc, char **argv)
146 | {
147 | 	istty = usecolor();
148 | 
149 | 	ARGBEGIN {
150 | 	case 'l':
151 | 		flong = !flong;
152 | 		break;
153 | 	case 'r':
154 | 		range(EARGF(usage(true)));
155 | 		break;
156 | 	case 'c':
157 | 		chars(EARGF(usage(true)));
158 | 		break;
159 | 	case 's':
160 | 		search(EARGF(usage(true)));
161 | 		break;
162 | 	case 'C': {
163 | 		/* keep the argument in a local rather than getopt's optarg */
164 | 		char *when = EARGF(usage(true));
165 | 
166 | 		/* only the first two letters are compared */
167 | 		if (!strncmp(when, "au", 2))
168 | 			istty = usecolor();
169 | 		else if (!strncmp(when, "al", 2))
170 | 			istty = true;
171 | 		else if (!strncmp(when, "ne", 2))
172 | 			istty = false;
173 | 		else
174 | 			usage(true);
175 | 		break;
176 | 	}
177 | 	case 'V': case 'v':
178 | 		printf("chmap v%s\n", VERSION);
179 | 		return 0;
180 | 	case 'h':
181 | 		usage(false);
182 | 		break;
183 | 	default:
184 | 		usage(true);
185 | 	} ARGEND
186 | 
187 | 	return 0;
188 | }
171 | 


--------------------------------------------------------------------------------
/src/range.c:
--------------------------------------------------------------------------------
  1 | #include <ctype.h>
  2 | #include <stdbool.h>
  3 | #include <stdint.h>
  4 | #include <stdio.h>
  5 | #include <stddef.h>
  6 | #include <stdlib.h>
  7 | #include <string.h>
  8 | 
  9 | static size_t _buf_len = 0;
 10 | 
 11 | static _Bool
 12 | parse_int(int *x, char *s, char **e, _Bool add, uint32_t *buf)
 13 | {
 14 | 	size_t base;
 15 | 	if (!strncmp(s, "0x", 2) || !strncmp(s, "U+", 2)) {
 16 | 		base = 16;
 17 | 		s += 2;
 18 | 	} else if (!strncmp(s, "0o", 2)) {
 19 | 		base = 8;
 20 | 		s += 2;
 21 | 	} else if (!strncmp(s, "0b", 2)) {
 22 | 		base = 2;
 23 | 		s += 2;
 24 | 	} else {
 25 | 		base = 10;
 26 | 	}
 27 | 
 28 | 	*x = strtol(s, e, base);
 29 | 	_Bool ok = *e != s;
 30 | 
 31 | 	/* HACK: the add parameter controls whether a parsed integer is
 32 | 	 * added to the entries if it succeeds in parsing it.
 33 | 	 *
 34 | 	 * The reason it is needed is because parsed_int is used in two
 35 | 	 * places: 1) in expand_range (where we *want* successfully parse
 36 | 	 * integers to be added to the entries) and 2) in parse_range (where
 37 | 	 * we *don't want* successfully parsed integers to be added to the
 38 | 	 * entries.
 39 | 	 */
 40 | 
 41 | 	if (ok && add) buf[_buf_len] = *x, ++_buf_len;
 42 | 	return ok;
 43 | }
 44 | 
 45 | 
 46 | static _Bool
 47 | parse_range(char *s, char **e, uint32_t *buf)
 48 | {
 49 | 	int x = 0, y = 0;
 50 | 	char *ee;
 51 | 	char *start = s;
 52 | 
 53 | 	/* try to parse left-hand side of range */
 54 | 	if (!parse_int(&x, s, &ee, false, buf))
 55 | 		return false;
 56 | 	s = ee;
 57 | 
 58 | 	/* check if this is really a range, or just
 59 | 	 * a single integer */
 60 | 	if (*s != '-') {
 61 | 		e = &start;
 62 | 		return false;
 63 | 	} else {
 64 | 		++s;
 65 | 	}
 66 | 	
 67 | 	/* try to parse right-hand side of range */
 68 | 	if (!parse_int(&y, s, e, false, buf))
 69 | 		return false;
 70 | 
 71 | 	/* check if left-hand size is greater than
 72 | 	 * right-hand side of range */
 73 | 	if (y < x) return false;
 74 | 
 75 | 	/* copy onto accumulator */
 76 | 	for (size_t i = x; i <= (size_t)y; ++i)
 77 | 		buf[_buf_len] = i, ++_buf_len;
 78 | 	return true;
 79 | }
 80 | 
 81 | static ssize_t
 82 | expand_range(char *s, uint32_t *buf)
 83 | {
 84 | 	_buf_len = 0;
 85 | 	int x = 0;
 86 | 	char **e = &s;
 87 | 
 88 | 	for (;;) {
 89 | 		while (isspace(*s)) ++s;
 90 | 
 91 | 		/*
 92 | 		 * try to parse input as a range, and fall back
 93 | 		 * to parsing input as a single integer if that
 94 | 		 * failed.
 95 | 		 * if both failed, it's probably a syntax error.
 96 | 		 */
 97 | 		if (!parse_range(s, e, buf)) {
 98 | 			if (!parse_int(&x, s, e, true, buf)) {
 99 | 				break;
100 | 			}
101 | 		}
102 | 		s = *e;
103 | 		
104 | 		while (isspace(*s)) ++s;
105 | 		if (strlen(s) == 0) return _buf_len;
106 | 
107 | 		/* check if there's something more to parse */
108 | 		if ((*s) == ',') {
109 | 			++s;
110 | 			continue;
111 | 		}
112 | 
113 | 		break;
114 | 	}
115 | 
116 | 	/* if we broke out of the main loop then a syntax
117 | 	 * error must have occurred */
118 | 	return -1;
119 | }
120 | 


--------------------------------------------------------------------------------
/src/unicode.c:
--------------------------------------------------------------------------------
 1 | #include "unicode.h"
 2 | 
 3 | const char *category_strs[30] = {
 4 | 	[UC_Cn] = "Other (not assigned)",
 5 | 	[UC_Lu] = "Letter (uppercase)",
 6 | 	[UC_Ll] = "Letter (lowercase)",
 7 | 	[UC_Lt] = "Letter (titlecase)",
 8 | 	[UC_Lm] = "Letter (modifier)",
 9 | 	[UC_Lo] = "Letter (other)",
10 | 	[UC_Mn] = "Mark (nonspacing)",
11 | 	[UC_Mc] = "Mark (space combining)",
12 | 	[UC_Me] = "Mark (enclosing)",
13 | 	[UC_Nd] = "Number (decimal digit)",
14 | 	[UC_Nl] = "Number (letter)",
15 | 	[UC_No] = "Number (other)",
16 | 	[UC_Pc] = "Punctuation (connector)",
17 | 	[UC_Pd] = "Punctuation (dash)",
18 | 	[UC_Ps] = "Punctuation (open)",
19 | 	[UC_Pe] = "Punctuation (close)",
20 | 	[UC_Pi] = "Punctuation (initial quote)",
21 | 	[UC_Pf] = "Punctuation (final quote)",
22 | 	[UC_Po] = "Punctuation (other)",
23 | 	[UC_Sm] = "Symbol (math)",
24 | 	[UC_Sc] = "Symbol (currency)",
25 | 	[UC_Sk] = "Symbol (modifier)",
26 | 	[UC_So] = "Symbol (other)",
27 | 	[UC_Zs] = "Separator (space)",
28 | 	[UC_Zl] = "Separator (line)",
29 | 	[UC_Zp] = "Separator (paragraph)",
30 | 	[UC_Cc] = "Other (control)",
31 | 	[UC_Cf] = "Other (format)",
32 | 	[UC_Cs] = "Other (surrogate)",
33 | 	[UC_Co] = "Other (private use)",
34 | };
35 | 
36 | #include "dat/charinfo.c"
37 | #include "dat/charwidths.c"
38 | 
39 | /*
40 |  * Report whether codepoint `c` is treated as uppercase: its table entry
41 |  * has differing `upper` and `lower` fields, an `upper` field of -1, and a
42 |  * category other than titlecase (UC_Lt).
43 |  *
44 |  * NOTE(review): -1 presumably means "no such case mapping"; confirm
45 |  * against the generator of dat/charinfo.c. `c` is not range-checked here.
46 |  */
47 | _Bool
48 | unicodeisupper(uint32_t c)
49 | {
50 | 	const struct CharInfo *info = &charinfos[c];
51 | 
52 | 	if (info->category == UC_Lt)
53 | 		return 0;
54 | 
55 | 	return info->upper == -1 && info->lower != info->upper;
56 | }
45 | 
46 | /*
47 |  * Report whether codepoint `c` is treated as lowercase: its table entry
48 |  * has differing `upper` and `lower` fields and a `lower` field of -1.
49 |  *
50 |  * NOTE(review): -1 presumably means "no such case mapping"; confirm
51 |  * against the generator of dat/charinfo.c. `c` is not range-checked here.
52 |  */
53 | _Bool
54 | unicodeislower(uint32_t c)
55 | {
56 | 	const struct CharInfo *info = &charinfos[c];
57 | 
58 | 	return info->lower == -1 && info->upper != info->lower;
59 | }
52 | 


--------------------------------------------------------------------------------
/src/unicode.h:
--------------------------------------------------------------------------------
 1 | #ifndef UNICODE_H
 2 | #define UNICODE_H
 3 | 
 4 | #include <stdint.h>
 5 | 
 6 | #define UNICODE_MAX 0x10FFFF
 7 | 
/*
 * Unicode general categories. The enumerators index category_strs[] and
 * are stored in the `category` field of struct CharInfo; the data generator
 * (tool/gencharsh.lua) refers to them by name, as "UC_" followed by the
 * two-letter category code. The values are implicit, so the order of the
 * enumerators below must not change without regenerating/rebuilding
 * everything that uses them.
 */
enum {
	UC_Lu, /* Letter (uppercase) */
	UC_Ll, /* Letter (lowercase) */
	UC_Lt, /* Letter (titlecase) */
	UC_Mn, /* Mark (nonspacing) */
	UC_Mc, /* Mark (spacing combining) */
	UC_Me, /* Mark (enclosing) */
	UC_Nd, /* Number (decimal digit) */
	UC_Nl, /* Number (letter) */
	UC_No, /* Number (other) */
	UC_Zs, /* Separator (space) */
	UC_Zl, /* Separator (line) */
	UC_Zp, /* Separator (paragraph) */
	UC_Cc, /* Other (control) */
	UC_Cf, /* Other (format) */
	UC_Cs, /* Other (surrogate) */
	UC_Co, /* Other (private use) */
	UC_Cn, /* Other (not assigned) */

	UC_Lm, /* Letter (modifier) */
	UC_Lo, /* Letter (other) */
	UC_Pc, /* Punctuation (connector) */
	UC_Pd, /* Punctuation (dash) */
	UC_Ps, /* Punctuation (open) */
	UC_Pe, /* Punctuation (close) */
	UC_Pi, /* Punctuation (initial quote) */
	UC_Pf, /* Punctuation (final quote) */
	UC_Po, /* Punctuation (other) */
	UC_Sm, /* Symbol (math) */
	UC_Sc, /* Symbol (currency) */
	UC_Sk, /* Symbol (modifier) */
	UC_So  /* Symbol (other) */
};
41 | 
/*
 * Bidirectional classes. The data generator (tool/gencharsh.lua) builds
 * these names as "UBIDI_" followed by the bidi field of each input record,
 * and the resulting value is stored in the `bidirect` field of
 * struct CharInfo. The values are implicit, so the enumerator order matters
 * to any code that stores or compares them.
 */
enum {
	UBIDI_AL,
	UBIDI_AN,
	UBIDI_B,
	UBIDI_BN,
	UBIDI_CS,
	UBIDI_EN,
	UBIDI_ES,
	UBIDI_ET,
	UBIDI_FSI,
	UBIDI_L,
	UBIDI_LRE,
	UBIDI_LRI,
	UBIDI_LRO,
	UBIDI_NSM,
	UBIDI_ON,
	UBIDI_PDF,
	UBIDI_PDI,
	UBIDI_R,
	UBIDI_RLE,
	UBIDI_RLI,
	UBIDI_RLO,
	UBIDI_S,
	UBIDI_WS,
};
67 | 
68 | extern const char *category_strs[30];
69 | 
/*
 * Per-code-point record of the generated charinfos[] table.
 *
 * tool/gencharsh.lua emits positional initializers in exactly this field
 * order ({ category, bidi, decimal, "desc", upper, lower }), so the fields
 * must not be reordered without updating the generator.
 */
struct CharInfo {
	uint8_t category; /* one of the UC_* general categories */
	char bidirect;    /* one of the UBIDI_* bidirectional classes */
	int32_t decimal;  /* decimal digit value, or -1 when the field is empty */
	char *desc;       /* lowercased character name */
	int32_t upper;    /* uppercase mapping (code point), or -1 if none */
	int32_t lower;    /* lowercase mapping (code point), or -1 if none */
};
78 | 
79 | extern const struct CharInfo charinfos[UNICODE_MAX];
80 | extern const uint8_t charwidths[UNICODE_MAX];
81 | 
82 | _Bool unicodeisupper(uint32_t c);
83 | _Bool unicodeislower(uint32_t c);
84 | 
85 | #endif
86 | 


--------------------------------------------------------------------------------
/src/utf8.c:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Copyright (C) 2010-2013 nsf <no.smile.face@gmail.com>
  3 |  *                    2021 kiedtl <kiedtl@tilde.team>
  4 |  *
  5 |  * Permission is hereby granted, free of charge, to any person obtaining a copy
  6 |  * of this software and associated documentation files (the "Software"), to deal
  7 |  * in the Software without restriction, including without limitation the rights
  8 |  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  9 |  * copies of the Software, and to permit persons to whom the Software is
 10 |  * furnished to do so, subject to the following conditions:
 11 |  *
 12 |  * The above copyright notice and this permission notice shall be included in
 13 |  * all copies or substantial portions of the Software.
 14 |  *
 15 |  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 16 |  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 17 |  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 18 |  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 19 |  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 20 |  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 21 |  * THE SOFTWARE.
 22 |  */
 23 | 
 24 | #include <stdint.h>
 25 | #include <stddef.h>
 26 | #include <sys/types.h>
 27 | 
 28 | #include "utf8.h"
 29 | #include "unicode.h"
 30 | 
/*
 * Length in bytes of the UTF-8 sequence introduced by each possible lead
 * byte, indexed by the byte value. Invalid starter bytes are marked with 0:
 *
 * The first two... cells (C0 and C1) could be used only for a 2-byte encoding
 * of a 7-bit ASCII character which should be encoded in 1 byte...  such
 * "overlong" sequences are disallowed. The red cells in the F_ row (F5 to FD)
 * indicate leading bytes of 4-byte or longer sequences that cannot be valid
 * because they would encode code points larger than the U+10FFFF limit of
 * Unicode (a limit derived from the maximum code point encodable in UTF-16).
 * FE and FF do not match any allowed character pattern and are therefore not
 * valid start bytes.            -- Wikipedia article on UTF8
 *
 * Additionally, values between 0x80 and 0xbf inclusive are marked with 0, as
 * they are continuation bytes and may not appear at the beginning of an
 * encoded rune sequence.
 *
 * Note that the entry for 0x00 is 0 as well, i.e. a NUL byte is not treated
 * as the start of a sequence by this table.
 */
static const uint8_t utf8_length[256] = {
     /* 0  1  2  3  4  5  6  7  8  9  A  B  C  D  E  F */
/* 0 */ 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
/* 1 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
/* 2 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
/* 3 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
/* 4 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
/* 5 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
/* 6 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
/* 7 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
/* 8 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
/* 9 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
/* A */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
/* B */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
/* C */ 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
/* D */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
/* E */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
/* F */ 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
 66 | 
 67 | static const uint8_t utf8_mask[6] = {
 68 | 	0x7F, 0x1F, 0x0F, 0x07, 0x03, 0x01
 69 | };
 70 | 
 71 | ssize_t utf8_char_to_unicode(uint32_t *out, const char *c);
 72 | size_t  utf8_unicode_to_char(char *out, uint32_t c);
 73 | 
 74 | uint8_t
 75 | utf8_bytesz(char c)
 76 | {
 77 | 	return utf8_length[(uint8_t)c];
 78 | }
 79 | 
 80 | ssize_t
 81 | utf8_decode(uint32_t *out, char *c, size_t sz)
 82 | {
 83 | 	if (c[0] == 0 || sz == 0)
 84 | 		return -1;
 85 | 
 86 | 	uint8_t len = utf8_bytesz(*c);
 87 | 
 88 | 	if (len == 0 || len > sz)
 89 | 		return -1;
 90 | 
 91 | 	uint32_t result = c[0] & utf8_mask[len-1];
 92 | 
 93 | 	for (size_t i = 1; i < len; ++i) {
 94 | 		if ((c[i] & 0xc0) != 0x80)
 95 | 			return -1; /* not a continuation byte */
 96 | 		result <<= 6;
 97 | 		result |= c[i] & 0x3f;
 98 | 	}
 99 | 
100 | 	if (result > UNICODE_MAX)
101 | 		return -1; /* value beyond unicode's 21-bit max */
102 | 	if (result >= 0xD800 && result <= 0xDFFF)
103 | 		return -1; /* surrogate chars */
104 | 	if (result >= 0xFDD0 && result <= 0xFDEF)
105 | 		return -1; /* non-character range */
106 | 	if ((result & 0xFFFE) == 0xFFFE)
107 | 		return -1; /* non-character at plane end */
108 | 
109 | 	*out = result;
110 | 	return (size_t)len;
111 | }
112 | 
/*
 * Encode code point `c` as UTF-8 into `out`, which must have room for up to
 * four bytes. No terminating NUL is written.
 *
 * Returns the number of bytes written (1-4), or -1 if `c` is 0x110000 or
 * larger.
 *
 * XXX: surrogate code points are encoded like any other value, even though
 * the result is not valid UTF-8.
 */
ssize_t
utf8_encode(char *out, uint32_t c)
{
	size_t nbytes;
	uint32_t lead;

	if (c >= 0x110000)
		return -1;

	if (c < 0x80) {
		nbytes = 1;
		lead = 0x00;
	} else if (c < 0x800) {
		nbytes = 2;
		lead = 0xc0;
	} else if (c < 0x10000) {
		nbytes = 3;
		lead = 0xe0;
	} else {
		nbytes = 4;
		lead = 0xf0;
	}

	/* fill the continuation bytes from the tail towards the front, six
	 * payload bits at a time */
	size_t pos = nbytes;
	while (--pos > 0) {
		out[pos] = (c & 0x3f) | 0x80;
		c >>= 6;
	}

	/* whatever is left of c fits next to the lead marker bits */
	out[0] = c | lead;

	return nbytes;
}
144 | 


--------------------------------------------------------------------------------
/src/utf8.h:
--------------------------------------------------------------------------------
 1 | #ifndef UTF8_H
 2 | #define UTF8_H
 3 | 
 4 | #include <stdint.h>
 5 | 
 6 | uint8_t utf8_bytesz(char c);
 7 | ssize_t utf8_decode(uint32_t *out, char *c, size_t sz);
 8 | ssize_t utf8_encode(char *out, uint32_t c);
 9 | 
10 | #endif
11 | 


--------------------------------------------------------------------------------
/src/util.c:
--------------------------------------------------------------------------------
 1 | #include <assert.h>
 2 | #include <stdint.h>
 3 | #include <stdarg.h>
 4 | #include <stdio.h>
 5 | #include <string.h>
 6 | 
 7 | #include "util.h"
 8 | #include "unicode.h"
 9 | 
/*
 * printf-style formatting into a single static buffer.
 *
 * Returns a pointer to that buffer, which holds the NUL-terminated result.
 * The buffer is shared between calls: every call overwrites the previous
 * result, so the returned string must be used or copied before the next
 * call, and the function is not reentrant.
 *
 * Output that does not fit into the buffer (or a formatting error) trips
 * the assertion below; with assertions compiled out the result is the
 * truncated string (or an empty string on error).
 */
char * __attribute__((format(printf, 1, 2)))
format(const char *fmt, ...)
{
	static char buf[8192];
	va_list ap;
	int len;

	/* no need to clear the buffer first: vsnprintf() always
	 * NUL-terminates what it writes */
	va_start(ap, fmt);
	len = vsnprintf(buf, sizeof(buf), fmt, ap);
	va_end(ap);

	/* a negative result signals an output error; never hand back
	 * whatever happened to be in the buffer in that case */
	if (len < 0)
		buf[0] = '\0';

	assert(len >= 0 && (size_t) len < sizeof(buf));
	return buf;
}
22 | 


--------------------------------------------------------------------------------
/src/util.h:
--------------------------------------------------------------------------------
1 | #ifndef UTIL_H
2 | #define UTIL_H
3 | 
4 | #define UNUSED(V) ((void)(V))
5 | 
6 | char * __attribute__((format(printf, 1, 2))) format(const char *fmt, ...);
7 | 
8 | #endif
9 | 


--------------------------------------------------------------------------------
/tests/.gitignore:
--------------------------------------------------------------------------------
1 | pilot_range
2 | pilot_decode
3 | 


--------------------------------------------------------------------------------
/tests/decode.sh:
--------------------------------------------------------------------------------
  1 | #!/bin/sh
  2 | #
  3 | # Require GNU printf(3) and GNU bc
  4 | 
  5 | . tests/lib.sh
  6 | 
  7 | hex() {
  8 |     echo "obase=16;ibase=A;${1##0x}" | bc
  9 | }
 10 | 
 11 | decode() {
 12 |     tests/pilot_decode $(/usr/bin/printf "$1") >/dev/null
 13 | }
 14 | 
 15 | begin "Decode capital a"
 16 | cmdout decode '\x41'             'U+0041'
 17 | 
 18 | begin "Decode small o with diaeresis"
 19 | cmdout decode '\xc3\xb6'         'U+00F6'
 20 | 
 21 | begin "Decode cyrillic capital letter zhe"
 22 | cmdout decode '\xd0\x96'         'U+0416'
 23 | 
 24 | begin "Decode euro sign"
 25 | cmdout decode '\xe2\x82\xac'     'U+20AC'
 26 | 
 27 | begin "Decode musical symbol g cleff"
 28 | cmdout decode '\xf0\x9d\x84\x9e' 'U+1D11E'
 29 | 
 30 | begin "Fail on decoding surrogate chars"
 31 | cmdend decode '\xed\xa0\x82' 1
 32 | 
 33 | begin "Fail on first byte == 0xFF"
 34 | cmdend decode '\xff\xc3\xb1' 1
 35 | 
 36 | # The following tests were directly stolen from libutf8proc's
 37 | # test suite.
 38 | 
 39 | test_fail_missing_cont_byte() {
 40 |     for byte in $(seq 0xc0 0xff); do decode "\x$(hex $byte)" && return 1; done
 41 |     return 0
 42 | }
 43 | begin "Fail on missing continuation bytes"
 44 | test_fail_missing_cont_byte; chkend
 45 | 
 46 | test_fail_missing_cont_byte_before_noncont_0x41() {
 47 |     for byte in $(seq 0xc0 0xff); do decode "\x$(hex $byte)\x41" && return 1; done
 48 |     return 0
 49 | }
 50 | begin "Fail on lead followed by non-continuation byte 0x41"
 51 | test_fail_missing_cont_byte_before_noncont_0x41; chkend
 52 | 
 53 | test_fail_missing_cont_byte_before_noncont_0xc0() {
 54 |     for byte in $(seq 0xc0 0xff); do decode "\x$(hex $byte)\x41" && return 1; done
 55 |     return 0
 56 | }
 57 | begin "Fail on lead followed by non-continuation byte 0xc0"
 58 | test_fail_missing_cont_byte_before_noncont_0xc0; chkend
 59 | 
 60 | test_fail_lead_cont_byte() {
 61 |     for byte in $(seq 0x80 0xc3); do decode "\x$(hex $byte)" && return 1; done
 62 |     return 0
 63 | }
 64 | begin "Fail on leading continuation byte"
 65 | test_fail_lead_cont_byte; chkend
 66 | 
 67 | test_fail_overlong_2() {
 68 |     for byte in $(seq 0x81 0xbf); do
 69 |         decode "\xc0\x$(hex $byte)" && return 1
 70 |         decode "\xc1\x$(hex $byte)" && return 1
 71 |     done
 72 |     return 0
 73 | }
 74 | begin "Fail on overlong 2byte sequence"
 75 | test_fail_overlong_2; chkend
 76 | 
 77 | test_fail_overlong_3() {
 78 |     for byte in $(seq 0x80 0x9f); do
 79 |         decode "\xe0\x$(hex $byte)\x80" && return 1
 80 |     done
 81 |     return 0
 82 | }
 83 | begin "Fail on overlong 3byte sequence"
 84 | test_fail_overlong_3; chkend
 85 | 
 86 | test_fail_overlong_4() {
 87 |     for byte in $(seq 0x80 0x8f); do
 88 |         decode "\xf0\x$(hex $byte)\x80\x80" && return 1
 89 |     done
 90 |     return 0
 91 | }
 92 | begin "Fail on overlong 4byte sequence"
 93 | test_fail_overlong_4; chkend
 94 | 
 95 | test_fail_abovemax_4() {
 96 |     for byte in $(seq 0x90 0xbf); do decode "\xf4\x$(hex $byte)\x80\x80" && return 1; done
 97 |     for byte in $(seq 0xf5 0xf7); do decode "\x$(hex $byte)\x80\x80\x80" && return 1; done
 98 |     return 0
 99 | }
100 | begin "Fail on 4byte sequence encoding > UNICODE_MAX"
101 | test_fail_abovemax_4; chkend
102 | 
# Five-byte inputs of the form f7 80 80 80 XX, with XX from 0xf8 to 0xfd,
# must all be rejected; returns 0 only if none is accepted.
test_fail_invalid_5() {
    for byte in $(seq 0xf8 0xfd); do
        if decode "\xf7\x80\x80\x80\x$(hex $byte)"; then
            return 1
        fi
    done
    return 0
}
109 | begin "Fail on invalid 5byte sequence"
110 | test_fail_invalid_5; chkend
111 | 
# Six-byte inputs of the form f7 80 80 80 80 XX, with XX from 0xfc to 0xfd,
# must all be rejected; returns 0 only if none is accepted.
test_fail_invalid_6() {
    for byte in $(seq 0xfc 0xfd); do
        if decode "\xf7\x80\x80\x80\x80\x$(hex $byte)"; then
            return 1
        fi
    done
    return 0
}
118 | begin "Fail on invalid 6byte sequence"
119 | test_fail_invalid_6; chkend
120 | 


--------------------------------------------------------------------------------
/tests/lib.sh:
--------------------------------------------------------------------------------
 1 | printf '== %s\n' "$0"
 2 | trap "printf '\n'" EXIT
 3 | 
 4 | begin() {
 5 |     trmcols=$(stty size | cut -d' ' -f2)
 6 |     padding=$(($trmcols - 5))
 7 |     printf "%-${padding}s" "$1"
 8 | }
 9 | 
# Print the verdict for a failed test.
failure() {
    echo 'FAIL'
}
13 | 
# Print the verdict for a passed test (padded to the width of "FAIL").
success() {
    echo '  OK'
}
17 | 
# cmdout PILOT INPUT EXPECTED
#
# Run tests/pilot_PILOT on INPUT (backslash escapes in INPUT are expanded
# first) and report success when its standard output equals EXPECTED.
cmdout() {
    arg=$(/usr/bin/printf '%b' "$2")

    if [ "$(tests/pilot_$1 $arg)" = "$3" ]; then
        success
    else
        failure
    fi
}
26 | 
# cmdend PILOT INPUT STATUS
#
# Run tests/pilot_PILOT on INPUT (backslash escapes in INPUT are expanded
# first) with all output discarded, and report success when it exits with
# STATUS.
cmdend() {
    arg=$(/usr/bin/printf '%b' "$2")
    tests/pilot_$1 $arg >/dev/null 2>&1

    if [ $? -ne "$3" ]; then failure; else success; fi
}
37 | 
# Report the verdict for the command that ran immediately before this call:
# success when it exited with status 0, failure otherwise.
chkend() {
    case $? in
        0) success ;;
        *) failure ;;
    esac
}
41 | 


--------------------------------------------------------------------------------
/tests/pilot_decode.c:
--------------------------------------------------------------------------------
 1 | #include <stdio.h>
 2 | #include <string.h>
 3 | #include "utf8.c"
 4 | 
 5 | int
 6 | main(int argc, char **argv)
 7 | {
 8 | 	uint32_t charbuf = 0;
 9 | 	ssize_t runelen  = 0;
10 | 
11 | 	while (*argv[1]) {
12 | 		charbuf = 0;
13 | 		if ((runelen = utf8_decode(&charbuf, argv[1], strlen(argv[1]))) < 0)
14 | 			return 1;
15 | 		printf("U+%04X\n", charbuf);
16 | 		argv[1] += runelen;
17 | 	}
18 | }
19 | 


--------------------------------------------------------------------------------
/tests/pilot_range.c:
--------------------------------------------------------------------------------
 1 | #include <stdio.h>
 2 | #include "range.c"
 3 | 
 4 | int
 5 | main(int argc, char **argv)
 6 | {
 7 | 	uint32_t entries[4096] = {0};
 8 | 	ssize_t  entries_len = 0;
 9 | 	if ((entries_len = expand_range(argv[1], entries)) < 0)
10 | 		return 1;
11 | 
12 | 	printf("%d", entries[0]);
13 | 	for (size_t i = 1; i < entries_len; ++i) {
14 | 		printf(" %d", entries[i]);
15 | 	}
16 | 
17 | 	printf("\n");
18 | }
19 | 


--------------------------------------------------------------------------------
/tests/range.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/sh
 2 | 
 3 | . tests/lib.sh
 4 | 
 5 | begin "Lone stranger"
 6 | cmdout range "128"   "128"
 7 | 
 8 | begin "Simple start-to-end"
 9 | cmdout range "0-5"   "0 1 2 3 4 5"
10 | 
11 | begin "Start-to-end with stranger"
12 | cmdout range "0-5,8" "0 1 2 3 4 5 8"
13 | 
14 | begin "Start-to-end with hexadecimal stranger"
15 | cmdout range "0-5,0xFF" "0 1 2 3 4 5 255"
16 | 
17 | begin "Strangers with negative values"
18 | cmdout range "12,-9,-5--2" "12 -9 -5 -4 -3 -2"
19 | 
20 | begin "Fail when end < start"
21 | cmdend range "5-2" 1
22 | 
23 | begin "Fail on invalid number"
24 | cmdend range "0-abcd" 1
25 | 
26 | begin "Fail on invalid hexadecimal number"
27 | cmdend range "0-12,24,0xFZ" 1
28 | 
29 | begin "Fail when missing end of start-to-end"
30 | cmdend range "12,4-9,0-" 1
31 | 
32 | begin "Fail when missing start of start-to-end"
33 | cmdend range "12,--9,0-8" 1
34 | 


--------------------------------------------------------------------------------
/tool/gencharsh.lua:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env lua
 2 | --
 3 | -- (c) Kiëd Llaentenn <kiedtl@tilde.team>
 4 | -- See the COPYING file for copyright information.
 5 | 
 6 | function mytonumber(a, ...)
 7 |     if a and a ~= "" then
 8 |         return tonumber(a, ...)
 9 |     else
10 |         return -1
11 |     end
12 | end
13 | 
-- Emit the opening of the generated C source. Each "\" + newline pair inside
-- the string is a literal newline.
print("\
#include <stdint.h>\
#include \"unicode.h\"\
\
/* This file has been automatically generated. */\
\
const struct CharInfo charinfos[UNICODE_MAX] = {\
")

-- Captures, from one semicolon-separated input record: ch, desc, category,
-- bidi, decimal, olddesc, upper and lower (the remaining fields are skipped).
local RECORD = "(.-);(.-);(.-);.-;(.-);.-;(.-);.-;.-;.-;(.-);.-;(.-);(.-);"

-- One designated initializer is written per input record read from stdin.
for line in io.stdin:lines() do
    local ch, desc, category, bidi, decimal, olddesc, upper, lower =
        line:match(RECORD)
    local code = ch and tonumber(ch, 16)

    if code then
        category = "UC_" .. category
        bidi = "UBIDI_" .. bidi
        decimal = mytonumber(decimal)
        -- control characters carry their name in the old-name field
        if desc == "<control>" then desc = olddesc end
        upper = mytonumber(upper, 16)
        lower = mytonumber(lower, 16)

        -- lowercase the name and keep it a valid C string literal
        desc = desc:lower():gsub('[\\"]', '\\%0')

        print(string.format("\t[%5d] = { %s, %s, %d, \"%s\", %d, %d },",
            code, category, bidi, decimal, desc, upper, lower))
    elseif line ~= "" then
        -- report records that do not have the expected layout instead of
        -- aborting with an unrelated error from tonumber/string.format
        io.stderr:write("gencharsh: skipping malformed line: ", line, "\n")
    end
end

print("};")
42 | 


--------------------------------------------------------------------------------
/tool/install.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/sh
 2 | 
 3 | die() {
 4 |     printf '%s\n' "$1"; exit 1
 5 | }
 6 | 
 7 | [ -z "$1" ]      && die "usage: $0 [destination]"
 8 | [ -f "chmap" ]   || die "can't find chmap"
 9 | [ -f "chmap.1" ] || die "can't find chmap's db"
10 | 
11 | install -Dm755 chmap    "$1/bin/chmap"
12 | install -Dm644 chmap.1  "$1/share/man/man1/chmap.1"
13 | 


--------------------------------------------------------------------------------