├── LICENSE
├── UnicodeData.txt
├── go.mod
└── unicode.go


/LICENSE:
--------------------------------------------------------------------------------
 1 | Copyright (c) 2012 The Go Authors. All rights reserved.
 2 | 
 3 | Redistribution and use in source and binary forms, with or without
 4 | modification, are permitted provided that the following conditions are
 5 | met:
 6 | 
 7 |    * Redistributions of source code must retain the above copyright
 8 | notice, this list of conditions and the following disclaimer.
 9 |    * Redistributions in binary form must reproduce the above
10 | copyright notice, this list of conditions and the following disclaimer
11 | in the documentation and/or other materials provided with the
12 | distribution.
13 |    * The names of its contributors may not be used to endorse or
14 | promote products derived from this software without specific prior
15 | written permission.
16 | 
17 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
19 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
20 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
21 | OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
22 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
23 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
24 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
25 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 | 


--------------------------------------------------------------------------------
/go.mod:
--------------------------------------------------------------------------------
1 | module robpike.io/cmd/unicode
2 | 
3 | go 1.16
4 | 
5 | 


--------------------------------------------------------------------------------
/unicode.go:
--------------------------------------------------------------------------------
  1 | // Copyright 2012 The Go Authors. All rights reserved.
  2 | // Use of this source code is governed by a BSD-style
  3 | // license that can be found in the LICENSE file.
  4 | 
  5 | /*
  6 | Unicode is a command-line tool for studying Unicode characters.
  7 | 
  8 | usage: unicode [-c] [-d] [-g] [-n] [-t] [-u] [-U]
  9 | 
 10 | 	-c: args are hex; output characters (xyz)
 11 | 	-n: args are characters; output hex (23 or 23-44)
 12 | 	-g: args are regular expressions for matching names
 13 | 	-d: output textual description
 14 | 	-t: output plain text, not one char per line
 15 | 	-U: output full Unicode description
 16 | 
 17 | Default behavior sniffs the arguments to select -c vs. -n.
 18 | */
 19 | package main // import "robpike.io/cmd/unicode"
 20 | 
 21 | import (
 22 | 	"bytes"
 23 | 	_ "embed"
 24 | 	"flag"
 25 | 	"fmt"
 26 | 	"os"
 27 | 	"regexp"
 28 | 	"strconv"
 29 | 	"strings"
 30 | )
 31 | 
// Command-line flags.
//
// main picks the input interpretation from -g (arguments are regular
// expressions), -c (arguments are hex numbers) and -n (arguments are
// characters), in that order of precedence; when neither -c nor -n is given,
// mode chooses one of them. -d, -u and -U send the resulting code points to
// desc, and -t prints them as a single line of text.
var (
	doNum  = flag.Bool("n", false, "output numeric values")
	doChar = flag.Bool("c", false, "output characters")
	doText = flag.Bool("t", false, "output plain text")
	doDesc = flag.Bool("d", false, "describe the characters from the Unicode database, in simple form")
	doUnic = flag.Bool("u", false, "describe the characters from the Unicode database, in Unicode form")
	doUNIC = flag.Bool("U", false, "describe the characters from the Unicode database, in glorious detail")
	doGrep = flag.Bool("g", false, "grep for argument string in data")
)
 41 | 
// printRange is set by argsAreNumbers when an argument is a hex range
// (lo-hi). When it is set, main prints every code point as "hex char",
// four entries per output line.
var printRange = false
 43 | 
// The contents of UnicodeData.txt are embedded into the binary at build time.
// The go:generate directive below re-downloads the file with curl.
//
//go:generate sh -c "curl http://ftp.unicode.org/Public/UNIDATA/UnicodeData.txt >UnicodeData.txt"
var (
	// unicodeDataTxt is the raw text of the embedded UnicodeData.txt.
	//go:embed UnicodeData.txt
	unicodeDataTxt string
	unicodeLines   = splitLines(unicodeDataTxt) // unicodeDataTxt split into lines, computed once at package initialization
)
 50 | 
 51 | func main() {
 52 | 	flag.Usage = usage
 53 | 	flag.Parse()
 54 | 	mode()
 55 | 	var codes []rune
 56 | 	switch {
 57 | 	case *doGrep:
 58 | 		codes = argsAreRegexps()
 59 | 	case *doChar:
 60 | 		codes = argsAreNumbers()
 61 | 	case *doNum:
 62 | 		codes = argsAreChars()
 63 | 	}
 64 | 	if *doUnic || *doUNIC || *doDesc {
 65 | 		desc(codes)
 66 | 		return
 67 | 	}
 68 | 	if *doText {
 69 | 		fmt.Printf("%s\n", string(codes))
 70 | 		return
 71 | 	}
 72 | 	b := new(bytes.Buffer)
 73 | 	for i, c := range codes {
 74 | 		switch {
 75 | 		case printRange:
 76 | 			fmt.Fprintf(b, "%.4x %c", c, c)
 77 | 			if i%4 == 3 {
 78 | 				fmt.Fprint(b, "\n")
 79 | 			} else {
 80 | 				fmt.Fprint(b, "\t")
 81 | 			}
 82 | 		case *doChar:
 83 | 			fmt.Fprintf(b, "%c\n", c)
 84 | 		case *doNum:
 85 | 			fmt.Fprintf(b, "%.4x\n", c)
 86 | 		}
 87 | 	}
 88 | 	if b.Len() > 0 && b.Bytes()[b.Len()-1] != '\n' {
 89 | 		fmt.Fprint(b, "\n")
 90 | 	}
 91 | 	fmt.Print(b)
 92 | }
 93 | 
 94 | func fatalf(format string, args ...interface{}) {
 95 | 	if !strings.HasSuffix(format, "\n") {
 96 | 		format += "\n"
 97 | 	}
 98 | 	fmt.Fprintf(os.Stderr, format, args...)
 99 | 	os.Exit(2)
100 | }
101 | 
102 | const usageText = `usage: unicode [-c] [-d] [-n] [-t]
103 | -c: args are hex; output characters (xyz)
104 | -n: args are characters; output hex (23 or 23-44)
105 | -g: args are regular expressions for matching names
106 | -d: output textual description
107 | -t: output plain text, not one char per line
108 | -U: output full Unicode description
109 | 
110 | Default behavior sniffs the arguments to select -c vs. -n.
111 | `
112 | 
113 | func usage() {
114 | 	fatalf("%s", usageText)
115 | }
116 | 
117 | // Mode determines whether we have numeric or character input.
118 | // If there are no flags, we sniff the first argument.
119 | func mode() {
120 | 	if len(flag.Args()) == 0 {
121 | 		usage()
122 | 	}
123 | 	// If grepping names, we need an output format defined; default is numeric.
124 | 	if *doGrep && !(*doNum || *doChar || *doDesc || *doUnic || *doUNIC) {
125 | 		*doNum = true
126 | 	}
127 | 	if *doNum || *doChar {
128 | 		return
129 | 	}
130 | 	alldigits := true
131 | 	numDash := 0
132 | 	for _, r := range strings.Join(flag.Args(), "") {
133 | 		if !strings.ContainsRune("0123456789abcdefABCDEF-", r) {
134 | 			alldigits = false
135 | 		}
136 | 		if r == '-' {
137 | 			numDash++
138 | 		}
139 | 	}
140 | 	// If there is one '-' it's a range; if zero it's just a hex number.
141 | 	if alldigits && numDash <= 1 {
142 | 		*doChar = true
143 | 		return
144 | 	}
145 | 	*doNum = true
146 | }
147 | 
148 | func argsAreChars() []rune {
149 | 	var codes []rune
150 | 	for i, a := range flag.Args() {
151 | 		for _, r := range a {
152 | 			codes = append(codes, r)
153 | 		}
154 | 		// Add space between arguments if output is plain text.
155 | 		if *doText && i < len(flag.Args())-1 {
156 | 			codes = append(codes, ' ')
157 | 		}
158 | 	}
159 | 	return codes
160 | }
161 | 
162 | func argsAreNames() []rune {
163 | 	var codes []rune
164 | 	for i, a := range flag.Args() {
165 | 		for _, r := range a {
166 | 			codes = append(codes, r)
167 | 		}
168 | 		// Add space between arguments if output is plain text.
169 | 		if *doText && i < len(flag.Args())-1 {
170 | 			codes = append(codes, ' ')
171 | 		}
172 | 	}
173 | 	return codes
174 | }
175 | 
176 | func parseRune(s string) rune {
177 | 	r, err := strconv.ParseInt(s, 16, 22)
178 | 	if err != nil {
179 | 		fatalf("%s", err)
180 | 	}
181 | 	return rune(r)
182 | }
183 | 
184 | func argsAreNumbers() []rune {
185 | 	var codes []rune
186 | 	for _, a := range flag.Args() {
187 | 		if s := strings.Split(a, "-"); len(s) == 2 {
188 | 			printRange = true
189 | 			r1 := parseRune(s[0])
190 | 			r2 := parseRune(s[1])
191 | 			if r2 < r1 {
192 | 				usage()
193 | 			}
194 | 			for ; r1 <= r2; r1++ {
195 | 				codes = append(codes, r1)
196 | 			}
197 | 			continue
198 | 		}
199 | 		codes = append(codes, parseRune(a))
200 | 	}
201 | 	return codes
202 | }
203 | 
204 | func argsAreRegexps() []rune {
205 | 	var codes []rune
206 | 	for _, a := range flag.Args() {
207 | 		re, err := regexp.Compile(a)
208 | 		if err != nil {
209 | 			fatalf("%s", err)
210 | 		}
211 | 		for i, line := range unicodeLines {
212 | 			fields := strings.Split(strings.ToLower(line), ";")
213 | 			line = fields[0] + "\t" + fields[1]
214 | 			if fields[10] != "" {
215 | 				line += "; " + fields[10]
216 | 			}
217 | 			if re.MatchString(line) {
218 | 				r, _ := runeOfLine(i, line)
219 | 				codes = append(codes, r)
220 | 			}
221 | 		}
222 | 	}
223 | 	return codes
224 | }
225 | 
// splitLines splits text into lines on "\n". The empty element produced by a
// trailing newline is dropped, and a trailing "\r" is removed from every line
// so that a CRLF copy of the data yields the same lines as an LF copy.
// Empty lines in the middle of text are preserved.
func splitLines(text string) []string {
	lines := strings.Split(text, "\n")
	// We get an empty final line; drop it.
	if n := len(lines); n > 0 && lines[n-1] == "" {
		lines = lines[:n-1]
	}
	for i, l := range lines {
		lines[i] = strings.TrimSuffix(l, "\r")
	}
	return lines
}
234 | 
235 | func runeOfLine(i int, line string) (r rune, tab int) {
236 | 	tab = strings.IndexAny(line, "\t;")
237 | 	if tab < 0 {
238 | 		fatalf("malformed database: line %d", i)
239 | 	}
240 | 	return parseRune(line[0:tab]), tab
241 | }
242 | 
243 | func desc(codes []rune) {
244 | 	runeData := make(map[rune]string)
245 | 	for i, l := range unicodeLines {
246 | 		r, tab := runeOfLine(i, l)
247 | 		runeData[r] = l[tab+1:]
248 | 	}
249 | 	if *doUNIC {
250 | 		for _, r := range codes {
251 | 			fmt.Printf("%#U %s", r, dumpUnicode(runeData[r]))
252 | 		}
253 | 	} else if *doUnic {
254 | 		for _, r := range codes {
255 | 			fmt.Printf("%#U %s\n", r, runeData[r])
256 | 		}
257 | 	} else {
258 | 		for _, r := range codes {
259 | 			fields := strings.Split(strings.ToLower(runeData[r]), ";")
260 | 			desc := fields[0]
261 | 			if len(desc) >= 9 && fields[9] != "" {
262 | 				desc += "; " + fields[9]
263 | 			}
264 | 			fmt.Printf("%#U %s\n", r, desc)
265 | 		}
266 | 	}
267 | }
268 | 
// prop holds the label printed in front of each field by dumpUnicode,
// indexed by field position. The first field is printed without a label.
var prop = [...]string{
	"",
	"category: ",
	"canonical combining classes: ",
	"bidirectional category: ",
	"character decomposition mapping: ",
	"decimal digit value: ",
	"digit value: ",
	"numeric value: ",
	"mirrored: ",
	"Unicode 1.0 name: ",
	"10646 comment field: ",
	"uppercase mapping: ",
	"lowercase mapping: ",
	"titlecase mapping: ",
}

// dumpUnicode formats s, a semicolon-separated list of len(prop) fields, as
// one labelled line per non-empty field. Every line after the first field is
// indented with a tab.
//
// An empty s yields a single newline. When the number of fields differs from
// len(prop), the result is a one-line diagnostic instead.
func dumpUnicode(s string) []byte {
	// strings.Split never returns an empty slice for a non-empty separator,
	// so "no data" has to be detected on the input string itself.
	if s == "" {
		return []byte{'\n'}
	}
	fields := strings.Split(s, ";")
	b := new(bytes.Buffer)
	if len(fields) != len(prop) {
		fmt.Fprintf(b, "%s: can't print: expected %d fields, got %d\n", s, len(prop), len(fields))
		return b.Bytes()
	}
	for i, f := range fields {
		if f == "" {
			continue
		}
		if i > 0 {
			b.WriteByte('\t')
		}
		fmt.Fprintf(b, "%s%s\n", prop[i], f)
	}
	return b.Bytes()
}
307 | 


--------------------------------------------------------------------------------