├── LICENSE ├── UnicodeData.txt ├── go.mod └── unicode.go /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2012 The Go Authors. All rights reserved. 2 | 3 | Redistribution and use in source and binary forms, with or without 4 | modification, are permitted provided that the following conditions are 5 | met: 6 | 7 | * Redistributions of source code must retain the above copyright 8 | notice, this list of conditions and the following disclaimer. 9 | * Redistributions in binary form must reproduce the above 10 | copyright notice, this list of conditions and the following disclaimer 11 | in the documentation and/or other materials provided with the 12 | distribution. 13 | * The names of its contributors may not be used to endorse or 14 | promote products derived from this software without specific prior 15 | written permission. 16 | 17 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 18 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 19 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 20 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 21 | OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 22 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 23 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 24 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 25 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 26 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 27 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
28 | -------------------------------------------------------------------------------- /go.mod: -------------------------------------------------------------------------------- 1 | module robpike.io/cmd/unicode 2 | 3 | go 1.16 4 | 5 | -------------------------------------------------------------------------------- /unicode.go: -------------------------------------------------------------------------------- 1 | // Copyright 2012 The Go Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | /* 6 | Unicode is a command-line tool for studying Unicode characters. 7 | 8 | usage: unicode [-c] [-d] [-g] [-n] [-t] [-u] [-U] args... 9 | 10 | -c: args are hex; output characters (xyz) 11 | -n: args are characters; output hex (23 or 23-44) 12 | -g: args are regular expressions for matching names 13 | -d: output textual description 14 | -t: output plain text, not one char per line 15 | -U: output full Unicode description 16 | 17 | Default behavior sniffs the arguments to select -c vs. -n. 
18 | */ 19 | package main // import "robpike.io/cmd/unicode" 20 | 21 | import ( 22 | "bytes" 23 | _ "embed" 24 | "flag" 25 | "fmt" 26 | "os" 27 | "regexp" 28 | "strconv" 29 | "strings" 30 | ) 31 | 32 | var ( 33 | doNum = flag.Bool("n", false, "output numeric values") 34 | doChar = flag.Bool("c", false, "output characters") 35 | doText = flag.Bool("t", false, "output plain text") 36 | doDesc = flag.Bool("d", false, "describe the characters from the Unicode database, in simple form") 37 | doUnic = flag.Bool("u", false, "describe the characters from the Unicode database, in Unicode form") 38 | doUNIC = flag.Bool("U", false, "describe the characters from the Unicode database, in glorious detail") 39 | doGrep = flag.Bool("g", false, "grep for argument string in data") 40 | ) 41 | 42 | var printRange = false 43 | 44 | //go:generate sh -c "curl http://ftp.unicode.org/Public/UNIDATA/UnicodeData.txt >UnicodeData.txt" 45 | var ( 46 | //go:embed UnicodeData.txt 47 | unicodeDataTxt string 48 | unicodeLines = splitLines(unicodeDataTxt) 49 | ) 50 | 51 | func main() { 52 | flag.Usage = usage 53 | flag.Parse() 54 | mode() 55 | var codes []rune 56 | switch { 57 | case *doGrep: 58 | codes = argsAreRegexps() 59 | case *doChar: 60 | codes = argsAreNumbers() 61 | case *doNum: 62 | codes = argsAreChars() 63 | } 64 | if *doUnic || *doUNIC || *doDesc { 65 | desc(codes) 66 | return 67 | } 68 | if *doText { 69 | fmt.Printf("%s\n", string(codes)) 70 | return 71 | } 72 | b := new(bytes.Buffer) 73 | for i, c := range codes { 74 | switch { 75 | case printRange: 76 | fmt.Fprintf(b, "%.4x %c", c, c) 77 | if i%4 == 3 { 78 | fmt.Fprint(b, "\n") 79 | } else { 80 | fmt.Fprint(b, "\t") 81 | } 82 | case *doChar: 83 | fmt.Fprintf(b, "%c\n", c) 84 | case *doNum: 85 | fmt.Fprintf(b, "%.4x\n", c) 86 | } 87 | } 88 | if b.Len() > 0 && b.Bytes()[b.Len()-1] != '\n' { 89 | fmt.Fprint(b, "\n") 90 | } 91 | fmt.Print(b) 92 | } 93 | 94 | func fatalf(format string, args ...interface{}) { 95 | if 
!strings.HasSuffix(format, "\n") { 96 | format += "\n" 97 | } 98 | fmt.Fprintf(os.Stderr, format, args...) 99 | os.Exit(2) 100 | } 101 | 102 | const usageText = `usage: unicode [-c] [-d] [-n] [-t] 103 | -c: args are hex; output characters (xyz) 104 | -n: args are characters; output hex (23 or 23-44) 105 | -g: args are regular expressions for matching names 106 | -d: output textual description 107 | -t: output plain text, not one char per line 108 | -U: output full Unicode description 109 | 110 | Default behavior sniffs the arguments to select -c vs. -n. 111 | ` 112 | 113 | func usage() { 114 | fatalf("%s", usageText) 115 | } 116 | 117 | // Mode determines whether we have numeric or character input. 118 | // If there are no flags, we sniff the first argument. 119 | func mode() { 120 | if len(flag.Args()) == 0 { 121 | usage() 122 | } 123 | // If grepping names, we need an output format defined; default is numeric. 124 | if *doGrep && !(*doNum || *doChar || *doDesc || *doUnic || *doUNIC) { 125 | *doNum = true 126 | } 127 | if *doNum || *doChar { 128 | return 129 | } 130 | alldigits := true 131 | numDash := 0 132 | for _, r := range strings.Join(flag.Args(), "") { 133 | if !strings.ContainsRune("0123456789abcdefABCDEF-", r) { 134 | alldigits = false 135 | } 136 | if r == '-' { 137 | numDash++ 138 | } 139 | } 140 | // If there is one '-' it's a range; if zero it's just a hex number. 141 | if alldigits && numDash <= 1 { 142 | *doChar = true 143 | return 144 | } 145 | *doNum = true 146 | } 147 | 148 | func argsAreChars() []rune { 149 | var codes []rune 150 | for i, a := range flag.Args() { 151 | for _, r := range a { 152 | codes = append(codes, r) 153 | } 154 | // Add space between arguments if output is plain text. 
155 | if *doText && i < len(flag.Args())-1 { 156 | codes = append(codes, ' ') 157 | } 158 | } 159 | return codes 160 | } 161 | 162 | func argsAreNames() []rune { 163 | var codes []rune 164 | for i, a := range flag.Args() { 165 | for _, r := range a { 166 | codes = append(codes, r) 167 | } 168 | // Add space between arguments if output is plain text. 169 | if *doText && i < len(flag.Args())-1 { 170 | codes = append(codes, ' ') 171 | } 172 | } 173 | return codes 174 | } 175 | 176 | func parseRune(s string) rune { 177 | r, err := strconv.ParseInt(s, 16, 22) 178 | if err != nil { 179 | fatalf("%s", err) 180 | } 181 | return rune(r) 182 | } 183 | 184 | func argsAreNumbers() []rune { 185 | var codes []rune 186 | for _, a := range flag.Args() { 187 | if s := strings.Split(a, "-"); len(s) == 2 { 188 | printRange = true 189 | r1 := parseRune(s[0]) 190 | r2 := parseRune(s[1]) 191 | if r2 < r1 { 192 | usage() 193 | } 194 | for ; r1 <= r2; r1++ { 195 | codes = append(codes, r1) 196 | } 197 | continue 198 | } 199 | codes = append(codes, parseRune(a)) 200 | } 201 | return codes 202 | } 203 | 204 | func argsAreRegexps() []rune { 205 | var codes []rune 206 | for _, a := range flag.Args() { 207 | re, err := regexp.Compile(a) 208 | if err != nil { 209 | fatalf("%s", err) 210 | } 211 | for i, line := range unicodeLines { 212 | fields := strings.Split(strings.ToLower(line), ";") 213 | line = fields[0] + "\t" + fields[1] 214 | if fields[10] != "" { 215 | line += "; " + fields[10] 216 | } 217 | if re.MatchString(line) { 218 | r, _ := runeOfLine(i, line) 219 | codes = append(codes, r) 220 | } 221 | } 222 | } 223 | return codes 224 | } 225 | 226 | func splitLines(text string) []string { 227 | lines := strings.Split(text, "\n") 228 | // We get an empty final line; drop it. 
229 | if len(lines) > 0 && len(lines[len(lines)-1]) == 0 { 230 | lines = lines[:len(lines)-1] 231 | } 232 | return lines 233 | } 234 | 235 | func runeOfLine(i int, line string) (r rune, tab int) { 236 | tab = strings.IndexAny(line, "\t;") 237 | if tab < 0 { 238 | fatalf("malformed database: line %d", i) 239 | } 240 | return parseRune(line[0:tab]), tab 241 | } 242 | 243 | func desc(codes []rune) { 244 | runeData := make(map[rune]string) 245 | for i, l := range unicodeLines { 246 | r, tab := runeOfLine(i, l) 247 | runeData[r] = l[tab+1:] 248 | } 249 | if *doUNIC { 250 | for _, r := range codes { 251 | fmt.Printf("%#U %s", r, dumpUnicode(runeData[r])) 252 | } 253 | } else if *doUnic { 254 | for _, r := range codes { 255 | fmt.Printf("%#U %s\n", r, runeData[r]) 256 | } 257 | } else { 258 | for _, r := range codes { 259 | fields := strings.Split(strings.ToLower(runeData[r]), ";") 260 | desc := fields[0] 261 | if len(desc) >= 9 && fields[9] != "" { 262 | desc += "; " + fields[9] 263 | } 264 | fmt.Printf("%#U %s\n", r, desc) 265 | } 266 | } 267 | } 268 | 269 | var prop = [...]string{ 270 | "", 271 | "category: ", 272 | "canonical combining classes: ", 273 | "bidirectional category: ", 274 | "character decomposition mapping: ", 275 | "decimal digit value: ", 276 | "digit value: ", 277 | "numeric value: ", 278 | "mirrored: ", 279 | "Unicode 1.0 name: ", 280 | "10646 comment field: ", 281 | "uppercase mapping: ", 282 | "lowercase mapping: ", 283 | "titlecase mapping: ", 284 | } 285 | 286 | func dumpUnicode(s string) []byte { 287 | fields := strings.Split(s, ";") 288 | if len(fields) == 0 { 289 | return []byte{'\n'} 290 | } 291 | b := new(bytes.Buffer) 292 | if len(fields) != len(prop) { 293 | fmt.Fprintf(b, "%s: can't print: expected %d fields, got %d\n", s, len(prop), len(fields)) 294 | return b.Bytes() 295 | } 296 | for i, f := range fields { 297 | if f == "" { 298 | continue 299 | } 300 | if i > 0 { 301 | b.WriteByte('\t') 302 | } 303 | fmt.Fprintf(b, "%s%s\n", prop[i], 
f) 304 | } 305 | return b.Bytes() 306 | } 307 | --------------------------------------------------------------------------------