├── .gitignore ├── screenshots ├── blanks.png ├── emoji.png ├── marks.png ├── tamil.png ├── thai.png ├── strange.png └── ideographic_variations.png ├── Gemfile ├── lib ├── uniscribe │ ├── version.rb │ └── kernel_method.rb └── uniscribe.rb ├── MIT-LICENSE.txt ├── Rakefile ├── .github └── workflows │ └── test.yml ├── uniscribe.gemspec ├── CHANGELOG.md ├── bin └── uniscribe ├── CODE_OF_CONDUCT.md ├── README.md └── spec └── uniscribe_spec.rb /.gitignore: -------------------------------------------------------------------------------- 1 | Gemfile.lock 2 | /pkg 3 | -------------------------------------------------------------------------------- /screenshots/blanks.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/janlelis/uniscribe/HEAD/screenshots/blanks.png -------------------------------------------------------------------------------- /screenshots/emoji.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/janlelis/uniscribe/HEAD/screenshots/emoji.png -------------------------------------------------------------------------------- /screenshots/marks.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/janlelis/uniscribe/HEAD/screenshots/marks.png -------------------------------------------------------------------------------- /screenshots/tamil.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/janlelis/uniscribe/HEAD/screenshots/tamil.png -------------------------------------------------------------------------------- /screenshots/thai.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/janlelis/uniscribe/HEAD/screenshots/thai.png -------------------------------------------------------------------------------- 
/screenshots/strange.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/janlelis/uniscribe/HEAD/screenshots/strange.png -------------------------------------------------------------------------------- /screenshots/ideographic_variations.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/janlelis/uniscribe/HEAD/screenshots/ideographic_variations.png -------------------------------------------------------------------------------- /Gemfile: -------------------------------------------------------------------------------- 1 | source "https://rubygems.org" 2 | 3 | gemspec 4 | 5 | gem "minitest" 6 | gem "rake" 7 | gem "irb" unless RUBY_ENGINE == "jruby" 8 | -------------------------------------------------------------------------------- /lib/uniscribe/version.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | module Uniscribe 4 | VERSION = "1.12.0" 5 | 6 | UNICODE_VERSION = "17.0.0" 7 | EMOJI_VERSION = "17.0" 8 | end 9 | -------------------------------------------------------------------------------- /lib/uniscribe/kernel_method.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require_relative '../uniscribe' 4 | 5 | module Kernel 6 | private 7 | 8 | def uniscribe(string, **kwargs) 9 | Uniscribe.of(string, **kwargs) 10 | end 11 | end 12 | -------------------------------------------------------------------------------- /MIT-LICENSE.txt: -------------------------------------------------------------------------------- 1 | Copyright (c) 2017-2024 Jan Lelis, https://janlelis.com 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining 4 | a copy of this software and associated documentation files (the 5 | "Software"), to deal in the Software without restriction, 
including 6 | without limitation the rights to use, copy, modify, merge, publish, 7 | distribute, sublicense, and/or sell copies of the Software, and to 8 | permit persons to whom the Software is furnished to do so, subject to 9 | the following conditions: 10 | 11 | The above copyright notice and this permission notice shall be 12 | included in all copies or substantial portions of the Software. 13 | 14 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 15 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 16 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 17 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 18 | LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 19 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 20 | WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 21 | -------------------------------------------------------------------------------- /Rakefile: -------------------------------------------------------------------------------- 1 | # # # 2 | # Get gemspec info 3 | 4 | gemspec_file = Dir['*.gemspec'].first 5 | gemspec = eval File.read(gemspec_file), binding, gemspec_file 6 | info = "#{gemspec.name} | #{gemspec.version} | " \ 7 | "#{gemspec.runtime_dependencies.size} dependencies | " \ 8 | "#{gemspec.files.size} files" 9 | 10 | # # # 11 | # Gem build and install task 12 | 13 | desc info 14 | task :gem do 15 | puts info + "\n\n" 16 | print " "; sh "gem build #{gemspec_file}" 17 | FileUtils.mkdir_p 'pkg' 18 | FileUtils.mv "#{gemspec.name}-#{gemspec.version}.gem", 'pkg' 19 | puts; sh %{gem install --no-document pkg/#{gemspec.name}-#{gemspec.version}.gem} 20 | end 21 | 22 | # # # 23 | # Start an IRB session with the gem loaded 24 | 25 | desc "#{gemspec.name} | IRB" 26 | task :irb do 27 | sh "irb -I ./lib -r #{gemspec.name.gsub '-','/'}/kernel_method" 28 | end 29 | 30 | # # # 31 | # Run specs 32 | 33 | desc "#{gemspec.name} | Spec" 34 | 
task :spec do 35 | if RbConfig::CONFIG['host_os'] =~ /mswin|mingw/ 36 | sh "for %f in (spec/\*.rb) do ruby spec/%f" 37 | else 38 | sh "for file in spec/*.rb; do ruby $file; done" 39 | end 40 | end 41 | task default: :spec 42 | 43 | -------------------------------------------------------------------------------- /.github/workflows/test.yml: -------------------------------------------------------------------------------- 1 | name: Test 2 | 3 | on: [push, pull_request] 4 | 5 | jobs: 6 | test: 7 | name: Ruby ${{ matrix.ruby }} (${{ matrix.os }}) 8 | if: "!contains(github.event.head_commit.message, '[skip ci]')" 9 | strategy: 10 | matrix: 11 | ruby: 12 | - '3.3' 13 | - '3.2' 14 | - '3.1' 15 | - '3.0' 16 | - jruby 17 | - truffleruby 18 | os: 19 | - ubuntu-latest 20 | - macos-latest 21 | runs-on: ${{matrix.os}} 22 | steps: 23 | - uses: actions/checkout@v2 24 | - name: Set up Ruby 25 | uses: ruby/setup-ruby@v1 26 | with: 27 | ruby-version: ${{matrix.ruby}} 28 | bundler-cache: true 29 | - name: Run tests 30 | run: bundle exec rake 31 | 32 | test-windows: 33 | name: Ruby ${{ matrix.ruby }} (windows-latest) 34 | if: "!contains(github.event.head_commit.message, '[skip ci]')" 35 | strategy: 36 | matrix: 37 | ruby: 38 | - '3.4' 39 | - '3.3' 40 | - '3.2' 41 | - '3.1' 42 | - '3.0' 43 | - jruby 44 | runs-on: windows-latest 45 | steps: 46 | - uses: actions/checkout@v2 47 | - name: Set up Ruby 48 | uses: ruby/setup-ruby@v1 49 | with: 50 | ruby-version: ${{matrix.ruby}} 51 | bundler-cache: true 52 | - name: Run tests 53 | run: bundle exec rake 54 | -------------------------------------------------------------------------------- /uniscribe.gemspec: -------------------------------------------------------------------------------- 1 | # -*- encoding: utf-8 -*- 2 | 3 | require File.dirname(__FILE__) + "/lib/uniscribe/version" 4 | 5 | Gem::Specification.new do |gem| 6 | gem.name = "uniscribe" 7 | gem.version = Uniscribe::VERSION 8 | gem.summary = "Describes Unicode characters" 9 | 
gem.description = "Explains Unicode characters/code points: Displays their name, category, and shows compositions" 10 | gem.authors = ["Jan Lelis"] 11 | gem.email = ["hi@ruby.consulting"] 12 | gem.homepage = "https://github.com/janlelis/uniscribe" 13 | gem.license = "MIT" 14 | 15 | gem.files = Dir["{**/}{.*,*}"].select{ |path| File.file?(path) && path !~ /^(pkg|screenshots)/} 16 | gem.executables = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) } 17 | gem.test_files = gem.files.grep(%r{^(test|spec|features)/}) 18 | gem.require_paths = ["lib"] 19 | gem.metadata = { "rubygems_mfa_required" => "true" } 20 | 21 | gem.required_ruby_version = ">= 2.1" 22 | gem.add_dependency "unicode-name", "~> 1.14" 23 | gem.add_dependency "unicode-sequence_name", "~> 1.16" 24 | gem.add_dependency "unicode-display_width", "~> 3.2" 25 | gem.add_dependency "unicode-emoji", "~> 4.1" 26 | gem.add_dependency "unicode-version", "~> 1.4" 27 | gem.add_dependency "symbolify", "~> 1.4" 28 | gem.add_dependency "characteristics", "~> 1.8" 29 | gem.add_dependency "paint", ">= 0.9", "< 3.0" 30 | gem.add_dependency "rationalist", "~> 2.0", ">= 2.0.1" 31 | end 32 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | ## CHANGELOG 2 | 3 | ### 1.12.0 4 | 5 | - Bump dependencies to Unicode 17.0 6 | 7 | ### 1.11.1 8 | 9 | - Allow unicode-display_width and unicode-emoji to be upgraded to next major version 10 | 11 | ### 1.11.0 12 | 13 | - Bump unicode-name and unicode-sequence_name: 14 | Fixed and improved character names, see respective changelogs 15 | 16 | ### 1.10.0 17 | 18 | - Update Unicode data to 16.0 (except for grapheme detection, which depends on Ruby's version) 19 | - Update Emoji data to 16.0 20 | 21 | ### 1.9.0 22 | 23 | - Update Unicode data to 15.1 (except for grapheme detection, which depends on Ruby's version) 24 | - Update Emoji data to 15.1 25 | 26 | ### 
1.8.0 27 | 28 | - Update Unicode data to 15.0 (except for grapheme detection, which depends on Ruby's version) 29 | - Update Emoji data to 15.0 30 | 31 | ### 1.7.0 32 | 33 | - Update Unicode data to 14.0 (except for grapheme detection, which depends on Ruby's version) 34 | - Update Emoji data to 14.0 35 | 36 | ### 1.6.0 37 | 38 | - Update Unicode data to 13.0 (except for grapheme detection, which depends on Ruby's version) 39 | - Update Emoji data to 13.1 40 | 41 | ### 1.5.1 42 | 43 | - Move unicode-version from version.rb to uniscribe.rb to fix gemspec loading 44 | 45 | ### 1.5.0 46 | 47 | - Update Emoji data to 13.0 48 | - Use unicode-version gem for Ruby / Unicode mapping 49 | - Fix Object#=~ deprecation warning 50 | 51 | ### 1.4.0 52 | 53 | - Update Unicode (and Emoji) data to 12.1 (except for grapheme detection) 54 | 55 | ### 1.3.0 56 | 57 | - Update Unicode data to 12.0 (except for grapheme detection) 58 | 59 | ### 1.2.0 60 | 61 | - Update Unicode data to 11.0 (except for grapheme detection) 62 | 63 | ### 1.1.0 64 | 65 | - Update Unicode data to 10.0 (except for grapheme detection) 66 | 67 | ### 1.0.0 68 | 69 | - Initial release 70 | -------------------------------------------------------------------------------- /bin/uniscribe: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | # frozen_string_literal: true 3 | 4 | require "rationalist" 5 | require "uniscribe" 6 | 7 | argv = Rationalist.parse( 8 | ARGV, 9 | string: '_', 10 | alias: { 11 | e: 'encoding', 12 | v: 'version', 13 | }, 14 | boolean: [ 15 | 'help', 16 | 'version', 17 | 'wide-ambiguous', 18 | ] 19 | ) 20 | 21 | if argv[:version] 22 | puts "uniscribe #{Uniscribe::VERSION} by #{Paint["J-_-L", :bold]} " 23 | puts "Unicode version is #{Uniscribe::UNICODE_VERSION} (glyph detection #{Uniscribe::UNICODE_VERSION_GLYPH_DETECTION || "[not supported]"})" 24 | puts "Emoji version is #{Uniscribe::EMOJI_VERSION}" 25 | exit(0) 26 | end 27 | 28 | if 
argv[:help] 29 | puts <<-HELP 30 | 31 | #{Paint["DESCRIPTION", :underline]} 32 | 33 | Describes a string of Unicode characters with their name and shows compositions. 34 | 35 | #{Paint["USAGE", :underline]} 36 | 37 | #{Paint["uniscribe", :bold]} [options] data 38 | 39 | --encoding | -e | which (Unicode) encoding to use for given data 40 | --help | | this help page 41 | --version | -v | displays version of uniscribe 42 | --wide-ambiguous | | ambiguous characters 43 | 44 | #{Paint["COLOR CODING", :underline]} 45 | 46 | #{Paint["blank", Uniscribe::COLORS[:blank]]} 47 | #{Paint["control", Uniscribe::COLORS[:control]]} 48 | #{Paint["format", Uniscribe::COLORS[:format]]} 49 | #{Paint["mark", Uniscribe::COLORS[:mark]]} 50 | #{Paint["unassigned", Uniscribe::COLORS[:unassigned]]} 51 | #{Paint["unassigned and ignorable", Uniscribe::COLORS[:ignorable]]} 52 | 53 | random color for other characters and compositions 54 | 55 | #{Paint["MORE INFO", :underline]} 56 | 57 | https://github.com/janlelis/uniscribe 58 | 59 | HELP 60 | exit(0) 61 | end 62 | 63 | if argv[:_] && argv[:_][0] 64 | data = argv[:_][0] 65 | elsif !$stdin.tty? 66 | data = $stdin.read 67 | else 68 | data = nil 69 | end 70 | 71 | begin 72 | Uniscribe.of(data) 73 | rescue ArgumentError 74 | $stderr.puts Paint[$!.message, :red] 75 | exit(1) 76 | end 77 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Contributor Covenant Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | In the interest of fostering an open and welcoming environment, we as 6 | contributors and maintainers pledge to making participation in our project and 7 | our community a harassment-free experience for everyone, regardless of age, body 8 | size, disability, ethnicity, gender identity and expression, level of experience, 9 | nationality, personal appearance, race, religion, or sexual identity and 10 | orientation. 
11 | 12 | ## Our Standards 13 | 14 | Examples of behavior that contributes to creating a positive environment 15 | include: 16 | 17 | * Using welcoming and inclusive language 18 | * Being respectful of differing viewpoints and experiences 19 | * Gracefully accepting constructive criticism 20 | * Focusing on what is best for the community 21 | * Showing empathy towards other community members 22 | 23 | Examples of unacceptable behavior by participants include: 24 | 25 | * The use of sexualized language or imagery and unwelcome sexual attention or 26 | advances 27 | * Trolling, insulting/derogatory comments, and personal or political attacks 28 | * Public or private harassment 29 | * Publishing others' private information, such as a physical or electronic 30 | address, without explicit permission 31 | * Other conduct which could reasonably be considered inappropriate in a 32 | professional setting 33 | 34 | ## Our Responsibilities 35 | 36 | Project maintainers are responsible for clarifying the standards of acceptable 37 | behavior and are expected to take appropriate and fair corrective action in 38 | response to any instances of unacceptable behavior. 39 | 40 | Project maintainers have the right and responsibility to remove, edit, or 41 | reject comments, commits, code, wiki edits, issues, and other contributions 42 | that are not aligned to this Code of Conduct, or to ban temporarily or 43 | permanently any contributor for other behaviors that they deem inappropriate, 44 | threatening, offensive, or harmful. 45 | 46 | ## Scope 47 | 48 | This Code of Conduct applies both within project spaces and in public spaces 49 | when an individual is representing the project or its community. Examples of 50 | representing a project or community include using an official project e-mail 51 | address, posting via an official social media account, or acting as an appointed 52 | representative at an online or offline event. 
Representation of a project may be 53 | further defined and clarified by project maintainers. 54 | 55 | ## Enforcement 56 | 57 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 58 | reported by contacting the project team at opensource@janlelis.com. All 59 | complaints will be reviewed and investigated and will result in a response that 60 | is deemed necessary and appropriate to the circumstances. The project team is 61 | obligated to maintain confidentiality with regard to the reporter of an incident. 62 | Further details of specific enforcement policies may be posted separately. 63 | 64 | Project maintainers who do not follow or enforce the Code of Conduct in good 65 | faith may face temporary or permanent repercussions as determined by other 66 | members of the project's leadership. 67 | 68 | ## Attribution 69 | 70 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, 71 | available at [http://contributor-covenant.org/version/1/4][version] 72 | 73 | [homepage]: http://contributor-covenant.org 74 | [version]: http://contributor-covenant.org/version/1/4/ 75 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # uniscribe | Describe the Unicode [![[version]](https://badge.fury.io/rb/uniscribe.svg)](https://badge.fury.io/rb/uniscribe) [![[ci]](https://github.com/janlelis/uniscribe/workflows/Test/badge.svg)](https://github.com/janlelis/uniscribe/actions?query=workflow%3ATest) 2 | 3 | Describes Unicode characters with their name and shows compositions. 
**UNICODE 17.0**\* 4 | 5 | - Helps you understand how glyphs and codepoints are structured within the data 6 | - Gives you the names of glyphs and codepoints, which can be used for further research 7 | - Highlights invalid/special/blank codepoints 8 | 9 | Uses a similar color coding like its lower-level companion tool [unibits](https://github.com/janlelis/unibits). 10 | 11 | ## Setup 12 | 13 | Make sure you have Ruby installed and installing gems works properly. Then do: 14 | 15 | ``` 16 | $ gem install uniscribe 17 | ``` 18 | 19 | ## Usage 20 | 21 | Pass the string to debug to uniscribe: 22 | 23 | ### From CLI 24 | 25 | ``` 26 | $ uniscribe "test strı̈ng" 27 | ``` 28 | 29 | ### From Ruby 30 | 31 | ```ruby 32 | require "uniscribe/kernel_method" 33 | uniscribe "test strı̈ng" 34 | ``` 35 | 36 | ### Output 37 | 38 | ``` 39 | 40 | 0074 ├─ t ├─ LATIN SMALL LETTER T 41 | 0065 ├─ e ├─ LATIN SMALL LETTER E 42 | 0073 ├─ s ├─ LATIN SMALL LETTER S 43 | 0074 ├─ t ├─ LATIN SMALL LETTER T 44 | 0020 ├─ ] [ ├─ SPACE 45 | 0073 ├─ s ├─ LATIN SMALL LETTER S 46 | 0074 ├─ t ├─ LATIN SMALL LETTER T 47 | 0072 ├─ r ├─ LATIN SMALL LETTER R 48 | ---- ├┬ ı̈ ├┬ Composition 49 | 0131 │├─ ı │├─ LATIN SMALL LETTER DOTLESS I 50 | 0308 │└─ ◌̈ │└─ COMBINING DIAERESIS 51 | 006E ├─ n ├─ LATIN SMALL LETTER N 52 | 0067 ├─ g ├─ LATIN SMALL LETTER G 53 | 54 | ``` 55 | 56 | ## Examples 57 | 58 | ### Tamil 59 | 60 | `>> uniscribe "நகரத்தில்"` 61 | 62 | ![Screenshot Tamil](/screenshots/tamil.png?raw=true "Tamil") 63 | 64 | ### Thai 65 | 66 | `>> uniscribe "ม้าลายหกตัว"` 67 | 68 | ![Screenshot Thai](/screenshots/thai.png?raw=true "Thai") 69 | 70 | ### Ideographic Variations 71 | 72 | `>> uniscribe "辻󠄀㚑󠄁"` 73 | 74 | ![Screenshot Ideographic Variations](/screenshots/ideographic_variations.png?raw=true "Ideographic Variations") 75 | 76 | (the variation is not visible in the screenshot, because my system does not render it correctly) 77 | 78 | ### Emoji Sequences 79 | 80 | `>> uniscribe "3️⃣🤸‍♀"` 81 | 82 | 
![Screenshot Emoji](/screenshots/emoji.png?raw=true "Emoji") 83 | 84 | ### Lots of Combining Marks 85 | 86 | `>> uniscribe "̶̧̨̱̹̭̯ͧ̾ͬC̷̙̲̝͖ͭ̏ͥͮ͟Oͮ͏̮̪̝͍"` 87 | 88 | ![Screenshot Marks](/screenshots/marks.png?raw=true "Marks") 89 | 90 | ### Random Sequences of some Special Unicode Codepoints 91 | 92 | `>> uniscribe "\0A\u{E01D7}\x7F\r\n\u{D0000}\u{81}\u{FFF9}B\u{FFFB}🏴\u{E0061}\u{E007F}\u{10FFFF}"` 93 | 94 | ![Screenshot Strange](/screenshots/strange.png?raw=true "Strange") 95 | 96 | ### Some Blanks 97 | 98 | `>> uniscribe "­ᅠ 𝅸"` 99 | 100 | ![Screenshot Blanks](/screenshots/blanks.png?raw=true "Blanks") 101 | 102 | ## \*Notes 103 | 104 | Although the gem is generally up to date with Unicode 17.0, the proper detection of compositions / graphemes / combined characters [depends on your Ruby version](https://idiosyncratic-ruby.com/73-unicode-version-mapping.html): 105 | 106 | You can run `uniscribe -v` to check for the Unicode level of your uniscribe version. 107 | 108 | Also see 109 | 110 | - CLI: [unibits](https://github.com/janlelis/unibits) - visualizes Unicode encodings 111 | - CLI: [unicopy](https://github.com/janlelis/unicopy) - copy codepoints to clipboard 112 | - Website: [character.construction](https://character.construction) - lists notable codepoints 113 | - Ruby Library: [symbolify](https://github.com/janlelis/symbolify) - used for safely printing individual codepoints 114 | - Ruby Library: [characteristics](https://github.com/janlelis/characteristics) - used for detecting blanks and similar 115 | - Unicode® Standard Annex #29: [Unicode Text Segmentation](https://unicode.org/reports/tr29/) 116 | - Talk: [Ten Unicode Characters You Should Know About as a Programmer](https://www.youtube.com/watch?v=hlryzsdGtZo) 117 | 118 | Copyright (C) 2017-2024 Jan Lelis . Released under the MIT license. 
119 | -------------------------------------------------------------------------------- /lib/uniscribe.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require_relative "uniscribe/version" 4 | 5 | require "unicode/name" 6 | require "unicode/sequence_name" 7 | require "symbolify" 8 | require "characteristics" 9 | require "paint" 10 | require "unicode/display_width" 11 | require "unicode/emoji" 12 | require "unicode/version" 13 | 14 | module Uniscribe 15 | UNICODE_VERSION_GLYPH_DETECTION = RUBY_ENGINE == "ruby" && 16 | Unicode::Version.unicode_version 17 | 18 | SUPPORTED_ENCODINGS = Encoding.name_list.grep( 19 | Regexp.union( 20 | /^UTF-8$/, 21 | /^UTF8-/, 22 | /^UTF-...E$/, 23 | /^US-ASCII$/, 24 | /^ISO-8859-1$/, 25 | ) 26 | ).sort.freeze 27 | 28 | COLORS = { 29 | control: "#0000FF", 30 | blank: "#33AADD", 31 | format: "#FF00FF", 32 | mark: "#228822", 33 | unassigned: "#FF5500", 34 | ignorable: "#FFAA00", 35 | } 36 | 37 | def self.of(string, encoding: nil, wide_ambiguous: false) 38 | string = convert_to_encoding_or_raise(string, encoding) 39 | glyphs = string.encode("UTF-8").scan(/\X/) 40 | 41 | visualize(glyphs, wide_ambiguous: wide_ambiguous) 42 | end 43 | 44 | def self.convert_to_encoding_or_raise(string, encoding) 45 | raise ArgumentError, "no data given to uniscribe" if !string || string.empty? 46 | 47 | string.force_encoding(encoding) if encoding 48 | 49 | case string.encoding.name 50 | when *SUPPORTED_ENCODINGS 51 | unless string.valid_encoding? 
52 | raise ArgumentError, "uniscribe can only describe strings with a valid encoding" 53 | end 54 | 55 | string 56 | when 'UTF-16', 'UTF-32' 57 | raise ArgumentError, "unibits only supports #{string.encoding.name} with specified endianess, please use #{string.encoding.name}LE or #{string.encoding.name}BE" 58 | else 59 | raise ArgumentError, "uniscribe can only describe Unicode strings (or US-ASCII or ISO-8859-1)" 60 | end 61 | end 62 | 63 | def self.visualize(glyphs, wide_ambiguous: false) 64 | puts 65 | ( glyphs[0..-2] || [] ).each{ |glyph| 66 | cps = glyph.codepoints 67 | if cps.size > 1 68 | puts_composition(cps, wide_ambiguous) 69 | else 70 | puts_codepoint(cps[0], false, false, wide_ambiguous) 71 | end 72 | } 73 | 74 | cps = glyphs[-1].codepoints 75 | if cps.size > 1 76 | puts_composition(cps, wide_ambiguous) 77 | else 78 | puts_codepoint(cps[0], false, true, wide_ambiguous) 79 | end 80 | puts 81 | end 82 | 83 | def self.puts_composition(cps, wide_ambiguous = false) 84 | char = cps.pack("U*") 85 | if sequence_name = Unicode::SequenceName.of(char) 86 | name = "Composition: #{sequence_name}" 87 | else 88 | name = "Composition" 89 | end 90 | char_color = random_color 91 | cp_hex = "----" 92 | symbolified_char = symbolify_composition(char) 93 | padding = determine_padding(symbolified_char, false, wide_ambiguous) 94 | 95 | puts " %s ├┬ %s%s├┬ %s" % [ 96 | Paint[cp_hex, char_color], 97 | Paint[symbolified_char, char_color], 98 | padding, 99 | Paint[name, char_color], 100 | ] 101 | ( cps[0..-2] || [] ).each{ |cp| 102 | puts_codepoint(cp, true, false, wide_ambiguous) 103 | } 104 | puts_codepoint(cps[-1], true, true, wide_ambiguous) 105 | end 106 | 107 | def self.puts_codepoint(cp, composed = false, last = false, wide_ambiguous = false) 108 | char = [cp].pack("U*") 109 | char_info = UnicodeCharacteristics.new(char) 110 | char_color = determine_codepoint_color(char_info) 111 | cp_hex = cp.to_s(16).rjust(4, "0").rjust(6).upcase 112 | symbolified_char = 
Symbolify.unicode(char, char_info) 113 | if composed && !last 114 | branch = "│├─" 115 | elsif composed && last 116 | branch = "│└─" 117 | else 118 | branch = "├─" 119 | end 120 | name = determine_codepoint_name(char) 121 | padding = determine_padding(symbolified_char, composed, wide_ambiguous) 122 | 123 | puts " %s %s %s%s%s %s" % [ 124 | Paint[cp_hex, char_color], 125 | branch, 126 | Paint[symbolified_char, char_color], 127 | padding, 128 | branch, 129 | Paint[name, char_color], 130 | ] 131 | end 132 | 133 | def self.determine_codepoint_color(char_info) 134 | if !char_info.assigned? 135 | if char_info.ignorable? 136 | COLORS[:ignorable] 137 | else 138 | COLORS[:unassigned] 139 | end 140 | elsif char_info.blank? 141 | COLORS[:blank] 142 | elsif char_info.control? 143 | COLORS[:control] 144 | elsif char_info.format? 145 | COLORS[:format] 146 | elsif char_info.unicode? && char_info.category[0] == "M" 147 | COLORS[:mark] 148 | else 149 | random_color 150 | end 151 | end 152 | 153 | def self.random_color 154 | "%.2x%.2x%.2x" % [rand(90) + 60, rand(90) + 60, rand(90) + 60] 155 | end 156 | 157 | def self.determine_codepoint_name(char) 158 | name = Unicode::Name.correct(char) 159 | return name if name 160 | 161 | name = Unicode::Name.label(char) 162 | as = Unicode::Name.aliases(char) 163 | return name if !as 164 | 165 | alias_ = ( as[:control] && as[:control][0] || 166 | as[:figment] && as[:figment][0] || 167 | as[:alternate] && as[:alternate][0] || 168 | as[:abbreviation] && as[:abbreviation][0] ) 169 | return name if !alias_ 170 | 171 | name + " " + alias_ 172 | end 173 | 174 | def self.determine_padding(char, composed, wide_ambiguous) 175 | required_width = Unicode::DisplayWidth.of(char, wide_ambiguous ? 
2 : 1, {}, emoji: true) 176 | required_width += 1 if composed 177 | required_width = 0 if required_width < 0 178 | 179 | case required_width 180 | when 0...5 181 | "\t\t" 182 | when 5...10 183 | "\t" 184 | else 185 | "" 186 | end 187 | end 188 | 189 | def self.symbolify_composition(char) 190 | char_infos = char.chars.map{ |c| UnicodeCharacteristics.new(c) } 191 | 192 | case 193 | when char_infos.any?{ |c| !c.assigned? } 194 | "n/a" 195 | when char_infos.all?{ |c| c.separator? } 196 | "⏎" 197 | when char_infos.all?{ |c| c.category == "Mn" || c.category == "Me" } 198 | if char_infos.any?{ |c| c.category == "Mn" } 199 | "◌" + char 200 | else 201 | " " + char 202 | end 203 | when char_infos.all?{ |c| c.blank? } 204 | "]" + char + "[" 205 | else 206 | char 207 | end 208 | end 209 | end 210 | 211 | -------------------------------------------------------------------------------- /spec/uniscribe_spec.rb: -------------------------------------------------------------------------------- 1 | require_relative "../lib/uniscribe/kernel_method" 2 | require "minitest/autorun" 3 | 4 | describe Uniscribe do 5 | def check(string_to_test, match_regex) 6 | uniscribe(string_to_test) 7 | assert_output(match_regex){ uniscribe(string_to_test) } 8 | end 9 | 10 | describe "displays codepoints" do 11 | it "LATIN CAPITAL LETTER" do 12 | check "AB", /0041.*0042/m 13 | end 14 | 15 | it "AERIAL TRAMWAY" do 16 | check "🚡", /1F6A1/ 17 | end 18 | end 19 | 20 | describe "displays glyph itself" do 21 | it "LATIN CAPITAL LETTER" do 22 | check "AB", /A.*B/m 23 | end 24 | 25 | it "AERIAL TRAMWAY" do 26 | check "🚡", /🚡/ 27 | end 28 | end 29 | 30 | describe "displays names" do 31 | it "LATIN CAPITAL LETTER" do 32 | check "AB", /LATIN CAPITAL LETTER A.*LATIN CAPITAL LETTER B/m 33 | end 34 | 35 | it "AERIAL TRAMWAY" do 36 | check "🚡", /AERIAL TRAMWAY/ 37 | end 38 | end 39 | 40 | describe "supported encodings" do 41 | it "works with UTF-16" do 42 | check "🚡".encode("UTF-16LE"), /AERIAL TRAMWAY/ 43 | end 44 | 
45 | it "works with UTF-32" do 46 | check "🚡".encode("UTF-32BE"), /AERIAL TRAMWAY/ 47 | end 48 | 49 | it "works with US-ASCII" do 50 | check "AB".force_encoding("US-ASCII"), /LATIN CAPITAL LETTER A.*LATIN CAPITAL LETTER B/m 51 | end 52 | 53 | it "works with ISO-8859-1" do 54 | check "AB\x81".force_encoding("ISO-8859-1"), /LATIN CAPITAL LETTER A.*LATIN CAPITAL LETTER B.* HIGH OCTET PRESET/m 55 | end 56 | end 57 | 58 | describe "example compositions" do 59 | describe "combining marks" do 60 | it "DIAERESIS" do 61 | check "g̈", /Composition.*LATIN SMALL LETTER G.*DIAERESIS/m 62 | end 63 | 64 | it "RING BELOW" do 65 | check "n̥", /Composition.*LATIN SMALL LETTER N.*COMBINING RING BELOW/m 66 | end 67 | 68 | it "ARABIC FATHA" do 69 | check "دَ", /Composition.*ARABIC LETTER DAL.*ARABIC FATHA/m 70 | end 71 | 72 | it "ACUTE ACCENT" do 73 | check "ά", /Composition.*GREEK SMALL LETTER ALPHA.*COMBINING ACUTE ACCENT/m 74 | end 75 | 76 | it "HEBREW POINT HIRIQ" do 77 | check "חִ", /Composition.*HEBREW LETTER HET.*HEBREW POINT HIRIQ/m 78 | end 79 | 80 | it "THAI CHARACTER SARA U" do 81 | check "จุ", /Composition.*THAI CHARACTER CHO CHAN.*THAI CHARACTER SARA U/m 82 | end 83 | end 84 | 85 | describe "misc scripts" do 86 | if RUBY_VERSION >= "2.4.0" 87 | it "HANGUL" do 88 | check "ᅘᆇᇈ", /Composition.*HANGUL CHOSEONG SSANGHIEUH.*HANGUL JUNGSEONG YO-O.*HANGUL JONGSEONG NIEUN-PANSIOS/m 89 | end 90 | 91 | it "HANGUL 2" do 92 | check "각", /Composition.*HANGUL CHOSEONG KIYEOK.*HANGUL JUNGSEONG A.*HANGUL JONGSEONG KIYEOK/m 93 | end 94 | 95 | it "HANGUL 3" do 96 | check "ᄇᄉᄐ", /Composition.*HANGUL CHOSEONG PIEUP.*HANGUL CHOSEONG SIOS.*HANGUL CHOSEONG THIEUTH/m 97 | end 98 | 99 | it "TAMIL" do 100 | check "நி", /Composition.*TAMIL SYLLABLE NI.*TAMIL LETTER NA.*TAMIL VOWEL SIGN I/m 101 | end 102 | 103 | it "DEVANAGARI" do 104 | check "षि", /Composition.*DEVANAGARI LETTER SSA.*DEVANAGARI VOWEL SIGN I/m 105 | end 106 | end 107 | end 108 | 109 | describe "zwj and zwnj" do 110 | if RUBY_VERSION 
>= "2.4.0" 111 | it "ZWJ" do 112 | check "क्‍", /Composition.*DEVANAGARI LETTER KA.*DEVANAGARI SIGN VIRAMA.*ZERO WIDTH JOINER/m 113 | end 114 | 115 | it "ZWNJ" do 116 | check "t‌", /Composition.*LATIN SMALL LETTER T.*ZERO WIDTH NON-JOINER/m 117 | end 118 | end 119 | end 120 | 121 | describe "misc variations" do 122 | it "TEXT STYLE" do 123 | check "‼︎", /Composition.*(text style).*DOUBLE EXCLAMATION MARK.*VARIATION SELECTOR-15/m 124 | end 125 | 126 | it "EMOJI STYLE" do 127 | check "‼️", /Composition.*(emoji style).*DOUBLE EXCLAMATION MARK.*VARIATION SELECTOR-16/m 128 | end 129 | 130 | it "DOTTED FORM" do 131 | check "င︀", /Composition.*(dotted form).*MYANMAR LETTER NGA.*VARIATION SELECTOR-1/m 132 | end 133 | 134 | it "MONGOLIAN SECOND FORM" do 135 | check "ᠠ᠋", /Composition.*(second form).*MONGOLIAN LETTER A.*MONGOLIAN FREE VARIATION SELECTOR ONE/m 136 | end 137 | 138 | it "CJK COMPATIBILITY IDEOGRAPH-2F81F" do 139 | check "㓟︀", /Composition.*CJK COMPATIBILITY IDEOGRAPH-2F81F.*CJK UNIFIED IDEOGRAPH-34DF.*VARIATION SELECTOR-1/m 140 | end 141 | 142 | it "CID+6238" do 143 | check "胥󠄀", /Composition.*CID\+6238.*CJK UNIFIED IDEOGRAPH-80E5.*VARIATION SELECTOR-17/m 144 | end 145 | end 146 | 147 | describe "misc other" do 148 | it "KEYCAP" do 149 | check "5⃣", /Composition.*DIGIT FIVE.*COMBINING ENCLOSING KEYCAP/m 150 | end 151 | 152 | if RUBY_VERSION >= "2.4.0" 153 | it "␍ + ␊" do 154 | check "\r\n", /Composition.* CARRIAGE RETURN.* LINE FEED/m 155 | end 156 | 157 | it "REGIONAL" do 158 | check "🇺🇳", /Composition.*UNITED NATIONS.*REGIONAL INDICATOR SYMBOL LETTER U.*REGIONAL INDICATOR SYMBOL LETTER N/m 159 | end 160 | 161 | it "TAG SEQUENCE" do 162 | check "🏴󠁧󠁢󠁳󠁣󠁴󠁿", /Composition.*SCOTLAND.*WAVING BLACK FLAG.*TAG LATIN SMALL LETTER G.*TAG LATIN SMALL LETTER B.*TAG LATIN SMALL LETTER S.*TAG LATIN SMALL LETTER C.*TAG LATIN SMALL LETTER T.*CANCEL TAG/m 163 | end 164 | 165 | it "EMOJI MODIFIER" do 166 | check "🙅🏿", /Composition.*PERSON GESTURING NO: DARK SKIN TONE.*FACE WITH 
NO GOOD GESTURE.*EMOJI MODIFIER FITZPATRICK TYPE-6/m 167 | end 168 | 169 | it "EMOJI ZWJ SEQUENCE" do 170 | check "👩‍👩‍👦‍👦", /Composition.*FAMILY.*WOMAN.*ZERO WIDTH JOINER.*WOMAN.*ZERO WIDTH JOINER.*BOY.*ZERO WIDTH JOINER.*BOY/m 171 | end 172 | end 173 | end 174 | end 175 | 176 | describe "unusual codepoints" do 177 | if RUBY_VERSION >= "2.4.0" 178 | it "safely prints and highlights unusual codepoints" do 179 | check "\0A\u{E01D7}\x7F\r\n\u{D0000}\u{81}\u{FFF9}B\u{FFFB}🏴\u{E0061}\u{E007F}\u{10FFFF}", / NULL.*Composition.*LATIN CAPITAL LETTER A.*VARIATION SELECTOR-232.* DELETE.*Composition.* CARRIAGE RETURN.* LINE FEED.*.* HIGH OCTET PRESET.*INTERLINEAR ANNOTATION ANCHOR.*LATIN CAPITAL LETTER B.*INTERLINEAR ANNOTATION TERMINATOR.*Composition.*WAVING BLACK FLAG.*TAG LATIN SMALL LETTER A.*CANCEL TAG.*/m 180 | end 181 | end 182 | 183 | it "safely prints and highlights various blanks" do 184 | check "­ᅠ 𝅸", /SOFT HYPHEN.*HANGUL JUNGSEONG FILLER.*EM QUAD.*INHIBIT ARABIC FORM SHAPING.*ZERO WIDTH NO-BREAK SPACE.*MUSICAL SYMBOL END SLUR/m 185 | end 186 | end 187 | end 188 | --------------------------------------------------------------------------------